diff --git a/.ci/linux-x64-cpu-gcc.yml b/.ci/linux-x64-cpu-gcc.yml new file mode 100644 index 000000000000..4f138d9d080b --- /dev/null +++ b/.ci/linux-x64-cpu-gcc.yml @@ -0,0 +1,119 @@ +name: linux-x64-cpu-gcc +on: + push: + branches: [master] + paths: + - '.ci/linux-x64-cpu-gcc.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' + mr: + target-branches: [master] + paths: + - '.ci/linux-x64-cpu-gcc.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/x86/**' + - 'tests/**' + - 'tools/**' + - '!tools/pnnx/**' + - 'examples/**' +concurrency: + group: linux-x64-cpu-gcc-${{ ci.head_ref }} + +jobs: + linux-gcc: + name: linux-gcc + strategy: + matrix: + include: + - { SSE2: 'OFF', AVX: 'OFF', AVX2: 'OFF', AVX512: 'OFF' } + - { SSE2: 'ON', AVX: 'OFF', AVX2: 'OFF', AVX512: 'OFF' } + - { SSE2: 'ON', AVX: 'ON', AVX2: 'OFF', AVX512: 'OFF' } + - { SSE2: 'ON', AVX: 'ON', AVX2: 'ON', AVX512: 'OFF' } + - { SSE2: 'ON', AVX: 'ON', AVX2: 'ON', AVX512: 'ON' } + + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y libprotobuf-dev protobuf-compiler libopencv-dev + + - name: build + run: | + mkdir build && cd build + cmake -DNCNN_SSE2=${{matrix.SSE2}} -DNCNN_AVX=${{matrix.AVX}} -DNCNN_AVX2=${{matrix.AVX2}} -DNCNN_AVX512=${{matrix.AVX512}} -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test + run: cd build && ctest --output-on-failure -j $(nproc) + - name: build-shared + run: | + mkdir build-shared && cd build-shared + cmake -DNCNN_SSE2=${{matrix.SSE2}} -DNCNN_AVX=${{matrix.AVX}} -DNCNN_AVX2=${{matrix.AVX2}} -DNCNN_AVX512=${{matrix.AVX512}} -DNCNN_SHARED_LIB=ON .. + cmake --build . -j $(nproc) + - name: build-noint8 + run: | + mkdir build-noint8 && cd build-noint8 + cmake -DNCNN_SSE2=${{matrix.SSE2}} -DNCNN_AVX=${{matrix.AVX}} -DNCNN_AVX2=${{matrix.AVX2}} -DNCNN_AVX512=${{matrix.AVX512}} -DNCNN_INT8=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-noint8 + run: cd build-noint8 && ctest --output-on-failure -j $(nproc) + + linux-gcc-cpp03-nostdio-nostring-simplestl: + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: build-nostdio + run: | + mkdir build-nostdio && cd build-nostdio + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-c++03.toolchain.cmake -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: test-nostdio + run: cd build-nostdio && ctest --output-on-failure -j $(nproc) + - name: build-nostdio-nostring + run: | + mkdir build-nostdio-nostring && cd build-nostdio-nostring + cmake -DNCNN_STDIO=OFF -DNCNN_STRING=OFF -DNCNN_BUILD_TESTS=OFF -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: build-simplestl + run: | + mkdir build-simplestl && cd build-simplestl + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . 
-j $(nproc) + - name: test-simplestl + run: cd build-simplestl && ctest --output-on-failure -j $(nproc) + - name: build-simplestl-simpleomp + run: | + mkdir build-simplestl-simpleomp && cd build-simplestl-simpleomp + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake -DNCNN_STDIO=ON -DNCNN_STRING=ON -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEOMP=ON -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_BENCHMARK=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . -j $(nproc) + - name: test-simplestl-simpleomp + run: cd build-simplestl-simpleomp && ctest --output-on-failure -j $(nproc) diff --git a/.ci/pnnx.yml b/.ci/pnnx.yml new file mode 100644 index 000000000000..596e753dec1c --- /dev/null +++ b/.ci/pnnx.yml @@ -0,0 +1,120 @@ +name: pnnx +on: + push: + branches: [master] + paths: + - '.ci/pnnx.yml' + - 'tools/pnnx/**' + - '!tools/pnnx/README.md' + mr: + target-branches: [master] + paths: + - '.ci/pnnx.yml' + - 'tools/pnnx/**' + - '!tools/pnnx/README.md' +concurrency: + group: pnnx-${{ ci.head_ref }} + +jobs: + ubuntu: + strategy: + matrix: + include: + - torch-version: 1.8.1 + torchvision-version: 0.9.1 + torchvision-cache-key: '0_9_1' + + - torch-version: 1.9.1 + torchvision-version: 0.10.1 + torchvision-cache-key: '0_10_1' + + - torch-version: 1.10.0 + torchvision-version: 0.11.1 + torchvision-cache-key: '0_11_1' + + - torch-version: 1.11.0 + torchvision-version: 0.12.0 + torchvision-cache-key: '0_12_0' + + - torch-version: 1.12.0 + torchvision-version: 0.13.0 + torchvision-cache-key: '0_13_0' + + - torch-version: 1.13.0 + torchvision-version: 0.14.0 + torchvision-cache-key: '0_14_0' + + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y python3-pip libjpeg-dev libpng-dev libprotobuf-dev protobuf-compiler + python3 -m pip install --upgrade pip + pip3 uninstall -y setuptools + pip3 install -U pytest setuptools wheel twine distribute requests + + - name: setup pytorch + run: | + export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}} + pip3 install --user torch==${{matrix.torch-version}}+cpu torchvision==${{matrix.torchvision-version}}+cpu -f https://download.pytorch.org/whl/torch_stable.html + + - name: cache-torchvision + id: cache-torchvision + uses: cache@1.* + with: + cachePaths: torchvision-${{matrix.torchvision-version}}-install + cacheKey: torchvision-${{matrix.torchvision-cache-key}}-linux-install-20211228 + - name: checkout-torchvision + if: steps.cache-torchvision.outputs.cacheHit != 'true' + checkout: https://github.com/pytorch/vision.git + with: + pullType: TAG + refName: v${{matrix.torchvision-version}} + localPath: vision + enableSubmodule: false + enableGitLfs: false + - name: torchvision + if: steps.cache-torchvision.outputs.cacheHit != 'true' + run: | + cd vision + mkdir -p build; cd build + cmake -DCMAKE_INSTALL_PREFIX=${{ci.workspace}}/torchvision-${{matrix.torchvision-version}}-install -DTorch_DIR=${{ci.workspace}}/torch-${{matrix.torch-version}}/lib/python3.9/site-packages/torch/share/cmake/Torch -DCMAKE_BUILD_TYPE=Release .. + cmake --build . -j $(nproc) + cmake --build . --target install + + - name: build-ncnn + run: | + export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}} + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DNCNN_PYTHON=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. + cmake --build . 
-j $(nproc) + cd .. + export CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) + pip3 install --user . + + - name: build-pnnx + run: | + export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}} + cd tools/pnnx + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=Release -DTorchVision_INSTALL_DIR=${{ci.workspace}}/torchvision-${{matrix.torchvision-version}}-install .. + cmake --build . -j $(nproc) + + - name: test + run: | + export PYTHONUSERBASE=${{ci.workspace}}/torch-${{matrix.torch-version}} + export OMP_NUM_THREADS=1 + export MKL_NUM_THREADS=1 + export MKL_ENABLE_INSTRUCTIONS=SSE4_2 + cd tools/pnnx + cd build && ctest --output-on-failure -j 16 diff --git a/.ci/test-coverage.yml b/.ci/test-coverage.yml new file mode 100644 index 000000000000..9272caac1ba4 --- /dev/null +++ b/.ci/test-coverage.yml @@ -0,0 +1,808 @@ +name: test-coverage +on: + push: + branches: [master] + paths: + - '.ci/test-coverage.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/**' + - 'tests/**' + - 'toolchains/**' + mr: + target-branches: [master] + paths: + - '.ci/test-coverage.yml' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/**' + - 'tests/**' + - 'toolchains/**' +concurrency: + group: test-coverage-${{ ci.head_ref }} + +jobs: + linux-gcc-gpu: + name: linux-gcc-gpu + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov libvulkan-dev + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: cache-swiftshader + id: cache-swiftshader + uses: cache@1.* + with: + cachePaths: swiftshader-install + cacheKey: swiftshader-linux-install-20221026 + + - name: checkout-swiftshader + if: steps.cache-swiftshader.outputs.cacheHit != 'true' + checkout: https://github.com/google/swiftshader.git + with: + pullType: COMMIT_ID + refName: 04d007924c2d33ea1ac4be78ae423507a0b08b61 + localPath: swiftshader + enableSubmodule: false + enableGitLfs: false + + - name: swiftshader + if: steps.cache-swiftshader.outputs.cacheHit != 'true' + run: | + cd swiftshader + git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive + mkdir -p build; cd build + cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. + cmake --build . -j $(nproc) + mkdir ${{ci.workspace}}/swiftshader-install + cp Linux/* ${{ci.workspace}}/swiftshader-install + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j $(nproc) + - name: test + run: | + printf "[Processor]\nThreadCount=4\n" > build/tests/SwiftShader.ini + export VK_ICD_FILENAMES="${{ci.workspace}}/swiftshader-install/vk_swiftshader_icd.json" + cd build && ctest --output-on-failure -j 4 + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov -r lcov.info '*/glslang/*' -o lcov.info + lcov --list lcov.info + - name: codecov + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + + linux-gcc-gpu-lavapipe: + name: linux-gcc-gpu-lavapipe + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov libvulkan-dev libxcb-shm0 + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: cache-lavapipe + id: cache-lavapipe + uses: cache@1.* + with: + cachePaths: lavapipe-install + cacheKey: lavapipe-linux-install-20211127-3 + + - name: checkout-lavapipe + if: steps.cache-lavapipe.outputs.cacheHit != 'true' + checkout: https://github.com/mesa3d/mesa.git + with: + pullType: COMMIT_ID + refName: cd39180cfab20734744b379b085cc3b5c2cecd3a + localPath: mesa + enableSubmodule: false + enableGitLfs: false + + - name: lavapipe + if: steps.cache-lavapipe.outputs.cacheHit != 'true' + run: | + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye main' | tee -a /etc/apt/sources.list + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list + apt-get update + apt-get build-dep -y mesa + mkdir -p "${{ci.workspace}}/lavapipe-install" + cd mesa + mkdir build + cd build + meson -Dprefix="${{ci.workspace}}/lavapipe-install" -Dbuildtype=release -Db_lto=true -Db_ndebug=true -Dplatforms="x11" -Ddri3=enabled -Ddri-drivers="" -Dgallium-drivers=swrast -Dgallium-vdpau=disabled -Dgallium-xvmc=disabled -Dgallium-omx=disabled -Dgallium-va=disabled -Dgallium-xa=disabled -Dgallium-opencl=disabled -Dopencl-native=false -Dvulkan-drivers=swrast -Dshader-cache=disabled -Dgles1=disabled -Dgles2=disabled -Dopengl=false -Dgbm=disabled -Dglx=disabled -Degl=disabled -Dllvm=enabled -Dvalgrind=disabled -Dlibunwind=disabled -Dlmsensors=disabled .. + ninja -j$(nproc) + ninja install + find ${{ci.workspace}}/lavapipe-install + cat ${{ci.workspace}}/lavapipe-install/share/vulkan/icd.d/lvp_icd.x86_64.json + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j $(nproc) + - name: test + run: | + export LP_NUM_THREADS=4 + export VK_ICD_FILENAMES="${{ci.workspace}}/lavapipe-install/share/vulkan/icd.d/lvp_icd.x86_64.json" + cd build + ctest --output-on-failure -j 4 + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov -r lcov.info '*/glslang/*' -o lcov.info + lcov --list lcov.info + - name: codecov + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + + linux-gcc-x64: + name: linux-gcc-x64 + strategy: + matrix: + # openmp: ['OFF', 'ON'] + include: + - { SSE2: 'OFF', AVX: 'OFF', XOP: 'OFF', F16C: 'OFF', FMA: 'OFF', AVX2: 'OFF', AVX512: 'OFF', AVX512VNNI: 'OFF', AVXVNNI: 'OFF', AVX512BF16: 'OFF', AVX512FP16: 'OFF'} + - { SSE2: 'ON', AVX: 'OFF', XOP: 'OFF', F16C: 'OFF', FMA: 'OFF', AVX2: 'OFF', AVX512: 'OFF', AVX512VNNI: 'OFF', AVXVNNI: 'OFF', AVX512BF16: 'OFF', AVX512FP16: 'OFF'} + - { SSE2: 'ON', AVX: 'ON', XOP: 'OFF', F16C: 'OFF', FMA: 'OFF', AVX2: 'OFF', AVX512: 'OFF', AVX512VNNI: 'OFF', AVXVNNI: 'OFF', AVX512BF16: 'OFF', AVX512FP16: 'OFF'} + - { SSE2: 'ON', AVX: 'ON', XOP: 'OFF', F16C: 'ON', FMA: 'ON', AVX2: 'ON', AVX512: 'OFF', AVX512VNNI: 'OFF', AVXVNNI: 'OFF', AVX512BF16: 'OFF', AVX512FP16: 'OFF'} + - { SSE2: 'ON', AVX: 'ON', XOP: 'OFF', F16C: 'ON', FMA: 'ON', AVX2: 'ON', AVX512: 'ON', AVX512VNNI: 'ON', AVXVNNI: 'OFF', AVX512BF16: 'OFF', AVX512FP16: 'OFF'} + + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON \ + -DNCNN_SSE2=${{matrix.SSE2}} \ + -DNCNN_AVX=${{matrix.AVX}} \ + -DNCNN_XOP=${{matrix.XOP}} \ + -DNCNN_F16C=${{matrix.F16C}} \ + -DNCNN_FMA=${{matrix.FMA}} \ + -DNCNN_AVX2=${{matrix.AVX2}} \ + -DNCNN_AVX512=${{matrix.AVX512}} \ + -DNCNN_AVXVNNI=${{matrix.AVXVNNI}} \ + -DNCNN_AVX512VNNI=${{matrix.AVX512VNNI}} \ + -DNCNN_AVX512BF16=${{matrix.AVX512BF16}} \ + -DNCNN_AVX512FP16=${{matrix.AVX512FP16}} \ + .. + cmake --build . 
-j $(nproc) + - name: test + run: cd build && ctest --output-on-failure -j $(nproc) + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov --list lcov.info + - name: codecov + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + + linux-gcc-arm: + name: linux-gcc-arm + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov g++-arm-linux-gnueabi g++-arm-linux-gnueabihf libcapstone4 libglib2.0-0 + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: cache-qemu + id: cache-qemu + uses: cache@1.* + with: + cachePaths: qemu-install + cacheKey: qemu-arm-install-20220831 + + - name: checkout-qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + checkout: https://github.com/qemu/qemu.git + with: + pullType: COMMIT_ID + refName: 621da7789083b80d6f1ff1c0fb499334007b4f51 + localPath: qemu + enableSubmodule: false + enableGitLfs: false + + - name: qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + run: | + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye main' | tee -a /etc/apt/sources.list + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list + apt-get update + apt-get build-dep -y qemu + cd qemu + ./configure --prefix=${{ci.workspace}}/qemu-install --target-list=arm-linux-user --disable-system + make -j$(nproc) + make install + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_VFPV4=ON -DNCNN_ARM82=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test + run: | + export PATH=${{ci.workspace}}/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j $(nproc) + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov --list lcov.info + + - name: build-armhf-vfpv3-d16 + run: | + mkdir build-armhf-vfpv3-d16 && cd build-armhf-vfpv3-d16 + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_VFPV4=OFF -DNCNN_ARM82=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j $(nproc) + - name: test-armhf-vfpv3-d16 + run: | + export PATH=${{ci.workspace}}/qemu-install/bin:$PATH + cd build-armhf-vfpv3-d16 + TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j $(nproc) + - name: lcov-collect-armhf-vfpv3-d16 + run: | + cd build-armhf-vfpv3-d16 + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build-armhf-vfpv3-d16/*' -o lcov.info + lcov --list lcov.info + + - name: codecov + run: | + ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build-armhf-vfpv3-d16/lcov.info + + linux-gcc-aarch64: + name: linux-gcc-aarch64 + strategy: + matrix: + # openmp: ['OFF', 'ON'] + include: + - { ARM82: 'OFF', ARM82DOT: 'OFF', ARM82FP16FML: 'OFF', ARM84BF16: 'OFF', ARM84I8MM: 'OFF', ARM86SVE: 'OFF'} + - { ARM82: 'ON', ARM82DOT: 'OFF', ARM82FP16FML: 'OFF', ARM84BF16: 'OFF', ARM84I8MM: 'OFF', ARM86SVE: 'OFF'} + - { ARM82: 'ON', ARM82DOT: 'ON', ARM82FP16FML: 'ON', ARM84BF16: 'OFF', ARM84I8MM: 'OFF', ARM86SVE: 'OFF'} + - { ARM82: 'ON', ARM82DOT: 'ON', ARM82FP16FML: 'ON', ARM84BF16: 'ON', ARM84I8MM: 'ON', ARM86SVE: 'OFF'} + + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov g++-aarch64-linux-gnu libcapstone4 libglib2.0-0 + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: cache-qemu + id: cache-qemu + uses: cache@1.* + with: + cachePaths: qemu-install + cacheKey: qemu-aarch64-install-20220831 + + - name: checkout-qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + checkout: https://github.com/qemu/qemu.git + with: + pullType: COMMIT_ID + refName: 621da7789083b80d6f1ff1c0fb499334007b4f51 + localPath: qemu + enableSubmodule: false + enableGitLfs: false + + - name: qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + run: | + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye main' | tee -a /etc/apt/sources.list + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list + apt-get update + apt-get build-dep -y qemu + cd qemu + ./configure --prefix=${{ci.workspace}}/qemu-install --target-list=aarch64-linux-user --disable-system + make -j$(nproc) + make install + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake \ + -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON \ + -DNCNN_ARM82=${{matrix.ARM82}} \ + -DNCNN_ARM82DOT=${{matrix.ARM82DOT}} \ + -DNCNN_ARM82FP16FML=${{matrix.ARM82FP16FML}} \ + -DNCNN_ARM84BF16=${{matrix.ARM84BF16}} \ + -DNCNN_ARM84I8MM=${{matrix.ARM84I8MM}} \ + .. + cmake --build . 
-j $(nproc) + - name: test + run: | + export PATH=${{ci.workspace}}/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j $(nproc) + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov --list lcov.info + - name: codecov + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + + linux-gcc-mipsisa32r6el: + name: linux-gcc-mipsisa32r6el + strategy: + matrix: + OPENMP: ['OFF', 'ON'] + + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov g++-mipsisa32r6el-linux-gnu libcapstone4 libglib2.0-0 + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: cache-qemu + id: cache-qemu + uses: cache@1.* + with: + cachePaths: qemu-install + cacheKey: qemu-mipsel-install-20220831 + + - name: checkout-qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + checkout: https://github.com/qemu/qemu.git + with: + pullType: COMMIT_ID + refName: 621da7789083b80d6f1ff1c0fb499334007b4f51 + localPath: qemu + enableSubmodule: false + enableGitLfs: false + + - name: qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + run: | + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye main' | tee -a /etc/apt/sources.list + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list + apt-get update + apt-get build-dep -y qemu + cd qemu + ./configure --prefix=${{ci.workspace}}/qemu-install --target-list=mipsel-linux-user --disable-system + make -j$(nproc) + make install + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_MSA=OFF -DNCNN_MMI=OFF -DNCNN_OPENMP=${{matrix.OPENMP}} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j $(nproc) + - name: test + run: | + export PATH=${{ci.workspace}}/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa32r6el-linux-gnu" ctest --output-on-failure -j $(nproc) + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov --list lcov.info + - name: codecov + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + + linux-gcc-mipsisa64r6el: + name: linux-gcc-mipsisa64r6el + strategy: + matrix: + OPENMP: ['OFF', 'ON'] + + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov g++-mipsisa64r6el-linux-gnuabi64 libcapstone4 libglib2.0-0 + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: cache-qemu + id: cache-qemu + uses: cache@1.* + with: + cachePaths: qemu-install + cacheKey: qemu-mips64el-install-20220831 + + - name: checkout-qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + checkout: https://github.com/qemu/qemu.git + with: + pullType: COMMIT_ID + refName: 621da7789083b80d6f1ff1c0fb499334007b4f51 + localPath: qemu + enableSubmodule: false + enableGitLfs: false + + - name: qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + run: | + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye main' | tee -a /etc/apt/sources.list + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list + apt-get update + apt-get build-dep -y qemu + cd qemu + ./configure --prefix=${{ci.workspace}}/qemu-install --target-list=mips64el-linux-user --disable-system + make -j$(nproc) + make install + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_MSA=ON -DNCNN_MMI=OFF -DNCNN_OPENMP=${{matrix.OPENMP}} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j $(nproc) + - name: test + run: | + export PATH=${{ci.workspace}}/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa64r6el-linux-gnuabi64" ctest --output-on-failure -j $(nproc) + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov --list lcov.info + - name: codecov + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + + linux-gcc-riscv64: + name: linux-gcc-riscv64 + strategy: + matrix: + OPENMP: ['OFF', 'ON'] + + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov g++-riscv64-linux-gnu libcapstone4 libglib2.0-0 + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: cache-qemu + id: cache-qemu + uses: cache@1.* + with: + cachePaths: qemu-install + cacheKey: qemu-riscv64-install-20220831 + + - name: checkout-qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + checkout: https://github.com/qemu/qemu.git + with: + pullType: COMMIT_ID + refName: 621da7789083b80d6f1ff1c0fb499334007b4f51 + localPath: qemu + enableSubmodule: false + enableGitLfs: false + + - name: qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + run: | + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye main' | tee -a /etc/apt/sources.list + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list + apt-get update + apt-get build-dep -y qemu + cd qemu + wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + ./configure --prefix=${{ci.workspace}}/qemu-install --target-list=riscv64-linux-user --disable-system + make -j$(nproc) + make install + + - name: build + run: | + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_OPENMP=${{matrix.OPENMP}} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j $(nproc) + - name: test + run: | + export PATH=${{ci.workspace}}/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j $(nproc) + - name: lcov-collect + run: | + cd build + lcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov --list lcov.info + - name: codecov + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + + linux-gcc-riscv64-rvv: + name: linux-gcc-riscv64-rvv + strategy: + matrix: + OPENMP: ['OFF', 'ON'] + + runs-on: + pool-name: docker + container: + image: bkci/ci:ubuntu + steps: + - name: checkout + checkout: self + with: + strategy: FRESH_CHECKOUT + enableSubmodule: false + enableGitLfs: false + + - name: install-deps + run: | + apt-get update + apt-get install -y lcov libcapstone4 libglib2.0-0 + curl https://uploader.codecov.io/verification.gpg | gpg --no-default-keyring --keyring trustedkeys.gpg --import + curl -Os https://uploader.codecov.io/latest/linux/codecov + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM + curl -Os https://uploader.codecov.io/latest/linux/codecov.SHA256SUM.sig + gpgv codecov.SHA256SUM.sig codecov.SHA256SUM + shasum -a 256 -c codecov.SHA256SUM + chmod +x codecov + + - name: cache-qemu + id: cache-qemu + uses: cache@1.* + with: + cachePaths: qemu-install + cacheKey: qemu-riscv64-install-20220831 + + - name: checkout-qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + checkout: https://github.com/qemu/qemu.git + with: + pullType: COMMIT_ID + refName: 621da7789083b80d6f1ff1c0fb499334007b4f51 + localPath: qemu + enableSubmodule: false + enableGitLfs: false + + - name: qemu + if: steps.cache-qemu.outputs.cacheHit != 'true' + run: | + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye main' | tee -a /etc/apt/sources.list + echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list + apt-get update + apt-get build-dep -y qemu + cd qemu + wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + ./configure --prefix=${{ci.workspace}}/qemu-install --target-list=riscv64-linux-user --disable-system + make -j$(nproc) + make install + + - name: cache-rv64gcv + id: cache-rv64gcv + uses: cache@1.* + with: + cachePaths: rv64gcv-install + cacheKey: rv64gcv-linux-install-20221029 + + - name: checkout-riscv-gnu-toolchain + if: steps.cache-rv64gcv.outputs.cacheHit != 'true' + checkout: https://github.com/riscv/riscv-gnu-toolchain.git + with: + pullType: COMMIT_ID + refName: da01ba455ce3802ffa84fdca3a089079996dbfc3 + localPath: riscv-gnu-toolchain + enableSubmodule: false + enableGitLfs: false + + - name: riscv-gnu-toolchain + if: steps.cache-rv64gcv.outputs.cacheHit != 'true' + run: | + apt-get update + apt-get install -y autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler + cd riscv-gnu-toolchain + git submodule update --init --recursive --depth 1 glibc + git submodule update --init --recursive --depth 1 newlib + #git submodule update --init --recursive --depth 1 riscv-binutils + #git submodule update --init --recursive --depth 1 riscv-gcc + 
git submodule update --init --recursive --depth 1 riscv-dejagnu + git submodule update --init --recursive --depth 1 riscv-gdb + rm -rf riscv-binutils + git clone -b binutils-2_39-branch https://sourceware.org/git/binutils-gdb.git riscv-binutils + rm -rf riscv-gcc + git clone -b riscv-gcc-rvv-next https://github.com/riscv-collab/riscv-gcc.git riscv-gcc + cd riscv-gcc + git checkout 8a0c1b106f01c455a8fb478cfe52d859a69020fd + cd .. + sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c + ./configure --prefix=${{ci.workspace}}/rv64gcv-install --with-arch=rv64gcv_zfh + make linux -j$(nproc) + find ${{ci.workspace}}/rv64gcv-install -type f | xargs -i strip -g {} || true + + - name: build + run: | + export RISCV_ROOT_PATH=${{ci.workspace}}/rv64gcv-install + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DCMAKE_C_FLAGS="-O1" -DCMAKE_CXX_FLAGS="-O1" -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_OPENMP=${{matrix.OPENMP}} -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . -j $(nproc) + - name: test-vlen128 + run: | + export PATH=${{ci.workspace}}/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;${{ci.workspace}}/rv64gcv-install/sysroot" ctest --output-on-failure -j $(nproc) + - name: lcov-collect-vlen128 + run: | + cd build + lcov --gcov-tool ${{ci.workspace}}/rv64gcv-install/bin/riscv64-unknown-linux-gnu-gcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov -r lcov.info '*/rv64gcv-install/*' -o lcov.info + lcov --list lcov.info + - name: codecov-vlen128 + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info + - name: test-vlen256 + run: | + export PATH=${{ci.workspace}}/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;${{ci.workspace}}/rv64gcv-install/sysroot" ctest --output-on-failure -j $(nproc) + - name: lcov-collect-vlen256 + run: | + cd build + lcov --gcov-tool ${{ci.workspace}}/rv64gcv-install/bin/riscv64-unknown-linux-gnu-gcov -d ./src -c -o lcov.info + lcov -r lcov.info '/usr/*' -o lcov.info + lcov -r lcov.info '*/build/*' -o lcov.info + lcov -r lcov.info '*/rv64gcv-install/*' -o lcov.info + lcov --list lcov.info + - name: codecov-vlen256 + run: ./codecov -t ${{settings.CODECOV_TOKEN.access_token}} -C ${{ ci.sha }} -B ${{ ci.head_ref }} -f build/lcov.info diff --git a/.github/workflows/linux-loongarch64-cpu-gcc.yml b/.github/workflows/linux-loongarch64-cpu-gcc.yml index b2f6bc15f656..cbeef7aa4084 100644 --- a/.github/workflows/linux-loongarch64-cpu-gcc.yml +++ b/.github/workflows/linux-loongarch64-cpu-gcc.yml @@ -4,23 +4,25 @@ on: branches: [master] paths: - '.github/workflows/linux-loongarch64-cpu-gcc.yml' + - 'toolchains/loongarch64-linux-gnu.toolchain.cmake' - 'toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake' - 'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - - 'src/layer/loongarch64/**' + - 'src/layer/loongarch/**' - 'tests/**' pull_request: branches: [master] paths: - '.github/workflows/linux-loongarch64-cpu-gcc.yml' + - 'toolchains/loongarch64-linux-gnu.toolchain.cmake' - 'toolchains/loongarch64-unknown-linux-gnu.toolchain.cmake' - 
'CMakeLists.txt' - 'cmake/**' - 'src/*' - 'src/layer/*' - - 'src/layer/loongarch64/**' + - 'src/layer/loongarch/**' - 'tests/**' concurrency: group: linux-loongarch64-cpu-gcc-${{ github.ref }} @@ -56,6 +58,21 @@ jobs: run: | export PATH=$GITHUB_WORKSPACE:$PATH export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/cross-tools/target/usr/lib64:$LD_LIBRARY_PATH - export QEMU_STRACE=1 cd build TESTS_EXECUTABLE_LOADER=qemu-loongarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;$GITHUB_WORKSPACE/cross-tools/target/usr" ctest --output-on-failure -j 4 + + linux-gcc-loongarch64-lsx: + runs-on: [self-hosted, linux, centos] + + steps: + - uses: actions/checkout@v2 + + - name: configure + run: | + export LOONGARCH64_ROOT_PATH=/data/action/osd/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.1 + export LD_LIBRARY_PATH=/data/action/osd/loongson-gnu-toolchain-8.3-x86_64-loongarch64-linux-gnu-rc1.1/sysroot/usr/lib64:$LD_LIBRARY_PATH + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/loongarch64-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + + - name: build + run: cmake --build build -j 4 diff --git a/.github/workflows/linux-riscv64-cpu-gcc.yml b/.github/workflows/linux-riscv64-cpu-gcc.yml index 05e0487cf540..3bb9b3632965 100644 --- a/.github/workflows/linux-riscv64-cpu-gcc.yml +++ b/.github/workflows/linux-riscv64-cpu-gcc.yml @@ -80,6 +80,19 @@ jobs: cd build TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j 2 + linux-gcc-riscv64-c906: + runs-on: [self-hosted, linux, centos] + steps: + - uses: actions/checkout@v3 + + - name: configure + run: | + export RISCV_ROOT_PATH=/data/action/osd/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1 + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c906-v226.toolchain.cmake -DCMAKE_BUILD_TYPE=release -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON .. 
+ - name: build + run: cmake --build build -j 4 + linux-gcc-riscv64-rvv: runs-on: [self-hosted, linux, centos] steps: @@ -117,7 +130,7 @@ jobs: #id: cache-riscv #uses: actions/cache@v3 #with: - #path: rv64gcv-install + #path: rv64gcv-install-next #key: rv64gcv-linux-install-20210504 #- name: install-riscv-build-deps @@ -132,31 +145,31 @@ jobs: #with: #repository: riscv/riscv-gnu-toolchain #path: riscv-gnu-toolchain - #ref: 28271f03bb538d926ad2889dc8ad1b0cb1b3b45c + #ref: da01ba455ce3802ffa84fdca3a089079996dbfc3 #- name: checkout-riscv-gnu-toolchain-submodules #if: steps.cache-riscv.outputs.cache-hit != 'true' #run: | #cd riscv-gnu-toolchain + #git submodule update --init --recursive --depth 1 glibc + #git submodule update --init --recursive --depth 1 newlib #git submodule update --init --recursive --depth 1 riscv-binutils #git submodule update --init --recursive --depth 1 riscv-gcc - #git submodule update --init --recursive --depth 1 riscv-glibc #git submodule update --init --recursive --depth 1 riscv-dejagnu - #git submodule update --init --recursive --depth 1 riscv-newlib #git submodule update --init --recursive --depth 1 riscv-gdb #- name: riscv-gnu-toolchain #if: steps.cache-riscv.outputs.cache-hit != 'true' #run: | #cd riscv-gnu-toolchain - #sed -i '/__OBSOLETE_MATH/d' riscv-newlib/newlib/libm/common/math_errf.c - #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install --with-arch=rv64gcv_zfh + #sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c + #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install-next --with-arch=rv64gcv_zfh #make linux #- name: riscv-strip-install #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: find $GITHUB_WORKSPACE/rv64gcv-install -type f | xargs -i strip -g {} || true + #run: find $GITHUB_WORKSPACE/rv64gcv-install-next -type f | xargs -i strip -g {} || true - name: configure - run: export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + run: export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install-next && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- name: build run: cmake --build build -j 4 @@ -164,10 +177,10 @@ jobs: run: | export PATH=/data/action/osd/qemu-install/bin:$PATH cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install/sysroot" ctest --output-on-failure -j 4 + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 - name: test-vlen128 run: | export PATH=/data/action/osd/qemu-install/bin:$PATH cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install/sysroot" ctest --output-on-failure -j 4 + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 diff --git a/.github/workflows/linux-riscv64-cpu-gnu-clang.yml b/.github/workflows/linux-riscv64-cpu-gnu-clang.yml new file mode 100644 index 000000000000..18ad114efa49 --- /dev/null +++ b/.github/workflows/linux-riscv64-cpu-gnu-clang.yml @@ -0,0 +1,142 @@ +name: linux-riscv64-cpu-gnu-clang +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-riscv64-cpu-gnu-clang.yml' + - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-riscv64-cpu-gnu-clang.yml' + - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' +concurrency: + group: linux-riscv64-cpu-gnu-clang-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc-riscv64-rvv: + runs-on: [self-hosted, linux, centos] + steps: + - uses: actions/checkout@v3 + + #- name: cache-qemu + #id: cache-qemu + #uses: actions/cache@v3 + #with: + #path: qemu-install + #key: qemu-riscv64-install-20220502-3 + #- name: install-qemu-build-deps + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev ninja-build + #- name: checkout-qemu + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: qemu/qemu + #path: qemu + #ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + #- name: qemu + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #run: | + #cd qemu + #wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + #patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system + #make -j2 + #make install + + #- name: cache-riscv + #id: cache-riscv + #uses: actions/cache@v3 + #with: + #path: rv64gcv-install-next + #key: rv64gcv-linux-install-20210504 + + #- name: install-riscv-build-deps + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool 
patchutils bc zlib1g-dev libexpat-dev device-tree-compiler + + #- name: checkout-riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: riscv/riscv-gnu-toolchain + #path: riscv-gnu-toolchain + #ref: da01ba455ce3802ffa84fdca3a089079996dbfc3 + #- name: checkout-riscv-gnu-toolchain-submodules + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #git submodule update --init --recursive --depth 1 glibc + #git submodule update --init --recursive --depth 1 newlib + #git submodule update --init --recursive --depth 1 riscv-binutils + #git submodule update --init --recursive --depth 1 riscv-gcc + #git submodule update --init --recursive --depth 1 riscv-dejagnu + #git submodule update --init --recursive --depth 1 riscv-gdb + #- name: riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c + #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install-next --with-arch=rv64gcv_zfh + #make linux + + #- name: riscv-strip-install + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: find $GITHUB_WORKSPACE/rv64gcv-install-next -type f | xargs -i strip -g {} || true + + # - name: install-clang + # run: | + # wget https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.1/llvm-project-15.0.1.src.tar.xz + # tar -xf llvm-project-15.0.1.src.tar.xz + # cd llvm-project-15.0.1.src + # mkdir build + # cd build + # cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="RISCV" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF ../llvm/ + # make -j16 + # make install + + - name: build + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install-next + export PATH=/data/action/osd/llvm-project-15.0.1.src/build/install/bin:$PATH + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j 4 + + - name: test-vlen256 + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 + + - name: test-vlen128 + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 diff --git a/.github/workflows/pnnx.yml b/.github/workflows/pnnx.yml deleted file mode 100644 index 976f9a706d4c..000000000000 --- a/.github/workflows/pnnx.yml +++ /dev/null @@ -1,102 +0,0 @@ -name: pnnx -on: - push: - branches: [master] - paths: - - '.github/workflows/pnnx.yml' - - 'tools/pnnx/**' - - '!tools/pnnx/README.md' - pull_request: - branches: [master] - paths: - - '.github/workflows/pnnx.yml' - - 'tools/pnnx/**' - - '!tools/pnnx/README.md' -concurrency: - group: pnnx-${{ github.ref }} - cancel-in-progress: true -permissions: - contents: read - -jobs: - ubuntu: - runs-on: [self-hosted, linux, centos] - - strategy: - fail-fast: false - matrix: - include: - - torch-version: 1.8.1 - torchvision-version: 0.9.1 - - - torch-version: 1.9.1 - torchvision-version: 0.10.1 - - - torch-version: 1.10.0 - torchvision-version: 0.11.1 - - - torch-version: 1.11.0 - torchvision-version: 0.12.0 - - - torch-version: 1.12.0 - torchvision-version: 0.13.0 - - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - - name: setup pytorch-${{ matrix.torch-version }} - run: | - export PYTHONUSERBASE=$GITHUB_WORKSPACE/torch-${{ matrix.torch-version }} - pip install --user torch==${{ matrix.torch-version }}+cpu torchvision==${{ matrix.torchvision-version }}+cpu -f https://download.pytorch.org/whl/torch_stable.html - - - name: cache-torchvision-${{ matrix.torchvision-version }} - id: cache-torchvision - uses: actions/cache@v3 - with: - path: torchvision-${{ matrix.torchvision-version }}-install - key: torchvision-${{ matrix.torchvision-version }}-linux-install-20211228 - - name: checkout-torchvision-${{ matrix.torchvision-version }} - if: steps.cache-torchvision.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: pytorch/vision - path: vision - ref: v${{ matrix.torchvision-version }} - - name: torchvision-${{ matrix.torchvision-version }} - if: steps.cache-torchvision.outputs.cache-hit != 'true' - run: | - cd vision - mkdir -p build; cd build - cmake -DCMAKE_INSTALL_PREFIX=$GITHUB_WORKSPACE/torchvision-${{ matrix.torchvision-version }}-install -DTorch_DIR=$GITHUB_WORKSPACE/torch-${{ matrix.torch-version }}/lib/python3.8/site-packages/torch/share/cmake/Torch -DCMAKE_BUILD_TYPE=Release .. - cmake --build . -j 4 - cmake --build . --target install - - - name: build-ncnn - run: | - export PYTHONUSERBASE=$GITHUB_WORKSPACE/torch-${{ matrix.torch-version }} - pip install --user pytest setuptools wheel twine - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DNCNN_PYTHON=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF .. - cmake --build . -j 4 - cd .. - pip install --user . 
- - - name: build-pnnx - run: | - export PYTHONUSERBASE=$GITHUB_WORKSPACE/torch-${{ matrix.torch-version }} - cd tools/pnnx - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Release -DTorchVision_INSTALL_DIR=$GITHUB_WORKSPACE/torchvision-${{ matrix.torchvision-version }}-install .. - cmake --build . -j 4 - - - name: test - run: | - export PYTHONUSERBASE=$GITHUB_WORKSPACE/torch-${{ matrix.torch-version }} - export OMP_NUM_THREADS=1 - export MKL_NUM_THREADS=1 - export MKL_ENABLE_INSTRUCTIONS=SSE4_2 - pip install --upgrade requests - cd tools/pnnx - cd build && ctest --output-on-failure -j 4 diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml index abbf1cccc9d9..25c75230b9e8 100644 --- a/.github/workflows/release-python.yml +++ b/.github/workflows/release-python.yml @@ -1,4 +1,5 @@ name: release-python +# on: [push, pull_request] on: push: tags: @@ -31,12 +32,26 @@ jobs: path: dist/*.tar.gz build_wheels: - name: Build wheels on ${{ matrix.os }} + name: ${{ matrix.arch }} ${{ matrix.build }} on ${{ matrix.os }} runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: - os: [ubuntu-20.04, windows-2019, macos-11] + include: + - { os: ubuntu-20.04, arch: x86_64, build: 'cp*-manylinux*' } + - { os: ubuntu-20.04, arch: x86_64, build: 'cp*-musllinux*' } + - { os: ubuntu-20.04, arch: x86_64, build: 'pp*' } + - { os: ubuntu-20.04, arch: i686, build: 'cp*-manylinux*' } + - { os: ubuntu-20.04, arch: i686, build: 'cp*-musllinux*' } + - { os: ubuntu-20.04, arch: i686, build: 'pp*' } + - { os: windows-2019, arch: x86, build: 'cp*' } + - { os: windows-2019, arch: AMD64, build: 'cp*' } + - { os: windows-2019, arch: AMD64, build: 'pp*' } + - { os: windows-2019, arch: ARM64, build: 'cp*' } + - { os: macos-11, arch: x86_64, build: 'cp*' } + - { os: macos-11, arch: x86_64, build: 'pp*' } + - { os: macos-11, arch: arm64, build: 'cp*' } + - { os: macos-11, arch: universal2, build: 'cp*' } steps: - uses: actions/checkout@v3 @@ -53,10 +68,14 @@ jobs: brew uninstall --ignore-dependencies libomp - name: Build wheels - uses: pypa/cibuildwheel@v2.9.0 + uses: pypa/cibuildwheel@v2.11.2 env: - CIBW_ARCHS_MACOS: x86_64 universal2 arm64 - CIBW_ARCHS_LINUX: x86_64 i686 + CIBW_ARCHS_MACOS: ${{ matrix.arch }} + CIBW_ARCHS_LINUX: ${{ matrix.arch }} + CIBW_ARCHS_WINDOWS: ${{ matrix.arch }} + CIBW_BUILD: ${{ matrix.build }} + CIBW_BUILD_VERBOSITY: 1 + CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=2 - name: Show files run: ls -lh wheelhouse @@ -72,21 +91,21 @@ jobs: path: wheelhouse/*.whl build_wheels_qemu: - name: Build wheels ${{ matrix.arch }} ${{ matrix.build }} + name: ${{ matrix.arch }} ${{ matrix.build }} runs-on: ubuntu-20.04 strategy: fail-fast: false matrix: arch: [aarch64, ppc64le, s390x] - build: ["cp36-*", "cp37-*", "cp38-*", "cp39-*", "cp310-*"] + build: ['cp36-*', 'cp37-*', 'cp38-*', 'cp39-*', 'cp310-*', 'cp311-*'] include: - arch: aarch64 - build: "pp37-*" + build: 'pp37-*' - arch: aarch64 - build: "pp38-*" + build: 'pp38-*' - arch: aarch64 - build: "pp39-*" + build: 'pp39-*' steps: - uses: actions/checkout@v3 @@ -103,10 +122,12 @@ jobs: platforms: all - name: Build wheels - uses: pypa/cibuildwheel@v2.9.0 + uses: pypa/cibuildwheel@v2.11.2 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} + CIBW_BUILD_VERBOSITY: 1 + CIBW_ENVIRONMENT: CMAKE_BUILD_PARALLEL_LEVEL=2 - name: Show files run: ls -lh wheelhouse @@ -138,7 +159,7 @@ jobs: name: artifact path: dist - - uses: pypa/gh-action-pypi-publish@v1.5.1 + - uses: pypa/gh-action-pypi-publish@release/v1 
with: user: __token__ password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9add249d2391..d17626167906 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -22,7 +22,7 @@ jobs: steps: - name: get-version id: get_version - run: echo ::set-output name=VERSION::${GITHUB_REF/refs\/tags\//} + run: echo "VERSION=${GITHUB_REF/refs\/tags\//}" >> $GITHUB_OUTPUT full-source: needs: [setup] @@ -237,7 +237,7 @@ jobs: path: ${{ env.PACKAGENAME }}.zip openmp-macos: - runs-on: macos-latest + runs-on: macos-11 steps: - name: cache-openmp id: cache-openmp @@ -290,7 +290,7 @@ jobs: macos: needs: [setup, openmp-macos] - runs-on: macos-latest + runs-on: macos-11 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-macos steps: @@ -358,7 +358,7 @@ jobs: macos-gpu: needs: [setup, openmp-macos] - runs-on: macos-latest + runs-on: macos-11 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-macos-vulkan steps: @@ -454,7 +454,7 @@ jobs: path: ${{ env.PACKAGENAME }}.zip openmp-ios: - runs-on: macos-latest + runs-on: macos-11 steps: - name: cache-openmp id: cache-openmp @@ -511,7 +511,7 @@ jobs: ios: needs: [setup, openmp-ios] - runs-on: macos-latest + runs-on: macos-11 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios steps: @@ -594,7 +594,7 @@ jobs: ios-gpu: needs: [setup, openmp-ios] - runs-on: macos-latest + runs-on: macos-11 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan steps: @@ -693,7 +693,7 @@ jobs: path: ${{ env.PACKAGENAME }}.zip openmp-ios-bitcode: - runs-on: macos-latest + runs-on: macos-11 steps: - name: cache-openmp id: cache-openmp @@ -750,7 +750,7 @@ jobs: ios-bitcode: needs: [setup, openmp-ios-bitcode] - runs-on: macos-latest + runs-on: macos-11 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-bitcode steps: @@ -833,7 +833,7 @@ jobs: ios-gpu-bitcode: needs: [setup, openmp-ios-bitcode] - runs-on: macos-latest + runs-on: macos-11 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-ios-vulkan-bitcode steps: @@ -1267,6 +1267,7 @@ jobs: runs-on: windows-2019 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2015 + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: @@ -1332,6 +1333,7 @@ jobs: runs-on: windows-2019 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2015-shared + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: @@ -1397,6 +1399,7 @@ jobs: runs-on: windows-2019 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2017 + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: @@ -1462,6 +1465,7 @@ jobs: runs-on: windows-2019 env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2017-shared + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: @@ -1527,6 +1531,7 @@ jobs: runs-on: windows-latest env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2019 + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: @@ -1608,6 +1613,7 @@ jobs: runs-on: windows-latest env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2019-shared + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: @@ -1689,6 +1695,7 @@ jobs: runs-on: windows-latest env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION }}-windows-vs2022 + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: @@ -1770,6 +1777,7 @@ jobs: runs-on: windows-latest env: PACKAGENAME: ncnn-${{ needs.setup.outputs.VERSION 
}}-windows-vs2022-shared + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: diff --git a/.github/workflows/test-coverage.yml b/.github/workflows/test-coverage.yml index f84eeb25222d..87401acd00fa 100644 --- a/.github/workflows/test-coverage.yml +++ b/.github/workflows/test-coverage.yml @@ -25,112 +25,6 @@ permissions: contents: read jobs: - linux-gcc-gpu: - runs-on: [self-hosted, linux, cvm] - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - name: cache-swiftshader - id: cache-swiftshader - uses: actions/cache@v3 - with: - path: swiftshader-install - key: swiftshader-linux-install-20220211 - - name: checkout-swiftshader - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: google/swiftshader - path: swiftshader - ref: 0863290dc7f6cc1649bab1858790e812b8aef02a - - name: checkout-swiftshader-submodules - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - run: | - cd swiftshader - git -c submodule."third_party/git-hooks".update=none submodule update --init --recursive - - name: swiftshader - if: steps.cache-swiftshader.outputs.cache-hit != 'true' - run: | - cd swiftshader - mkdir -p build; cd build - cmake -DCMAKE_INSTALL_PREFIX=install -DSWIFTSHADER_BUILD_EGL=FALSE -DSWIFTSHADER_BUILD_GLESv2=FALSE -DSWIFTSHADER_BUILD_GLES_CM=FALSE -DSWIFTSHADER_BUILD_VULKAN=TRUE -DSWIFTSHADER_BUILD_PVR=FALSE -DSWIFTSHADER_BUILD_TESTS=FALSE -DSWIFTSHADER_ENABLE_ASTC=FALSE -DSWIFTSHADER_WARNINGS_AS_ERRORS=FALSE -DREACTOR_BACKEND=Subzero -DREACTOR_DEFAULT_OPT_LEVEL=Default -DCMAKE_BUILD_TYPE=Release .. - cmake --build . -j 4 - mkdir $GITHUB_WORKSPACE/swiftshader-install - cp Linux/* $GITHUB_WORKSPACE/swiftshader-install - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 4 - - name: test - run: | - printf "[Processor]\nThreadCount=1\n" > build/tests/SwiftShader.ini - export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/swiftshader-install/vk_swiftshader_icd.json" - cd build && ctest --output-on-failure -j 4 - - name: lcov-collect - run: | - cd build - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov -r lcov.info '*/glslang/*' -o lcov.info - lcov --list lcov.info - - name: codecov - id: codecov - continue-on-error: true - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-1 - continue-on-error: true - id: codecov-vlen256-retry-1 - if: steps.codecov.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-2 - continue-on-error: true - id: codecov-vlen256-retry-2 - if: steps.codecov-vlen256-retry-1.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-3 - continue-on-error: true - id: codecov-vlen256-retry-3 - if: steps.codecov-vlen256-retry-2.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-4 - continue-on-error: true - id: codecov-vlen256-retry-4 - if: steps.codecov-vlen256-retry-3.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-5 - continue-on-error: true - id: codecov-vlen256-retry-5 - if: steps.codecov-vlen256-retry-4.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: set the status - if: always() - run: | - if ${{ steps.codecov.outcome=='success' || steps.codecov-vlen256-retry-1.outcome=='success' || steps.codecov-vlen256-retry-2.outcome=='success' || steps.codecov-vlen256-retry-3.outcome=='success' || steps.codecov-vlen256-retry-4.outcome=='success' || steps.codecov-vlen256-retry-5.outcome=='success' }}; then - echo fine - else - exit 1 - fi - linux-gcc-gpu-t4: runs-on: [self-hosted, linux, t4] steps: @@ -215,1144 +109,39 @@ jobs: exit 1 fi - linux-gcc-gpu-lavapipe: - runs-on: [self-hosted, linux, cvm] - steps: - - uses: actions/checkout@v3 - with: - submodules: true - - - name: cache-lavapipe - id: cache-lavapipe - uses: actions/cache@v3 - with: - path: lavapipe-install - key: lavapipe-linux-install-20211127-2 - - name: checkout-lavapipe - if: steps.cache-lavapipe.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: mesa3d/mesa - path: mesa - ref: cd39180cfab20734744b379b085cc3b5c2cecd3a - - name: lavapipe - if: steps.cache-lavapipe.outputs.cache-hit != 'true' - run: | - mkdir -p "$GITHUB_WORKSPACE/lavapipe-install" - cd mesa - mkdir build - cd build - meson -Dprefix="$GITHUB_WORKSPACE/lavapipe-install" -Dbuildtype=release -Db_lto=true -Db_ndebug=true -Dplatforms="x11" -Ddri3=enabled -Ddri-drivers="" -Dgallium-drivers=swrast -Dgallium-vdpau=disabled -Dgallium-xvmc=disabled -Dgallium-omx=disabled -Dgallium-va=disabled -Dgallium-xa=disabled -Dgallium-opencl=disabled -Dopencl-native=false -Dvulkan-drivers=swrast -Dshader-cache=disabled -Dgles1=disabled -Dgles2=disabled -Dopengl=false -Dgbm=disabled -Dglx=disabled -Degl=disabled -Dllvm=enabled 
-Dvalgrind=disabled -Dlibunwind=disabled -Dlmsensors=disabled .. - ninja -j4 - ninja install - sed -ie "s@$GITHUB_WORKSPACE/lavapipe-install/lib/x86_64-linux-gnu/libvulkan_lvp.so@../../../lib/x86_64-linux-gnu/libvulkan_lvp.so@g" $GITHUB_WORKSPACE/lavapipe-install/share/vulkan/icd.d/lvp_icd.x86_64.json - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_VULKAN=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - - name: build - run: cmake --build build -j 4 - - name: test - run: | - export VK_ICD_FILENAMES="$GITHUB_WORKSPACE/lavapipe-install/share/vulkan/icd.d/lvp_icd.x86_64.json" - cd build && ctest --output-on-failure -j 4 - - name: lcov-collect - run: | - cd build - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov -r lcov.info '*/glslang/*' -o lcov.info - lcov --list lcov.info - - name: codecov - id: codecov - continue-on-error: true - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-1 - continue-on-error: true - id: codecov-vlen256-retry-1 - if: steps.codecov.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-2 - continue-on-error: true - id: codecov-vlen256-retry-2 - if: steps.codecov-vlen256-retry-1.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-3 - continue-on-error: true - id: codecov-vlen256-retry-3 - if: steps.codecov-vlen256-retry-2.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-4 - continue-on-error: true - id: codecov-vlen256-retry-4 - if: steps.codecov-vlen256-retry-3.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-5 - continue-on-error: true - id: codecov-vlen256-retry-5 - if: steps.codecov-vlen256-retry-4.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: set the status - if: always() - run: | - if ${{ steps.codecov.outcome=='success' || steps.codecov-vlen256-retry-1.outcome=='success' || steps.codecov-vlen256-retry-2.outcome=='success' || steps.codecov-vlen256-retry-3.outcome=='success' || steps.codecov-vlen256-retry-4.outcome=='success' || steps.codecov-vlen256-retry-5.outcome=='success' }}; then - echo fine - else - exit 1 - fi - - linux-gcc-x64: - runs-on: ubuntu-latest + linux-gcc-x64-avx512-spr: + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v3 + - name: update + run: sudo apt-get update + - name: gcc12 + run: sudo apt-get install gcc-12 g++-12 - name: lcov run: sudo apt-get install lcov - - name: build-sse2 - run: | - mkdir build-sse2 && cd build-sse2 - cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX=OFF -DNCNN_AVX2=OFF -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 2 - - name: test-sse2 - run: cd build-sse2 && ctest --output-on-failure -j 2 - - name: lcov-collect - run: | - cd build-sse2 - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build-sse2/*' -o lcov.info - lcov --list lcov.info - - name: codecov-sse2 - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build-sse2/lcov.info - - - name: build-avx - run: | - mkdir build-avx && cd build-avx - cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=OFF -DNCNN_AVX=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . -j 2 - - name: test-avx - run: cd build-avx && ctest --output-on-failure -j 2 - - name: lcov-collect - run: | - cd build-avx - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build-avx/*' -o lcov.info - lcov --list lcov.info - - name: codecov-avx - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build-avx/lcov.info - - - name: build-avx2 + - name: Setup SDE binaries + uses: petarpetrovt/setup-sde@v2 + - name: build-avx512-spr + env: + CC: gcc-12 + CXX: g++-12 run: | - mkdir build-avx2 && cd build-avx2 - cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_OPENMP=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + mkdir build-avx512-spr && cd build-avx512-spr + cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_AVX512BF16=ON -DNCNN_AVX512FP16=ON -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. cmake --build . -j 2 - - name: test-avx2 - run: cd build-avx2 && ctest --output-on-failure -j 2 - - name: lcov-collect - run: | - cd build-avx2 - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build-avx2/*' -o lcov.info - lcov --list lcov.info - - name: codecov-avx2 - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build-avx2/lcov.info - - linux-gcc-x64-avx512: - runs-on: [self-hosted, linux, t4] - steps: - - uses: actions/checkout@v3 - - name: build - env: - CC: gcc - CXX: g++ - LD_LIBRARY_PATH: /data/action/install/lib64 + - name: test-avx512-spr run: | - mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 4 - - name: test - env: - LD_LIBRARY_PATH: /data/action/install/lib64 - run: cd build && ctest --output-on-failure -j 4 + cd build-avx512-spr + TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j 2 - name: lcov-collect run: | - cd build - lcov -d ./src -c -o lcov.info + cd build-avx512-spr + lcov --gcov-tool gcov-12 -d ./src -c -o lcov.info lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/install/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info + lcov -r lcov.info '*/build-avx512-spr/*' -o lcov.info lcov --list lcov.info - - name: codecov - id: codecov - continue-on-error: true - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-1 - continue-on-error: true - id: codecov-vlen256-retry-1 - if: steps.codecov.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-2 - continue-on-error: true - id: codecov-vlen256-retry-2 - if: steps.codecov-vlen256-retry-1.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-3 - continue-on-error: true - id: codecov-vlen256-retry-3 - if: steps.codecov-vlen256-retry-2.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-4 - continue-on-error: true - id: codecov-vlen256-retry-4 - if: steps.codecov-vlen256-retry-3.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-5 - continue-on-error: true - id: codecov-vlen256-retry-5 - if: steps.codecov-vlen256-retry-4.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: set the status - if: always() - run: | - if ${{ steps.codecov.outcome=='success' || steps.codecov-vlen256-retry-1.outcome=='success' || steps.codecov-vlen256-retry-2.outcome=='success' || steps.codecov-vlen256-retry-3.outcome=='success' || steps.codecov-vlen256-retry-4.outcome=='success' || steps.codecov-vlen256-retry-5.outcome=='success' }}; then - echo fine - else - exit 1 - fi - - linux-gcc-x64-avx512-spr: - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v3 - - name: update - run: sudo apt-get update - - name: gcc12 - run: sudo apt-get install gcc-12 g++-12 - - name: lcov - run: sudo apt-get install lcov - - name: Setup SDE binaries - uses: petarpetrovt/setup-sde@v2 - - name: build-avx512-spr - env: - CC: gcc-12 - CXX: g++-12 - run: | - mkdir build-avx512-spr && cd build-avx512-spr - cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_AVX512BF16=ON -DNCNN_AVX512FP16=ON -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 2 - - name: test-avx512-spr - run: | - cd build-avx512-spr - TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j 2 - - name: lcov-collect - run: | - cd build-avx512-spr - lcov --gcov-tool gcov-12 -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build-avx512-spr/*' -o lcov.info - lcov --list lcov.info - - name: codecov-avx512-spr + - name: codecov-avx512-spr uses: codecov/codecov-action@v3 with: token: ${{ secrets.CODECOV_TOKEN }} file: build-avx512-spr/lcov.info - - linux-gcc-armhf-vfpv3-d16: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - name: lcov - run: sudo apt-get install lcov - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-arm-install-20220502 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=arm-linux-user --disable-system - make -j2 - make install - - - name: arm-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-arm-linux-gnueabihf - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_VFPV4=OFF -DNCNN_ARM82=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabihf" ctest --output-on-failure -j 2 - - - name: lcov-collect - run: | - cd build - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - linux-gcc-arm: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - name: lcov - run: sudo apt-get install lcov - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-arm-install-20220502 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=arm-linux-user --disable-system - make -j2 - make install - - - name: arm-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-arm-linux-gnueabi - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/arm-linux-gnueabi.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_VFPV4=ON -DNCNN_ARM82=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-arm TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/arm-linux-gnueabi" ctest --output-on-failure -j 2 - - - name: lcov-collect - run: | - cd build - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - linux-gcc-aarch64: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - name: lcov - run: sudo apt-get install lcov - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-aarch64-install-20220502 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system - make -j2 - make install - - - name: aarch64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-aarch64-linux-gnu - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_ARM82=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 - - - name: lcov-collect - run: | - cd build - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - linux-gcc-arm82: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: lcov - run: sudo apt-get install lcov - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-aarch64-install-20220502 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system - make -j2 - make install - - - name: aarch64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-aarch64-linux-gnu - - - name: build-arm82 - run: | - mkdir build-arm82 && cd build-arm82 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_ARM82=ON -DNCNN_ARM82FP16FML=OFF -DNCNN_ARM84BF16=OFF -DNCNN_ARM84I8MM=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 2 - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build-arm82 - TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 - - name: lcov-collect - run: | - cd build-arm82 - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build-arm82/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build-arm82/lcov.info - - linux-gcc-arm82-omp: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: lcov - run: sudo apt-get install lcov - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-aarch64-install-20220502 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system - make -j2 - make install - - - name: aarch64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-aarch64-linux-gnu - - - name: build-arm82-omp - run: | - mkdir build-arm82-omp && cd build-arm82-omp - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_ARM82=ON -DNCNN_ARM82FP16FML=OFF -DNCNN_ARM84BF16=OFF -DNCNN_ARM84I8MM=OFF -DNCNN_OPENMP=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 2 - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build-arm82-omp - TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 - - name: lcov-collect - run: | - cd build-arm82-omp - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build-arm82-omp/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build-arm82-omp/lcov.info - - linux-gcc-arm82dot-omp: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: lcov - run: sudo apt-get install lcov - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-aarch64-install-20220502 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system - make -j2 - make install - - - name: aarch64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-aarch64-linux-gnu - - - name: build-arm82dot-omp - run: | - mkdir build-arm82dot-omp && cd build-arm82dot-omp - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=ON -DNCNN_ARM82=ON -DNCNN_ARM82DOT=ON -DNCNN_ARM82FP16FML=OFF -DNCNN_ARM84BF16=OFF -DNCNN_ARM84I8MM=OFF -DNCNN_OPENMP=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 2 - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build-arm82dot-omp - TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 - - name: lcov-collect - run: | - cd build-arm82dot-omp - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build-arm82dot-omp/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build-arm82dot-omp/lcov.info - - linux-gcc-arm84: - runs-on: ubuntu-22.04 - steps: - - uses: actions/checkout@v3 - - - name: lcov - run: sudo apt-get install lcov - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-aarch64-install-20220502 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=aarch64-linux-user --disable-system - make -j2 - make install - - - name: aarch64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-aarch64-linux-gnu - - - name: build-arm84 - run: | - mkdir build-arm84 && cd build-arm84 - cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/aarch64-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=ON -DNCNN_ARM82=ON -DNCNN_ARM82DOT=ON -DNCNN_ARM82FP16FML=ON -DNCNN_ARM84BF16=ON -DNCNN_ARM84I8MM=ON -DNCNN_OPENMP=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - cmake --build . 
-j 2 - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build-arm84 - TESTS_EXECUTABLE_LOADER=qemu-aarch64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/aarch64-linux-gnu" ctest --output-on-failure -j 2 - - name: lcov-collect - run: | - cd build-arm84 - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build-arm84/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build-arm84/lcov.info - - linux-gcc-mipsisa32r6el: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: lcov - run: sudo apt-get install lcov - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-mipsel-install-20220502 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mipsel-linux-user --disable-system - make -j2 - make install - - - name: mipsisa32r6el-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-mipsisa32r6el-linux-gnu - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa32r6el-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_MSA=OFF -DNCNN_MMI=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-mipsel TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa32r6el-linux-gnu" ctest --output-on-failure -j 2 - - - name: lcov-collect - run: | - cd build - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - linux-gcc-mipsisa64r6el: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: lcov - run: sudo apt-get install lcov - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-mips64el-install-20220502-3 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0001-target-mips-Fix-SAT_S-trans-helper.patch - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0002-target-mips-Fix-df_extract_val-and-df_extract_df-dfe.patch - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0003-target-mips-Fix-msa-checking-condition-in-trans_msa_.patch - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0004-target-mips-Do-not-treat-msa-INSERT-as-NOP-when-wd-i.patch - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0005-target-mips-Fix-FTRUNC_S-and-FTRUNC_U-trans-helper.patch - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0006-target-mips-Fix-store-adress-of-high-64bit-in-helper.patch - patch -p1 -i 0001-target-mips-Fix-SAT_S-trans-helper.patch - patch -p1 -i 0002-target-mips-Fix-df_extract_val-and-df_extract_df-dfe.patch - patch -p1 -i 0003-target-mips-Fix-msa-checking-condition-in-trans_msa_.patch - patch -p1 -i 0004-target-mips-Do-not-treat-msa-INSERT-as-NOP-when-wd-i.patch - patch -p1 -i 0005-target-mips-Fix-FTRUNC_S-and-FTRUNC_U-trans-helper.patch - patch -p1 -i 0006-target-mips-Fix-store-adress-of-high-64bit-in-helper.patch - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system - make -j2 - make install - - - name: mipsisa64r6el-gnuabi64-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-mipsisa64r6el-linux-gnuabi64 - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/mipsisa64r6el-linux-gnuabi64.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_MSA=ON -DNCNN_MMI=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-mips64el TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/mipsisa64r6el-linux-gnuabi64" ctest --output-on-failure -j 2 - - - name: lcov-collect - run: | - cd build - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - linux-gcc-riscv64: - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - - name: lcov - run: sudo apt-get install lcov - - - name: cache-qemu - id: cache-qemu - uses: actions/cache@v3 - with: - path: qemu-install - key: qemu-riscv64-install-20220502-3 - - name: install-qemu-build-deps - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - sudo apt-get update - sudo apt-get install autoconf automake autotools-dev ninja-build - - name: checkout-qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - uses: actions/checkout@v3 - with: - repository: qemu/qemu - path: qemu - ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - - name: qemu - if: steps.cache-qemu.outputs.cache-hit != 'true' - run: | - cd qemu - wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system - make -j2 - make install - - - name: riscv64-gnu-toolchain - run: | - sudo apt-get update - sudo apt-get install g++-riscv64-linux-gnu - - - name: configure - run: mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
- - name: build - run: cmake --build build -j 2 - - - name: test - run: | - export PATH=$GITHUB_WORKSPACE/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-L;/usr/riscv64-linux-gnu" ctest --output-on-failure -j 2 - - - name: lcov-collect - run: | - cd build - lcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov --list lcov.info - - name: codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - linux-gcc-riscv64-rvv: - runs-on: [self-hosted, linux, centos] - steps: - - uses: actions/checkout@v3 - - #- name: lcov - #run: sudo apt-get install lcov - - #- name: cache-qemu - #id: cache-qemu - #uses: actions/cache@v3 - #with: - #path: qemu-install - #key: qemu-riscv64-install-20220502-3 - #- name: install-qemu-build-deps - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #run: | - #sudo apt-get update - #sudo apt-get install autoconf automake autotools-dev ninja-build - #- name: checkout-qemu - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #uses: actions/checkout@v3 - #with: - #repository: qemu/qemu - #path: qemu - #ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 - #- name: qemu - #if: steps.cache-qemu.outputs.cache-hit != 'true' - #run: | - #cd qemu - #wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - #patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch - #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system - #make -j2 - #make install - - #- name: cache-riscv - #id: cache-riscv - #uses: actions/cache@v3 - #with: - #path: rv64gcv-install - #key: rv64gcv-linux-install-20210504 - - #- name: install-riscv-build-deps - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #sudo apt-get update - #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool patchutils bc zlib1g-dev libexpat-dev device-tree-compiler - - #- name: checkout-riscv-gnu-toolchain - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #uses: actions/checkout@v3 - #with: - #repository: riscv/riscv-gnu-toolchain - #path: riscv-gnu-toolchain - #ref: 28271f03bb538d926ad2889dc8ad1b0cb1b3b45c - #- name: checkout-riscv-gnu-toolchain-submodules - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-gnu-toolchain - #git submodule update --init --recursive --depth 1 riscv-binutils - #git submodule update --init --recursive --depth 1 riscv-gcc - #git submodule update --init --recursive --depth 1 riscv-glibc - #git submodule update --init --recursive --depth 1 riscv-dejagnu - #git submodule update --init --recursive --depth 1 riscv-newlib - #git submodule update --init --recursive --depth 1 riscv-gdb - #- name: riscv-gnu-toolchain - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: | - #cd riscv-gnu-toolchain - #sed -i '/__OBSOLETE_MATH/d' riscv-newlib/newlib/libm/common/math_errf.c - #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install --with-arch=rv64gcv_zfh - #make linux - - #- name: riscv-strip-install - #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: find $GITHUB_WORKSPACE/rv64gcv-install -type f | xargs -i strip -g {} || true - - - name: configure - run: export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install && mkdir build && cd 
build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DCMAKE_BUILD_TYPE=debug -DCMAKE_C_FLAGS="-O1" -DCMAKE_CXX_FLAGS="-O1" -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. - - name: build - run: cmake --build build -j 4 - - - name: test-vlen128 - run: | - export PATH=/data/action/osd/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install/sysroot" ctest --output-on-failure -j 4 - - - name: lcov-collect-vlen128 - run: | - cd build - lcov --gcov-tool /data/action/osd/rv64gcv-install/bin/riscv64-unknown-linux-gnu-gcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov -r lcov.info '*/rv64gcv-install/*' -o lcov.info - lcov --list lcov.info - - name: codecov-vlen128 - id: codecov-vlen128 - continue-on-error: true - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen128-retry-1 - continue-on-error: true - id: codecov-vlen128-retry-1 - if: steps.codecov-vlen128.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen128-retry-2 - continue-on-error: true - id: codecov-vlen128-retry-2 - if: steps.codecov-vlen128-retry-1.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen128-retry-3 - continue-on-error: true - id: codecov-vlen128-retry-3 - if: steps.codecov-vlen128-retry-2.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen128-retry-4 - continue-on-error: true - id: codecov-vlen128-retry-4 - if: steps.codecov-vlen128-retry-3.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen128-retry-5 - continue-on-error: true - id: codecov-vlen128-retry-5 - if: steps.codecov-vlen128-retry-4.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: set codecov-vlen128 status - if: always() - run: | - if ${{ steps.codecov-vlen128.outcome=='success' || steps.codecov-vlen128-retry-1.outcome=='success' || steps.codecov-vlen128-retry-2.outcome=='success' || steps.codecov-vlen128-retry-3.outcome=='success' || steps.codecov-vlen128-retry-4.outcome=='success' || steps.codecov-vlen128-retry-5.outcome=='success' }}; then - echo fine - else - exit 1 - fi - - - name: test-vlen256 - run: | - export PATH=/data/action/osd/qemu-install/bin:$PATH - cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install/sysroot" ctest --output-on-failure -j 4 - - - name: lcov-collect-vlen256 - run: | - cd build - lcov --gcov-tool /data/action/osd/rv64gcv-install/bin/riscv64-unknown-linux-gnu-gcov -d ./src -c -o lcov.info - lcov -r lcov.info '/usr/*' -o lcov.info - lcov -r lcov.info '*/build/*' -o lcov.info - lcov -r lcov.info '*/rv64gcv-install/*' -o lcov.info - lcov --list lcov.info - - name: codecov-vlen256 - id: codecov-vlen256 - continue-on-error: true - uses: 
codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-1 - continue-on-error: true - id: codecov-vlen256-retry-1 - if: steps.codecov-vlen256.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-2 - continue-on-error: true - id: codecov-vlen256-retry-2 - if: steps.codecov-vlen256-retry-1.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-3 - continue-on-error: true - id: codecov-vlen256-retry-3 - if: steps.codecov-vlen256-retry-2.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-4 - continue-on-error: true - id: codecov-vlen256-retry-4 - if: steps.codecov-vlen256-retry-3.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: codecov-vlen256-retry-5 - continue-on-error: true - id: codecov-vlen256-retry-5 - if: steps.codecov-vlen256-retry-4.outcome=='failure' - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: build/lcov.info - - name: set codecov-vlen256 status - if: always() - run: | - if ${{ steps.codecov-vlen256.outcome=='success' || steps.codecov-vlen256-retry-1.outcome=='success' || steps.codecov-vlen256-retry-2.outcome=='success' || steps.codecov-vlen256-retry-3.outcome=='success' || steps.codecov-vlen256-retry-4.outcome=='success' || steps.codecov-vlen256-retry-5.outcome=='success' }}; then - echo fine - else - exit 1 - fi diff --git a/.github/workflows/windows-arm-cpu.yml b/.github/workflows/windows-arm-cpu.yml index 2b1441ed74b0..d789482a5951 100644 --- a/.github/workflows/windows-arm-cpu.yml +++ b/.github/workflows/windows-arm-cpu.yml @@ -41,6 +41,8 @@ jobs: toolset-version: v143 os: windows-2022 + env: + UseMultiToolTask: true steps: - uses: actions/checkout@v3 - name: build diff --git a/.github/workflows/windows-arm64-cpu.yml b/.github/workflows/windows-arm64-cpu.yml index 49cd6e200ffe..a6bdbda01dee 100644 --- a/.github/workflows/windows-arm64-cpu.yml +++ b/.github/workflows/windows-arm64-cpu.yml @@ -41,6 +41,8 @@ jobs: toolset-version: v143 os: windows-2022 + env: + UseMultiToolTask: true steps: - uses: actions/checkout@v3 - name: build diff --git a/.github/workflows/windows-x64-cpu-vs2019-python.yml b/.github/workflows/windows-x64-cpu-vs2019-python.yml index 7ef6a1adcec0..3d4e6583766b 100644 --- a/.github/workflows/windows-x64-cpu-vs2019-python.yml +++ b/.github/workflows/windows-x64-cpu-vs2019-python.yml @@ -32,6 +32,8 @@ jobs: strategy: matrix: python-version: [3.6, 3.7, 3.8, 3.9] + env: + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: diff --git a/.github/workflows/windows-x64-cpu.yml b/.github/workflows/windows-x64-cpu.yml index c93fc8adfe12..200185d1a56a 100644 --- a/.github/workflows/windows-x64-cpu.yml +++ b/.github/workflows/windows-x64-cpu.yml @@ -55,6 +55,8 @@ jobs: toolset-version: v143 os: windows-2022 + env: + UseMultiToolTask: true steps: - uses: actions/checkout@v3 - name: cache-protobuf diff --git a/.github/workflows/windows-x64-gpu.yml b/.github/workflows/windows-x64-gpu.yml index 4101e43ce25c..3d707c17052a 100644 --- a/.github/workflows/windows-x64-gpu.yml +++ b/.github/workflows/windows-x64-gpu.yml @@ -57,6 +57,8 @@ jobs: toolset-version: v143 os: 
windows-2022 + env: + UseMultiToolTask: true steps: - uses: actions/checkout@v3 with: diff --git a/.github/workflows/windows-x86-cpu.yml b/.github/workflows/windows-x86-cpu.yml index 8e692e0d695a..b48431a97ac1 100644 --- a/.github/workflows/windows-x86-cpu.yml +++ b/.github/workflows/windows-x86-cpu.yml @@ -49,6 +49,8 @@ jobs: toolset-version: v143 os: windows-2022 + env: + UseMultiToolTask: true steps: - uses: actions/checkout@v3 - name: build diff --git a/.gitignore b/.gitignore index de1330fdeb7f..aa8ea4ddcb2e 100644 --- a/.gitignore +++ b/.gitignore @@ -46,6 +46,7 @@ build*/ .idea cmake-build-debug cmake-build-release +CMakeSettings.json # Compiled python __pycache__ @@ -56,3 +57,6 @@ python/setup.py # Clangd .cache/ + +# Xmake +.xmake/ \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index c453d23e5440..8d0b4c63a1fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,7 +96,7 @@ else() option(NCNN_BUILD_EXAMPLES "build examples" ON) endif() -if(ANDROID OR IOS OR LINUX OR NCNN_SIMPLESTL) +if(ANDROID OR IOS OR NCNN_SIMPLESTL) option(NCNN_DISABLE_EXCEPTION "disable exception" ON) else() option(NCNN_DISABLE_EXCEPTION "disable exception" OFF) @@ -147,6 +147,7 @@ endif() if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm") OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|aarch64)") + OR (CMAKE_CXX_COMPILER_ARCHITECTURE_ID MATCHES "ARM64") OR ((CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC")) AND (${CMAKE_GENERATOR_PLATFORM} MATCHES "^(arm|arm64)"))) set(NCNN_TARGET_ARCH arm) @@ -171,7 +172,7 @@ if((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm") check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; float16x8_t _a, _b; _s = vfmlalq_low_f16(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM82_FP16FML) set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+bf16") - check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vbfmmlaq_f32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16) + check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float32x4_t _s; bfloat16x8_t _a, _b; _s = vcvt_f32_bf16(vcvt_bf16_f32(vbfmmlaq_f32(_s, _a, _b))); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_BF16) set(CMAKE_REQUIRED_FLAGS "-march=armv8.4-a+i8mm") check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int32x4_t _s; int8x16_t _a, _b; _s = vmmlaq_s32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_ARM84_I8MM) @@ -290,16 +291,34 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips)") else() message(WARNING "The compiler does not support loongson mmi extension. NCNN_MMI will be OFF.") endif() +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64|loongarch32)") + set(NCNN_TARGET_ARCH loongarch) + + include(CheckCXXCompilerFlag) + + check_cxx_compiler_flag("-mlsx" NCNN_COMPILER_SUPPORT_LOONGARCH_LSX) + + if(NCNN_COMPILER_SUPPORT_LOONGARCH_LSX) + option(NCNN_LSX "optimize loongarch platform with lsx extension" ON) + else() + message(WARNING "The compiler does not support lsx extension.
NCNN_LSX will be OFF.") + endif() + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)") set(NCNN_TARGET_ARCH riscv) include(CheckCXXCompilerFlag) set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv") - check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m1_t _s, _w; float _v; word_type vl; _s = vfmacc_vf_f32m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV) + check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m1_t _s, _w; float _v; size_t vl; _s = vfmacc_vf_f32m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV) set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh") - check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m1_t _s, _w; __fp16 _v; word_type vl; _s = vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_FP16) + check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m1_t _s, _w; __fp16 _v; size_t vl; _s = vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_ZFH) + + if(NOT NCNN_COMPILER_SUPPORT_RVV_ZFH) + set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16") + check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m1_t _s, _w; __fp16 _v; size_t vl; _s = vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_ZVFH) + endif() unset(CMAKE_REQUIRED_FLAGS) @@ -309,16 +328,24 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)") if(NCNN_RVV_CHECK_VFREDSUM) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/ncnn_check_rvv_vfredusum.cmake) endif() - if(NOT NCNN_COMPILER_SUPPORT_RVV_FP16) + if(NOT (NCNN_COMPILER_SUPPORT_RVV_ZFH OR NCNN_COMPILER_SUPPORT_RVV_ZVFH)) message(WARNING "The compiler does not support risc-v zfh extension. Upgrading your toolchain is strongly recommended.") endif() + option(NCNN_RVV_CHECK_PLAIN_SEGMENT "check compiler about rvv segment load/store interface" ON) + if(NCNN_RVV_CHECK_PLAIN_SEGMENT) + set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv") + check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m1_t _s, _w; size_t vl; float src[32]={.0f}; vlseg2e32_v_f32m1(&_s, &_w, src, vl); return 0; }" NCNN_COMPILER_USE_RVV_PLAIN_SEG) + unset(CMAKE_REQUIRED_FLAGS) + endif() + if(NOT NCNN_COMPILER_USE_RVV_PLAIN_SEG) + message(WARNING "The compiler uses tuple types for segment load/store. Upgrading your toolchain is strongly recommended.") + add_definitions(-D__rvv_tuple) + endif() else() message(WARNING "The compiler does not support risc-v v extension. NCNN_RVV will be OFF.") endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)") set(NCNN_TARGET_ARCH powerpc) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch)") - set(NCNN_TARGET_ARCH mips) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(xtensa)") set(NCNN_TARGET_ARCH xtensa) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x)") @@ -469,29 +496,38 @@ endif() if(NCNN_VULKAN) if(NCNN_SYSTEM_GLSLANG) - set(GLSLANG_TARGET_DIR "GLSLANG-NOTFOUND" CACHE PATH "Absolute path to glslangTargets.cmake directory") - if(NOT GLSLANG_TARGET_DIR AND NOT DEFINED ENV{GLSLANG_TARGET_DIR}) - message(WARNING "GLSLANG_TARGET_DIR must be defined!
NCNN_SYSTEM_GLSLANG will be turned off.") - set(NCNN_SYSTEM_GLSLANG OFF) + find_package(Threads) + find_package(glslang QUIET) + if(glslang_FOUND) + add_library(glslang ALIAS glslang::glslang) + add_library(SPIRV ALIAS glslang::SPIRV) else() - message(STATUS "Using glslang install located at ${GLSLANG_TARGET_DIR}") - - find_package(Threads) - - include("${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake") - include("${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake") - if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") - # hlsl support can be optional - include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") - endif() - include("${GLSLANG_TARGET_DIR}/glslangTargets.cmake") - include("${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake") - - if (NOT TARGET glslang OR NOT TARGET SPIRV) - message(WARNING "glslang or SPIRV target not found! NCNN_SYSTEM_GLSLANG will be turned off.") + set(GLSLANG_TARGET_DIR "GLSLANG-NOTFOUND" CACHE PATH "Absolute path to glslangTargets.cmake directory") + if(NOT GLSLANG_TARGET_DIR AND NOT DEFINED ENV{GLSLANG_TARGET_DIR}) + message(WARNING "set glslang_DIR to glslang-config.cmake directory for using system glslang.") + message(WARNING "GLSLANG_TARGET_DIR must be defined! NCNN_SYSTEM_GLSLANG will be turned off.") set(NCNN_SYSTEM_GLSLANG OFF) + else() + include("${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake") + include("${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake") + if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") + # hlsl support can be optional + include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") + endif() + include("${GLSLANG_TARGET_DIR}/glslangTargets.cmake") + include("${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake") endif() endif() + + if (TARGET glslang AND TARGET SPIRV) + get_property(glslang_location TARGET glslang PROPERTY LOCATION) + get_property(SPIRV_location TARGET SPIRV PROPERTY LOCATION) + message(STATUS "Found glslang: ${glslang_location} (found version \"${glslang_VERSION}\")") + message(STATUS "Found SPIRV: ${SPIRV_location} (found version \"${glslang_VERSION}\")") + else() + message(WARNING "glslang or SPIRV target not found! NCNN_SYSTEM_GLSLANG will be turned off.") + set(NCNN_SYSTEM_GLSLANG OFF) + endif() endif() if(NOT NCNN_SYSTEM_GLSLANG) diff --git a/README.md b/README.md index c25a985aad4e..6c71520e0f24 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ ncnn 目前已在腾讯多款应用中使用,如:QQ,Qzone,微信,天 ## Current building status matrix -| System | CPU (32bit] | CPU (64bit) | GPU (32bit) | GPU (64bit) | +| System | CPU (32bit) | CPU (64bit) | GPU (32bit) | GPU (64bit) | | :---------------- | :------------------------------------------------------------------ | :------------------------------------------------------------------------------ | :-------------------------------------------------------------- | :------------------------------------------------------------------ | | Linux (GCC) | [![Build Status][pass-linux-x86-cpu-gcc]][ci-linux-x86-cpu-gcc] | [![Build Status][pass-linux-x64-cpu-gcc]][ci-linux-x64-cpu-gcc] | — | [![Build Status][pass-linux-x64-gpu-gcc]][ci-linux-x64-gpu-gcc] | | Linux (Clang) | [![Build Status][pass-linux-x86-cpu-clang]][ci-linux-x86-cpu-clang] | [![Build Status][pass-linux-x64-cpu-clang]][ci-linux-x64-cpu-clang] | — | [![Build Status][pass-linux-x64-gpu-clang]][ci-linux-x64-gpu-clang] | @@ -298,7 +298,7 @@ ncnn 目前已在腾讯多款应用中使用,如:QQ,Qzone,微信,天 --- -## Example project +## Project examples - - @@ -307,10 +307,12 @@ ncnn 目前已在腾讯多款应用中使用,如:QQ,Qzone,微信,天 - - - 🤩 +- - - -功能概述 +
+ +-
+ --- diff --git a/benchmark/README.md b/benchmark/README.md index dca02a2c82a6..004283d7682e 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -4236,7 +4236,7 @@ cooling_down = 0 yolo-fastestv2 min = 91.08 max = 102.93 avg = 94.41 ``` -### ### AMD Ryzen 5700g (Zen3 3.8 GHz ~ 4.6 GHz x 8) +### AMD Ryzen 5700g (Zen3 3.8 GHz ~ 4.6 GHz x 8) test in wsl2 with ubuntu 20.04 ``` $ ./benchncnn 10 1 0 -1 0 @@ -4370,3 +4370,168 @@ cooling_down = 1 vision_transformer min = 22201.32 max = 22510.75 avg = 22315.09 FastestDet min = 146.94 max = 148.50 avg = 147.44 ``` + +### T-Head TH1520 (C910V, 1.848 GHz x 4) + +Tested on `Linux anolis-riscv 5.10.112-00579-g8e3db308d5a5 #23 SMP PREEMPT Fri Aug 12 10:17:32 CST 2022 riscv64 riscv64 riscv64 GNU/Linux` + +``` +[root@anolis-riscv benchmark]# ./benchncnn +syscall error -1 +loop_count = 4 +num_threads = 4 +powersave = 0 +gpu_device = -1 +cooling_down = 1 + squeezenet min = 187.88 max = 188.82 avg = 188.13 + squeezenet_int8 min = 2388.26 max = 2446.92 avg = 2411.46 + mobilenet min = 321.46 max = 323.34 avg = 322.19 + mobilenet_int8 min = 2318.93 max = 2458.55 avg = 2400.99 + mobilenet_v2 min = 214.01 max = 216.00 avg = 215.35 + mobilenet_v3 min = 247.71 max = 248.18 avg = 247.96 + shufflenet min = 155.58 max = 155.85 avg = 155.67 + shufflenet_v2 min = 99.50 max = 99.75 avg = 99.63 + mnasnet min = 261.46 max = 263.83 avg = 262.53 + proxylessnasnet min = 315.40 max = 316.89 avg = 316.28 + efficientnet_b0 min = 484.97 max = 486.16 avg = 485.55 + efficientnetv2_b0 min = 453.03 max = 453.40 avg = 453.21 + regnety_400m min = 314.09 max = 315.33 avg = 314.77 + blazeface min = 46.14 max = 46.69 avg = 46.39 + googlenet min = 650.99 max = 653.60 avg = 651.69 + googlenet_int8 min = 5435.11 max = 6391.98 avg = 6012.81 + resnet18 min = 505.48 max = 506.70 avg = 506.06 + resnet18_int8 min = 5053.33 max = 6599.94 avg = 6001.86 + alexnet min = 403.68 max = 404.60 avg = 404.23 + vgg16 min = 2731.55 max = 2746.48 avg = 2738.82 +``` + +### Rockchip RK3588 (Quad Core A76 2.4GHz + Quad Core A55 1.8GHz) +test in ROCK5 MODEL B + +``` +rock@rock-5b:~/ncnn/build/benchmark$ ./benchncnn 10 1 0 -1 0 +loop_count = 10 +num_threads = 1 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 15.22 max = 16.03 avg = 15.70 + squeezenet_int8 min = 16.77 max = 16.96 avg = 16.86 + mobilenet min = 23.07 max = 23.58 avg = 23.36 + mobilenet_int8 min = 18.58 max = 18.90 avg = 18.72 + mobilenet_v2 min = 18.74 max = 19.10 avg = 18.96 + mobilenet_v3 min = 14.40 max = 14.65 avg = 14.50 + shufflenet min = 9.74 max = 9.88 avg = 9.84 + shufflenet_v2 min = 9.44 max = 9.55 avg = 9.50 + mnasnet min = 14.73 max = 15.03 avg = 14.87 + proxylessnasnet min = 18.37 max = 18.59 avg = 18.46 + efficientnet_b0 min = 29.11 max = 30.18 avg = 29.63 + efficientnetv2_b0 min = 46.40 max = 46.95 avg = 46.76 + regnety_400m min = 19.18 max = 19.39 avg = 19.28 + blazeface min = 5.16 max = 5.23 avg = 5.20 + googlenet min = 64.64 max = 65.33 avg = 65.00 + googlenet_int8 min = 61.86 max = 63.41 avg = 62.42 + resnet18 min = 42.00 max = 43.34 avg = 42.48 + resnet18_int8 min = 67.22 max = 67.80 avg = 67.45 + alexnet min = 57.65 max = 58.21 avg = 58.01 + vgg16 min = 192.35 max = 193.36 avg = 192.84 + vgg16_int8 min = 570.86 max = 578.81 avg = 574.50 + resnet50 min = 107.86 max = 109.52 avg = 108.70 + resnet50_int8 min = 134.41 max = 135.86 avg = 135.18 + squeezenet_ssd min = 40.85 max = 41.24 avg = 41.02 + squeezenet_ssd_int8 min = 52.23 max = 53.70 avg = 52.54 + mobilenet_ssd min = 45.11 max = 45.50 avg = 45.32 
+ mobilenet_ssd_int8 min = 36.53 max = 36.63 avg = 36.59 + mobilenet_yolo min = 95.18 max = 96.79 avg = 95.90 + mobilenetv2_yolov3 min = 65.50 max = 65.88 avg = 65.72 + yolov4-tiny min = 86.13 max = 88.84 avg = 87.29 + nanodet_m min = 22.57 max = 22.87 avg = 22.74 + yolo-fastest-1.1 min = 9.23 max = 9.35 avg = 9.29 + yolo-fastestv2 min = 8.62 max = 8.83 avg = 8.73 + vision_transformer min = 3077.54 max = 3396.13 avg = 3339.58 + FastestDet min = 9.11 max = 9.30 avg = 9.20 + +rock@rock-5b:~/ncnn/build/benchmark$ ./benchncnn 10 8 0 -1 0 +loop_count = 10 +num_threads = 8 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 10.02 max = 11.01 avg = 10.43 + squeezenet_int8 min = 11.78 max = 13.77 avg = 12.55 + mobilenet min = 12.75 max = 13.58 avg = 13.12 + mobilenet_int8 min = 12.23 max = 14.29 avg = 13.54 + mobilenet_v2 min = 12.76 max = 14.27 avg = 13.40 + mobilenet_v3 min = 9.51 max = 9.81 avg = 9.71 + shufflenet min = 7.06 max = 7.23 avg = 7.13 + shufflenet_v2 min = 6.21 max = 7.32 avg = 6.38 + mnasnet min = 9.32 max = 12.49 avg = 10.75 + proxylessnasnet min = 13.79 max = 15.51 avg = 14.70 + efficientnet_b0 min = 16.59 max = 17.99 avg = 17.08 + efficientnetv2_b0 min = 28.26 max = 32.26 avg = 30.52 + regnety_400m min = 13.43 max = 15.00 avg = 13.72 + blazeface min = 3.87 max = 7.38 avg = 5.65 + googlenet min = 29.18 max = 44.00 avg = 36.31 + googlenet_int8 min = 31.14 max = 37.48 avg = 34.58 + resnet18 min = 21.47 max = 24.40 avg = 22.35 + resnet18_int8 min = 26.68 max = 29.89 avg = 28.45 + alexnet min = 29.35 max = 38.09 avg = 31.65 + vgg16 min = 112.37 max = 122.94 avg = 117.05 + vgg16_int8 min = 161.08 max = 215.29 avg = 176.89 + resnet50 min = 54.54 max = 57.50 avg = 55.71 + resnet50_int8 min = 54.76 max = 65.05 avg = 60.59 + squeezenet_ssd min = 26.21 max = 35.05 avg = 30.76 + squeezenet_ssd_int8 min = 33.34 max = 40.88 avg = 36.19 + mobilenet_ssd min = 26.71 max = 28.85 avg = 27.88 + mobilenet_ssd_int8 min = 22.03 max = 25.31 avg = 24.21 + mobilenet_yolo min = 60.51 max = 74.65 avg = 65.45 + mobilenetv2_yolov3 min = 37.27 max = 44.13 avg = 41.20 + yolov4-tiny min = 49.84 max = 58.12 avg = 53.93 + nanodet_m min = 16.54 max = 22.41 avg = 20.60 + yolo-fastest-1.1 min = 8.49 max = 13.50 avg = 9.91 + yolo-fastestv2 min = 6.28 max = 11.22 avg = 8.00 + vision_transformer min = 968.62 max = 1063.47 avg = 1019.12 + FastestDet min = 6.14 max = 11.92 avg = 7.85 + +rock@rock-5b:~/ncnn/build/benchmark$ ./benchncnn 10 4 2 -1 0 +loop_count = 10 +num_threads = 4 +powersave = 2 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 6.78 max = 7.27 avg = 7.07 + squeezenet_int8 min = 4.58 max = 4.73 avg = 4.63 + mobilenet min = 5.67 max = 5.78 avg = 5.72 + mobilenet_int8 min = 5.01 max = 5.20 avg = 5.15 + mobilenet_v2 min = 5.44 max = 5.76 avg = 5.50 + mobilenet_v3 min = 4.67 max = 5.03 avg = 4.74 + shufflenet min = 4.22 max = 4.30 avg = 4.27 + shufflenet_v2 min = 3.48 max = 3.60 avg = 3.53 + mnasnet min = 4.52 max = 4.83 avg = 4.61 + proxylessnasnet min = 5.44 max = 6.01 avg = 5.56 + efficientnet_b0 min = 8.33 max = 8.52 avg = 8.41 + efficientnetv2_b0 min = 12.95 max = 13.08 avg = 13.02 + regnety_400m min = 8.60 max = 8.73 avg = 8.66 + blazeface min = 1.86 max = 1.95 avg = 1.90 + googlenet min = 16.58 max = 16.85 avg = 16.65 + googlenet_int8 min = 16.99 max = 17.13 avg = 17.06 + resnet18 min = 14.98 max = 15.30 avg = 15.08 + resnet18_int8 min = 20.10 max = 20.22 avg = 20.15 + alexnet min = 19.78 max = 20.21 avg = 19.87 + vgg16 min = 66.35 max = 94.16 avg = 75.24 + vgg16_int8 min = 131.02 
max = 131.98 avg = 131.51 + resnet50 min = 28.07 max = 28.78 avg = 28.28 + resnet50_int8 min = 33.56 max = 35.53 avg = 33.84 + squeezenet_ssd min = 16.40 max = 16.80 avg = 16.49 + squeezenet_ssd_int8 min = 18.64 max = 19.00 avg = 18.76 + mobilenet_ssd min = 13.66 max = 13.78 avg = 13.72 + mobilenet_ssd_int8 min = 11.23 max = 11.42 avg = 11.33 + mobilenet_yolo min = 30.76 max = 31.03 avg = 30.86 + mobilenetv2_yolov3 min = 19.28 max = 21.07 avg = 20.30 + yolov4-tiny min = 33.44 max = 37.68 avg = 34.70 + nanodet_m min = 8.28 max = 8.55 avg = 8.38 + yolo-fastest-1.1 min = 4.30 max = 4.40 avg = 4.34 + yolo-fastestv2 min = 4.07 max = 4.18 avg = 4.13 + vision_transformer min = 815.67 max = 819.27 avg = 817.49 + FastestDet min = 4.34 max = 7.47 avg = 5.18 +``` \ No newline at end of file diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp index 032e3f9fbc42..714dca3180f6 100644 --- a/benchmark/benchncnn.cpp +++ b/benchmark/benchncnn.cpp @@ -161,8 +161,8 @@ void benchmark(const char* comment, const ncnn::Mat& _in, const ncnn::Option& op int main(int argc, char** argv) { int loop_count = 4; - int num_threads = ncnn::get_cpu_count(); - int powersave = 0; + int num_threads = ncnn::get_physical_big_cpu_count(); + int powersave = 2; int gpu_device = -1; int cooling_down = 1; @@ -199,8 +199,8 @@ int main(int argc, char** argv) g_loop_count = loop_count; - g_blob_pool_allocator.set_size_compare_ratio(0.0f); - g_workspace_pool_allocator.set_size_compare_ratio(0.5f); + g_blob_pool_allocator.set_size_compare_ratio(0.f); + g_workspace_pool_allocator.set_size_compare_ratio(0.f); #if NCNN_VULKAN if (use_vulkan_compute) diff --git a/cmake/ncnnConfig.cmake.in b/cmake/ncnnConfig.cmake.in index b118713571e9..528c69da0ade 100644 --- a/cmake/ncnnConfig.cmake.in +++ b/cmake/ncnnConfig.cmake.in @@ -19,20 +19,24 @@ if(NCNN_VULKAN) if(NOT NCNN_SHARED_LIB) if(NCNN_SYSTEM_GLSLANG) - set(GLSLANG_TARGET_DIR "@GLSLANG_TARGET_DIR@") + find_package(glslang QUIET) + if(NOT glslang_FOUND) + set(GLSLANG_TARGET_DIR "@GLSLANG_TARGET_DIR@") + include(${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake) + include(${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake) + if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") + # hlsl support can be optional + include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") + endif() + include(${GLSLANG_TARGET_DIR}/glslangTargets.cmake) + include(${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake) + endif() else() - set(GLSLANG_TARGET_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../@CMAKE_INSTALL_LIBDIR@/cmake") - endif(NCNN_SYSTEM_GLSLANG) - - include(${GLSLANG_TARGET_DIR}/OSDependentTargets.cmake) - include(${GLSLANG_TARGET_DIR}/OGLCompilerTargets.cmake) - if(EXISTS "${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") - # hlsl support can be optional - include("${GLSLANG_TARGET_DIR}/HLSLTargets.cmake") + set(glslang_DIR "${CMAKE_CURRENT_LIST_DIR}/../../../@CMAKE_INSTALL_LIBDIR@/cmake/glslang") + find_package(glslang QUIET) endif() - include(${GLSLANG_TARGET_DIR}/glslangTargets.cmake) - include(${GLSLANG_TARGET_DIR}/SPIRVTargets.cmake) + endif() -endif(NCNN_VULKAN) +endif() include(${CMAKE_CURRENT_LIST_DIR}/ncnn.cmake) diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake index 89d61823deb2..857d3b528bac 100644 --- a/cmake/ncnn_add_layer.cmake +++ b/cmake/ncnn_add_layer.cmake @@ -270,9 +270,17 @@ macro(ncnn_add_layer class) endif() endif() + if(NCNN_RUNTIME_CPU AND NCNN_TARGET_ARCH STREQUAL "loongarch") + if(NCNN_LSX) + ncnn_add_arch_opt_layer(${class} lsx "-mlsx") + endif() + endif() + if(NCNN_RUNTIME_CPU AND 
NCNN_RVV AND NCNN_TARGET_ARCH STREQUAL "riscv") - if(NCNN_COMPILER_SUPPORT_RVV_FP16) + if(NCNN_COMPILER_SUPPORT_RVV_ZFH) ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh") + elseif(NCNN_COMPILER_SUPPORT_RVV_ZVFH) + ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16") elseif(NCNN_COMPILER_SUPPORT_RVV) ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv") endif() diff --git a/cmake/ncnn_check_rvv_vfredusum.cmake b/cmake/ncnn_check_rvv_vfredusum.cmake index 81496a765d12..59065556356a 100644 --- a/cmake/ncnn_check_rvv_vfredusum.cmake +++ b/cmake/ncnn_check_rvv_vfredusum.cmake @@ -9,7 +9,7 @@ int main(void) { float in1[4] = {-1.f,0.f,+1.f,2.f}; float out1=0; - word_type vl = vsetvl_e32m8(4); + size_t vl = vsetvl_e32m8(4); vfloat32m8_t _add = vle32_v_f32m8(in1,vl); vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(),out1,vl); _sum = vfredsum_vs_f32m8_f32m1(_sum, _add, _sum, vl); @@ -23,7 +23,7 @@ int main(void) { float in1[4] = {-1.f,0.f,+1.f,2.f}; float out1=0; - word_type vl = vsetvl_e32m8(4); + size_t vl = vsetvl_e32m8(4); vfloat32m8_t _add = vle32_v_f32m8(in1,vl); vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(),out1,vl); _sum = vfredusum_vs_f32m8_f32m1(_sum, _add, _sum, vl); @@ -36,7 +36,7 @@ if(NCNN_COMPILER_USE_VFREDSUM AND NOT NCNN_COMPILER_USE_VFREDUSUM) message(WARNING "The compiler uses vfredsum. Upgrading your toolchain is strongly recommended.") foreach(LMUL 1 2 4 8) add_definitions(-Dvfredusum_vs_f32m${LMUL}_f32m1=vfredsum_vs_f32m${LMUL}_f32m1) - if(NCNN_COMPILER_SUPPORT_RVV_FP16) + if(NCNN_COMPILER_SUPPORT_RVV_ZFH OR NCNN_COMPILER_SUPPORT_RVV_ZVFH) add_definitions(-Dvfredusum_vs_f16m${LMUL}_f16m1=vfredsum_vs_f16m${LMUL}_f16m1) endif() endforeach() diff --git a/cmake/ncnn_generate_lsx_source.cmake b/cmake/ncnn_generate_lsx_source.cmake new file mode 100644 index 000000000000..4f8fb20299aa --- /dev/null +++ b/cmake/ncnn_generate_lsx_source.cmake @@ -0,0 +1,14 @@ + +# must define SRC DST CLASS + +file(READ ${SRC} source_data) + +# replace +string(TOUPPER ${CLASS} CLASS_UPPER) +string(TOLOWER ${CLASS} CLASS_LOWER) + +string(REGEX REPLACE "LAYER_${CLASS_UPPER}_LOONGARCH_H" "LAYER_${CLASS_UPPER}_LOONGARCH_LSX_H" source_data "${source_data}") +string(REGEX REPLACE "${CLASS}_loongarch" "${CLASS}_loongarch_lsx" source_data "${source_data}") +string(REGEX REPLACE "#include \"${CLASS_LOWER}_loongarch.h\"" "#include \"${CLASS_LOWER}_loongarch_lsx.h\"" source_data "${source_data}") + +file(WRITE ${DST} "${source_data}") diff --git a/docs/developer-guide/operators.md b/docs/developer-guide/operators.md index 5366da1e112c..81e04f1e6dc3 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -29,7 +29,9 @@ * [Exp](#exp) * [Flatten](#flatten) * [GELU](#gelu) +* [GLU](#glu) * [Gemm](#gemm) +* [GridSample](#gridsample) * [GroupNorm](#groupnorm) * [GRU](#gru) * [HardSigmoid](#hardsigmoid) @@ -784,6 +786,22 @@ else y = 0.5 * x * erfc(-0.70710678 * x) | --------- | ------------- | ----- | --------- | ----------------- | | 0 | fast_gelu | int | 0 | use approximation | +# GLU + +If axis < 0, we use axis = x.dims + axis + +GLU(a,b)=a⊗σ(b) + +where a is the first half of the input matrix and b is the second half. + +axis specifies the dimension to split the input + +* one_blob_only + +| param id | name | type | default | description | +| --------- | ------------- | ----- | --------- | ----------------- | +| 0 | axis | int | 0 | | + # Gemm ``` a = transA ? 
transpose(x0) : x0 @@ -799,6 +817,34 @@ y = gemm(a, b) * alpha + c * beta | 2 | transA | int | 0 | | | 3 | transb | int | 0 | | +# GridSample +``` +Given an input and a flow-field grid, computes the output using input values and pixel locations from grid. + +For each output location output[:, h2, w2], the size-2 vector grid[h2, w2, 2] specifies input pixel[:, h1, w1] locations x and y, +which are used to interpolate the output value output[:, h2, w2] + +This function is often used in conjunction with affine_grid() to build Spatial Transformer Networks . +``` + +| param id | name | type | default | description | +| --------- | ------------- | ----- | --------- | ----------------- | +| 0 | sample_type | int | 1 | | +| 1 | padding_mode | int | 1 | | +| 2 | align_corner | int | 0 | | + + +Sample type: +- 1 = Nearest +- 2 = Bilinear +- 3 = Bicubic + +Padding mode: +- 1 = zeros +- 2 = border +- 3 = reflection + + # GroupNorm ``` split x along channel axis into group x0, x1 ... @@ -1026,15 +1072,17 @@ y0, hidden y1, cell y2 = lstm(x0, hidden x1, cell x2) | param id | name | type | default | description | | --------- | ------------- | ----- | --------- | ----------------- | -| 0 | num_output | int | 0 | hidden size of output | +| 0 | num_output | int | 0 | output size of output | | 1 | weight_data_size| int | 0 | total size of IFOG weight matrix | | 2 | direction | int | 0 | 0=forward, 1=reverse, 2=bidirectional | +| 3 | hidden_size | int | num_output| hidden size | | weight | type | shape | | ------------- | ----- | --------------------- | -| weight_xc_data| float/fp16/int8 | [input_size, num_output * 4, num_directions] | -| bias_c_data | float/fp16/int8 | [num_output, 4, num_directions] | -| weight_hc_data| float/fp16/int8 | [num_output, num_output * 4, num_directions] | +| weight_xc_data| float/fp16/int8 | [input_size, hidden_size * 4, num_directions] | +| bias_c_data | float/fp16/int8 | [hidden_size, 4, num_directions] | +| weight_hc_data| float/fp16/int8 | [num_output, hidden_size * 4, num_directions] | +| weight_hr_data| float/fp16/int8 | [hidden_size, num_output, num_directions] | Direction flag: - 0 = forward only @@ -1084,14 +1132,16 @@ y = affine(out) | 0 | embed_dim | int | 0 | | | 1 | num_head | int | 1 | | | 2 | weight_data_size| int | 0 | | +| 3 | kdim | int | embed_dim | | +| 4 | vdim | int | embed_dim | | | weight | type | shape | | ------------- | ----- | --------------------- | | q_weight_data | float/fp16/int8 | [weight_data_size] | | q_bias_data | float | [embed_dim] | -| k_weight_data | float/fp16/int8 | [weight_data_size] | +| k_weight_data | float/fp16/int8 | [embed_dim * kdim] | | k_bias_data | float | [embed_dim] | -| v_weight_data | float/fp16/int8 | [weight_data_size] | +| v_weight_data | float/fp16/int8 | [embed_dim * vdim] | | v_bias_data | float | [embed_dim] | | out_weight_data| float/fp16/int8 | [weight_data_size] | | out_bias_data | float | [embed_dim] | diff --git a/docs/faq.en.md b/docs/faq.en.md index 8675a2fc9d51..072d0b33e26c 100644 --- a/docs/faq.en.md +++ b/docs/faq.en.md @@ -216,7 +216,7 @@ Fully customizable op, first change to one that can export (e.g. concat slice), 2. [Learn in 5 minutes! Converting TorchScript models to ncnn models with PNNX](https://zhuanlan.zhihu.com/p/427512763) -# 使用 +# Using - ## vkEnumeratePhysicalDevices failed -3 @@ -290,4 +290,4 @@ Fully customizable op, first change to one that can export (e.g. 
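For the GridSample operator documented above: grid values are normalized coordinates in [-1, 1], and align_corner controls how they map back to pixel positions. The helper below is a rough sketch of that mapping under the usual grid_sample convention (as in torch.nn.functional.grid_sample); it illustrates the parameter semantics only and is not the exact ncnn kernel.

```cpp
// Rough sketch: unnormalize one grid coordinate from [-1, 1] to a pixel
// position along an axis of length `length` (input width or height).
// `align_corner` mirrors GridSample param id 2; the actual sampling and
// padding_mode handling are omitted here.
static float grid_sample_unnormalize(float coord, int length, int align_corner)
{
    if (align_corner)
        return (coord + 1.f) / 2.f * (length - 1); // -1 -> 0, +1 -> length - 1
    else
        return ((coord + 1.f) * length - 1.f) / 2.f; // -1 -> -0.5, +1 -> length - 0.5
}
```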
concat slice), ncnn::Mat in1(60, (void*)testData.data()).reshape(4, 5, 3); // just pass the pointer to the float data as a void*, and even specify the dimension (up says it's best to use reshape to solve the channel gap) float* a = new float[60]; // New a piece of memory yourself, you need to release it later ncnn::Mat in2 = ncnn::Mat(60, (void*)a).reshape(4, 5, 3).clone(); // use the same method as above, clone() to transfer data owner - ``` \ No newline at end of file + ``` diff --git a/docs/faq.md b/docs/faq.md index 4701414c57d0..8d72b792dfa7 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -111,7 +111,7 @@ # 怎样添加ncnn库到项目中?cmake方式怎么用? -编译ncnn,make install。linux/windows set/export ncnn_DIR 指向 isntall目录下下包含ncnnConfig.cmake 的目录 +编译ncnn,make install。linux/windows set/export ncnn_DIR 指向 install目录下包含ncnnConfig.cmake 的目录 - ## android diff --git a/docs/how-to-use-and-FAQ/FAQ-ncnn-produce-wrong-result.md b/docs/how-to-use-and-FAQ/FAQ-ncnn-produce-wrong-result.md index 526de5ab2656..f4cf49399f85 100644 --- a/docs/how-to-use-and-FAQ/FAQ-ncnn-produce-wrong-result.md +++ b/docs/how-to-use-and-FAQ/FAQ-ncnn-produce-wrong-result.md @@ -170,3 +170,19 @@ ncnn::Net net; // param_buffer is the content buffe of XYZ.param file net.load_param_mem(param_buffer); ``` + + +### disable fp16 + +Some models may overflow fp16, resulting in a nan result. + +So try to turn off fp16 lower-precision optimizations, and the precision will be improved to fp32 to investigate and solve the overflow problem caused by this. + +You can set it as follows +```cpp +ncnn::Net net; + +net.opt.use_fp16_packed = false; +net.opt.use_fp16_storage = false; +net.opt.use_fp16_arithmetic = false; +``` \ No newline at end of file diff --git a/examples/yolov5.cpp b/examples/yolov5.cpp index b1a8e8495532..88f6db21222b 100644 --- a/examples/yolov5.cpp +++ b/examples/yolov5.cpp @@ -26,9 +26,10 @@ #include #include -#define YOLOV5_V60 1 //YOLOv5 v6.0 +//#define YOLOV5_V60 1 //YOLOv5 v6.0 +#define YOLOV5_V62 1 //YOLOv5 v6.2 export onnx model method https://github.com/shaoshengsong/yolov5_62_export_ncnn -#if YOLOV5_V60 +#if YOLOV5_V60 || YOLOV5_V62 #define MAX_STRIDE 64 #else #define MAX_STRIDE 32 @@ -79,7 +80,7 @@ class YoloV5Focus : public ncnn::Layer }; DEFINE_LAYER_CREATOR(YoloV5Focus) -#endif //YOLOV5_V60 +#endif //YOLOV5_V60 YOLOV5_V62 struct Object { @@ -278,7 +279,12 @@ static int detect_yolov5(const cv::Mat& bgr, std::vector& objects) // original pretrained model from https://github.com/ultralytics/yolov5 // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models -#if YOLOV5_V60 +#if YOLOV5_V62 + if (yolov5.load_param("yolov5s_6.2.param")) + exit(-1); + if (yolov5.load_model("yolov5s_6.2.bin")) + exit(-1); +#elif YOLOV5_V60 if (yolov5.load_param("yolov5s_6.0.param")) exit(-1); if (yolov5.load_model("yolov5s_6.0.bin")) @@ -358,7 +364,10 @@ static int detect_yolov5(const cv::Mat& bgr, std::vector& objects) // stride 16 { ncnn::Mat out; -#if YOLOV5_V60 + +#if YOLOV5_V62 + ex.extract("353", out); +#elif YOLOV5_V60 ex.extract("376", out); #else ex.extract("781", out); @@ -381,7 +390,9 @@ static int detect_yolov5(const cv::Mat& bgr, std::vector& objects) // stride 32 { ncnn::Mat out; -#if YOLOV5_V60 +#if YOLOV5_V62 + ex.extract("367", out); +#elif YOLOV5_V60 ex.extract("401", out); #else ex.extract("801", out); diff --git a/glslang b/glslang index 86ff4bca1ddc..88fd417b0bb7 160000 --- a/glslang +++ b/glslang @@ -1 +1 @@ -Subproject commit 86ff4bca1ddc7e2262f119c16e7228d0efb67610 +Subproject commit 
88fd417b0bb7d91755961c70e846d274c182f2b0 diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 140555ce7060..999efa1deb65 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -8,6 +8,19 @@ add_definitions(-DVERSION_INFO="${PACKAGE_VERSION}") set( CMAKE_CXX_STANDARD 11 ) set( CMAKE_CXX_STANDARD_REQUIRED ON ) +if(CMAKE_CXX_COMPILER_ARCHITECTURE_ID MATCHES "ARM64") + option(PYBIND11_PYTHONLIBS_OVERWRITE "" OFF) + + set(PYTHON_PREFIX "$ENV{LOCALAPPDATA}/pypa/cibuildwheel/Cache/nuget-cpython/pythonarm64.$ENV{PYTHON_VERSION}/tools") + if(NOT DEFINED $ENV{CIBUILDWHEEL}) + message(WARNING + " This is hack for cibuildwheel on github action\n" + " Use the right way to cross-compile python module for windows arm64 like follows\n" + " set(PYTHON_PREFIX \"\")\n" + ) + endif() +endif() + add_subdirectory(pybind11) if("${CMAKE_LIBRARY_OUTPUT_DIRECTORY}" STREQUAL "") diff --git a/python/pybind11 b/python/pybind11 index 70a58c577eaf..80dc998efced 160000 --- a/python/pybind11 +++ b/python/pybind11 @@ -1 +1 @@ -Subproject commit 70a58c577eaf067748c2ec31bfd0b0a614cffba6 +Subproject commit 80dc998efced8ceb2be59756668a7e90e8bef917 diff --git a/python/src/main.cpp b/python/src/main.cpp index cef29e9a0530..c90b289ef379 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -287,17 +287,18 @@ PYBIND11_MODULE(ncnn, m) .def_buffer([](Mat& m) -> py::buffer_info { return to_buffer_info(m); }) - .def("numpy", [](py::object obj, const std::string& format="") -> py::array { + .def( + "numpy", [](py::object obj, const std::string& format = "") -> py::array { auto* m = obj.cast(); return py::array(to_buffer_info(*m, format), obj); - }, py::arg("format")="") + }, + py::arg("format") = "", "i for int32, f for float32, d for double") //.def("fill", (void (Mat::*)(int))(&Mat::fill), py::arg("v")) .def("fill", (void (Mat::*)(float))(&Mat::fill), py::arg("v")) .def("clone", &Mat::clone, py::arg("allocator") = nullptr) .def("clone_from", &Mat::clone_from, py::arg("mat"), py::arg("allocator") = nullptr) .def( - "reshape", - [](Mat& mat, py::tuple shape, Allocator* allocator) { + "reshape", [](Mat& mat, py::tuple shape, Allocator* allocator) { switch (shape.size()) { case 1: @@ -316,18 +317,13 @@ PYBIND11_MODULE(ncnn, m) return Mat(); }, py::arg("shape") = py::tuple(1), py::arg("allocator") = nullptr) - .def("reshape", (Mat(Mat::*)(int, Allocator*) const) & Mat::reshape, - py::arg("w"), py::kw_only(), py::arg("allocator") = nullptr) - .def("reshape", (Mat(Mat::*)(int, int, Allocator*) const) & Mat::reshape, - py::arg("w"), py::arg("h"), py::kw_only(), py::arg("allocator") = nullptr) - .def("reshape", (Mat(Mat::*)(int, int, int, Allocator*) const) & Mat::reshape, - py::arg("w"), py::arg("h"), py::arg("c"), py::kw_only(), py::arg("allocator") = nullptr) - .def("reshape", (Mat(Mat::*)(int, int, int, int, Allocator*) const) & Mat::reshape, - py::arg("w"), py::arg("h"), py::arg("d"), py::arg("c"), py::kw_only(), py::arg("allocator") = nullptr) + .def("reshape", (Mat(Mat::*)(int, Allocator*) const) & Mat::reshape, py::arg("w"), py::kw_only(), py::arg("allocator") = nullptr) + .def("reshape", (Mat(Mat::*)(int, int, Allocator*) const) & Mat::reshape, py::arg("w"), py::arg("h"), py::kw_only(), py::arg("allocator") = nullptr) + .def("reshape", (Mat(Mat::*)(int, int, int, Allocator*) const) & Mat::reshape, py::arg("w"), py::arg("h"), py::arg("c"), py::kw_only(), py::arg("allocator") = nullptr) + .def("reshape", (Mat(Mat::*)(int, int, int, int, Allocator*) const) & Mat::reshape, py::arg("w"), py::arg("h"), 
py::arg("d"), py::arg("c"), py::kw_only(), py::arg("allocator") = nullptr) .def( - "create", - [](Mat& mat, py::tuple shape, size_t elemsize, int elempack, Allocator* allocator) { + "create", [](Mat& mat, py::tuple shape, size_t elemsize, int elempack, Allocator* allocator) { switch (shape.size()) { case 1: @@ -345,23 +341,12 @@ PYBIND11_MODULE(ncnn, m) } return; }, - py::arg("shape"), py::kw_only(), - py::arg("elemsize") = 4, py::arg("elempack") = 1, - py::arg("allocator") = nullptr) - .def("create", (void (Mat::*)(int, size_t, int, Allocator*)) & Mat::create, - py::arg("w"), py::kw_only(), - py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) - .def("create", (void (Mat::*)(int, int, size_t, int, Allocator*)) & Mat::create, - py::arg("w"), py::arg("h"), py::kw_only(), - py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) - .def("create", (void (Mat::*)(int, int, int, size_t, int, Allocator*)) & Mat::create, - py::arg("w"), py::arg("h"), py::arg("c"), py::kw_only(), - py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) - .def("create", (void (Mat::*)(int, int, int, int, size_t, int, Allocator*)) & Mat::create, - py::arg("w"), py::arg("h"), py::arg("d"), py::arg("c"), py::kw_only(), - py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) - .def("create_like", (void (Mat::*)(const Mat&, Allocator*)) & Mat::create_like, - py::arg("m"), py::arg("allocator") = nullptr) + py::arg("shape"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) + .def("create", (void (Mat::*)(int, size_t, int, Allocator*)) & Mat::create, py::arg("w"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) + .def("create", (void (Mat::*)(int, int, size_t, int, Allocator*)) & Mat::create, py::arg("w"), py::arg("h"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) + .def("create", (void (Mat::*)(int, int, int, size_t, int, Allocator*)) & Mat::create, py::arg("w"), py::arg("h"), py::arg("c"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) + .def("create", (void (Mat::*)(int, int, int, int, size_t, int, Allocator*)) & Mat::create, py::arg("w"), py::arg("h"), py::arg("d"), py::arg("c"), py::kw_only(), py::arg("elemsize") = 4, py::arg("elempack") = 1, py::arg("allocator") = nullptr) + .def("create_like", (void (Mat::*)(const Mat&, Allocator*)) & Mat::create_like, py::arg("m"), py::arg("allocator") = nullptr) .def("addref", &Mat::addref) .def("release", &Mat::release) .def("empty", &Mat::empty) @@ -373,8 +358,7 @@ PYBIND11_MODULE(ncnn, m) .def("depth", (Mat(Mat::*)(int)) & Mat::depth, py::arg("z")) //.def("depth", (const Mat (Mat::*)(int) const) & Mat::depth, py::arg("z")) .def( - "row", - [](Mat& m, int y) { + "row", [](Mat& m, int y) { if (m.elempack != 1) { std::stringstream ss; diff --git a/python/src/pybind11_allocator.h b/python/src/pybind11_allocator.h index 7c568209cfea..64ce553c752a 100644 --- a/python/src/pybind11_allocator.h +++ b/python/src/pybind11_allocator.h @@ -25,11 +25,11 @@ class PyAllocator : public Base using Base::Base; // Inherit constructors void* fastMalloc(size_t size) override { - PYBIND11_OVERLOAD_PURE(void*, Base, fastMalloc, size); + PYBIND11_OVERRIDE_PURE(void*, Base, fastMalloc, size); } void fastFree(void* ptr) override { - PYBIND11_OVERLOAD_PURE(void, Base, fastFree, ptr); + 
PYBIND11_OVERRIDE_PURE(void, Base, fastFree, ptr); } }; @@ -40,11 +40,11 @@ class PyAllocatorOther : public PyAllocator using PyAllocator::PyAllocator; void* fastMalloc(size_t size) override { - PYBIND11_OVERLOAD(void*, Other, fastMalloc, size); + PYBIND11_OVERRIDE(void*, Other, fastMalloc, size); } void fastFree(void* ptr) override { - PYBIND11_OVERLOAD(void, Other, fastFree, ptr); + PYBIND11_OVERRIDE(void, Other, fastFree, ptr); } }; @@ -56,23 +56,23 @@ class PyVkAllocator : public Base using Base::Base; // Inherit constructors void clear() override { - PYBIND11_OVERLOAD(void, Base, clear, ); + PYBIND11_OVERRIDE(void, Base, clear, ); } ncnn::VkBufferMemory* fastMalloc(size_t size) override { - PYBIND11_OVERLOAD_PURE(ncnn::VkBufferMemory*, Base, fastMalloc, size); + PYBIND11_OVERRIDE_PURE(ncnn::VkBufferMemory*, Base, fastMalloc, size); } void fastFree(ncnn::VkBufferMemory* ptr) override { - PYBIND11_OVERLOAD_PURE(void, Base, fastFree, ptr); + PYBIND11_OVERRIDE_PURE(void, Base, fastFree, ptr); } int flush(ncnn::VkBufferMemory* ptr) override { - PYBIND11_OVERLOAD(int, Base, flush, ptr); + PYBIND11_OVERRIDE(int, Base, flush, ptr); } int invalidate(ncnn::VkBufferMemory* ptr) override { - PYBIND11_OVERLOAD(int, Base, invalidate, ptr); + PYBIND11_OVERRIDE(int, Base, invalidate, ptr); } }; @@ -83,15 +83,15 @@ class PyVkAllocatorOther : public PyVkAllocator using PyVkAllocator::PyVkAllocator; void clear() override { - PYBIND11_OVERLOAD(void, Other, clear, ); + PYBIND11_OVERRIDE(void, Other, clear, ); } ncnn::VkBufferMemory* fastMalloc(size_t size) override { - PYBIND11_OVERLOAD(ncnn::VkBufferMemory*, Other, fastMalloc, size); + PYBIND11_OVERRIDE(ncnn::VkBufferMemory*, Other, fastMalloc, size); } void fastFree(ncnn::VkBufferMemory* ptr) override { - PYBIND11_OVERLOAD(void, Other, fastFree, ptr); + PYBIND11_OVERRIDE(void, Other, fastFree, ptr); } }; @@ -102,17 +102,15 @@ class PyVkBlobAllocator : public Base using Base::Base; // Inherit constructors void clear() override { - PYBIND11_OVERLOAD(void, Base, clear, ); + PYBIND11_OVERRIDE(void, Base, clear, ); } - ncnn::VkImageMemory* fastMalloc(int width, int height, - VkFormat format) override + ncnn::VkImageMemory* fastMalloc(int width, int height, VkFormat format) override { - PYBIND11_OVERLOAD_PURE(ncnn::VkImageMemory*, Base, fastMalloc, width, - height, format); + PYBIND11_OVERRIDE_PURE(ncnn::VkImageMemory*, Base, fastMalloc, width, height, format); } void fastFree(ncnn::VkImageMemory* ptr) override { - PYBIND11_OVERLOAD_PURE(void, Base, fastFree, ptr); + PYBIND11_OVERRIDE_PURE(void, Base, fastFree, ptr); } }; @@ -124,14 +122,13 @@ class PyVkBlobAllocator : public Base // ncnn::VkImageMemory* fastMalloc(int width, int height, // VkFormat format) override // { -// PYBIND11_OVERLOAD(ncnn::VkImageMemory*, Other, fastMalloc, width, height, -// format); +// PYBIND11_OVERRIDE(ncnn::VkImageMemory*, Other, fastMalloc, width, height, format); // } // void fastFree(ncnn::VkImageMemory* ptr) override // { -// PYBIND11_OVERLOAD(void, Other, fastFree, ptr); +// PYBIND11_OVERRIDE(void, Other, fastFree, ptr); // } //}; #endif // NCNN_VULKAN -#endif \ No newline at end of file +#endif diff --git a/python/src/pybind11_datareader.h b/python/src/pybind11_datareader.h index e9ecfb8058e7..63b67ee47c75 100644 --- a/python/src/pybind11_datareader.h +++ b/python/src/pybind11_datareader.h @@ -42,12 +42,12 @@ class PyDataReader : public Base #if NCNN_STRING int scan(const char* format, void* p) const override { - PYBIND11_OVERLOAD(int, Base, scan, format, p); + 
PYBIND11_OVERRIDE(int, Base, scan, format, p); } #endif // NCNN_STRING size_t read(void* buf, size_t size) const override { - PYBIND11_OVERLOAD(size_t, Base, read, buf, size); + PYBIND11_OVERRIDE(size_t, Base, read, buf, size); } }; @@ -59,13 +59,13 @@ class PyDataReaderOther : public PyDataReader #if NCNN_STRING int scan(const char* format, void* p) const override { - PYBIND11_OVERLOAD(int, Other, scan, format, p); + PYBIND11_OVERRIDE(int, Other, scan, format, p); } #endif // NCNN_STRING size_t read(void* buf, size_t size) const override { - PYBIND11_OVERLOAD(size_t, Other, read, buf, size); + PYBIND11_OVERRIDE(size_t, Other, read, buf, size); } }; -#endif \ No newline at end of file +#endif diff --git a/python/src/pybind11_mat.h b/python/src/pybind11_mat.h index 1a1d1f1c626d..04663e829d0b 100644 --- a/python/src/pybind11_mat.h +++ b/python/src/pybind11_mat.h @@ -48,59 +48,69 @@ std::string get_mat_format(const ncnn::Mat& m) // f (float) // d (double) // leave it to empty to use get_mat_format -py::buffer_info to_buffer_info(ncnn::Mat &m, const std::string &format = "") { - if (m.elemsize != 1 && m.elemsize != 2 && m.elemsize != 4) { - std::stringstream ss; - ss << "convert ncnn.Mat to numpy.ndarray only elemsize 1, 2, 4 support " - "now, but given " - << m.elemsize; - pybind11::pybind11_fail(ss.str()); - } - if (m.elempack != 1) { - std::stringstream ss; - ss << "convert ncnn.Mat to numpy.ndarray only elempack 1 support now, but " - "given " - << m.elempack; - pybind11::pybind11_fail(ss.str()); - } - std::string _format(format); - if (_format.empty()) { - _format = get_mat_format(m); - } - std::vector shape; - std::vector strides; - if (m.dims == 1) { - shape.push_back(m.w); - strides.push_back(m.elemsize); - } else if (m.dims == 2) { - shape.push_back(m.h); - shape.push_back(m.w); - strides.push_back(m.w * m.elemsize); - strides.push_back(m.elemsize); - } else if (m.dims == 3) { - shape.push_back(m.c); - shape.push_back(m.h); - shape.push_back(m.w); - strides.push_back(m.cstep * m.elemsize); - strides.push_back(m.w * m.elemsize); - strides.push_back(m.elemsize); - } else if (m.dims == 4) { - shape.push_back(m.c); - shape.push_back(m.d); - shape.push_back(m.h); - shape.push_back(m.w); - strides.push_back(m.cstep * m.elemsize); - strides.push_back(m.w * m.h * m.elemsize); - strides.push_back(m.w * m.elemsize); - strides.push_back(m.elemsize); - } - return py::buffer_info(m.data, /* Pointer to buffer */ - m.elemsize, /* Size of one scalar */ - _format, /* Python struct-style format descriptor */ - m.dims, /* Number of dimensions */ - shape, /* Buffer dimensions */ - strides /* Strides (in bytes) for each index */ - ); +py::buffer_info to_buffer_info(ncnn::Mat& m, const std::string& format = "") +{ + if (m.elemsize != 1 && m.elemsize != 2 && m.elemsize != 4) + { + std::ostringstream ss; + ss << "Convert ncnn.Mat to numpy.ndarray. Support only elemsize 1, 2, 4; but given " + << m.elemsize; + py::pybind11_fail(ss.str()); + } + if (m.elempack != 1) + { + std::ostringstream ss; + ss << "Convert ncnn.Mat to numpy.ndarray. 
Support only elempack == 1, but " + "given " + << m.elempack; + py::pybind11_fail(ss.str()); + } + std::string _format(format); + if (_format.empty()) + { + _format = get_mat_format(m); + } + std::vector shape; + std::vector strides; + if (m.dims == 1) + { + shape.push_back(m.w); + strides.push_back(m.elemsize); + } + else if (m.dims == 2) + { + shape.push_back(m.h); + shape.push_back(m.w); + strides.push_back(m.w * m.elemsize); + strides.push_back(m.elemsize); + } + else if (m.dims == 3) + { + shape.push_back(m.c); + shape.push_back(m.h); + shape.push_back(m.w); + strides.push_back(m.cstep * m.elemsize); + strides.push_back(m.w * m.elemsize); + strides.push_back(m.elemsize); + } + else if (m.dims == 4) + { + shape.push_back(m.c); + shape.push_back(m.d); + shape.push_back(m.h); + shape.push_back(m.w); + strides.push_back(m.cstep * m.elemsize); + strides.push_back(m.w * m.h * m.elemsize); + strides.push_back(m.w * m.elemsize); + strides.push_back(m.elemsize); + } + return py::buffer_info(m.data, /* Pointer to buffer */ + m.elemsize, /* Size of one scalar */ + _format, /* Python struct-style format descriptor */ + m.dims, /* Number of dimensions */ + shape, /* Buffer dimensions */ + strides /* Strides (in bytes) for each index */ + ); } -#endif // PYBIND11_NCNN_MAT_H +#endif diff --git a/python/src/pybind11_modelbin.h b/python/src/pybind11_modelbin.h index 6b5e3a676caf..5f875061c886 100644 --- a/python/src/pybind11_modelbin.h +++ b/python/src/pybind11_modelbin.h @@ -25,13 +25,13 @@ class PyModelBin : public Base using Base::Base; // Inherit constructors ncnn::Mat load(int w, int type) const override { - PYBIND11_OVERLOAD_PURE(ncnn::Mat, Base, load, w, type); + PYBIND11_OVERRIDE_PURE(ncnn::Mat, Base, load, w, type); } //ncnn::Mat load(int w, int h, int type) const override { - // PYBIND11_OVERLOAD(ncnn::Mat, Base, load, w, h, type); + // PYBIND11_OVERRIDE(ncnn::Mat, Base, load, w, h, type); //} //ncnn::Mat load(int w, int h, int c, int type) const override { - // PYBIND11_OVERLOAD(ncnn::Mat, Base, load, w, h, c, type); + // PYBIND11_OVERRIDE(ncnn::Mat, Base, load, w, h, c, type); //} }; @@ -42,8 +42,8 @@ class PyModelBinOther : public PyModelBin using PyModelBin::PyModelBin; ncnn::Mat load(int w, int type) const override { - PYBIND11_OVERLOAD(ncnn::Mat, Other, load, w, type); + PYBIND11_OVERRIDE(ncnn::Mat, Other, load, w, type); } }; -#endif \ No newline at end of file +#endif diff --git a/python/tests/test_mat.py b/python/tests/test_mat.py index 605d59c39191..7019961a5770 100644 --- a/python/tests/test_mat.py +++ b/python/tests/test_mat.py @@ -211,10 +211,11 @@ def test_mat_dims4(): def test_numpy(): mat = ncnn.Mat(1) - array = np.array(mat) + array = mat.numpy() assert mat.dims == array.ndim and mat.w == array.shape[0] mat = ncnn.Mat(2, 3) - array = np.array(mat) + array = mat.numpy() + assert array.dtype == np.float32 assert ( mat.dims == array.ndim and mat.w == array.shape[1] and mat.h == array.shape[0] ) @@ -237,10 +238,10 @@ def test_numpy(): ) mat = ncnn.Mat(1, elemsize=1) - array = np.array(mat) + array = mat.numpy() assert array.dtype == np.int8 mat = ncnn.Mat(1, elemsize=2) - array = np.array(mat) + array = mat.numpy() assert array.dtype == np.float16 # pybind11 def_buffer throw bug # with pytest.raises(RuntimeError) as execinfo: @@ -251,7 +252,7 @@ def test_numpy(): # ) assert array.dtype == np.float16 mat = ncnn.Mat(1, elemsize=4) - array = np.array(mat) + array = mat.numpy() assert array.dtype == np.float32 mat = np.random.randint(0, 128, size=(12,)).astype(np.uint8) @@ -279,6 
+280,19 @@ def test_numpy(): array = np.array(mat) assert (mat == array).all() + array = np.array([1, 2, 3], dtype=np.int32) + mat = ncnn.Mat(array) + array2 = mat.numpy(format='i') + assert array2.dtype == np.int32 + array[0] = 10 + assert array2[0] == 10 + + array = np.array([1, 2, 3], dtype=np.float32) + mat = ncnn.Mat(array) + array2 = mat.numpy(format='f') + assert array2.dtype == np.float32 + array2[0] = 100 + assert array[0] == 100 def test_fill(): mat = ncnn.Mat(1) diff --git a/setup.py b/setup.py index 3c97205e453c..89e78bf7764c 100644 --- a/setup.py +++ b/setup.py @@ -67,6 +67,8 @@ def build_extension(self, ext): "-DPYTHON_EXECUTABLE={}".format(sys.executable), "-DCMAKE_BUILD_TYPE={}".format(cfg), # not used on MSVC, but no harm "-DNCNN_PYTHON=ON", + "-DNCNN_DISABLE_RTTI=OFF", + "-DNCNN_DISABLE_EXCEPTION=OFF", "-DNCNN_BUILD_BENCHMARK=OFF", "-DNCNN_BUILD_EXAMPLES=OFF", "-DNCNN_BUILD_TOOLS=OFF", diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index df423dfc0b7b..832dfe4a8bd1 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -161,6 +161,9 @@ ncnn_add_layer(MakePadMask) ncnn_add_layer(RelShift) ncnn_add_layer(GLU) ncnn_add_layer(LSTM2) +ncnn_add_layer(Fold) +ncnn_add_layer(Unfold) +ncnn_add_layer(GridSample) if(NCNN_VULKAN) ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp) @@ -227,6 +230,8 @@ if(NCNN_OPENMP) elseif(ANDROID_NDK_MAJOR AND (ANDROID_NDK_MAJOR GREATER 20)) target_compile_options(ncnn PRIVATE -fopenmp) target_link_libraries(ncnn PUBLIC -fopenmp -static-openmp) + # see cpu.cpp __wrap___kmp_abort_process comment for the linker magic + target_link_libraries(ncnn PUBLIC -Wl,-wrap,__kmp_affinity_determine_capable) elseif(OpenMP_CXX_FOUND) target_link_libraries(ncnn PUBLIC OpenMP::OpenMP_CXX) else() @@ -451,6 +456,12 @@ if(((IOS AND CMAKE_OSX_ARCHITECTURES MATCHES "arm64") OR (APPLE AND CMAKE_OSX_AR endif() endif() target_compile_options(ncnn PRIVATE ${ARM_MARCH_FLAG}) + + if(ANDROID_NDK_MAJOR AND (ANDROID_NDK_MAJOR GREATER_EQUAL 23)) + # llvm 12 in ndk-23 enables out-of-line atomics by default + # disable this feature for fixing linking atomic builtins issue with old ndk + target_compile_options(ncnn PRIVATE -mno-outline-atomics) + endif() endif() if(NCNN_TARGET_ARCH STREQUAL "mips") @@ -462,10 +473,18 @@ if(NCNN_TARGET_ARCH STREQUAL "mips") endif() endif() +if(NCNN_TARGET_ARCH STREQUAL "loongarch") + if(NOT NCNN_RUNTIME_CPU AND NCNN_LSX) + target_compile_options(ncnn PRIVATE -mlsx) + endif() +endif() + if(NCNN_TARGET_ARCH STREQUAL "riscv" AND NOT C906) if(NOT NCNN_RUNTIME_CPU AND NCNN_RVV) - if(NCNN_COMPILER_SUPPORT_RVV_FP16) + if(NCNN_COMPILER_SUPPORT_RVV_ZFH) target_compile_options(ncnn PRIVATE -march=rv64gcv_zfh) + elseif(NCNN_COMPILER_SUPPORT_RVV_ZVFH) + target_compile_options(ncnn PRIVATE -march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16) elseif(NCNN_COMPILER_SUPPORT_RVV) target_compile_options(ncnn PRIVATE -march=rv64gcv) endif() @@ -533,6 +552,3 @@ endif() set_property(GLOBAL PROPERTY USE_FOLDERS ON) set_property(TARGET ncnn PROPERTY FOLDER "libncnn") set_property(TARGET ncnn-generate-spirv PROPERTY FOLDER "libncnn") - -add_executable(main main.cpp) -target_link_libraries(main ncnn) diff --git a/src/allocator.cpp b/src/allocator.cpp index d14c81511c30..485d07951d08 100644 --- a/src/allocator.cpp +++ b/src/allocator.cpp @@ -33,6 +33,7 @@ class PoolAllocatorPrivate Mutex budgets_lock; Mutex payouts_lock; unsigned int size_compare_ratio; // 0~256 + size_t size_drop_threshold; std::list > budgets; std::list > payouts; 
}; @@ -40,7 +41,8 @@ class PoolAllocatorPrivate PoolAllocator::PoolAllocator() : Allocator(), d(new PoolAllocatorPrivate) { - d->size_compare_ratio = 192; // 0.75f * 256 + d->size_compare_ratio = 0; + d->size_drop_threshold = 10; } PoolAllocator::~PoolAllocator() @@ -99,12 +101,17 @@ void PoolAllocator::set_size_compare_ratio(float scr) d->size_compare_ratio = (unsigned int)(scr * 256); } +void PoolAllocator::set_size_drop_threshold(size_t threshold) +{ + d->size_drop_threshold = threshold; +} + void* PoolAllocator::fastMalloc(size_t size) { d->budgets_lock.lock(); // find free budget - std::list >::iterator it = d->budgets.begin(); + std::list >::iterator it = d->budgets.begin(), it_max = d->budgets.begin(), it_min = d->budgets.begin(); for (; it != d->budgets.end(); ++it) { size_t bs = it->first; @@ -126,6 +133,35 @@ void* PoolAllocator::fastMalloc(size_t size) return ptr; } + + if (bs < it_min->first) + { + it_min = it; + } + if (bs > it_max->first) + { + it_max = it; + } + } + + if (d->budgets.size() >= d->size_drop_threshold) + { + // All chunks in pool are not chosen. Then try to drop some outdated + // chunks and return them to OS. + if (it_max->first < size) + { + // Current query is asking for a chunk larger than any cached chunks. + // Then remove the smallest one. + ncnn::fastFree(it_min->second); + d->budgets.erase(it_min); + } + else if (it_min->first > size) + { + // Current query is asking for a chunk smaller than any cached chunks. + // Then remove the largest one. + ncnn::fastFree(it_max->second); + d->budgets.erase(it_max); + } } d->budgets_lock.unlock(); @@ -178,6 +214,7 @@ class UnlockedPoolAllocatorPrivate { public: unsigned int size_compare_ratio; // 0~256 + size_t size_drop_threshold; std::list > budgets; std::list > payouts; }; @@ -185,7 +222,8 @@ class UnlockedPoolAllocatorPrivate UnlockedPoolAllocator::UnlockedPoolAllocator() : Allocator(), d(new UnlockedPoolAllocatorPrivate) { - d->size_compare_ratio = 192; // 0.75f * 256 + d->size_compare_ratio = 0; + d->size_drop_threshold = 10; } UnlockedPoolAllocator::~UnlockedPoolAllocator() @@ -240,10 +278,15 @@ void UnlockedPoolAllocator::set_size_compare_ratio(float scr) d->size_compare_ratio = (unsigned int)(scr * 256); } +void UnlockedPoolAllocator::set_size_drop_threshold(size_t threshold) +{ + d->size_drop_threshold = threshold; +} + void* UnlockedPoolAllocator::fastMalloc(size_t size) { // find free budget - std::list >::iterator it = d->budgets.begin(); + std::list >::iterator it = d->budgets.begin(), it_max = d->budgets.begin(), it_min = d->budgets.begin(); for (; it != d->budgets.end(); ++it) { size_t bs = it->first; @@ -259,6 +302,29 @@ void* UnlockedPoolAllocator::fastMalloc(size_t size) return ptr; } + + if (bs > it_max->first) + { + it_max = it; + } + if (bs < it_min->first) + { + it_min = it; + } + } + + if (d->budgets.size() >= d->size_drop_threshold) + { + if (it_max->first < size) + { + ncnn::fastFree(it_min->second); + d->budgets.erase(it_min); + } + else if (it_min->first > size) + { + ncnn::fastFree(it_max->second); + d->budgets.erase(it_max); + } } // new diff --git a/src/allocator.h b/src/allocator.h index c9fcf90d1812..3a5ebcac56bc 100644 --- a/src/allocator.h +++ b/src/allocator.h @@ -170,9 +170,13 @@ class NCNN_EXPORT PoolAllocator : public Allocator ~PoolAllocator(); // ratio range 0 ~ 1 - // default cr = 0.75 + // default cr = 0 void set_size_compare_ratio(float scr); + // budget drop threshold + // default threshold = 10 + void set_size_drop_threshold(size_t); + // release all budgets 
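For the pool allocator changes above: the default size_compare_ratio moves from 0.75 to 0, and the new set_size_drop_threshold (default 10) bounds how many free chunks the pool keeps; once that many chunks are cached and none is picked for a request, the smallest or largest cached chunk is freed back to the OS, as the fastMalloc hunk shows. A minimal usage sketch mirroring how benchncnn wires its allocators, assuming the usual ncnn headers and that the allocators outlive the Net (the function name is illustrative):

```cpp
#include "allocator.h" // ncnn::PoolAllocator, ncnn::UnlockedPoolAllocator
#include "net.h"       // ncnn::Net (net.opt holds the allocator pointers)

// Minimal sketch: plug pool allocators into a Net and tune the new knob.
static void setup_pool_allocators(ncnn::Net& net,
                                  ncnn::UnlockedPoolAllocator& blob_allocator,
                                  ncnn::PoolAllocator& workspace_allocator)
{
    // keep at most 10 unmatched free chunks before one is dropped (new default)
    blob_allocator.set_size_drop_threshold(10);
    workspace_allocator.set_size_drop_threshold(10);

    // 0 is the new default compare ratio; see allocator.cpp for how it
    // affects which cached chunk sizes are considered reusable
    blob_allocator.set_size_compare_ratio(0.f);
    workspace_allocator.set_size_compare_ratio(0.f);

    net.opt.blob_allocator = &blob_allocator;
    net.opt.workspace_allocator = &workspace_allocator;
}
```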
immediately void clear(); @@ -195,9 +199,13 @@ class NCNN_EXPORT UnlockedPoolAllocator : public Allocator ~UnlockedPoolAllocator(); // ratio range 0 ~ 1 - // default cr = 0.75 + // default cr = 0 void set_size_compare_ratio(float scr); + // budget drop threshold + // default threshold = 10 + void set_size_drop_threshold(size_t); + // release all budgets immediately void clear(); diff --git a/src/c_api.cpp b/src/c_api.cpp index 9bb1ba1819b8..516ceec7df40 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -138,8 +138,11 @@ ncnn_allocator_t ncnn_allocator_create_unlocked_pool_allocator() void ncnn_allocator_destroy(ncnn_allocator_t allocator) { - delete (Allocator*)allocator->pthis; - free(allocator); + if (allocator) + { + delete (Allocator*)allocator->pthis; + free(allocator); + } } /* option api */ @@ -163,6 +166,26 @@ void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads) ((Option*)opt)->num_threads = num_threads; } +int ncnn_option_get_use_local_pool_allocator(const ncnn_option_t opt) +{ + return ((Option*)opt)->use_local_pool_allocator; +} + +void ncnn_option_set_use_local_pool_allocator(ncnn_option_t opt, int use_local_pool_allocator) +{ + ((Option*)opt)->use_local_pool_allocator = use_local_pool_allocator; +} + +void ncnn_option_set_blob_allocator(ncnn_option_t opt, ncnn_allocator_t allocator) +{ + ((Option*)opt)->blob_allocator = allocator ? (Allocator*)allocator->pthis : NULL; +} + +void ncnn_option_set_workspace_allocator(ncnn_option_t opt, ncnn_allocator_t allocator) +{ + ((Option*)opt)->workspace_allocator = allocator ? (Allocator*)allocator->pthis : NULL; +} + int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt) { #if NCNN_VULKAN @@ -191,82 +214,82 @@ ncnn_mat_t ncnn_mat_create() ncnn_mat_t ncnn_mat_create_1d(int w, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, (size_t)4u, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, (size_t)4u, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_2d(int w, int h, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, (size_t)4u, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, (size_t)4u, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_3d(int w, int h, int c, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, c, (size_t)4u, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, c, (size_t)4u, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_4d(int w, int h, int d, int c, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, d, c, (size_t)4u, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, d, c, (size_t)4u, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_external_1d(int w, void* data, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, data, (size_t)4u, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, data, (size_t)4u, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_external_2d(int w, int h, void* data, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, data, (size_t)4u, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, data, (size_t)4u, allocator ? 
(Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_external_3d(int w, int h, int c, void* data, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, c, data, (size_t)4u, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, c, data, (size_t)4u, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_external_4d(int w, int h, int d, int c, void* data, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, d, c, data, (size_t)4u, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, d, c, data, (size_t)4u, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_1d_elem(int w, size_t elemsize, int elempack, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, elemsize, elempack, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_2d_elem(int w, int h, size_t elemsize, int elempack, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, elemsize, elempack, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_3d_elem(int w, int h, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, c, elemsize, elempack, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, c, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_4d_elem(int w, int h, int d, int c, size_t elemsize, int elempack, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, d, c, elemsize, elempack, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, d, c, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_external_1d_elem(int w, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, data, elemsize, elempack, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, data, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_external_2d_elem(int w, int h, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, data, elemsize, elempack, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, data, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_external_3d_elem(int w, int h, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, c, data, elemsize, elempack, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, c, data, elemsize, elempack, allocator ? (Allocator*)allocator->pthis : NULL)); } ncnn_mat_t ncnn_mat_create_external_4d_elem(int w, int h, int d, int c, void* data, size_t elemsize, int elempack, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(w, h, d, c, data, elemsize, elempack, (Allocator*)allocator)); + return (ncnn_mat_t)(new Mat(w, h, d, c, data, elemsize, elempack, allocator ? 
(Allocator*)allocator->pthis : NULL)); } void ncnn_mat_destroy(ncnn_mat_t mat) @@ -281,27 +304,27 @@ void ncnn_mat_fill_float(ncnn_mat_t mat, float v) ncnn_mat_t ncnn_mat_clone(const ncnn_mat_t mat, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(((const Mat*)mat)->clone((Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(((const Mat*)mat)->clone(allocator ? (Allocator*)allocator->pthis : NULL))); } ncnn_mat_t ncnn_mat_reshape_1d(const ncnn_mat_t mat, int w, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(((const Mat*)mat)->reshape(w, (Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(((const Mat*)mat)->reshape(w, allocator ? (Allocator*)allocator->pthis : NULL))); } ncnn_mat_t ncnn_mat_reshape_2d(const ncnn_mat_t mat, int w, int h, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(((const Mat*)mat)->reshape(w, h, (Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(((const Mat*)mat)->reshape(w, h, allocator ? (Allocator*)allocator->pthis : NULL))); } ncnn_mat_t ncnn_mat_reshape_3d(const ncnn_mat_t mat, int w, int h, int c, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(((const Mat*)mat)->reshape(w, h, c, (Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(((const Mat*)mat)->reshape(w, h, c, allocator ? (Allocator*)allocator->pthis : NULL))); } ncnn_mat_t ncnn_mat_reshape_4d(const ncnn_mat_t mat, int w, int h, int d, int c, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(((const Mat*)mat)->reshape(w, h, d, c, (Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(((const Mat*)mat)->reshape(w, h, d, c, allocator ? (Allocator*)allocator->pthis : NULL))); } int ncnn_mat_get_dims(const ncnn_mat_t mat) @@ -359,22 +382,22 @@ void* ncnn_mat_get_channel_data(const ncnn_mat_t mat, int c) /* mat pixel api */ ncnn_mat_t ncnn_mat_from_pixels(const unsigned char* pixels, int type, int w, int h, int stride, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(Mat::from_pixels(pixels, type, w, h, stride, (Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(Mat::from_pixels(pixels, type, w, h, stride, allocator ? (Allocator*)allocator->pthis : NULL))); } ncnn_mat_t ncnn_mat_from_pixels_resize(const unsigned char* pixels, int type, int w, int h, int stride, int target_width, int target_height, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(Mat::from_pixels_resize(pixels, type, w, h, stride, target_width, target_height, (Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(Mat::from_pixels_resize(pixels, type, w, h, stride, target_width, target_height, allocator ? (Allocator*)allocator->pthis : NULL))); } ncnn_mat_t ncnn_mat_from_pixels_roi(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(Mat::from_pixels_roi(pixels, type, w, h, stride, roix, roiy, roiw, roih, (Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(Mat::from_pixels_roi(pixels, type, w, h, stride, roix, roiy, roiw, roih, allocator ? 
(Allocator*)allocator->pthis : NULL))); } ncnn_mat_t ncnn_mat_from_pixels_roi_resize(const unsigned char* pixels, int type, int w, int h, int stride, int roix, int roiy, int roiw, int roih, int target_width, int target_height, ncnn_allocator_t allocator) { - return (ncnn_mat_t)(new Mat(Mat::from_pixels_roi_resize(pixels, type, w, h, stride, roix, roiy, roiw, roih, target_width, target_height, (Allocator*)allocator))); + return (ncnn_mat_t)(new Mat(Mat::from_pixels_roi_resize(pixels, type, w, h, stride, roix, roiy, roiw, roih, target_width, target_height, allocator ? (Allocator*)allocator->pthis : NULL))); } void ncnn_mat_to_pixels(const ncnn_mat_t mat, unsigned char* pixels, int type, int stride) @@ -1190,6 +1213,11 @@ void ncnn_net_destroy(ncnn_net_t net) free(net); } +ncnn_option_t ncnn_net_get_option(ncnn_net_t net) +{ + return (ncnn_option_t)(&((Net*)(net->pthis))->opt); +} + void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt) { ((Net*)net->pthis)->opt = *((Option*)opt); diff --git a/src/c_api.h b/src/c_api.h index 39f872bdbb6c..b7435f846ba1 100644 --- a/src/c_api.h +++ b/src/c_api.h @@ -51,6 +51,12 @@ NCNN_EXPORT void ncnn_option_destroy(ncnn_option_t opt); NCNN_EXPORT int ncnn_option_get_num_threads(const ncnn_option_t opt); NCNN_EXPORT void ncnn_option_set_num_threads(ncnn_option_t opt, int num_threads); +NCNN_EXPORT int ncnn_option_get_use_local_pool_allocator(const ncnn_option_t opt); +NCNN_EXPORT void ncnn_option_set_use_local_pool_allocator(ncnn_option_t opt, int use_local_pool_allocator); + +NCNN_EXPORT void ncnn_option_set_blob_allocator(ncnn_option_t opt, ncnn_allocator_t allocator); +NCNN_EXPORT void ncnn_option_set_workspace_allocator(ncnn_option_t opt, ncnn_allocator_t allocator); + NCNN_EXPORT int ncnn_option_get_use_vulkan_compute(const ncnn_option_t opt); NCNN_EXPORT void ncnn_option_set_use_vulkan_compute(ncnn_option_t opt, int use_vulkan_compute); @@ -265,6 +271,7 @@ struct __ncnn_net_t NCNN_EXPORT ncnn_net_t ncnn_net_create(); NCNN_EXPORT void ncnn_net_destroy(ncnn_net_t net); +NCNN_EXPORT ncnn_option_t ncnn_net_get_option(ncnn_net_t net); NCNN_EXPORT void ncnn_net_set_option(ncnn_net_t net, ncnn_option_t opt); #if NCNN_STRING diff --git a/src/cpu.cpp b/src/cpu.cpp index 197093d6dd21..85c65335ccca 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -42,6 +42,12 @@ #include #endif +#if defined _WIN32 && !(defined __MINGW32__) +#define WIN32_LEAN_AND_MEAN +#include +#include +#endif + #if defined __ANDROID__ || defined __linux__ #if defined __ANDROID__ #if __ANDROID_API__ >= 18 @@ -88,10 +94,18 @@ #ifndef CPUFAMILY_ARM_AVALANCHE_BLIZZARD #define CPUFAMILY_ARM_AVALANCHE_BLIZZARD 0xda33d83d #endif +// A16 +#ifndef CPUFAMILY_ARM_EVEREST_SAWTOOTH +#define CPUFAMILY_ARM_EVEREST_SAWTOOTH 0x8765edea +#endif // M1 #ifndef CPUFAMILY_AARCH64_FIRESTORM_ICESTORM #define CPUFAMILY_AARCH64_FIRESTORM_ICESTORM 0x1b588bb3 #endif +// M2 +#ifndef CPUFAMILY_AARCH64_AVALANCHE_BLIZZARD +#define CPUFAMILY_AARCH64_AVALANCHE_BLIZZARD 0xda33d83d +#endif #endif // __APPLE__ #if defined(__SSE3__) @@ -159,7 +173,7 @@ static unsigned int get_elf_hwcap_from_proc_self_auxv(unsigned int type) return 0; } -#if __aarch64__ || __mips64 || __riscv_xlen == 64 +#if __aarch64__ || __mips64 || __riscv_xlen == 64 || __loongarch64 struct { uint64_t tag; @@ -236,6 +250,12 @@ static unsigned int g_hwcaps2 = get_elf_hwcap(AT_HWCAP2); #define HWCAP_LOONGSON_MMI (1 << 11) #endif +#if __loongarch64 +// from arch/loongarch/include/uapi/asm/hwcap.h +#define HWCAP_LOONGARCH_LSX (1 << 4) +#define HWCAP_LOONGARCH_LASX 
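The C API above gains ncnn_net_get_option, ncnn_option_get/set_use_local_pool_allocator, and ncnn_option_set_blob_allocator / ncnn_option_set_workspace_allocator, and the allocator arguments of the ncnn_mat_* constructors become NULL-safe. A small sketch using only entry points visible in this patch; param/model loading and extraction are elided, the allocators must outlive the net, and see option.h for the exact semantics of use_local_pool_allocator:

```cpp
#include "c_api.h" // ncnn C API, extern "C" so it also compiles as C++

// Small sketch: tune a net's option in place and attach pool allocators.
static void configure_net(ncnn_net_t net)
{
    // the new getter exposes the net's own option, so edits apply directly
    ncnn_option_t opt = ncnn_net_get_option(net);
    ncnn_option_set_num_threads(opt, 4);
    ncnn_option_set_use_local_pool_allocator(opt, 0);

    // attach explicit pool allocators through the option
    ncnn_allocator_t blob_allocator = ncnn_allocator_create_pool_allocator();
    ncnn_allocator_t workspace_allocator = ncnn_allocator_create_unlocked_pool_allocator();
    ncnn_option_set_blob_allocator(opt, blob_allocator);
    ncnn_option_set_workspace_allocator(opt, workspace_allocator);

    // ... ncnn_net_load_param / ncnn_net_load_model / extraction as usual ...
    // destroy the allocators only after the net itself is destroyed:
    // ncnn_allocator_destroy(blob_allocator);
    // ncnn_allocator_destroy(workspace_allocator);
}
```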
(1 << 5) +#endif + #if __riscv // from arch/riscv/include/uapi/asm/hwcap.h #define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A')) @@ -272,9 +292,60 @@ static cpu_subtype_t get_hw_cpusubtype() static unsigned int g_hw_cpufamily = get_hw_cpufamily(); static cpu_type_t g_hw_cputype = get_hw_cputype(); static cpu_subtype_t g_hw_cpusubtype = get_hw_cpusubtype(); + +static int get_hw_capability(const char* cap) +{ + int64_t value = 0; + size_t len = sizeof(value); + sysctlbyname(cap, &value, &len, NULL, 0); + return value; +} + +static int g_hw_optional_arm_FEAT_FP16 = get_hw_capability("hw.optional.arm.FEAT_FP16"); +static int g_hw_optional_arm_FEAT_DotProd = get_hw_capability("hw.optional.arm.FEAT_DotProd"); +static int g_hw_optional_arm_FEAT_FHM = get_hw_capability("hw.optional.arm.FEAT_FHM"); +static int g_hw_optional_arm_FEAT_BF16 = get_hw_capability("hw.optional.arm.FEAT_BF16"); +static int g_hw_optional_arm_FEAT_I8MM = get_hw_capability("hw.optional.arm.FEAT_I8MM"); #endif // __APPLE__ -#if defined __ANDROID__ || defined __linux__ +#if (defined _WIN32 && !(defined __MINGW32__)) +CpuSet::CpuSet() +{ + disable_all(); +} + +void CpuSet::enable(int cpu) +{ + mask |= (1 << cpu); +} + +void CpuSet::disable(int cpu) +{ + mask &= ~(1 << cpu); +} + +void CpuSet::disable_all() +{ + mask = 0; +} + +bool CpuSet::is_enabled(int cpu) const +{ + return mask & (1 << cpu); +} + +int CpuSet::num_enabled() const +{ + int num_enabled = 0; + for (int i = 0; i < (int)sizeof(mask) * 8; i++) + { + if (is_enabled(i)) + num_enabled++; + } + + return num_enabled; +} +#elif defined __ANDROID__ || defined __linux__ CpuSet::CpuSet() { disable_all(); @@ -444,7 +515,13 @@ int cpu_support_arm_asimdhp() #endif #elif __APPLE__ #if __aarch64__ - return g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL || g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD; + return g_hw_optional_arm_FEAT_FP16 + || g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL + || g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST + || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER + || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; #else return 0; #endif @@ -463,7 +540,11 @@ int cpu_support_arm_asimddp() #endif #elif __APPLE__ #if __aarch64__ - return g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD; + return g_hw_optional_arm_FEAT_DotProd + || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER + || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; #else return 0; #endif @@ -482,7 +563,11 @@ int cpu_support_arm_asimdfhm() #endif #elif __APPLE__ #if __aarch64__ - return g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD; + return g_hw_optional_arm_FEAT_FHM + || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER + || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; #else return 0; #endif @@ -501,7 +586,9 @@ int cpu_support_arm_bf16() #endif #elif __APPLE__ #if __aarch64__ 
- return 0; // no known apple cpu support armv8.6 bf16 + return g_hw_optional_arm_FEAT_BF16 + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; #else return 0; #endif @@ -520,7 +607,9 @@ int cpu_support_arm_i8mm() #endif #elif __APPLE__ #if __aarch64__ - return 0; // no known apple cpu support armv8.6 i8mm + return g_hw_optional_arm_FEAT_I8MM + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH; #else return 0; #endif @@ -1001,6 +1090,32 @@ int cpu_support_mips_msa() #endif } +int cpu_support_loongarch_lsx() +{ +#if defined __ANDROID__ || defined __linux__ +#if __loongarch64 + return g_hwcaps & HWCAP_LOONGARCH_LSX; +#else + return 0; +#endif +#else + return 0; +#endif +} + +int cpu_support_loongarch_lasx() +{ +#if defined __ANDROID__ || defined __linux__ +#if __loongarch64 + return g_hwcaps & HWCAP_LOONGARCH_LASX; +#else + return 0; +#endif +#else + return 0; +#endif +} + int cpu_support_loongson_mmi() { #if defined __ANDROID__ || defined __linux__ @@ -1069,6 +1184,10 @@ static int get_cpucount() count = emscripten_num_logical_cores(); else count = 1; +#elif (defined _WIN32 && !(defined __MINGW32__)) + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + count = system_info.dwNumberOfProcessors; #elif defined __ANDROID__ || defined __linux__ // get cpu count from /proc/cpuinfo FILE* fp = fopen("/proc/cpuinfo", "rb"); @@ -1124,6 +1243,220 @@ int get_big_cpu_count() return big_cpu_count ? big_cpu_count : g_cpucount; } +#if defined __ANDROID__ || defined __linux__ +static int get_thread_siblings(int cpuid) +{ + char path[256]; + sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpuid); + + FILE* fp = fopen(path, "rb"); + if (!fp) + return -1; + + int thread_siblings = -1; + int nscan = fscanf(fp, "%x", &thread_siblings); + if (nscan != 1) + { + // ignore + } + + fclose(fp); + + return thread_siblings; +} +#endif // defined __ANDROID__ || defined __linux__ + +static int get_physical_cpucount() +{ + int count = 0; +#if (defined _WIN32 && !(defined __MINGW32__)) + typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); + LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); + if (glpi == NULL) + { + NCNN_LOGE("GetLogicalProcessorInformation is not supported"); + return g_cpucount; + } + + DWORD return_length = 0; + glpi(NULL, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); + glpi(buffer, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; + DWORD byte_offset = 0; + while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) + { + if (ptr->Relationship == RelationProcessorCore) + { + count++; + } + + byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + ptr++; + } + + free(buffer); +#elif defined __ANDROID__ || defined __linux__ + std::vector thread_set; + for (int i = 0; i < g_cpucount; i++) + { + int thread_siblings = get_thread_siblings(i); + if (thread_siblings == -1) + { + // ignore malformed one + continue; + } + + bool thread_siblings_exists = false; + for (size_t j = 0; j < thread_set.size(); j++) + { + if (thread_set[j] == thread_siblings) + { + thread_siblings_exists = true; + break; + } + } + + if (!thread_siblings_exists) + { + thread_set.push_back(thread_siblings); + count++; + } + } +#elif __APPLE__ + size_t len = 
sizeof(count); + sysctlbyname("hw.physicalcpu_max", &count, &len, NULL, 0); +#else + count = g_cpucount; +#endif + + if (count > g_cpucount) + count = g_cpucount; + + return count; +} + +static int g_physical_cpucount = get_physical_cpucount(); + +int get_physical_cpu_count() +{ + return g_physical_cpucount; +} + +int get_physical_little_cpu_count() +{ + if (g_physical_cpucount == g_cpucount) + return get_little_cpu_count(); + + return g_physical_cpucount * 2 - g_cpucount; +} + +int get_physical_big_cpu_count() +{ + if (g_physical_cpucount == g_cpucount) + return get_big_cpu_count(); + + return g_cpucount - g_physical_cpucount; +} + +#if (defined _WIN32 && !(defined __MINGW32__)) +static CpuSet get_smt_cpu_mask() +{ + CpuSet smt_cpu_mask; + + typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); + LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); + if (glpi == NULL) + { + NCNN_LOGE("GetLogicalProcessorInformation is not supported"); + return smt_cpu_mask; + } + + DWORD return_length = 0; + glpi(NULL, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); + glpi(buffer, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; + DWORD byte_offset = 0; + while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) + { + if (ptr->Relationship == RelationProcessorCore) + { + CpuSet smt_set; + smt_set.mask = ptr->ProcessorMask; + if (smt_set.num_enabled() > 1) + { + // this core is smt + smt_cpu_mask.mask |= smt_set.mask; + } + } + + byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + ptr++; + } + + free(buffer); + + return smt_cpu_mask; +} + +static std::vector get_max_freq_mhz() +{ + typedef struct _PROCESSOR_POWER_INFORMATION + { + ULONG Number; + ULONG MaxMhz; + ULONG CurrentMhz; + ULONG MhzLimit; + ULONG MaxIdleState; + ULONG CurrentIdleState; + } PROCESSOR_POWER_INFORMATION, *PPROCESSOR_POWER_INFORMATION; + + HMODULE powrprof = LoadLibrary(TEXT("powrprof.dll")); + + typedef LONG(WINAPI * LPFN_CNPI)(POWER_INFORMATION_LEVEL, PVOID, ULONG, PVOID, ULONG); + LPFN_CNPI cnpi = (LPFN_CNPI)GetProcAddress(powrprof, "CallNtPowerInformation"); + if (cnpi == NULL) + { + NCNN_LOGE("CallNtPowerInformation is not supported"); + FreeLibrary(powrprof); + return std::vector(g_cpucount, 0); + } + + DWORD return_length = sizeof(PROCESSOR_POWER_INFORMATION) * g_cpucount; + PPROCESSOR_POWER_INFORMATION buffer = (PPROCESSOR_POWER_INFORMATION)malloc(return_length); + + cnpi(ProcessorInformation, NULL, 0, buffer, return_length); + + std::vector ret; + for (int i = 0; i < g_cpucount; i++) + { + ULONG max_mhz = buffer[i].MaxMhz; + ret.push_back(max_mhz); + } + + free(buffer); + FreeLibrary(powrprof); + return ret; +} + +static int set_sched_affinity(const CpuSet& thread_affinity_mask) +{ + DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask); + if (prev_mask == 0) + { + NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError()); + return -1; + } + + return 0; +} +#endif // (defined _WIN32 && !(defined __MINGW32__)) + #if defined __ANDROID__ || defined __linux__ static int get_max_freq_khz(int cpuid) { @@ -1199,6 +1532,39 @@ static int get_max_freq_khz(int cpuid) return max_freq_khz; } +static bool is_smt_cpu(int cpuid) +{ + // https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-72 + char path[256]; + sprintf(path, 
"/sys/devices/system/cpu/cpu%d/topology/core_cpus_list", cpuid); + + FILE* fp = fopen(path, "rb"); + + if (!fp) + { + sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpuid); + fp = fopen(path, "rb"); + + if (!fp) + return false; + } + + bool is_smt = false; + while (!feof(fp)) + { + char ch = fgetc(fp); + if (ch == ',' || ch == '-') + { + is_smt = true; + break; + } + } + + fclose(fp); + + return is_smt; +} + static int set_sched_affinity(const CpuSet& thread_affinity_mask) { // set affinity for thread @@ -1289,7 +1655,48 @@ static int setup_thread_affinity_masks() { g_thread_affinity_mask_all.disable_all(); -#if defined __ANDROID__ || defined __linux__ +#if (defined _WIN32 && !(defined __MINGW32__)) + // get max freq mhz for all cores + int max_freq_mhz_min = INT_MAX; + int max_freq_mhz_max = 0; + std::vector cpu_max_freq_mhz = get_max_freq_mhz(); + for (int i = 0; i < g_cpucount; i++) + { + int max_freq_mhz = cpu_max_freq_mhz[i]; + + // NCNN_LOGE("%d max freq = %d khz", i, max_freq_mhz); + + if (max_freq_mhz > max_freq_mhz_max) + max_freq_mhz_max = max_freq_mhz; + if (max_freq_mhz < max_freq_mhz_min) + max_freq_mhz_min = max_freq_mhz; + } + + int max_freq_mhz_medium = (max_freq_mhz_min + max_freq_mhz_max) / 2; + if (max_freq_mhz_medium == max_freq_mhz_max) + { + g_thread_affinity_mask_little.disable_all(); + g_thread_affinity_mask_big = g_thread_affinity_mask_all; + return 0; + } + + CpuSet smt_cpu_mask = get_smt_cpu_mask(); + + for (int i = 0; i < g_cpucount; i++) + { + if (smt_cpu_mask.is_enabled(i)) + { + // always treat smt core as big core + g_thread_affinity_mask_big.enable(i); + continue; + } + + if (cpu_max_freq_mhz[i] < max_freq_mhz_medium) + g_thread_affinity_mask_little.enable(i); + else + g_thread_affinity_mask_big.enable(i); + } +#elif defined __ANDROID__ || defined __linux__ int max_freq_khz_min = INT_MAX; int max_freq_khz_max = 0; std::vector cpu_max_freq_khz(g_cpucount); @@ -1297,7 +1704,7 @@ static int setup_thread_affinity_masks() { int max_freq_khz = get_max_freq_khz(i); - // NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz); + // NCNN_LOGE("%d max freq = %d khz", i, max_freq_khz); cpu_max_freq_khz[i] = max_freq_khz; @@ -1317,6 +1724,13 @@ static int setup_thread_affinity_masks() for (int i = 0; i < g_cpucount; i++) { + if (is_smt_cpu(i)) + { + // always treat smt core as big core + g_thread_affinity_mask_big.enable(i); + continue; + } + if (cpu_max_freq_khz[i] < max_freq_khz_medium) g_thread_affinity_mask_little.enable(i); else @@ -1324,6 +1738,7 @@ static int setup_thread_affinity_masks() } #elif __APPLE__ // affinity info from cpu model + // TODO find a general way to get per-core frequency on macos if (g_hw_cpufamily == CPUFAMILY_ARM_MONSOON_MISTRAL) { // 2 + 4 @@ -1334,11 +1749,16 @@ static int setup_thread_affinity_masks() g_thread_affinity_mask_little.enable(4); g_thread_affinity_mask_little.enable(5); } - else if (g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD) + else if (g_hw_cpufamily == CPUFAMILY_ARM_VORTEX_TEMPEST + || g_hw_cpufamily == CPUFAMILY_ARM_LIGHTNING_THUNDER + || g_hw_cpufamily == CPUFAMILY_ARM_FIRESTORM_ICESTORM + || g_hw_cpufamily == CPUFAMILY_ARM_AVALANCHE_BLIZZARD + || g_hw_cpufamily == CPUFAMILY_ARM_EVEREST_SAWTOOTH) { - // 2 + 4 or 4 + 4 - if (get_cpu_count() == 6) + int cpu_count = get_cpu_count(); + if (cpu_count == 6) { + // 2 + 4 
g_thread_affinity_mask_big.enable(0); g_thread_affinity_mask_big.enable(1); g_thread_affinity_mask_little.enable(2); @@ -1346,8 +1766,9 @@ static int setup_thread_affinity_masks() g_thread_affinity_mask_little.enable(4); g_thread_affinity_mask_little.enable(5); } - else + else if (cpu_count == 8) { + // 4 + 4 g_thread_affinity_mask_big.enable(0); g_thread_affinity_mask_big.enable(1); g_thread_affinity_mask_big.enable(2); @@ -1357,6 +1778,42 @@ static int setup_thread_affinity_masks() g_thread_affinity_mask_little.enable(6); g_thread_affinity_mask_little.enable(7); } + else if (cpu_count == 10) + { + // 8 + 2 + g_thread_affinity_mask_big.enable(0); + g_thread_affinity_mask_big.enable(1); + g_thread_affinity_mask_big.enable(2); + g_thread_affinity_mask_big.enable(3); + g_thread_affinity_mask_big.enable(4); + g_thread_affinity_mask_big.enable(5); + g_thread_affinity_mask_big.enable(6); + g_thread_affinity_mask_big.enable(7); + g_thread_affinity_mask_little.enable(8); + g_thread_affinity_mask_little.enable(9); + } + else if (cpu_count == 20) + { + // 16 + 4 + g_thread_affinity_mask_big.enable(0); + g_thread_affinity_mask_big.enable(1); + g_thread_affinity_mask_big.enable(2); + g_thread_affinity_mask_big.enable(3); + g_thread_affinity_mask_big.enable(4); + g_thread_affinity_mask_big.enable(5); + g_thread_affinity_mask_big.enable(6); + g_thread_affinity_mask_big.enable(7); + g_thread_affinity_mask_big.enable(8); + g_thread_affinity_mask_big.enable(9); + g_thread_affinity_mask_big.enable(10); + g_thread_affinity_mask_big.enable(11); + g_thread_affinity_mask_big.enable(12); + g_thread_affinity_mask_big.enable(13); + g_thread_affinity_mask_big.enable(14); + g_thread_affinity_mask_big.enable(15); + g_thread_affinity_mask_little.enable(16); + g_thread_affinity_mask_little.enable(17); + } } else { @@ -1394,7 +1851,7 @@ const CpuSet& get_cpu_thread_affinity_mask(int powersave) int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask) { -#if defined __ANDROID__ || defined __linux__ +#if defined __ANDROID__ || defined __linux__ || (defined _WIN32 && !(defined __MINGW32__)) int num_threads = thread_affinity_mask.num_enabled(); #ifdef _OPENMP @@ -1584,3 +2041,21 @@ int set_flush_denormals(int flush_denormals) } } // namespace ncnn + +#if defined __ANDROID__ && defined(_OPENMP) && __clang__ +#ifdef __cplusplus +extern "C" { +#endif +void __wrap___kmp_affinity_determine_capable(const char* /*env_var*/) +{ + // the internal affinity routines in llvm openmp call abort on __NR_sched_getaffinity / __NR_sched_setaffinity fails + // ref KMPNativeAffinity::get_system_affinity/set_system_affinity in openmp/runtime/src/kmp_affinity.h + // and cpu core goes offline in powersave mode on android, which triggers abort + // ATM there is no known api for controlling the abort behavior + // override __kmp_affinity_determine_capable with empty body to disable affinity regardless of KMP_AFFINITY env_var + // ugly hack works >.< --- nihui +} +#ifdef __cplusplus +} // extern "C" +#endif +#endif diff --git a/src/cpu.h b/src/cpu.h index 5a94106ef478..0f748f33d97a 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -17,6 +17,10 @@ #include +#if (defined _WIN32 && !(defined __MINGW32__)) +#define WIN32_LEAN_AND_MEAN +#include +#endif #if defined __ANDROID__ || defined __linux__ #include // cpu_set_t #endif @@ -36,6 +40,9 @@ class NCNN_EXPORT CpuSet int num_enabled() const; public: +#if (defined _WIN32 && !(defined __MINGW32__)) + ULONG_PTR mask; +#endif #if defined __ANDROID__ || defined __linux__ cpu_set_t cpu_set; #endif @@ -93,6 
+100,11 @@ NCNN_EXPORT int cpu_support_x86_avx512_bf16(); // avx512_fp16 = x86 avx512 fp16 NCNN_EXPORT int cpu_support_x86_avx512_fp16(); +// lsx = loongarch lsx +NCNN_EXPORT int cpu_support_loongarch_lsx(); +// lasx = loongarch lasx +NCNN_EXPORT int cpu_support_loongarch_lasx(); + // msa = mips mas NCNN_EXPORT int cpu_support_mips_msa(); // mmi = loongson mmi @@ -110,6 +122,10 @@ NCNN_EXPORT int get_cpu_count(); NCNN_EXPORT int get_little_cpu_count(); NCNN_EXPORT int get_big_cpu_count(); +NCNN_EXPORT int get_physical_cpu_count(); +NCNN_EXPORT int get_physical_little_cpu_count(); +NCNN_EXPORT int get_physical_big_cpu_count(); + // bind all threads on little clusters if powersave enabled // affects HMP arch cpu like ARM big.LITTLE // only implemented on android at the moment diff --git a/src/layer.cpp b/src/layer.cpp index 518b666ec23f..953aebcd2bd7 100644 --- a/src/layer.cpp +++ b/src/layer.cpp @@ -253,6 +253,13 @@ Layer* create_layer(int index) } else #endif // NCNN_RUNTIME_CPU && NCNN_AVX +#if NCNN_RUNTIME_CPU && NCNN_LSX + if (ncnn::cpu_support_loongarch_lsx()) + { + layer_creator = layer_registry_lsx[index].creator; + } + else +#endif // NCNN_RUNTIME_CPU && NCNN_LSX #if NCNN_RUNTIME_CPU && NCNN_MSA if (ncnn::cpu_support_mips_msa()) { diff --git a/src/layer.h b/src/layer.h index 46fed5e456cd..d02f65bbca9b 100644 --- a/src/layer.h +++ b/src/layer.h @@ -96,10 +96,9 @@ class NCNN_EXPORT Layer bool support_reserved_7; bool support_reserved_8; bool support_reserved_9; - bool support_reserved_10; - bool support_reserved_11; - bool support_reserved_12; - bool support_reserved_13; + + // feature disabled set + int featmask; public: // implement inference diff --git a/src/layer/arm/cast_bf16.h b/src/layer/arm/cast_bf16.h index 468e5eca3efd..aa8223d73f57 100644 --- a/src/layer/arm/cast_bf16.h +++ b/src/layer/arm/cast_bf16.h @@ -150,7 +150,7 @@ static void cast_fp32_to_bf16_neon(const Mat& bottom_blob, Mat& top_blob, const static void cast_bf16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt) { -#if NCNN_ARM84BF16 && __aarch64__ && !__ARM_FEATURE_BF16_VECTOR_ARITHMETIC +#if NCNN_RUNTIME_CPU && NCNN_ARM84BF16 && __aarch64__ && !__ARM_FEATURE_BF16_VECTOR_ARITHMETIC if (ncnn::cpu_support_arm_bf16()) { cast_bf16_to_fp32_neon_bf16(bottom_blob, top_blob, opt); diff --git a/src/layer/arm/cast_fp16.h b/src/layer/arm/cast_fp16.h index bb326a970720..7e6db748aec2 100644 --- a/src/layer/arm/cast_fp16.h +++ b/src/layer/arm/cast_fp16.h @@ -47,12 +47,12 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const { #if __aarch64__ asm volatile( - "prfm pldl1keep, [%0, #512] \n" + "prfm pldl1keep, [%0, #512] \n" "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n" - "fcvtn v0.4h, v0.4s \n" - "fcvtn v1.4h, v1.4s \n" - "fcvtn v2.4h, v2.4s \n" - "fcvtn v3.4h, v3.4s \n" + "fcvtn v0.4h, v0.4s \n" + "fcvtn v1.4h, v1.4s \n" + "fcvtn v2.4h, v2.4s \n" + "fcvtn v3.4h, v3.4s \n" "st1 {v0.4h, v1.4h, v2.4h, v3.4h}, [%1], #32 \n" : "=r"(ptr), // %0 "=r"(outptr) // %1 @@ -61,12 +61,12 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const : "memory", "v0", "v1", "v2", "v3"); #else // __aarch64__ asm volatile( - "pld [%0, #512] \n" - "vldm %0!, {d0-d7} \n" - "vcvt.f16.f32 d0, q0 \n" - "vcvt.f16.f32 d1, q1 \n" - "vcvt.f16.f32 d2, q2 \n" - "vcvt.f16.f32 d3, q3 \n" + "pld [%0, #512] \n" + "vldm %0!, {d0-d7} \n" + "vcvt.f16.f32 d0, q0 \n" + "vcvt.f16.f32 d1, q1 \n" + "vcvt.f16.f32 d2, q2 \n" + "vcvt.f16.f32 d3, q3 \n" "vst1.u16 {d0-d3}, [%1 :128]! 
\n" : "=r"(ptr), // %0 "=r"(outptr) // %1 @@ -77,24 +77,61 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const } for (; i + 7 < size; i += 8) { - float32x4_t _p0_fp32 = vld1q_f32(ptr); - float32x4_t _p1_fp32 = vld1q_f32(ptr + 4); - float16x4_t _p0_fp16 = vcvt_f16_f32(_p0_fp32); - float16x4_t _p1_fp16 = vcvt_f16_f32(_p1_fp32); - uint16x8_t _p_fp16 = vcombine_u16(vreinterpret_u16_f16(_p0_fp16), vreinterpret_u16_f16(_p1_fp16)); - vst1q_u16(outptr, _p_fp16); - ptr += 8; - outptr += 8; + // This is originally implemented with neon fp16 intrinsics. + // In the new version of gcc, __ARM_FP16_FORMAT_IEEE or __ARM_FP16_FORMAT_ALTERNATIVE needs to be defined to use the float16x4_t type. + // That leads to compiler error when compiled with -mfpu=neon-vfpv4 but without -mfp16-format=ieee flag. + // We could add more macro conditions to differentiate between old and new versions, but that's pretty ugly! + // Just use all inline assembly here ~ + // --- nihui +#if __aarch64__ + asm volatile( + "ld1 {v0.4s, v1.4s}, [%0], #32 \n" + "fcvtn v0.4h, v0.4s \n" + "fcvtn v1.4h, v1.4s \n" + "st1 {v0.4h, v1.4h}, [%1], #16 \n" + : "=r"(ptr), // %0 + "=r"(outptr) // %1 + : "0"(ptr), + "1"(outptr) + : "memory", "v0", "v1"); +#else // __aarch64__ + asm volatile( + "vld1.f32 {d0-d3}, [%0]! \n" + "vcvt.f16.f32 d0, q0 \n" + "vcvt.f16.f32 d1, q1 \n" + "vst1.u16 {d0-d1}, [%1]! \n" + : "=r"(ptr), // %0 + "=r"(outptr) // %1 + : "0"(ptr), + "1"(outptr) + : "memory", "q0", "q1"); +#endif // __aarch64__ } for (; i + 3 < size; i += 4) { - float32x4_t _p_fp32 = vld1q_f32(ptr); - float16x4_t _p_fp16 = vcvt_f16_f32(_p_fp32); - vst1_u16(outptr, vreinterpret_u16_f16(_p_fp16)); - ptr += 4; - outptr += 4; +#if __aarch64__ + asm volatile( + "ld1 {v0.4s}, [%0], #16 \n" + "fcvtn v0.4h, v0.4s \n" + "st1 {v0.4h}, [%1], #8 \n" + : "=r"(ptr), // %0 + "=r"(outptr) // %1 + : "0"(ptr), + "1"(outptr) + : "memory", "v0"); +#else // __aarch64__ + asm volatile( + "vld1.f32 {d0-d1}, [%0]! \n" + "vcvt.f16.f32 d0, q0 \n" + "vst1.u16 {d0}, [%1]! \n" + : "=r"(ptr), // %0 + "=r"(outptr) // %1 + : "0"(ptr), + "1"(outptr) + : "memory", "q0"); +#endif // __aarch64__ } -#endif +#endif // (__ARM_FP & 2) for (; i < size; i++) { *outptr++ = float32_to_float16(*ptr++); @@ -104,7 +141,7 @@ static void cast_fp32_to_fp16_neon(const Mat& bottom_blob, Mat& top_blob, const static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt) { -#if NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2) +#if NCNN_RUNTIME_CPU && NCNN_VFPV4 && __ARM_NEON && !(__ARM_FP & 2) if (ncnn::cpu_support_arm_vfpv4()) { cast_fp16_to_fp32_neon_vfpv4(bottom_blob, top_blob, opt); @@ -132,12 +169,12 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const { #if __aarch64__ asm volatile( - "prfm pldl1keep, [%0, #256] \n" + "prfm pldl1keep, [%0, #256] \n" "ld1 {v0.4h, v1.4h, v2.4h, v3.4h}, [%0], #32 \n" - "fcvtl v0.4s, v0.4h \n" - "fcvtl v1.4s, v1.4h \n" - "fcvtl v2.4s, v2.4h \n" - "fcvtl v3.4s, v3.4h \n" + "fcvtl v0.4s, v0.4h \n" + "fcvtl v1.4s, v1.4h \n" + "fcvtl v2.4s, v2.4h \n" + "fcvtl v3.4s, v3.4h \n" "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n" : "=r"(ptr), // %0 "=r"(outptr) // %1 @@ -146,13 +183,13 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const : "memory", "v0", "v1", "v2", "v3"); #else // __aarch64__ asm volatile( - "pld [%0, #256] \n" + "pld [%0, #256] \n" "vld1.u16 {d4-d7}, [%0 :128]! 
\n" - "vcvt.f32.f16 q0, d4 \n" - "vcvt.f32.f16 q1, d5 \n" - "vcvt.f32.f16 q2, d6 \n" - "vcvt.f32.f16 q3, d7 \n" - "vstm %1!, {d0-d7} \n" + "vcvt.f32.f16 q0, d4 \n" + "vcvt.f32.f16 q1, d5 \n" + "vcvt.f32.f16 q2, d6 \n" + "vcvt.f32.f16 q3, d7 \n" + "vstm %1!, {d0-d7} \n" : "=r"(ptr), // %0 "=r"(outptr) // %1 : "0"(ptr), @@ -162,25 +199,55 @@ static void cast_fp16_to_fp32_neon(const Mat& bottom_blob, Mat& top_blob, const } for (; i + 7 < size; i += 8) { - uint16x8_t _p_fp16 = vld1q_u16(ptr); - float16x4_t _p0_fp16 = vreinterpret_f16_u16(vget_low_u16(_p_fp16)); - float16x4_t _p1_fp16 = vreinterpret_f16_u16(vget_high_u16(_p_fp16)); - float32x4_t _p0_fp32 = vcvt_f32_f16(_p0_fp16); - float32x4_t _p1_fp32 = vcvt_f32_f16(_p1_fp16); - vst1q_f32(outptr, _p0_fp32); - vst1q_f32(outptr + 4, _p1_fp32); - ptr += 8; - outptr += 8; +#if __aarch64__ + asm volatile( + "ld1 {v0.4h, v1.4h}, [%0], #16 \n" + "fcvtl v0.4s, v0.4h \n" + "fcvtl v1.4s, v1.4h \n" + "st1 {v0.4s, v1.4s}, [%1], #32 \n" + : "=r"(ptr), // %0 + "=r"(outptr) // %1 + : "0"(ptr), + "1"(outptr) + : "memory", "v0", "v1"); +#else // __aarch64__ + asm volatile( + "vld1.u16 {d4-d5}, [%0]! \n" + "vcvt.f32.f16 q0, d4 \n" + "vcvt.f32.f16 q1, d5 \n" + "vst1.f32 {d0-d3}, [%1]! \n" + : "=r"(ptr), // %0 + "=r"(outptr) // %1 + : "0"(ptr), + "1"(outptr) + : "memory", "q0", "q1", "q2"); +#endif // __aarch64__ } for (; i + 3 < size; i += 4) { - float16x4_t _p_fp16 = vreinterpret_f16_u16(vld1_u16(ptr)); - float32x4_t _p_fp32 = vcvt_f32_f16(_p_fp16); - vst1q_f32(outptr, _p_fp32); - ptr += 4; - outptr += 4; +#if __aarch64__ + asm volatile( + "ld1 {v0.4h}, [%0], #8 \n" + "fcvtl v0.4s, v0.4h \n" + "st1 {v0.4s}, [%1], #16 \n" + : "=r"(ptr), // %0 + "=r"(outptr) // %1 + : "0"(ptr), + "1"(outptr) + : "memory", "v0"); +#else // __aarch64__ + asm volatile( + "vld1.u16 {d2}, [%0]! \n" + "vcvt.f32.f16 q0, d2 \n" + "vst1.f32 {d0-d1}, [%1]! 
\n" + : "=r"(ptr), // %0 + "=r"(outptr) // %1 + : "0"(ptr), + "1"(outptr) + : "memory", "q0", "q1"); +#endif // __aarch64__ } -#endif +#endif // (__ARM_FP & 2) for (; i < size; i++) { *outptr++ = float16_to_float32(*ptr++); diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp index 1f8f0c1cb485..f31ed1576ca9 100644 --- a/src/layer/arm/convolution_arm.cpp +++ b/src/layer/arm/convolution_arm.cpp @@ -387,10 +387,8 @@ int Convolution_arm::create_pipeline(const Option& opt) // conv3x3s1_winograd63_transform_kernel_neon(weight_data, weight_winograd63_data, num_input, num_output, opt); conv3x3s1_winograd63_transform_kernel_neon5(weight_data, weight_winograd63_data, num_input, num_output, opt); } - else - { - weight_data_tm = weight_data; - } + + weight_data_tm = weight_data; } else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) { diff --git a/src/layer/arm/innerproduct_fp16s.h b/src/layer/arm/innerproduct_fp16s.h index 18214bc91fec..31edd9ed64a5 100644 --- a/src/layer/arm/innerproduct_fp16s.h +++ b/src/layer/arm/innerproduct_fp16s.h @@ -253,10 +253,10 @@ static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, float32x4_t _val = vld1q_f32(sptr); uint16x8_t _w01 = vld1q_u16(kptr); uint16x8_t _w23 = vld1q_u16(kptr + 8); - float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w01))); - float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w01))); - float32x4_t _w2 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w23))); - float32x4_t _w3 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w23))); + float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01))); + float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01))); + float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w23))); + float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w23))); #endif #if __aarch64__ @@ -281,7 +281,7 @@ static void innerproduct_pack4_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr)); #else - float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); + float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); #endif _sum0 = vfmaq_f32(_sum0, _val, _w); @@ -410,10 +410,10 @@ static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const float32x4_t _w3 = vcvt_f32_f16(vld1_f16(kptr3)); #else float32x4_t _val = vld1q_f32(sptr); - float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr0))); - float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr1))); - float32x4_t _w2 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr2))); - float32x4_t _w3 = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr3))); + float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr0))); + float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr1))); + float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr2))); + float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr3))); #endif _sum0 = vfmaq_f32(_sum0, _val, _w0); @@ -507,7 +507,7 @@ static void innerproduct_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, const float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr)); #else float32x4_t _val = vld1q_f32(sptr); - float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); + float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); #endif _sum = vfmaq_f32(_sum, _val, _w); @@ -713,10 +713,10 @@ static void 
innerproduct_transform_kernel_fp16s_neon(const Mat& weight_data, Mat { // transpose 4x4 uint16x4x4_t _p; - _p.val[0] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k0))); - _p.val[1] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k1))); - _p.val[2] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k2))); - _p.val[3] = vreinterpret_u16_f16(vcvt_f16_f32(vld1q_f32(k3))); + _p.val[0] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k0))); + _p.val[1] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k1))); + _p.val[2] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k2))); + _p.val[3] = (uint16x4_t)(vcvt_f16_f32(vld1q_f32(k3))); vst4_u16(g0, _p); k0 += 4; diff --git a/src/layer/arm/innerproduct_gemm_fp16s.h b/src/layer/arm/innerproduct_gemm_fp16s.h index 8e6731dc847c..f7daa17da39b 100644 --- a/src/layer/arm/innerproduct_gemm_fp16s.h +++ b/src/layer/arm/innerproduct_gemm_fp16s.h @@ -120,7 +120,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr)); #else float32x4_t _val = vld1q_f32(m); - float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); + float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); #endif #if __aarch64__ @@ -214,10 +214,10 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, float32x4_t _val = vld1q_f32(m); uint16x8_t _w01 = vld1q_u16(kptr); uint16x8_t _w23 = vld1q_u16(kptr + 8); - float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w01))); - float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w01))); - float32x4_t _w2 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w23))); - float32x4_t _w3 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w23))); + float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01))); + float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01))); + float32x4_t _w2 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w23))); + float32x4_t _w3 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w23))); #endif #if __aarch64__ @@ -242,7 +242,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr)); #else - float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); + float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); #endif _sum0 = vfmaq_f32(_sum0, _val, _w); @@ -317,7 +317,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, float32x4_t _val1 = vld1q_f32(m + 4); float32x4_t _val2 = vld1q_f32(m + 8); float32x4_t _val3 = vld1q_f32(m + 12); - float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); + float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); #endif #if __aarch64__ @@ -414,8 +414,8 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, float32x4_t _val0 = vld1q_f32(m); float32x4_t _val1 = vld1q_f32(m + 4); uint16x8_t _w01 = vld1q_u16(kptr); - float32x4_t _w0 = vcvt_f32_f16(vreinterpret_f16_u16(vget_low_u16(_w01))); - float32x4_t _w1 = vcvt_f32_f16(vreinterpret_f16_u16(vget_high_u16(_w01))); + float32x4_t _w0 = vcvt_f32_f16((float16x4_t)(vget_low_u16(_w01))); + float32x4_t _w1 = vcvt_f32_f16((float16x4_t)(vget_high_u16(_w01))); #endif _sum0 = vfmaq_f32(_sum0, _val0, _w0); @@ -433,7 +433,7 @@ static void innerproduct_gemm_fp16s_neon(const Mat& bottom_blob, Mat& top_blob, float32x4_t _w = vcvt_f32_f16(vld1_f16(kptr)); #else float32x4_t _val = vld1q_f32(m); - float32x4_t _w = vcvt_f32_f16(vreinterpret_f16_u16(vld1_u16(kptr))); + 
float32x4_t _w = vcvt_f32_f16((float16x4_t)(vld1_u16(kptr))); #endif _sum0 = vfmaq_f32(_sum0, _val, _w); diff --git a/src/layer/arm/lstm_arm.cpp b/src/layer/arm/lstm_arm.cpp index 440c7bc8ce8e..075da57aff80 100644 --- a/src/layer/arm/lstm_arm.cpp +++ b/src/layer/arm/lstm_arm.cpp @@ -58,11 +58,11 @@ int LSTM_arm::create_pipeline(const Option& opt) // pack IFOG int num_directions = direction == 2 ? 2 : 1; - int size = weight_data_size / num_directions / num_output / 4; + int size = weight_data_size / num_directions / hidden_size / 4; - weight_xc_data_packed.create(size, num_output, num_directions, 16u, 4); - bias_c_data_packed.create(num_output, 1, num_directions, 16u, 4); - weight_hc_data_packed.create(num_output, num_output, num_directions, 16u, 4); + weight_xc_data_packed.create(size, hidden_size, num_directions, 16u, 4); + bias_c_data_packed.create(hidden_size, 1, num_directions, 16u, 4); + weight_hc_data_packed.create(num_output, hidden_size, num_directions, 16u, 4); #pragma omp parallel for num_threads(opt.num_threads) for (int dr = 0; dr < num_directions; dr++) @@ -82,7 +82,7 @@ int LSTM_arm::create_pipeline(const Option& opt) float* bias_c_IFOG = bias_c_data_packed_dr.row(0); - for (int q = 0; q < num_output; q++) + for (int q = 0; q < hidden_size; q++) { bias_c_IFOG[0] = bias_c_I[q]; bias_c_IFOG[1] = bias_c_F[q]; @@ -91,15 +91,15 @@ int LSTM_arm::create_pipeline(const Option& opt) bias_c_IFOG += 4; - const float* weight_xc_I = weight_xc.row(num_output * 0 + q); - const float* weight_xc_F = weight_xc.row(num_output * 1 + q); - const float* weight_xc_O = weight_xc.row(num_output * 2 + q); - const float* weight_xc_G = weight_xc.row(num_output * 3 + q); + const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q); + const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q); + const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q); + const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q); - const float* weight_hc_I = weight_hc.row(num_output * 0 + q); - const float* weight_hc_F = weight_hc.row(num_output * 1 + q); - const float* weight_hc_O = weight_hc.row(num_output * 2 + q); - const float* weight_hc_G = weight_hc.row(num_output * 3 + q); + const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q); + const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q); + const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q); + const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q); float* weight_xc_IFOG = weight_xc_data_packed_dr.row(q); float* weight_hc_IFOG = weight_hc_data_packed_dr.row(q); @@ -126,21 +126,37 @@ int LSTM_arm::create_pipeline(const Option& opt) } } + if (opt.lightmode) + { + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); + } + return 0; } -static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, Mat& cell_state, const Option& opt) +static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt) { int size = bottom_blob.w; int T = bottom_blob.h; int num_output = top_blob.w; + int hidden_size = cell_state.w; - // 4 x num_output - Mat gates(4, num_output, 4u, opt.workspace_allocator); + // 4 x hidden_size + Mat gates(4, hidden_size, 4u, opt.workspace_allocator); if (gates.empty()) return -100; + Mat tmp_hidden_state; + if (num_output != hidden_size) + { + 
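// (sketch of the data flow, not part of the patch) num_output != hidden_size means
// the LSTM carries a projection (proj_size): each step first writes the
// hidden_size-wide gate output H into this scratch buffer and then reduces it to
// num_output values with weight_hr,
//
//     hidden[q] = output[q] = sum over i < hidden_size of tmp_hidden[i] * weight_hr[q][i]
//
// which is what the trailing num_output loop computes; without a projection the
// buffer is never allocated and H is stored into hidden_state/output directly.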
tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator); + if (tmp_hidden_state.empty()) + return -100; + } + // unroll for (int t = 0; t < T; t++) { @@ -155,7 +171,7 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w const float* x = bottom_blob.row(ti); #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < num_output; q++) + for (int q = 0; q < hidden_size; q++) { const float* bias_c_IFOG = (const float*)bias_c + q * 4; @@ -291,14 +307,15 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w float* cell_ptr = cell_state; float* hidden_ptr = hidden_state; + float* tmp_hidden_ptr = tmp_hidden_state; - int remain_num_output_start = 0; + int remain_hidden_size_start = 0; #if __ARM_NEON - int nn_num_output = num_output >> 2; - remain_num_output_start = nn_num_output << 2; + int nn_hidden_size = hidden_size >> 2; + remain_hidden_size_start = nn_hidden_size << 2; #pragma omp parallel for num_threads(opt.num_threads) - for (int qq = 0; qq < nn_num_output; qq++) + for (int qq = 0; qq < nn_hidden_size; qq++) { int q = qq * 4; @@ -315,12 +332,20 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w float32x4_t _H = vmulq_f32(_O, tanh_ps(_cell2)); vst1q_f32(cell_ptr + q, _cell2); - vst1q_f32(hidden_ptr + q, _H); - vst1q_f32(output_data + q, _H); + + if (num_output == hidden_size) + { + vst1q_f32(hidden_ptr + q, _H); + vst1q_f32(output_data + q, _H); + } + else + { + vst1q_f32(tmp_hidden_ptr + q, _H); + } } #endif // __ARM_NEON #pragma omp parallel for num_threads(opt.num_threads) - for (int q = remain_num_output_start; q < num_output; q++) + for (int q = remain_hidden_size_start; q < hidden_size; q++) { const float* gates_data = gates.row(q); @@ -338,8 +363,43 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w float H = O * tanh(cell2); cell_ptr[q] = cell2; - hidden_ptr[q] = H; - output_data[q] = H; + if (num_output == hidden_size) + { + hidden_ptr[q] = H; + output_data[q] = H; + } + else + { + tmp_hidden_ptr[q] = H; + } + } + + if (num_output != hidden_size) + { + // int nn_num_output = num_output >> 2; + // int remain_num_output_start = nn_num_output << 2; + // #pragma omp parallel for num_threads(opt.num_threads) + // for (int qq = 0; qq < nn_num_output; qq++) + // { + // int q = qq * 4; + // + // } + int remain_num_output_start = 0; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = remain_num_output_start; q < num_output; q++) + { + const float* hr = weight_hr.row(q); + const float* tmp_hidden_ptr = tmp_hidden_state; + + float H = 0; + for (int i = 0; i < hidden_size; i++) + { + H += tmp_hidden_ptr[i] * hr[i]; + } + + hidden_ptr[q] = H; + output_data[q] = H; + } } } @@ -375,7 +435,7 @@ int LSTM_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) return -100; hidden.fill(0.f); - Mat cell(num_output, 4u, opt.workspace_allocator); + Mat cell(hidden_size, 4u, opt.workspace_allocator); if (cell.empty()) return -100; cell.fill(0.f); @@ -387,7 +447,7 @@ int LSTM_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) // Uni directional if (direction == 0 || direction == 1) { - int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, cell, opt); + int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), 
num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret != 0) return ret; } @@ -402,14 +462,14 @@ int LSTM_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (top_blob_reverse.empty()) return -100; - int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, cell, opt); + int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret0 != 0) return ret0; hidden.fill(0.0f); cell.fill(0.0f); - int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden, cell, opt); + int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden, cell, opt); if (ret1 != 0) return ret1; @@ -466,7 +526,7 @@ int LSTM_arm::forward(const std::vector& bottom_blobs, std::vector& to return -100; hidden.fill(0.f); - cell.create(num_output, num_directions, 4u, hidden_cell_allocator); + cell.create(hidden_size, num_directions, 4u, hidden_cell_allocator); if (cell.empty()) return -100; cell.fill(0.f); @@ -480,7 +540,7 @@ int LSTM_arm::forward(const std::vector& bottom_blobs, std::vector& to // Uni directional if (direction == 0 || direction == 1) { - int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, cell, opt); + int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret != 0) return ret; } @@ -497,13 +557,13 @@ int LSTM_arm::forward(const std::vector& bottom_blobs, std::vector& to Mat hidden0 = hidden.row_range(0, 1); Mat cell0 = cell.row_range(0, 1); - int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden0, cell0, opt); + int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden0, cell0, opt); if (ret0 != 0) return ret0; Mat hidden1 = hidden.row_range(1, 1); Mat cell1 = cell.row_range(1, 1); - int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden1, cell1, opt); + int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? 
Mat() : weight_hr_data.channel(1), hidden1, cell1, opt); if (ret1 != 0) return ret1; @@ -529,18 +589,27 @@ int LSTM_arm::forward(const std::vector& bottom_blobs, std::vector& to } #if NCNN_BF16 -static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, Mat& cell_state, const Option& opt) +static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt) { int size = bottom_blob.w; int T = bottom_blob.h; int num_output = top_blob.w; + int hidden_size = cell_state.w; - // 4 x num_output - Mat gates(4, num_output, 4u, opt.workspace_allocator); + // 4 x hidden_size + Mat gates(4, hidden_size, 4u, opt.workspace_allocator); if (gates.empty()) return -100; + Mat tmp_hidden_state; + if (num_output != hidden_size) + { + tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator); + if (tmp_hidden_state.empty()) + return -100; + } + // unroll for (int t = 0; t < T; t++) { @@ -555,7 +624,7 @@ static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const const unsigned short* x = bottom_blob.row(ti); #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < num_output; q++) + for (int q = 0; q < hidden_size; q++) { const unsigned short* bias_c_IFOG = (const unsigned short*)bias_c + q * 4; @@ -693,14 +762,15 @@ static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const float* cell_ptr = cell_state; float* hidden_ptr = hidden_state; + float* tmp_hidden_ptr = tmp_hidden_state; - int remain_num_output_start = 0; + int remain_hidden_size_start = 0; #if __ARM_NEON - int nn_num_output = num_output >> 2; - remain_num_output_start = nn_num_output << 2; + int nn_hidden_size = hidden_size >> 2; + remain_hidden_size_start = nn_hidden_size << 2; #pragma omp parallel for num_threads(opt.num_threads) - for (int qq = 0; qq < nn_num_output; qq++) + for (int qq = 0; qq < nn_hidden_size; qq++) { int q = qq * 4; @@ -717,12 +787,20 @@ static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const float32x4_t _H = vmulq_f32(_O, tanh_ps(_cell2)); vst1q_f32(cell_ptr + q, _cell2); - vst1q_f32(hidden_ptr + q, _H); - vst1_u16(output_data + q, bfloat2float(_H)); + + if (num_output == hidden_size) + { + vst1q_f32(hidden_ptr + q, _H); + vst1_u16(output_data + q, bfloat2float(_H)); + } + else + { + vst1q_f32(tmp_hidden_ptr + q, _H); + } } #endif // __ARM_NEON #pragma omp parallel for num_threads(opt.num_threads) - for (int q = remain_num_output_start; q < num_output; q++) + for (int q = remain_hidden_size_start; q < hidden_size; q++) { const float* gates_data = gates.row(q); @@ -740,8 +818,43 @@ static int lstm_bf16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const float H = O * tanh(cell2); cell_ptr[q] = cell2; - hidden_ptr[q] = H; - output_data[q] = float32_to_bfloat16(H); + if (num_output == hidden_size) + { + hidden_ptr[q] = H; + output_data[q] = float32_to_bfloat16(H); + } + else + { + tmp_hidden_ptr[q] = H; + } + } + + if (num_output != hidden_size) + { + // int nn_num_output = num_output >> 2; + // int remain_num_output_start = nn_num_output << 2; + // #pragma omp parallel for num_threads(opt.num_threads) + // for (int qq = 0; qq < nn_num_output; qq++) + // { + // int q = qq * 4; + // + // } + int remain_num_output_start = 0; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 
remain_num_output_start; q < num_output; q++) + { + const float* hr = weight_hr.row(q); + const float* tmp_hidden_ptr = tmp_hidden_state; + + float H = 0; + for (int i = 0; i < hidden_size; i++) + { + H += tmp_hidden_ptr[i] * hr[i]; + } + + hidden_ptr[q] = H; + output_data[q] = float32_to_bfloat16(H); + } } } @@ -752,11 +865,11 @@ int LSTM_arm::create_pipeline_bf16s(const Option& opt) { // pack IFOG int num_directions = direction == 2 ? 2 : 1; - int size = weight_data_size / num_directions / num_output / 4; + int size = weight_data_size / num_directions / hidden_size / 4; - weight_xc_data_packed.create(size, num_output, num_directions, 8u, 4); - bias_c_data_packed.create(num_output, 1, num_directions, 8u, 4); - weight_hc_data_packed.create(num_output, num_output, num_directions, 8u, 4); + weight_xc_data_packed.create(size, hidden_size, num_directions, 8u, 4); + bias_c_data_packed.create(hidden_size, 1, num_directions, 8u, 4); + weight_hc_data_packed.create(num_output, hidden_size, num_directions, 8u, 4); #pragma omp parallel for num_threads(opt.num_threads) for (int dr = 0; dr < num_directions; dr++) @@ -776,7 +889,7 @@ int LSTM_arm::create_pipeline_bf16s(const Option& opt) unsigned short* bias_c_IFOG = bias_c_data_packed_dr.row(0); - for (int q = 0; q < num_output; q++) + for (int q = 0; q < hidden_size; q++) { bias_c_IFOG[0] = float32_to_bfloat16(bias_c_I[q]); bias_c_IFOG[1] = float32_to_bfloat16(bias_c_F[q]); @@ -785,15 +898,15 @@ int LSTM_arm::create_pipeline_bf16s(const Option& opt) bias_c_IFOG += 4; - const float* weight_xc_I = weight_xc.row(num_output * 0 + q); - const float* weight_xc_F = weight_xc.row(num_output * 1 + q); - const float* weight_xc_O = weight_xc.row(num_output * 2 + q); - const float* weight_xc_G = weight_xc.row(num_output * 3 + q); + const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q); + const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q); + const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q); + const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q); - const float* weight_hc_I = weight_hc.row(num_output * 0 + q); - const float* weight_hc_F = weight_hc.row(num_output * 1 + q); - const float* weight_hc_O = weight_hc.row(num_output * 2 + q); - const float* weight_hc_G = weight_hc.row(num_output * 3 + q); + const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q); + const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q); + const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q); + const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q); unsigned short* weight_xc_IFOG = weight_xc_data_packed_dr.row(q); unsigned short* weight_hc_IFOG = weight_hc_data_packed_dr.row(q); @@ -820,6 +933,13 @@ int LSTM_arm::create_pipeline_bf16s(const Option& opt) } } + if (opt.lightmode) + { + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); + } + return 0; } @@ -835,7 +955,7 @@ int LSTM_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& return -100; hidden.fill(0.f); - Mat cell(num_output, 4u, opt.workspace_allocator); + Mat cell(hidden_size, 4u, opt.workspace_allocator); if (cell.empty()) return -100; cell.fill(0.f); @@ -847,7 +967,7 @@ int LSTM_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& // Uni directional if (direction == 0 || direction == 1) { - int ret = lstm_bf16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, cell, opt); + int ret = 
lstm_bf16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret != 0) return ret; } @@ -862,14 +982,14 @@ int LSTM_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& if (top_blob_reverse.empty()) return -100; - int ret0 = lstm_bf16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, cell, opt); + int ret0 = lstm_bf16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret0 != 0) return ret0; hidden.fill(0.f); cell.fill(0.f); - int ret1 = lstm_bf16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden, cell, opt); + int ret1 = lstm_bf16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden, cell, opt); if (ret1 != 0) return ret1; @@ -911,7 +1031,7 @@ int LSTM_arm::forward_bf16s(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector(ti); #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < num_output; q++) + for (int q = 0; q < hidden_size; q++) { const __fp16* bias_c_IFOG = (const __fp16*)bias_c + q * 4; @@ -141,11 +150,12 @@ static int lstm_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const float* cell_ptr = cell_state; float* hidden_ptr = hidden_state; + float* tmp_hidden_ptr = tmp_hidden_state; - int nn_num_output = num_output >> 2; - int remain_num_output_start = nn_num_output << 2; + int nn_hidden_size = hidden_size >> 2; + int remain_hidden_size_start = nn_hidden_size << 2; #pragma omp parallel for num_threads(opt.num_threads) - for (int qq = 0; qq < nn_num_output; qq++) + for (int qq = 0; qq < nn_hidden_size; qq++) { int q = qq * 4; @@ -162,11 +172,19 @@ static int lstm_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const float32x4_t _H = vmulq_f32(_O, tanh_ps(_cell2)); vst1q_f32(cell_ptr + q, _cell2); - vst1q_f32(hidden_ptr + q, _H); - vst1_f16(output_data + q, vcvt_f16_f32(_H)); + + if (num_output == hidden_size) + { + vst1q_f32(hidden_ptr + q, _H); + vst1_f16(output_data + q, vcvt_f16_f32(_H)); + } + else + { + vst1q_f32(tmp_hidden_ptr + q, _H); + } } #pragma omp parallel for num_threads(opt.num_threads) - for (int q = remain_num_output_start; q < num_output; q++) + for (int q = remain_hidden_size_start; q < hidden_size; q++) { const float* gates_data = gates.row(q); @@ -184,26 +202,70 @@ static int lstm_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const float H = O * tanh(cell2); cell_ptr[q] = cell2; - hidden_ptr[q] = H; - output_data[q] = (__fp16)(H); + if (num_output == hidden_size) + { + hidden_ptr[q] = H; + output_data[q] = (__fp16)H; + } + else + { + tmp_hidden_ptr[q] = H; + } + } + + if (num_output != hidden_size) + { + // int nn_num_output = num_output >> 2; + // int remain_num_output_start = nn_num_output << 2; + // #pragma omp parallel for num_threads(opt.num_threads) + // for (int qq = 0; qq < nn_num_output; qq++) + // { + // int q = qq * 4; + // + // } + int remain_num_output_start = 0; + 
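// (illustrative only, not part of this change) the commented-out block above marks
// where a packed variant of the projection could go; one option on __aarch64__ is
// to vectorize the inner dot product against weight_hr four floats at a time:
//
//     float32x4_t _sum = vdupq_n_f32(0.f);
//     int i = 0;
//     for (; i + 3 < hidden_size; i += 4)
//         _sum = vfmaq_f32(_sum, vld1q_f32(tmp_hidden_ptr + i), vld1q_f32(hr + i));
//     float H = vaddvq_f32(_sum);
//     for (; i < hidden_size; i++)
//         H += tmp_hidden_ptr[i] * hr[i];
//
// the scalar loop below stays as the reference implementation.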
#pragma omp parallel for num_threads(opt.num_threads) + for (int q = remain_num_output_start; q < num_output; q++) + { + const float* hr = weight_hr.row(q); + const float* tmp_hidden_ptr = tmp_hidden_state; + + float H = 0; + for (int i = 0; i < hidden_size; i++) + { + H += tmp_hidden_ptr[i] * hr[i]; + } + + hidden_ptr[q] = H; + output_data[q] = (__fp16)H; + } } } return 0; } -static int lstm_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, Mat& cell_state, const Option& opt) +static int lstm_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt) { int size = bottom_blob.w; int T = bottom_blob.h; int num_output = top_blob.w; + int hidden_size = cell_state.w; - // 4 x num_output - Mat gates(4, num_output, 2u, opt.workspace_allocator); + // 4 x hidden_size + Mat gates(4, hidden_size, 2u, opt.workspace_allocator); if (gates.empty()) return -100; + Mat tmp_hidden_state; + if (num_output != hidden_size) + { + tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator); + if (tmp_hidden_state.empty()) + return -100; + } + // unroll for (int t = 0; t < T; t++) { @@ -216,10 +278,10 @@ static int lstm_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const int ti = reverse ? T - 1 - t : t; - int nn_num_output = num_output >> 1; - int remain_num_output_start = nn_num_output << 1; + int nn_hidden_size = hidden_size >> 1; + int remain_hidden_size_start = nn_hidden_size << 1; #pragma omp parallel for num_threads(opt.num_threads) - for (int qq = 0; qq < nn_num_output; qq++) + for (int qq = 0; qq < nn_hidden_size; qq++) { int q = qq * 2; @@ -319,7 +381,7 @@ static int lstm_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const vst1q_f16(gates_data, _IFOG); } #pragma omp parallel for num_threads(opt.num_threads) - for (int q = remain_num_output_start; q < num_output; q++) + for (int q = remain_hidden_size_start; q < hidden_size; q++) { const __fp16* bias_c_IFOG = (const __fp16*)bias_c + q * 4; @@ -428,11 +490,12 @@ static int lstm_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const float* cell_ptr = cell_state; float* hidden_ptr = hidden_state; + float* tmp_hidden_ptr = tmp_hidden_state; - nn_num_output = num_output >> 2; - remain_num_output_start = nn_num_output << 2; + nn_hidden_size = hidden_size >> 2; + remain_hidden_size_start = nn_hidden_size << 2; #pragma omp parallel for num_threads(opt.num_threads) - for (int qq = 0; qq < nn_num_output; qq++) + for (int qq = 0; qq < nn_hidden_size; qq++) { int q = qq * 4; @@ -449,11 +512,19 @@ static int lstm_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const float32x4_t _H = vmulq_f32(_O, tanh_ps(_cell2)); vst1q_f32(cell_ptr + q, _cell2); - vst1q_f32(hidden_ptr + q, _H); - vst1_f16(output_data + q, vcvt_f16_f32(_H)); + + if (num_output == hidden_size) + { + vst1q_f32(hidden_ptr + q, _H); + vst1_f16(output_data + q, vcvt_f16_f32(_H)); + } + else + { + vst1q_f32(tmp_hidden_ptr + q, _H); + } } #pragma omp parallel for num_threads(opt.num_threads) - for (int q = remain_num_output_start; q < num_output; q++) + for (int q = remain_hidden_size_start; q < hidden_size; q++) { const __fp16* gates_data = gates.row(q); @@ -471,8 +542,43 @@ static int lstm_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const float H = O * tanh(cell2); cell_ptr[q] = cell2; - hidden_ptr[q] = H; - 
output_data[q] = (__fp16)H; + if (num_output == hidden_size) + { + hidden_ptr[q] = H; + output_data[q] = (__fp16)H; + } + else + { + tmp_hidden_ptr[q] = H; + } + } + + if (num_output != hidden_size) + { + // int nn_num_output = num_output >> 2; + // int remain_num_output_start = nn_num_output << 2; + // #pragma omp parallel for num_threads(opt.num_threads) + // for (int qq = 0; qq < nn_num_output; qq++) + // { + // int q = qq * 4; + // + // } + int remain_num_output_start = 0; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = remain_num_output_start; q < num_output; q++) + { + const float* hr = weight_hr.row(q); + const float* tmp_hidden_ptr = tmp_hidden_state; + + float H = 0; + for (int i = 0; i < hidden_size; i++) + { + H += tmp_hidden_ptr[i] * hr[i]; + } + + hidden_ptr[q] = H; + output_data[q] = (__fp16)H; + } } } @@ -483,19 +589,19 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) { // pack IFOG int num_directions = direction == 2 ? 2 : 1; - int size = weight_data_size / num_directions / num_output / 4; + int size = weight_data_size / num_directions / hidden_size / 4; if (opt.use_fp16_arithmetic) { - weight_xc_data_packed.create(size, num_output / 2 + num_output % 2, num_directions, 16u, 8); - bias_c_data_packed.create(num_output, 1, num_directions, 8u, 4); - weight_hc_data_packed.create(num_output, num_output / 2 + num_output % 2, num_directions, 16u, 8); + weight_xc_data_packed.create(size, hidden_size / 2 + hidden_size % 2, num_directions, 16u, 8); + bias_c_data_packed.create(hidden_size, 1, num_directions, 8u, 4); + weight_hc_data_packed.create(num_output, hidden_size / 2 + hidden_size % 2, num_directions, 16u, 8); } else { - weight_xc_data_packed.create(size, num_output, num_directions, 8u, 4); - bias_c_data_packed.create(num_output, 1, num_directions, 8u, 4); - weight_hc_data_packed.create(num_output, num_output, num_directions, 8u, 4); + weight_xc_data_packed.create(size, hidden_size, num_directions, 8u, 4); + bias_c_data_packed.create(hidden_size, 1, num_directions, 8u, 4); + weight_hc_data_packed.create(num_output, hidden_size, num_directions, 8u, 4); } #pragma omp parallel for num_threads(opt.num_threads) @@ -519,7 +625,7 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) if (opt.use_fp16_arithmetic) { int q = 0; - for (; q + 1 < num_output; q += 2) + for (; q + 1 < hidden_size; q += 2) { bias_c_IFOG[0] = (__fp16)bias_c_I[q]; bias_c_IFOG[1] = (__fp16)bias_c_F[q]; @@ -532,23 +638,23 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) bias_c_IFOG += 8; - const float* weight_xc_I = weight_xc.row(num_output * 0 + q); - const float* weight_xc_F = weight_xc.row(num_output * 1 + q); - const float* weight_xc_O = weight_xc.row(num_output * 2 + q); - const float* weight_xc_G = weight_xc.row(num_output * 3 + q); - const float* weight_xc_I_1 = weight_xc.row(num_output * 0 + q + 1); - const float* weight_xc_F_1 = weight_xc.row(num_output * 1 + q + 1); - const float* weight_xc_O_1 = weight_xc.row(num_output * 2 + q + 1); - const float* weight_xc_G_1 = weight_xc.row(num_output * 3 + q + 1); - - const float* weight_hc_I = weight_hc.row(num_output * 0 + q); - const float* weight_hc_F = weight_hc.row(num_output * 1 + q); - const float* weight_hc_O = weight_hc.row(num_output * 2 + q); - const float* weight_hc_G = weight_hc.row(num_output * 3 + q); - const float* weight_hc_I_1 = weight_hc.row(num_output * 0 + q + 1); - const float* weight_hc_F_1 = weight_hc.row(num_output * 1 + q + 1); - const float* weight_hc_O_1 = weight_hc.row(num_output * 2 + q + 
1); - const float* weight_hc_G_1 = weight_hc.row(num_output * 3 + q + 1); + const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q); + const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q); + const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q); + const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q); + const float* weight_xc_I_1 = weight_xc.row(hidden_size * 0 + q + 1); + const float* weight_xc_F_1 = weight_xc.row(hidden_size * 1 + q + 1); + const float* weight_xc_O_1 = weight_xc.row(hidden_size * 2 + q + 1); + const float* weight_xc_G_1 = weight_xc.row(hidden_size * 3 + q + 1); + + const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q); + const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q); + const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q); + const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q); + const float* weight_hc_I_1 = weight_hc.row(hidden_size * 0 + q + 1); + const float* weight_hc_F_1 = weight_hc.row(hidden_size * 1 + q + 1); + const float* weight_hc_O_1 = weight_hc.row(hidden_size * 2 + q + 1); + const float* weight_hc_G_1 = weight_hc.row(hidden_size * 3 + q + 1); __fp16* weight_xc_IFOG = weight_xc_data_packed_dr.row<__fp16>(q / 2); __fp16* weight_hc_IFOG = weight_hc_data_packed_dr.row<__fp16>(q / 2); @@ -581,7 +687,7 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) weight_hc_IFOG += 8; } } - for (; q < num_output; q++) + for (; q < hidden_size; q++) { bias_c_IFOG[0] = (__fp16)bias_c_I[q]; bias_c_IFOG[1] = (__fp16)bias_c_F[q]; @@ -590,15 +696,15 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) bias_c_IFOG += 4; - const float* weight_xc_I = weight_xc.row(num_output * 0 + q); - const float* weight_xc_F = weight_xc.row(num_output * 1 + q); - const float* weight_xc_O = weight_xc.row(num_output * 2 + q); - const float* weight_xc_G = weight_xc.row(num_output * 3 + q); + const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q); + const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q); + const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q); + const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q); - const float* weight_hc_I = weight_hc.row(num_output * 0 + q); - const float* weight_hc_F = weight_hc.row(num_output * 1 + q); - const float* weight_hc_O = weight_hc.row(num_output * 2 + q); - const float* weight_hc_G = weight_hc.row(num_output * 3 + q); + const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q); + const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q); + const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q); + const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q); __fp16* weight_xc_IFOG = weight_xc_data_packed_dr.row<__fp16>(q / 2 + q % 2); __fp16* weight_hc_IFOG = weight_hc_data_packed_dr.row<__fp16>(q / 2 + q % 2); @@ -626,7 +732,7 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) } else { - for (int q = 0; q < num_output; q++) + for (int q = 0; q < hidden_size; q++) { bias_c_IFOG[0] = (__fp16)bias_c_I[q]; bias_c_IFOG[1] = (__fp16)bias_c_F[q]; @@ -635,15 +741,15 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) bias_c_IFOG += 4; - const float* weight_xc_I = weight_xc.row(num_output * 0 + q); - const float* weight_xc_F = weight_xc.row(num_output * 1 + q); - const float* weight_xc_O = weight_xc.row(num_output * 2 + q); - const float* weight_xc_G = weight_xc.row(num_output * 3 + q); + const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q); + const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q); + 
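// One possible standalone sketch of the fp16 bias packing performed above: with fp16
// arithmetic enabled, the I/F/O/G biases of rows q and q+1 are interleaved so a single
// 8-lane half-precision load feeds both rows, and the scalar tail keeps the plain 4-wide
// layout. The exact element order for the second row is assumed here, and the code
// assumes an ARM toolchain that provides __fp16.
static void pack_bias_ifog_fp16(const float* bI, const float* bF,
                                const float* bO, const float* bG,
                                int hidden_size, __fp16* out)
{
    int q = 0;
    for (; q + 1 < hidden_size; q += 2)
    {
        out[0] = (__fp16)bI[q];     out[1] = (__fp16)bF[q];
        out[2] = (__fp16)bO[q];     out[3] = (__fp16)bG[q];
        out[4] = (__fp16)bI[q + 1]; out[5] = (__fp16)bF[q + 1];
        out[6] = (__fp16)bO[q + 1]; out[7] = (__fp16)bG[q + 1];
        out += 8;
    }
    for (; q < hidden_size; q++)
    {
        out[0] = (__fp16)bI[q]; out[1] = (__fp16)bF[q];
        out[2] = (__fp16)bO[q]; out[3] = (__fp16)bG[q];
        out += 4;
    }
}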
const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q); + const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q); - const float* weight_hc_I = weight_hc.row(num_output * 0 + q); - const float* weight_hc_F = weight_hc.row(num_output * 1 + q); - const float* weight_hc_O = weight_hc.row(num_output * 2 + q); - const float* weight_hc_G = weight_hc.row(num_output * 3 + q); + const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q); + const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q); + const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q); + const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q); __fp16* weight_xc_IFOG = weight_xc_data_packed_dr.row<__fp16>(q); __fp16* weight_hc_IFOG = weight_hc_data_packed_dr.row<__fp16>(q); @@ -671,6 +777,13 @@ int LSTM_arm::create_pipeline_fp16s(const Option& opt) } } + if (opt.lightmode) + { + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); + } + return 0; } @@ -686,7 +799,7 @@ int LSTM_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& return -100; hidden.fill(0.f); - Mat cell(num_output, 4u, opt.workspace_allocator); + Mat cell(hidden_size, 4u, opt.workspace_allocator); if (cell.empty()) return -100; cell.fill(0.f); @@ -698,7 +811,7 @@ int LSTM_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& // Uni directional if (direction == 0 || direction == 1) { - int ret = lstm_fp16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, cell, opt); + int ret = lstm_fp16s(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret != 0) return ret; } @@ -713,14 +826,14 @@ int LSTM_arm::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& if (top_blob_reverse.empty()) return -100; - int ret0 = lstm_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), hidden, cell, opt); + int ret0 = lstm_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret0 != 0) return ret0; hidden.fill(0.f); cell.fill(0.f); - int ret1 = lstm_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), hidden, cell, opt); + int ret1 = lstm_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden, cell, opt); if (ret1 != 0) return ret1; @@ -762,7 +875,7 @@ int LSTM_arm::forward_fp16s(const std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::vector& bottom_blobs, std::v { const Mat& q_blob = bottom_blobs[0]; const Mat& k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1]; - const Mat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[2]; + const Mat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs.size() == 2 ? 
k_blob : bottom_blobs[2]; - size_t elemsize = q_blob.elemsize; - int elempack = q_blob.elempack; + size_t src_elemsize = q_blob.elemsize; + int src_elempack = q_blob.elempack; + size_t dst_elemsize = k_blob.elemsize; + int dst_elempack = k_blob.elempack; - const int seqlen = q_blob.h; + const int src_seqlen = q_blob.h; + const int dst_seqlen = k_blob.h; const int embed_dim_per_head = embed_dim / num_head; const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); #if __ARM_NEON - if (elempack == 4) + if (src_elempack == 4) { Mat& top_blob = top_blobs[0]; - top_blob.create(embed_dim, seqlen, elemsize, elempack, opt.blob_allocator); + top_blob.create(embed_dim, src_seqlen, src_elemsize, src_elempack, opt.blob_allocator); if (top_blob.empty()) return -1; - Mat xq(embed_dim_per_head, seqlen, num_head, elemsize, elempack, opt.workspace_allocator); - Mat xk(embed_dim_per_head, seqlen, num_head, elemsize, elempack, opt.workspace_allocator); - Mat xv(seqlen, embed_dim_per_head, num_head, elemsize, elempack, opt.workspace_allocator); + Mat xq(embed_dim_per_head, src_seqlen, num_head, src_elemsize, src_elempack, opt.workspace_allocator); + Mat xk(embed_dim_per_head, dst_seqlen, num_head, dst_elemsize, dst_elempack, opt.workspace_allocator); + Mat xv(dst_seqlen, embed_dim_per_head, num_head, dst_elemsize, dst_elempack, opt.workspace_allocator); - Mat xqk(seqlen * elempack, seqlen, num_head, elemsize, elempack, opt.workspace_allocator); + Mat xqk(dst_seqlen * dst_elempack, src_seqlen, num_head, src_elemsize, src_elempack, opt.workspace_allocator); - Mat xqkv(embed_dim_per_head, num_head, seqlen, elemsize, elempack, opt.workspace_allocator); + Mat xqkv(embed_dim_per_head, num_head, src_seqlen, src_elemsize, src_elempack, opt.workspace_allocator); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < num_head; q++) @@ -67,7 +70,7 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v { Mat outm = xq.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* outptr = outm.row(i); @@ -99,27 +102,43 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v { Mat outm = xk.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < dst_seqlen; i++) { float* outptr = outm.row(i); for (int j = 0; j < embed_dim_per_head; j++) { const float* ptr = k_blob.row(i); - const float* kptr = (const float*)k_weight_data + embed_dim * (q * embed_dim_per_head + j); + const float* kptr = (const float*)k_weight_data + kdim * (q * embed_dim_per_head + j); - float32x4_t _sum = vdupq_n_f32(k_bias_data[q * embed_dim_per_head + j]); - for (int k = 0; k < embed_dim; k++) + if (dst_elempack == 4) { - float32x4_t _val = vld1q_f32(ptr); - float32x4_t _k = vdupq_n_f32(kptr[0]); - _sum = vmlaq_f32(_sum, _val, _k); - ptr += 4; - kptr += 1; + float32x4_t _sum = vdupq_n_f32(k_bias_data[q * embed_dim_per_head + j]); + for (int k = 0; k < kdim; k++) + { + float32x4_t _val = vld1q_f32(ptr); + float32x4_t _k = vdupq_n_f32(kptr[0]); + _sum = vmlaq_f32(_sum, _val, _k); + ptr += 4; + kptr += 1; + } + + vst1q_f32(outptr, _sum); + outptr += 4; + } + if (dst_elempack == 1) + { + float sum = k_bias_data[q * embed_dim_per_head + j]; + for (int k = 0; k < kdim; k++) + { + sum += ptr[0] * kptr[0]; + ptr += 1; + kptr += 1; + } + + outptr[0] = sum; + outptr += 1; } - - vst1q_f32(outptr, _sum); - outptr += 4; } } } @@ -132,30 +151,46 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v { float* outptr = 
outm.row(i); - for (int j = 0; j < seqlen; j++) + for (int j = 0; j < dst_seqlen; j++) { const float* ptr = v_blob.row(j); - const float* kptr = (const float*)v_weight_data + embed_dim * (q * embed_dim_per_head + i); + const float* kptr = (const float*)v_weight_data + vdim * (q * embed_dim_per_head + i); - float32x4_t _sum = vdupq_n_f32(v_bias_data[q * embed_dim_per_head + i]); - for (int k = 0; k < embed_dim; k++) + if (dst_elempack == 4) { - float32x4_t _val = vld1q_f32(ptr); - float32x4_t _k = vdupq_n_f32(kptr[0]); - _sum = vmlaq_f32(_sum, _val, _k); - ptr += 4; - kptr += 1; + float32x4_t _sum = vdupq_n_f32(v_bias_data[q * embed_dim_per_head + i]); + for (int k = 0; k < vdim; k++) + { + float32x4_t _val = vld1q_f32(ptr); + float32x4_t _k = vdupq_n_f32(kptr[0]); + _sum = vmlaq_f32(_sum, _val, _k); + ptr += 4; + kptr += 1; + } + + vst1q_f32(outptr, _sum); + outptr += 4; + } + if (dst_elempack == 1) + { + float sum = v_bias_data[q * embed_dim_per_head + i]; + for (int k = 0; k < vdim; k++) + { + sum += ptr[0] * kptr[0]; + ptr += 1; + kptr += 1; + } + + outptr[0] = sum; + outptr += 1; } - - vst1q_f32(outptr, _sum); - outptr += 4; } } } // xqk = xq * xk - // xq (embed_dim_per_head, seqlen) - // xk (embed_dim_per_head, seqlen) + // xq (embed_dim_per_head, src_seqlen) + // xk (embed_dim_per_head, dst_seqlen) { const Mat xqm = xq.channel(q); const Mat xkm = xk.channel(q); @@ -165,11 +200,11 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v Mat upxkm; convert_packing(xkm, upxkm, 1); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* outptr = outm.row(i); - for (int j = 0; j < seqlen * elempack; j++) + for (int j = 0; j < dst_seqlen * dst_elempack; j++) { const float* qptr = xqm.row(i); const float* kptr = upxkm.row(j); @@ -193,19 +228,19 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v // softmax(xqk) { Mat outm = xqk.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* ptr = outm.row(i); float32x4_t _max = vdupq_n_f32(-FLT_MAX); - for (int j = 0; j < seqlen * elempack; j++) + for (int j = 0; j < dst_seqlen * dst_elempack; j++) { float32x4_t _p = vld1q_f32(ptr + j * 4); _max = vmaxq_f32(_max, _p); } float32x4_t _sum = vdupq_n_f32(0.f); - for (int j = 0; j < seqlen * elempack; j++) + for (int j = 0; j < dst_seqlen * dst_elempack; j++) { float32x4_t _p = vld1q_f32(ptr + j * 4); _p = exp_ps(vsubq_f32(_p, _max)); @@ -213,7 +248,7 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v _sum = vaddq_f32(_sum, _p); } - for (int j = 0; j < seqlen * elempack; j++) + for (int j = 0; j < dst_seqlen * dst_elempack; j++) { float32x4_t _p = vld1q_f32(ptr + j * 4); #if __aarch64__ @@ -227,14 +262,14 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v } // xqkv = xqk * xv - // xqk (seqlen, seqlen) - // xv (seqlen, embed_dim_per_head) - // out (embed_dim_per_head, num_head, seqlen) + // xqk (dst_seqlen, src_seqlen) + // xv (dst_seqlen, embed_dim_per_head) + // out (embed_dim_per_head, num_head, src_seqlen) { const Mat xqkm = xqk.channel(q); const Mat xvm = xv.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* outptr = xqkv.channel(i).row(q); @@ -244,7 +279,7 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v const float* vptr = xvm.row(j); float32x4_t _sum = vdupq_n_f32(0.f); - for (int k = 0; k < seqlen * elempack; k++) + for (int k = 0; k < dst_seqlen * 
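// A hypothetical plain-C++ reference for the cross-attention shape handling introduced
// above: queries come from a sequence of length src_seqlen, keys and values from one of
// length dst_seqlen, so the score matrix is src_seqlen x dst_seqlen and softmax runs
// over the dst axis. Single head, no packing; names are illustrative, not the ncnn code.
#include <float.h>
#include <math.h>
#include <vector>

static void attention_one_head_ref(const float* xq, const float* xk, const float* xv,
                                   int src_seqlen, int dst_seqlen, int embed_dim_per_head,
                                   float* out) // out: src_seqlen x embed_dim_per_head
{
    const float scale = 1.f / sqrtf((float)embed_dim_per_head);

    std::vector<float> score(dst_seqlen);
    for (int i = 0; i < src_seqlen; i++)
    {
        // scaled dot-product scores against every key position
        float max = -FLT_MAX;
        for (int j = 0; j < dst_seqlen; j++)
        {
            float s = 0.f;
            for (int k = 0; k < embed_dim_per_head; k++)
                s += xq[i * embed_dim_per_head + k] * xk[j * embed_dim_per_head + k];
            score[j] = s * scale;
            if (score[j] > max) max = score[j];
        }

        // softmax over the key/value sequence
        float sum = 0.f;
        for (int j = 0; j < dst_seqlen; j++)
        {
            score[j] = expf(score[j] - max);
            sum += score[j];
        }

        // weighted sum of values
        for (int k = 0; k < embed_dim_per_head; k++)
        {
            float v = 0.f;
            for (int j = 0; j < dst_seqlen; j++)
                v += (score[j] / sum) * xv[j * embed_dim_per_head + k];
            out[i * embed_dim_per_head + k] = v;
        }
    }
}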
dst_elempack; k++) { float32x4_t _qk = vld1q_f32(qkptr); float32x4_t _v = vdupq_n_f32(vptr[0]); @@ -261,9 +296,9 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v } // out = affine(xqkv) - // xqkv (embed_dim, seqlen) + // xqkv (embed_dim, src_seqlen) #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* outptr = top_blob.row(i); @@ -292,7 +327,14 @@ int MultiHeadAttention_arm::forward(const std::vector& bottom_blobs, std::v #endif // __ARM_NEON // fallback to native implement - return MultiHeadAttention::forward(bottom_blobs, top_blobs, opt); + std::vector bottom_blobs_unpacked = bottom_blobs; + if (dst_elempack == 4) + { + convert_packing(bottom_blobs[1], bottom_blobs_unpacked[1], 1, opt); + if (bottom_blobs.size() == 3) + convert_packing(bottom_blobs[2], bottom_blobs_unpacked[2], 1, opt); + } + return MultiHeadAttention::forward(bottom_blobs_unpacked, top_blobs, opt); } } // namespace ncnn diff --git a/src/layer/arm/neon_mathfun_fp16s.h b/src/layer/arm/neon_mathfun_fp16s.h index f1759f5188ff..074681809bfc 100644 --- a/src/layer/arm/neon_mathfun_fp16s.h +++ b/src/layer/arm/neon_mathfun_fp16s.h @@ -89,9 +89,9 @@ static inline float16x4_t log_ps(float16x4_t x) * } else { x = x - 1.0; } */ uint16x4_t mask = vclt_f16(x, vdup_n_f16(c_cephes_SQRTHF)); - float16x4_t tmp = vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(x), mask)); + float16x4_t tmp = (float16x4_t)(vand_u16((uint16x4_t)(x), mask)); x = vsub_f16(x, one); - e = vsub_f16(e, vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(one), mask))); + e = vsub_f16(e, (float16x4_t)(vand_u16((uint16x4_t)(one), mask))); x = vadd_f16(x, tmp); float16x4_t z = vmul_f16(x, x); @@ -115,7 +115,7 @@ static inline float16x4_t log_ps(float16x4_t x) x = vadd_f16(x, y); x = vfma_f16(x, e, vdup_n_f16(c_cephes_log_q2)); - x = vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(x), invalid_mask)); // negative arg will be NAN + x = (float16x4_t)(vorr_u16((uint16x4_t)(x), invalid_mask)); // negative arg will be NAN return x; } @@ -208,9 +208,9 @@ static inline float16x4_t exp_ps(float16x4_t x) /* if greater, substract 1 */ uint16x4_t mask = vcgt_f16(tmp, fx); - mask = vand_u16(mask, vreinterpret_u16_f16(one)); + mask = vand_u16(mask, (uint16x4_t)(one)); - fx = vsub_f16(tmp, vreinterpret_f16_u16(mask)); + fx = vsub_f16(tmp, (float16x4_t)(mask)); tmp = vmul_f16(fx, vdup_n_f16(c_cephes_exp_C1)); float16x4_t z = vmul_f16(fx, vdup_n_f16(c_cephes_exp_C2)); @@ -489,7 +489,7 @@ static inline float16x4_t tanh_ps(float16x4_t x) // clamp the inputs to the range [-9, 9] since anything outside // this range is -/+1.0f in single-precision. - x2 = vreinterpret_f16_u16(vbsl_u16(vcge_f16(vdup_n_f16(c_tanh_hi), x2), vreinterpret_u16_f16(x2), vreinterpret_u16_f16(vdup_n_f16(c_tanh_hi)))); + x2 = (float16x4_t)(vbsl_u16(vcge_f16(vdup_n_f16(c_tanh_hi), x2), (uint16x4_t)(x2), (uint16x4_t)(vdup_n_f16(c_tanh_hi)))); // since the polynomials are odd/even, we need x**2. float16x4_t z = vmul_f16(x2, x2); @@ -514,10 +514,10 @@ static inline float16x4_t tanh_ps(float16x4_t x) y = vdiv_f16(y, w); // reinstate the sign. - y = vreinterpret_f16_u16(vbsl_u16(vdup_n_u16(1u << 15), vreinterpret_u16_f16(x), vreinterpret_u16_f16(y))); + y = (float16x4_t)(vbsl_u16(vdup_n_u16(1u << 15), (uint16x4_t)(x), (uint16x4_t)(y))); // when the argument is very small in magnitude it's more accurate to just return it. 
- y = vreinterpret_f16_u16(vbsl_u16(tiny_mask, vreinterpret_u16_f16(y), vreinterpret_u16_f16(x))); + y = (float16x4_t)(vbsl_u16(tiny_mask, (uint16x4_t)(y), (uint16x4_t)(x))); return y; } diff --git a/src/layer/arm/pooling_3x3.h b/src/layer/arm/pooling_3x3.h index 5c0b281a379d..f105aeea7567 100644 --- a/src/layer/arm/pooling_3x3.h +++ b/src/layer/arm/pooling_3x3.h @@ -69,7 +69,7 @@ static void pooling3x3s2_max_neon(const Mat& bottom_blob, Mat& top_blob, const O "prfm pldl1keep, [%3, #256] \n" "ld2 {v10.4s, v11.4s}, [%3], #32 \n" - "ext v2.16b, v2.16b, v8.16b, #4 \n" + "ext v2.16b, v2.16b, v8.16b, #4 \n" "fmax v12.4s, v12.4s, v0.4s \n" "ext v4.16b, v4.16b, v10.16b, #4 \n" diff --git a/src/layer/arm/softmax_arm.cpp b/src/layer/arm/softmax_arm.cpp index c00e3d4414e5..77a0e696479f 100644 --- a/src/layer/arm/softmax_arm.cpp +++ b/src/layer/arm/softmax_arm.cpp @@ -76,15 +76,12 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const _sum = vaddq_f32(_sum, vrev64q_f32(_sum)); _sum = vaddq_f32(_sum, vextq_f32(_sum, _sum, 2)); #endif - + float32x4_t _reciprocal_sum = vrecpeq_f32(_sum); + _reciprocal_sum = vmulq_f32(vrecpsq_f32(_sum, _reciprocal_sum), _reciprocal_sum); for (int i = 0; i < w; i++) { float32x4_t _p = vld1q_f32(ptr + i * 4); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else - _p = div_ps(_p, _sum); -#endif + _p = vmulq_f32(_p, _reciprocal_sum); vst1q_f32(ptr + i * 4, _p); } } @@ -152,11 +149,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { float32x4_t _p = vld1q_f32(ptr); float32x4_t _sum = vdupq_n_f32(sum[j]); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else _p = div_ps(_p, _sum); -#endif vst1q_f32(ptr, _p); ptr += 4; } @@ -189,14 +182,12 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const _sum = vaddq_f32(_sum, _p); } + float32x4_t _reciprocal_sum = vrecpeq_f32(_sum); + _reciprocal_sum = vmulq_f32(vrecpsq_f32(_sum, _reciprocal_sum), _reciprocal_sum); for (int j = 0; j < w; j++) { float32x4_t _p = vld1q_f32(ptr + j * 4); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else - _p = div_ps(_p, _sum); -#endif + _p = vmulq_f32(_p, _reciprocal_sum); vst1q_f32(ptr + j * 4, _p); } } @@ -269,11 +260,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { float32x4_t _p = vld1q_f32(ptr); float32x4_t _sum = vdupq_n_f32(sum[i]); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else _p = div_ps(_p, _sum); -#endif vst1q_f32(ptr, _p); ptr += 4; } @@ -356,11 +343,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { float32x4_t _p = vld1q_f32(ptr); float32x4_t _sum = vld1q_f32(sumptr); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else _p = div_ps(_p, _sum); -#endif vst1q_f32(ptr, _p); ptr += 4; sumptr += 4; @@ -398,14 +381,12 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const _sum = vaddq_f32(_sum, _p); } + float32x4_t _reciprocal_sum = vrecpeq_f32(_sum); + _reciprocal_sum = vmulq_f32(vrecpsq_f32(_sum, _reciprocal_sum), _reciprocal_sum); for (int j = 0; j < w; j++) { float32x4_t _p = vld1q_f32(ptr + j * 4); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else - _p = div_ps(_p, _sum); -#endif + _p = vmulq_f32(_p, _reciprocal_sum); vst1q_f32(ptr + j * 4, _p); } @@ -480,14 +461,12 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int i = 0; #if __ARM_NEON float32x4_t _sum = vdupq_n_f32(sum); + float32x4_t _reciprocal_sum = vrecpeq_f32(_sum); + _reciprocal_sum = 
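// A standalone sketch of the reciprocal trick used in the softmax hunks above:
// vrecpeq_f32 produces a rough estimate of 1/x and one vrecpsq_f32 Newton-Raphson step
// refines it, so the per-element division by the sum becomes a multiply. Slightly less
// accurate than vdivq_f32, but it also works on 32-bit NEON where vdivq_f32 is
// unavailable.
#include <arm_neon.h>

static inline float32x4_t reciprocal_approx(float32x4_t x)
{
    float32x4_t r = vrecpeq_f32(x);      // rough initial estimate of 1/x
    r = vmulq_f32(vrecpsq_f32(x, r), r); // one Newton-Raphson refinement
    return r;
}

// usage inside a normalization loop:
//   float32x4_t _inv_sum = reciprocal_approx(_sum);
//   _p = vmulq_f32(_p, _inv_sum);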
vmulq_f32(vrecpsq_f32(_sum, _reciprocal_sum), _reciprocal_sum); for (; i + 3 < w; i += 4) { float32x4_t _p = vld1q_f32(ptr + i); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else - _p = div_ps(_p, _sum); -#endif + _p = vmulq_f32(_p, _reciprocal_sum); vst1q_f32(ptr + i, _p); } #endif // __ARM_NEON @@ -587,11 +566,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { float32x4_t _p = vld1q_f32(ptr); float32x4_t _sum = vld1q_f32(psum); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else _p = div_ps(_p, _sum); -#endif vst1q_f32(ptr, _p); ptr += 4; @@ -674,14 +649,12 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int j = 0; #if __ARM_NEON float32x4_t _sum = vdupq_n_f32(sum); + float32x4_t _reciprocal_sum = vrecpeq_f32(_sum); + _reciprocal_sum = vmulq_f32(vrecpsq_f32(_sum, _reciprocal_sum), _reciprocal_sum); for (; j + 3 < w; j += 4) { float32x4_t _p = vld1q_f32(ptr + j); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else - _p = div_ps(_p, _sum); -#endif + _p = vmulq_f32(_p, _reciprocal_sum); vst1q_f32(ptr + j, _p); } #endif // __ARM_NEON @@ -790,11 +763,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { float32x4_t _p = vld1q_f32(ptr); float32x4_t _sum = vld1q_f32(sumptr); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else _p = div_ps(_p, _sum); -#endif // __aarch64__ vst1q_f32(ptr, _p); ptr += 4; @@ -902,11 +871,7 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { float32x4_t _p = vld1q_f32(ptr + j); float32x4_t _sum = vld1q_f32(sumptr + j); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else _p = div_ps(_p, _sum); -#endif vst1q_f32(ptr + j, _p); } #endif // __ARM_NEON @@ -989,14 +954,12 @@ int Softmax_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int j = 0; #if __ARM_NEON float32x4_t _sum = vdupq_n_f32(sum); + float32x4_t _reciprocal_sum = vrecpeq_f32(_sum); + _reciprocal_sum = vmulq_f32(vrecpsq_f32(_sum, _reciprocal_sum), _reciprocal_sum); for (; j + 3 < w; j += 4) { float32x4_t _p = vld1q_f32(ptr + j); -#if __aarch64__ - _p = vdivq_f32(_p, _sum); -#else - _p = div_ps(_p, _sum); -#endif + _p = vmulq_f32(_p, _reciprocal_sum); vst1q_f32(ptr + j, _p); } #endif // __ARM_NEON diff --git a/src/layer/expanddims.cpp b/src/layer/expanddims.cpp index 4b253d7e1c42..473a3b71b373 100644 --- a/src/layer/expanddims.cpp +++ b/src/layer/expanddims.cpp @@ -26,6 +26,7 @@ int ExpandDims::load_param(const ParamDict& pd) { expand_w = pd.get(0, 0); expand_h = pd.get(1, 0); + expand_d = pd.get(11, 0); expand_c = pd.get(2, 0); axes = pd.get(3, Mat()); @@ -36,16 +37,19 @@ int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt { int w = bottom_blob.w; int h = bottom_blob.h; + int channels = bottom_blob.c; int dims = bottom_blob.dims; bool _expand_w = false; bool _expand_h = false; + bool _expand_d = false; bool _expand_c = false; if (axes.empty()) { _expand_w = expand_w; _expand_h = expand_h; + _expand_d = expand_d; _expand_c = expand_c; } else @@ -77,6 +81,22 @@ int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt { _expand_w = true; } + if (dims == 3 && axis == 0) + { + _expand_c = true; + } + if (dims == 3 && axis == 1) + { + _expand_d = true; + } + if (dims == 3 && axis == 2) + { + _expand_h = true; + } + if (dims == 3 && axis == 3) + { + _expand_w = true; + } } } @@ -114,6 +134,26 @@ int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt } } + if (dims == 3) + { + if 
(_expand_w) + { + top_blob = bottom_blob.reshape(1, w, h, channels, opt.blob_allocator); + } + else if (_expand_h) + { + top_blob = bottom_blob.reshape(w, 1, h, channels, opt.blob_allocator); + } + else if (_expand_d) + { + top_blob = bottom_blob.reshape(w, h, 1, channels, opt.blob_allocator); + } + else if (_expand_c) + { + top_blob = bottom_blob.reshape(w, h, channels, 1, opt.blob_allocator); + } + } + if (top_blob.empty()) return -100; diff --git a/src/layer/expanddims.h b/src/layer/expanddims.h index 71933149eaf3..4c8c990f7561 100644 --- a/src/layer/expanddims.h +++ b/src/layer/expanddims.h @@ -31,6 +31,7 @@ class ExpandDims : public Layer public: int expand_w; int expand_h; + int expand_d; int expand_c; Mat axes; }; diff --git a/src/layer/fold.cpp b/src/layer/fold.cpp new file mode 100644 index 000000000000..c14f01fbb722 --- /dev/null +++ b/src/layer/fold.cpp @@ -0,0 +1,124 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "fold.h" + +namespace ncnn { + +Fold::Fold() +{ + one_blob_only = true; +} + +int Fold::load_param(const ParamDict& pd) +{ + kernel_w = pd.get(1, 0); + kernel_h = pd.get(11, kernel_w); + dilation_w = pd.get(2, 1); + dilation_h = pd.get(12, dilation_w); + stride_w = pd.get(3, 1); + stride_h = pd.get(13, stride_w); + pad_left = pd.get(4, 0); + pad_right = pd.get(15, pad_left); + pad_top = pd.get(14, pad_left); + pad_bottom = pd.get(16, pad_top); + output_w = pd.get(20, 0); + output_h = pd.get(21, output_w); + + return 0; +} + +int Fold::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int size = bottom_blob.w; + const int max_channels = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int outw = output_w + pad_left + pad_right; + const int outh = output_h + pad_top + pad_bottom; + + const int inw = (outw - kernel_extent_w) / stride_w + 1; + const int inh = (outh - kernel_extent_h) / stride_h + 1; + + // assert inw * inh == size + + const int maxk = kernel_w * kernel_h; + const int channels = max_channels / maxk; + + Mat top_blob_bordered; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) + { + top_blob_bordered.create(outw, outh, channels, elemsize, opt.workspace_allocator); + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, channels, elemsize, opt.blob_allocator); + } + if (top_blob_bordered.empty()) + return -100; + + // col2im + const int gap = outw * stride_h - inw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* sptr = bottom_blob.row(p * maxk); + Mat outm = top_blob_bordered.channel(p); + + outm.fill(0.f); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; 
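// A hypothetical helper mirroring the Fold shape bookkeeping above. For example, with
// output_w = output_h = 4, a 3x3 kernel, stride 1, dilation 1 and no padding:
// kernel_extent = 3, inw = inh = (4 - 3) / 1 + 1 = 2, so the unfolded input must have
// w = inw * inh = 4 and h = channels * maxk = channels * 9.
static int fold_input_extent(int out_extent, int pad0, int pad1,
                             int kernel, int dilation, int stride)
{
    const int kernel_extent = dilation * (kernel - 1) + 1;
    return (out_extent + pad0 + pad1 - kernel_extent) / stride + 1;
}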
v++) + { + float* ptr = outm.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < inh; i++) + { + for (int j = 0; j < inw; j++) + { + ptr[0] += sptr[0]; + + ptr += stride_w; + sptr += 1; + } + + ptr += gap; + } + } + } + } + + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) + { + Option opt_b = opt; + opt_b.use_packing_layout = false; + copy_cut_border(top_blob_bordered, top_blob, pad_top, pad_bottom, pad_left, pad_right, opt_b); + if (top_blob.empty()) + return -100; + } + else + { + top_blob = top_blob_bordered; + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/fold.h b/src/layer/fold.h new file mode 100644 index 000000000000..e6eccad556a4 --- /dev/null +++ b/src/layer/fold.h @@ -0,0 +1,48 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_FOLD_H +#define LAYER_FOLD_H + +#include "layer.h" + +namespace ncnn { + +class Fold : public Layer +{ +public: + Fold(); + + virtual int load_param(const ParamDict& pd); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +public: + int kernel_w; + int kernel_h; + int dilation_w; + int dilation_h; + int stride_w; + int stride_h; + int pad_left; // -233=SAME_UPPER -234=SAME_LOWER + int pad_right; + int pad_top; + int pad_bottom; + int output_w; + int output_h; +}; + +} // namespace ncnn + +#endif // LAYER_FOLD_H diff --git a/src/layer/glu.cpp b/src/layer/glu.cpp index 245341a53957..bf99a4cd1ea3 100644 --- a/src/layer/glu.cpp +++ b/src/layer/glu.cpp @@ -18,187 +18,203 @@ namespace ncnn { -GLU::GLU() { - one_blob_only = true; - support_inplace = false; +GLU::GLU() +{ + one_blob_only = true; + support_inplace = false; } -int GLU::load_param(const ParamDict &pd) { - axis = pd.get(0, 0); +int GLU::load_param(const ParamDict& pd) +{ + axis = pd.get(0, 0); - return 0; + return 0; } -int GLU::forward(const Mat &bottom_blob, Mat &top_blob, - const Option &opt) const { - int dims = bottom_blob.dims; - int positive_axis = axis < 0 ? dims + axis : axis; +int GLU::forward(const Mat& bottom_blob, Mat& top_blob, + const Option& opt) const +{ + int dims = bottom_blob.dims; + int positive_axis = axis < 0 ? 
dims + axis : axis; - if (dims == 1) { // ignore axis - int w = bottom_blob.w; - int out_w = w / 2; - top_blob.create(out_w, sizeof(float), opt.blob_allocator); + if (dims == 1) + { // ignore axis + int w = bottom_blob.w; + int out_w = w / 2; + top_blob.create(out_w, sizeof(float), opt.blob_allocator); - const float *in_ptr = bottom_blob; - float *out_ptr = top_blob; + const float* in_ptr = bottom_blob; + float* out_ptr = top_blob; -#pragma omp parallel for num_threads(opt.num_threads) - for (int x = 0; x < out_w; ++x) { - float sigmoid = - static_cast(1.f / (1.f + expf(-in_ptr[x + out_w]))); + #pragma omp parallel for num_threads(opt.num_threads) + for (int x = 0; x < out_w; ++x) + { + float sigmoid = static_cast(1.f / (1.f + expf(-in_ptr[x + out_w]))); - out_ptr[x] = in_ptr[x] * sigmoid; - } + out_ptr[x] = in_ptr[x] * sigmoid; + } - return 0; - } // if (dims == 1) + return 0; + } // if (dims == 1) - if (dims == 2 && positive_axis == 0) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int out_w = w; - int out_h = h / 2; - top_blob.create(out_w, out_h, sizeof(float), opt.blob_allocator); + if (dims == 2 && positive_axis == 0) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_w = w; + int out_h = h / 2; + top_blob.create(out_w, out_h, sizeof(float), opt.blob_allocator); - int offset = out_w * out_h; + int offset = out_w * out_h; #if 0 -#pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < out_h; ++y) { - const float *in_ptr = bottom_blob.row(y); - float *out_ptr = top_blob.row(y); - - for (int x = 0; x < w; ++x) { - float sigmoid = - static_cast(1.f / (1.f + exp(-in_ptr[x + offset]))); - - out_ptr[x] = in_ptr[x] * sigmoid; - } - } + // this one is equivalent to the else branch. It is more readable + // but less efficient + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < out_h; ++y) { + const float *in_ptr = bottom_blob.row(y); + float *out_ptr = top_blob.row(y); + + for (int x = 0; x < w; ++x) { + float sigmoid = + static_cast(1.f / (1.f + exp(-in_ptr[x + offset]))); + + out_ptr[x] = in_ptr[x] * sigmoid; + } + } #else - int size = offset; - const float *in_ptr = bottom_blob; - float *out_ptr = top_blob; - -#pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < size; ++i) { - float sigmoid = - static_cast(1.f / (1.f + exp(-in_ptr[i + offset]))); - out_ptr[i] = in_ptr[i] * sigmoid; - } + int size = offset; + const float* in_ptr = bottom_blob; + float* out_ptr = top_blob; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < size; ++i) + { + float sigmoid = static_cast(1.f / (1.f + exp(-in_ptr[i + offset]))); + out_ptr[i] = in_ptr[i] * sigmoid; + } #endif - return 0; - } // if (dims == 2 && positive_axis == 0) - - if (dims == 2 && positive_axis == 1) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int out_w = w / 2; - int out_h = h; - - top_blob.create(out_w, out_h, sizeof(float), opt.blob_allocator); - -#pragma omp parallel for num_threads(opt.num_threads) - for (int y = 0; y < h; ++y) { - const float *in_ptr = bottom_blob.row(y); - float *out_ptr = top_blob.row(y); - - for (int x = 0; x < out_w; ++x) { - float sigmoid = - static_cast(1.f / (1.f + exp(-in_ptr[x + out_w]))); - out_ptr[x] = in_ptr[x] * sigmoid; - } - } - - return 0; - } // if (dims == 2 && positive_axis == 1) - - if (dims == 3 && positive_axis == 0) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int c = bottom_blob.c; - - int out_w = w; - int out_h = h; - int out_c = c / 2; - - top_blob.create(out_w, 
out_h, out_c, sizeof(float), opt.blob_allocator); - - int offset = out_c * bottom_blob.cstep; - int size = w * h; + return 0; + } // if (dims == 2 && positive_axis == 0) + + if (dims == 2 && positive_axis == 1) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_w = w / 2; + int out_h = h; + + top_blob.create(out_w, out_h, sizeof(float), opt.blob_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; ++y) + { + const float* in_ptr = bottom_blob.row(y); + float* out_ptr = top_blob.row(y); + + for (int x = 0; x < out_w; ++x) + { + float sigmoid = static_cast(1.f / (1.f + exp(-in_ptr[x + out_w]))); + out_ptr[x] = in_ptr[x] * sigmoid; + } + } -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < out_c; ++q) { - const float *in_ptr = bottom_blob.channel(q); - float *out_ptr = top_blob.channel(q); + return 0; + } // if (dims == 2 && positive_axis == 1) - for (int i = 0; i < size; ++i) { - float sigmoid = - static_cast(1.f / (1.f + exp(-in_ptr[i + offset]))); - out_ptr[i] = in_ptr[i] * sigmoid; - } - } - return 0; - } // if (dims == 3 && positive_axis == 0) { + if (dims == 3 && positive_axis == 0) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int c = bottom_blob.c; - if (dims == 3 && positive_axis == 1) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int c = bottom_blob.c; + int out_w = w; + int out_h = h; + int out_c = c / 2; - int out_w = w; - int out_h = h / 2; - int out_c = c; + top_blob.create(out_w, out_h, out_c, sizeof(float), opt.blob_allocator); - top_blob.create(out_w, out_h, out_c, sizeof(float), opt.blob_allocator); + int offset = out_c * bottom_blob.cstep; + int size = w * h; - int offset = out_h * out_w; - int size = offset; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < out_c; ++q) + { + const float* in_ptr = bottom_blob.channel(q); + float* out_ptr = top_blob.channel(q); -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < c; ++q) { - const float *in_ptr = bottom_blob.channel(q); - float *out_ptr = top_blob.channel(q); - - for (int i = 0; i < size; ++i) { - float sigmoid = - static_cast(1.f / (1.f + exp(-in_ptr[i + offset]))); - out_ptr[i] = in_ptr[i] * sigmoid; - } - } - return 0; - } // if (dims == 3 && positive_axis == 1) - - if (dims == 3 && positive_axis == 2) { - int w = bottom_blob.w; - int h = bottom_blob.h; - int c = bottom_blob.c; - - int out_w = w / 2; - int out_h = h; - int out_c = c; - - top_blob.create(out_w, out_h, out_c, sizeof(float), opt.blob_allocator); - -#pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < c; ++q) { - const float *in_ptr = bottom_blob.channel(q); - float *out_ptr = top_blob.channel(q); - for (int y = 0; y < h; ++y) { - for (int x = 0; x < out_w; ++x) { - float sigmoid = - static_cast(1.f / (1.f + exp(-in_ptr[x + out_w]))); - out_ptr[x] = in_ptr[x] * sigmoid; + for (int i = 0; i < size; ++i) + { + float sigmoid = static_cast(1.f / (1.f + exp(-in_ptr[i + offset]))); + out_ptr[i] = in_ptr[i] * sigmoid; + } } - in_ptr += w; - out_ptr += out_w; - } - } - return 0; - } // if (dims == 3 && positive_axis == 2) + return 0; + } // if (dims == 3 && positive_axis == 0) { + + if (dims == 3 && positive_axis == 1) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int c = bottom_blob.c; + + int out_w = w; + int out_h = h / 2; + int out_c = c; + + top_blob.create(out_w, out_h, out_c, sizeof(float), opt.blob_allocator); + + int offset = out_h * out_w; + int size = offset; + + #pragma omp 
parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; ++q) + { + const float* in_ptr = bottom_blob.channel(q); + float* out_ptr = top_blob.channel(q); + + for (int i = 0; i < size; ++i) + { + float sigmoid = static_cast(1.f / (1.f + exp(-in_ptr[i + offset]))); + out_ptr[i] = in_ptr[i] * sigmoid; + } + } + return 0; + } // if (dims == 3 && positive_axis == 1) + + if (dims == 3 && positive_axis == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int c = bottom_blob.c; + + int out_w = w / 2; + int out_h = h; + int out_c = c; + + top_blob.create(out_w, out_h, out_c, sizeof(float), opt.blob_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; ++q) + { + const float* in_ptr = bottom_blob.channel(q); + float* out_ptr = top_blob.channel(q); + for (int y = 0; y < h; ++y) + { + for (int x = 0; x < out_w; ++x) + { + float sigmoid = static_cast(1.f / (1.f + exp(-in_ptr[x + out_w]))); + out_ptr[x] = in_ptr[x] * sigmoid; + } + in_ptr += w; + out_ptr += out_w; + } + } + return 0; + } // if (dims == 3 && positive_axis == 2) - return -100; + return -100; } -} // namespace ncnn +} // namespace ncnn diff --git a/src/layer/glu.h b/src/layer/glu.h index 762730635e19..003682955921 100644 --- a/src/layer/glu.h +++ b/src/layer/glu.h @@ -19,19 +19,20 @@ namespace ncnn { -class GLU : public Layer { - public: - GLU(); +class GLU : public Layer +{ +public: + GLU(); - virtual int load_param(const ParamDict &pd); + virtual int load_param(const ParamDict& pd); - virtual int forward(const Mat &bottom_blob, Mat &top_blob, - const Option &opt) const; + virtual int forward(const Mat& bottom_blob, Mat& top_blob, + const Option& opt) const; - public: - int axis; +public: + int axis; }; -} // namespace ncnn +} // namespace ncnn -#endif // LAYER_GLU_H +#endif // LAYER_GLU_H diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp new file mode 100644 index 000000000000..c9e3969100b2 --- /dev/null +++ b/src/layer/gridsample.cpp @@ -0,0 +1,451 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// coord compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to coord writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "gridsample.h" + +#include + +namespace ncnn { + +GridSample::GridSample() +{ + one_blob_only = false; + support_inplace = false; +} + +int GridSample::load_param(const ParamDict& pd) +{ + sample_type = pd.get(0, 1); + padding_mode = pd.get(1, 1); + align_corner = pd.get(2, 0); + + if (sample_type < 1 || sample_type > 3) + { + NCNN_LOGE("unsupported sample type %d", sample_type); + return -1; + } + + if (padding_mode < 1 || padding_mode > 3) + { + NCNN_LOGE("unsupported padding mode %d", padding_mode); + return -1; + } + + return 0; +} + +// Restore normalized location to acutal image location +// When align_corners is true: +// Normalized location (-1, -1) points to the top-left pixel. +// Normalized location (1, 1) points to the bottom-tight pixel. 
+// When align_corners is false [default]: +// Normalized location (-1, -1) points to the top-left pixel minus half +// pixel coord both directions, i.e, (-0.5, -0.5) coord acutal image space. +// Normalized location (1, 1) points to the bottom-tight pixel plus half +// pixel coord both directions, i.e. (H - 0.5, W - 0.5) coord acutal image space. +static float grid_sample_unormalize(int w, float coordx, int align_corner) +{ + return align_corner ? (coordx + 1) / 2.f * (w - 1) : ((coordx + 1) * w - 1) / 2.f; +} + +static float border_coord(int x, int border) +{ + return std::min(border, std::max(x, 0)); +} + +static float reflect_coord(float x, int high) +{ + x = abs(x); + x = high - abs(x - high); + return x; +} + +static int compute_coord(int sx, int w, int padding_mode, int align_corner) +{ + if (padding_mode == 2) // border + { + sx = border_coord(sx, w - 1); + } + else if (padding_mode == 3) // reflection + { + if (align_corner) + { + sx = reflect_coord(sx, w - 1); + } + else + { + sx = static_cast(reflect_coord(sx + 0.5, w) - 0.5); + sx = border_coord(sx, w - 1); + } + } + + return sx; +} + +static bool in_bounds(const Mat& image, int x, int y) +{ + return x >= 0 && y >= 0 && x < image.w && y < image.h; +} + +static bool in_bounds(const Mat& image, int x, int y, int z) +{ + return x >= 0 && y >= 0 && z >= 0 && x < image.w && y < image.h && z < image.c; +} + +static float get_value_bounded(const Mat& image, int x, int y) +{ + return in_bounds(image, x, y) ? image.row(y)[x] : 0.f; +} + +static float get_value_bounded(const Mat& image, int x, int y, int z) +{ + return in_bounds(image, x, y, z) ? image.channel(z).row(y)[x] : 0.f; +} + +static float get_value_bounded(const Mat& image, int x, int y, int padding_mode, int align_corner) +{ + x = compute_coord(x, image.w, padding_mode, align_corner); + y = compute_coord(y, image.h, padding_mode, align_corner); + + return get_value_bounded(image, x, y); +} + +static float get_value_bounded(const Mat& image, int x, int y, int z, int padding_mode, int align_corner) +{ + x = compute_coord(x, image.w, padding_mode, align_corner); + y = compute_coord(y, image.h, padding_mode, align_corner); + z = compute_coord(z, image.c, padding_mode, align_corner); + + return get_value_bounded(image, x, y, z); +} + +static inline void interpolate_cubic(float fx, float* coeffs) +{ + const float A = -0.75f; + + float fx0 = fx + 1; + float fx1 = fx; + float fx2 = 1 - fx; + // float fx3 = 2 - fx; + + coeffs[0] = A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A; + coeffs[1] = (A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1; + coeffs[2] = (A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} + +int GridSample::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& grid = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + + if (dims == 3) + { + int outw = grid.h; + int outh = grid.c; + + top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (sample_type == 1) // bilinear + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat image = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + for (int y 
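// A worked example of the unnormalize mapping defined above, for a row of w = 5 pixels
// (reference only, same formula as grid_sample_unormalize):
//   align_corner = 1:  x = -1 -> 0.0,  x = 1 -> 4.0   (corners land on pixel centers)
//   align_corner = 0:  x = -1 -> -0.5, x = 1 -> 4.5   (corners land on pixel edges)
static float unormalize_ref(int w, float x, int align_corner)
{
    return align_corner ? (x + 1) / 2.f * (w - 1) : ((x + 1) * w - 1) / 2.f;
}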
= 0; y < outh; y++) + { + const float* gridptr = grid.channel(y); + + for (int x = 0; x < outw; x++) + { + float sample_x = gridptr[0]; + float sample_y = gridptr[1]; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + + // bilinear interpolate + float v; + { + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + float v00 = get_value_bounded(image, x0, y0, padding_mode, align_corner); + float v01 = get_value_bounded(image, x1, y0, padding_mode, align_corner); + float v10 = get_value_bounded(image, x0, y1, padding_mode, align_corner); + float v11 = get_value_bounded(image, x1, y1, padding_mode, align_corner); + + float alpha = sample_x - x0; + float beta = sample_y - y0; + + float v0 = v00 * (1 - alpha) + v01 * alpha; + float v1 = v10 * (1 - alpha) + v11 * alpha; + + v = v0 * (1 - beta) + v1 * beta; + } + + outptr[0] = v; + outptr += 1; + + gridptr += 2; + } + } + } + } + else if (sample_type == 2) // nearest + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat image = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + for (int y = 0; y < outh; y++) + { + const float* gridptr = grid.channel(y); + + for (int x = 0; x < outw; x++) + { + float sample_x = gridptr[0]; + float sample_y = gridptr[1]; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + + int x0 = static_cast(round(sample_x)); + int y0 = static_cast(round(sample_y)); + + float v = get_value_bounded(image, x0, y0, padding_mode, align_corner); + + outptr[0] = v; + outptr += 1; + + gridptr += 2; + } + } + } + } + else if (sample_type == 3) // bicubic + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat image = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + for (int y = 0; y < outh; y++) + { + const float* gridptr = grid.channel(y); + + for (int x = 0; x < outw; x++) + { + float sample_x = gridptr[0]; + float sample_y = gridptr[1]; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + + // bicubic interpolate + float v; + { + int x1 = floor(sample_x); + int y1 = floor(sample_y); + int x0 = x1 - 1; + int y0 = y1 - 1; + int x2 = x1 + 1; + int y2 = y1 + 1; + int x3 = x1 + 2; + int y3 = y1 + 2; + + float v00 = get_value_bounded(image, x0, y0, padding_mode, align_corner); + float v01 = get_value_bounded(image, x1, y0, padding_mode, align_corner); + float v02 = get_value_bounded(image, x2, y0, padding_mode, align_corner); + float v03 = get_value_bounded(image, x3, y0, padding_mode, align_corner); + float v10 = get_value_bounded(image, x0, y1, padding_mode, align_corner); + float v11 = get_value_bounded(image, x1, y1, padding_mode, align_corner); + float v12 = get_value_bounded(image, x2, y1, padding_mode, align_corner); + float v13 = get_value_bounded(image, x3, y1, padding_mode, align_corner); + float v20 = get_value_bounded(image, x0, y2, padding_mode, align_corner); + float v21 = get_value_bounded(image, x1, y2, padding_mode, align_corner); + float v22 = get_value_bounded(image, x2, y2, padding_mode, align_corner); + float v23 = get_value_bounded(image, x3, y2, padding_mode, align_corner); + float v30 = get_value_bounded(image, x0, y3, padding_mode, align_corner); + float v31 = 
get_value_bounded(image, x1, y3, padding_mode, align_corner); + float v32 = get_value_bounded(image, x2, y3, padding_mode, align_corner); + float v33 = get_value_bounded(image, x3, y3, padding_mode, align_corner); + + float x_coeffs[4]; + float y_coeffs[4]; + interpolate_cubic(sample_x - x1, x_coeffs); + interpolate_cubic(sample_y - y1, y_coeffs); + + float v0 = v00 * x_coeffs[0] + v01 * x_coeffs[1] + v02 * x_coeffs[2] + v03 * x_coeffs[3]; + float v1 = v10 * x_coeffs[0] + v11 * x_coeffs[1] + v12 * x_coeffs[2] + v13 * x_coeffs[3]; + float v2 = v20 * x_coeffs[0] + v21 * x_coeffs[1] + v22 * x_coeffs[2] + v23 * x_coeffs[3]; + float v3 = v30 * x_coeffs[0] + v31 * x_coeffs[1] + v32 * x_coeffs[2] + v33 * x_coeffs[3]; + + v = v0 * y_coeffs[0] + v1 * y_coeffs[1] + v2 * y_coeffs[2] + v3 * y_coeffs[3]; + } + + outptr[0] = v; + outptr += 1; + + gridptr += 2; + } + } + } + } + } + + if (dims == 4) + { + int outw = grid.h; + int outh = grid.d; + int outd = grid.c; + + top_blob.create(outw, outh, outd, channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (sample_type == 1) // bilinear + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat image = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + for (int z = 0; z < outd; z++) + { + const float* gridptr = grid.channel(z); + + for (int y = 0; y < outh; y++) + { + for (int x = 0; x < outw; x++) + { + float sample_x = gridptr[0]; + float sample_y = gridptr[1]; + float sample_z = gridptr[2]; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + sample_z = grid_sample_unormalize(d, sample_z, align_corner); + + // bilinear interpolate + float v; + { + int x0 = (int)floor(sample_x); + int y0 = (int)floor(sample_y); + int z0 = (int)floor(sample_z); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + float v000 = get_value_bounded(image, x0, y0, z0, padding_mode, align_corner); + float v001 = get_value_bounded(image, x1, y0, z0, padding_mode, align_corner); + float v010 = get_value_bounded(image, x0, y1, z0, padding_mode, align_corner); + float v011 = get_value_bounded(image, x1, y1, z0, padding_mode, align_corner); + float v100 = get_value_bounded(image, x0, y0, z1, padding_mode, align_corner); + float v101 = get_value_bounded(image, x1, y0, z1, padding_mode, align_corner); + float v110 = get_value_bounded(image, x0, y1, z1, padding_mode, align_corner); + float v111 = get_value_bounded(image, x1, y1, z1, padding_mode, align_corner); + + float alpha = sample_x - x0; + float beta = sample_y - y0; + float gamma = sample_z - z0; + + float v00 = v000 * (1 - alpha) + v001 * alpha; + float v01 = v010 * (1 - alpha) + v011 * alpha; + float v10 = v100 * (1 - alpha) + v101 * alpha; + float v11 = v110 * (1 - alpha) + v111 * alpha; + + float v0 = v00 * (1 - beta) + v01 * beta; + float v1 = v10 * (1 - beta) + v11 * beta; + + v = v0 * (1 - gamma) + v1 * gamma; + } + + outptr[0] = v; + outptr += 1; + + gridptr += 3; + } + } + } + } + } + else if (sample_type == 2) // nearest + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat image = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + for (int z = 0; z < outd; z++) + { + const float* gridptr = grid.channel(z); + + for (int y = 0; y < outh; y++) + { + for (int x = 0; x < outw; x++) + { + float sample_x = gridptr[0]; + float sample_y = gridptr[1]; + float sample_z 
= gridptr[2]; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + sample_z = grid_sample_unormalize(d, sample_z, align_corner); + + int x0 = static_cast(round(sample_x)); + int y0 = static_cast(round(sample_y)); + int z0 = static_cast(round(sample_z)); + + float v = get_value_bounded(image, x0, y0, z0, padding_mode, align_corner); + + outptr[0] = v; + outptr += 1; + + gridptr += 3; + } + } + } + } + } + else if (sample_type == 3) + { + NCNN_LOGE("unsupported bicubic when dims == 4"); + return -1; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/gridsample.h b/src/layer/gridsample.h new file mode 100644 index 000000000000..0ea540eb4baf --- /dev/null +++ b/src/layer/gridsample.h @@ -0,0 +1,40 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_GRIDSAMPLE_H +#define LAYER_GRIDSAMPLE_H + +#include "layer.h" + +namespace ncnn { + +class GridSample : public Layer +{ +public: + GridSample(); + + virtual int load_param(const ParamDict& pd); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +public: + // param + int sample_type; // 1=bilinear 2=nearest 3=bicubic + int padding_mode; // 1=zeros 2=border 3=reflection + int align_corner; +}; + +} // namespace ncnn + +#endif // LAYER_GRIDSAMPLE_H diff --git a/src/layer/groupnorm.cpp b/src/layer/groupnorm.cpp index 81847d573195..596d3974308d 100644 --- a/src/layer/groupnorm.cpp +++ b/src/layer/groupnorm.cpp @@ -52,66 +52,180 @@ int GroupNorm::load_model(const ModelBin& mb) int GroupNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { - // x = (x - mean) / sqrt(var + eps) * gamma + beta + const int dims = bottom_top_blob.dims; + const int channels_per_group = channels / group; - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int size = w * h; - - int channels_per_group = channels / group; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int g = 0; g < group; g++) + if (dims == 1) { - Mat bottom_top_blob_g = bottom_top_blob.channel_range(g * channels_per_group, channels_per_group); - - // mean and var - float sum = 0.f; - for (int q = 0; q < channels_per_group; q++) + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) { - const float* ptr = bottom_top_blob_g.channel(q); - for (int i = 0; i < size; i++) + Mat bottom_top_blob_g = bottom_top_blob.range(g * channels_per_group, channels_per_group); + const Mat gamma_data_g = gamma_data.range(g * channels_per_group, channels_per_group); + const Mat beta_data_g = beta_data.range(g * channels_per_group, channels_per_group); + + // mean and var + float sum = 0.f; + for (int q = 0; q < channels_per_group; q++) { - sum += ptr[i]; + sum += bottom_top_blob_g[q]; } - } - float mean = sum / 
(channels_per_group * size); + float mean = sum / channels_per_group; - float sqsum = 0.f; - for (int q = 0; q < channels_per_group; q++) - { - const float* ptr = bottom_top_blob_g.channel(q); - for (int i = 0; i < size; i++) + float sqsum = 0.f; + for (int q = 0; q < channels_per_group; q++) { - float tmp = ptr[i] - mean; + float tmp = bottom_top_blob_g[q] - mean; sqsum += tmp * tmp; } + float var = sqsum / channels_per_group; + + for (int q = 0; q < channels_per_group; q++) + { + float a; + float b; + if (affine) + { + float gamma = gamma_data_g[q]; + float beta = beta_data_g[q]; + + a = (float)(gamma / sqrt(var + eps)); + b = -mean * a + beta; + } + else + { + a = (float)(1.f / (sqrt(var + eps))); + b = -mean * a; + } + + bottom_top_blob_g[q] = bottom_top_blob_g[q] * a + b; + } } - float var = sqsum / (channels_per_group * size); + } - for (int q = 0; q < channels_per_group; q++) + if (dims == 2) + { + int w = bottom_top_blob.w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) { - float a; - float b; - if (affine) + Mat bottom_top_blob_g = bottom_top_blob.row_range(g * channels_per_group, channels_per_group); + const Mat gamma_data_g = gamma_data.range(g * channels_per_group, channels_per_group); + const Mat beta_data_g = beta_data.range(g * channels_per_group, channels_per_group); + + // mean and var + float sum = 0.f; + for (int q = 0; q < channels_per_group; q++) { - float gamma = gamma_data[g * channels_per_group + q]; - float beta = beta_data[g * channels_per_group + q]; + const float* ptr = bottom_top_blob_g.row(q); + for (int i = 0; i < w; i++) + { + sum += ptr[i]; + } + } + float mean = sum / (channels_per_group * w); - a = static_cast(gamma / sqrt(var + eps)); - b = -mean * a + beta; + float sqsum = 0.f; + for (int q = 0; q < channels_per_group; q++) + { + const float* ptr = bottom_top_blob_g.row(q); + for (int i = 0; i < w; i++) + { + float tmp = ptr[i] - mean; + sqsum += tmp * tmp; + } } - else + float var = sqsum / (channels_per_group * w); + + for (int q = 0; q < channels_per_group; q++) { - a = static_cast(1.f / (sqrt(var + eps))); - b = -mean * a; + float a; + float b; + if (affine) + { + float gamma = gamma_data_g[q]; + float beta = beta_data_g[q]; + + a = (float)(gamma / sqrt(var + eps)); + b = -mean * a + beta; + } + else + { + a = (float)(1.f / (sqrt(var + eps))); + b = -mean * a; + } + + float* ptr = bottom_top_blob_g.row(q); + for (int i = 0; i < w; i++) + { + ptr[i] = ptr[i] * a + b; + } } + } + } - float* ptr = bottom_top_blob_g.channel(q); + if (dims == 3 || dims == 4) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int size = w * h * d; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat bottom_top_blob_g = bottom_top_blob.channel_range(g * channels_per_group, channels_per_group); + const Mat gamma_data_g = gamma_data.range(g * channels_per_group, channels_per_group); + const Mat beta_data_g = beta_data.range(g * channels_per_group, channels_per_group); + + // mean and var + float sum = 0.f; + for (int q = 0; q < channels_per_group; q++) + { + const float* ptr = bottom_top_blob_g.channel(q); + for (int i = 0; i < size; i++) + { + sum += ptr[i]; + } + } + float mean = sum / (channels_per_group * size); + + float sqsum = 0.f; + for (int q = 0; q < channels_per_group; q++) + { + const float* ptr = bottom_top_blob_g.channel(q); + for (int i = 0; i < size; i++) + { + float tmp = ptr[i] - mean; + sqsum += tmp * tmp; + } + } + float 
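// A scalar sketch of the affine fold repeated in each GroupNorm branch above: rather
// than evaluating (x - mean) / sqrt(var + eps) * gamma + beta per element, a and b are
// derived once per channel so each element needs only a multiply-add. Illustrative
// helper, not the ncnn code.
#include <math.h>

static void group_norm_apply(float* ptr, int size, float mean, float var,
                             float eps, float gamma, float beta)
{
    const float a = gamma / sqrtf(var + eps);
    const float b = -mean * a + beta;

    for (int i = 0; i < size; i++)
        ptr[i] = ptr[i] * a + b;
}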
var = sqsum / (channels_per_group * size); - for (int i = 0; i < size; i++) + for (int q = 0; q < channels_per_group; q++) { - ptr[i] = ptr[i] * a + b; + float a; + float b; + if (affine) + { + float gamma = gamma_data_g[q]; + float beta = beta_data_g[q]; + + a = (float)(gamma / sqrt(var + eps)); + b = -mean * a + beta; + } + else + { + a = (float)(1.f / (sqrt(var + eps))); + b = -mean * a; + } + + float* ptr = bottom_top_blob_g.channel(q); + for (int i = 0; i < size; i++) + { + ptr[i] = ptr[i] * a + b; + } } } } diff --git a/src/layer/loongarch/absval_loongarch.cpp b/src/layer/loongarch/absval_loongarch.cpp new file mode 100644 index 000000000000..ea60b01eaf02 --- /dev/null +++ b/src/layer/loongarch/absval_loongarch.cpp @@ -0,0 +1,67 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "absval_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +namespace ncnn { + +AbsVal_loongarch::AbsVal_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int AbsVal_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128i _p = __lsx_vld(ptr, 0); + __m128i _outp = __lsx_vbitclri_w(_p, 31); + __lsx_vst(_outp, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = *ptr > 0 ? *ptr : -*ptr; + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/absval_loongarch.h b/src/layer/loongarch/absval_loongarch.h new file mode 100644 index 000000000000..0a3143cea432 --- /dev/null +++ b/src/layer/loongarch/absval_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
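Editor's sketch (illustrative only, not part of this patch): the GroupNorm rewrite above reduces each group to a single multiply-add per element once the group mean and variance are known, by folding gamma/beta into a scale a and shift b. A minimal scalar version of that per-group step, using generic names (data, count, gamma, beta, eps) rather than ncnn's Mat types:

#include <cmath>

// Illustrative scalar version of one group's normalization: mean and variance
// over the group, then gamma/beta folded into a scale a and shift b so the
// final pass is one multiply-add per element.
static void group_norm_1d(float* data, int count, float gamma, float beta, float eps)
{
    float sum = 0.f;
    for (int i = 0; i < count; i++)
        sum += data[i];
    const float mean = sum / count;

    float sqsum = 0.f;
    for (int i = 0; i < count; i++)
    {
        const float t = data[i] - mean;
        sqsum += t * t;
    }
    const float var = sqsum / count;

    const float a = gamma / std::sqrt(var + eps); // scale
    const float b = -mean * a + beta;             // shift
    for (int i = 0; i < count; i++)
        data[i] = data[i] * a + b;
}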
+ +#ifndef LAYER_ABSVAL_LOONGARCH_H +#define LAYER_ABSVAL_LOONGARCH_H + +#include "absval.h" + +namespace ncnn { + +class AbsVal_loongarch : virtual public AbsVal +{ +public: + AbsVal_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_ABSVAL_LOONGARCH_H diff --git a/src/layer/loongarch/batchnorm_loongarch.cpp b/src/layer/loongarch/batchnorm_loongarch.cpp new file mode 100644 index 000000000000..f0e33b78efdc --- /dev/null +++ b/src/layer/loongarch/batchnorm_loongarch.cpp @@ -0,0 +1,145 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "batchnorm_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +BatchNorm_loongarch::BatchNorm_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int BatchNorm_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + int w = bottom_top_blob.w * elempack; + +#if __loongarch_sx + int nn_w = w / 4; + int remain_w_start = nn_w * 4; +#else + int remain_w_start = 0; +#endif // __loongarch_sx + + float* ptr = bottom_top_blob; + +#if __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < nn_w; i++) + { + float* ptr0 = ptr + i * 4; + + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __m128 _a = (__m128)__lsx_vld((const float*)a_data + i * 4, 0); + __m128 _b = (__m128)__lsx_vld((const float*)b_data + i * 4, 0); + _p = __lsx_vfmadd_s(_b, _p, _a); + __lsx_vst(_p, ptr0, 0); + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_w_start; i < w; i++) + { + ptr[i] = b_data[i] * ptr[i] + a_data[i]; + } + } + + if (dims == 2) + { + int w = bottom_top_blob.w * elempack; + int h = bottom_top_blob.h; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + float a = a_data[i]; + float b = b_data[i]; + + int j = 0; +#if __loongarch_sx + __m128 _a = elempack == 4 ? (__m128)__lsx_vld((const float*)a_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(a); + __m128 _b = elempack == 4 ? 
(__m128)__lsx_vld((const float*)b_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(b); + for (; j + 3 < w; j += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmadd_s(_b, _p, _a); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + *ptr = b * *ptr + a; + ptr++; + } + } + } + + if (dims == 3 || dims == 4) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + float* ptr = bottom_top_blob.channel(q); + float a = a_data[q]; + float b = b_data[q]; + + int i = 0; +#if __loongarch_sx + __m128 _a = elempack == 4 ? (__m128)__lsx_vld((const float*)a_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(a); + __m128 _b = elempack == 4 ? (__m128)__lsx_vld((const float*)b_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(b); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmadd_s(_b, _p, _a); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = b * *ptr + a; + ptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/batchnorm_loongarch.h b/src/layer/loongarch/batchnorm_loongarch.h new file mode 100644 index 000000000000..8b38d5e1f666 --- /dev/null +++ b/src/layer/loongarch/batchnorm_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_BATCHNORM_LOONGARCH_H +#define LAYER_BATCHNORM_LOONGARCH_H + +#include "batchnorm.h" + +namespace ncnn { + +class BatchNorm_loongarch : virtual public BatchNorm +{ +public: + BatchNorm_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_BATCHNORM_LOONGARCH_H diff --git a/src/layer/loongarch/bias_loongarch.cpp b/src/layer/loongarch/bias_loongarch.cpp new file mode 100644 index 000000000000..74129a8d3284 --- /dev/null +++ b/src/layer/loongarch/bias_loongarch.cpp @@ -0,0 +1,70 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
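Editor's sketch (illustrative only, not part of this patch): the LSX BatchNorm kernel above only ever evaluates y = b * x + a per channel; slope/mean/var/bias have already been folded into a_data and b_data before forward_inplace runs, in the base BatchNorm layer. A hedged scalar sketch of that fold with generic names (the exact formula is shown as an assumption mirroring ncnn's base layer):

#include <cmath>
#include <vector>

// Illustrative fold of slope (gamma), mean, var and bias (beta) into the
// per-channel a/b pair the kernel consumes.
struct FoldedBN
{
    std::vector<float> a; // shift
    std::vector<float> b; // scale
};

static FoldedBN fold_batchnorm(const std::vector<float>& slope,
                               const std::vector<float>& mean,
                               const std::vector<float>& var,
                               const std::vector<float>& bias,
                               float eps)
{
    FoldedBN f;
    f.a.resize(mean.size());
    f.b.resize(mean.size());
    for (size_t i = 0; i < mean.size(); i++)
    {
        const float sqrt_var = std::sqrt(var[i] + eps);
        f.b[i] = slope[i] / sqrt_var;                     // scale
        f.a[i] = bias[i] - slope[i] * mean[i] / sqrt_var; // shift
    }
    return f;
}
// forward then reduces to y = b[c] * x + a[c] for every element of channel c.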
+ +#include "bias_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +int Bias_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int size = w * h * d; + + const float* bias_ptr = bias_data; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + float bias = bias_ptr[q]; + +#if __loongarch_sx + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __loongarch_sx + +#if __loongarch_sx + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias); + for (; nn > 0; nn--) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = __lsx_vfadd_s(_p, _bias); + __lsx_vst(_outp, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + + for (; remain > 0; remain--) + { + *ptr = *ptr + bias; + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/bias_loongarch.h b/src/layer/loongarch/bias_loongarch.h new file mode 100644 index 000000000000..f122ffa0dd92 --- /dev/null +++ b/src/layer/loongarch/bias_loongarch.h @@ -0,0 +1,30 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_BIAS_LOONGARCH_H +#define LAYER_BIAS_LOONGARCH_H + +#include "bias.h" + +namespace ncnn { + +class Bias_loongarch : virtual public Bias +{ +public: + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_BIAS_LOONGARCH_H diff --git a/src/layer/loongarch/binaryop_loongarch.cpp b/src/layer/loongarch/binaryop_loongarch.cpp new file mode 100644 index 000000000000..7832c9ca732b --- /dev/null +++ b/src/layer/loongarch/binaryop_loongarch.cpp @@ -0,0 +1,1066 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "binaryop_loongarch.h" + +#include + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +namespace ncnn { + +BinaryOp_loongarch::BinaryOp_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +template +static int binary_op_2_3_4_20(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = b.w; + int h = b.h; + int d = b.d; + int channels = b.c; + int elempack = b.elempack; + int size = w * h * d * elempack; + + // type 2 3 4 20 + c.create_like(b, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float a0 = a[0]; + const float* ptr = b.channel(q); + float* outptr = c.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _a0 = __lsx_vreplfr2vr_s(a0); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_a0, _p); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = op(a0, *ptr); + ptr += 1; + outptr += 1; + } + } + + return 0; +} + +template +static int binary_op_6_11_16_25(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + // type 6 11 16 25 + c.create_like(a, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float b0 = b[0]; + float* outptr = c.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _b0 = __lsx_vreplfr2vr_s(b0); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = op(*ptr, b0); + ptr += 1; + outptr += 1; + } + } + + return 0; +} + +template +static int binary_op_7_13_19_29(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + // type 7 13 19 29 + c.create_like(a, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = op(*ptr, *ptr1); + ptr += 1; + ptr1 += 1; + outptr += 1; + } + } + + return 0; +} + +#if __loongarch_sx +// broadcasting rule +// https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting + +template +static int binary_op_pack4(const Mat& a, const Mat& b, Mat& c, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int size = w * h * d; + size_t elemsize = a.elemsize; + int elempack = a.elempack; + + int w1 = b.w; + int h1 = 
b.h; + int d1 = b.d; + int channels1 = b.c; + int size1 = w1 * h1 * d1; + size_t elemsize1 = b.elemsize; + int elempack1 = b.elempack; + + if (a.dims == 4) + { + if (b.dims == 4) + { + // type 29 + return binary_op_7_13_19_29(a, b, c, opt); + } + + c.create(w, h, d, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 3) + { + // type 28 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d; z++) + { + for (int y = 0; y < h; y++) + { + __m128 _b0 = (__m128)__lsx_vld(ptr1, 0); + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + + ptr1 += 4; + } + } + } + + return 0; + } + + if (b.dims == 2) + { + // type 27 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.row(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d; z++) + { + __m128 _b0 = (__m128)__lsx_vld(ptr1, 0); + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + } + + ptr1 += 4; + } + } + + return 0; + } + + if (b.dims == 1) + { + if (b.w == 1 && elempack1 == 1) + { + // type 25 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 26 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + __m128 _b0 = (__m128)__lsx_vld((const float*)b + q * 4, 0); + float* outptr = c.channel(q); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + } + + return 0; + } + } + else if (a.dims == 3) + { + if (b.dims == 4) + { + // type 23 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d1; z++) + { + for (int y = 0; y < h1; y++) + { + __m128 _a0 = (__m128)__lsx_vld(ptr, 0); + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + + ptr += 4; + } + } + } + + return 0; + } + + if (b.dims == 3) + { + if (w1 == 1 && h1 == 1 && channels1 == channels) + { + // special type 1 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* b0 = b.channel(q); + float* outptr = c.channel(q); + __m128 _b0 = (__m128)__lsx_vld(b0, 0); + for (int i = 0; i < size; i++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + } + + return 0; + } + + if (w1 == w && 
h1 == h && channels1 == 1 && elempack1 == 1) + { + // special type 2 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b; + float* outptr = c.channel(q); + for (int i = 0; i < size; i++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = __lsx_vreplfr2vr_s(ptr1[0]); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + ptr1 += 1; + outptr += 4; + } + } + + return 0; + } + + if (w == 1 && h == 1 && channels1 == channels) + { + // special type 3 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* a0 = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + __m128 _a0 = (__m128)__lsx_vld(a0, 0); + for (int i = 0; i < size1; i++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (w1 == w && h1 == h && channels == 1 && elempack == 1) + { + // special type 4 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a; + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + for (int i = 0; i < size1; i++) + { + __builtin_prefetch(ptr + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _p = __lsx_vreplfr2vr_s(ptr[0]); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + ptr += 1; + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (w != 1 && w1 == 1 && h1 == h && channels1 == channels) + { + // special type 5 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + __m128 _p1 = (__m128)__lsx_vld(ptr1 + y * 4, 0); + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + + ptr += 4; + outptr += 4; + } + } + } + + return 0; + } + + if (w1 == w && h != 1 && h1 == 1 && channels1 == channels) + { + // special type 6 + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1 + x * 4, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + + ptr += 4; + outptr += 4; + } + } + } + + return 0; + } + + if (w1 != 1 && w == 1 && h1 == h && channels1 == channels) + { + // special type 7 + c.create(w1, h1, channels1, elemsize1, elempack1, 
opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + __m128 _p = (__m128)__lsx_vld(ptr + y * 4, 0); + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + + ptr1 += 4; + outptr += 4; + } + } + } + + return 0; + } + + if (w1 == w && h1 != 1 && h == 1 && channels1 == channels) + { + // special type 8 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p = (__m128)__lsx_vld(ptr + x * 4, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_p, _p1); + __lsx_vst(_outp, outptr, 0); + + ptr1 += 4; + outptr += 4; + } + } + } + + return 0; + } + + // type 19 + return binary_op_7_13_19_29(a, b, c, opt); + } + + c.create(w, h, channels, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 2) + { + // type 18 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + const float* ptr1 = b.row(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h; y++) + { + __m128 _b0 = (__m128)__lsx_vld(ptr1, 0); + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + + ptr1 += 4; + } + } + + return 0; + } + + if (b.dims == 1) + { + if (b.w == 1 && elempack1 == 1) + { + // type 16 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 17 + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = a.channel(q); + __m128 _b0 = (__m128)__lsx_vld((const float*)b + q * 4, 0); + float* outptr = c.channel(q); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + } + + return 0; + } + } + else if (a.dims == 2) + { + if (b.dims == 4) + { + // type 22 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + const float* ptr = a.row(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int z = 0; z < d1; z++) + { + __m128 _a0 = (__m128)__lsx_vld(ptr, 0); + for (int y = 0; y < h1; y++) + { + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + } + + ptr += 4; + } + } + + return 0; + } + + if (b.dims == 3) + { + // type 14 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; 
q < channels1; q++) + { + const float* ptr = a.row(q); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int y = 0; y < h1; y++) + { + __m128 _a0 = (__m128)__lsx_vld(ptr, 0); + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + + ptr += 4; + } + } + + return 0; + } + + c.create(w, h, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.dims == 2) + { + // type 13 + return binary_op_7_13_19_29(a, b, c, opt); + } + + if (b.dims == 1) + { + c.create(w, h, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.w == 1 && elempack1 == 1) + { + // type 11 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 12 + const float* ptr = a; + const float* ptr1 = b; + float* outptr = c; + + for (int y = 0; y < h; y++) + { + __m128 _b0 = (__m128)__lsx_vld(ptr1, 0); + for (int x = 0; x < w; x++) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = op(_p, _b0); + __lsx_vst(_outp, outptr, 0); + ptr += 4; + outptr += 4; + } + + ptr1 += 4; + } + + return 0; + } + } + else if (a.dims == 1) + { + if (a.w == 1 && elempack == 1) + { + // type 2 3 4 20 + return binary_op_2_3_4_20(a, b, c, opt); + } + + if (b.dims == 4) + { + // type 21 + c.create(w1, h1, d1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + __m128 _a0 = (__m128)__lsx_vld((const float*)a + q * 4, 0); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int i = 0; i < size1; i++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (b.dims == 3) + { + // type 9 + c.create(w1, h1, channels1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels1; q++) + { + __m128 _a0 = (__m128)__lsx_vld((const float*)a + q * 4, 0); + const float* ptr1 = b.channel(q); + float* outptr = c.channel(q); + + for (int i = 0; i < size1; i++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + } + + return 0; + } + + if (b.dims == 2) + { + // type 8 + c.create(w1, h1, elemsize1, elempack1, opt.blob_allocator); + if (c.empty()) + return -100; + + const float* ptr = a; + const float* ptr1 = b; + float* outptr = c; + + for (int y = 0; y < h1; y++) + { + __m128 _a0 = (__m128)__lsx_vld(ptr, 0); + for (int x = 0; x < w1; x++) + { + __builtin_prefetch(ptr1 + 16); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + __m128 _outp = op(_a0, _p1); + __lsx_vst(_outp, outptr, 0); + ptr1 += 4; + outptr += 4; + } + + ptr += 4; + } + + return 0; + } + + if (b.dims == 1) + { + c.create(w, elemsize, elempack, opt.blob_allocator); + if (c.empty()) + return -100; + + if (b.w == 1 && elempack1 == 1) + { + // type 6 + return binary_op_6_11_16_25(a, b, c, opt); + } + + // type 7 + binary_op_7_13_19_29(a, b, c, opt); + } + } + + return 0; +} +#endif // __loongarch_sx + +template +static int binary_op_scalar_inplace(Mat& a, float b, const Option& opt) +{ + Op op; + + int w = a.w; + int h = 
a.h; + int d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = a.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _b = __lsx_vreplfr2vr_s(b); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = op(_p, _b); + __lsx_vst(_p, ptr, 0); + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = op(*ptr, b); + ptr++; + } + } + + return 0; +} + +namespace BinaryOp_loongarch_functor { + +#if __loongarch_sx +#define MAKE_FUNCTION(NAME, IMPL, IMPL4) \ + struct NAME \ + { \ + float operator()(const float& x, const float& y) const \ + { \ + return IMPL; \ + } \ + __m128 operator()(const __m128& x, const __m128& y) const \ + { \ + return IMPL4; \ + } \ + }; +#else +#define MAKE_FUNCTION(NAME, IMPL, IMPL4) \ + struct NAME \ + { \ + float operator()(const float& x, const float& y) const \ + { \ + return IMPL; \ + } \ + }; +#endif // __loongarch_sx + +// clang-format off +// *INDENT-OFF* +MAKE_FUNCTION(binary_op_add, x + y, __lsx_vfadd_s(x, y)) +MAKE_FUNCTION(binary_op_sub, x - y, __lsx_vfsub_s(x, y)) +MAKE_FUNCTION(binary_op_mul, x * y, __lsx_vfmul_s(x, y)) +MAKE_FUNCTION(binary_op_div, x / y, __lsx_vfdiv_s(x, y)) +MAKE_FUNCTION(binary_op_max, std::max(x, y), __lsx_vfmax_s(x, y)) +MAKE_FUNCTION(binary_op_min, std::min(x, y), __lsx_vfmin_s(x, y)) +MAKE_FUNCTION(binary_op_pow, (float)pow(x, y), pow_ps(x, y)) +MAKE_FUNCTION(binary_op_rsub, y - x, __lsx_vfsub_s(y, x)) +MAKE_FUNCTION(binary_op_rdiv, y / x, __lsx_vfdiv_s(y, x)) +// *INDENT-ON* +// clang-format on + +#undef MAKE_FUNCTION + +} // namespace BinaryOp_loongarch_functor + +int BinaryOp_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ +#if __loongarch_sx + using namespace BinaryOp_loongarch_functor; + + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& bottom_blob1 = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + int elempack = bottom_blob.elempack; + int elempack1 = bottom_blob1.elempack; + + if (elempack == 4 || elempack1 == 4) + { + if (op_type == Operation_ADD) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_SUB) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MUL) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_DIV) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MAX) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_MIN) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_POW) + return binary_op_pack4(bottom_blob, bottom_blob1, top_blob, opt); + + if (op_type == Operation_RSUB) + return binary_op_pack4(bottom_blob1, bottom_blob, top_blob, opt); + + if (op_type == Operation_RDIV) + return binary_op_pack4(bottom_blob1, bottom_blob, top_blob, opt); + } +#endif // __loongarch_sx + + return BinaryOp::forward(bottom_blobs, top_blobs, opt); +} + +int BinaryOp_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + using namespace BinaryOp_loongarch_functor; + + if (op_type == Operation_ADD) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_SUB) + return binary_op_scalar_inplace(bottom_top_blob, b, 
opt); + + if (op_type == Operation_MUL) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_DIV) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_MAX) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_MIN) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_POW) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_RSUB) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + if (op_type == Operation_RDIV) + return binary_op_scalar_inplace(bottom_top_blob, b, opt); + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/binaryop_loongarch.h b/src/layer/loongarch/binaryop_loongarch.h new file mode 100644 index 000000000000..bcf9ef5442fc --- /dev/null +++ b/src/layer/loongarch/binaryop_loongarch.h @@ -0,0 +1,34 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_BINARYOP_LOONGARCH_H +#define LAYER_BINARYOP_LOONGARCH_H + +#include "binaryop.h" + +namespace ncnn { + +class BinaryOp_loongarch : virtual public BinaryOp +{ +public: + BinaryOp_loongarch(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_BINARYOP_LOONGARCH_H diff --git a/src/layer/loongarch/cast_loongarch.cpp b/src/layer/loongarch/cast_loongarch.cpp new file mode 100644 index 000000000000..2e956657f142 --- /dev/null +++ b/src/layer/loongarch/cast_loongarch.cpp @@ -0,0 +1,209 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
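Editor's sketch (illustrative only, not part of this patch): the BinaryOp implementation above dispatches through tiny functor structs generated by MAKE_FUNCTION, so a single templated loop serves add, sub, mul, div, max, min, pow and the reversed variants. A reduced version of that pattern with hypothetical names, not the patch's own:

#include <algorithm>

// Each operation is a small functor with a float operator() (and, under
// __loongarch_sx, an __m128 overload); one templated loop is instantiated
// per operation.
struct op_add
{
    float operator()(float x, float y) const { return x + y; }
};
struct op_max
{
    float operator()(float x, float y) const { return std::max(x, y); }
};

template<typename Op>
static void apply_scalar_inplace(float* ptr, int size, float b)
{
    Op op;
    for (int i = 0; i < size; i++)
        ptr[i] = op(ptr[i], b); // x + b, max(x, b), ...
}

// usage: apply_scalar_inplace<op_add>(data, n, 0.5f);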
+ +#include "cast_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +namespace ncnn { + +Cast_loongarch::Cast_loongarch() +{ + support_packing = true; +} + +int Cast_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + if (type_from == type_to) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + size_t out_elemsize = elemsize; + if (type_to == 1) + { + if (type_from == 3) + { + Cast::forward(bottom_blob, top_blob, opt); + } + + // float32 + out_elemsize = 4 * elempack; + } + else if (type_to == 2) + { + // float16 + out_elemsize = 2 * elempack; + } + else if (type_to == 3) + { + // int8 + out_elemsize = elempack; + } + else if (type_to == 4) + { + // bfloat16 + out_elemsize = 2 * elempack; + } + + if (dims == 1) + { + top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 2) + { + top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 3) + { + top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 4) + { + top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); + } + if (top_blob.empty()) + return -100; + + int size = w * h * d * elempack; + + if (type_from == 1 && type_to == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + unsigned short* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(ptr + 16); + __m128 _p0 = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr + 4, 0); + __m128i _p = __lsx_vfcvt_h_s(_p1, _p0); + __lsx_vst(_p, outptr, 0); + + ptr += 8; + outptr += 8; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = float32_to_float16(*ptr); + outptr++; + ptr++; + } + } + } + + if (type_from == 2 && type_to == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const unsigned short* ptr = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(ptr + 16); + __m128i _p = __lsx_vld(ptr, 0); + __m128 _p0 = __lsx_vfcvtl_s_h(_p); + __m128 _p1 = __lsx_vfcvth_s_h(_p); + __lsx_vst(_p0, outptr, 0); + __lsx_vst(_p1, outptr + 4, 0); + + ptr += 8; + outptr += 8; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = float16_to_float32(*ptr); + outptr++; + ptr++; + } + } + } + + if (type_from == 3 && type_to == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const signed char* ptr = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + outptr[i] = (float)ptr[i]; + } + } + } + + if (type_from == 4 && type_to == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const unsigned short* ptr = bottom_blob.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; + for (; i < size; i++) + { + *outptr = bfloat16_to_float32(*ptr); + outptr++; + ptr++; + } + } + } + + if (type_from == 1 && type_to == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for 
(int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + unsigned short* outptr = top_blob.channel(q); + + int i = 0; + for (; i < size; i++) + { + *outptr = float32_to_bfloat16(*ptr); + outptr++; + ptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/cast_loongarch.h b/src/layer/loongarch/cast_loongarch.h new file mode 100644 index 000000000000..1fe75c687d8e --- /dev/null +++ b/src/layer/loongarch/cast_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CAST_LOONGARCH_H +#define LAYER_CAST_LOONGARCH_H + +#include "cast.h" + +namespace ncnn { + +class Cast_loongarch : virtual public Cast +{ +public: + Cast_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_CAST_LOONGARCH_H diff --git a/src/layer/loongarch/clip_loongarch.cpp b/src/layer/loongarch/clip_loongarch.cpp new file mode 100644 index 000000000000..7cf0246d060c --- /dev/null +++ b/src/layer/loongarch/clip_loongarch.cpp @@ -0,0 +1,76 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
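Editor's sketch (illustrative only, not part of this patch): the scalar tails of the Cast kernel above convert through helpers such as float32_to_bfloat16. bfloat16 is the top 16 bits of an IEEE-754 float, so a truncating round-trip looks like the following; ncnn's helper may round to nearest rather than truncate, so treat this purely as an illustration:

#include <cstdint>
#include <cstring>

// Truncating float32 -> bfloat16: keep the high 16 bits.
static uint16_t f32_to_bf16_trunc(float x)
{
    uint32_t u;
    std::memcpy(&u, &x, sizeof(u));
    return (uint16_t)(u >> 16); // drop the low 16 mantissa bits
}

// bfloat16 -> float32: re-expand into the high half of a 32-bit pattern.
static float bf16_to_f32(uint16_t h)
{
    const uint32_t u = (uint32_t)h << 16;
    float x;
    std::memcpy(&x, &u, sizeof(x));
    return x;
}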
+ +#include "clip_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Clip_loongarch::Clip_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Clip_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _max = (__m128)__lsx_vreplfr2vr_s(max); + __m128 _min = (__m128)__lsx_vreplfr2vr_s(min); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmax_s(_p, _min); + _p = __lsx_vfmin_s(_p, _max); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < min) + *ptr = min; + + if (*ptr > max) + *ptr = max; + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/clip_loongarch.h b/src/layer/loongarch/clip_loongarch.h new file mode 100644 index 000000000000..43df62035ff3 --- /dev/null +++ b/src/layer/loongarch/clip_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CLIP_LOONGARCH_H +#define LAYER_CLIP_LOONGARCH_H + +#include "clip.h" + +namespace ncnn { + +class Clip_loongarch : virtual public Clip +{ +public: + Clip_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_CLIP_LOONGARCH_H diff --git a/src/layer/loongarch/concat_loongarch.cpp b/src/layer/loongarch/concat_loongarch.cpp new file mode 100644 index 000000000000..50460f8c134b --- /dev/null +++ b/src/layer/loongarch/concat_loongarch.cpp @@ -0,0 +1,348 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "concat_loongarch.h" + +namespace ncnn { + +Concat_loongarch::Concat_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Concat_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + int dims = bottom_blobs[0].dims; + int positive_axis = axis < 0 ? dims + axis : axis; + + if (dims == 1) // positive_axis == 0 + { + // concat vector + // total length + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w * bottom_blob.elempack; + } + + int out_elempack = opt.use_packing_layout && top_w % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + float* outptr = top_blob; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + const float* ptr = bottom_blob; + memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize); + + outptr += bottom_blob.w * bottom_blob.elempack; + } + } + + if (dims == 2 && positive_axis == 0) + { + // concat image + int w = bottom_blobs[0].w; + + // total height + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_h = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_h += bottom_blob.h * bottom_blob.elempack; + } + + int out_elempack = opt.use_packing_layout && top_h % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + Mat top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + float* outptr = top_blob_unpacked; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + if (bottom_blob.elempack == 4 && elempack == 1) + { + for (int i = 0; i < bottom_blob.h; i++) + { + const float* r0 = bottom_blob.row(i); + + float* outptr0 = outptr; + float* outptr1 = outptr + w; + float* outptr2 = outptr + w * 2; + float* outptr3 = outptr + w * 3; + + for (int j = 0; j < w; j++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + + outptr += w * 4; + } + } + else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4) + { + int size = w * bottom_blob.h; + + const float* ptr = bottom_blob; + memcpy(outptr, ptr, size * bottom_blob.elemsize); + + outptr += size * bottom_blob.elempack; + } + } + + // packing + if (elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + } + + if (dims == 2 && positive_axis == 1) + { + // interleave image row + int h = bottom_blobs[0].h; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total width + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w; + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* outptr = top_blob.row(i); + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + const float* ptr = bottom_blob.row(i); + memcpy(outptr, ptr, bottom_blob.w * elemsize); + + outptr += bottom_blob.w * elempack; + } + } + } + + if (dims == 3 && positive_axis == 0) + { + // concat dim + int w = bottom_blobs[0].w; + int h = bottom_blobs[0].h; + + // total channels + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_channels = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_channels += bottom_blob.c * bottom_blob.elempack; + } + + int out_elempack = opt.use_packing_layout && top_channels % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + Mat top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + int p = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + if (bottom_blob.elempack == 4 && elempack == 1) + { + int size = bottom_blob.w * bottom_blob.h; + + for (int q = 0; q < bottom_blob.c; q++) + { + const float* r0 = bottom_blob.channel(q); + + float* outptr0 = top_blob_unpacked.channel(p); + float* outptr1 = top_blob_unpacked.channel(p + 1); + float* outptr2 = top_blob_unpacked.channel(p + 2); + float* outptr3 = top_blob_unpacked.channel(p + 3); + + for (int i = 0; i < size; i++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + + p += 4; + } + } + else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4) + { + int size = bottom_blob.total(); + + const float* ptr = bottom_blob; + float* outptr = top_blob_unpacked.channel(p); + memcpy(outptr, ptr, size * bottom_blob.elemsize); + + p += bottom_blob.c; + } + } + + // packing + if (elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + } + + if (dims == 3 && positive_axis == 1) + { + // interleave dim height + int w = bottom_blobs[0].w; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_h = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + top_h += bottom_blob.h; + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + int size = bottom_blob.w * bottom_blob.h; + + const float* ptr = bottom_blob.channel(q); + memcpy(outptr, ptr, size * elemsize); + + outptr += size * elempack; + } + } + } + + if (dims == 3 && positive_axis == 2) + { + // interleave dim width + int h = bottom_blobs[0].h; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w; + } + + Mat& top_blob = top_blobs[0]; + top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* outptr = top_blob.channel(q); + + for (int i = 0; i < h; i++) + { + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob = bottom_blobs[b]; + + const float* ptr = bottom_blob.channel(q).row(i); + memcpy(outptr, ptr, bottom_blob.w * elemsize); + + outptr += bottom_blob.w * elempack; + } + } + } + } + + return 0; +} + +} // 
namespace ncnn diff --git a/src/layer/loongarch/concat_loongarch.h b/src/layer/loongarch/concat_loongarch.h new file mode 100644 index 000000000000..934c85244df3 --- /dev/null +++ b/src/layer/loongarch/concat_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CONCAT_LOONGARCH_H +#define LAYER_CONCAT_LOONGARCH_H + +#include "concat.h" + +namespace ncnn { + +class Concat_loongarch : virtual public Concat +{ +public: + Concat_loongarch(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_CONCAT_LOONGARCH_H diff --git a/src/layer/loongarch/convolution1d_loongarch.cpp b/src/layer/loongarch/convolution1d_loongarch.cpp new file mode 100644 index 000000000000..0b1a11c868f0 --- /dev/null +++ b/src/layer/loongarch/convolution1d_loongarch.cpp @@ -0,0 +1,379 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution1d_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +Convolution1D_loongarch::Convolution1D_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Convolution1D_loongarch::create_pipeline(const Option& opt) +{ + if (dynamic_weight) + return 0; + + const int num_input = weight_data_size / kernel_w / num_output; + + int elempack = 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + + // src = kw-inch-outch + // dst = pb-pa-kw-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(kernel_w, num_input, num_output); + + weight_data_packed.create(kernel_w, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + float* g00 = weight_data_packed.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < kernel_w; k++) + { + for (int i = 0; i < elempack; i++) + { + for (int j = 0; j < out_elempack; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + } + + return 0; +} + +int Convolution1D_loongarch::destroy_pipeline(const Option& /*opt*/) +{ + return 0; +} + +int Convolution1D_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + const int outw = (w - kernel_extent_w) / stride_w + 1; + const int outh = num_output / out_elempack; + + top_blob.create(outw, outh, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 4 && out_elempack == 4) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + float* outptr = top_blob.row(p); + + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + const float* kptr = weight_data_packed.channel(p); + + for (int q = 0; q < h; q++) + { + const float* sptr = bottom_blob_bordered.row(q) + j * stride_w * 4; + + for (int k = 0; k < kernel_w; k++) + { + __m128 _val0 = __lsx_vreplfr2vr_s(sptr[0]); + __m128 _val1 = __lsx_vreplfr2vr_s(sptr[1]); + __m128 _val2 = __lsx_vreplfr2vr_s(sptr[2]); + __m128 _val3 = __lsx_vreplfr2vr_s(sptr[3]); + + __m128 _w0 = (__m128)__lsx_vld(kptr, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0); + + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + _sum = __lsx_vfmadd_s(_w1, _val1, _sum); + _sum = __lsx_vfmadd_s(_w2, _val2, _sum); + _sum = __lsx_vfmadd_s(_w3, _val3, _sum); + + sptr += dilation_w * 4; + kptr += 16; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr, 0); + outptr += 4; + } + } + } + } + + if (elempack == 1 && out_elempack == 4) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + float* outptr = top_blob.row(p); + + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + const float* kptr = weight_data_packed.channel(p); + + for (int q = 0; q < h; 
q++) + { + const float* sptr = bottom_blob_bordered.row(q) + j * stride_w; + + for (int k = 0; k < kernel_w; k++) + { + __m128 _val = __lsx_vreplfr2vr_s(sptr[0]); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + + sptr += dilation_w; + kptr += 4; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr, 0); + outptr += 4; + } + } + } + } + + if (elempack == 4 && out_elempack == 1) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + float* outptr = top_blob.row(p); + + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + const float* kptr = weight_data_packed.channel(p); + + for (int q = 0; q < h; q++) + { + const float* sptr = bottom_blob_bordered.row(q) + j * stride_w * 4; + + for (int k = 0; k < kernel_w; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + + sptr += dilation_w * 4; + kptr += 4; + } + } + + sum += __lsx_reduce_fadd_s(_sum); + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + } + } + } +#endif // __loongarch_sx + + if (elempack == 1 && out_elempack == 1) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outh; p++) + { + float* outptr = top_blob.row(p); + + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + const float* kptr = (const float*)weight_data + kernel_w * h * p; + + for (int q = 0; q < h; q++) + { + const float* sptr = bottom_blob_bordered.row(q) + j * stride_w; + + for (int k = 0; k < kernel_w; k++) + { + float val = sptr[0]; + float wt = kptr[0]; + sum += val * wt; + + sptr += dilation_w; + kptr += 1; + } + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + } + } + } + + return 0; +} + +int Convolution1D_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& _weight_data = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + const int _kernel_w = _weight_data.w; + const int _num_output = _weight_data.c * _weight_data.elempack; + + Mat weight_data_flattened; + flatten(_weight_data, weight_data_flattened, opt); + if (weight_data_flattened.empty()) + return -100; + + // weight_data_flattened as pack1 + weight_data_flattened.w *= weight_data_flattened.elempack; + weight_data_flattened.elemsize /= weight_data_flattened.elempack; + weight_data_flattened.elempack = 1; + + Mat bias_data_flattened; + if (bias_term) + { + const Mat& _bias_data = bottom_blobs[2]; + flatten(_bias_data, bias_data_flattened, opt); + if (bias_data_flattened.empty()) + return -100; + + // bias_data_flattened as pack1 + bias_data_flattened.w *= bias_data_flattened.elempack; + bias_data_flattened.elemsize /= bias_data_flattened.elempack; + bias_data_flattened.elempack = 1; + } + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution1D); + + ncnn::ParamDict pd; + pd.set(0, _num_output); + pd.set(1, _kernel_w); + pd.set(2, dilation_w); + pd.set(3, stride_w); + pd.set(4, pad_left); + pd.set(15, pad_right); + pd.set(18, pad_value); + pd.set(5, bias_term); + pd.set(6, weight_data_flattened.w); + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + ncnn::Mat 
weights[2]; + weights[0] = weight_data_flattened; + weights[1] = bias_data_flattened; + + op->load_model(ncnn::ModelBinFromMatArray(weights)); + + op->create_pipeline(opt); + + op->forward(bottom_blob, top_blob, opt); + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/convolution1d_loongarch.h b/src/layer/loongarch/convolution1d_loongarch.h new file mode 100644 index 000000000000..36393df45688 --- /dev/null +++ b/src/layer/loongarch/convolution1d_loongarch.h @@ -0,0 +1,41 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CONVOLUTION1D_LOONGARCH_H +#define LAYER_CONVOLUTION1D_LOONGARCH_H + +#include "convolution1d.h" + +namespace ncnn { + +class Convolution1D_loongarch : virtual public Convolution1D +{ +public: + Convolution1D_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; + +public: + // packn + Mat weight_data_packed; +}; + +} // namespace ncnn + +#endif // LAYER_CONVOLUTION1D_LOONGARCH_H diff --git a/src/layer/loongarch/convolution_1x1.h b/src/layer/loongarch/convolution_1x1.h new file mode 100644 index 000000000000..83d3778411ae --- /dev/null +++ b/src/layer/loongarch/convolution_1x1.h @@ -0,0 +1,26 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv1x1s1_sgemm_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_int8.h b/src/layer/loongarch/convolution_1x1_int8.h new file mode 100644 index 000000000000..08f439c484ae --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_int8.h @@ -0,0 +1,83 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv1x1s1_sgemm_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const signed char* r0 = bottom_blob.channel(p); + signed char* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + outptr[2] = r0[4]; + outptr[3] = r0[6]; + + r0 += 8; + outptr += 4; + } + for (; j + 1 < outw; j += 2) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + + r0 += 4; + outptr += 2; + } + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack1to4_int8.h b/src/layer/loongarch/convolution_1x1_pack1to4_int8.h new file mode 100644 index 000000000000..00e1e2581417 --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack1to4_int8.h @@ -0,0 +1,83 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
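// A 1x1 stride-1 convolution is just a matrix multiply over the flattened
// spatial positions: out[oc][i] = sum_ic k[oc][ic] * in[ic][i].  That is why
// conv1x1s1_sgemm_pack1to4_int8_lsx below does no im2col at all -- it only
// reshapes the blob (w = w * h, h = 1) and forwards it to the sgemm kernel --
// and why the stride-2 variant first decimates the input and then reuses the
// stride-1 path.  Minimal scalar sketch of that identity (illustrative only;
// conv1x1s1_int8_reference is not part of ncnn, and packing is ignored):
static void conv1x1s1_int8_reference(const signed char* in, int* out, const signed char* weights,
                                     int inch, int outch, int spatial /* = w * h */)
{
    for (int oc = 0; oc < outch; oc++)
    {
        for (int i = 0; i < spatial; i++)
        {
            int sum = 0;
            for (int ic = 0; ic < inch; ic++)
                sum += (int)weights[oc * inch + ic] * (int)in[ic * spatial + i];
            out[oc * spatial + i] = sum; // requantization happens later in the layer
        }
    }
}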
+ +static void conv1x1s1_sgemm_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack1to4_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const signed char* r0 = bottom_blob.channel(p); + signed char* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + outptr[2] = r0[4]; + outptr[3] = r0[6]; + + r0 += 8; + outptr += 4; + } + for (; j + 1 < outw; j += 2) + { + outptr[0] = r0[0]; + outptr[1] = r0[2]; + + r0 += 4; + outptr += 2; + } + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack1to4_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack4.h b/src/layer/loongarch/convolution_1x1_pack4.h new file mode 100644 index 000000000000..cf5a5b8e3638 --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack4.h @@ -0,0 +1,65 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
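// In ncnn's packing layout, elempack = 4 means four consecutive channels are
// interleaved per element, so one "pixel" of a packed channel group is a whole
// 4-float vector.  That is why conv1x1s2_sgemm_pack4_lsx below can shrink the
// stride-2 input with a single __lsx_vld/__lsx_vst per kept pixel.  Minimal
// sketch of the pack1 -> pack4 interleave on plain arrays (illustrative only;
// pack1_to_pack4 is not an ncnn function and inch is assumed to be a multiple of 4):
static void pack1_to_pack4(const float* planar, float* packed, int inch, int spatial)
{
    for (int c = 0; c < inch; c += 4)
    {
        float* outptr = packed + c * spatial; // start of this packed channel group
        for (int i = 0; i < spatial; i++)
        {
            outptr[i * 4 + 0] = planar[(c + 0) * spatial + i];
            outptr[i * 4 + 1] = planar[(c + 1) * spatial + i];
            outptr[i * 4 + 2] = planar[(c + 2) * spatial + i];
            outptr[i * 4 + 3] = planar[(c + 3) * spatial + i];
        }
    }
}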
+ +static void conv1x1s1_sgemm_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack4_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} + +static void conv1x1s2_sgemm_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = (w - 2 * outw + w) * 4; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* r0 = bottom_blob.channel(p); + float* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _val = (__m128)__lsx_vld(r0, 0); + __lsx_vst(_val, outptr, 0); + + r0 += 4 * 2; + outptr += 4; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack4_lsx(bottom_blob_shrinked, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack4to1.h b/src/layer/loongarch/convolution_1x1_pack4to1.h new file mode 100644 index 000000000000..b87129091e4a --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack4to1.h @@ -0,0 +1,65 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
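// The stride-2 helpers in these headers (conv1x1s2_sgemm_pack4to1_lsx below and
// its siblings) turn a stride-2 1x1 convolution back into the stride-1 sgemm by
// keeping every second pixel of every second row.  The pointer bookkeeping is
// easiest to see with a worked example: with w = 7 the output width is
// outw = (7 - 1) / 2 + 1 = 4, the inner loop advances the read pointer by
// 2 * outw = 8 pixels, and tailstep = w - 2 * outw + w = 6 more pixels, so the
// pointer moves 2 * w = 14 pixels per output row -- exactly two input rows, as
// expected for vertical stride 2.  In the pack4 variants the same quantities are
// multiplied by 4 because each pixel holds four floats.  A tiny helper spelling
// out that formula (illustrative only, not part of ncnn):
static int conv1x1s2_row_tailstep(int w, int outw)
{
    // what remains of the current input row after 2 * outw pixels, plus the
    // whole next input row that vertical stride 2 skips
    return (w - 2 * outw) + w;
}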
+ +static void conv1x1s1_sgemm_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack4to1_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} + +static void conv1x1s2_sgemm_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = (w - 2 * outw + w) * 4; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* r0 = bottom_blob.channel(p); + float* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _val = (__m128)__lsx_vld(r0, 0); + __lsx_vst(_val, outptr, 0); + + r0 += 4 * 2; + outptr += 4; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack4to1_lsx(bottom_blob_shrinked, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack8to1_int8.h b/src/layer/loongarch/convolution_1x1_pack8to1_int8.h new file mode 100644 index 000000000000..8df0e128b7fb --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack8to1_int8.h @@ -0,0 +1,65 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
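// On the int8 path, elempack = 8 packs eight consecutive channels into one
// element, so a single pixel of a packed channel group occupies exactly 8 bytes.
// conv1x1s2_sgemm_pack8to1_int8_lsx below exploits this by walking the rows as
// int64_t: one 64-bit load/store moves a whole packed pixel.  Minimal sketch of
// the pack1 -> pack8 interleave on plain arrays (illustrative only;
// pack1_to_pack8_int8 is not an ncnn function and inch is assumed to be a
// multiple of 8):
static void pack1_to_pack8_int8(const signed char* planar, signed char* packed, int inch, int spatial)
{
    for (int c = 0; c < inch; c += 8)
    {
        signed char* outptr = packed + c * spatial; // 8 channels interleaved per pixel
        for (int i = 0; i < spatial; i++)
        {
            for (int k = 0; k < 8; k++)
                outptr[i * 8 + k] = planar[(c + k) * spatial + i];
        }
    }
}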
+ +static void conv1x1s1_sgemm_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack8to1_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const int64_t* r0 = bottom_blob.channel(p); + int64_t* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack8to1_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_1x1_pack8to4_int8.h b/src/layer/loongarch/convolution_1x1_pack8to4_int8.h new file mode 100644 index 000000000000..6aaa720d23d0 --- /dev/null +++ b/src/layer/loongarch/convolution_1x1_pack8to4_int8.h @@ -0,0 +1,65 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
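// A note on accumulator width for the int8 kernels in these headers: products
// are summed in 32-bit integers and dequantization/requantization is left to
// the calling layer.  A single int8*int8 product is bounded by 128 * 128 = 16384,
// so a 32-bit accumulator can absorb roughly 2^31 / 16384 = 131072 taps before
// it could overflow -- far more than inch * kernel_w * kernel_h reaches in
// practice, which is why an int32 accumulator is sufficient for the dot products
// behind conv1x1s1_sgemm_pack8to4_int8_lsx below.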
+ +static void conv1x1s1_sgemm_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + const int size = w * h; + + Mat bottom_im2col = bottom_blob; + bottom_im2col.w = size; + bottom_im2col.h = 1; + + im2col_sgemm_pack8to4_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} + +static void conv1x1s2_sgemm_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int tailstep = w - 2 * outw + w; + + Mat bottom_blob_shrinked; + bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const int64_t* r0 = bottom_blob.channel(p); + int64_t* outptr = bottom_blob_shrinked.channel(p); + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + outptr[0] = r0[0]; + + r0 += 2; + outptr += 1; + } + + r0 += tailstep; + } + } + + conv1x1s1_sgemm_pack8to4_int8_lsx(bottom_blob_shrinked, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_3x3.h b/src/layer/loongarch/convolution_3x3.h new file mode 100644 index 000000000000..66e10106b46c --- /dev/null +++ b/src/layer/loongarch/convolution_3x3.h @@ -0,0 +1,412 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
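// conv3x3s1_winograd23_* below implements Winograd F(2x2, 3x3): each 2x2 output
// tile is produced from a 4x4 input tile with 16 multiplies instead of the 36 a
// direct 3x3 convolution would need.  The 2-D transform is the 1-D F(2,3)
// transform applied to rows and then columns; the rows of the ktm[][] table in
// conv3x3s1_winograd23_transform_kernel_lsx are exactly the rows of the 1-D
// kernel transform G.  Minimal scalar sketch of the 1-D identity (illustrative
// only; winograd23_1d is not part of ncnn):
static void winograd23_1d(const float d[4], const float g[3], float y[2])
{
    // kernel transform  u = G g
    const float u0 = g[0];
    const float u1 = (g[0] + g[1] + g[2]) * 0.5f;
    const float u2 = (g[0] - g[1] + g[2]) * 0.5f;
    const float u3 = g[2];
    // input transform   v = Bt d
    const float v0 = d[0] - d[2];
    const float v1 = d[1] + d[2];
    const float v2 = d[2] - d[1];
    const float v3 = d[1] - d[3];
    // elementwise product and output transform  y = At (u .* v)
    const float m0 = u0 * v0, m1 = u1 * v1, m2 = u2 * v2, m3 = u3 * v3;
    y[0] = m0 + m1 + m2; // == d0*g0 + d1*g1 + d2*g2
    y[1] = m1 - m2 - m3; // == d1*g0 + d2*g1 + d3*g2
}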
+ +static void conv3x3s1_winograd23_transform_kernel_lsx(const Mat& kernel, Mat& kernel_tm2, int inch, int outch, const Option& opt) +{ + Mat kernel_tm(4 * 4, inch, outch); + + // G + const float ktm[4][3] = { + {1.0f, 0.0f, 0.0f}, + {1.0f / 2, 1.0f / 2, 1.0f / 2}, + {1.0f / 2, -1.0f / 2, 1.0f / 2}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[4][3]; + for (int i = 0; i < 4; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 4; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 4; i++) + { + kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 16-inch-outch + // dst = inch-16-outch +#if __loongarch_sx + kernel_tm2.create(8 * inch, 16, outch / 8 + (outch % 8) / 4 + outch % 4); +#else + kernel_tm2.create(2 * inch, 16, outch / 2 + outch % 2); +#endif + + int q = 0; +#if __loongarch_sx + for (; q + 7 < outch; q += 8) + { + Mat g0 = kernel_tm2.channel(q / 8); + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 8; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } + for (; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm2.channel(q / 8 + (q % 8) / 4); + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 4; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#else // __loongarch_sx + for (; q + 1 < outch; q += 2) + { + Mat g0 = kernel_tm2.channel(q / 2); + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 2; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#endif // __loongarch_sx + for (; q < outch; q++) + { +#if __loongarch_sx + Mat g0 = kernel_tm2.channel(q / 8 + (q % 8) / 4 + q % 4); +#else + Mat g0 = kernel_tm2.channel(q / 2 + q % 2); +#endif + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + const float* k00 = kernel_tm.channel(q).row(p); + g00[0] = k00[k]; + g00++; + } + } + } +} + +static void conv3x3s1_winograd23_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 2n+2, winograd F(2,3) + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 1) / 2 * 2; + outh = (outh + 1) / 2 * 2; + + w = outw + 2; + h = outh + 2; + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw 
/ 2; + int h_tiles = outh / 2; + int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 16, inch, 4u, opt.workspace_allocator); + conv3x3s1_winograd23_transform_input_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator); + } + { + conv3x3s1_winograd23_transform_output_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} + +static void conv3x3s1_winograd43_transform_kernel_lsx(const Mat& kernel, Mat& kernel_tm2, int inch, int outch, const Option& opt) +{ + Mat kernel_tm(6 * 6, inch, outch); + + // G + const float ktm[6][3] = { + {1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = inch-36-outch +#if __loongarch_sx + kernel_tm2.create(8 * inch, 36, outch / 8 + (outch % 8) / 4 + outch % 4); +#else + kernel_tm2.create(2 * inch, 36, outch / 2 + outch % 2); +#endif + + int q = 0; +#if __loongarch_sx + for (; q + 7 < outch; q += 8) + { + Mat g0 = kernel_tm2.channel(q / 8); + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 8; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } + for (; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm2.channel(q / 8 + (q % 8) / 4); + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 4; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#else // __loongarch_sx + for (; q + 1 < outch; q += 2) + { + Mat g0 = kernel_tm2.channel(q / 2); + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + for (int i = 0; i < 2; i++) + { + const float* k00 = kernel_tm.channel(q + i).row(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#endif // __loongarch_sx + for (; q < outch; q++) + { +#if __loongarch_sx + Mat g0 = kernel_tm2.channel(q / 8 + 
(q % 8) / 4 + q % 4); +#else + Mat g0 = kernel_tm2.channel(q / 2 + q % 2); +#endif + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p < inch; p++) + { + const float* k00 = kernel_tm.channel(q).row(p); + g00[0] = k00[k]; + g00++; + } + } + } +} + +static void conv3x3s1_winograd43_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2, winograd F(4,3) + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, 4u, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_3x3_int8.h b/src/layer/loongarch/convolution_3x3_int8.h new file mode 100644 index 000000000000..3ea28dd09445 --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_int8.h @@ -0,0 +1,252 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
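// conv3x3s1_winograd43_transform_kernel_int8_lsx below is the integer analogue
// of the float F(4x4, 3x3) kernel transform above: its ktm[][] is an
// integer-scaled version of G so the whole tile pipeline can stay in fixed
// point; the scaling it introduces has to be compensated when the output tiles
// are transformed back and requantized.  The 16-bit storage of kernel_tm is
// safe by a simple bound: every row of ktm has an absolute sum of at most
// 4 + 4 + 4 = 12, and int8 weights lie in [-128, 127], so after the first pass
// |tmp| <= 12 * 128 = 1536 and after the second pass
// |kernel_tm| <= 12 * 1536 = 18432, comfortably below the int16 limit of 32767
// -- which is why kernel_tm is created with 2-byte elements.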
+ +static void conv3x3s1_winograd43_transform_kernel_int8_lsx(const Mat& kernel, Mat& kernel_tm_packed, int inch, int outch, const Option& opt) +{ + // winograd43 transform kernel + Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); + + const short ktm[6][3] = { + {6, 0, 0}, + {-4, -4, -4}, + {-4, 4, -4}, + {1, 2, 4}, + {1, -2, 4}, + {0, 0, 6} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; + short* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const signed char* k0 = kernel0; + const signed char* k1 = kernel0 + 3; + const signed char* k2 = kernel0 + 6; + + // h + short tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + short* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = 2b-inch-36-outch/2b +#if __loongarch_sx + if (outch >= 4) + { + if (inch >= 4) + kernel_tm_packed.create(inch / 4 + inch % 4, 36, outch / 4 + outch % 4, (size_t)2u * 16, 16); + else + kernel_tm_packed.create(inch, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4); + } +#else // __loongarch_sx + if (outch >= 2) + { + kernel_tm_packed.create(inch, 36, outch / 2 + outch % 2, (size_t)2u * 2, 2); + } +#endif // __loongarch_sx + else + { +#if __loongarch_sx + if (inch >= 4) + kernel_tm_packed.create(inch / 4 + inch % 4, 36, outch, (size_t)2u * 4, 4); + else +#endif // __loongarch_sx + { + kernel_tm_packed.create(inch, 36, outch, (size_t)2u, 1); + } + } + + int p = 0; +#if __loongarch_sx + for (; p + 3 < outch; p += 4) + { + const Mat k0 = kernel_tm.channel(p); + const Mat k1 = kernel_tm.channel(p + 1); + const Mat k2 = kernel_tm.channel(p + 2); + const Mat k3 = kernel_tm.channel(p + 3); + + Mat g0 = kernel_tm_packed.channel(p / 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + int q = 0; + for (; q + 3 < inch; q += 4) + { + g00[0] = k0.row(q)[k]; + g00[1] = k0.row(q + 1)[k]; + g00[2] = k0.row(q + 2)[k]; + g00[3] = k0.row(q + 3)[k]; + g00[4] = k1.row(q)[k]; + g00[5] = k1.row(q + 1)[k]; + g00[6] = k1.row(q + 2)[k]; + g00[7] = k1.row(q + 3)[k]; + g00[8] = k2.row(q)[k]; + g00[9] = k2.row(q + 1)[k]; + g00[10] = k2.row(q + 2)[k]; + g00[11] = k2.row(q + 3)[k]; + g00[12] = k3.row(q)[k]; + g00[13] = k3.row(q + 1)[k]; + g00[14] = k3.row(q + 2)[k]; + g00[15] = k3.row(q + 3)[k]; + g00 += 16; + } + for (; q < inch; q++) + { + g00[0] = k0.row(q)[k]; + g00[1] = k1.row(q)[k]; + g00[2] = k2.row(q)[k]; + g00[3] = k3.row(q)[k]; + g00 += 4; + } + } + } +#else // __loongarch_sx + for (; p + 1 < outch; p += 2) + { + const Mat k0 = kernel_tm.channel(p); + const Mat k1 = kernel_tm.channel(p + 1); + + Mat g0 = kernel_tm_packed.channel(p / 2); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + int q = 0; + for (; q < inch; q++) + { + g00[0] = k0.row(q)[k]; + g00[1] = k1.row(q)[k]; + g00 += 2; + } + } + } +#endif // __loongarch_sx + for (; p < outch; p++) + { + const Mat k0 = kernel_tm.channel(p); + +#if __loongarch_sx + Mat g0 = kernel_tm_packed.channel(p / 4 + p % 4); +#else + Mat g0 = 
kernel_tm_packed.channel(p / 2 + p % 2); +#endif + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + int q = 0; +#if __loongarch_sx + for (; q + 3 < inch; q += 4) + { + g00[0] = k0.row(q)[k]; + g00[1] = k0.row(q + 1)[k]; + g00[2] = k0.row(q + 2)[k]; + g00[3] = k0.row(q + 3)[k]; + g00 += 4; + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + g00[0] = k0.row(q)[k]; + g00 += 1; + } + } + } +} + +static void conv3x3s1_winograd43_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + // size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_int8_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_int8_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_int8_lsx(top_blob_tm, top_blob_bordered, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_3x3_pack1to4.h b/src/layer/loongarch/convolution_3x3_pack1to4.h new file mode 100644 index 000000000000..2bcb0ce166dd --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_pack1to4.h @@ -0,0 +1,812 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int inch = bottom_blob.c; + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = bias ? 
(__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + out0.fill(_bias0); + + const float* k0 = kernel.channel(p); + + int q = 0; + for (; q < inch; q++) + { + float* outptr0 = out0; + + const Mat img0 = bottom_blob.channel(q); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0); + __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j + 7 < outw; j += 8) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + __m128 _sum4 = (__m128)__lsx_vld(outptr0 + 4 * 4, 0); + __m128 _sum5 = (__m128)__lsx_vld(outptr0 + 4 * 5, 0); + __m128 _sum6 = (__m128)__lsx_vld(outptr0 + 4 * 6, 0); + __m128 _sum7 = (__m128)__lsx_vld(outptr0 + 4 * 7, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + __m128i _r0nn = __lsx_vld(r0 + 8, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2); + __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3); + __m128 _r08 = (__m128)__lsx_vreplvei_w(_r0nn, 0); + __m128 _r09 = (__m128)__lsx_vreplvei_w(_r0nn, 1); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r01, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r02, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r03, _sum3); + _sum4 = __lsx_vfmadd_s(_k00, _r04, _sum4); + _sum5 = __lsx_vfmadd_s(_k00, _r05, _sum5); + _sum6 = __lsx_vfmadd_s(_k00, _r06, _sum6); + _sum7 = __lsx_vfmadd_s(_k00, _r07, _sum7); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r03, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r04, _sum3); + _sum4 = __lsx_vfmadd_s(_k01, _r05, _sum4); + _sum5 = __lsx_vfmadd_s(_k01, _r06, _sum5); + _sum6 = __lsx_vfmadd_s(_k01, _r07, _sum6); + _sum7 = __lsx_vfmadd_s(_k01, _r08, _sum7); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r05, _sum3); + _sum4 = __lsx_vfmadd_s(_k02, _r06, _sum4); + _sum5 = __lsx_vfmadd_s(_k02, _r07, _sum5); + _sum6 = __lsx_vfmadd_s(_k02, _r08, _sum6); + _sum7 = __lsx_vfmadd_s(_k02, _r09, _sum7); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + __m128i _r1nn = __lsx_vld(r1 + 8, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2); + __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3); + __m128 _r18 = 
(__m128)__lsx_vreplvei_w(_r1nn, 0); + __m128 _r19 = (__m128)__lsx_vreplvei_w(_r1nn, 1); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r11, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r12, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r13, _sum3); + _sum4 = __lsx_vfmadd_s(_k10, _r14, _sum4); + _sum5 = __lsx_vfmadd_s(_k10, _r15, _sum5); + _sum6 = __lsx_vfmadd_s(_k10, _r16, _sum6); + _sum7 = __lsx_vfmadd_s(_k10, _r17, _sum7); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r13, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r14, _sum3); + _sum4 = __lsx_vfmadd_s(_k11, _r15, _sum4); + _sum5 = __lsx_vfmadd_s(_k11, _r16, _sum5); + _sum6 = __lsx_vfmadd_s(_k11, _r17, _sum6); + _sum7 = __lsx_vfmadd_s(_k11, _r18, _sum7); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r15, _sum3); + _sum4 = __lsx_vfmadd_s(_k12, _r16, _sum4); + _sum5 = __lsx_vfmadd_s(_k12, _r17, _sum5); + _sum6 = __lsx_vfmadd_s(_k12, _r18, _sum6); + _sum7 = __lsx_vfmadd_s(_k12, _r19, _sum7); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + __m128i _r2nn = __lsx_vld(r2 + 8, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2); + __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3); + __m128 _r28 = (__m128)__lsx_vreplvei_w(_r2nn, 0); + __m128 _r29 = (__m128)__lsx_vreplvei_w(_r2nn, 1); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r21, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r22, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r23, _sum3); + _sum4 = __lsx_vfmadd_s(_k20, _r24, _sum4); + _sum5 = __lsx_vfmadd_s(_k20, _r25, _sum5); + _sum6 = __lsx_vfmadd_s(_k20, _r26, _sum6); + _sum7 = __lsx_vfmadd_s(_k20, _r27, _sum7); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r23, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r24, _sum3); + _sum4 = __lsx_vfmadd_s(_k21, _r25, _sum4); + _sum5 = __lsx_vfmadd_s(_k21, _r26, _sum5); + _sum6 = __lsx_vfmadd_s(_k21, _r27, _sum6); + _sum7 = __lsx_vfmadd_s(_k21, _r28, _sum7); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r25, _sum3); + _sum4 = __lsx_vfmadd_s(_k22, _r26, _sum4); + _sum5 = __lsx_vfmadd_s(_k22, _r27, _sum5); + _sum6 = __lsx_vfmadd_s(_k22, _r28, _sum6); + _sum7 = __lsx_vfmadd_s(_k22, _r29, _sum7); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + __lsx_vst(_sum4, outptr0 + 4 * 4, 0); + __lsx_vst(_sum5, outptr0 + 4 * 5, 0); + __lsx_vst(_sum6, outptr0 + 4 * 6, 0); + __lsx_vst(_sum7, outptr0 + 4 * 7, 0); + + outptr0 += 4 * 8; + + r0 += 8; + r1 += 8; + r2 += 8; + } + for (; j + 3 < outw; j += 4) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = 
__lsx_vld(r0 + 4, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r01, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r02, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r03, _sum3); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r03, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r04, _sum3); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r05, _sum3); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r11, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r12, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r13, _sum3); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r13, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r14, _sum3); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r15, _sum3); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r21, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r22, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r23, _sum3); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r23, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r24, _sum3); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r25, _sum3); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + + outptr0 += 4 * 4; + + r0 += 4; + r1 += 4; + r2 += 4; + } + for (; j + 1 < outw; j += 2) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r01, _sum1); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r02, _sum1); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, 
_r03, _sum1); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r11, _sum1); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r12, _sum1); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r13, _sum1); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r21, _sum1); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r22, _sum1); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r23, _sum1); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + + outptr0 += 4 * 2; + + r0 += 2; + r1 += 2; + r2 += 2; + } + for (; j < outw; j++) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 1; + r1 += 1; + r2 += 1; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + + k0 += 9 * 4; + } + } +} + +static void conv3x3s2_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int tailstep = w - 2 * outw + w; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = bias ? 
(__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + out0.fill(_bias0); + + const float* k0 = kernel.channel(p); + + int q = 0; + for (; q < inch; q++) + { + float* outptr0 = out0; + + const Mat img0 = bottom_blob.channel(q); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0); + __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j + 7 < outw; j += 8) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + __m128 _sum4 = (__m128)__lsx_vld(outptr0 + 4 * 4, 0); + __m128 _sum5 = (__m128)__lsx_vld(outptr0 + 4 * 5, 0); + __m128 _sum6 = (__m128)__lsx_vld(outptr0 + 4 * 6, 0); + __m128 _sum7 = (__m128)__lsx_vld(outptr0 + 4 * 7, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + __m128i _r0nn = __lsx_vld(r0 + 8, 0); + __m128i _r0nnn = __lsx_vld(r0 + 12, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2); + __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3); + __m128 _r08 = (__m128)__lsx_vreplvei_w(_r0nn, 0); + __m128 _r09 = (__m128)__lsx_vreplvei_w(_r0nn, 1); + __m128 _r0a = (__m128)__lsx_vreplvei_w(_r0nn, 2); + __m128 _r0b = (__m128)__lsx_vreplvei_w(_r0nn, 3); + __m128 _r0c = (__m128)__lsx_vreplvei_w(_r0nnn, 0); + __m128 _r0d = (__m128)__lsx_vreplvei_w(_r0nnn, 1); + __m128 _r0e = (__m128)__lsx_vreplvei_w(_r0nnn, 2); + __m128 _r0f = (__m128)__lsx_vreplvei_w(_r0nnn, 3); + __m128 _r0g = __lsx_vreplfr2vr_s(r0[16]); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r06, _sum3); + _sum4 = __lsx_vfmadd_s(_k00, _r08, _sum4); + _sum5 = __lsx_vfmadd_s(_k00, _r0a, _sum5); + _sum6 = __lsx_vfmadd_s(_k00, _r0c, _sum6); + _sum7 = __lsx_vfmadd_s(_k00, _r0e, _sum7); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r05, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r07, _sum3); + _sum4 = __lsx_vfmadd_s(_k01, _r09, _sum4); + _sum5 = __lsx_vfmadd_s(_k01, _r0b, _sum5); + _sum6 = __lsx_vfmadd_s(_k01, _r0d, _sum6); + _sum7 = __lsx_vfmadd_s(_k01, _r0f, _sum7); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r06, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r08, _sum3); + _sum4 = __lsx_vfmadd_s(_k02, _r0a, _sum4); + _sum5 = __lsx_vfmadd_s(_k02, _r0c, _sum5); + _sum6 = __lsx_vfmadd_s(_k02, _r0e, _sum6); + _sum7 = __lsx_vfmadd_s(_k02, _r0g, _sum7); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + __m128i _r1nn = __lsx_vld(r1 + 8, 0); + __m128i _r1nnn = 
__lsx_vld(r1 + 12, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2); + __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3); + __m128 _r18 = (__m128)__lsx_vreplvei_w(_r1nn, 0); + __m128 _r19 = (__m128)__lsx_vreplvei_w(_r1nn, 1); + __m128 _r1a = (__m128)__lsx_vreplvei_w(_r1nn, 2); + __m128 _r1b = (__m128)__lsx_vreplvei_w(_r1nn, 3); + __m128 _r1c = (__m128)__lsx_vreplvei_w(_r1nnn, 0); + __m128 _r1d = (__m128)__lsx_vreplvei_w(_r1nnn, 1); + __m128 _r1e = (__m128)__lsx_vreplvei_w(_r1nnn, 2); + __m128 _r1f = (__m128)__lsx_vreplvei_w(_r1nnn, 3); + __m128 _r1g = __lsx_vreplfr2vr_s(r1[16]); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r16, _sum3); + _sum4 = __lsx_vfmadd_s(_k10, _r18, _sum4); + _sum5 = __lsx_vfmadd_s(_k10, _r1a, _sum5); + _sum6 = __lsx_vfmadd_s(_k10, _r1c, _sum6); + _sum7 = __lsx_vfmadd_s(_k10, _r1e, _sum7); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r15, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r17, _sum3); + _sum4 = __lsx_vfmadd_s(_k11, _r19, _sum4); + _sum5 = __lsx_vfmadd_s(_k11, _r1b, _sum5); + _sum6 = __lsx_vfmadd_s(_k11, _r1d, _sum6); + _sum7 = __lsx_vfmadd_s(_k11, _r1f, _sum7); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r16, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r18, _sum3); + _sum4 = __lsx_vfmadd_s(_k12, _r1a, _sum4); + _sum5 = __lsx_vfmadd_s(_k12, _r1c, _sum5); + _sum6 = __lsx_vfmadd_s(_k12, _r1e, _sum6); + _sum7 = __lsx_vfmadd_s(_k12, _r1g, _sum7); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + __m128i _r2nn = __lsx_vld(r2 + 8, 0); + __m128i _r2nnn = __lsx_vld(r2 + 12, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2); + __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3); + __m128 _r28 = (__m128)__lsx_vreplvei_w(_r2nn, 0); + __m128 _r29 = (__m128)__lsx_vreplvei_w(_r2nn, 1); + __m128 _r2a = (__m128)__lsx_vreplvei_w(_r2nn, 2); + __m128 _r2b = (__m128)__lsx_vreplvei_w(_r2nn, 3); + __m128 _r2c = (__m128)__lsx_vreplvei_w(_r2nnn, 0); + __m128 _r2d = (__m128)__lsx_vreplvei_w(_r2nnn, 1); + __m128 _r2e = (__m128)__lsx_vreplvei_w(_r2nnn, 2); + __m128 _r2f = (__m128)__lsx_vreplvei_w(_r2nnn, 3); + __m128 _r2g = __lsx_vreplfr2vr_s(r2[16]); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r26, _sum3); + _sum4 = __lsx_vfmadd_s(_k20, _r28, _sum4); + _sum5 = __lsx_vfmadd_s(_k20, _r2a, _sum5); + _sum6 = __lsx_vfmadd_s(_k20, _r2c, _sum6); + _sum7 = __lsx_vfmadd_s(_k20, _r2e, _sum7); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r25, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r27, _sum3); + _sum4 = 
__lsx_vfmadd_s(_k21, _r29, _sum4); + _sum5 = __lsx_vfmadd_s(_k21, _r2b, _sum5); + _sum6 = __lsx_vfmadd_s(_k21, _r2d, _sum6); + _sum7 = __lsx_vfmadd_s(_k21, _r2f, _sum7); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r26, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r28, _sum3); + _sum4 = __lsx_vfmadd_s(_k22, _r2a, _sum4); + _sum5 = __lsx_vfmadd_s(_k22, _r2c, _sum5); + _sum6 = __lsx_vfmadd_s(_k22, _r2e, _sum6); + _sum7 = __lsx_vfmadd_s(_k22, _r2g, _sum7); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + __lsx_vst(_sum4, outptr0 + 4 * 4, 0); + __lsx_vst(_sum5, outptr0 + 4 * 5, 0); + __lsx_vst(_sum6, outptr0 + 4 * 6, 0); + __lsx_vst(_sum7, outptr0 + 4 * 7, 0); + + outptr0 += 4 * 8; + + r0 += 16; + r1 += 16; + r2 += 16; + } + for (; j + 3 < outw; j += 4) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2); + __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3); + __m128 _r08 = __lsx_vreplfr2vr_s(r0[8]); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r06, _sum3); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r05, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r07, _sum3); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r06, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r08, _sum3); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2); + __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3); + __m128 _r18 = __lsx_vreplfr2vr_s(r1[8]); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r16, _sum3); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r15, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r17, _sum3); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r16, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r18, _sum3); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = 
(__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2); + __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3); + __m128 _r28 = __lsx_vreplfr2vr_s(r2[8]); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r26, _sum3); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r25, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r27, _sum3); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r26, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r28, _sum3); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + + outptr0 += 4 * 4; + + r0 += 8; + r1 += 8; + r2 += 8; + } + for (; j + 1 < outw; j += 2) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = __lsx_vreplfr2vr_s(r0[4]); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = __lsx_vreplfr2vr_s(r1[4]); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = __lsx_vreplfr2vr_s(r2[4]); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + + outptr0 += 4 * 2; + + r0 += 4; + r1 += 4; + r2 += 4; + } + for (; j < outw; j++) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + + __m128i _r0 = __lsx_vld(r0, 0); + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + + __m128i _r1 = __lsx_vld(r1, 0); + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + + _sum0 = 
__lsx_vfmadd_s(_k10, _r10, _sum0); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + + __m128i _r2 = __lsx_vld(r2, 0); + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 2; + r1 += 2; + r2 += 2; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + + k0 += 9 * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_3x3_pack4.h b/src/layer/loongarch/convolution_3x3_pack4.h new file mode 100644 index 000000000000..f06bb7e9068c --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_pack4.h @@ -0,0 +1,425 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_winograd63_transform_kernel_pack4_lsx(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt) +{ + // winograd63 transform kernel + Mat kernel_tm; + kernel_tm.create(8 * 8, inch, outch); + + const float ktm[8][3] = { + {1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {1.0f / 45, 1.0f / 90, 1.0f / 180}, + {1.0f / 45, -1.0f / 90, 1.0f / 180}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel, transposed + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[8][3]; + for (int i = 0; i < 8; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // v + for (int j = 0; j < 8; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 8; i++) + { + kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 64-inch-outch + // dst = pb-pa-inch/pa-64-outch/pb + kernel_tm_pack4.create(inch / 4, 64, outch / 4, (size_t)4u * 4 * 4, 4 * 4); + + for (int q = 0; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm_pack4.channel(q / 4); + + for (int k = 0; k < 64; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = kernel_tm.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } +} + +static void 
conv3x3s1_winograd63_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 6n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 5) / 6 * 6; + outh = (outh + 5) / 6 * 6; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 6; + int h_tiles = outh / 6; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 64, inch, elemsize, elempack, opt.workspace_allocator); + conv3x3s1_winograd63_transform_input_pack4_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack4_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator); + } + { + conv3x3s1_winograd63_transform_output_pack4_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} + +static void conv3x3s1_winograd43_transform_kernel_pack4_lsx(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt) +{ + // winograd43 transform kernel + Mat kernel_tm(6 * 6, inch, outch); + + const float ktm[6][3] = { + {1.0f / 4, 0.0f, 0.0f}, + {-1.0f / 6, -1.0f / 6, -1.0f / 6}, + {-1.0f / 6, 1.0f / 6, -1.0f / 6}, + {1.0f / 24, 1.0f / 12, 1.0f / 6}, + {1.0f / 24, -1.0f / 12, 1.0f / 6}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = pb-pa-inch/pa-36-outch/pb + kernel_tm_pack4.create(inch / 4, 36, outch / 4, (size_t)4u * 4 * 4, 4 * 4); + + for (int q = 0; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm_pack4.channel(q / 4); + + for (int k = 0; k < 36; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = kernel_tm.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } 
+} + +static void conv3x3s1_winograd43_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_pack4_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack4_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_pack4_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} + +static void conv3x3s1_winograd23_transform_kernel_pack4_lsx(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch, const Option& opt) +{ + // winograd23 transform kernel + Mat kernel_tm(4 * 4, inch, outch); + + const float ktm[4][3] = { + {1.0f, 0.0f, 0.0f}, + {1.0f / 2, 1.0f / 2, 1.0f / 2}, + {1.0f / 2, -1.0f / 2, 1.0f / 2}, + {0.0f, 0.0f, 1.0f} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9; + float* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + // h + float tmp[4][3]; + for (int i = 0; i < 4; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 4; j++) + { + float* tmpp = &tmp[j][0]; + + for (int i = 0; i < 4; i++) + { + kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 16-inch-outch + // dst = pb-pa-inch/pa-16-outch/pb + kernel_tm_pack4.create(inch / 4, 16, outch / 4, (size_t)4u * 4 * 4, 4 * 4); + + for (int q = 0; q + 3 < outch; q += 4) + { + Mat g0 = kernel_tm_pack4.channel(q / 4); + + for (int k = 0; k < 16; k++) + { + float* g00 = g0.row(k); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = kernel_tm.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } +} + +static void conv3x3s1_winograd23_pack4_lsx(const Mat& 
bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& bias, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 2n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 1) / 2 * 2; + outh = (outh + 1) / 2 * 2; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 2; + int h_tiles = outh / 2; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 16, inch, elemsize, elempack, opt.workspace_allocator); + conv3x3s1_winograd23_transform_input_pack4_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack4_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator); + } + { + conv3x3s1_winograd23_transform_output_pack4_lsx(top_blob_tm, top_blob_bordered, bias, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_3x3_pack8to1_int8.h b/src/layer/loongarch/convolution_3x3_pack8to1_int8.h new file mode 100644 index 000000000000..3c4f97187533 --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_pack8to1_int8.h @@ -0,0 +1,177 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
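The function that follows applies the Winograd F(4,3) kernel transform to every 3x3 int8 kernel, producing a 6x6 tile of 16-bit values (effectively G·g·Gᵀ with the integer G given by ktm, stored transposed), and then interleaves the tiles so that eight input channels and four output channels share each row of kernel_tm_pack8to1. As a minimal scalar sketch of the per-kernel transform only (for orientation; the helper name is illustrative and not part of the patch):

static void winograd43_int8_kernel_transform_ref(const signed char* g, short* U)
{
    // integer F(4,3) transform matrix, identical to ktm in the patch below
    static const short G[6][3] = {
        {6, 0, 0},
        {-4, -4, -4},
        {-4, 4, -4},
        {1, 2, 4},
        {1, -2, 4},
        {0, 0, 6}
    };

    // first pass: dot each kernel row r (g is row-major 3x3) with every row of G
    short tmp[6][3];
    for (int i = 0; i < 6; i++)
    {
        for (int r = 0; r < 3; r++)
        {
            tmp[i][r] = G[i][0] * g[r * 3 + 0] + G[i][1] * g[r * 3 + 1] + G[i][2] * g[r * 3 + 2];
        }
    }

    // second pass: combine across rows; same indexing as kernel_tm0[j * 6 + i] below
    for (int j = 0; j < 6; j++)
    {
        for (int i = 0; i < 6; i++)
        {
            U[j * 6 + i] = tmp[j][0] * G[i][0] + tmp[j][1] * G[i][1] + tmp[j][2] * G[i][2];
        }
    }
}

The interleave step that follows the transform in the patch only reorders these 36-element tiles for packed access; it does not change their values.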
+ +static void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_lsx(const Mat& kernel, Mat& kernel_tm_pack8to1, int inch, int outch, const Option& opt) +{ + // winograd43 transform kernel + Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); + + const short ktm[6][3] = { + {6, 0, 0}, + {-4, -4, -4}, + {-4, 4, -4}, + {1, 2, 4}, + {1, -2, 4}, + {0, 0, 6} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; + short* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const signed char* k0 = kernel0; + const signed char* k1 = kernel0 + 3; + const signed char* k2 = kernel0 + 6; + + // h + short tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + short* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = 4b-8a-inch/8a-36-outch/4b + kernel_tm_pack8to1.create(8 * inch / 8, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4); + + int p = 0; + for (; p + 3 < outch; p += 4) + { + const Mat k0 = kernel_tm.channel(p); + const Mat k1 = kernel_tm.channel(p + 1); + const Mat k2 = kernel_tm.channel(p + 2); + const Mat k3 = kernel_tm.channel(p + 3); + + Mat g0 = kernel_tm_pack8to1.channel(p / 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + for (int q = 0; q + 7 < inch; q += 8) + { + for (int i = 0; i < 8; i++) + { + g00[0] = k0.row(q + i)[k]; + g00[1] = k1.row(q + i)[k]; + g00[2] = k2.row(q + i)[k]; + g00[3] = k3.row(q + i)[k]; + + g00 += 4; + } + } + } + } + for (; p < outch; p++) + { + const Mat k0 = kernel_tm.channel(p); + + Mat g0 = kernel_tm_pack8to1.channel(p / 4 + p % 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = g0.row(k); + + for (int q = 0; q + 7 < inch; q += 8) + { + for (int i = 0; i < 8; i++) + { + g00[0] = k0.row(q + i)[k]; + + g00 += 1; + } + } + } + } +} + +static void conv3x3s1_winograd43_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + // size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_pack8_int8_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack8to1_int8_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat 
top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_int8_lsx(top_blob_tm, top_blob_bordered, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_3x3_pack8to4_int8.h b/src/layer/loongarch/convolution_3x3_pack8to4_int8.h new file mode 100644 index 000000000000..bf328cee73f9 --- /dev/null +++ b/src/layer/loongarch/convolution_3x3_pack8to4_int8.h @@ -0,0 +1,161 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_lsx(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt) +{ + // winograd43 transform kernel + Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); + + const short ktm[6][3] = { + {6, 0, 0}, + {-4, -4, -4}, + {-4, 4, -4}, + {1, 2, 4}, + {1, -2, 4}, + {0, 0, 6} + }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + for (int q = 0; q < inch; q++) + { + const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; + short* kernel_tm0 = kernel_tm.channel(p).row(q); + + // transform kernel + const signed char* k0 = kernel0; + const signed char* k1 = kernel0 + 3; + const signed char* k2 = kernel0 + 6; + + // h + short tmp[6][3]; + for (int i = 0; i < 6; i++) + { + tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j = 0; j < 6; j++) + { + short* tmpp = &tmp[j][0]; + + for (int i = 0; i < 6; i++) + { + kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + // interleave + // src = 36-inch-outch + // dst = 4b-8a-inch/8a-36-outch/4b + kernel_tm_pack8.create(inch / 8, 36, outch / 4, (size_t)2u * 32, 32); + + int q = 0; + for (; q + 3 < outch; q += 4) + { + const Mat k0 = kernel_tm.channel(q); + const Mat k1 = kernel_tm.channel(q + 1); + const Mat k2 = kernel_tm.channel(q + 2); + const Mat k3 = kernel_tm.channel(q + 3); + + Mat kernel_tm = kernel_tm_pack8.channel(q / 4); + + for (int k = 0; k < 36; k++) + { + short* g00 = kernel_tm.row(k); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int i = 0; i < 8; i++) + { + const short* k00 = k0.row(p + i); + const short* k10 = k1.row(p + i); + const short* k20 = k2.row(p + i); + const short* k30 = k3.row(p + i); + + g00[0] = k00[k]; + g00[1] = k10[k]; + g00[2] = k20[k]; + g00[3] = k30[k]; + + g00 += 4; + } + } + } + } +} + +static void 
conv3x3s1_winograd43_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + // size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 4n+2 + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 3) / 4 * 4; + outh = (outh + 3) / 4 * 4; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tiles = outw / 4; + int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); + conv3x3s1_winograd43_transform_input_pack8_int8_lsx(bottom_blob_bordered, bottom_blob_tm, opt); + } + bottom_blob_bordered = Mat(); + // END transform input + + // BEGIN dot + Mat top_blob_tm; + convolution_winograd_dot_pack8to4_int8_lsx(bottom_blob_tm, outch, kernel_tm, top_blob_tm, opt); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + if (outw == top_blob.w && outh == top_blob.h) + { + top_blob_bordered = top_blob; + } + else + { + top_blob_bordered.create(outw, outh, outch, 4u * 4, 4, opt.workspace_allocator); + } + { + conv3x3s1_winograd43_transform_output_pack4_int8_lsx(top_blob_tm, top_blob_bordered, opt); + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); +} diff --git a/src/layer/loongarch/convolution_7x7_pack1to4.h b/src/layer/loongarch/convolution_7x7_pack1to4.h new file mode 100644 index 000000000000..f57923b53d00 --- /dev/null +++ b/src/layer/loongarch/convolution_7x7_pack1to4.h @@ -0,0 +1,652 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv7x7s2_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int tailstep = w - 2 * outw + w; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = bias ? 
(__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + out0.fill(_bias0); + + for (int q = 0; q < inch; q++) + { + float* outptr0 = out0; + + const Mat img0 = bottom_blob.channel(q); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + const float* r3 = img0.row(3); + const float* r4 = img0.row(4); + const float* r5 = img0.row(5); + const float* r6 = img0.row(6); + + const float* kptr = kernel.channel(p).row(q); + + int i = 0; + + for (; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + __m128 _sum1 = (__m128)__lsx_vld(outptr0 + 4, 0); + __m128 _sum2 = (__m128)__lsx_vld(outptr0 + 4 * 2, 0); + __m128 _sum3 = (__m128)__lsx_vld(outptr0 + 4 * 3, 0); + + __m128 _k00 = (__m128)__lsx_vld(kptr, 0); + __m128 _k01 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k05 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k06 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + __m128i _r0nn = __lsx_vld(r0 + 8, 0); + + __m128 _r00 = (__m128)__lsx_vreplvei_w(_r0, 0); + __m128 _r01 = (__m128)__lsx_vreplvei_w(_r0, 1); + __m128 _r02 = (__m128)__lsx_vreplvei_w(_r0, 2); + __m128 _r03 = (__m128)__lsx_vreplvei_w(_r0, 3); + __m128 _r04 = (__m128)__lsx_vreplvei_w(_r0n, 0); + __m128 _r05 = (__m128)__lsx_vreplvei_w(_r0n, 1); + __m128 _r06 = (__m128)__lsx_vreplvei_w(_r0n, 2); + __m128 _r07 = (__m128)__lsx_vreplvei_w(_r0n, 3); + __m128 _r08 = (__m128)__lsx_vreplvei_w(_r0nn, 0); + __m128 _r09 = (__m128)__lsx_vreplvei_w(_r0nn, 1); + __m128 _r0a = (__m128)__lsx_vreplvei_w(_r0nn, 2); + __m128 _r0b = (__m128)__lsx_vreplvei_w(_r0nn, 3); + __m128 _r0c = __lsx_vreplfr2vr_s(r0[12]); + + _sum0 = __lsx_vfmadd_s(_k00, _r00, _sum0); + _sum1 = __lsx_vfmadd_s(_k00, _r02, _sum1); + _sum2 = __lsx_vfmadd_s(_k00, _r04, _sum2); + _sum3 = __lsx_vfmadd_s(_k00, _r06, _sum3); + _sum0 = __lsx_vfmadd_s(_k01, _r01, _sum0); + _sum1 = __lsx_vfmadd_s(_k01, _r03, _sum1); + _sum2 = __lsx_vfmadd_s(_k01, _r05, _sum2); + _sum3 = __lsx_vfmadd_s(_k01, _r07, _sum3); + _sum0 = __lsx_vfmadd_s(_k02, _r02, _sum0); + _sum1 = __lsx_vfmadd_s(_k02, _r04, _sum1); + _sum2 = __lsx_vfmadd_s(_k02, _r06, _sum2); + _sum3 = __lsx_vfmadd_s(_k02, _r08, _sum3); + _sum0 = __lsx_vfmadd_s(_k03, _r03, _sum0); + _sum1 = __lsx_vfmadd_s(_k03, _r05, _sum1); + _sum2 = __lsx_vfmadd_s(_k03, _r07, _sum2); + _sum3 = __lsx_vfmadd_s(_k03, _r09, _sum3); + _sum0 = __lsx_vfmadd_s(_k04, _r04, _sum0); + _sum1 = __lsx_vfmadd_s(_k04, _r06, _sum1); + _sum2 = __lsx_vfmadd_s(_k04, _r08, _sum2); + _sum3 = __lsx_vfmadd_s(_k04, _r0a, _sum3); + _sum0 = __lsx_vfmadd_s(_k05, _r05, _sum0); + _sum1 = __lsx_vfmadd_s(_k05, _r07, _sum1); + _sum2 = __lsx_vfmadd_s(_k05, _r09, _sum2); + _sum3 = __lsx_vfmadd_s(_k05, _r0b, _sum3); + _sum0 = __lsx_vfmadd_s(_k06, _r06, _sum0); + _sum1 = __lsx_vfmadd_s(_k06, _r08, _sum1); + _sum2 = __lsx_vfmadd_s(_k06, _r0a, _sum2); + _sum3 = __lsx_vfmadd_s(_k06, _r0c, _sum3); + + __m128 _k10 = (__m128)__lsx_vld(kptr, 0); + __m128 _k11 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k14 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k15 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k16 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + 
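+ // rows 1..6 of the 7x7 kernel follow the same pattern as row 0: seven packed kernel vectors FMA'd against stride-2 taps of the corresponding input row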
kptr += 4 * 7; + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + __m128i _r1nn = __lsx_vld(r1 + 8, 0); + + __m128 _r10 = (__m128)__lsx_vreplvei_w(_r1, 0); + __m128 _r11 = (__m128)__lsx_vreplvei_w(_r1, 1); + __m128 _r12 = (__m128)__lsx_vreplvei_w(_r1, 2); + __m128 _r13 = (__m128)__lsx_vreplvei_w(_r1, 3); + __m128 _r14 = (__m128)__lsx_vreplvei_w(_r1n, 0); + __m128 _r15 = (__m128)__lsx_vreplvei_w(_r1n, 1); + __m128 _r16 = (__m128)__lsx_vreplvei_w(_r1n, 2); + __m128 _r17 = (__m128)__lsx_vreplvei_w(_r1n, 3); + __m128 _r18 = (__m128)__lsx_vreplvei_w(_r1nn, 0); + __m128 _r19 = (__m128)__lsx_vreplvei_w(_r1nn, 1); + __m128 _r1a = (__m128)__lsx_vreplvei_w(_r1nn, 2); + __m128 _r1b = (__m128)__lsx_vreplvei_w(_r1nn, 3); + __m128 _r1c = __lsx_vreplfr2vr_s(r1[12]); + + _sum0 = __lsx_vfmadd_s(_k10, _r10, _sum0); + _sum1 = __lsx_vfmadd_s(_k10, _r12, _sum1); + _sum2 = __lsx_vfmadd_s(_k10, _r14, _sum2); + _sum3 = __lsx_vfmadd_s(_k10, _r16, _sum3); + _sum0 = __lsx_vfmadd_s(_k11, _r11, _sum0); + _sum1 = __lsx_vfmadd_s(_k11, _r13, _sum1); + _sum2 = __lsx_vfmadd_s(_k11, _r15, _sum2); + _sum3 = __lsx_vfmadd_s(_k11, _r17, _sum3); + _sum0 = __lsx_vfmadd_s(_k12, _r12, _sum0); + _sum1 = __lsx_vfmadd_s(_k12, _r14, _sum1); + _sum2 = __lsx_vfmadd_s(_k12, _r16, _sum2); + _sum3 = __lsx_vfmadd_s(_k12, _r18, _sum3); + _sum0 = __lsx_vfmadd_s(_k13, _r13, _sum0); + _sum1 = __lsx_vfmadd_s(_k13, _r15, _sum1); + _sum2 = __lsx_vfmadd_s(_k13, _r17, _sum2); + _sum3 = __lsx_vfmadd_s(_k13, _r19, _sum3); + _sum0 = __lsx_vfmadd_s(_k14, _r14, _sum0); + _sum1 = __lsx_vfmadd_s(_k14, _r16, _sum1); + _sum2 = __lsx_vfmadd_s(_k14, _r18, _sum2); + _sum3 = __lsx_vfmadd_s(_k14, _r1a, _sum3); + _sum0 = __lsx_vfmadd_s(_k15, _r15, _sum0); + _sum1 = __lsx_vfmadd_s(_k15, _r17, _sum1); + _sum2 = __lsx_vfmadd_s(_k15, _r19, _sum2); + _sum3 = __lsx_vfmadd_s(_k15, _r1b, _sum3); + _sum0 = __lsx_vfmadd_s(_k16, _r16, _sum0); + _sum1 = __lsx_vfmadd_s(_k16, _r18, _sum1); + _sum2 = __lsx_vfmadd_s(_k16, _r1a, _sum2); + _sum3 = __lsx_vfmadd_s(_k16, _r1c, _sum3); + + __m128 _k20 = (__m128)__lsx_vld(kptr, 0); + __m128 _k21 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k25 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k26 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + __m128i _r2nn = __lsx_vld(r2 + 8, 0); + + __m128 _r20 = (__m128)__lsx_vreplvei_w(_r2, 0); + __m128 _r21 = (__m128)__lsx_vreplvei_w(_r2, 1); + __m128 _r22 = (__m128)__lsx_vreplvei_w(_r2, 2); + __m128 _r23 = (__m128)__lsx_vreplvei_w(_r2, 3); + __m128 _r24 = (__m128)__lsx_vreplvei_w(_r2n, 0); + __m128 _r25 = (__m128)__lsx_vreplvei_w(_r2n, 1); + __m128 _r26 = (__m128)__lsx_vreplvei_w(_r2n, 2); + __m128 _r27 = (__m128)__lsx_vreplvei_w(_r2n, 3); + __m128 _r28 = (__m128)__lsx_vreplvei_w(_r2nn, 0); + __m128 _r29 = (__m128)__lsx_vreplvei_w(_r2nn, 1); + __m128 _r2a = (__m128)__lsx_vreplvei_w(_r2nn, 2); + __m128 _r2b = (__m128)__lsx_vreplvei_w(_r2nn, 3); + __m128 _r2c = __lsx_vreplfr2vr_s(r2[12]); + + _sum0 = __lsx_vfmadd_s(_k20, _r20, _sum0); + _sum1 = __lsx_vfmadd_s(_k20, _r22, _sum1); + _sum2 = __lsx_vfmadd_s(_k20, _r24, _sum2); + _sum3 = __lsx_vfmadd_s(_k20, _r26, _sum3); + _sum0 = __lsx_vfmadd_s(_k21, _r21, _sum0); + _sum1 = __lsx_vfmadd_s(_k21, _r23, _sum1); + _sum2 = __lsx_vfmadd_s(_k21, _r25, _sum2); + _sum3 = __lsx_vfmadd_s(_k21, _r27, 
_sum3); + _sum0 = __lsx_vfmadd_s(_k22, _r22, _sum0); + _sum1 = __lsx_vfmadd_s(_k22, _r24, _sum1); + _sum2 = __lsx_vfmadd_s(_k22, _r26, _sum2); + _sum3 = __lsx_vfmadd_s(_k22, _r28, _sum3); + _sum0 = __lsx_vfmadd_s(_k23, _r23, _sum0); + _sum1 = __lsx_vfmadd_s(_k23, _r25, _sum1); + _sum2 = __lsx_vfmadd_s(_k23, _r27, _sum2); + _sum3 = __lsx_vfmadd_s(_k23, _r29, _sum3); + _sum0 = __lsx_vfmadd_s(_k24, _r24, _sum0); + _sum1 = __lsx_vfmadd_s(_k24, _r26, _sum1); + _sum2 = __lsx_vfmadd_s(_k24, _r28, _sum2); + _sum3 = __lsx_vfmadd_s(_k24, _r2a, _sum3); + _sum0 = __lsx_vfmadd_s(_k25, _r25, _sum0); + _sum1 = __lsx_vfmadd_s(_k25, _r27, _sum1); + _sum2 = __lsx_vfmadd_s(_k25, _r29, _sum2); + _sum3 = __lsx_vfmadd_s(_k25, _r2b, _sum3); + _sum0 = __lsx_vfmadd_s(_k26, _r26, _sum0); + _sum1 = __lsx_vfmadd_s(_k26, _r28, _sum1); + _sum2 = __lsx_vfmadd_s(_k26, _r2a, _sum2); + _sum3 = __lsx_vfmadd_s(_k26, _r2c, _sum3); + + __m128 _k30 = (__m128)__lsx_vld(kptr, 0); + __m128 _k31 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k33 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k35 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k36 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r3 = __lsx_vld(r3, 0); + __m128i _r3n = __lsx_vld(r3 + 4, 0); + __m128i _r3nn = __lsx_vld(r3 + 8, 0); + + __m128 _r30 = (__m128)__lsx_vreplvei_w(_r3, 0); + __m128 _r31 = (__m128)__lsx_vreplvei_w(_r3, 1); + __m128 _r32 = (__m128)__lsx_vreplvei_w(_r3, 2); + __m128 _r33 = (__m128)__lsx_vreplvei_w(_r3, 3); + __m128 _r34 = (__m128)__lsx_vreplvei_w(_r3n, 0); + __m128 _r35 = (__m128)__lsx_vreplvei_w(_r3n, 1); + __m128 _r36 = (__m128)__lsx_vreplvei_w(_r3n, 2); + __m128 _r37 = (__m128)__lsx_vreplvei_w(_r3n, 3); + __m128 _r38 = (__m128)__lsx_vreplvei_w(_r3nn, 0); + __m128 _r39 = (__m128)__lsx_vreplvei_w(_r3nn, 1); + __m128 _r3a = (__m128)__lsx_vreplvei_w(_r3nn, 2); + __m128 _r3b = (__m128)__lsx_vreplvei_w(_r3nn, 3); + __m128 _r3c = __lsx_vreplfr2vr_s(r3[12]); + + _sum0 = __lsx_vfmadd_s(_k30, _r30, _sum0); + _sum1 = __lsx_vfmadd_s(_k30, _r32, _sum1); + _sum2 = __lsx_vfmadd_s(_k30, _r34, _sum2); + _sum3 = __lsx_vfmadd_s(_k30, _r36, _sum3); + _sum0 = __lsx_vfmadd_s(_k31, _r31, _sum0); + _sum1 = __lsx_vfmadd_s(_k31, _r33, _sum1); + _sum2 = __lsx_vfmadd_s(_k31, _r35, _sum2); + _sum3 = __lsx_vfmadd_s(_k31, _r37, _sum3); + _sum0 = __lsx_vfmadd_s(_k32, _r32, _sum0); + _sum1 = __lsx_vfmadd_s(_k32, _r34, _sum1); + _sum2 = __lsx_vfmadd_s(_k32, _r36, _sum2); + _sum3 = __lsx_vfmadd_s(_k32, _r38, _sum3); + _sum0 = __lsx_vfmadd_s(_k33, _r33, _sum0); + _sum1 = __lsx_vfmadd_s(_k33, _r35, _sum1); + _sum2 = __lsx_vfmadd_s(_k33, _r37, _sum2); + _sum3 = __lsx_vfmadd_s(_k33, _r39, _sum3); + _sum0 = __lsx_vfmadd_s(_k34, _r34, _sum0); + _sum1 = __lsx_vfmadd_s(_k34, _r36, _sum1); + _sum2 = __lsx_vfmadd_s(_k34, _r38, _sum2); + _sum3 = __lsx_vfmadd_s(_k34, _r3a, _sum3); + _sum0 = __lsx_vfmadd_s(_k35, _r35, _sum0); + _sum1 = __lsx_vfmadd_s(_k35, _r37, _sum1); + _sum2 = __lsx_vfmadd_s(_k35, _r39, _sum2); + _sum3 = __lsx_vfmadd_s(_k35, _r3b, _sum3); + _sum0 = __lsx_vfmadd_s(_k36, _r36, _sum0); + _sum1 = __lsx_vfmadd_s(_k36, _r38, _sum1); + _sum2 = __lsx_vfmadd_s(_k36, _r3a, _sum2); + _sum3 = __lsx_vfmadd_s(_k36, _r3c, _sum3); + + __m128 _k40 = (__m128)__lsx_vld(kptr, 0); + __m128 _k41 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(kptr + 
4 * 4, 0); + __m128 _k45 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k46 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r4 = __lsx_vld(r4, 0); + __m128i _r4n = __lsx_vld(r4 + 4, 0); + __m128i _r4nn = __lsx_vld(r4 + 8, 0); + + __m128 _r40 = (__m128)__lsx_vreplvei_w(_r4, 0); + __m128 _r41 = (__m128)__lsx_vreplvei_w(_r4, 1); + __m128 _r42 = (__m128)__lsx_vreplvei_w(_r4, 2); + __m128 _r43 = (__m128)__lsx_vreplvei_w(_r4, 3); + __m128 _r44 = (__m128)__lsx_vreplvei_w(_r4n, 0); + __m128 _r45 = (__m128)__lsx_vreplvei_w(_r4n, 1); + __m128 _r46 = (__m128)__lsx_vreplvei_w(_r4n, 2); + __m128 _r47 = (__m128)__lsx_vreplvei_w(_r4n, 3); + __m128 _r48 = (__m128)__lsx_vreplvei_w(_r4nn, 0); + __m128 _r49 = (__m128)__lsx_vreplvei_w(_r4nn, 1); + __m128 _r4a = (__m128)__lsx_vreplvei_w(_r4nn, 2); + __m128 _r4b = (__m128)__lsx_vreplvei_w(_r4nn, 3); + __m128 _r4c = __lsx_vreplfr2vr_s(r4[12]); + + _sum0 = __lsx_vfmadd_s(_k40, _r40, _sum0); + _sum1 = __lsx_vfmadd_s(_k40, _r42, _sum1); + _sum2 = __lsx_vfmadd_s(_k40, _r44, _sum2); + _sum3 = __lsx_vfmadd_s(_k40, _r46, _sum3); + _sum0 = __lsx_vfmadd_s(_k41, _r41, _sum0); + _sum1 = __lsx_vfmadd_s(_k41, _r43, _sum1); + _sum2 = __lsx_vfmadd_s(_k41, _r45, _sum2); + _sum3 = __lsx_vfmadd_s(_k41, _r47, _sum3); + _sum0 = __lsx_vfmadd_s(_k42, _r42, _sum0); + _sum1 = __lsx_vfmadd_s(_k42, _r44, _sum1); + _sum2 = __lsx_vfmadd_s(_k42, _r46, _sum2); + _sum3 = __lsx_vfmadd_s(_k42, _r48, _sum3); + _sum0 = __lsx_vfmadd_s(_k43, _r43, _sum0); + _sum1 = __lsx_vfmadd_s(_k43, _r45, _sum1); + _sum2 = __lsx_vfmadd_s(_k43, _r47, _sum2); + _sum3 = __lsx_vfmadd_s(_k43, _r49, _sum3); + _sum0 = __lsx_vfmadd_s(_k44, _r44, _sum0); + _sum1 = __lsx_vfmadd_s(_k44, _r46, _sum1); + _sum2 = __lsx_vfmadd_s(_k44, _r48, _sum2); + _sum3 = __lsx_vfmadd_s(_k44, _r4a, _sum3); + _sum0 = __lsx_vfmadd_s(_k45, _r45, _sum0); + _sum1 = __lsx_vfmadd_s(_k45, _r47, _sum1); + _sum2 = __lsx_vfmadd_s(_k45, _r49, _sum2); + _sum3 = __lsx_vfmadd_s(_k45, _r4b, _sum3); + _sum0 = __lsx_vfmadd_s(_k46, _r46, _sum0); + _sum1 = __lsx_vfmadd_s(_k46, _r48, _sum1); + _sum2 = __lsx_vfmadd_s(_k46, _r4a, _sum2); + _sum3 = __lsx_vfmadd_s(_k46, _r4c, _sum3); + + __m128 _k50 = (__m128)__lsx_vld(kptr, 0); + __m128 _k51 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k52 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k53 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k54 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k55 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k56 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r5 = __lsx_vld(r5, 0); + __m128i _r5n = __lsx_vld(r5 + 4, 0); + __m128i _r5nn = __lsx_vld(r5 + 8, 0); + + __m128 _r50 = (__m128)__lsx_vreplvei_w(_r5, 0); + __m128 _r51 = (__m128)__lsx_vreplvei_w(_r5, 1); + __m128 _r52 = (__m128)__lsx_vreplvei_w(_r5, 2); + __m128 _r53 = (__m128)__lsx_vreplvei_w(_r5, 3); + __m128 _r54 = (__m128)__lsx_vreplvei_w(_r5n, 0); + __m128 _r55 = (__m128)__lsx_vreplvei_w(_r5n, 1); + __m128 _r56 = (__m128)__lsx_vreplvei_w(_r5n, 2); + __m128 _r57 = (__m128)__lsx_vreplvei_w(_r5n, 3); + __m128 _r58 = (__m128)__lsx_vreplvei_w(_r5nn, 0); + __m128 _r59 = (__m128)__lsx_vreplvei_w(_r5nn, 1); + __m128 _r5a = (__m128)__lsx_vreplvei_w(_r5nn, 2); + __m128 _r5b = (__m128)__lsx_vreplvei_w(_r5nn, 3); + __m128 _r5c = __lsx_vreplfr2vr_s(r5[12]); + + _sum0 = __lsx_vfmadd_s(_k50, _r50, _sum0); + _sum1 = __lsx_vfmadd_s(_k50, _r52, _sum1); + _sum2 = __lsx_vfmadd_s(_k50, _r54, _sum2); + _sum3 = __lsx_vfmadd_s(_k50, _r56, _sum3); + _sum0 = __lsx_vfmadd_s(_k51, _r51, _sum0); + _sum1 = 
__lsx_vfmadd_s(_k51, _r53, _sum1); + _sum2 = __lsx_vfmadd_s(_k51, _r55, _sum2); + _sum3 = __lsx_vfmadd_s(_k51, _r57, _sum3); + _sum0 = __lsx_vfmadd_s(_k52, _r52, _sum0); + _sum1 = __lsx_vfmadd_s(_k52, _r54, _sum1); + _sum2 = __lsx_vfmadd_s(_k52, _r56, _sum2); + _sum3 = __lsx_vfmadd_s(_k52, _r58, _sum3); + _sum0 = __lsx_vfmadd_s(_k53, _r53, _sum0); + _sum1 = __lsx_vfmadd_s(_k53, _r55, _sum1); + _sum2 = __lsx_vfmadd_s(_k53, _r57, _sum2); + _sum3 = __lsx_vfmadd_s(_k53, _r59, _sum3); + _sum0 = __lsx_vfmadd_s(_k54, _r54, _sum0); + _sum1 = __lsx_vfmadd_s(_k54, _r56, _sum1); + _sum2 = __lsx_vfmadd_s(_k54, _r58, _sum2); + _sum3 = __lsx_vfmadd_s(_k54, _r5a, _sum3); + _sum0 = __lsx_vfmadd_s(_k55, _r55, _sum0); + _sum1 = __lsx_vfmadd_s(_k55, _r57, _sum1); + _sum2 = __lsx_vfmadd_s(_k55, _r59, _sum2); + _sum3 = __lsx_vfmadd_s(_k55, _r5b, _sum3); + _sum0 = __lsx_vfmadd_s(_k56, _r56, _sum0); + _sum1 = __lsx_vfmadd_s(_k56, _r58, _sum1); + _sum2 = __lsx_vfmadd_s(_k56, _r5a, _sum2); + _sum3 = __lsx_vfmadd_s(_k56, _r5c, _sum3); + + __m128 _k60 = (__m128)__lsx_vld(kptr, 0); + __m128 _k61 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k62 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k63 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k64 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k65 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k66 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr -= 4 * 42; + + __m128i _r6 = __lsx_vld(r6, 0); + __m128i _r6n = __lsx_vld(r6 + 4, 0); + __m128i _r6nn = __lsx_vld(r6 + 8, 0); + + __m128 _r60 = (__m128)__lsx_vreplvei_w(_r6, 0); + __m128 _r61 = (__m128)__lsx_vreplvei_w(_r6, 1); + __m128 _r62 = (__m128)__lsx_vreplvei_w(_r6, 2); + __m128 _r63 = (__m128)__lsx_vreplvei_w(_r6, 3); + __m128 _r64 = (__m128)__lsx_vreplvei_w(_r6n, 0); + __m128 _r65 = (__m128)__lsx_vreplvei_w(_r6n, 1); + __m128 _r66 = (__m128)__lsx_vreplvei_w(_r6n, 2); + __m128 _r67 = (__m128)__lsx_vreplvei_w(_r6n, 3); + __m128 _r68 = (__m128)__lsx_vreplvei_w(_r6nn, 0); + __m128 _r69 = (__m128)__lsx_vreplvei_w(_r6nn, 1); + __m128 _r6a = (__m128)__lsx_vreplvei_w(_r6nn, 2); + __m128 _r6b = (__m128)__lsx_vreplvei_w(_r6nn, 3); + __m128 _r6c = __lsx_vreplfr2vr_s(r6[12]); + + _sum0 = __lsx_vfmadd_s(_k60, _r60, _sum0); + _sum1 = __lsx_vfmadd_s(_k60, _r62, _sum1); + _sum2 = __lsx_vfmadd_s(_k60, _r64, _sum2); + _sum3 = __lsx_vfmadd_s(_k60, _r66, _sum3); + _sum0 = __lsx_vfmadd_s(_k61, _r61, _sum0); + _sum1 = __lsx_vfmadd_s(_k61, _r63, _sum1); + _sum2 = __lsx_vfmadd_s(_k61, _r65, _sum2); + _sum3 = __lsx_vfmadd_s(_k61, _r67, _sum3); + _sum0 = __lsx_vfmadd_s(_k62, _r62, _sum0); + _sum1 = __lsx_vfmadd_s(_k62, _r64, _sum1); + _sum2 = __lsx_vfmadd_s(_k62, _r66, _sum2); + _sum3 = __lsx_vfmadd_s(_k62, _r68, _sum3); + _sum0 = __lsx_vfmadd_s(_k63, _r63, _sum0); + _sum1 = __lsx_vfmadd_s(_k63, _r65, _sum1); + _sum2 = __lsx_vfmadd_s(_k63, _r67, _sum2); + _sum3 = __lsx_vfmadd_s(_k63, _r69, _sum3); + _sum0 = __lsx_vfmadd_s(_k64, _r64, _sum0); + _sum1 = __lsx_vfmadd_s(_k64, _r66, _sum1); + _sum2 = __lsx_vfmadd_s(_k64, _r68, _sum2); + _sum3 = __lsx_vfmadd_s(_k64, _r6a, _sum3); + _sum0 = __lsx_vfmadd_s(_k65, _r65, _sum0); + _sum1 = __lsx_vfmadd_s(_k65, _r67, _sum1); + _sum2 = __lsx_vfmadd_s(_k65, _r69, _sum2); + _sum3 = __lsx_vfmadd_s(_k65, _r6b, _sum3); + _sum0 = __lsx_vfmadd_s(_k66, _r66, _sum0); + _sum1 = __lsx_vfmadd_s(_k66, _r68, _sum1); + _sum2 = __lsx_vfmadd_s(_k66, _r6a, _sum2); + _sum3 = __lsx_vfmadd_s(_k66, _r6c, _sum3); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + 
__lsx_vst(_sum3, outptr0 + 4 * 3, 0); + + outptr0 += 4 * 4; + + r0 += 8; + r1 += 8; + r2 += 8; + r3 += 8; + r4 += 8; + r5 += 8; + r6 += 8; + } + for (; j < outw; j++) + { + __m128 _sum0 = (__m128)__lsx_vld(outptr0, 0); + + __m128 _k00 = (__m128)__lsx_vld(kptr, 0); + __m128 _k01 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k05 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k06 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r0n = __lsx_vld(r0 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k00, (__m128)__lsx_vreplvei_w(_r0, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k01, (__m128)__lsx_vreplvei_w(_r0, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k02, (__m128)__lsx_vreplvei_w(_r0, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k03, (__m128)__lsx_vreplvei_w(_r0, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k04, (__m128)__lsx_vreplvei_w(_r0n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k05, (__m128)__lsx_vreplvei_w(_r0n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k06, (__m128)__lsx_vreplvei_w(_r0n, 2), _sum0); + + __m128 _k10 = (__m128)__lsx_vld(kptr, 0); + __m128 _k11 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k14 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k15 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k16 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r1n = __lsx_vld(r1 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k10, (__m128)__lsx_vreplvei_w(_r1, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k11, (__m128)__lsx_vreplvei_w(_r1, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k12, (__m128)__lsx_vreplvei_w(_r1, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k13, (__m128)__lsx_vreplvei_w(_r1, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k14, (__m128)__lsx_vreplvei_w(_r1n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k15, (__m128)__lsx_vreplvei_w(_r1n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k16, (__m128)__lsx_vreplvei_w(_r1n, 2), _sum0); + + __m128 _k20 = (__m128)__lsx_vld(kptr, 0); + __m128 _k21 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k25 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k26 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r2n = __lsx_vld(r2 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k20, (__m128)__lsx_vreplvei_w(_r2, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k21, (__m128)__lsx_vreplvei_w(_r2, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k22, (__m128)__lsx_vreplvei_w(_r2, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k23, (__m128)__lsx_vreplvei_w(_r2, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k24, (__m128)__lsx_vreplvei_w(_r2n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k25, (__m128)__lsx_vreplvei_w(_r2n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k26, (__m128)__lsx_vreplvei_w(_r2n, 2), _sum0); + + __m128 _k30 = (__m128)__lsx_vld(kptr, 0); + __m128 _k31 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k33 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k35 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k36 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r3 = __lsx_vld(r3, 0); + __m128i _r3n = __lsx_vld(r3 + 4, 
0); + + _sum0 = __lsx_vfmadd_s(_k30, (__m128)__lsx_vreplvei_w(_r3, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k31, (__m128)__lsx_vreplvei_w(_r3, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k32, (__m128)__lsx_vreplvei_w(_r3, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k33, (__m128)__lsx_vreplvei_w(_r3, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k34, (__m128)__lsx_vreplvei_w(_r3n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k35, (__m128)__lsx_vreplvei_w(_r3n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k36, (__m128)__lsx_vreplvei_w(_r3n, 2), _sum0); + + __m128 _k40 = (__m128)__lsx_vld(kptr, 0); + __m128 _k41 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k45 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k46 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r4 = __lsx_vld(r4, 0); + __m128i _r4n = __lsx_vld(r4 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k40, (__m128)__lsx_vreplvei_w(_r4, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k41, (__m128)__lsx_vreplvei_w(_r4, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k42, (__m128)__lsx_vreplvei_w(_r4, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k43, (__m128)__lsx_vreplvei_w(_r4, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k44, (__m128)__lsx_vreplvei_w(_r4n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k45, (__m128)__lsx_vreplvei_w(_r4n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k46, (__m128)__lsx_vreplvei_w(_r4n, 2), _sum0); + + __m128 _k50 = (__m128)__lsx_vld(kptr, 0); + __m128 _k51 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k52 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k53 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k54 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k55 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k56 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr += 4 * 7; + + __m128i _r5 = __lsx_vld(r5, 0); + __m128i _r5n = __lsx_vld(r5 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k50, (__m128)__lsx_vreplvei_w(_r5, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k51, (__m128)__lsx_vreplvei_w(_r5, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k52, (__m128)__lsx_vreplvei_w(_r5, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k53, (__m128)__lsx_vreplvei_w(_r5, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k54, (__m128)__lsx_vreplvei_w(_r5n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k55, (__m128)__lsx_vreplvei_w(_r5n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k56, (__m128)__lsx_vreplvei_w(_r5n, 2), _sum0); + + __m128 _k60 = (__m128)__lsx_vld(kptr, 0); + __m128 _k61 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _k62 = (__m128)__lsx_vld(kptr + 4 * 2, 0); + __m128 _k63 = (__m128)__lsx_vld(kptr + 4 * 3, 0); + __m128 _k64 = (__m128)__lsx_vld(kptr + 4 * 4, 0); + __m128 _k65 = (__m128)__lsx_vld(kptr + 4 * 5, 0); + __m128 _k66 = (__m128)__lsx_vld(kptr + 4 * 6, 0); + + kptr -= 4 * 42; + + __m128i _r6 = __lsx_vld(r6, 0); + __m128i _r6n = __lsx_vld(r6 + 4, 0); + + _sum0 = __lsx_vfmadd_s(_k60, (__m128)__lsx_vreplvei_w(_r6, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k61, (__m128)__lsx_vreplvei_w(_r6, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k62, (__m128)__lsx_vreplvei_w(_r6, 2), _sum0); + _sum0 = __lsx_vfmadd_s(_k63, (__m128)__lsx_vreplvei_w(_r6, 3), _sum0); + _sum0 = __lsx_vfmadd_s(_k64, (__m128)__lsx_vreplvei_w(_r6n, 0), _sum0); + _sum0 = __lsx_vfmadd_s(_k65, (__m128)__lsx_vreplvei_w(_r6n, 1), _sum0); + _sum0 = __lsx_vfmadd_s(_k66, (__m128)__lsx_vreplvei_w(_r6n, 2), _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 2; + r1 += 2; + r2 += 2; + r3 += 2; + r4 += 2; + r5 += 2; + r6 += 2; + } + + r0 
+= tailstep; + r1 += tailstep; + r2 += tailstep; + r3 += tailstep; + r4 += tailstep; + r5 += tailstep; + r6 += tailstep; + } + } + } +} diff --git a/src/layer/loongarch/convolution_int8.h b/src/layer/loongarch/convolution_int8.h new file mode 100644 index 000000000000..22c7a8ccbe6b --- /dev/null +++ b/src/layer/loongarch/convolution_int8.h @@ -0,0 +1,82 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int sum = 0; + + // const signed char* kptr = weight_data_int8.channel(p); + const signed char* kptr = (const signed char*)weight_data_int8 + maxk * channels * p; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + signed char val = sptr[space_ofs[k]]; + signed char w = kptr[k]; + sum += val * w; + } + + kptr += maxk; + } + + outptr[j] = sum; + } + + outptr += outw; + } + } +} diff --git a/src/layer/loongarch/convolution_loongarch.cpp b/src/layer/loongarch/convolution_loongarch.cpp new file mode 100644 index 000000000000..31719b3de92b --- /dev/null +++ b/src/layer/loongarch/convolution_loongarch.cpp @@ -0,0 +1,975 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
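// Illustrative sketch (not part of the upstream patch): with 128-bit LSX registers holding four
// fp32 lanes, the elempack choices made later in this file ("num_input % 4 == 0 ? 4 : 1") reduce
// to a rule like the following; the helper name is illustrative only.
static inline int choose_fp32_elempack(int channels, bool use_packing_layout)
{
    // pack channels in groups of 4 when divisible, otherwise stay at pack1
    return (use_packing_layout && channels % 4 == 0) ? 4 : 1;
}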
+ +#include "convolution_loongarch.h" + +#include "benchmark.h" +#include "cpu.h" +#include "layer_type.h" + +#if __loongarch_sx +#include <lsxintrin.h> +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +#include "cpu.h" + +namespace ncnn { + +#include "convolution_sgemm.h" +#include "convolution_winograd_transform.h" +#include "convolution_winograd_dot.h" +#include "convolution_1x1.h" +#include "convolution_3x3.h" + +#if NCNN_INT8 +#include "convolution_sgemm_int8.h" +#include "convolution_winograd_transform_int8.h" +#include "convolution_winograd_dot_int8.h" +#include "convolution_1x1_int8.h" +#include "convolution_3x3_int8.h" +#include "convolution_int8.h" +#endif // NCNN_INT8 + +#if __loongarch_sx +#include "convolution_pack4.h" +#include "convolution_pack1to4.h" +#include "convolution_pack4to1.h" + +#include "convolution_sgemm_pack4.h" +#include "convolution_sgemm_pack4to1.h" +#include "convolution_winograd_transform_pack4.h" +#include "convolution_winograd_dot_pack4.h" +#include "convolution_1x1_pack4.h" +#include "convolution_1x1_pack4to1.h" +#include "convolution_3x3_pack4.h" +#include "convolution_3x3_pack1to4.h" +#include "convolution_7x7_pack1to4.h" + +#if NCNN_INT8 +#include "convolution_pack8to4_int8.h" +#include "convolution_pack1to4_int8.h" +#include "convolution_pack8to1_int8.h" +#include "convolution_sgemm_pack8to4_int8.h" +#include "convolution_sgemm_pack1to4_int8.h" +#include "convolution_sgemm_pack8to1_int8.h" +#include "convolution_winograd_transform_pack4_int8.h" +#include "convolution_winograd_transform_pack8_int8.h" +#include "convolution_winograd_dot_pack8to4_int8.h" +#include "convolution_winograd_dot_pack8to1_int8.h" +#include "convolution_1x1_pack8to4_int8.h" +#include "convolution_1x1_pack1to4_int8.h" +#include "convolution_1x1_pack8to1_int8.h" +#include "convolution_3x3_pack8to4_int8.h" +#include "convolution_3x3_pack8to1_int8.h" +#endif // NCNN_INT8 +#endif // __loongarch_sx + +Convolution_loongarch::Convolution_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx + + activation = 0; +} + +static void convolution_transform_kernel_packed_lsx(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) +{ + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pb-pa-kw-kh-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + float* g00 = weight_data_tm.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < elempack; i++) + { + for (int j = 0; j < out_elempack; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + } +} + +int Convolution_loongarch::create_pipeline(const Option& opt) +{ + if (dynamic_weight) + return 0; + + activation = create_activation_layer(activation_type, activation_params, opt); + +#if NCNN_INT8 + if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + { + return create_pipeline_int8_loongarch(opt); + } +#endif + + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + + int elempack 
= 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif + +#if __loongarch_sx + // pack4 + if (elempack == 4 && out_elempack == 4) + { + if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd63_convolution && num_input >= 8 && num_output >= 8 && num_input <= 64 && num_output <= 64) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd63_transform_kernel_pack4_lsx(weight_data, weight_winograd63_data, num_input, num_output, opt); + else if ((opt.use_winograd43_convolution && num_input >= 8 && num_output >= 8) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd43_transform_kernel_pack4_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + else // if (opt.use_winograd23_convolution) + conv3x3s1_winograd23_transform_kernel_pack4_lsx(weight_data, weight_winograd23_data, num_input, num_output, opt); + } + else + { + convolution_transform_kernel_packed_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack1ton + if (elempack == 1 && out_elempack == 4) + { + convolution_transform_kernel_packed_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + + // pack4to1 + if (elempack == 4 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif // __loongarch_sx + + // pack1 + if (elempack == 1 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || !opt.use_winograd23_convolution) + { + conv3x3s1_winograd43_transform_kernel_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + } + else if (opt.use_winograd23_convolution) + { + conv3x3s1_winograd23_transform_kernel_lsx(weight_data, weight_winograd23_data, num_input, num_output, opt); + } + } + else if (opt.use_sgemm_convolution) + { + 
convolution_im2col_sgemm_transform_kernel_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + weight_data_tm = weight_data; + } + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int Convolution_loongarch::destroy_pipeline(const Option& opt) +{ + if (activation) + { + activation->destroy_pipeline(opt); + delete activation; + activation = 0; + } + + return 0; +} + +int Convolution_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + return forward_int8_loongarch(bottom_blob, top_blob, opt); + } +#endif + + // flattened blob, implement as InnerProduct + if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) + { + Mat bottom_blob_3d; + if (bottom_blob.elemsize % 16 == 0) + { + bottom_blob_3d = bottom_blob; + bottom_blob_3d.dims = 3; + bottom_blob_3d.w = 1; + bottom_blob_3d.h = 1; + bottom_blob_3d.c = bottom_blob.w; + bottom_blob_3d.cstep = 1; + } + else + { + bottom_blob_3d = bottom_blob.reshape(1, 1, bottom_blob.w, opt.workspace_allocator); + } + + Mat top_blob_3d; + int ret = forward(bottom_blob_3d, top_blob_3d, opt); + if (ret != 0) + return ret; + + if (top_blob_3d.elemsize % 16 == 0) + { + top_blob = top_blob_3d; + top_blob.dims = 1; + top_blob.w = top_blob_3d.c; + top_blob.h = 1; + top_blob.c = 1; + top_blob.cstep = top_blob_3d.c; + } + else + { + top_blob = top_blob_3d.reshape(top_blob_3d.c, opt.blob_allocator); + } + + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + // NCNN_LOGE("Convolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ?
4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int num_input = channels * elempack; + +#if __loongarch_sx + if (elempack == 4 && out_elempack == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution || opt.use_winograd63_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd63_convolution && num_input >= 8 && num_output >= 8 && num_input <= 64 && num_output <= 64) || (!opt.use_winograd43_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd63_pack4_lsx(bottom_blob_bordered, top_blob, weight_winograd63_data, bias_data, opt); + else if ((opt.use_winograd43_convolution && num_input >= 8 && num_output >= 8) || (!opt.use_winograd63_convolution && !opt.use_winograd23_convolution)) + conv3x3s1_winograd43_pack4_lsx(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt); + else // if (opt.use_winograd23_convolution) + conv3x3s1_winograd23_pack4_lsx(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + convolution_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 4) + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv3x3s2_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv7x7s2_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + convolution_pack1to4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, 
opt); + } + } + + if (elempack == 4 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + convolution_pack4to1_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } +#endif // __loongarch_sx + + if (elempack == 1 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + if ((opt.use_winograd43_convolution && num_input >= 16 && num_output >= 16) || !opt.use_winograd23_convolution) + { + conv3x3s1_winograd43_lsx(bottom_blob_bordered, top_blob, weight_winograd43_data, bias_data, opt); + } + else if (opt.use_winograd23_convolution) + { + conv3x3s1_winograd23_lsx(bottom_blob_bordered, top_blob, weight_winograd23_data, bias_data, opt); + } + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_lsx(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + const float* kptr = (const float*)weight_data_tm + maxk * channels * p; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + const float* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + float val = sptr[space_ofs[k]]; + float wt = kptr[k]; + sum += val * wt; + } + + kptr += maxk; + } + + sum = activation_ss(sum, 
activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } + } + } + + return 0; +} + +int Convolution_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& _weight_data = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + const int _kernel_w = _weight_data.w; + const int _kernel_h = _weight_data.h; + const int _num_output = _weight_data.c * _weight_data.elempack; + + Mat weight_data_flattened; + flatten(_weight_data, weight_data_flattened, opt); + if (weight_data_flattened.empty()) + return -100; + + // weight_data_flattened as pack1 + weight_data_flattened.w *= weight_data_flattened.elempack; + weight_data_flattened.elemsize /= weight_data_flattened.elempack; + weight_data_flattened.elempack = 1; + + Mat bias_data_flattened; + if (bias_term) + { + const Mat& _bias_data = bottom_blobs[2]; + flatten(_bias_data, bias_data_flattened, opt); + if (bias_data_flattened.empty()) + return -100; + + // bias_data_flattened as pack1 + bias_data_flattened.w *= bias_data_flattened.elempack; + bias_data_flattened.elemsize /= bias_data_flattened.elempack; + bias_data_flattened.elempack = 1; + } + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); + + ncnn::ParamDict pd; + pd.set(0, _num_output); + pd.set(1, _kernel_w); + pd.set(11, _kernel_h); + pd.set(2, dilation_w); + pd.set(21, dilation_h); + pd.set(3, stride_w); + pd.set(31, stride_h); + pd.set(4, pad_left); + pd.set(15, pad_right); + pd.set(14, pad_top); + pd.set(16, pad_bottom); + pd.set(18, pad_value); + pd.set(5, bias_term); + pd.set(6, weight_data_flattened.w); + pd.set(8, int8_scale_term); + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + ncnn::Mat weights[2]; + weights[0] = weight_data_flattened; + weights[1] = bias_data_flattened; + + op->load_model(ncnn::ModelBinFromMatArray(weights)); + + op->create_pipeline(opt); + + op->forward(bottom_blob, top_blob, opt); + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + +#if NCNN_INT8 +static void convolution_transform_kernel_packed_int8_lsx(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) +{ + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pa-pb-kw-kh-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + signed char* g00 = weight_data_tm.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < out_elempack; i++) + { + for (int j = 0; j < elempack; j++) + { + const signed char* k00 = weight_data_r2.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + } +} + +int Convolution_loongarch::create_pipeline_int8_loongarch(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + + int elempack = 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = num_input % 8 == 0 ? 8 : 1; + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif // __loongarch_sx + +#if __loongarch_sx + if (elempack == 8 && out_elempack == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_transform_kernel_pack8to4_int8_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_int8_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 1 && out_elempack == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_int8_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 8 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_transform_kernel_pack8to1_int8_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + convolution_transform_kernel_packed_int8_lsx(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif 
// __loongarch_sx + + if (elempack == 1 && out_elempack == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_sgemm_transform_kernel_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convolution_im2col_sgemm_transform_kernel_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_transform_kernel_int8_lsx(weight_data, weight_winograd43_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_int8_lsx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + weight_data_tm = weight_data; + } + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // requantize and relu + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int Convolution_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + int w = bottom_blob_bordered.w; + int h = bottom_blob_bordered.h; + int channels = bottom_blob_bordered.c; + int elempack = bottom_blob_bordered.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + if (use_int8_requantize) + out_elempack = num_output % 8 == 0 ? 8 : 1; + else + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __loongarch_sx + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int num_input = channels * elempack; + + int out_elempack_int32 = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack_int32 = num_output % 4 == 0 ? 
4 : 1; + } +#endif // __loongarch_sx + + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 8 && out_elempack_int32 == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_pack8to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + } + + if (elempack == 1 && out_elempack_int32 == 4) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_pack1to4_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + } + + if (elempack == 8 && out_elempack_int32 == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); + } + else if (opt.use_sgemm_convolution) // TODO better condition && num_input >= 8 && num_output >= 8) + { + convolution_im2col_sgemm_pack8to1_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_pack8to1_int8_lsx(bottom_blob_bordered, 
top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + } +#endif // __loongarch_sx + + if (elempack == 1 && out_elempack_int32 == 1) + { + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv1x1s1_sgemm_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv1x1s2_sgemm_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, opt); + } + else if (opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_winograd43_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_int8_lsx(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + else + { + convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + } + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + + return 0; +} +#endif // NCNN_INT8 + +} // namespace ncnn diff --git a/src/layer/loongarch/convolution_loongarch.h b/src/layer/loongarch/convolution_loongarch.h new file mode 100644 index 000000000000..a84281bf7135 --- /dev/null +++ b/src/layer/loongarch/convolution_loongarch.h @@ -0,0 +1,56 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
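// For reference, the scale_in_data member declared in the class below is filled per output
// channel in create_pipeline_int8_loongarch(); a hedged sketch of that derivation (helper name
// is illustrative, not part of this header):
static inline float dequant_scale_sketch(float bottom_blob_int8_scale, float weight_data_int8_scale)
{
    // int32 accumulator ~ (x * bottom_scale) * (w * weight_scale), so multiplying by
    // 1 / (bottom_scale * weight_scale) recovers the float sum; a zero weight scale maps to zero
    if (weight_data_int8_scale == 0.f)
        return 0.f;
    return 1.f / (bottom_blob_int8_scale * weight_data_int8_scale);
}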
+ +#ifndef LAYER_CONVOLUTION_LOONGARCH_H +#define LAYER_CONVOLUTION_LOONGARCH_H + +#include "convolution.h" + +namespace ncnn { + +class Convolution_loongarch : virtual public Convolution +{ +public: + Convolution_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const; + +protected: +#if NCNN_INT8 + int create_pipeline_int8_loongarch(const Option& opt); + int forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + +public: + Layer* activation; + + Mat weight_data_tm; + Mat weight_sgemm_data; + Mat weight_winograd23_data; + Mat weight_winograd43_data; + Mat weight_winograd63_data; + +#if NCNN_INT8 + Mat scale_in_data; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_CONVOLUTION_LOONGARCH_H diff --git a/src/layer/loongarch/convolution_pack1to4.h b/src/layer/loongarch/convolution_pack1to4.h new file mode 100644 index 000000000000..b7e0123d5edd --- /dev/null +++ b/src/layer/loongarch/convolution_pack1to4.h @@ -0,0 +1,90 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
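// A hedged scalar sketch of the inner loop of convolution_pack1to4_lsx below: each kernel tap
// broadcasts one input value against four packed output channels, which is what a single
// __lsx_vfmadd_s performs per tap (helper name is illustrative):
static inline void pack1to4_tap_sketch(float val, const float wt[4], float sum[4])
{
    for (int o = 0; o < 4; o++)
        sum[o] += val * wt[o];
}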
+ +static void convolution_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_data_ptr) + { + _sum = (__m128)__lsx_vld(bias_data_ptr + p * 4, 0); + } + + const float* kptr = (const float*)weight_data_pack1ton + maxk * channels * p * 4; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const float* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) // 29.23 + { + __m128 _val = __lsx_vreplfr2vr_s(sptr[space_ofs[k]]); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + + kptr += 4; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_pack1to4_int8.h b/src/layer/loongarch/convolution_pack1to4_int8.h new file mode 100644 index 000000000000..b043503c2ac6 --- /dev/null +++ b/src/layer/loongarch/convolution_pack1to4_int8.h @@ -0,0 +1,87 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
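// The int8 path below widens signed bytes with __lsx_vilvl_b(__lsx_vslti_b(x, 0), x):
// vslti_b yields 0xFF for negative lanes and 0x00 otherwise, and interleaving that mask above
// each byte is an int8 -> int16 sign extension. Per-lane sketch (illustrative helper only):
static inline short sign_extend_i8_sketch(signed char x)
{
    return (short)x; // what the mask-interleave achieves for all lanes at once
}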
+ +static void convolution_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128i _sum = __lsx_vreplgr2vr_w(0); + + const signed char* kptr = weight_data_int8.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vreplgr2vr_h((short)sptr[space_ofs[k]]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0); + + _sum = __lsx_vadd_w(_sum, _s032); + + kptr += 4; + } + } + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_pack4.h b/src/layer/loongarch/convolution_pack4.h new file mode 100644 index 000000000000..66a7863f015b --- /dev/null +++ b/src/layer/loongarch/convolution_pack4.h @@ -0,0 +1,102 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
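// Hedged scalar sketch of the per-tap math in convolution_pack4_lsx below: each tap stores a
// 4x4 weight block, and the four broadcast input lanes multiply-accumulate into the four
// packed output lanes (helper name is illustrative):
static inline void pack4_tap_sketch(const float val[4], const float wt[16], float sum[4])
{
    for (int i = 0; i < 4; i++)     // input lane, broadcast as _val0.._val3
        for (int o = 0; o < 4; o++) // output lane within the vector
            sum[o] += val[i] * wt[i * 4 + o];
}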
+ +static void convolution_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + const float* bias_data_ptr = bias_data; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_data_ptr) + { + _sum = (__m128)__lsx_vld(bias_data_ptr + p * 4, 0); + } + + const float* kptr = (const float*)weight_data_pack4.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + for (int k = 0; k < maxk; k++) // 29.23 + { + const float* slptr = sptr + space_ofs[k] * 4; + + __m128 _val0 = __lsx_vreplfr2vr_s(slptr[0]); + __m128 _val1 = __lsx_vreplfr2vr_s(slptr[1]); + __m128 _val2 = __lsx_vreplfr2vr_s(slptr[2]); + __m128 _val3 = __lsx_vreplfr2vr_s(slptr[3]); + + __m128 _w0 = (__m128)__lsx_vld(kptr, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0); + + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + _sum = __lsx_vfmadd_s(_w1, _val1, _sum); + _sum = __lsx_vfmadd_s(_w2, _val2, _sum); + _sum = __lsx_vfmadd_s(_w3, _val3, _sum); + + kptr += 16; + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_pack4to1.h b/src/layer/loongarch/convolution_pack4to1.h new file mode 100644 index 000000000000..872759fc7f12 --- /dev/null +++ b/src/layer/loongarch/convolution_pack4to1.h @@ -0,0 +1,94 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
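// convolution_pack4to1_lsx below keeps a 4-lane accumulator per output pixel and only folds it
// to a single float at the end via __lsx_reduce_fadd_s; a hedged scalar equivalent of that
// reduction (illustrative helper):
static inline float horizontal_sum4_sketch(const float v[4])
{
    return v[0] + v[1] + v[2] + v[3];
}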
+ +static void convolution_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4to1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_data_ptr) + { + sum = bias_data_ptr[p]; + } + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + const float* kptr = (const float*)weight_data_pack4to1.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + + kptr += 4; + } + } + + sum += __lsx_reduce_fadd_s(_sum); + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } +} diff --git a/src/layer/loongarch/convolution_pack8to1_int8.h b/src/layer/loongarch/convolution_pack8to1_int8.h new file mode 100644 index 000000000000..c7463a472b6f --- /dev/null +++ b/src/layer/loongarch/convolution_pack8to1_int8.h @@ -0,0 +1,87 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
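// convolution_pack8to1_int8_lsx below uses __lsx_vhaddw_w_h(s, s), which adds adjacent int16
// lanes into int32 lanes, so the eight per-tap products collapse into four partial sums that
// are accumulated and reduced once per pixel. Hedged per-pair sketch (illustrative helper):
static inline int widening_pair_add_sketch(short a, short b)
{
    return (int)a + (int)b;
}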
+ +static void convolution_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128i _sum = __lsx_vreplgr2vr_w(0); + + const signed char* kptr = weight_data_int8.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vld(sptr + space_ofs[k] * 8, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + + _sum = __lsx_vadd_w(_sum, __lsx_vhaddw_w_h(_s0, _s0)); + + kptr += 8; + } + } + + outptr[j] = __lsx_reduce_add_w(_sum); + } + + outptr += outw; + } + } +} diff --git a/src/layer/loongarch/convolution_pack8to4_int8.h b/src/layer/loongarch/convolution_pack8to4_int8.h new file mode 100644 index 000000000000..00d90387bbed --- /dev/null +++ b/src/layer/loongarch/convolution_pack8to4_int8.h @@ -0,0 +1,120 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
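// The vilvl_w/vilvh_w/vilvl_d/vilvh_d block in convolution_pack8to4_int8_lsx below is a 4x4
// int32 transpose of the accumulators, so the vertical adds that follow yield one total per
// output channel instead of four horizontal reductions. Hedged scalar equivalent (illustrative):
static inline void transpose4x4_i32_sketch(int m[4][4])
{
    for (int i = 0; i < 4; i++)
        for (int j = i + 1; j < 4; j++)
        {
            int t = m[i][j];
            m[i][j] = m[j][i];
            m[j][i] = t;
        }
}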
+ +static void convolution_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_int8, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + const signed char* kptr = weight_data_int8.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vld(sptr + space_ofs[k] * 8, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + __m128i _s2 = __lsx_vmul_h(_val16, _w2); + __m128i _s3 = __lsx_vmul_h(_val16, _w3); + + _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0)); + _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1)); + _sum2 = __lsx_vadd_w(_sum2, __lsx_vhaddw_w_h(_s2, _s2)); + _sum3 = __lsx_vadd_w(_sum3, __lsx_vhaddw_w_h(_s3, _s3)); + + kptr += 32; + } + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + _sum0 = __lsx_vadd_w(_sum0, _sum2); + + __lsx_vst(_sum0, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/convolution_sgemm.h b/src/layer/loongarch/convolution_sgemm.h new file mode 100644 index 000000000000..7b74ceac14b2 --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm.h @@ -0,0 +1,650 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void im2col_sgemm_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + const float* bias = _bias; + + // permute + Mat tmp; + if (size >= 4) + tmp.create(4 * maxk, inch, size / 4 + size % 4, 4u, 1, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 4u, 1, opt.workspace_allocator); + { + int nn_size = size / 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = ii * 4; + + float* tmpptr = tmp.channel(i / 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { +#if __loongarch_sx + __lsx_vst(__lsx_vld(img0, 0), tmpptr, 0); +#else + tmpptr[0] = img0[0]; + tmpptr[1] = img0[1]; + tmpptr[2] = img0[2]; + tmpptr[3] = img0[3]; +#endif + img0 += size; + tmpptr += 4; + } + } + } + + int remain_size_start = nn_size * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + float* tmpptr = tmp.channel(i / 4 + i % 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + img0 += size; + tmpptr += 1; + } + } + } + } + +#if __loongarch_sx + int nn_outch = outch >> 3; + int remain_outch_start = nn_outch << 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 8; + + float* outptr0 = top_blob.channel(p); + float* outptr1 = top_blob.channel(p + 1); + float* outptr2 = top_blob.channel(p + 2); + float* outptr3 = top_blob.channel(p + 3); + float* outptr4 = top_blob.channel(p + 4); + float* outptr5 = top_blob.channel(p + 5); + float* outptr6 = top_blob.channel(p + 6); + float* outptr7 = top_blob.channel(p + 7); + + const float zeros[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* biasptr = bias ? 
bias + p : zeros; + + int i = 0; + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 4); + const float* kptr = kernel.channel(p / 8); + + int nn = inch * maxk; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(biasptr[0]); + __m128 _sum1 = __lsx_vreplfr2vr_s(biasptr[1]); + __m128 _sum2 = __lsx_vreplfr2vr_s(biasptr[2]); + __m128 _sum3 = __lsx_vreplfr2vr_s(biasptr[3]); + __m128 _sum4 = __lsx_vreplfr2vr_s(biasptr[4]); + __m128 _sum5 = __lsx_vreplfr2vr_s(biasptr[5]); + __m128 _sum6 = __lsx_vreplfr2vr_s(biasptr[6]); + __m128 _sum7 = __lsx_vreplfr2vr_s(biasptr[7]); + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 32); + __m128 _val = (__m128)__lsx_vld(tmpptr, 0); + __m128i _w0123 = __lsx_vld(kptr, 0); + __m128i _w4567 = __lsx_vld(kptr + 4, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3); + _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 0), _val, _sum4); + _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 1), _val, _sum5); + _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 2), _val, _sum6); + _sum7 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 3), _val, _sum7); + + tmpptr += 4; + kptr += 8; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + __lsx_vst(_sum2, outptr2, 0); + __lsx_vst(_sum3, outptr3, 0); + __lsx_vst(_sum4, outptr4, 0); + __lsx_vst(_sum5, outptr5, 0); + __lsx_vst(_sum6, outptr6, 0); + __lsx_vst(_sum7, outptr7, 0); + + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + outptr4 += 4; + outptr5 += 4; + outptr6 += 4; + outptr7 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 4 + i % 4); + const float* kptr = kernel.channel(p / 8); + + int nn = inch * maxk; // inch always > 0 + + float sum0 = biasptr[0]; + float sum1 = biasptr[1]; + float sum2 = biasptr[2]; + float sum3 = biasptr[3]; + float sum4 = biasptr[4]; + float sum5 = biasptr[5]; + float sum6 = biasptr[6]; + float sum7 = biasptr[7]; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + sum2 += tmpptr[0] * kptr[2]; + sum3 += tmpptr[0] * kptr[3]; + sum4 += tmpptr[0] * kptr[4]; + sum5 += tmpptr[0] * kptr[5]; + sum6 += tmpptr[0] * kptr[6]; + sum7 += tmpptr[0] * kptr[7]; + tmpptr++; + kptr += 8; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr2[0] = sum2; + outptr3[0] = sum3; + outptr4[0] = sum4; + outptr5[0] = sum5; + outptr6[0] = sum6; + outptr7[0] = sum7; + + outptr0++; + outptr1++; + outptr2++; + outptr3++; + outptr4++; + outptr5++; + outptr6++; + outptr7++; + } + } + + nn_outch = (outch - remain_outch_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = remain_outch_start + pp * 4; + + float* outptr0 = top_blob.channel(p); + float* outptr1 = top_blob.channel(p + 1); + float* outptr2 = top_blob.channel(p + 2); + float* outptr3 = top_blob.channel(p + 3); + + const float zeros[4] = {0.f, 0.f, 0.f, 0.f}; + const float* biasptr = bias ? 
bias + p : zeros; + + int i = 0; + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 4); + const float* kptr = kernel.channel(p / 8 + (p % 8) / 4); + + int nn = inch * maxk; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(biasptr[0]); + __m128 _sum1 = __lsx_vreplfr2vr_s(biasptr[1]); + __m128 _sum2 = __lsx_vreplfr2vr_s(biasptr[2]); + __m128 _sum3 = __lsx_vreplfr2vr_s(biasptr[3]); + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 16); + __m128 _val = (__m128)__lsx_vld(tmpptr, 0); + __m128i _w0123 = __lsx_vld(kptr, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3); + + tmpptr += 4; + kptr += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + __lsx_vst(_sum2, outptr2, 0); + __lsx_vst(_sum3, outptr3, 0); + + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 4 + i % 4); + const float* kptr = kernel.channel(p / 8 + (p % 8) / 4); + + int nn = inch * maxk; // inch always > 0 + + float sum0 = biasptr[0]; + float sum1 = biasptr[1]; + float sum2 = biasptr[2]; + float sum3 = biasptr[3]; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + sum2 += tmpptr[0] * kptr[2]; + sum3 += tmpptr[0] * kptr[3]; + tmpptr++; + kptr += 4; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr2[0] = sum2; + outptr3[0] = sum3; + + outptr0++; + outptr1++; + outptr2++; + outptr3++; + } + } + + remain_outch_start += nn_outch << 2; +#else // __loongarch_sx + int nn_outch = outch >> 1; + int remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + float* outptr0 = top_blob.channel(p); + float* outptr1 = top_blob.channel(p + 1); + + const float zeros[2] = {0.f, 0.f}; + const float* biasptr = bias ? 
bias + p : zeros; + + int i = 0; + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 4); + const float* kptr = kernel.channel(p / 2); + + int nn = inch * maxk; // inch always > 0 + + float sum00 = biasptr[0]; + float sum01 = biasptr[0]; + float sum02 = biasptr[0]; + float sum03 = biasptr[0]; + float sum10 = biasptr[1]; + float sum11 = biasptr[1]; + float sum12 = biasptr[1]; + float sum13 = biasptr[1]; + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 8); + float k0 = kptr[0]; + float k1 = kptr[1]; + sum00 += tmpptr[0] * k0; + sum01 += tmpptr[1] * k0; + sum02 += tmpptr[2] * k0; + sum03 += tmpptr[3] * k0; + sum10 += tmpptr[0] * k1; + sum11 += tmpptr[1] * k1; + sum12 += tmpptr[2] * k1; + sum13 += tmpptr[3] * k1; + tmpptr += 4; + kptr += 2; + } + + outptr0[0] = sum00; + outptr0[1] = sum01; + outptr0[2] = sum02; + outptr0[3] = sum03; + outptr1[0] = sum10; + outptr1[1] = sum11; + outptr1[2] = sum12; + outptr1[3] = sum13; + + outptr0 += 4; + outptr1 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 4 + i % 4); + const float* kptr = kernel.channel(p / 2); + + int nn = inch * maxk; // inch always > 0 + + float sum0 = biasptr[0]; + float sum1 = biasptr[1]; + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 4); + __builtin_prefetch(kptr + 8); + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[0] * kptr[1]; + tmpptr++; + kptr += 2; + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + + outptr0++; + outptr1++; + } + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + float* outptr0 = top_blob.channel(p); + + const float bias0 = bias ? bias[p] : 0.f; + + int i = 0; + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 4); +#if __loongarch_sx + const float* kptr = kernel.channel(p / 8 + (p % 8) / 4 + p % 4); +#else + const float* kptr = kernel.channel(p / 2 + p % 2); +#endif + + int nn = inch * maxk; // inch always > 0 + +#if __loongarch_sx + __m128 _sum0 = __lsx_vreplfr2vr_s(bias0); + + for (int q = 0; q < nn; q++) + { + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vld(tmpptr, 0), __lsx_vreplfr2vr_s(kptr[0]), _sum0); + tmpptr += 4; + kptr++; + } + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; +#else + float sum0 = bias0; + float sum1 = bias0; + float sum2 = bias0; + float sum3 = bias0; + + for (int q = 0; q < nn; q++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 4); + sum0 += tmpptr[0] * kptr[0]; + sum1 += tmpptr[1] * kptr[0]; + sum2 += tmpptr[2] * kptr[0]; + sum3 += tmpptr[3] * kptr[0]; + tmpptr += 4; + kptr++; + } + + outptr0[0] = sum0; + outptr0[1] = sum1; + outptr0[2] = sum2; + outptr0[3] = sum3; + + outptr0 += 4; +#endif // __loongarch_sx + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 4 + i % 4); +#if __loongarch_sx + const float* kptr = kernel.channel(p / 8 + (p % 8) / 4 + p % 4); +#else + const float* kptr = kernel.channel(p / 2 + p % 2); +#endif + + int nn = inch * maxk; // inch always > 0 + + float sum0 = bias0; + + for (int q = 0; q < nn; q++) + { + sum0 += tmpptr[0] * kptr[0]; + tmpptr++; + kptr++; + } + + outptr0[0] = sum0; + + outptr0++; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 8b-maxk-inch-outch/8b + Mat kernel = 
_kernel.reshape(maxk, inch, outch); +#if __loongarch_sx + kernel_tm.create(8 * maxk, inch, outch / 8 + (outch % 8) / 4 + outch % 4); +#else + kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2); +#endif + + int q = 0; +#if __loongarch_sx + for (; q + 7 < outch; q += 8) + { + const Mat k0 = kernel.channel(q); + const Mat k1 = kernel.channel(q + 1); + const Mat k2 = kernel.channel(q + 2); + const Mat k3 = kernel.channel(q + 3); + const Mat k4 = kernel.channel(q + 4); + const Mat k5 = kernel.channel(q + 5); + const Mat k6 = kernel.channel(q + 6); + const Mat k7 = kernel.channel(q + 7); + + float* g00 = kernel_tm.channel(q / 8); + + for (int p = 0; p < inch; p++) + { + const float* k00 = k0.row(p); + const float* k10 = k1.row(p); + const float* k20 = k2.row(p); + const float* k30 = k3.row(p); + const float* k40 = k4.row(p); + const float* k50 = k5.row(p); + const float* k60 = k6.row(p); + const float* k70 = k7.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + g00[1] = k10[k]; + g00[2] = k20[k]; + g00[3] = k30[k]; + g00[4] = k40[k]; + g00[5] = k50[k]; + g00[6] = k60[k]; + g00[7] = k70[k]; + + g00 += 8; + } + } + } + for (; q + 3 < outch; q += 4) + { + const Mat k0 = kernel.channel(q); + const Mat k1 = kernel.channel(q + 1); + const Mat k2 = kernel.channel(q + 2); + const Mat k3 = kernel.channel(q + 3); + + float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4); + + for (int p = 0; p < inch; p++) + { + const float* k00 = k0.row(p); + const float* k10 = k1.row(p); + const float* k20 = k2.row(p); + const float* k30 = k3.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + g00[1] = k10[k]; + g00[2] = k20[k]; + g00[3] = k30[k]; + + g00 += 4; + } + } + } +#else + for (; q + 1 < outch; q += 2) + { + const Mat k0 = kernel.channel(q); + const Mat k1 = kernel.channel(q + 1); + + float* g00 = kernel_tm.channel(q / 2); + + for (int p = 0; p < inch; p++) + { + const float* k00 = k0.row(p); + const float* k10 = k1.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + g00[1] = k10[k]; + + g00 += 2; + } + } + } +#endif // __loongarch_sx + for (; q < outch; q++) + { + const Mat k0 = kernel.channel(q); + +#if __loongarch_sx + float* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + q % 4); +#else + float* g00 = kernel_tm.channel(q / 2 + q % 2); +#endif + + for (int p = 0; p < inch; p++) + { + const float* k00 = k0.row(p); + + for (int k = 0; k < maxk; k++) + { + g00[0] = k00[k]; + + g00 += 1; + } + } + } +} + +static void convolution_im2col_sgemm_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + float* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const float* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + 
im2col_sgemm_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_int8.h b/src/layer/loongarch/convolution_sgemm_int8.h new file mode 100644 index 000000000000..98f47760901f --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_int8.h @@ -0,0 +1,800 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void im2col_sgemm_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; +#if __loongarch_sx + if (inch >= 4) + { + if (size >= 2) + tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator); + } + else +#endif // __loongarch_sx + { + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator); + } + { + int remain_size_start = 0; + int nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + signed char* tmpptr = tmp.channel(i / 2); + + int q = 0; +#if __loongarch_sx + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr[4] = img0[1]; + tmpptr[5] = img1[1]; + tmpptr[6] = img2[1]; + tmpptr[7] = img3[1]; + tmpptr += 8; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img0[1]; + + tmpptr += 2; + + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + signed char* tmpptr = tmp.channel(i / 2 + i % 2); + + int q = 0; +#if __loongarch_sx + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed 
char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr += 4; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + + tmpptr += 1; + + img0 += size; + } + } + } + } + +#if __loongarch_sx + int nn_outch = outch >> 2; + int remain_outch_start = nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + int* outptr2 = top_blob.channel(p + 2); + int* outptr3 = top_blob.channel(p + 3); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val01 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _val0 = __lsx_vilvl_d(_val01, _val01); + __m128i _val1 = __lsx_vilvh_d(_val01, _val01); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + + __m128i _s00 = __lsx_vmul_h(_val0, _w0); + __m128i _s01 = __lsx_vmul_h(_val0, _w1); + __m128i _s10 = __lsx_vmul_h(_val1, _w0); + __m128i _s11 = __lsx_vmul_h(_val1, _w1); + + __m128i _exts00 = __lsx_vslti_h(_s00, 0); + __m128i _exts01 = __lsx_vslti_h(_s01, 0); + __m128i _exts10 = __lsx_vslti_h(_s10, 0); + __m128i _exts11 = __lsx_vslti_h(_s11, 0); + __m128i _s00l = __lsx_vilvl_h(_exts00, _s00); + __m128i _s00h = __lsx_vilvh_h(_exts00, _s00); + __m128i _s01l = __lsx_vilvl_h(_exts01, _s01); + __m128i _s01h = __lsx_vilvh_h(_exts01, _s01); + __m128i _s10l = __lsx_vilvl_h(_exts10, _s10); + __m128i _s10h = __lsx_vilvh_h(_exts10, _s10); + __m128i _s11l = __lsx_vilvl_h(_exts11, _s11); + __m128i _s11h = __lsx_vilvh_h(_exts11, _s11); + + _sum00 = __lsx_vadd_w(_sum00, _s00l); + _sum01 = __lsx_vadd_w(_sum01, _s00h); + _sum02 = __lsx_vadd_w(_sum02, _s01l); + _sum03 = __lsx_vadd_w(_sum03, _s01h); + _sum10 = __lsx_vadd_w(_sum10, _s10l); + _sum11 = __lsx_vadd_w(_sum11, _s10h); + _sum12 = __lsx_vadd_w(_sum12, _s11l); + _sum13 = __lsx_vadd_w(_sum13, _s11h); + + tmpptr += 8; + kptr += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = 
__lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + } + + int j = 0; + for (; j < nn1; j++) + { + __m128i _val0 = __lsx_vreplgr2vr_h(tmpptr[0]); + __m128i _val1 = __lsx_vreplgr2vr_h(tmpptr[1]); + __m128i _val = __lsx_vilvl_d(_val1, _val0); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum00 = __lsx_vadd_w(_sum00, _s0l); + _sum10 = __lsx_vadd_w(_sum10, _s0h); + + tmpptr += 2; + kptr += 4; + } + + int sum[8]; + __lsx_vst(_sum00, sum, 0); + __lsx_vst(_sum10, sum + 4, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0[1] = sum[4]; + outptr1[1] = sum[5]; + outptr2[1] = sum[6]; + outptr3[1] = sum[7]; + outptr0 += 2; + outptr1 += 2; + outptr2 += 2; + outptr3 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + _val16 = __lsx_vilvl_d(_val16, _val16); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _exts1 = __lsx_vslti_h(_s1, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + __m128i _s1l = __lsx_vilvl_h(_exts1, _s1); + __m128i _s1h = __lsx_vilvh_h(_exts1, _s1); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + _sum2 = __lsx_vadd_w(_sum2, _s1l); + _sum3 = __lsx_vadd_w(_sum3, _s1h); + + tmpptr += 4; + kptr += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + _sum0 = __lsx_vadd_w(_sum0, _sum2); + } + int j = 0; + for (; j < nn1; j++) + { + __m128i _val = __lsx_vreplgr2vr_h(tmpptr[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0); + + 
_sum0 = __lsx_vadd_w(_sum0, _s032); + + tmpptr += 1; + kptr += 4; + } + + int sum[4]; + __lsx_vst(_sum0, sum, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } +#else // __loongarch_sx + int nn_outch = outch >> 1; + int remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 2); + + int sum00 = 0; + int sum01 = 0; + int sum10 = 0; + int sum11 = 0; + + int nn1 = inch * maxk; + + int j = 0; + for (; j < nn1; j++) + { + signed char val0 = tmpptr[0]; + signed char val1 = tmpptr[1]; + signed char w0 = kptr[0]; + signed char w1 = kptr[1]; + + sum00 += val0 * w0; + sum01 += val1 * w0; + sum10 += val0 * w1; + sum11 += val1 * w1; + + tmpptr += 2; + kptr += 2; + } + + outptr0[0] = sum00; + outptr0[1] = sum01; + outptr1[0] = sum10; + outptr1[1] = sum11; + outptr0 += 2; + outptr1 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 2); + + int sum00 = 0; + int sum10 = 0; + + int nn1 = inch * maxk; + + int j = 0; + for (; j < nn1; j++) + { + signed char val0 = tmpptr[0]; + signed char w0 = kptr[0]; + signed char w1 = kptr[1]; + + sum00 += val0 * w0; + sum10 += val0 * w1; + + tmpptr += 1; + kptr += 2; + } + + outptr0[0] = sum00; + outptr1[0] = sum10; + outptr0 += 1; + outptr1 += 1; + } + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); +#if __loongarch_sx + const signed char* kptr = kernel.channel(p / 4 + p % 4); +#else + const signed char* kptr = kernel.channel(p / 2 + p % 2); +#endif + + int sum0 = 0; + int sum1 = 0; + +#if __loongarch_sx + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + if (nn4 > 0) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + + tmpptr += 8; + kptr += 4; + } + + sum0 = __lsx_reduce_add_w(_sum0); + sum1 = __lsx_reduce_add_w(_sum1); + } +#else + int nn1 = inch * maxk; +#endif // __loongarch_sx + + int j = 0; + for (; j < nn1; j++) + { + signed char val0 = tmpptr[0]; + signed char val1 = tmpptr[1]; + signed char w = kptr[0]; + + sum0 += val0 * w; + sum1 += val1 * w; + + tmpptr += 2; + kptr += 1; + } + + outptr0[0] = sum0; + outptr0[1] = sum1; + outptr0 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); +#if __loongarch_sx + const signed char* kptr = kernel.channel(p / 4 + p % 4); +#else + const signed char* 
kptr = kernel.channel(p / 2 + p % 2); +#endif + + int sum = 0; + +#if __loongarch_sx + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + if (nn4 > 0) + { + __m128i _sum = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0); + + _sum = __lsx_vadd_w(_sum, _s032); + + tmpptr += 4; + kptr += 4; + } + + sum = __lsx_reduce_add_w(_sum); + } +#else + int nn1 = inch * maxk; +#endif // __loongarch_sx + + int j = 0; + for (; j < nn1; j++) + { + signed char val = tmpptr[0]; + signed char w = kptr[0]; + + sum += val * w; + + tmpptr += 1; + kptr += 1; + } + + outptr0[0] = sum; + outptr0 += 1; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 4a-4b-maxk-inch/4a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); +#if __loongarch_sx + if (outch >= 4) + { + if (inch >= 4) + kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4 + outch % 4, (size_t)1u); + else + kernel_tm.create(4 * maxk, inch, outch / 4 + outch % 4, (size_t)1u); + } +#else + if (outch >= 2) + { + kernel_tm.create(2 * maxk, inch, outch / 2 + outch % 2, (size_t)1u); + } +#endif // __loongarch_sx + else + { +#if __loongarch_sx + if (inch >= 4) + kernel_tm.create(4 * maxk, inch / 4 + inch % 4, outch, (size_t)1u); + else +#endif // __loongarch_sx + { + kernel_tm.create(1 * maxk, inch, outch, (size_t)1u); + } + } + + int q = 0; +#if __loongarch_sx + for (; q + 3 < outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + int p = 0; + for (; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j); + g00[0] = k00[k]; + g00++; + } + } + } + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + const signed char* k00 = kernel.channel(q + i).row<const signed char>(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#else // __loongarch_sx + for (; q + 1 < outch; q += 2) + { + signed char* g00 = kernel_tm.channel(q / 2); + + int p = 0; + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 2; i++) + { + const signed char* k00 = kernel.channel(q + i).row<const signed char>(p); + g00[0] = k00[k]; + g00++; + } + } + } + } +#endif // __loongarch_sx + for (; q < outch; q++) + { +#if __loongarch_sx + signed char* g00 = kernel_tm.channel(q / 4 + q % 4); +#else + signed char* g00 = kernel_tm.channel(q / 2 + q % 2); +#endif + + int p = 0; +#if __loongarch_sx + for (; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int j = 0; j < 4; j++) + { + const signed char* k00 = kernel.channel(q).row<const signed char>(p + j); + g00[0] = k00[k]; + g00++; + } + } + } +#endif // __loongarch_sx + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k00 = kernel.channel(q).row<const signed char>(p); + g00[0] = k00[k]; + g00++; + } + } + } +} + +static void convolution_im2col_sgemm_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, 
const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + signed char* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const signed char* sptr = img.row<const signed char>(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + ptr[2] = sptr[stride_w * 2]; + ptr[3] = sptr[stride_w * 3]; + + sptr += stride_w * 4; + ptr += 4; + } + for (; j + 1 < outw; j += 2) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + + sptr += stride_w * 2; + ptr += 2; + } + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack1to4_int8.h b/src/layer/loongarch/convolution_sgemm_pack1to4_int8.h new file mode 100644 index 000000000000..3429bfae5fa6 --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack1to4_int8.h @@ -0,0 +1,481 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License.
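The pack1to4 int8 kernels below read a plain (unpacked) int8 im2col buffer and write packed-4 int32 sums: groups of four input channels are widened from int8 to int16 with __lsx_vilvl_b plus a sign mask, multiplied against the interleaved weights, widened again to int32 and accumulated. A scalar reference of what one output pixel's accumulation computes (an illustrative sketch only; the flat [oc][k] weight layout here is a simplification, not the interleaved layout used by the patch):

    // Scalar reference: exact 32-bit accumulation of int8 products, matching
    // the vilvl_b/vslti_b widening and vmul_h/vadd_w steps in the code below.
    static void int8_conv_reference(const signed char* vals,    // inch * maxk samples for one output pixel
                                    const signed char* weights, // [oc][inch * maxk], oc = 0..3
                                    int* out,                   // 4 int32 accumulators, one per output channel
                                    int n)                      // n = inch * maxk
    {
        for (int oc = 0; oc < 4; oc++)
        {
            int sum = 0;
            for (int k = 0; k < n; k++)
            {
                sum += (int)vals[k] * (int)weights[oc * n + k]; // sign-extend, multiply, accumulate
            }
            out[oc] = sum;
        }
    }

The vectorized loops compute exactly these sums, four output channels and up to two output pixels at a time.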
+ +static void im2col_sgemm_pack1to4_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (inch >= 4) + { + if (size >= 2) + tmp.create(2 * maxk, inch / 4 + inch % 4, size / 2 + size % 2, 4u, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch / 4 + inch % 4, size, 4u, 4, opt.workspace_allocator); + } + else + { + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 1u, 1, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 1u, 1, opt.workspace_allocator); + } + { + int remain_size_start = 0; + int nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + signed char* tmpptr = tmp.channel(i / 2); + + int q = 0; + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr[4] = img0[1]; + tmpptr[5] = img1[1]; + tmpptr[6] = img2[1]; + tmpptr[7] = img3[1]; + tmpptr += 8; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img0[1]; + + tmpptr += 2; + + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + signed char* tmpptr = tmp.channel(i / 2 + i % 2); + + int q = 0; + for (; q + 3 < inch; q += 4) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + const signed char* img1 = (const signed char*)bottom_im2col.channel(q + 1) + i; + const signed char* img2 = (const signed char*)bottom_im2col.channel(q + 2) + i; + const signed char* img3 = (const signed char*)bottom_im2col.channel(q + 3) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr[1] = img1[0]; + tmpptr[2] = img2[0]; + tmpptr[3] = img3[0]; + tmpptr += 4; + + img0 += size; + img1 += size; + img2 += size; + img3 += size; + } + } + for (; q < inch; q++) + { + const signed char* img0 = (const signed char*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + + tmpptr += 1; + + img0 += size; + } + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = 
__lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val01 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _val0 = __lsx_vilvl_d(_val01, _val01); + __m128i _val1 = __lsx_vilvh_d(_val01, _val01); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + + __m128i _s00 = __lsx_vmul_h(_val0, _w0); + __m128i _s01 = __lsx_vmul_h(_val0, _w1); + __m128i _s10 = __lsx_vmul_h(_val1, _w0); + __m128i _s11 = __lsx_vmul_h(_val1, _w1); + + __m128i _exts00 = __lsx_vslti_h(_s00, 0); + __m128i _exts01 = __lsx_vslti_h(_s01, 0); + __m128i _exts10 = __lsx_vslti_h(_s10, 0); + __m128i _exts11 = __lsx_vslti_h(_s11, 0); + __m128i _s00l = __lsx_vilvl_h(_exts00, _s00); + __m128i _s00h = __lsx_vilvh_h(_exts00, _s00); + __m128i _s01l = __lsx_vilvl_h(_exts01, _s01); + __m128i _s01h = __lsx_vilvh_h(_exts01, _s01); + __m128i _s10l = __lsx_vilvl_h(_exts10, _s10); + __m128i _s10h = __lsx_vilvh_h(_exts10, _s10); + __m128i _s11l = __lsx_vilvl_h(_exts11, _s11); + __m128i _s11h = __lsx_vilvh_h(_exts11, _s11); + + _sum00 = __lsx_vadd_w(_sum00, _s00l); + _sum01 = __lsx_vadd_w(_sum01, _s00h); + _sum02 = __lsx_vadd_w(_sum02, _s01l); + _sum03 = __lsx_vadd_w(_sum03, _s01h); + _sum10 = __lsx_vadd_w(_sum10, _s10l); + _sum11 = __lsx_vadd_w(_sum11, _s10h); + _sum12 = __lsx_vadd_w(_sum12, _s11l); + _sum13 = __lsx_vadd_w(_sum13, _s11h); + + tmpptr += 8; + kptr += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = __lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + } + + int j = 0; + for (; j < nn1; j++) + { + __m128i _val0 = __lsx_vreplgr2vr_h(tmpptr[0]); + __m128i _val1 = __lsx_vreplgr2vr_h(tmpptr[1]); + __m128i _val = __lsx_vilvl_d(_val1, _val0); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum00 = __lsx_vadd_w(_sum00, _s0l); + _sum10 = __lsx_vadd_w(_sum10, _s0h); + + tmpptr += 2; + kptr += 4; + } + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum10, outptr0 + 4, 0); + outptr0 += 8; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* 
kptr = kernel.channel(p); + + int nn4 = (inch / 4) * maxk; + int nn1 = (inch % 4) * maxk; + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + _val16 = __lsx_vilvl_d(_val16, _val16); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _exts1 = __lsx_vslti_h(_s1, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + __m128i _s1l = __lsx_vilvl_h(_exts1, _s1); + __m128i _s1h = __lsx_vilvh_h(_exts1, _s1); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + _sum2 = __lsx_vadd_w(_sum2, _s1l); + _sum3 = __lsx_vadd_w(_sum3, _s1h); + + tmpptr += 4; + kptr += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + _sum0 = __lsx_vadd_w(_sum0, _sum2); + } + + int j = 0; + for (; j < nn1; j++) + { + __m128i _val = __lsx_vreplgr2vr_h(tmpptr[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _s032 = __lsx_vilvl_h(__lsx_vslti_h(_s0, 0), _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s032); + + tmpptr += 1; + kptr += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + outptr0 += 4; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack1to4_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 4a-4b-maxk-inch/4a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); + if (inch >= 4) + kernel_tm.create(16 * maxk, inch / 4 + inch % 4, outch / 4, (size_t)1u); + else + kernel_tm.create(4 * maxk, inch, outch / 4, (size_t)1u); + + for (int q = 0; q + 3 < outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + int p = 0; + for (; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 4; j++) + { + const signed char* k00 = kernel.channel(q + i).row<const signed char>(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + const signed char* k00 = kernel.channel(q + i).row<const signed char>(p); + + g00[0] = k00[k]; + + g00++; + } + } + } + } +} + +static void convolution_im2col_sgemm_pack1to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = 
top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 1u, 1, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + signed char* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const signed char* sptr = img.row<const signed char>(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j + 3 < outw; j += 4) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + ptr[2] = sptr[stride_w * 2]; + ptr[3] = sptr[stride_w * 3]; + + sptr += stride_w * 4; + ptr += 4; + } + for (; j + 1 < outw; j += 2) + { + ptr[0] = sptr[0]; + ptr[1] = sptr[stride_w]; + + sptr += stride_w * 2; + ptr += 2; + } + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack1to4_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack4.h b/src/layer/loongarch/convolution_sgemm_pack4.h new file mode 100644 index 000000000000..e3e7279a5d2c --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack4.h @@ -0,0 +1,519 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License.
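The float pack4 path below first repacks the im2col columns into tiles of 12, 8, 4, 2 and 1 so the inner product loop can keep all accumulators in registers; each step then broadcasts one input value per pixel with __lsx_vreplvei_w and multiply-accumulates it against a packed-4 weight vector with __lsx_vfmadd_s. A reduced sketch of one such step for a 4-pixel tile (illustrative names; the real code also uses 12-, 8-, 2- and 1-wide tiles):

    // One inner-loop step of the pack4 GEMM: w holds one packed-4 weight lane
    // (4 output channels), val holds the current input value for each pixel in
    // the tile, and sum[pixel][oc] are the running accumulators.
    static void pack4_fma_step(const float w[4], const float val[4], float sum[4][4])
    {
        for (int pixel = 0; pixel < 4; pixel++)
            for (int oc = 0; oc < 4; oc++)
                sum[pixel][oc] += w[oc] * val[pixel]; // __lsx_vfmadd_s(_w0, vreplvei(_val, pixel), _sum)
    }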
+ +static void im2col_sgemm_pack4_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + const float* bias = _bias; + + // permute + Mat tmp; + if (size >= 12) + tmp.create(12 * maxk, inch, size / 12 + (size % 12) / 8 + (size % 12 % 8) / 4 + (size % 12 % 4) / 2 + size % 12 % 2, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 8) + tmp.create(8 * maxk, inch, size / 8 + (size % 8) / 4 + (size % 4) / 2 + size % 2, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 4) + tmp.create(4 * maxk, inch, size / 4 + (size % 4) / 2 + size % 2, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 4u * 4, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 4u * 4, 4, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = size / 12; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 12; + + float* tmpptr = tmp.channel(i / 12); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x12 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0); + __m128i _r8 = __lsx_vld(img0 + 4 * 8, 0); + __m128i _r9 = __lsx_vld(img0 + 4 * 9, 0); + __m128i _ra = __lsx_vld(img0 + 4 * 10, 0); + __m128i _rb = __lsx_vld(img0 + 4 * 11, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r89r = __lsx_vilvl_w(_r9, _r8); + __m128i _r89l = __lsx_vilvh_w(_r9, _r8); + __m128i _rabr = __lsx_vilvl_w(_rb, _ra); + __m128i _rabl = __lsx_vilvh_w(_rb, _ra); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + __m128i _r89ab_0 = __lsx_vilvl_d(_rabr, _r89r); + __m128i _r89ab_1 = __lsx_vilvh_d(_rabr, _r89r); + __m128i _r89ab_2 = __lsx_vilvl_d(_rabl, _r89l); + __m128i _r89ab_3 = __lsx_vilvh_d(_rabl, _r89l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r89ab_0, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 4, 0); + __lsx_vst(_r89ab_1, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 7, 0); + __lsx_vst(_r89ab_2, tmpptr + 4 * 8, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 9, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 10, 0); + 
__lsx_vst(_r89ab_3, tmpptr + 4 * 11, 0); + + img0 += size * 4; + tmpptr += 48; + } + } + } + + remain_size_start += nn_size * 12; + nn_size = (size - remain_size_start) >> 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 8; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x8 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 2, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 4, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 7, 0); + + img0 += size * 4; + tmpptr += 32; + } + } + } + + remain_size_start += nn_size << 3; + nn_size = (size - remain_size_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 4; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r0123_1, tmpptr + 4, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 3, 0); + + img0 += size * 4; + tmpptr += 16; + } + } + } + + remain_size_start += nn_size << 2; + nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2); + + for (int q = 0; q < 
inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x2 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + + __m128i _r01_0 = __lsx_vilvl_w(_r1, _r0); + __m128i _r01_1 = __lsx_vilvh_w(_r1, _r0); + + __lsx_vst(_r01_0, tmpptr, 0); + __lsx_vst(_r01_1, tmpptr + 4, 0); + + img0 += size * 4; + tmpptr += 8; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vld(img0, 0); + __lsx_vst(_val, tmpptr, 0); + + img0 += size * 4; + tmpptr += 4; + } + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 11 < size; i += 12) + { + const float* tmpptr = tmp.channel(i / 12); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = _sum0; + __m128 _sum2 = _sum0; + __m128 _sum3 = _sum0; + __m128 _sum4 = _sum0; + __m128 _sum5 = _sum0; + __m128 _sum6 = _sum0; + __m128 _sum7 = _sum0; + __m128 _sum8 = _sum0; + __m128 _sum9 = _sum0; + __m128 _suma = _sum0; + __m128 _sumb = _sum0; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 48); + __builtin_prefetch(kptr0 + 16); + __m128i _val0123 = __lsx_vld(tmpptr, 0); + __m128i _val4567 = __lsx_vld(tmpptr + 4, 0); + __m128i _val89ab = __lsx_vld(tmpptr + 8, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4); + _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5); + _sum6 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6); + _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7); + _sum8 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 0), _sum8); + _sum9 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 1), _sum9); + _suma = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 2), _suma); + _sumb = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 3), _sumb); + + tmpptr += 12; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + __lsx_vst(_sum4, outptr0 + 4 * 4, 0); + __lsx_vst(_sum5, outptr0 + 4 * 5, 0); + __lsx_vst(_sum6, outptr0 + 4 * 6, 0); + __lsx_vst(_sum7, outptr0 + 4 * 7, 0); + __lsx_vst(_sum8, outptr0 + 4 * 8, 0); + __lsx_vst(_sum9, outptr0 + 4 * 9, 0); + __lsx_vst(_suma, outptr0 + 4 * 10, 0); + __lsx_vst(_sumb, outptr0 + 4 * 11, 0); + + outptr0 += 4 * 12; + } + for (; i + 7 < size; i += 8) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch 
always > 0 + + __m128 _sum0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = _sum0; + __m128 _sum2 = _sum0; + __m128 _sum3 = _sum0; + __m128 _sum4 = _sum0; + __m128 _sum5 = _sum0; + __m128 _sum6 = _sum0; + __m128 _sum7 = _sum0; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr0 + 16); + __m128i _val0123 = __lsx_vld(tmpptr, 0); + __m128i _val4567 = __lsx_vld(tmpptr + 4, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4); + _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5); + _sum6 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6); + _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7); + + tmpptr += 8; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + __lsx_vst(_sum4, outptr0 + 4 * 4, 0); + __lsx_vst(_sum5, outptr0 + 4 * 5, 0); + __lsx_vst(_sum6, outptr0 + 4 * 6, 0); + __lsx_vst(_sum7, outptr0 + 4 * 7, 0); + + outptr0 += 4 * 8; + } + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = _sum0; + __m128 _sum2 = _sum0; + __m128 _sum3 = _sum0; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr0 + 16); + __m128i _val0123 = __lsx_vld(tmpptr, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + + tmpptr += 4; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 4 * 2, 0); + __lsx_vst(_sum3, outptr0 + 4 * 3, 0); + + outptr0 += 4 * 4; + } + for (; i + 1 < size; i += 2) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = bias ? 
(__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = _sum0; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 8); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*tmpptr++); + __m128 _val1 = __lsx_vreplfr2vr_s(*tmpptr++); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, _val0, _sum0); + _sum1 = __lsx_vfmadd_s(_w0, _val1, _sum1); + + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + + outptr0 += 4 * 2; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2); + const float* kptr0 = kernel.channel(p); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum = bias ? (__m128)__lsx_vld(bias + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 4); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*tmpptr++); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + + kptr0 += 4; + } + + __lsx_vst(_sum, outptr0, 0); + + outptr0 += 4; + } + } +} + +static void convolution_im2col_sgemm_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator); + { + const int gap = (w * stride_h - outw * stride_w) * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + float* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const float* sptr = img.row(dilation_h * u) + dilation_w * v * 4; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __lsx_vst(_val, ptr, 0); + + sptr += stride_w * 4; + ptr += 4; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack4_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack4to1.h b/src/layer/loongarch/convolution_sgemm_pack4to1.h new file mode 100644 index 000000000000..3748645b4d4c --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack4to1.h @@ -0,0 +1,667 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
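The pack4-to-pack1 routines below follow the same im2col + SGEMM scheme as the pack4 code above: the convolution becomes a matrix product between an outch x (inch*maxk) kernel matrix and an (inch*maxk) x (outw*outh) im2col matrix. A minimal scalar sketch of that equivalence, for orientation only (plain std::vector in place of ncnn's Mat; stride 1, no dilation; the in_w/in_h/kw/kh names are assumptions, not part of this patch):

#include <vector>

// naive im2col + GEMM reference: out[o][pixel] = sum_{c,k} kernel[o][c*maxk+k] * cols[c*maxk+k][pixel]
static void conv_im2col_gemm_ref(const std::vector<float>& in, int in_w, int in_h, int inch,
                                 const std::vector<float>& kernel, int kw, int kh, int outch,
                                 std::vector<float>& out)
{
    const int outw = in_w - kw + 1;
    const int outh = in_h - kh + 1;
    const int maxk = kw * kh;

    // im2col: one row per (input channel, kernel offset), one column per output pixel
    std::vector<float> cols((size_t)inch * maxk * outw * outh);
    for (int c = 0; c < inch; c++)
        for (int u = 0; u < kh; u++)
            for (int v = 0; v < kw; v++)
                for (int y = 0; y < outh; y++)
                    for (int x = 0; x < outw; x++)
                        cols[(((size_t)c * maxk + u * kw + v) * outh + y) * outw + x] = in[((size_t)c * in_h + y + u) * in_w + x + v];

    // GEMM over the flattened (channel, kernel offset) dimension
    out.assign((size_t)outch * outw * outh, 0.f);
    for (int o = 0; o < outch; o++)
        for (int ck = 0; ck < inch * maxk; ck++)
            for (int p = 0; p < outw * outh; p++)
                out[(size_t)o * outw * outh + p] += kernel[(size_t)o * inch * maxk + ck] * cols[(size_t)ck * outw * outh + p];
}

The pack4to1 code performs the same computation, but first repacks the im2col columns into tiles of 12/8/4/1 so that a dozen output columns at a time can be accumulated in vector registers.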
+ +static void im2col_sgemm_pack4to1_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + const float* bias = _bias; + + Mat tmp; + if (size >= 12) + tmp.create(12 * maxk, inch, size / 12 + (size % 12) / 8 + (size % 12 % 8) / 4 + size % 12 % 4, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 8) + tmp.create(8 * maxk, inch, size / 8 + (size % 8) / 4 + size % 4, 4u * 4, 4, opt.workspace_allocator); + else if (size >= 4) + tmp.create(4 * maxk, inch, size / 4 + size % 4, 4u * 4, 4, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 4u * 4, 4, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = size / 12; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 12; + + float* tmpptr = tmp.channel(i / 12); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x12 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0); + __m128i _r8 = __lsx_vld(img0 + 4 * 8, 0); + __m128i _r9 = __lsx_vld(img0 + 4 * 9, 0); + __m128i _ra = __lsx_vld(img0 + 4 * 10, 0); + __m128i _rb = __lsx_vld(img0 + 4 * 11, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r89r = __lsx_vilvl_w(_r9, _r8); + __m128i _r89l = __lsx_vilvh_w(_r9, _r8); + __m128i _rabr = __lsx_vilvl_w(_rb, _ra); + __m128i _rabl = __lsx_vilvh_w(_rb, _ra); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + __m128i _r89ab_0 = __lsx_vilvl_d(_rabr, _r89r); + __m128i _r89ab_1 = __lsx_vilvh_d(_rabr, _r89r); + __m128i _r89ab_2 = __lsx_vilvl_d(_rabl, _r89l); + __m128i _r89ab_3 = __lsx_vilvh_d(_rabl, _r89l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r89ab_0, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 4, 0); + __lsx_vst(_r89ab_1, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 7, 0); + __lsx_vst(_r89ab_2, tmpptr + 4 * 8, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 9, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 10, 0); + __lsx_vst(_r89ab_3, tmpptr + 4 * 11, 0); + + img0 += size * 4; + tmpptr += 48; + } + } + } + + remain_size_start += nn_size * 12; + nn_size = (size - remain_size_start) >> 3; + + 
#pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 8; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x8 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(img0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(img0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(img0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(img0 + 4 * 7, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 2, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 4, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 7, 0); + + img0 += size * 4; + tmpptr += 32; + } + } + } + + remain_size_start += nn_size << 3; + nn_size = (size - remain_size_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 4; + + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(img0, 0); + __m128i _r1 = __lsx_vld(img0 + 4, 0); + __m128i _r2 = __lsx_vld(img0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(img0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r0123_1, tmpptr + 4, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 3, 0); + + img0 += size * 4; + tmpptr += 16; + } + } + } + + remain_size_start += nn_size << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4); + + for (int q = 0; q < inch; q++) + { + const float* img0 = (const float*)bottom_im2col.channel(q) + i * 4; + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(img0, 0); + __lsx_vst(_val, tmpptr, 0); + + img0 += size * 4; + tmpptr += 4; + } + } + } + } + + int 
nn_outch = outch / 4; + int remain_outch_start = nn_outch * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + float* outptr0 = top_blob.channel(p); + float* outptr1 = top_blob.channel(p + 1); + float* outptr2 = top_blob.channel(p + 2); + float* outptr3 = top_blob.channel(p + 3); + + const float zeros[4] = {0.f}; + const float* biasptr = bias ? bias + p : zeros; + + int i = 0; + for (; i + 11 < size; i += 12) + { + const float* tmpptr = tmp.channel(i / 12); + const float* kptr0 = kernel.channel(p / 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128i _bias = __lsx_vld(biasptr, 0); + __m128 _sum0 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum1 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum2 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum3 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum4 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum5 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum6 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum7 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum8 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum9 = (__m128)__lsx_vreplvei_w(_bias, 3); + __m128 _suma = (__m128)__lsx_vreplvei_w(_bias, 3); + __m128 _sumb = (__m128)__lsx_vreplvei_w(_bias, 3); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 48); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0); + __m128 _val2 = (__m128)__lsx_vld(tmpptr + 8, 0); + __m128i _w0123 = __lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val0, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val1, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val2, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val0, _sum3); + _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val1, _sum4); + _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val2, _sum5); + _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val0, _sum6); + _sum7 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val1, _sum7); + _sum8 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val2, _sum8); + _sum9 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val0, _sum9); + _suma = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val1, _suma); + _sumb = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val2, _sumb); + + tmpptr += 12; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 8, 0); + __lsx_vst(_sum3, outptr1, 0); + __lsx_vst(_sum4, outptr1 + 4, 0); + __lsx_vst(_sum5, outptr1 + 8, 0); + __lsx_vst(_sum6, outptr2, 0); + __lsx_vst(_sum7, outptr2 + 4, 0); + __lsx_vst(_sum8, outptr2 + 8, 0); + __lsx_vst(_sum9, outptr3, 0); + __lsx_vst(_suma, outptr3 + 4, 0); + __lsx_vst(_sumb, outptr3 + 8, 0); + + outptr0 += 12; + outptr1 += 12; + outptr2 += 12; + outptr3 += 12; + } + for (; i + 7 < size; i += 8) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + const float* kptr0 = kernel.channel(p / 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128i _bias = __lsx_vld(biasptr, 0); + __m128 _sum0 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum1 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum2 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum3 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum4 = (__m128)__lsx_vreplvei_w(_bias, 
2); + __m128 _sum5 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum6 = (__m128)__lsx_vreplvei_w(_bias, 3); + __m128 _sum7 = (__m128)__lsx_vreplvei_w(_bias, 3); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0); + __m128i _w0123 = __lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val0, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val1, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val0, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val1, _sum3); + _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val0, _sum4); + _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val1, _sum5); + _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val0, _sum6); + _sum7 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val1, _sum7); + + tmpptr += 8; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr1, 0); + __lsx_vst(_sum3, outptr1 + 4, 0); + __lsx_vst(_sum4, outptr2, 0); + __lsx_vst(_sum5, outptr2 + 4, 0); + __lsx_vst(_sum6, outptr3, 0); + __lsx_vst(_sum7, outptr3 + 4, 0); + + outptr0 += 8; + outptr1 += 8; + outptr2 += 8; + outptr3 += 8; + } + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + const float* kptr0 = kernel.channel(p / 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128i _bias = __lsx_vld(biasptr, 0); + __m128 _sum0 = (__m128)__lsx_vreplvei_w(_bias, 0); + __m128 _sum1 = (__m128)__lsx_vreplvei_w(_bias, 1); + __m128 _sum2 = (__m128)__lsx_vreplvei_w(_bias, 2); + __m128 _sum3 = (__m128)__lsx_vreplvei_w(_bias, 3); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128i _w0123 = __lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val0, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val0, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val0, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val0, _sum3); + + tmpptr += 4; + kptr0 += 4; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + __lsx_vst(_sum2, outptr2, 0); + __lsx_vst(_sum3, outptr3, 0); + + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4); + const float* kptr0 = kernel.channel(p / 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum = (__m128)__lsx_vld(biasptr, 0); + float* _sum_p = (float*)&_sum; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 4); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*tmpptr++); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + + kptr0 += 4; + } + + outptr0[0] = _sum_p[0]; + outptr1[0] = _sum_p[1]; + outptr2[0] = _sum_p[2]; + outptr3[0] = _sum_p[3]; + + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + float* outptr0 = top_blob.channel(p); + + const float bias0 = bias ? 
bias[p] : 0.f; + + int i = 0; + for (; i + 11 < size; i += 12) + { + const float* tmpptr = tmp.channel(i / 12); + const float* kptr0 = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(bias0); + __m128 _sum1 = __lsx_vreplfr2vr_s(bias0); + __m128 _sum2 = __lsx_vreplfr2vr_s(bias0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 48); + __builtin_prefetch(kptr0 + 4); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0); + __m128 _val2 = (__m128)__lsx_vld(tmpptr + 8, 0); + __m128 _w0 = __lsx_vreplfr2vr_s(*kptr0); + _sum0 = __lsx_vfmadd_s(_val0, _w0, _sum0); + _sum1 = __lsx_vfmadd_s(_val1, _w0, _sum1); + _sum2 = __lsx_vfmadd_s(_val2, _w0, _sum2); + + tmpptr += 12; + kptr0 += 1; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + __lsx_vst(_sum2, outptr0 + 8, 0); + + outptr0 += 12; + } + for (; i + 7 < size; i += 8) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8); + const float* kptr0 = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(bias0); + __m128 _sum1 = __lsx_vreplfr2vr_s(bias0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr0 + 4); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _val1 = (__m128)__lsx_vld(tmpptr + 4, 0); + __m128 _w0 = __lsx_vreplfr2vr_s(*kptr0); + _sum0 = __lsx_vfmadd_s(_val0, _w0, _sum0); + _sum1 = __lsx_vfmadd_s(_val1, _w0, _sum1); + + tmpptr += 8; + kptr0 += 1; + } + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr0 + 4, 0); + + outptr0 += 8; + } + for (; i + 3 < size; i += 4) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + const float* kptr0 = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk * 4; // inch always > 0 + + __m128 _sum0 = __lsx_vreplfr2vr_s(bias0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr0 + 4); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _w0 = __lsx_vreplfr2vr_s(*kptr0); + _sum0 = __lsx_vfmadd_s(_val0, _w0, _sum0); + + tmpptr += 4; + kptr0 += 1; + } + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + } + for (; i < size; i++) + { + const float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4); + const float* kptr0 = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk; // inch always > 0 + + float sum0 = bias0; + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(tmpptr + 16); + __builtin_prefetch(kptr0 + 16); + __m128 _val0 = (__m128)__lsx_vld(tmpptr, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr0, 0); + _sum0 = __lsx_vfmadd_s(_w0, _val0, _sum0); + tmpptr += 4; + kptr0 += 4; + } + + sum0 += __lsx_reduce_fadd_s(_sum0); + + outptr0[0] = sum0; + + outptr0 += 1; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack4to1_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = pb-pa-maxk-inch/pa-outch/pb + Mat kernel = _kernel.reshape(maxk, inch, outch); + kernel_tm.create(4 * 4 * maxk, inch / 4, outch / 4 + outch % 4); + + int q = 0; + for (; q + 3 < outch; q += 4) + { + float* g00 = kernel_tm.channel(q / 4); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; 
i++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = kernel.channel(q + j).row(p + i); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + for (; q < outch; q++) + { + const Mat k0 = kernel.channel(q); + + float* g00 = kernel_tm.channel(q / 4 + q % 4); + + for (int p = 0; p + 3 < inch; p += 4) + { + for (int k = 0; k < maxk; k++) + { + for (int j = 0; j < 4; j++) + { + const float* k00 = k0.row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } +} + +static void convolution_im2col_sgemm_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * 4, 4, opt.workspace_allocator); + { + const int gap = (w * stride_h - outw * stride_w) * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + float* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const float* sptr = img.row(dilation_h * u) + dilation_w * v * 4; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __lsx_vst(_val, ptr, 0); + + sptr += stride_w * 4; + ptr += 4; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack4to1_lsx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack8to1_int8.h b/src/layer/loongarch/convolution_sgemm_pack8to1_int8.h new file mode 100644 index 000000000000..98d11a574b0e --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack8to1_int8.h @@ -0,0 +1,458 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
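The int8 kernels below repeatedly widen packed int8 data to int16 before multiplying: __lsx_vslti_b(x, 0) yields 0xFF for negative bytes and 0x00 otherwise, and interleaving that mask with the original bytes is exactly a sign extension; the int16 products are then folded into int32 lanes with the pairwise horizontal add __lsx_vhaddw_w_h. A scalar model of the widening step (assuming little-endian lane order, as in the vector code):

#include <cstdint>

// widen one int8 the way vilvl_b(vslti_b(v, 0), v) does per lane:
// low byte = the value, high byte = 0xFF if negative, 0x00 otherwise
static int16_t widen_i8_to_i16(int8_t v)
{
    uint8_t lo = (uint8_t)v;
    uint8_t hi = (v < 0) ? 0xFF : 0x00;
    return (int16_t)((uint16_t)lo | ((uint16_t)hi << 8));
}

// the horizontal add then turns two adjacent int16 products into one int32 partial sum
static int32_t haddw_pair(int16_t a, int16_t b)
{
    return (int32_t)a + (int32_t)b;
}

For example, widen_i8_to_i16(-3) returns 0xFFFD, i.e. -3 as an int16, so the subsequent 16-bit multiplies see the correct signed values.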
+ +static void im2col_sgemm_pack8to1_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = (size - remain_size_start) >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + int64_t* tmpptr = tmp.channel(i / 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + __m128i _v = __lsx_vld(img0, 0); + __lsx_vst(_v, tmpptr, 0); + tmpptr += 2; + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + int64_t* tmpptr = tmp.channel(i / 2 + i % 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr += 1; + img0 += size; + } + } + } + } + + int nn_outch = 0; + int remain_outch_start = 0; + + nn_outch = outch >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + int* outptr2 = top_blob.channel(p + 2); + int* outptr3 = top_blob.channel(p + 3); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 64); + __builtin_prefetch(kptr + 128); + __m128i _val01 = __lsx_vld(tmpptr, 0); + __m128i _extval01 = __lsx_vslti_b(_val01, 0); + __m128i _val0 = __lsx_vilvl_b(_extval01, _val01); + __m128i _val1 = __lsx_vilvh_b(_extval01, _val01); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s00 = __lsx_vmul_h(_val0, _w0); + __m128i _s01 = __lsx_vmul_h(_val0, _w1); + __m128i _s02 = __lsx_vmul_h(_val0, _w2); + __m128i _s03 = __lsx_vmul_h(_val0, _w3); + __m128i _s10 = __lsx_vmul_h(_val1, _w0); + __m128i _s11 = __lsx_vmul_h(_val1, _w1); + __m128i _s12 = __lsx_vmul_h(_val1, _w2); + __m128i _s13 = __lsx_vmul_h(_val1, _w3); + + _sum00 = __lsx_vadd_w(_sum00, __lsx_vhaddw_w_h(_s00, _s00)); + _sum01 = __lsx_vadd_w(_sum01, __lsx_vhaddw_w_h(_s01, _s01)); + _sum02 = __lsx_vadd_w(_sum02, 
__lsx_vhaddw_w_h(_s02, _s02)); + _sum03 = __lsx_vadd_w(_sum03, __lsx_vhaddw_w_h(_s03, _s03)); + _sum10 = __lsx_vadd_w(_sum10, __lsx_vhaddw_w_h(_s10, _s10)); + _sum11 = __lsx_vadd_w(_sum11, __lsx_vhaddw_w_h(_s11, _s11)); + _sum12 = __lsx_vadd_w(_sum12, __lsx_vhaddw_w_h(_s12, _s12)); + _sum13 = __lsx_vadd_w(_sum13, __lsx_vhaddw_w_h(_s13, _s13)); + + tmpptr += 16; + kptr += 32; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = __lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + + int sum[8]; + __lsx_vst(_sum00, sum, 0); + __lsx_vst(_sum10, sum + 4, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0[1] = sum[4]; + outptr1[1] = sum[5]; + outptr2[1] = sum[6]; + outptr3[1] = sum[7]; + outptr0 += 2; + outptr1 += 2; + outptr2 += 2; + outptr3 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 4); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr + 128); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + __m128i _s2 = __lsx_vmul_h(_val16, _w2); + __m128i _s3 = __lsx_vmul_h(_val16, _w3); + + _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0)); + _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1)); + _sum2 = __lsx_vadd_w(_sum2, __lsx_vhaddw_w_h(_s2, _s2)); + _sum3 = __lsx_vadd_w(_sum3, __lsx_vhaddw_w_h(_s3, _s3)); + + tmpptr += 8; + kptr += 32; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + _sum0 = 
__lsx_vadd_w(_sum0, _sum2); + + int sum[4]; + __lsx_vst(_sum0, sum, 0); + + outptr0[0] = sum[0]; + outptr1[0] = sum[1]; + outptr2[0] = sum[2]; + outptr3[0] = sum[3]; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + + remain_outch_start += nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 64); + __builtin_prefetch(kptr + 32); + __m128i _val01 = __lsx_vld(tmpptr, 0); + __m128i _extval01 = __lsx_vslti_b(_val01, 0); + __m128i _val0 = __lsx_vilvl_b(_extval01, _val01); + __m128i _val1 = __lsx_vilvh_b(_extval01, _val01); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val0, _w16); + __m128i _s1 = __lsx_vmul_h(_val1, _w16); + + _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0)); + _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1)); + + tmpptr += 16; + kptr += 8; + } + + outptr0[0] = __lsx_reduce_add_w(_sum0); + outptr0[1] = __lsx_reduce_add_w(_sum1); + outptr0 += 2; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p / 4 + p % 4); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr + 32); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + + _sum = __lsx_vadd_w(_sum, __lsx_vhaddw_w_h(_s0, _s0)); + + tmpptr += 8; + kptr += 8; + } + + outptr0[0] = __lsx_reduce_add_w(_sum); + outptr0 += 1; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack8to1_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 8a-4b-maxk-inch/8a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); + if (outch >= 4) + kernel_tm.create(32 * maxk, inch / 8, outch / 4 + outch % 4, (size_t)1u); + else + kernel_tm.create(8 * maxk, inch / 8, outch, (size_t)1u); + + int q = 0; + for (; q + 3 < outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 8; j++) + { + const signed char* k00 = kernel.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + // TODO unroll 2 + for (; q < outch; q++) + { + signed char* g00 = kernel_tm.channel(q / 4 + q % 4); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + for (int j = 0; j < 8; j++) + { + const signed char* k00 = kernel.channel(q).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } +} + +static void convolution_im2col_sgemm_pack8to1_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int 
dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + int64_t* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const int64_t* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack8to1_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_sgemm_pack8to4_int8.h b/src/layer/loongarch/convolution_sgemm_pack8to4_int8.h new file mode 100644 index 000000000000..ae9090c95606 --- /dev/null +++ b/src/layer/loongarch/convolution_sgemm_pack8to4_int8.h @@ -0,0 +1,324 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
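As in the pack8to1 path, each accumulation in the pack8to4 kernel below ends with an interleave-based 4x4 transpose (vilvl_w/vilvh_w followed by vilvl_d/vilvh_d) so that the four int32 partial sums belonging to one output channel land in a single vector and can be collapsed with plain vertical adds. The net effect of the two interleave stages is nothing more than a 4x4 transpose; a scalar model for reference (plain arrays standing in for the __m128i lanes):

#include <cstdint>

// net effect of the two interleave stages used after the int8 accumulation loops
static void transpose4x4(const int32_t in[4][4], int32_t out[4][4])
{
    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            out[j][i] = in[i][j]; // lane j of partial-sum vector i moves to row j
}

After the transpose, adding the four rows (as _sum00 + _sum01 + _sum02 + _sum03 in the code) yields one finished int32 result per output channel.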
+ +static void im2col_sgemm_pack8to4_int8_lsx(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + // Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + + const int size = bottom_im2col.w; + const int maxk = bottom_im2col.h; + const int inch = bottom_im2col.c; + + const int outch = top_blob.c; + + // permute + Mat tmp; + if (size >= 2) + tmp.create(2 * maxk, inch, size / 2 + size % 2, 8u, 8, opt.workspace_allocator); + else + tmp.create(maxk, inch, size, 8u, 8, opt.workspace_allocator); + { + int remain_size_start = 0; + int nn_size = size >> 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 2; + + int64_t* tmpptr = tmp.channel(i / 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + __m128i _v = __lsx_vld(img0, 0); + __lsx_vst(_v, tmpptr, 0); + tmpptr += 2; + img0 += size; + } + } + } + + remain_size_start += nn_size << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + int64_t* tmpptr = tmp.channel(i / 2 + i % 2); + + for (int q = 0; q < inch; q++) + { + const int64_t* img0 = (const int64_t*)bottom_im2col.channel(q) + i; + + for (int k = 0; k < maxk; k++) + { + tmpptr[0] = img0[0]; + tmpptr += 1; + img0 += size; + } + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* outptr0 = top_blob.channel(p); + + int i = 0; + for (; i + 1 < size; i += 2) + { + const signed char* tmpptr = tmp.channel(i / 2); + const signed char* kptr = kernel.channel(p); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 64); + __builtin_prefetch(kptr + 128); + __m128i _val01 = __lsx_vld(tmpptr, 0); + __m128i _extval01 = __lsx_vslti_b(_val01, 0); + __m128i _val0 = __lsx_vilvl_b(_extval01, _val01); + __m128i _val1 = __lsx_vilvh_b(_extval01, _val01); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s00 = __lsx_vmul_h(_val0, _w0); + __m128i _s01 = __lsx_vmul_h(_val0, _w1); + __m128i _s02 = __lsx_vmul_h(_val0, _w2); + __m128i _s03 = __lsx_vmul_h(_val0, _w3); + __m128i _s10 = __lsx_vmul_h(_val1, _w0); + __m128i _s11 = __lsx_vmul_h(_val1, _w1); + __m128i _s12 = __lsx_vmul_h(_val1, _w2); + __m128i _s13 = __lsx_vmul_h(_val1, _w3); + + _sum00 = __lsx_vadd_w(_sum00, __lsx_vhaddw_w_h(_s00, _s00)); + _sum01 = __lsx_vadd_w(_sum01, __lsx_vhaddw_w_h(_s01, _s01)); + _sum02 = __lsx_vadd_w(_sum02, __lsx_vhaddw_w_h(_s02, _s02)); + _sum03 = __lsx_vadd_w(_sum03, __lsx_vhaddw_w_h(_s03, _s03)); + _sum10 = __lsx_vadd_w(_sum10, __lsx_vhaddw_w_h(_s10, _s10)); + _sum11 = __lsx_vadd_w(_sum11, __lsx_vhaddw_w_h(_s11, _s11)); + _sum12 = __lsx_vadd_w(_sum12, __lsx_vhaddw_w_h(_s12, 
_s12)); + _sum13 = __lsx_vadd_w(_sum13, __lsx_vhaddw_w_h(_s13, _s13)); + + tmpptr += 16; + kptr += 32; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = __lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum10, outptr0 + 4, 0); + outptr0 += 8; + } + for (; i < size; i++) + { + const signed char* tmpptr = tmp.channel(i / 2 + i % 2); + const signed char* kptr = kernel.channel(p); + + int nn = inch * maxk; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(tmpptr + 32); + __builtin_prefetch(kptr + 128); + __m128i _val = __lsx_vld(tmpptr, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 16, 0); + __m128i _extw01 = __lsx_vslti_b(_w01, 0); + __m128i _extw23 = __lsx_vslti_b(_w23, 0); + __m128i _w0 = __lsx_vilvl_b(_extw01, _w01); + __m128i _w1 = __lsx_vilvh_b(_extw01, _w01); + __m128i _w2 = __lsx_vilvl_b(_extw23, _w23); + __m128i _w3 = __lsx_vilvh_b(_extw23, _w23); + + __m128i _s0 = __lsx_vmul_h(_val16, _w0); + __m128i _s1 = __lsx_vmul_h(_val16, _w1); + __m128i _s2 = __lsx_vmul_h(_val16, _w2); + __m128i _s3 = __lsx_vmul_h(_val16, _w3); + + _sum0 = __lsx_vadd_w(_sum0, __lsx_vhaddw_w_h(_s0, _s0)); + _sum1 = __lsx_vadd_w(_sum1, __lsx_vhaddw_w_h(_s1, _s1)); + _sum2 = __lsx_vadd_w(_sum2, __lsx_vhaddw_w_h(_s2, _s2)); + _sum3 = __lsx_vadd_w(_sum3, __lsx_vhaddw_w_h(_s3, _s3)); + + tmpptr += 8; + kptr += 32; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + _sum0 = __lsx_vadd_w(_sum0, _sum2); + + __lsx_vst(_sum0, outptr0, 0); + outptr0 += 4; + } + } +} + +static void convolution_im2col_sgemm_transform_kernel_pack8to4_int8_lsx(const Mat& _kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // interleave + // src = maxk-inch-outch + // dst = 8a-4b-maxk-inch/8a-outch/4b + Mat kernel = _kernel.reshape(maxk, inch, outch); + kernel_tm.create(32 * maxk, inch / 8, outch / 4, (size_t)1u); + + for (int q = 0; q + 3 < 
outch; q += 4) + { + signed char* g00 = kernel_tm.channel(q / 4); + + for (int p = 0; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 4; i++) + { + for (int j = 0; j < 8; j++) + { + const signed char* k00 = kernel.channel(q + i).row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } +} + +static void convolution_im2col_sgemm_pack8to4_int8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 8u, 8, opt.workspace_allocator); + { + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < inch; p++) + { + const Mat img = bottom_blob.channel(p); + int64_t* ptr = bottom_im2col.channel(p); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const int64_t* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + } + + im2col_sgemm_pack8to4_int8_lsx(bottom_im2col, top_blob, kernel, opt); +} diff --git a/src/layer/loongarch/convolution_winograd_dot.h b/src/layer/loongarch/convolution_winograd_dot.h new file mode 100644 index 000000000000..9dbbe4955490 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot.h @@ -0,0 +1,495 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
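convolution_winograd_dot_lsx below implements only the multiplication stage of Winograd convolution: the input and kernel have already been transformed, and for each of the 16/36/64 transformed positions (the batch dimension) the tiles are combined by an independent small matrix multiply over the input channels. A naive scalar reference of that stage, for orientation only (flat arrays with an assumed [channel][batch][tile] input and [outch][inch][batch] kernel layout, not the actual Mat layout used by the patch):

#include <vector>

// naive Winograd dot stage: top[o][b][t] = sum_c bottom[c][b][t] * kernel[o][c][b]
static void winograd_dot_ref(const std::vector<float>& bottom, const std::vector<float>& kernel,
                             std::vector<float>& top, int inch, int outch, int batch, int tiles)
{
    top.assign((size_t)outch * batch * tiles, 0.f);
    for (int o = 0; o < outch; o++)
        for (int b = 0; b < batch; b++)
            for (int t = 0; t < tiles; t++)
            {
                float sum = 0.f;
                for (int c = 0; c < inch; c++)
                    sum += bottom[((size_t)c * batch + b) * tiles + t] * kernel[((size_t)o * inch + c) * batch + b];
                top[((size_t)o * batch + b) * tiles + t] = sum;
            }
}

The LSX version computes the same sums, but repacks the tiles in groups of four and unrolls the output channels by eight and four so the inner loop stays in vector registers.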
+ +static void convolution_winograd_dot_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 4u, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; + if (tiles >= 4) + bottom_blob_tm2.create(4 * inch, tiles / 4 + tiles % 4, batch, 4u, opt.workspace_allocator); + else + bottom_blob_tm2.create(1 * inch, tiles, batch, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 3 < tiles; i += 4) + { + float* tmpptr = tm2.row(i / 4); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i); + + for (int q = 0; q < inch; q++) + { +#if __loongarch_sx + __lsx_vst(__lsx_vld(r0, 0), tmpptr, 0); +#else + tmpptr[0] = r0[0]; + tmpptr[1] = r0[1]; + tmpptr[2] = r0[2]; + tmpptr[3] = r0[3]; +#endif + + r0 += bottom_blob_tm.cstep; + tmpptr += 4; + } + } + for (; i < tiles; i++) + { + float* tmpptr = tm2.row(i / 4 + i % 4); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i); + + for (int q = 0; q < inch; q++) + { + tmpptr[0] = r0[0]; + + r0 += bottom_blob_tm.cstep; + tmpptr += 1; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 4u, opt.workspace_allocator); + +#if __loongarch_sx + int nn_outch = outch >> 3; + int remain_outch_start = nn_outch << 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 8; + + float* output0_tm = top_blob_tm.channel(p); + float* output1_tm = top_blob_tm.channel(p + 1); + float* output2_tm = top_blob_tm.channel(p + 2); + float* output3_tm = top_blob_tm.channel(p + 3); + float* output4_tm = top_blob_tm.channel(p + 4); + float* output5_tm = top_blob_tm.channel(p + 5); + float* output6_tm = top_blob_tm.channel(p + 6); + float* output7_tm = top_blob_tm.channel(p + 7); + + const Mat kernel0_tm = kernel_tm.channel(p / 8); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum4 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum5 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum6 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum7 = (__m128)__lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 32); + __m128 _val = (__m128)__lsx_vld(r0, 0); + __m128i _w0123 = __lsx_vld(k0, 0); + __m128i _w4567 = __lsx_vld(k0 + 4, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3); + _sum4 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 0), _val, _sum4); + _sum5 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 1), _val, _sum5); + _sum6 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 2), _val, _sum6); + _sum7 
= __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w4567, 3), _val, _sum7); + + r0 += 4; + k0 += 8; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output1_tm, 0); + __lsx_vst(_sum2, output2_tm, 0); + __lsx_vst(_sum3, output3_tm, 0); + __lsx_vst(_sum4, output4_tm, 0); + __lsx_vst(_sum5, output5_tm, 0); + __lsx_vst(_sum6, output6_tm, 0); + __lsx_vst(_sum7, output7_tm, 0); + + output0_tm += 4; + output1_tm += 4; + output2_tm += 4; + output3_tm += 4; + output4_tm += 4; + output5_tm += 4; + output6_tm += 4; + output7_tm += 4; + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 4 + i % 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + float sum4 = 0.f; + float sum5 = 0.f; + float sum6 = 0.f; + float sum7 = 0.f; + + int j = 0; + for (; j < nn; j++) + { + sum0 += r0[0] * k0[0]; + sum1 += r0[0] * k0[1]; + sum2 += r0[0] * k0[2]; + sum3 += r0[0] * k0[3]; + sum4 += r0[0] * k0[4]; + sum5 += r0[0] * k0[5]; + sum6 += r0[0] * k0[6]; + sum7 += r0[0] * k0[7]; + + r0 += 1; + k0 += 8; + } + + output0_tm[0] = sum0; + output1_tm[0] = sum1; + output2_tm[0] = sum2; + output3_tm[0] = sum3; + output4_tm[0] = sum4; + output5_tm[0] = sum5; + output6_tm[0] = sum6; + output7_tm[0] = sum7; + + output0_tm++; + output1_tm++; + output2_tm++; + output3_tm++; + output4_tm++; + output5_tm++; + output6_tm++; + output7_tm++; + } + } + } + + nn_outch = (outch - remain_outch_start) >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = remain_outch_start + pp * 4; + + float* output0_tm = top_blob_tm.channel(p); + float* output1_tm = top_blob_tm.channel(p + 1); + float* output2_tm = top_blob_tm.channel(p + 2); + float* output3_tm = top_blob_tm.channel(p + 3); + + const Mat kernel0_tm = kernel_tm.channel(p / 8 + (p % 8) / 4); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 16); + __m128 _val = (__m128)__lsx_vld(r0, 0); + __m128i _w0123 = __lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w0123, 3), _val, _sum3); + + r0 += 4; + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output1_tm, 0); + __lsx_vst(_sum2, output2_tm, 0); + __lsx_vst(_sum3, output3_tm, 0); + + output0_tm += 4; + output1_tm += 4; + output2_tm += 4; + output3_tm += 4; + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 4 + i % 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + + int j = 0; + for (; j < nn; j++) + { + sum0 += r0[0] * k0[0]; + sum1 += r0[0] * k0[1]; + sum2 += r0[0] * k0[2]; + sum3 += r0[0] * k0[3]; + + r0 += 1; + k0 += 4; + } + + output0_tm[0] = sum0; + output1_tm[0] = sum1; + 
output2_tm[0] = sum2; + output3_tm[0] = sum3; + + output0_tm++; + output1_tm++; + output2_tm++; + output3_tm++; + } + } + } + + remain_outch_start += nn_outch << 2; +#else + int nn_outch = outch >> 1; + int remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + float* output0_tm = top_blob_tm.channel(p); + float* output1_tm = top_blob_tm.channel(p + 1); + + const Mat kernel0_tm = kernel_tm.channel(p / 2); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum00 = 0.f; + float sum01 = 0.f; + float sum02 = 0.f; + float sum03 = 0.f; + float sum10 = 0.f; + float sum11 = 0.f; + float sum12 = 0.f; + float sum13 = 0.f; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 8); + float w0 = k0[0]; + float w1 = k0[1]; + sum00 += r0[0] * w0; + sum01 += r0[1] * w0; + sum02 += r0[2] * w0; + sum03 += r0[3] * w0; + sum10 += r0[0] * w1; + sum11 += r0[1] * w1; + sum12 += r0[2] * w1; + sum13 += r0[3] * w1; + + r0 += 4; + k0 += 2; + } + + output0_tm[0] = sum00; + output0_tm[1] = sum01; + output0_tm[2] = sum02; + output0_tm[3] = sum03; + output1_tm[0] = sum10; + output1_tm[1] = sum11; + output1_tm[2] = sum12; + output1_tm[3] = sum13; + + output0_tm += 4; + output1_tm += 4; + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 4 + i % 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum00 = 0.f; + float sum10 = 0.f; + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 4); + __builtin_prefetch(k0 + 8); + float val0 = r0[0]; + sum00 += val0 * k0[0]; + sum10 += val0 * k0[1]; + + r0 += 1; + k0 += 2; + } + + output0_tm[0] = sum00; + output1_tm[0] = sum10; + output0_tm++; + output1_tm++; + } + } + } +#endif + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + float* output0_tm = top_blob_tm.channel(p); + +#if __loongarch_sx + const Mat kernel0_tm = kernel_tm.channel(p / 8 + (p % 8) / 4 + p % 4); +#else + const Mat kernel0_tm = kernel_tm.channel(p / 2 + p % 2); +#endif + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + int j = 0; +#if __loongarch_sx + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + + for (; j < nn; j++) + { + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vld(r0, 0), __lsx_vreplfr2vr_s(k0[0]), _sum0); + r0 += 4; + k0++; + } + + __lsx_vst(_sum0, output0_tm, 0); + output0_tm += 4; +#else // __loongarch_sx + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + + for (; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 4); + float w0 = k0[0]; + sum0 += r0[0] * w0; + sum1 += r0[1] * w0; + sum2 += r0[2] * w0; + sum3 += r0[3] * w0; + + r0 += 4; + k0++; + } + + output0_tm[0] = sum0; + output0_tm[1] = sum1; + output0_tm[2] = sum2; + output0_tm[3] = sum3; + output0_tm += 4; +#endif // __loongarch_sx + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 4 + i % 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + float sum = 0.f; + + for (int j = 0; j < nn; 
j++) + { + float w0 = k0[0]; + float val0 = r0[0]; + sum += val0 * w0; + + r0 += 1; + k0 += 1; + } + + output0_tm[0] = sum; + output0_tm += 1; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_dot_int8.h b/src/layer/loongarch/convolution_winograd_dot_int8.h new file mode 100644 index 000000000000..2ae5ce4f55eb --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot_int8.h @@ -0,0 +1,594 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_winograd_dot_int8_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 2u, 1, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; +#if __loongarch_sx + if (inch >= 4) + { + if (tiles >= 2) + bottom_blob_tm2.create(inch / 4 + inch % 4, tiles / 2 + tiles % 2, batch, 16u, 8, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(inch / 4 + inch % 4, tiles, batch, 8u, 4, opt.workspace_allocator); + } + else +#endif // __loongarch_sx + { + if (tiles >= 2) + bottom_blob_tm2.create(inch, tiles / 2 + tiles % 2, batch, 4u, 2, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(inch, tiles, batch, 2u, 1, opt.workspace_allocator); + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 1 < tiles; i += 2) + { + short* tmpptr = tm2.row(i / 2); + + const short* r0 = (const short*)bottom_blob_tm + r * tiles + i; + + int q = 0; +#if __loongarch_sx + const short* r1 = (const short*)bottom_blob_tm.channel(1) + r * tiles + i; + const short* r2 = (const short*)bottom_blob_tm.channel(2) + r * tiles + i; + const short* r3 = (const short*)bottom_blob_tm.channel(3) + r * tiles + i; + for (; q + 3 < inch; q += 4) + { + tmpptr[0] = r0[0]; + tmpptr[1] = r1[0]; + tmpptr[2] = r2[0]; + tmpptr[3] = r3[0]; + tmpptr[4] = r0[1]; + tmpptr[5] = r1[1]; + tmpptr[6] = r2[1]; + tmpptr[7] = r3[1]; + r0 += bottom_blob_tm.cstep * 4; + r1 += bottom_blob_tm.cstep * 4; + r2 += bottom_blob_tm.cstep * 4; + r3 += bottom_blob_tm.cstep * 4; + tmpptr += 8; + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + tmpptr[0] = r0[0]; + tmpptr[1] = r0[1]; + r0 += bottom_blob_tm.cstep; + tmpptr += 2; + } + } + for (; i < tiles; i++) + { + short* tmpptr = tm2.row(i / 2 + i % 2); + + const short* r0 = (const short*)bottom_blob_tm + r * tiles + i; + + int q = 0; +#if __loongarch_sx + const short* r1 = (const short*)bottom_blob_tm.channel(1) + r * tiles + i; + const short* r2 = (const short*)bottom_blob_tm.channel(2) + r * tiles + i; + const short* r3 = (const short*)bottom_blob_tm.channel(3) + r * tiles + i; + for (; q + 3 < inch; q += 4) + { + 
tmpptr[0] = r0[0]; + tmpptr[1] = r1[0]; + tmpptr[2] = r2[0]; + tmpptr[3] = r3[0]; + r0 += bottom_blob_tm.cstep * 4; + r1 += bottom_blob_tm.cstep * 4; + r2 += bottom_blob_tm.cstep * 4; + r3 += bottom_blob_tm.cstep * 4; + tmpptr += 4; + } +#endif // __loongarch_sx + for (; q < inch; q++) + { + tmpptr[0] = r0[0]; + r0 += bottom_blob_tm.cstep; + tmpptr += 1; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 4u, 1, opt.workspace_allocator); + +#if __loongarch_sx + int nn_outch = outch >> 2; + int remain_outch_start = nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* output0_tm = top_blob_tm.channel(p); + int* output1_tm = top_blob_tm.channel(p + 1); + int* output2_tm = top_blob_tm.channel(p + 2); + int* output3_tm = top_blob_tm.channel(p + 3); + + const Mat kernel0_tm = kernel_tm.channel(p / 4); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int nn4 = inch / 4; + int nn1 = inch % 4; + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum02 = __lsx_vreplgr2vr_w(0); + __m128i _sum03 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum12 = __lsx_vreplgr2vr_w(0); + __m128i _sum13 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val01 = __lsx_vld(r0, 0); + + __m128i _val0 = __lsx_vilvl_d(_val01, _val01); + __m128i _val1 = __lsx_vilvh_d(_val01, _val01); + + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + + __m128i _extval0 = __lsx_vslti_h(_val0, 0); + __m128i _extval1 = __lsx_vslti_h(_val1, 0); + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + + __m128i _val0l = __lsx_vilvl_h(_extval0, _val0); + __m128i _val0h = __lsx_vilvh_h(_extval0, _val0); + __m128i _val1l = __lsx_vilvl_h(_extval1, _val1); + __m128i _val1h = __lsx_vilvh_h(_extval1, _val1); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + + _sum00 = __lsx_vmadd_w(_sum00, _val0l, _w0l); + _sum01 = __lsx_vmadd_w(_sum01, _val0h, _w0h); + _sum02 = __lsx_vmadd_w(_sum02, _val0l, _w1l); + _sum03 = __lsx_vmadd_w(_sum03, _val0h, _w1h); + _sum10 = __lsx_vmadd_w(_sum10, _val1l, _w0l); + _sum11 = __lsx_vmadd_w(_sum11, _val1h, _w0h); + _sum12 = __lsx_vmadd_w(_sum12, _val1l, _w1l); + _sum13 = __lsx_vmadd_w(_sum13, _val1h, _w1h); + + r0 += 8; + k0 += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum01, _sum00); + _tmp1 = __lsx_vilvl_w(_sum03, _sum02); + _tmp2 = __lsx_vilvh_w(_sum01, _sum00); + _tmp3 = __lsx_vilvh_w(_sum03, _sum02); + _sum00 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum01 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum02 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum03 = __lsx_vilvh_d(_tmp3, _tmp2); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum11, _sum10); + _tmp1 = __lsx_vilvl_w(_sum13, _sum12); + _tmp2 = __lsx_vilvh_w(_sum11, _sum10); + _tmp3 = __lsx_vilvh_w(_sum13, _sum12); + _sum10 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum11 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum12 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum13 = __lsx_vilvh_d(_tmp3, _tmp2); 
+ } + + _sum00 = __lsx_vadd_w(_sum00, _sum01); + _sum02 = __lsx_vadd_w(_sum02, _sum03); + _sum10 = __lsx_vadd_w(_sum10, _sum11); + _sum12 = __lsx_vadd_w(_sum12, _sum13); + + _sum00 = __lsx_vadd_w(_sum00, _sum02); + _sum10 = __lsx_vadd_w(_sum10, _sum12); + } + + for (int j = 0; j < nn1; j++) + { + __m128i _val0 = __lsx_vreplgr2vr_h(r0[0]); + __m128i _val1 = __lsx_vreplgr2vr_h(r0[1]); + __m128i _val = __lsx_vilvl_d(_val1, _val0); + + __m128i _w16 = __lsx_vld(k0, 0); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _extval = __lsx_vslti_h(_val, 0); + __m128i _extw16 = __lsx_vslti_h(_w16, 0); + + __m128i _vall = __lsx_vilvl_h(_extval, _val); + __m128i _valh = __lsx_vilvh_h(_extval, _val); + __m128i _w0l = __lsx_vilvl_h(_extw16, _w16); + __m128i _w0h = __lsx_vilvh_h(_extw16, _w16); + + _sum00 = __lsx_vmadd_w(_sum00, _vall, _w0l); + _sum10 = __lsx_vmadd_w(_sum10, _valh, _w0h); + + r0 += 2; + k0 += 4; + } + + int sum[8]; + __lsx_vst(_sum00, sum, 0); + __lsx_vst(_sum10, sum + 4, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; + output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm[1] = sum[4]; + output1_tm[1] = sum[5]; + output2_tm[1] = sum[6]; + output3_tm[1] = sum[7]; + output0_tm += 2; + output1_tm += 2; + output2_tm += 2; + output3_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int nn4 = inch / 4; + int nn1 = inch % 4; + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + + if (nn4 > 0) + { + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val16 = __lsx_vld(r0, 0); + + _val16 = __lsx_vilvl_d(_val16, _val16); + + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + + __m128i _extval16 = __lsx_vslti_h(_val16, 0); + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + + __m128i _val0l = __lsx_vilvl_h(_extval16, _val16); + __m128i _val0h = __lsx_vilvh_h(_extval16, _val16); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + + _sum0 = __lsx_vmadd_w(_sum0, _val0l, _w0l); + _sum1 = __lsx_vmadd_w(_sum1, _val0h, _w0h); + _sum2 = __lsx_vmadd_w(_sum2, _val0l, _w1l); + _sum3 = __lsx_vmadd_w(_sum3, _val0h, _w1h); + + r0 += 4; + k0 += 16; + } + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = __lsx_vilvl_w(_sum1, _sum0); + _tmp1 = __lsx_vilvl_w(_sum3, _sum2); + _tmp2 = __lsx_vilvh_w(_sum1, _sum0); + _tmp3 = __lsx_vilvh_w(_sum3, _sum2); + _sum0 = __lsx_vilvl_d(_tmp1, _tmp0); + _sum1 = __lsx_vilvh_d(_tmp1, _tmp0); + _sum2 = __lsx_vilvl_d(_tmp3, _tmp2); + _sum3 = __lsx_vilvh_d(_tmp3, _tmp2); + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + _sum0 = __lsx_vadd_w(_sum0, _sum2); + } + + for (int j = 0; j < nn1; j++) + { + __m128i _val = __lsx_vreplgr2vr_w(r0[0]); + __m128i _w16 = __lsx_vld(k0, 0); + + __m128i _extw16 = __lsx_vslti_h(_w16, 0); + __m128i _w0l = __lsx_vilvl_h(_extw16, _w16); + + _sum0 = __lsx_vmadd_w(_sum0, _val, _w0l); + + r0 += 1; + k0 += 4; + } + + int sum[4]; + __lsx_vst(_sum0, sum, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; + output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm += 1; + output1_tm += 1; + output2_tm += 1; + output3_tm += 1; + } + } + } +#else // __loongarch_sx + int nn_outch = outch >> 1; + int 
remain_outch_start = nn_outch << 1; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 2; + + int* output0_tm = top_blob_tm.channel(p); + int* output1_tm = top_blob_tm.channel(p + 1); + + const Mat kernel0_tm = kernel_tm.channel(p / 2); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int sum00 = 0; + int sum01 = 0; + int sum10 = 0; + int sum11 = 0; + + int nn1 = inch; + + for (int j = 0; j < nn1; j++) + { + signed short val0 = r0[0]; + signed short val1 = r0[1]; + signed short w0 = k0[0]; + signed short w1 = k0[1]; + + sum00 += val0 * w0; + sum01 += val1 * w0; + sum10 += val0 * w1; + sum11 += val1 * w1; + + r0 += 2; + k0 += 2; + } + + output0_tm[0] = sum00; + output0_tm[1] = sum01; + output1_tm[0] = sum10; + output1_tm[1] = sum11; + output0_tm += 2; + output1_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int sum0 = 0; + int sum1 = 0; + + int nn1 = inch; + + for (int j = 0; j < nn1; j++) + { + signed short val0 = r0[0]; + signed short w0 = k0[0]; + signed short w1 = k0[1]; + + sum0 += val0 * w0; + sum1 += val0 * w1; + + r0 += 1; + k0 += 2; + } + + output0_tm[0] = sum0; + output1_tm[0] = sum1; + output0_tm += 1; + output1_tm += 1; + } + } + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* output0_tm = top_blob_tm.channel(p); + +#if __loongarch_sx + const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4); +#else + const Mat kernel0_tm = kernel_tm.channel(p / 2 + p % 2); +#endif + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int sum0 = 0; + int sum1 = 0; + +#if __loongarch_sx + int nn4 = inch / 4; + int nn1 = inch % 4; + + if (nn4 > 0) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val16 = __lsx_vld(r0, 0); + + __m128i _w16 = __lsx_vld(k0, 0); + + _w16 = __lsx_vilvl_d(_w16, _w16); + + __m128i _extval16 = __lsx_vslti_h(_val16, 0); + __m128i _extw16 = __lsx_vslti_h(_w16, 0); + + __m128i _val0l = __lsx_vilvl_h(_extval16, _val16); + __m128i _val0h = __lsx_vilvh_h(_extval16, _val16); + + __m128i _w0l = __lsx_vilvl_h(_extw16, _w16); + __m128i _w0h = __lsx_vilvh_h(_extw16, _w16); + + _sum0 = __lsx_vmadd_w(_sum0, _val0l, _w0l); + _sum1 = __lsx_vmadd_w(_sum1, _val0h, _w0h); + + r0 += 8; + k0 += 4; + } + + sum0 = __lsx_reduce_add_w(_sum0); + sum1 = __lsx_reduce_add_w(_sum1); + } +#else // __loongarch_sx + int nn1 = inch; +#endif // __loongarch_sx + + for (int q = 0; q < nn1; q++) + { + signed short val0 = r0[0]; + signed short val1 = r0[1]; + signed short w = k0[0]; + + sum0 += val0 * w; + sum1 += val1 * w; + + k0 += 1; + r0 += 2; + } + + output0_tm[0] = sum0; + output0_tm[1] = sum1; + output0_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int sum = 0; + +#if __loongarch_sx + int nn4 = inch / 4; + int nn1 = inch % 4; + + if (nn4 > 0) + { + __m128i _sum = __lsx_vreplgr2vr_w(0); + + int j = 0; + for (; j < nn4; j++) + { + __m128i _val16 = __lsx_vld(r0, 0); + 
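+                        // the pattern below widens int16 lanes to int32: vslti_h(x, 0)
+                        // builds an all-ones mask in every lane where x is negative, and
+                        // vilvl_h/vilvh_h interleave that mask above the values, giving
+                        // sign-extended 32-bit lanes ready for __lsx_vmadd_w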
__m128i _w16 = __lsx_vld(k0, 0); + + __m128i _extval16 = __lsx_vslti_h(_val16, 0); + __m128i _extw16 = __lsx_vslti_h(_w16, 0); + + __m128i _val0l = __lsx_vilvl_h(_extval16, _val16); + __m128i _w0l = __lsx_vilvl_h(_extw16, _w16); + + _sum = __lsx_vmadd_w(_sum, _val0l, _w0l); + + r0 += 4; + k0 += 4; + } + + sum = __lsx_reduce_add_w(_sum); + } +#else // __loongarch_sx + int nn1 = inch; +#endif // __loongarch_sx + + for (int q = 0; q < nn1; q++) + { + signed short val = r0[0]; + signed short w = k0[0]; + + sum += val * w; + + k0 += 1; + r0 += 1; + } + + output0_tm[0] = sum; + output0_tm++; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_dot_pack4.h b/src/layer/loongarch/convolution_winograd_dot_pack4.h new file mode 100644 index 000000000000..66002a62a625 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot_pack4.h @@ -0,0 +1,448 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_winograd_dot_pack4_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 16u, 4, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; + if (tiles >= 12) + bottom_blob_tm2.create(12 * inch, tiles / 12 + (tiles % 12) / 8 + (tiles % 12 % 8) / 4 + (tiles % 12 % 4) / 2 + tiles % 12 % 2, batch, 16u, 4, opt.workspace_allocator); + else if (tiles >= 8) + bottom_blob_tm2.create(8 * inch, tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2, batch, 16u, 4, opt.workspace_allocator); + else if (tiles >= 4) + bottom_blob_tm2.create(4 * inch, tiles / 4 + (tiles % 4) / 2 + tiles % 2, batch, 16u, 4, opt.workspace_allocator); + else if (tiles >= 2) + bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, batch, 16u, 4, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(1 * inch, tiles, batch, 16u, 4, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 11 < tiles; i += 12) + { + float* tmpptr = tm2.row(i / 12); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + // transpose 4x8 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(r0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(r0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(r0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(r0 + 4 * 7, 0); + __m128i _r8 = __lsx_vld(r0 + 4 * 8, 0); + __m128i _r9 = __lsx_vld(r0 + 4 * 9, 0); + __m128i _ra = __lsx_vld(r0 + 4 * 10, 0); + __m128i _rb = __lsx_vld(r0 + 4 * 11, 0); + + __m128i _r01r = 
__lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r89r = __lsx_vilvl_w(_r9, _r8); + __m128i _r89l = __lsx_vilvh_w(_r9, _r8); + __m128i _rabr = __lsx_vilvl_w(_rb, _ra); + __m128i _rabl = __lsx_vilvh_w(_rb, _ra); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + __m128i _r89ab_0 = __lsx_vilvl_d(_rabr, _r89r); + __m128i _r89ab_1 = __lsx_vilvh_d(_rabr, _r89r); + __m128i _r89ab_2 = __lsx_vilvl_d(_rabl, _r89l); + __m128i _r89ab_3 = __lsx_vilvh_d(_rabl, _r89l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r89ab_0, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 4, 0); + __lsx_vst(_r89ab_1, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 7, 0); + __lsx_vst(_r89ab_2, tmpptr + 4 * 8, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 9, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 10, 0); + __lsx_vst(_r89ab_3, tmpptr + 4 * 11, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 48; + } + } + for (; i + 7 < tiles; i += 8) + { + float* tmpptr = tm2.row(i / 12 + (i % 12) / 8); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + // transpose 4x8 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + __m128i _r4 = __lsx_vld(r0 + 4 * 4, 0); + __m128i _r5 = __lsx_vld(r0 + 4 * 5, 0); + __m128i _r6 = __lsx_vld(r0 + 4 * 6, 0); + __m128i _r7 = __lsx_vld(r0 + 4 * 7, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r45r = __lsx_vilvl_w(_r5, _r4); + __m128i _r45l = __lsx_vilvh_w(_r5, _r4); + __m128i _r67r = __lsx_vilvl_w(_r7, _r6); + __m128i _r67l = __lsx_vilvh_w(_r7, _r6); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + __m128i _r4567_0 = __lsx_vilvl_d(_r67r, _r45r); + __m128i _r4567_1 = __lsx_vilvh_d(_r67r, _r45r); + __m128i _r4567_2 = __lsx_vilvl_d(_r67l, _r45l); + __m128i _r4567_3 = __lsx_vilvh_d(_r67l, _r45l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r4567_0, tmpptr + 4, 0); + __lsx_vst(_r0123_1, tmpptr + 4 * 2, 0); + __lsx_vst(_r4567_1, tmpptr + 4 * 3, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 4, 0); + __lsx_vst(_r4567_2, tmpptr + 4 * 5, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 6, 0); + __lsx_vst(_r4567_3, tmpptr + 4 * 7, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 32; + } + } + for (; i + 3 < tiles; i += 4) + { + float* tmpptr = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + // transpose 4x4 
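+                // (each _rN below holds the four packed channels of one tile; the
+                // vilvl_w/vilvh_w word interleaves followed by vilvl_d/vilvh_d pair
+                // them up so that each output vector holds one channel across the
+                // four tiles, i.e. the 4x4 block is transposed before being stored)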
+ __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, tmpptr, 0); + __lsx_vst(_r0123_1, tmpptr + 4, 0); + __lsx_vst(_r0123_2, tmpptr + 4 * 2, 0); + __lsx_vst(_r0123_3, tmpptr + 4 * 3, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 16; + } + } + for (; i + 1 < tiles; i += 2) + { + float* tmpptr = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + // transpose 4x2 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + + __m128i _r01_0 = __lsx_vilvl_w(_r1, _r0); + __m128i _r01_1 = __lsx_vilvh_w(_r1, _r0); + + __lsx_vst(_r01_0, tmpptr, 0); + __lsx_vst(_r01_1, tmpptr + 4, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 8; + } + } + for (; i < tiles; i++) + { + float* tmpptr = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2); + + const float* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 4; + + for (int q = 0; q < inch; q++) + { + __m128i _val = __lsx_vld(r0, 0); + __lsx_vst(_val, tmpptr, 0); + + r0 += bottom_blob_tm.cstep * 4; + tmpptr += 4; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 16u, 4, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* output0_tm = top_blob_tm.channel(p); + + const Mat kernel0_tm = kernel_tm.channel(p); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 11 < tiles; i += 12) + { + const float* r0 = bb2.row(i / 12); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum4 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum5 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum6 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum7 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum8 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum9 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _suma = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sumb = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 48); + __builtin_prefetch(k0 + 16); + __m128i _val0123 = __lsx_vld(r0, 0); + __m128i _val4567 = __lsx_vld(r0 + 4, 0); + __m128i _val89ab = __lsx_vld(r0 + 8, 0); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4); + _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5); + _sum6 = 
__lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6); + _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7); + _sum8 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 0), _sum8); + _sum9 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 1), _sum9); + _suma = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 2), _suma); + _sumb = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val89ab, 3), _sumb); + + r0 += 12; + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output0_tm + 4, 0); + __lsx_vst(_sum2, output0_tm + 4 * 2, 0); + __lsx_vst(_sum3, output0_tm + 4 * 3, 0); + __lsx_vst(_sum4, output0_tm + 4 * 4, 0); + __lsx_vst(_sum5, output0_tm + 4 * 5, 0); + __lsx_vst(_sum6, output0_tm + 4 * 6, 0); + __lsx_vst(_sum7, output0_tm + 4 * 7, 0); + __lsx_vst(_sum8, output0_tm + 4 * 8, 0); + __lsx_vst(_sum9, output0_tm + 4 * 9, 0); + __lsx_vst(_suma, output0_tm + 4 * 10, 0); + __lsx_vst(_sumb, output0_tm + 4 * 11, 0); + + output0_tm += 4 * 12; + } + for (; i + 7 < tiles; i += 8) + { + const float* r0 = bb2.row(i / 12 + (i % 12) / 8); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum4 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum5 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum6 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum7 = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 16); + __m128i _val0123 = __lsx_vld(r0, 0); + __m128i _val4567 = __lsx_vld(r0 + 4, 0); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + _sum4 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 0), _sum4); + _sum5 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 1), _sum5); + _sum6 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 2), _sum6); + _sum7 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val4567, 3), _sum7); + + r0 += 8; + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output0_tm + 4, 0); + __lsx_vst(_sum2, output0_tm + 4 * 2, 0); + __lsx_vst(_sum3, output0_tm + 4 * 3, 0); + __lsx_vst(_sum4, output0_tm + 4 * 4, 0); + __lsx_vst(_sum5, output0_tm + 4 * 5, 0); + __lsx_vst(_sum6, output0_tm + 4 * 6, 0); + __lsx_vst(_sum7, output0_tm + 4 * 7, 0); + + output0_tm += 4 * 8; + } + for (; i + 3 < tiles; i += 4) + { + const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(k0 + 16); + __m128i _val0123 = __lsx_vld(r0, 0); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w0, 
(__m128)__lsx_vreplvei_w(_val0123, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val0123, 3), _sum3); + + r0 += 4; + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output0_tm + 4, 0); + __lsx_vst(_sum2, output0_tm + 4 * 2, 0); + __lsx_vst(_sum3, output0_tm + 4 * 3, 0); + + output0_tm += 4 * 4; + } + for (; i + 1 < tiles; i += 2) + { + const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 8); + __builtin_prefetch(k0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*r0++); + __m128 _val1 = __lsx_vreplfr2vr_s(*r0++); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum0 = __lsx_vfmadd_s(_w0, _val0, _sum0); + _sum1 = __lsx_vfmadd_s(_w0, _val1, _sum1); + + k0 += 4; + } + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum1, output0_tm + 4, 0); + + output0_tm += 4 * 2; + } + for (; i < tiles; i++) + { + const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2); + const float* k0 = kernel0_tm.row(r); + + int nn = inch * 4; // inch always > 0 + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 4); + __builtin_prefetch(k0 + 16); + __m128 _val0 = __lsx_vreplfr2vr_s(*r0++); + __m128 _w0 = (__m128)__lsx_vld(k0, 0); + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + + k0 += 4; + } + + __lsx_vst(_sum, output0_tm, 0); + + output0_tm += 4; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_dot_pack8to1_int8.h b/src/layer/loongarch/convolution_winograd_dot_pack8to1_int8.h new file mode 100644 index 000000000000..f87aa9ef558a --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot_pack8to1_int8.h @@ -0,0 +1,363 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
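+
+// Rough scalar picture of the reduction performed by the kernels in this file
+// (for orientation only; input_tm / kernel_tm / output_tm are illustrative names,
+// not identifiers used below): for every output channel p, transform index r and
+// tile i,
+//
+//     int sum = 0;
+//     for (int q = 0; q < inch; q++)              // inch is in pack8 layout
+//         for (int k = 0; k < 8; k++)
+//             sum += (int)input_tm[q * 8 + k] * (int)kernel_tm[q * 8 + k];
+//     output_tm[i] = sum;                         // stored as pack1 int32
+//
+// The vector code does the same multiply-accumulate on sign-extended int32 lanes
+// with __lsx_vmadd_w and, in the tail path, reduces lanes with __lsx_reduce_add_w.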
+ +static void convolution_winograd_dot_pack8to1_int8_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 16u, 8, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; + if (tiles >= 2) + bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, batch, 16u, 8, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(1 * inch, tiles, batch, 16u, 8, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 1 < tiles; i += 2) + { + short* tmpptr = tm2.row(i / 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 8, 0); + __lsx_vst(_r0, tmpptr, 0); + __lsx_vst(_r1, tmpptr + 8, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 16; + } + } + for (; i < tiles; i++) + { + short* tmpptr = tm2.row(i / 2 + i % 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + __m128i _r0 = __lsx_vld(r0, 0); + __lsx_vst(_r0, tmpptr, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 8; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 4u, 1, opt.workspace_allocator); + + int nn_outch = 0; + int remain_outch_start = 0; + + nn_outch = outch >> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + int p = pp * 4; + + int* output0_tm = top_blob_tm.channel(p); + int* output1_tm = top_blob_tm.channel(p + 1); + int* output2_tm = top_blob_tm.channel(p + 2); + int* output3_tm = top_blob_tm.channel(p + 3); + + const Mat kernel0_tm = kernel_tm.channel(p / 4); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 64); + __builtin_prefetch(k0 + 128); + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + __m128i _w2 = __lsx_vld(k0 + 16, 0); + __m128i _w3 = __lsx_vld(k0 + 24, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + __m128i _extw2 = __lsx_vslti_h(_w2, 0); + __m128i _extw3 = __lsx_vslti_h(_w3, 0); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + __m128i _w2l = __lsx_vilvl_h(_extw2, _w2); + __m128i _w2h = __lsx_vilvh_h(_extw2, _w2); + __m128i _w3l = __lsx_vilvl_h(_extw3, _w3); + __m128i _w3h = __lsx_vilvh_h(_extw3, _w3); + + __m128i _val0_0 = __lsx_vreplgr2vr_w(r0[0]); + __m128i _val0_1 = __lsx_vreplgr2vr_w(r0[1]); + __m128i _val0_2 = __lsx_vreplgr2vr_w(r0[2]); + __m128i _val0_3 = __lsx_vreplgr2vr_w(r0[3]); + __m128i _val0_4 = __lsx_vreplgr2vr_w(r0[4]); + __m128i _val0_5 = __lsx_vreplgr2vr_w(r0[5]); + __m128i _val0_6 = __lsx_vreplgr2vr_w(r0[6]); + __m128i 
_val0_7 = __lsx_vreplgr2vr_w(r0[7]); + __m128i _val1_0 = __lsx_vreplgr2vr_w(r0[8]); + __m128i _val1_1 = __lsx_vreplgr2vr_w(r0[9]); + __m128i _val1_2 = __lsx_vreplgr2vr_w(r0[10]); + __m128i _val1_3 = __lsx_vreplgr2vr_w(r0[11]); + __m128i _val1_4 = __lsx_vreplgr2vr_w(r0[12]); + __m128i _val1_5 = __lsx_vreplgr2vr_w(r0[13]); + __m128i _val1_6 = __lsx_vreplgr2vr_w(r0[14]); + __m128i _val1_7 = __lsx_vreplgr2vr_w(r0[15]); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0_0); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val0_1); + _sum2 = __lsx_vmadd_w(_sum2, _w0l, _val1_0); + _sum3 = __lsx_vmadd_w(_sum3, _w0h, _val1_1); + _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val0_2); + _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val0_3); + _sum2 = __lsx_vmadd_w(_sum2, _w1l, _val1_2); + _sum3 = __lsx_vmadd_w(_sum3, _w1h, _val1_3); + _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val0_4); + _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val0_5); + _sum2 = __lsx_vmadd_w(_sum2, _w2l, _val1_4); + _sum3 = __lsx_vmadd_w(_sum3, _w2h, _val1_5); + _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val0_6); + _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val0_7); + _sum2 = __lsx_vmadd_w(_sum2, _w3l, _val1_6); + _sum3 = __lsx_vmadd_w(_sum3, _w3h, _val1_7); + + r0 += 16; + k0 += 32; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + int sum[8]; + __lsx_vst(_sum0, sum, 0); + __lsx_vst(_sum2, sum + 4, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; + output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm[1] = sum[4]; + output1_tm[1] = sum[5]; + output2_tm[1] = sum[6]; + output3_tm[1] = sum[7]; + output0_tm += 2; + output1_tm += 2; + output2_tm += 2; + output3_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 128); + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + __m128i _w2 = __lsx_vld(k0 + 16, 0); + __m128i _w3 = __lsx_vld(k0 + 24, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + __m128i _extw2 = __lsx_vslti_h(_w2, 0); + __m128i _extw3 = __lsx_vslti_h(_w3, 0); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + __m128i _w2l = __lsx_vilvl_h(_extw2, _w2); + __m128i _w2h = __lsx_vilvh_h(_extw2, _w2); + __m128i _w3l = __lsx_vilvl_h(_extw3, _w3); + __m128i _w3h = __lsx_vilvh_h(_extw3, _w3); + + __m128i _val0 = __lsx_vreplgr2vr_w(r0[0]); + __m128i _val1 = __lsx_vreplgr2vr_w(r0[1]); + __m128i _val2 = __lsx_vreplgr2vr_w(r0[2]); + __m128i _val3 = __lsx_vreplgr2vr_w(r0[3]); + __m128i _val4 = __lsx_vreplgr2vr_w(r0[4]); + __m128i _val5 = __lsx_vreplgr2vr_w(r0[5]); + __m128i _val6 = __lsx_vreplgr2vr_w(r0[6]); + __m128i _val7 = __lsx_vreplgr2vr_w(r0[7]); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val1); + _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val2); + _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val3); + _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val4); + _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val5); + _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val6); + _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val7); + + r0 += 8; + k0 += 32; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + + int sum[4]; + __lsx_vst(_sum0, sum, 0); + + output0_tm[0] = sum[0]; + output1_tm[0] = sum[1]; 
+ output2_tm[0] = sum[2]; + output3_tm[0] = sum[3]; + output0_tm += 1; + output1_tm += 1; + output2_tm += 1; + output3_tm += 1; + } + } + } + + remain_outch_start += nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* output0_tm = top_blob_tm.channel(p); + + const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + for (int q = 0; q < inch; q++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 64); + __m128i _val0 = __lsx_vld(r0, 0); + __m128i _val1 = __lsx_vld(r0 + 8, 0); + + __m128i _extval0 = __lsx_vslti_h(_val0, 0); + __m128i _extval1 = __lsx_vslti_h(_val1, 0); + __m128i _val0l = __lsx_vilvl_h(_extval0, _val0); + __m128i _val0h = __lsx_vilvh_h(_extval0, _val0); + __m128i _val1l = __lsx_vilvl_h(_extval1, _val1); + __m128i _val1h = __lsx_vilvh_h(_extval1, _val1); + + __m128i _w0 = __lsx_vld(k0, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0l); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val0h); + _sum2 = __lsx_vmadd_w(_sum2, _w0l, _val1l); + _sum3 = __lsx_vmadd_w(_sum3, _w0h, _val1h); + + k0 += 8; + r0 += 16; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + output0_tm[0] = __lsx_reduce_add_w(_sum0); + output0_tm[1] = __lsx_reduce_add_w(_sum2); + output0_tm += 2; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + for (int q = 0; q < inch; q++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 32); + __m128i _val = __lsx_vld(r0, 0); + + __m128i _extval = __lsx_vslti_h(_val, 0); + __m128i _vall = __lsx_vilvl_h(_extval, _val); + __m128i _valh = __lsx_vilvh_h(_extval, _val); + + __m128i _w0 = __lsx_vld(k0, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _vall); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _valh); + + k0 += 8; + r0 += 8; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + + output0_tm[0] = __lsx_reduce_add_w(_sum0); + output0_tm++; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_dot_pack8to4_int8.h b/src/layer/loongarch/convolution_winograd_dot_pack8to4_int8.h new file mode 100644 index 000000000000..c20400cbf8c3 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_dot_pack8to4_int8.h @@ -0,0 +1,233 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convolution_winograd_dot_pack8to4_int8_lsx(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) +{ + // Mat bottom_blob_tm(tiles, 16/36/64, inch, 16u, 8, opt.workspace_allocator); + + const int tiles = bottom_blob_tm.w; + const int batch = bottom_blob_tm.h; + const int inch = bottom_blob_tm.c; + + // permute + Mat bottom_blob_tm2; + if (tiles >= 2) + bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, batch, 16u, 8, opt.workspace_allocator); + else // if (tiles >= 1) + bottom_blob_tm2.create(1 * inch, tiles, batch, 16u, 8, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r = 0; r < batch; r++) + { + Mat tm2 = bottom_blob_tm2.channel(r); + + // tile + int i = 0; + for (; i + 1 < tiles; i += 2) + { + short* tmpptr = tm2.row(i / 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 8, 0); + __lsx_vst(_r0, tmpptr, 0); + __lsx_vst(_r1, tmpptr + 8, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 16; + } + } + for (; i < tiles; i++) + { + short* tmpptr = tm2.row(i / 2 + i % 2); + + const short* r0 = bottom_blob_tm; + + r0 += (r * tiles + i) * 8; + + for (int q = 0; q < inch; q++) + { + __m128i _r0 = __lsx_vld(r0, 0); + __lsx_vst(_r0, tmpptr, 0); + r0 += bottom_blob_tm.cstep * 8; + tmpptr += 8; + } + } + } + + bottom_blob_tm = Mat(); + // permute end + + top_blob_tm.create(tiles, batch, outch, 16u, 4, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + int* output0_tm = top_blob_tm.channel(p); + + const Mat kernel0_tm = kernel_tm.channel(p); + + for (int r = 0; r < batch; r++) + { + const Mat bb2 = bottom_blob_tm2.channel(r); + + int i = 0; + for (; i + 1 < tiles; i += 2) + { + const short* r0 = bb2.row(i / 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + __m128i _sum2 = __lsx_vreplgr2vr_w(0); + __m128i _sum3 = __lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 64); + __builtin_prefetch(k0 + 128); + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + __m128i _w2 = __lsx_vld(k0 + 16, 0); + __m128i _w3 = __lsx_vld(k0 + 24, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + __m128i _extw2 = __lsx_vslti_h(_w2, 0); + __m128i _extw3 = __lsx_vslti_h(_w3, 0); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + __m128i _w2l = __lsx_vilvl_h(_extw2, _w2); + __m128i _w2h = __lsx_vilvh_h(_extw2, _w2); + __m128i _w3l = __lsx_vilvl_h(_extw3, _w3); + __m128i _w3h = __lsx_vilvh_h(_extw3, _w3); + + __m128i _val0_0 = __lsx_vreplgr2vr_w(r0[0]); + __m128i _val0_1 = __lsx_vreplgr2vr_w(r0[1]); + __m128i _val0_2 = __lsx_vreplgr2vr_w(r0[2]); + __m128i _val0_3 = 
__lsx_vreplgr2vr_w(r0[3]); + __m128i _val0_4 = __lsx_vreplgr2vr_w(r0[4]); + __m128i _val0_5 = __lsx_vreplgr2vr_w(r0[5]); + __m128i _val0_6 = __lsx_vreplgr2vr_w(r0[6]); + __m128i _val0_7 = __lsx_vreplgr2vr_w(r0[7]); + __m128i _val1_0 = __lsx_vreplgr2vr_w(r0[8]); + __m128i _val1_1 = __lsx_vreplgr2vr_w(r0[9]); + __m128i _val1_2 = __lsx_vreplgr2vr_w(r0[10]); + __m128i _val1_3 = __lsx_vreplgr2vr_w(r0[11]); + __m128i _val1_4 = __lsx_vreplgr2vr_w(r0[12]); + __m128i _val1_5 = __lsx_vreplgr2vr_w(r0[13]); + __m128i _val1_6 = __lsx_vreplgr2vr_w(r0[14]); + __m128i _val1_7 = __lsx_vreplgr2vr_w(r0[15]); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0_0); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val0_1); + _sum2 = __lsx_vmadd_w(_sum2, _w0l, _val1_0); + _sum3 = __lsx_vmadd_w(_sum3, _w0h, _val1_1); + _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val0_2); + _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val0_3); + _sum2 = __lsx_vmadd_w(_sum2, _w1l, _val1_2); + _sum3 = __lsx_vmadd_w(_sum3, _w1h, _val1_3); + _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val0_4); + _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val0_5); + _sum2 = __lsx_vmadd_w(_sum2, _w2l, _val1_4); + _sum3 = __lsx_vmadd_w(_sum3, _w2h, _val1_5); + _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val0_6); + _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val0_7); + _sum2 = __lsx_vmadd_w(_sum2, _w3l, _val1_6); + _sum3 = __lsx_vmadd_w(_sum3, _w3h, _val1_7); + + r0 += 16; + k0 += 32; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + _sum2 = __lsx_vadd_w(_sum2, _sum3); + + __lsx_vst(_sum0, output0_tm, 0); + __lsx_vst(_sum2, output0_tm + 4, 0); + + output0_tm += 8; + } + for (; i < tiles; i++) + { + const short* r0 = bb2.row(i / 2 + i % 2); + const short* k0 = kernel0_tm.row(r); + + int nn = inch; // inch always > 0 + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + for (int j = 0; j < nn; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(k0 + 128); + __m128i _w0 = __lsx_vld(k0, 0); + __m128i _w1 = __lsx_vld(k0 + 8, 0); + __m128i _w2 = __lsx_vld(k0 + 16, 0); + __m128i _w3 = __lsx_vld(k0 + 24, 0); + + __m128i _extw0 = __lsx_vslti_h(_w0, 0); + __m128i _extw1 = __lsx_vslti_h(_w1, 0); + __m128i _extw2 = __lsx_vslti_h(_w2, 0); + __m128i _extw3 = __lsx_vslti_h(_w3, 0); + + __m128i _w0l = __lsx_vilvl_h(_extw0, _w0); + __m128i _w0h = __lsx_vilvh_h(_extw0, _w0); + __m128i _w1l = __lsx_vilvl_h(_extw1, _w1); + __m128i _w1h = __lsx_vilvh_h(_extw1, _w1); + __m128i _w2l = __lsx_vilvl_h(_extw2, _w2); + __m128i _w2h = __lsx_vilvh_h(_extw2, _w2); + __m128i _w3l = __lsx_vilvl_h(_extw3, _w3); + __m128i _w3h = __lsx_vilvh_h(_extw3, _w3); + + __m128i _val0 = __lsx_vreplgr2vr_w(r0[0]); + __m128i _val1 = __lsx_vreplgr2vr_w(r0[1]); + __m128i _val2 = __lsx_vreplgr2vr_w(r0[2]); + __m128i _val3 = __lsx_vreplgr2vr_w(r0[3]); + __m128i _val4 = __lsx_vreplgr2vr_w(r0[4]); + __m128i _val5 = __lsx_vreplgr2vr_w(r0[5]); + __m128i _val6 = __lsx_vreplgr2vr_w(r0[6]); + __m128i _val7 = __lsx_vreplgr2vr_w(r0[7]); + + _sum0 = __lsx_vmadd_w(_sum0, _w0l, _val0); + _sum1 = __lsx_vmadd_w(_sum1, _w0h, _val1); + _sum0 = __lsx_vmadd_w(_sum0, _w1l, _val2); + _sum1 = __lsx_vmadd_w(_sum1, _w1h, _val3); + _sum0 = __lsx_vmadd_w(_sum0, _w2l, _val4); + _sum1 = __lsx_vmadd_w(_sum1, _w2h, _val5); + _sum0 = __lsx_vmadd_w(_sum0, _w3l, _val6); + _sum1 = __lsx_vmadd_w(_sum1, _w3h, _val7); + + r0 += 8; + k0 += 32; + } + + _sum0 = __lsx_vadd_w(_sum0, _sum1); + + __lsx_vst(_sum0, output0_tm, 0); + output0_tm += 4; + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform.h 
b/src/layer/loongarch/convolution_winograd_transform.h new file mode 100644 index 000000000000..624600e95a0d --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform.h @@ -0,0 +1,405 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv3x3s1_winograd43_transform_input_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 4; + const int h_tiles = (h - 2) / 4; + const int tiles = w_tiles * h_tiles; + + // const float itm[6][6] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[6][6]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 4) + (j * 4); + + for (int m = 0; m < 6; m++) + { + float r00 = r0[0]; + float r01 = r0[1]; + float r02 = r0[2]; + float r03 = r0[3]; + float r04 = r0[4]; + float r05 = r0[5]; + + float tmp0m = 4 * r00 - 5 * r02 + r04; + float tmp1m = -4 * (r01 + r02) + r04 + r03; + float tmp2m = 4 * (r01 - r02) + r04 - r03; + float tmp3m = -2 * (r01 - r03) + r04 - r02; + float tmp4m = 2 * (r01 - r03) + r04 - r02; + float tmp5m = 4 * r01 - 5 * r03 + r05; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + tmp[2][m] = tmp2m; + tmp[3][m] = tmp3m; + tmp[4][m] = tmp4m; + tmp[5][m] = tmp5m; + + r0 += w; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j); + float* r0_tm_1 = r0_tm_0 + tiles; + float* r0_tm_2 = r0_tm_0 + tiles * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 3; + float* r0_tm_4 = r0_tm_0 + tiles * 4; + float* r0_tm_5 = r0_tm_0 + tiles * 5; + + for (int m = 0; m < 6; m++) + { + float tmp00 = tmp[m][0]; + float tmp01 = tmp[m][1]; + float tmp02 = tmp[m][2]; + float tmp03 = tmp[m][3]; + float tmp04 = tmp[m][4]; + float tmp05 = tmp[m][5]; + + float r0tm0 = 4 * tmp00 - 5 * tmp02 + tmp04; + float r0tm1 = -4 * (tmp01 + tmp02) + tmp04 + tmp03; + float r0tm2 = 4 * (tmp01 - tmp02) + tmp04 - tmp03; + float r0tm3 = -2 * (tmp01 - tmp03) + tmp04 - tmp02; + float r0tm4 = 2 * (tmp01 - tmp03) + tmp04 - tmp02; + float r0tm5 = 4 * tmp01 - 5 * tmp03 + tmp05; + + r0_tm_0[0] = r0tm0; + r0_tm_1[0] = r0tm1; + r0_tm_2[0] = r0tm2; + r0_tm_3[0] = r0tm3; + r0_tm_4[0] = r0tm4; + r0_tm_5[0] = r0tm5; + 
+ r0_tm_0 += tiles * 6; + r0_tm_1 += tiles * 6; + r0_tm_2 += tiles * 6; + r0_tm_3 += tiles * 6; + r0_tm_4 += tiles * 6; + r0_tm_5 += tiles * 6; + } + } + } + } +} + +static void conv3x3s1_winograd43_transform_output_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 4; + const int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + float bias0 = biasptr ? biasptr[p] : 0.f; + + float tmp[4][6]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j); + const float* output0_tm_1 = output0_tm_0 + tiles; + const float* output0_tm_2 = output0_tm_0 + tiles * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 3; + const float* output0_tm_4 = output0_tm_0 + tiles * 4; + const float* output0_tm_5 = output0_tm_0 + tiles * 5; + + float* output0 = out0.row(i * 4) + (j * 4); + + for (int m = 0; m < 6; m++) + { + float out0tm0 = output0_tm_0[0]; + float out0tm1 = output0_tm_1[0]; + float out0tm2 = output0_tm_2[0]; + float out0tm3 = output0_tm_3[0]; + float out0tm4 = output0_tm_4[0]; + float out0tm5 = output0_tm_5[0]; + + float tmp02a = out0tm1 + out0tm2; + float tmp13a = out0tm1 - out0tm2; + + float tmp02b = out0tm3 + out0tm4; + float tmp13b = out0tm3 - out0tm4; + + float tmp0m = out0tm0 + tmp02a + tmp02b; + float tmp1m = tmp13a + tmp13b * 2; + float tmp2m = tmp02a + tmp02b * 4; + float tmp3m = out0tm5 + tmp13a + tmp13b * 8; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + tmp[2][m] = tmp2m; + tmp[3][m] = tmp3m; + + output0_tm_0 += tiles * 6; + output0_tm_1 += tiles * 6; + output0_tm_2 += tiles * 6; + output0_tm_3 += tiles * 6; + output0_tm_4 += tiles * 6; + output0_tm_5 += tiles * 6; + } + + for (int m = 0; m < 4; m++) + { + float tmp00 = tmp[m][0]; + float tmp01 = tmp[m][1]; + float tmp02 = tmp[m][2]; + float tmp03 = tmp[m][3]; + float tmp04 = tmp[m][4]; + float tmp05 = tmp[m][5]; + + float tmp02a = tmp01 + tmp02; + float tmp13a = tmp01 - tmp02; + + float tmp02b = tmp03 + tmp04; + float tmp13b = tmp03 - tmp04; + + float out00 = bias0 + tmp00 + tmp02a + tmp02b; + float out01 = bias0 + tmp13a + tmp13b * 2; + float out02 = bias0 + tmp02a + tmp02b * 4; + float out03 = bias0 + tmp05 + tmp13a + tmp13b * 8; + + output0[0] = out00; + output0[1] = out01; + output0[2] = out02; + output0[3] = out03; + + output0 += outw; + } + } + } + } +} + +static void conv3x3s1_winograd23_transform_input_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 2; + const int h_tiles = (h - 2) / 2; + const int tiles = w_tiles * h_tiles; + + // const float itm[4][4] = { + // {1.0f, 0.0f, -1.0f, 0.0f}, + // {0.0f, 1.0f, 1.00f, 0.0f}, + 
// {0.0f, -1.0f, 1.00f, 0.0f}, + // {0.0f, -1.0f, 0.00f, 1.0f} + // }; + + // 0 = r00 - r02 + // 1 = r01 + r02 + // 2 = r02 - r01 + // 3 = r03 - r01 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[4][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 2) + (j * 2); + + for (int m = 0; m < 4; m++) + { + float r00 = r0[0]; + float r01 = r0[1]; + float r02 = r0[2]; + float r03 = r0[3]; + + float tmp0m = r00 - r02; + float tmp1m = r01 + r02; + float tmp2m = r02 - r01; + float tmp3m = r03 - r01; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + tmp[2][m] = tmp2m; + tmp[3][m] = tmp3m; + + r0 += w; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j); + float* r0_tm_1 = r0_tm_0 + tiles; + float* r0_tm_2 = r0_tm_0 + tiles * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 3; + + for (int m = 0; m < 4; m++) + { + float tmp00 = tmp[m][0]; + float tmp01 = tmp[m][1]; + float tmp02 = tmp[m][2]; + float tmp03 = tmp[m][3]; + + float r0tm0 = tmp00 - tmp02; + float r0tm1 = tmp01 + tmp02; + float r0tm2 = tmp02 - tmp01; + float r0tm3 = tmp03 - tmp01; + + r0_tm_0[0] = r0tm0; + r0_tm_1[0] = r0tm1; + r0_tm_2[0] = r0tm2; + r0_tm_3[0] = r0tm3; + + r0_tm_0 += tiles * 4; + r0_tm_1 += tiles * 4; + r0_tm_2 += tiles * 4; + r0_tm_3 += tiles * 4; + } + } + } + } +} + +static void conv3x3s1_winograd23_transform_output_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 2; + const int h_tiles = outh / 2; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[2][4] = { + // {1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 1.0f} + // }; + + // 0 = r00 + r01 + r02 + // 1 = r01 - r02 + r03 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + float bias0 = biasptr ? 
biasptr[p] : 0.f; + + float tmp[2][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j); + const float* output0_tm_1 = output0_tm_0 + tiles; + const float* output0_tm_2 = output0_tm_0 + tiles * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 3; + + float* output0 = out0.row(i * 2) + (j * 2); + + for (int m = 0; m < 4; m++) + { + float out0tm0 = output0_tm_0[0]; + float out0tm1 = output0_tm_1[0]; + float out0tm2 = output0_tm_2[0]; + float out0tm3 = output0_tm_3[0]; + + float tmp0m = out0tm0 + out0tm1 + out0tm2; + float tmp1m = out0tm1 - out0tm2 + out0tm3; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + + output0_tm_0 += tiles * 4; + output0_tm_1 += tiles * 4; + output0_tm_2 += tiles * 4; + output0_tm_3 += tiles * 4; + } + + for (int m = 0; m < 2; m++) + { + float tmp00 = tmp[m][0]; + float tmp01 = tmp[m][1]; + float tmp02 = tmp[m][2]; + float tmp03 = tmp[m][3]; + + float out00 = bias0 + tmp00 + tmp01 + tmp02; + float out01 = bias0 + tmp01 - tmp02 + tmp03; + + output0[0] = out00; + output0[1] = out01; + + output0 += outw; + } + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform_int8.h b/src/layer/loongarch/convolution_winograd_transform_int8.h new file mode 100644 index 000000000000..09ef669e4733 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform_int8.h @@ -0,0 +1,229 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
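+
+// The transforms below use the same B^T / A^T structure as the float winograd43
+// transforms earlier in this patch, but stay in integer arithmetic. A
+// back-of-the-envelope headroom check: each 1-D pass of the input transform is a
+// linear combination whose coefficient magnitudes sum to at most 4 + 5 + 1 = 10,
+// so after the two separable passes an int8 input of magnitude <= 127 grows to at
+// most 10 * 10 * 127 = 12700, well within the int16 storage used for
+// bottom_blob_tm. The output transform accumulates in int32 and rescales at the
+// end (the division by 576 below).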
+ +static void conv3x3s1_winograd43_transform_input_int8_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 4; + const int h_tiles = (h - 2) / 4; + const int tiles = w_tiles * h_tiles; + + // const float itm[6][6] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + short tmp[6][6]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const signed char* r0 = img0.row(i * 4) + (j * 4); + + for (int m = 0; m < 6; m++) + { + signed char r00 = r0[0]; + signed char r01 = r0[1]; + signed char r02 = r0[2]; + signed char r03 = r0[3]; + signed char r04 = r0[4]; + signed char r05 = r0[5]; + + short tmp0m = 4 * r00 - 5 * r02 + r04; + short tmp1m = -4 * (r01 + r02) + r04 + r03; + short tmp2m = 4 * (r01 - r02) + r04 - r03; + short tmp3m = -2 * (r01 - r03) + r04 - r02; + short tmp4m = 2 * (r01 - r03) + r04 - r02; + short tmp5m = 4 * r01 - 5 * r03 + r05; + + tmp[0][m] = tmp0m; + tmp[1][m] = tmp1m; + tmp[2][m] = tmp2m; + tmp[3][m] = tmp3m; + tmp[4][m] = tmp4m; + tmp[5][m] = tmp5m; + + r0 += w; + } + + short* r0_tm_0 = (short*)img0_tm + (i * w_tiles + j); + short* r0_tm_1 = r0_tm_0 + tiles; + short* r0_tm_2 = r0_tm_0 + tiles * 2; + short* r0_tm_3 = r0_tm_0 + tiles * 3; + short* r0_tm_4 = r0_tm_0 + tiles * 4; + short* r0_tm_5 = r0_tm_0 + tiles * 5; + + for (int m = 0; m < 6; m++) + { + short tmp00 = tmp[m][0]; + short tmp01 = tmp[m][1]; + short tmp02 = tmp[m][2]; + short tmp03 = tmp[m][3]; + short tmp04 = tmp[m][4]; + short tmp05 = tmp[m][5]; + + short r0tm0 = 4 * tmp00 - 5 * tmp02 + tmp04; + short r0tm1 = -4 * (tmp01 + tmp02) + tmp04 + tmp03; + short r0tm2 = 4 * (tmp01 - tmp02) + tmp04 - tmp03; + short r0tm3 = -2 * (tmp01 - tmp03) + tmp04 - tmp02; + short r0tm4 = 2 * (tmp01 - tmp03) + tmp04 - tmp02; + short r0tm5 = 4 * tmp01 - 5 * tmp03 + tmp05; + + r0_tm_0[0] = r0tm0; + r0_tm_1[0] = r0tm1; + r0_tm_2[0] = r0tm2; + r0_tm_3[0] = r0tm3; + r0_tm_4[0] = r0tm4; + r0_tm_5[0] = r0tm5; + + r0_tm_0 += tiles * 6; + r0_tm_1 += tiles * 6; + r0_tm_2 += tiles * 6; + r0_tm_3 += tiles * 6; + r0_tm_4 += tiles * 6; + r0_tm_5 += tiles * 6; + } + } + } + } +} + +static void conv3x3s1_winograd43_transform_output_int8_lsx(const Mat& top_blob_tm, Mat& top_blob, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 4; + const int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - 
r02) + (r03 - r04) * 8 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + int tmp[4][6]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const int* output0_tm_0 = (const int*)out0_tm + (i * w_tiles + j) * 1; + const int* output0_tm_1 = output0_tm_0 + tiles * 1; + const int* output0_tm_2 = output0_tm_0 + tiles * 2; + const int* output0_tm_3 = output0_tm_0 + tiles * 3; + const int* output0_tm_4 = output0_tm_0 + tiles * 4; + const int* output0_tm_5 = output0_tm_0 + tiles * 5; + + int* output0 = out0.row(i * 4) + j * 4; + + for (int m = 0; m < 5; m++) + { + int tmp02a = output0_tm_1[0] + output0_tm_2[0]; + int tmp13a = output0_tm_1[0] - output0_tm_2[0]; + + int tmp02b = output0_tm_3[0] + output0_tm_4[0]; + int tmp13b = output0_tm_3[0] - output0_tm_4[0]; + + tmp[0][m] = output0_tm_0[0] + tmp02a + tmp02b; + tmp[1][m] = tmp13a + tmp13b * 2; + tmp[2][m] = tmp02a + tmp02b * 4; + tmp[3][m] = output0_tm_5[0] * 4 + tmp13a + tmp13b * 8; + + output0_tm_0 += tiles * 6; + output0_tm_1 += tiles * 6; + output0_tm_2 += tiles * 6; + output0_tm_3 += tiles * 6; + output0_tm_4 += tiles * 6; + output0_tm_5 += tiles * 6; + } + for (int m = 5; m < 6; m++) + { + int tmp02a = output0_tm_1[0] + output0_tm_2[0]; + int tmp13a = output0_tm_1[0] - output0_tm_2[0]; + + int tmp02b = output0_tm_3[0] + output0_tm_4[0]; + int tmp13b = output0_tm_3[0] - output0_tm_4[0]; + + tmp[0][m] = (output0_tm_0[0] + tmp02a + tmp02b) * 4; + tmp[1][m] = (tmp13a + tmp13b * 2) * 4; + tmp[2][m] = (tmp02a + tmp02b * 4) * 4; + tmp[3][m] = (output0_tm_5[0] * 4 + tmp13a + tmp13b * 8) * 4; + + output0_tm_0 += tiles * 6; + output0_tm_1 += tiles * 6; + output0_tm_2 += tiles * 6; + output0_tm_3 += tiles * 6; + output0_tm_4 += tiles * 6; + output0_tm_5 += tiles * 6; + } + + for (int m = 0; m < 4; m++) + { + const int* tmp0 = tmp[m]; + + int tmp02a = tmp0[1] + tmp0[2]; + int tmp13a = tmp0[1] - tmp0[2]; + + int tmp02b = tmp0[3] + tmp0[4]; + int tmp13b = tmp0[3] - tmp0[4]; + + output0[0] = (tmp0[0] + tmp02a + tmp02b) / 576; + output0[1] = (tmp13a + tmp13b * 2) / 576; + output0[2] = (tmp02a + tmp02b * 4) / 576; + output0[3] = (tmp0[5] + tmp13a + tmp13b * 8) / 576; + + output0 += outw; + } + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform_pack4.h b/src/layer/loongarch/convolution_winograd_transform_pack4.h new file mode 100644 index 000000000000..3969e59cf09c --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform_pack4.h @@ -0,0 +1,730 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
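+// This header provides the Winograd F(6x6,3x3), F(4x4,3x3) and F(2x2,3x3) input and
+// output tile transforms for fp32 data stored in pack4 layout (4 channels interleaved
+// per element), implemented with 128-bit LSX intrinsics. Each function walks the blob
+// tile by tile: the row-wise 1-D transform is applied first into an on-stack
+// tmp[][][4] buffer, then the same transform is applied column-wise and the results
+// are stored tile-major into the transform-domain blob.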
+ +static void conv3x3s1_winograd63_transform_input_pack4_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 6; + const int h_tiles = (h - 2) / 6; + const int tiles = w_tiles * h_tiles; + + // const float itm[8][8] = { + // {1.0f, 0.0f, -5.25f, 0.00f, 5.25f, 0.00f, -1.0f, 0.0f}, + // + // {0.0f, 1.0f, 1.00f, -4.25f, -4.25f, 1.00f, 1.0f, 0.0f}, + // {0.0f, -1.0f, 1.00f, 4.25f, -4.25f, -1.00f, 1.0f, 0.0f}, + // + // {0.0f, 0.5f, 0.25f, -2.50f, -1.25f, 2.00f, 1.0f, 0.0f}, + // {0.0f, -0.5f, 0.25f, 2.50f, -1.25f, -2.00f, 1.0f, 0.0f}, + // + // {0.0f, 2.0f, 4.00f, -2.50f, -5.00f, 0.50f, 1.0f, 0.0f}, + // {0.0f, -2.0f, 4.00f, 2.50f, -5.00f, -0.50f, 1.0f, 0.0f}, + // + // {0.0f, -1.0f, 0.00f, 5.25f, 0.00f, -5.25f, 0.0f, 1.0f} + // }; + + // 0 = r00 - r06 + (r04 - r02) * 5.25 + // 7 = r07 - r01 + (r03 - r05) * 5.25 + + // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05) + // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05) + + // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2) + // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2) + + // reuse r04 * 1.25 + // reuse r03 * 2.5 + // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5) + // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5) + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[8][8][4]; + + __m128 _v5_25 = __lsx_vreplfr2vr_s(5.25f); + __m128 _vm4_25 = __lsx_vreplfr2vr_s(-4.25f); + __m128 _vm1_25 = __lsx_vreplfr2vr_s(-1.25f); + __m128 _v0_25 = __lsx_vreplfr2vr_s(0.25f); + __m128 _vm2_5 = __lsx_vreplfr2vr_s(-2.5f); + __m128 _v0_5 = __lsx_vreplfr2vr_s(0.5f); + __m128 _v2 = __lsx_vreplfr2vr_s(2.f); + __m128 _v4 = __lsx_vreplfr2vr_s(4.f); + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 6) + (j * 6) * 4; + + for (int m = 0; m < 8; m++) + { + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + __m128 _r05 = (__m128)__lsx_vld(r0 + 4 * 5, 0); + __m128 _r06 = (__m128)__lsx_vld(r0 + 4 * 6, 0); + __m128 _r07 = (__m128)__lsx_vld(r0 + 4 * 7, 0); + + __m128 _tmp0m = __lsx_vfmadd_s(__lsx_vfsub_s(_r04, _r02), _v5_25, __lsx_vfsub_s(_r00, _r06)); + __m128 _tmp7m = __lsx_vfmadd_s(__lsx_vfsub_s(_r03, _r05), _v5_25, __lsx_vfsub_s(_r07, _r01)); + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp7m, tmp[7][m], 0); + + __m128 _tmp12a = __lsx_vfmadd_s(_r04, _vm4_25, __lsx_vfadd_s(_r02, _r06)); + __m128 _tmp12b = __lsx_vfmadd_s(_r03, _vm4_25, __lsx_vfadd_s(_r01, _r05)); + + __m128 _tmp1m = __lsx_vfadd_s(_tmp12a, _tmp12b); + __m128 _tmp2m = __lsx_vfsub_s(_tmp12a, _tmp12b); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + + __m128 _tmp34a = __lsx_vfmadd_s(_r04, _vm1_25, __lsx_vfmadd_s(_r02, _v0_25, _r06)); + __m128 _tmp34b = __lsx_vfmadd_s(_r05, _v2, __lsx_vfmadd_s(_r03, _vm2_5, __lsx_vfmul_s(_r01, _v0_5))); + + __m128 _tmp3m = __lsx_vfadd_s(_tmp34a, _tmp34b); + __m128 _tmp4m = __lsx_vfsub_s(_tmp34a, _tmp34b); + __lsx_vst(_tmp3m, tmp[3][m], 0); + __lsx_vst(_tmp4m, tmp[4][m], 0); + + __m128 _tmp56a = 
__lsx_vfmadd_s(__lsx_vfmadd_s(_r04, _vm1_25, _r02), _v4, _r06); + __m128 _tmp56b = __lsx_vfmadd_s(_r05, _v0_5, __lsx_vfmadd_s(_r03, _vm2_5, __lsx_vfmul_s(_r01, _v2))); + + __m128 _tmp5m = __lsx_vfadd_s(_tmp56a, _tmp56b); + __m128 _tmp6m = __lsx_vfsub_s(_tmp56a, _tmp56b); + __lsx_vst(_tmp5m, tmp[5][m], 0); + __lsx_vst(_tmp6m, tmp[6][m], 0); + + r0 += w * 4; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j) * 4; + float* r0_tm_1 = r0_tm_0 + tiles * 4; + float* r0_tm_2 = r0_tm_0 + tiles * 4 * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 4 * 3; + float* r0_tm_4 = r0_tm_0 + tiles * 4 * 4; + float* r0_tm_5 = r0_tm_0 + tiles * 4 * 5; + float* r0_tm_6 = r0_tm_0 + tiles * 4 * 6; + float* r0_tm_7 = r0_tm_0 + tiles * 4 * 7; + + for (int m = 0; m < 8; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0); + __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0); + __m128 _tmp06 = (__m128)__lsx_vld(tmp[m][6], 0); + __m128 _tmp07 = (__m128)__lsx_vld(tmp[m][7], 0); + + __m128 _r0tm0 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp04, _tmp02), _v5_25, __lsx_vfsub_s(_tmp00, _tmp06)); + __m128 _r0tm7 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp03, _tmp05), _v5_25, __lsx_vfsub_s(_tmp07, _tmp01)); + + __m128 _tmp12a = __lsx_vfmadd_s(_tmp04, _vm4_25, __lsx_vfadd_s(_tmp02, _tmp06)); + __m128 _tmp12b = __lsx_vfmadd_s(_tmp03, _vm4_25, __lsx_vfadd_s(_tmp01, _tmp05)); + + __m128 _r0tm1 = __lsx_vfadd_s(_tmp12a, _tmp12b); + __m128 _r0tm2 = __lsx_vfsub_s(_tmp12a, _tmp12b); + + __m128 _tmp34a = __lsx_vfmadd_s(_tmp04, _vm1_25, __lsx_vfmadd_s(_tmp02, _v0_25, _tmp06)); + __m128 _tmp34b = __lsx_vfmadd_s(_tmp05, _v2, __lsx_vfmadd_s(_tmp03, _vm2_5, __lsx_vfmul_s(_tmp01, _v0_5))); + + __m128 _r0tm3 = __lsx_vfadd_s(_tmp34a, _tmp34b); + __m128 _r0tm4 = __lsx_vfsub_s(_tmp34a, _tmp34b); + + __m128 _tmp56a = __lsx_vfmadd_s(__lsx_vfmadd_s(_tmp04, _vm1_25, _tmp02), _v4, _tmp06); + __m128 _tmp56b = __lsx_vfmadd_s(_tmp05, _v0_5, __lsx_vfmadd_s(_tmp03, _vm2_5, __lsx_vfmul_s(_tmp01, _v2))); + + __m128 _r0tm5 = __lsx_vfadd_s(_tmp56a, _tmp56b); + __m128 _r0tm6 = __lsx_vfsub_s(_tmp56a, _tmp56b); + + __lsx_vst(_r0tm0, r0_tm_0, 0); + __lsx_vst(_r0tm1, r0_tm_1, 0); + __lsx_vst(_r0tm2, r0_tm_2, 0); + __lsx_vst(_r0tm3, r0_tm_3, 0); + __lsx_vst(_r0tm4, r0_tm_4, 0); + __lsx_vst(_r0tm5, r0_tm_5, 0); + __lsx_vst(_r0tm6, r0_tm_6, 0); + __lsx_vst(_r0tm7, r0_tm_7, 0); + + r0_tm_0 += tiles * 4 * 8; + r0_tm_1 += tiles * 4 * 8; + r0_tm_2 += tiles * 4 * 8; + r0_tm_3 += tiles * 4 * 8; + r0_tm_4 += tiles * 4 * 8; + r0_tm_5 += tiles * 4 * 8; + r0_tm_6 += tiles * 4 * 8; + r0_tm_7 += tiles * 4 * 8; + } + } + } + } +} + +static void conv3x3s1_winograd63_transform_output_pack4_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 6; + const int h_tiles = outh / 6; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[6][8] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 16.0f,-16.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 8.0f, 8.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 4.0f, -4.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 16.0f, 16.0f, 2.0f, 2.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 32.0f, -32.0f, 1.0f, -1.0f, 1.0f} + // }; + + // 0 = r0 + 
(r1 + r2) + (r3 + r4) + (r5 + r6) * 32 + // 1 = (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16 + // 2 = (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8 + // 3 = (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4 + // 4 = (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2 + // 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6) + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = biasptr ? (__m128)__lsx_vld(biasptr + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + float tmp[6][8][4]; + + __m128 _v32 = __lsx_vreplfr2vr_s(32.f); + __m128 _v16 = __lsx_vreplfr2vr_s(16.f); + __m128 _v8 = __lsx_vreplfr2vr_s(8.f); + __m128 _v4 = __lsx_vreplfr2vr_s(4.f); + __m128 _v2 = __lsx_vreplfr2vr_s(2.f); + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j) * 4; + const float* output0_tm_1 = output0_tm_0 + tiles * 4; + const float* output0_tm_2 = output0_tm_0 + tiles * 4 * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 4 * 3; + const float* output0_tm_4 = output0_tm_0 + tiles * 4 * 4; + const float* output0_tm_5 = output0_tm_0 + tiles * 4 * 5; + const float* output0_tm_6 = output0_tm_0 + tiles * 4 * 6; + const float* output0_tm_7 = output0_tm_0 + tiles * 4 * 7; + + float* output0 = out0.row(i * 6) + (j * 6) * 4; + + for (int m = 0; m < 8; m++) + { + __m128 _out0tm0 = (__m128)__lsx_vld(output0_tm_0, 0); + __m128 _out0tm1 = (__m128)__lsx_vld(output0_tm_1, 0); + __m128 _out0tm2 = (__m128)__lsx_vld(output0_tm_2, 0); + __m128 _out0tm3 = (__m128)__lsx_vld(output0_tm_3, 0); + __m128 _out0tm4 = (__m128)__lsx_vld(output0_tm_4, 0); + __m128 _out0tm5 = (__m128)__lsx_vld(output0_tm_5, 0); + __m128 _out0tm6 = (__m128)__lsx_vld(output0_tm_6, 0); + __m128 _out0tm7 = (__m128)__lsx_vld(output0_tm_7, 0); + + __m128 _tmp024a = __lsx_vfadd_s(_out0tm1, _out0tm2); + __m128 _tmp135a = __lsx_vfsub_s(_out0tm1, _out0tm2); + + __m128 _tmp024b = __lsx_vfadd_s(_out0tm3, _out0tm4); + __m128 _tmp135b = __lsx_vfsub_s(_out0tm3, _out0tm4); + + __m128 _tmp024c = __lsx_vfadd_s(_out0tm5, _out0tm6); + __m128 _tmp135c = __lsx_vfsub_s(_out0tm5, _out0tm6); + + __m128 _tmp0m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm0, _tmp024a), __lsx_vfmadd_s(_tmp024c, _v32, _tmp024b)); + __m128 _tmp2m = __lsx_vfmadd_s(_tmp024c, _v8, __lsx_vfmadd_s(_tmp024b, _v4, _tmp024a)); + __m128 _tmp4m = __lsx_vfmadd_s(_tmp024c, _v2, __lsx_vfmadd_s(_tmp024b, _v16, _tmp024a)); + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp4m, tmp[4][m], 0); + + __m128 _tmp1m = __lsx_vfmadd_s(_tmp135c, _v16, __lsx_vfmadd_s(_tmp135b, _v2, _tmp135a)); + __m128 _tmp3m = __lsx_vfmadd_s(_tmp135c, _v4, __lsx_vfmadd_s(_tmp135b, _v8, _tmp135a)); + __m128 _tmp5m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm7, _tmp135a), __lsx_vfmadd_s(_tmp135b, _v32, _tmp135c)); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + __lsx_vst(_tmp5m, tmp[5][m], 0); + + output0_tm_0 += tiles * 4 * 8; + output0_tm_1 += tiles * 4 * 8; + output0_tm_2 += tiles * 4 * 8; + output0_tm_3 += tiles * 4 * 8; + output0_tm_4 += tiles * 4 * 8; + output0_tm_5 += tiles * 4 * 8; + output0_tm_6 += tiles * 4 * 8; + output0_tm_7 += tiles * 4 * 8; + } + + for (int m = 0; m < 6; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = 
(__m128)__lsx_vld(tmp[m][3], 0); + __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0); + __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0); + __m128 _tmp06 = (__m128)__lsx_vld(tmp[m][6], 0); + __m128 _tmp07 = (__m128)__lsx_vld(tmp[m][7], 0); + + __m128 _tmp024a = __lsx_vfadd_s(_tmp01, _tmp02); + __m128 _tmp135a = __lsx_vfsub_s(_tmp01, _tmp02); + + __m128 _tmp024b = __lsx_vfadd_s(_tmp03, _tmp04); + __m128 _tmp135b = __lsx_vfsub_s(_tmp03, _tmp04); + + __m128 _tmp024c = __lsx_vfadd_s(_tmp05, _tmp06); + __m128 _tmp135c = __lsx_vfsub_s(_tmp05, _tmp06); + + __m128 _out00 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp00, _tmp024a), __lsx_vfmadd_s(_tmp024c, _v32, _tmp024b))); + __m128 _out02 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp024c, _v8, __lsx_vfmadd_s(_tmp024b, _v4, _tmp024a))); + __m128 _out04 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp024c, _v2, __lsx_vfmadd_s(_tmp024b, _v16, _tmp024a))); + __lsx_vst(_out00, output0, 0); + __lsx_vst(_out02, output0 + 4 * 2, 0); + __lsx_vst(_out04, output0 + 4 * 4, 0); + + __m128 _out01 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp135c, _v16, __lsx_vfmadd_s(_tmp135b, _v2, _tmp135a))); + __m128 _out03 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp135c, _v4, __lsx_vfmadd_s(_tmp135b, _v8, _tmp135a))); + __m128 _out05 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp07, _tmp135a), __lsx_vfmadd_s(_tmp135b, _v32, _tmp135c))); + __lsx_vst(_out01, output0 + 4, 0); + __lsx_vst(_out03, output0 + 4 * 3, 0); + __lsx_vst(_out05, output0 + 4 * 5, 0); + + output0 += outw * 4; + } + } + } + } +} + +static void conv3x3s1_winograd43_transform_input_pack4_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 4; + const int h_tiles = (h - 2) / 4; + const int tiles = w_tiles * h_tiles; + + // const float itm[6][6] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[6][6][4]; + + __m128 _vm5 = __lsx_vreplfr2vr_s(-5.f); + __m128 _vm4 = __lsx_vreplfr2vr_s(-4.f); + __m128 _v4 = __lsx_vreplfr2vr_s(4.f); + __m128 _vm2 = __lsx_vreplfr2vr_s(-2.f); + __m128 _v2 = __lsx_vreplfr2vr_s(2.f); + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 4) + (j * 4) * 4; + + for (int m = 0; m < 6; m++) + { + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + __m128 _r05 = (__m128)__lsx_vld(r0 + 4 * 5, 0); + + __m128 _tmp0m = __lsx_vfmadd_s(_r02, _vm5, __lsx_vfmadd_s(_r00, _v4, _r04)); + __m128 _tmp1m = __lsx_vfmadd_s(__lsx_vfadd_s(_r01, _r02), _vm4, __lsx_vfadd_s(_r04, _r03)); + __m128 _tmp2m = __lsx_vfmadd_s(__lsx_vfsub_s(_r01, _r02), _v4, __lsx_vfsub_s(_r04, _r03)); + 
__m128 _tmp3m = __lsx_vfmadd_s(__lsx_vfsub_s(_r01, _r03), _vm2, __lsx_vfsub_s(_r04, _r02)); + __m128 _tmp4m = __lsx_vfmadd_s(__lsx_vfsub_s(_r01, _r03), _v2, __lsx_vfsub_s(_r04, _r02)); + __m128 _tmp5m = __lsx_vfmadd_s(_r03, _vm5, __lsx_vfmadd_s(_r01, _v4, _r05)); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + __lsx_vst(_tmp4m, tmp[4][m], 0); + __lsx_vst(_tmp5m, tmp[5][m], 0); + + r0 += w * 4; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j) * 4; + float* r0_tm_1 = r0_tm_0 + tiles * 4; + float* r0_tm_2 = r0_tm_0 + tiles * 4 * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 4 * 3; + float* r0_tm_4 = r0_tm_0 + tiles * 4 * 4; + float* r0_tm_5 = r0_tm_0 + tiles * 4 * 5; + + for (int m = 0; m < 6; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0); + __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0); + + __m128 _r0tm0 = __lsx_vfmadd_s(_tmp02, _vm5, __lsx_vfmadd_s(_tmp00, _v4, _tmp04)); + __m128 _r0tm1 = __lsx_vfmadd_s(__lsx_vfadd_s(_tmp01, _tmp02), _vm4, __lsx_vfadd_s(_tmp04, _tmp03)); + __m128 _r0tm2 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp01, _tmp02), _v4, __lsx_vfsub_s(_tmp04, _tmp03)); + __m128 _r0tm3 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp01, _tmp03), _vm2, __lsx_vfsub_s(_tmp04, _tmp02)); + __m128 _r0tm4 = __lsx_vfmadd_s(__lsx_vfsub_s(_tmp01, _tmp03), _v2, __lsx_vfsub_s(_tmp04, _tmp02)); + __m128 _r0tm5 = __lsx_vfmadd_s(_tmp03, _vm5, __lsx_vfmadd_s(_tmp01, _v4, _tmp05)); + + __lsx_vst(_r0tm0, r0_tm_0, 0); + __lsx_vst(_r0tm1, r0_tm_1, 0); + __lsx_vst(_r0tm2, r0_tm_2, 0); + __lsx_vst(_r0tm3, r0_tm_3, 0); + __lsx_vst(_r0tm4, r0_tm_4, 0); + __lsx_vst(_r0tm5, r0_tm_5, 0); + + r0_tm_0 += tiles * 4 * 6; + r0_tm_1 += tiles * 4 * 6; + r0_tm_2 += tiles * 4 * 6; + r0_tm_3 += tiles * 4 * 6; + r0_tm_4 += tiles * 4 * 6; + r0_tm_5 += tiles * 4 * 6; + } + } + } + } +} + +static void conv3x3s1_winograd43_transform_output_pack4_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 4; + const int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = biasptr ? 
(__m128)__lsx_vld(biasptr + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + float tmp[4][6][4]; + + __m128 _v2 = __lsx_vreplfr2vr_s(2.f); + __m128 _v4 = __lsx_vreplfr2vr_s(4.f); + __m128 _v8 = __lsx_vreplfr2vr_s(8.f); + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j) * 4; + const float* output0_tm_1 = output0_tm_0 + tiles * 4; + const float* output0_tm_2 = output0_tm_0 + tiles * 4 * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 4 * 3; + const float* output0_tm_4 = output0_tm_0 + tiles * 4 * 4; + const float* output0_tm_5 = output0_tm_0 + tiles * 4 * 5; + + float* output0 = out0.row(i * 4) + (j * 4) * 4; + + for (int m = 0; m < 6; m++) + { + __m128 _out0tm0 = (__m128)__lsx_vld(output0_tm_0, 0); + __m128 _out0tm1 = (__m128)__lsx_vld(output0_tm_1, 0); + __m128 _out0tm2 = (__m128)__lsx_vld(output0_tm_2, 0); + __m128 _out0tm3 = (__m128)__lsx_vld(output0_tm_3, 0); + __m128 _out0tm4 = (__m128)__lsx_vld(output0_tm_4, 0); + __m128 _out0tm5 = (__m128)__lsx_vld(output0_tm_5, 0); + + __m128 _tmp02a = __lsx_vfadd_s(_out0tm1, _out0tm2); + __m128 _tmp13a = __lsx_vfsub_s(_out0tm1, _out0tm2); + + __m128 _tmp02b = __lsx_vfadd_s(_out0tm3, _out0tm4); + __m128 _tmp13b = __lsx_vfsub_s(_out0tm3, _out0tm4); + + __m128 _tmp0m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm0, _tmp02a), _tmp02b); + __m128 _tmp1m = __lsx_vfmadd_s(_tmp13b, _v2, _tmp13a); + __m128 _tmp2m = __lsx_vfmadd_s(_tmp02b, _v4, _tmp02a); + __m128 _tmp3m = __lsx_vfmadd_s(_tmp13b, _v8, __lsx_vfadd_s(_out0tm5, _tmp13a)); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + + output0_tm_0 += tiles * 4 * 6; + output0_tm_1 += tiles * 4 * 6; + output0_tm_2 += tiles * 4 * 6; + output0_tm_3 += tiles * 4 * 6; + output0_tm_4 += tiles * 4 * 6; + output0_tm_5 += tiles * 4 * 6; + } + + for (int m = 0; m < 4; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + __m128 _tmp04 = (__m128)__lsx_vld(tmp[m][4], 0); + __m128 _tmp05 = (__m128)__lsx_vld(tmp[m][5], 0); + + __m128 _tmp02a = __lsx_vfadd_s(_tmp01, _tmp02); + __m128 _tmp13a = __lsx_vfsub_s(_tmp01, _tmp02); + + __m128 _tmp02b = __lsx_vfadd_s(_tmp03, _tmp04); + __m128 _tmp13b = __lsx_vfsub_s(_tmp03, _tmp04); + + __m128 _out00 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp00, _tmp02a), _tmp02b)); + __m128 _out01 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp13b, _v2, _tmp13a)); + __m128 _out02 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp02b, _v4, _tmp02a)); + __m128 _out03 = __lsx_vfadd_s(_bias0, __lsx_vfmadd_s(_tmp13b, _v8, __lsx_vfadd_s(_tmp05, _tmp13a))); + + __lsx_vst(_out00, output0, 0); + __lsx_vst(_out01, output0 + 4, 0); + __lsx_vst(_out02, output0 + 4 * 2, 0); + __lsx_vst(_out03, output0 + 4 * 3, 0); + + output0 += outw * 4; + } + } + } + } +} + +static void conv3x3s1_winograd23_transform_input_pack4_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 2; + const int h_tiles = (h - 2) / 2; + const int tiles = w_tiles * h_tiles; + + // const float itm[4][4] = { + // {1.0f, 0.0f, -1.0f, 0.0f}, + // {0.0f, 1.0f, 1.00f, 0.0f}, + // {0.0f, -1.0f, 1.00f, 0.0f}, + // {0.0f, -1.0f, 0.00f, 1.0f} + // 
}; + + // 0 = r00 - r02 + // 1 = r01 + r02 + // 2 = r02 - r01 + // 3 = r03 - r01 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + float tmp[4][4][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* r0 = img0.row(i * 2) + (j * 2) * 4; + + for (int m = 0; m < 4; m++) + { + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + + __m128 _tmp0m = __lsx_vfsub_s(_r00, _r02); + __m128 _tmp1m = __lsx_vfadd_s(_r01, _r02); + __m128 _tmp2m = __lsx_vfsub_s(_r02, _r01); + __m128 _tmp3m = __lsx_vfsub_s(_r03, _r01); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + + r0 += w * 4; + } + + float* r0_tm_0 = (float*)img0_tm + (i * w_tiles + j) * 4; + float* r0_tm_1 = r0_tm_0 + tiles * 4; + float* r0_tm_2 = r0_tm_0 + tiles * 4 * 2; + float* r0_tm_3 = r0_tm_0 + tiles * 4 * 3; + + for (int m = 0; m < 4; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + + __m128 _r0tm0 = __lsx_vfsub_s(_tmp00, _tmp02); + __m128 _r0tm1 = __lsx_vfadd_s(_tmp01, _tmp02); + __m128 _r0tm2 = __lsx_vfsub_s(_tmp02, _tmp01); + __m128 _r0tm3 = __lsx_vfsub_s(_tmp03, _tmp01); + + __lsx_vst(_r0tm0, r0_tm_0, 0); + __lsx_vst(_r0tm1, r0_tm_1, 0); + __lsx_vst(_r0tm2, r0_tm_2, 0); + __lsx_vst(_r0tm3, r0_tm_3, 0); + + r0_tm_0 += tiles * 4 * 4; + r0_tm_1 += tiles * 4 * 4; + r0_tm_2 += tiles * 4 * 4; + r0_tm_3 += tiles * 4 * 4; + } + } + } + } +} + +static void conv3x3s1_winograd23_transform_output_pack4_lsx(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 2; + const int h_tiles = outh / 2; + const int tiles = w_tiles * h_tiles; + + const float* biasptr = bias; + + // const float otm[2][4] = { + // {1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 1.0f} + // }; + + // 0 = r00 + r01 + r02 + // 1 = r01 - r02 + r03 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + __m128 _bias0 = biasptr ? 
(__m128)__lsx_vld(biasptr + p * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + float tmp[2][4][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const float* output0_tm_0 = (const float*)out0_tm + (i * w_tiles + j) * 4; + const float* output0_tm_1 = output0_tm_0 + tiles * 4; + const float* output0_tm_2 = output0_tm_0 + tiles * 4 * 2; + const float* output0_tm_3 = output0_tm_0 + tiles * 4 * 3; + + float* output0 = out0.row(i * 2) + (j * 2) * 4; + + for (int m = 0; m < 4; m++) + { + __m128 _out0tm0 = (__m128)__lsx_vld(output0_tm_0, 0); + __m128 _out0tm1 = (__m128)__lsx_vld(output0_tm_1, 0); + __m128 _out0tm2 = (__m128)__lsx_vld(output0_tm_2, 0); + __m128 _out0tm3 = (__m128)__lsx_vld(output0_tm_3, 0); + + __m128 _tmp0m = __lsx_vfadd_s(__lsx_vfadd_s(_out0tm0, _out0tm1), _out0tm2); + __m128 _tmp1m = __lsx_vfadd_s(__lsx_vfsub_s(_out0tm1, _out0tm2), _out0tm3); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + + output0_tm_0 += tiles * 4 * 4; + output0_tm_1 += tiles * 4 * 4; + output0_tm_2 += tiles * 4 * 4; + output0_tm_3 += tiles * 4 * 4; + } + + for (int m = 0; m < 2; m++) + { + __m128 _tmp00 = (__m128)__lsx_vld(tmp[m][0], 0); + __m128 _tmp01 = (__m128)__lsx_vld(tmp[m][1], 0); + __m128 _tmp02 = (__m128)__lsx_vld(tmp[m][2], 0); + __m128 _tmp03 = (__m128)__lsx_vld(tmp[m][3], 0); + + __m128 _out00 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfadd_s(_tmp00, _tmp01), _tmp02)); + __m128 _out01 = __lsx_vfadd_s(_bias0, __lsx_vfadd_s(__lsx_vfsub_s(_tmp01, _tmp02), _tmp03)); + + __lsx_vst(_out00, output0, 0); + __lsx_vst(_out01, output0 + 4, 0); + + output0 += outw * 4; + } + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform_pack4_int8.h b/src/layer/loongarch/convolution_winograd_transform_pack4_int8.h new file mode 100644 index 000000000000..8b31ce97a869 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform_pack4_int8.h @@ -0,0 +1,166 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
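+// Winograd F(4x4,3x3) output transform for the int8 path, operating on int32
+// accumulators in pack4 layout. As in the scalar int8 code above, the last transform
+// column (m == 5) is additionally scaled by 4 via a left shift, and the final results
+// are divided by 576, which appears to undo the fixed-point scaling accumulated by the
+// integer input and weight transforms (the scalar reference path uses the same divisor).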
+ +static void conv3x3s1_winograd43_transform_output_pack4_int8_lsx(const Mat& top_blob_tm, Mat& top_blob, const Option& opt) +{ + const int outw = top_blob.w; + const int outh = top_blob.h; + const int outch = top_blob.c; + + const int w_tiles = outw / 4; + const int h_tiles = outh / 4; + const int tiles = w_tiles * h_tiles; + + // const float otm[4][6] = { + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} + // }; + + // 0 = r00 + (r01 + r02) + (r03 + r04) + // 1 = (r01 - r02) + (r03 - r04) * 2 + // 2 = (r01 + r02) + (r03 + r04) * 4 + // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + const Mat out0_tm = top_blob_tm.channel(p); + Mat out0 = top_blob.channel(p); + + int tmp[4][6][4]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const int* output0_tm_0 = (const int*)out0_tm + (i * w_tiles + j) * 4; + const int* output0_tm_1 = output0_tm_0 + tiles * 4; + const int* output0_tm_2 = output0_tm_0 + tiles * 8; + const int* output0_tm_3 = output0_tm_0 + tiles * 12; + const int* output0_tm_4 = output0_tm_0 + tiles * 16; + const int* output0_tm_5 = output0_tm_0 + tiles * 20; + + int* output0 = out0.row(i * 4) + (j * 4) * 4; + + for (int m = 0; m < 5; m++) + { + __m128i _out0tm0 = __lsx_vld(output0_tm_0, 0); + __m128i _out0tm1 = __lsx_vld(output0_tm_1, 0); + __m128i _out0tm2 = __lsx_vld(output0_tm_2, 0); + __m128i _out0tm3 = __lsx_vld(output0_tm_3, 0); + __m128i _out0tm4 = __lsx_vld(output0_tm_4, 0); + __m128i _out0tm5 = __lsx_vld(output0_tm_5, 0); + + __m128i _tmp02a = __lsx_vadd_w(_out0tm1, _out0tm2); + __m128i _tmp13a = __lsx_vsub_w(_out0tm1, _out0tm2); + + __m128i _tmp02b = __lsx_vadd_w(_out0tm3, _out0tm4); + __m128i _tmp13b = __lsx_vsub_w(_out0tm3, _out0tm4); + + __m128i _tmp0m = __lsx_vadd_w(__lsx_vadd_w(_out0tm0, _tmp02a), _tmp02b); + __m128i _tmp1m = __lsx_vadd_w(_tmp13a, __lsx_vslli_w(_tmp13b, 1)); + __m128i _tmp2m = __lsx_vadd_w(_tmp02a, __lsx_vslli_w(_tmp02b, 2)); + __m128i _tmp3m = __lsx_vadd_w(__lsx_vadd_w(_tmp13a, __lsx_vslli_w(_out0tm5, 2)), __lsx_vslli_w(_tmp13b, 3)); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + + output0_tm_0 += tiles * 24; + output0_tm_1 += tiles * 24; + output0_tm_2 += tiles * 24; + output0_tm_3 += tiles * 24; + output0_tm_4 += tiles * 24; + output0_tm_5 += tiles * 24; + } + for (int m = 5; m < 6; m++) + { + __m128i _out0tm0 = __lsx_vld(output0_tm_0, 0); + __m128i _out0tm1 = __lsx_vld(output0_tm_1, 0); + __m128i _out0tm2 = __lsx_vld(output0_tm_2, 0); + __m128i _out0tm3 = __lsx_vld(output0_tm_3, 0); + __m128i _out0tm4 = __lsx_vld(output0_tm_4, 0); + __m128i _out0tm5 = __lsx_vld(output0_tm_5, 0); + + __m128i _tmp02a = __lsx_vadd_w(_out0tm1, _out0tm2); + __m128i _tmp13a = __lsx_vsub_w(_out0tm1, _out0tm2); + + __m128i _tmp02b = __lsx_vadd_w(_out0tm3, _out0tm4); + __m128i _tmp13b = __lsx_vsub_w(_out0tm3, _out0tm4); + + __m128i _tmp0m = __lsx_vadd_w(__lsx_vadd_w(_out0tm0, _tmp02a), _tmp02b); + __m128i _tmp1m = __lsx_vadd_w(_tmp13a, __lsx_vslli_w(_tmp13b, 1)); + __m128i _tmp2m = __lsx_vadd_w(_tmp02a, __lsx_vslli_w(_tmp02b, 2)); + __m128i _tmp3m = __lsx_vadd_w(__lsx_vadd_w(_tmp13a, __lsx_vslli_w(_out0tm5, 2)), __lsx_vslli_w(_tmp13b, 3)); + + _tmp0m = __lsx_vslli_w(_tmp0m, 2); + _tmp1m = __lsx_vslli_w(_tmp1m, 2); + 
_tmp2m = __lsx_vslli_w(_tmp2m, 2); + _tmp3m = __lsx_vslli_w(_tmp3m, 2); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + + output0_tm_0 += tiles * 24; + output0_tm_1 += tiles * 24; + output0_tm_2 += tiles * 24; + output0_tm_3 += tiles * 24; + output0_tm_4 += tiles * 24; + output0_tm_5 += tiles * 24; + } + + for (int m = 0; m < 4; m++) + { + __m128i _tmp00 = __lsx_vld(tmp[m][0], 0); + __m128i _tmp01 = __lsx_vld(tmp[m][1], 0); + __m128i _tmp02 = __lsx_vld(tmp[m][2], 0); + __m128i _tmp03 = __lsx_vld(tmp[m][3], 0); + __m128i _tmp04 = __lsx_vld(tmp[m][4], 0); + __m128i _tmp05 = __lsx_vld(tmp[m][5], 0); + + __m128i _tmp02a = __lsx_vadd_w(_tmp01, _tmp02); + __m128i _tmp13a = __lsx_vsub_w(_tmp01, _tmp02); + + __m128i _tmp02b = __lsx_vadd_w(_tmp03, _tmp04); + __m128i _tmp13b = __lsx_vsub_w(_tmp03, _tmp04); + + __m128i _out00 = __lsx_vadd_w(__lsx_vadd_w(_tmp00, _tmp02a), _tmp02b); + __m128i _out01 = __lsx_vadd_w(_tmp13a, __lsx_vslli_w(_tmp13b, 1)); + __m128i _out02 = __lsx_vadd_w(_tmp02a, __lsx_vslli_w(_tmp02b, 2)); + __m128i _out03 = __lsx_vadd_w(__lsx_vadd_w(_tmp05, _tmp13a), __lsx_vslli_w(_tmp13b, 3)); + + // TODO use integer trick for division by 576 + __m128 _v576 = __lsx_vreplfr2vr_s(1.0 / 576); + _out00 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out00), _v576)); + _out01 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out01), _v576)); + _out02 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out02), _v576)); + _out03 = __lsx_vftint_w_s(__lsx_vfmul_s(__lsx_vffint_s_w(_out03), _v576)); + + __lsx_vst(_out00, output0, 0); + __lsx_vst(_out01, output0 + 4, 0); + __lsx_vst(_out02, output0 + 8, 0); + __lsx_vst(_out03, output0 + 12, 0); + + output0 += outw * 4; + } + } + } + } +} diff --git a/src/layer/loongarch/convolution_winograd_transform_pack8_int8.h b/src/layer/loongarch/convolution_winograd_transform_pack8_int8.h new file mode 100644 index 000000000000..5e49a87669a6 --- /dev/null +++ b/src/layer/loongarch/convolution_winograd_transform_pack8_int8.h @@ -0,0 +1,132 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
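+// Winograd F(4x4,3x3) input transform for int8 data in pack8 layout. Each load pulls
+// 16 int8 values (two pack8 elements); __lsx_vslti_b computes the per-byte sign mask
+// and __lsx_vilvl_b / __lsx_vilvh_b interleave it with the data to sign-extend the
+// bytes to int16. The transform coefficients 4 and 2 are applied as left shifts of the
+// halfwords, so only the factor 5 needs an explicit multiply.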
+ +static void conv3x3s1_winograd43_transform_input_pack8_int8_lsx(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) +{ + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int inch = bottom_blob.c; + + const int w_tiles = (w - 2) / 4; + const int h_tiles = (h - 2) / 4; + const int tiles = w_tiles * h_tiles; + + // const float itm[6][6] = { + // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, + // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, + // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, + // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, + // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} + // }; + + // 0 = 4 * r00 - 5 * r02 + r04 + // 1 = -4 * (r01 + r02) + r04 + r03 + // 2 = 4 * (r01 - r02) + r04 - r03 + // 3 = -2 * (r01 - r03) + r04 - r02 + // 4 = 2 * (r01 - r03) + r04 - r02 + // 5 = 4 * r01 - 5 * r03 + r05 + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < inch; q++) + { + const Mat img0 = bottom_blob.channel(q); + Mat img0_tm = bottom_blob_tm.channel(q); + + short tmp[6][6][8]; + + // tile + for (int i = 0; i < h_tiles; i++) + { + for (int j = 0; j < w_tiles; j++) + { + const signed char* r0 = img0.row(i * 4) + (j * 4) * 8; + + for (int m = 0; m < 6; m++) + { + __m128i _r00_01 = __lsx_vld(r0, 0); + __m128i _r02_03 = __lsx_vld(r0 + 16, 0); + __m128i _r04_05 = __lsx_vld(r0 + 32, 0); + __m128i _extr0001 = __lsx_vslti_b(_r00_01, 0); + __m128i _extr0203 = __lsx_vslti_b(_r02_03, 0); + __m128i _extr0405 = __lsx_vslti_b(_r04_05, 0); + __m128i _r00 = __lsx_vilvl_b(_extr0001, _r00_01); + __m128i _r01 = __lsx_vilvh_b(_extr0001, _r00_01); + __m128i _r02 = __lsx_vilvl_b(_extr0203, _r02_03); + __m128i _r03 = __lsx_vilvh_b(_extr0203, _r02_03); + __m128i _r04 = __lsx_vilvl_b(_extr0405, _r04_05); + __m128i _r05 = __lsx_vilvh_b(_extr0405, _r04_05); + + __m128i _v5 = __lsx_vreplgr2vr_h(5); + + __m128i _tmp0m = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_r00, 2), _r04), __lsx_vmul_h(_r02, _v5)); + __m128i _tmp1m = __lsx_vsub_h(__lsx_vadd_h(_r04, _r03), __lsx_vslli_h(__lsx_vadd_h(_r01, _r02), 2)); + __m128i _tmp2m = __lsx_vadd_h(__lsx_vsub_h(_r04, _r03), __lsx_vslli_h(__lsx_vsub_h(_r01, _r02), 2)); + __m128i _tmp3m = __lsx_vsub_h(__lsx_vsub_h(_r04, _r02), __lsx_vslli_h(__lsx_vsub_h(_r01, _r03), 1)); + __m128i _tmp4m = __lsx_vadd_h(__lsx_vsub_h(_r04, _r02), __lsx_vslli_h(__lsx_vsub_h(_r01, _r03), 1)); + __m128i _tmp5m = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_r01, 2), _r05), __lsx_vmul_h(_r03, _v5)); + + __lsx_vst(_tmp0m, tmp[0][m], 0); + __lsx_vst(_tmp1m, tmp[1][m], 0); + __lsx_vst(_tmp2m, tmp[2][m], 0); + __lsx_vst(_tmp3m, tmp[3][m], 0); + __lsx_vst(_tmp4m, tmp[4][m], 0); + __lsx_vst(_tmp5m, tmp[5][m], 0); + + r0 += w * 8; + } + + short* r0_tm_0 = (short*)img0_tm + (i * w_tiles + j) * 8; + short* r0_tm_1 = r0_tm_0 + tiles * 8; + short* r0_tm_2 = r0_tm_0 + tiles * 16; + short* r0_tm_3 = r0_tm_0 + tiles * 24; + short* r0_tm_4 = r0_tm_0 + tiles * 32; + short* r0_tm_5 = r0_tm_0 + tiles * 40; + + for (int m = 0; m < 6; m++) + { + __m128i _tmp00 = __lsx_vld(tmp[m][0], 0); + __m128i _tmp01 = __lsx_vld(tmp[m][1], 0); + __m128i _tmp02 = __lsx_vld(tmp[m][2], 0); + __m128i _tmp03 = __lsx_vld(tmp[m][3], 0); + __m128i _tmp04 = __lsx_vld(tmp[m][4], 0); + __m128i _tmp05 = __lsx_vld(tmp[m][5], 0); + + __m128i _v5 = __lsx_vreplgr2vr_h(5); + + __m128i _r0tm0 = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_tmp00, 2), _tmp04), __lsx_vmul_h(_tmp02, _v5)); + __m128i _r0tm1 = __lsx_vsub_h(__lsx_vadd_h(_tmp04, _tmp03), __lsx_vslli_h(__lsx_vadd_h(_tmp01, _tmp02), 2)); 
+ __m128i _r0tm2 = __lsx_vadd_h(__lsx_vsub_h(_tmp04, _tmp03), __lsx_vslli_h(__lsx_vsub_h(_tmp01, _tmp02), 2)); + __m128i _r0tm3 = __lsx_vsub_h(__lsx_vsub_h(_tmp04, _tmp02), __lsx_vslli_h(__lsx_vsub_h(_tmp01, _tmp03), 1)); + __m128i _r0tm4 = __lsx_vadd_h(__lsx_vsub_h(_tmp04, _tmp02), __lsx_vslli_h(__lsx_vsub_h(_tmp01, _tmp03), 1)); + __m128i _r0tm5 = __lsx_vsub_h(__lsx_vadd_h(__lsx_vslli_h(_tmp01, 2), _tmp05), __lsx_vmul_h(_tmp03, _v5)); + + __lsx_vst(_r0tm0, r0_tm_0, 0); + __lsx_vst(_r0tm1, r0_tm_1, 0); + __lsx_vst(_r0tm2, r0_tm_2, 0); + __lsx_vst(_r0tm3, r0_tm_3, 0); + __lsx_vst(_r0tm4, r0_tm_4, 0); + __lsx_vst(_r0tm5, r0_tm_5, 0); + + r0_tm_0 += tiles * 48; + r0_tm_1 += tiles * 48; + r0_tm_2 += tiles * 48; + r0_tm_3 += tiles * 48; + r0_tm_4 += tiles * 48; + r0_tm_5 += tiles * 48; + } + } + } + } +} diff --git a/src/layer/loongarch/convolutiondepthwise_3x3.h b/src/layer/loongarch/convolutiondepthwise_3x3.h new file mode 100644 index 000000000000..1c37f7789f3b --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_3x3.h @@ -0,0 +1,193 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convdw3x3s1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const float* kernel = _kernel; + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + const float bias0 = bias ? 
bias[g] : 0.f; + + const float* kernel0 = kernel + g * 9; + + float* outptr0 = out; + float* outptr1 = outptr0 + outw; + + const float* img0 = bottom_blob.channel(g); + + const float* r0 = img0; + const float* r1 = img0 + w; + const float* r2 = img0 + w * 2; + const float* r3 = img0 + w * 3; + + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + int i = 0; + + for (; i + 1 < outh; i += 2) + { + for (int j = 0; j < outw; j++) + { + float sum = bias0; + float sum2 = bias0; + + sum += r0[0] * k0[0]; + sum += r0[1] * k0[1]; + sum += r0[2] * k0[2]; + sum2 += r1[0] * k0[0]; + sum2 += r1[1] * k0[1]; + sum2 += r1[2] * k0[2]; + sum += r1[0] * k1[0]; + sum += r1[1] * k1[1]; + sum += r1[2] * k1[2]; + sum2 += r2[0] * k1[0]; + sum2 += r2[1] * k1[1]; + sum2 += r2[2] * k1[2]; + sum += r2[0] * k2[0]; + sum += r2[1] * k2[1]; + sum += r2[2] * k2[2]; + sum2 += r3[0] * k2[0]; + sum2 += r3[1] * k2[1]; + sum2 += r3[2] * k2[2]; + + *outptr0 = sum; + *outptr1 = sum2; + + r0++; + r1++; + r2++; + r3++; + outptr0++; + outptr1++; + } + + r0 += 2 + w; + r1 += 2 + w; + r2 += 2 + w; + r3 += 2 + w; + + outptr0 += outw; + outptr1 += outw; + } + + for (; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = bias0; + sum += r0[0] * k0[0]; + sum += r0[1] * k0[1]; + sum += r0[2] * k0[2]; + sum += r1[0] * k1[0]; + sum += r1[1] * k1[1]; + sum += r1[2] * k1[2]; + sum += r2[0] * k2[0]; + sum += r2[1] * k2[1]; + sum += r2[2] * k2[2]; + + *outptr0 = sum; + + r0++; + r1++; + r2++; + outptr0++; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + } +} + +static void convdw3x3s2_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const int tailstep = w - 2 * outw + w; + + const float* kernel = _kernel; + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + const float bias0 = bias ? bias[g] : 0.f; + + const float* kernel0 = kernel + g * 9; + + float* outptr = out; + + const float* img0 = bottom_blob.channel(g); + + const float* r0 = img0; + const float* r1 = img0 + w; + const float* r2 = img0 + w * 2; + + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + int i = 0; + + for (; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = bias0; + sum += r0[0] * k0[0]; + sum += r0[1] * k0[1]; + sum += r0[2] * k0[2]; + sum += r1[0] * k1[0]; + sum += r1[1] * k1[1]; + sum += r1[2] * k1[2]; + sum += r2[0] * k2[0]; + sum += r2[1] * k2[1]; + sum += r2[2] * k2[2]; + + *outptr = sum; + + r0 += 2; + r1 += 2; + r2 += 2; + outptr++; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } +} diff --git a/src/layer/loongarch/convolutiondepthwise_3x3_pack4.h b/src/layer/loongarch/convolutiondepthwise_3x3_pack4.h new file mode 100644 index 000000000000..48ae66412fc1 --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_3x3_pack4.h @@ -0,0 +1,464 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convdw3x3s1_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + const float* k0 = kernel.row(g); + + float* outptr0 = out.row(0); + float* outptr1 = out.row(1); + + const Mat img0 = bottom_blob.channel(g); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + const float* r3 = img0.row(3); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0); + __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0); + + int i = 0; + for (; i + 1 < outh; i += 2) + { + int j = 0; + for (; j + 1 < outw; j += 2) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(r1 + 32); + __builtin_prefetch(r2 + 32); + __builtin_prefetch(r3 + 32); + + __m128 _sum00 = _bias0; + __m128 _sum01 = _bias0; + __m128 _sum10 = _bias0; + __m128 _sum11 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r00, _k00, _sum00); + _sum00 = __lsx_vfmadd_s(_r01, _k01, _sum00); + _sum00 = __lsx_vfmadd_s(_r02, _k02, _sum00); + _sum01 = __lsx_vfmadd_s(_r01, _k00, _sum01); + _sum01 = __lsx_vfmadd_s(_r02, _k01, _sum01); + _sum01 = __lsx_vfmadd_s(_r03, _k02, _sum01); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r10, _k10, _sum00); + _sum00 = __lsx_vfmadd_s(_r11, _k11, _sum00); + _sum00 = __lsx_vfmadd_s(_r12, _k12, _sum00); + _sum01 = __lsx_vfmadd_s(_r11, _k10, _sum01); + _sum01 = __lsx_vfmadd_s(_r12, _k11, _sum01); + _sum01 = __lsx_vfmadd_s(_r13, _k12, _sum01); + _sum10 = __lsx_vfmadd_s(_r10, _k00, _sum10); + _sum10 = __lsx_vfmadd_s(_r11, _k01, _sum10); + _sum10 = __lsx_vfmadd_s(_r12, _k02, _sum10); + _sum11 = __lsx_vfmadd_s(_r11, _k00, _sum11); + _sum11 = __lsx_vfmadd_s(_r12, _k01, _sum11); + _sum11 = __lsx_vfmadd_s(_r13, _k02, _sum11); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r20, _k20, _sum00); + _sum00 = __lsx_vfmadd_s(_r21, _k21, _sum00); + _sum00 = __lsx_vfmadd_s(_r22, _k22, _sum00); + _sum01 = 
__lsx_vfmadd_s(_r21, _k20, _sum01); + _sum01 = __lsx_vfmadd_s(_r22, _k21, _sum01); + _sum01 = __lsx_vfmadd_s(_r23, _k22, _sum01); + _sum10 = __lsx_vfmadd_s(_r20, _k10, _sum10); + _sum10 = __lsx_vfmadd_s(_r21, _k11, _sum10); + _sum10 = __lsx_vfmadd_s(_r22, _k12, _sum10); + _sum11 = __lsx_vfmadd_s(_r21, _k10, _sum11); + _sum11 = __lsx_vfmadd_s(_r22, _k11, _sum11); + _sum11 = __lsx_vfmadd_s(_r23, _k12, _sum11); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0); + + _sum10 = __lsx_vfmadd_s(_r30, _k20, _sum10); + _sum10 = __lsx_vfmadd_s(_r31, _k21, _sum10); + _sum10 = __lsx_vfmadd_s(_r32, _k22, _sum10); + _sum11 = __lsx_vfmadd_s(_r31, _k20, _sum11); + _sum11 = __lsx_vfmadd_s(_r32, _k21, _sum11); + _sum11 = __lsx_vfmadd_s(_r33, _k22, _sum11); + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum01, outptr0 + 4, 0); + __lsx_vst(_sum10, outptr1, 0); + __lsx_vst(_sum11, outptr1 + 4, 0); + + outptr0 += 4 * 2; + outptr1 += 4 * 2; + + r0 += 4 * 2; + r1 += 4 * 2; + r2 += 4 * 2; + r3 += 4 * 2; + } + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(r1 + 16); + __builtin_prefetch(r2 + 16); + __builtin_prefetch(r3 + 16); + + __m128 _sum0 = _bias0; + __m128 _sum1 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + _sum1 = __lsx_vfmadd_s(_r10, _k00, _sum1); + _sum1 = __lsx_vfmadd_s(_r11, _k01, _sum1); + _sum1 = __lsx_vfmadd_s(_r12, _k02, _sum1); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + _sum1 = __lsx_vfmadd_s(_r20, _k10, _sum1); + _sum1 = __lsx_vfmadd_s(_r21, _k11, _sum1); + _sum1 = __lsx_vfmadd_s(_r22, _k12, _sum1); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + + _sum1 = __lsx_vfmadd_s(_r30, _k20, _sum1); + _sum1 = __lsx_vfmadd_s(_r31, _k21, _sum1); + _sum1 = __lsx_vfmadd_s(_r32, _k22, _sum1); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + + outptr0 += 4; + outptr1 += 4; + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + } + + r0 += 2 * 4 + w * 4; + r1 += 2 * 4 + w * 4; + r2 += 2 * 4 + w * 4; + r3 += 2 * 4 + w * 4; + + outptr0 += outw * 4; + outptr1 += outw * 4; + } + for (; i < outh; i++) + { + int j = 0; + for (; j + 1 < outw; j += 2) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(r1 + 32); + __builtin_prefetch(r2 + 32); + + __m128 _sum00 = _bias0; + __m128 _sum01 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r00, _k00, _sum00); + _sum00 = __lsx_vfmadd_s(_r01, _k01, _sum00); + _sum00 = __lsx_vfmadd_s(_r02, 
_k02, _sum00); + _sum01 = __lsx_vfmadd_s(_r01, _k00, _sum01); + _sum01 = __lsx_vfmadd_s(_r02, _k01, _sum01); + _sum01 = __lsx_vfmadd_s(_r03, _k02, _sum01); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r10, _k10, _sum00); + _sum00 = __lsx_vfmadd_s(_r11, _k11, _sum00); + _sum00 = __lsx_vfmadd_s(_r12, _k12, _sum00); + _sum01 = __lsx_vfmadd_s(_r11, _k10, _sum01); + _sum01 = __lsx_vfmadd_s(_r12, _k11, _sum01); + _sum01 = __lsx_vfmadd_s(_r13, _k12, _sum01); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + + _sum00 = __lsx_vfmadd_s(_r20, _k20, _sum00); + _sum00 = __lsx_vfmadd_s(_r21, _k21, _sum00); + _sum00 = __lsx_vfmadd_s(_r22, _k22, _sum00); + _sum01 = __lsx_vfmadd_s(_r21, _k20, _sum01); + _sum01 = __lsx_vfmadd_s(_r22, _k21, _sum01); + _sum01 = __lsx_vfmadd_s(_r23, _k22, _sum01); + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum01, outptr0 + 4, 0); + + outptr0 += 4 * 2; + + r0 += 4 * 2; + r1 += 4 * 2; + r2 += 4 * 2; + } + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(r1 + 16); + __builtin_prefetch(r2 + 16); + + __m128 _sum0 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 4; + r1 += 4; + r2 += 4; + } + + r0 += 2 * 4; + r1 += 2 * 4; + r2 += 2 * 4; + } + } +} + +static void convdw3x3s2_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const int tailstep = (w - 2 * outw + w) * 4; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + __m128 _bias0 = bias ? 
(__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + const float* k0 = kernel.row(g); + + float* outptr0 = out; + + const Mat img0 = bottom_blob.channel(g); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k10 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 5, 0); + __m128 _k20 = (__m128)__lsx_vld(k0 + 4 * 6, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4 * 7, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 8, 0); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j + 1 < outw; j += 2) + { + __builtin_prefetch(r0 + 64); + __builtin_prefetch(r1 + 64); + __builtin_prefetch(r2 + 64); + + __m128 _sum00 = _bias0; + __m128 _sum01 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + + _sum00 = __lsx_vfmadd_s(_r00, _k00, _sum00); + _sum00 = __lsx_vfmadd_s(_r01, _k01, _sum00); + _sum00 = __lsx_vfmadd_s(_r02, _k02, _sum00); + _sum01 = __lsx_vfmadd_s(_r02, _k00, _sum01); + _sum01 = __lsx_vfmadd_s(_r03, _k01, _sum01); + _sum01 = __lsx_vfmadd_s(_r04, _k02, _sum01); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0); + + _sum00 = __lsx_vfmadd_s(_r10, _k10, _sum00); + _sum00 = __lsx_vfmadd_s(_r11, _k11, _sum00); + _sum00 = __lsx_vfmadd_s(_r12, _k12, _sum00); + _sum01 = __lsx_vfmadd_s(_r12, _k10, _sum01); + _sum01 = __lsx_vfmadd_s(_r13, _k11, _sum01); + _sum01 = __lsx_vfmadd_s(_r14, _k12, _sum01); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0); + + _sum00 = __lsx_vfmadd_s(_r20, _k20, _sum00); + _sum00 = __lsx_vfmadd_s(_r21, _k21, _sum00); + _sum00 = __lsx_vfmadd_s(_r22, _k22, _sum00); + _sum01 = __lsx_vfmadd_s(_r22, _k20, _sum01); + _sum01 = __lsx_vfmadd_s(_r23, _k21, _sum01); + _sum01 = __lsx_vfmadd_s(_r24, _k22, _sum01); + + __lsx_vst(_sum00, outptr0, 0); + __lsx_vst(_sum01, outptr0 + 4, 0); + + outptr0 += 4 * 2; + + r0 += 4 * 4; + r1 += 4 * 4; + r2 += 4 * 4; + } + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(r1 + 32); + __builtin_prefetch(r2 + 32); + + __m128 _sum0 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + + _sum0 = __lsx_vfmadd_s(_r20, _k20, 
_sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 4 * 2; + r1 += 4 * 2; + r2 += 4 * 2; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } +} diff --git a/src/layer/loongarch/convolutiondepthwise_5x5_pack4.h b/src/layer/loongarch/convolutiondepthwise_5x5_pack4.h new file mode 100644 index 000000000000..4f94c5e69958 --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_5x5_pack4.h @@ -0,0 +1,511 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convdw5x5s1_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + const float* k0 = kernel.row(g); + + float* outptr0 = out.row(0); + float* outptr1 = out.row(1); + + const Mat img0 = bottom_blob.channel(g); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + const float* r3 = img0.row(3); + const float* r4 = img0.row(4); + const float* r5 = img0.row(5); + + int i = 0; + for (; i + 1 < outh; i += 2) + { + int j = 0; + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(r1 + 16); + __builtin_prefetch(r2 + 16); + __builtin_prefetch(r3 + 16); + __builtin_prefetch(r4 + 16); + __builtin_prefetch(r5 + 16); + + __builtin_prefetch(k0 + 400); + + __m128 _sum0 = _bias0; + __m128 _sum1 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + _sum0 = __lsx_vfmadd_s(_r03, _k03, _sum0); + _sum0 = __lsx_vfmadd_s(_r04, _k04, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r10, _k00, _sum1); + _sum1 = __lsx_vfmadd_s(_r11, _k01, _sum1); + _sum1 = __lsx_vfmadd_s(_r12, _k02, _sum1); + _sum1 = 
__lsx_vfmadd_s(_r13, _k03, _sum1); + _sum1 = __lsx_vfmadd_s(_r14, _k04, _sum1); + + __m128 _k10 = (__m128)__lsx_vld(k0, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k14 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + _sum0 = __lsx_vfmadd_s(_r13, _k13, _sum0); + _sum0 = __lsx_vfmadd_s(_r14, _k14, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r20, _k10, _sum1); + _sum1 = __lsx_vfmadd_s(_r21, _k11, _sum1); + _sum1 = __lsx_vfmadd_s(_r22, _k12, _sum1); + _sum1 = __lsx_vfmadd_s(_r23, _k13, _sum1); + _sum1 = __lsx_vfmadd_s(_r24, _k14, _sum1); + + __m128 _k20 = (__m128)__lsx_vld(k0, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + _sum0 = __lsx_vfmadd_s(_r23, _k23, _sum0); + _sum0 = __lsx_vfmadd_s(_r24, _k24, _sum0); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0); + __m128 _r34 = (__m128)__lsx_vld(r3 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r30, _k20, _sum1); + _sum1 = __lsx_vfmadd_s(_r31, _k21, _sum1); + _sum1 = __lsx_vfmadd_s(_r32, _k22, _sum1); + _sum1 = __lsx_vfmadd_s(_r33, _k23, _sum1); + _sum1 = __lsx_vfmadd_s(_r34, _k24, _sum1); + + __m128 _k30 = (__m128)__lsx_vld(k0, 0); + __m128 _k31 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k33 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r30, _k30, _sum0); + _sum0 = __lsx_vfmadd_s(_r31, _k31, _sum0); + _sum0 = __lsx_vfmadd_s(_r32, _k32, _sum0); + _sum0 = __lsx_vfmadd_s(_r33, _k33, _sum0); + _sum0 = __lsx_vfmadd_s(_r34, _k34, _sum0); + + __m128 _r40 = (__m128)__lsx_vld(r4, 0); + __m128 _r41 = (__m128)__lsx_vld(r4 + 4, 0); + __m128 _r42 = (__m128)__lsx_vld(r4 + 4 * 2, 0); + __m128 _r43 = (__m128)__lsx_vld(r4 + 4 * 3, 0); + __m128 _r44 = (__m128)__lsx_vld(r4 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r40, _k30, _sum1); + _sum1 = __lsx_vfmadd_s(_r41, _k31, _sum1); + _sum1 = __lsx_vfmadd_s(_r42, _k32, _sum1); + _sum1 = __lsx_vfmadd_s(_r43, _k33, _sum1); + _sum1 = __lsx_vfmadd_s(_r44, _k34, _sum1); + + __m128 _k40 = (__m128)__lsx_vld(k0, 0); + __m128 _k41 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 -= 4 * 20; + + _sum0 = __lsx_vfmadd_s(_r40, _k40, _sum0); + _sum0 = __lsx_vfmadd_s(_r41, _k41, _sum0); + _sum0 = __lsx_vfmadd_s(_r42, _k42, _sum0); + _sum0 = __lsx_vfmadd_s(_r43, _k43, _sum0); + _sum0 = __lsx_vfmadd_s(_r44, _k44, _sum0); + + __m128 _r50 = (__m128)__lsx_vld(r5, 0); + __m128 _r51 = (__m128)__lsx_vld(r5 + 4, 0); + __m128 _r52 = (__m128)__lsx_vld(r5 + 4 * 2, 0); + 
__m128 _r53 = (__m128)__lsx_vld(r5 + 4 * 3, 0); + __m128 _r54 = (__m128)__lsx_vld(r5 + 4 * 4, 0); + + _sum1 = __lsx_vfmadd_s(_r50, _k40, _sum1); + _sum1 = __lsx_vfmadd_s(_r51, _k41, _sum1); + _sum1 = __lsx_vfmadd_s(_r52, _k42, _sum1); + _sum1 = __lsx_vfmadd_s(_r53, _k43, _sum1); + _sum1 = __lsx_vfmadd_s(_r54, _k44, _sum1); + + __lsx_vst(_sum0, outptr0, 0); + __lsx_vst(_sum1, outptr1, 0); + + outptr0 += 4; + outptr1 += 4; + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + r4 += 4; + r5 += 4; + } + + r0 += 4 * 4 + w * 4; + r1 += 4 * 4 + w * 4; + r2 += 4 * 4 + w * 4; + r3 += 4 * 4 + w * 4; + r4 += 4 * 4 + w * 4; + r5 += 4 * 4 + w * 4; + + outptr0 += outw * 4; + outptr1 += outw * 4; + } + for (; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 16); + __builtin_prefetch(r1 + 16); + __builtin_prefetch(r2 + 16); + __builtin_prefetch(r3 + 16); + __builtin_prefetch(r4 + 16); + + __builtin_prefetch(k0 + 400); + + __m128 _sum0 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + _sum0 = __lsx_vfmadd_s(_r03, _k03, _sum0); + _sum0 = __lsx_vfmadd_s(_r04, _k04, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0); + + __m128 _k10 = (__m128)__lsx_vld(k0, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k14 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + _sum0 = __lsx_vfmadd_s(_r13, _k13, _sum0); + _sum0 = __lsx_vfmadd_s(_r14, _k14, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0); + + __m128 _k20 = (__m128)__lsx_vld(k0, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + _sum0 = __lsx_vfmadd_s(_r23, _k23, _sum0); + _sum0 = __lsx_vfmadd_s(_r24, _k24, _sum0); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0); + __m128 _r34 = (__m128)__lsx_vld(r3 + 4 * 4, 0); + + __m128 _k30 = (__m128)__lsx_vld(k0, 0); + __m128 _k31 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k33 = 
(__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r30, _k30, _sum0); + _sum0 = __lsx_vfmadd_s(_r31, _k31, _sum0); + _sum0 = __lsx_vfmadd_s(_r32, _k32, _sum0); + _sum0 = __lsx_vfmadd_s(_r33, _k33, _sum0); + _sum0 = __lsx_vfmadd_s(_r34, _k34, _sum0); + + __m128 _r40 = (__m128)__lsx_vld(r4, 0); + __m128 _r41 = (__m128)__lsx_vld(r4 + 4, 0); + __m128 _r42 = (__m128)__lsx_vld(r4 + 4 * 2, 0); + __m128 _r43 = (__m128)__lsx_vld(r4 + 4 * 3, 0); + __m128 _r44 = (__m128)__lsx_vld(r4 + 4 * 4, 0); + + __m128 _k40 = (__m128)__lsx_vld(k0, 0); + __m128 _k41 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 -= 4 * 20; + + _sum0 = __lsx_vfmadd_s(_r40, _k40, _sum0); + _sum0 = __lsx_vfmadd_s(_r41, _k41, _sum0); + _sum0 = __lsx_vfmadd_s(_r42, _k42, _sum0); + _sum0 = __lsx_vfmadd_s(_r43, _k43, _sum0); + _sum0 = __lsx_vfmadd_s(_r44, _k44, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + r4 += 4; + } + + r0 += 4 * 4; + r1 += 4 * 4; + r2 += 4 * 4; + r3 += 4 * 4; + r4 += 4 * 4; + } + } +} + +static void convdw5x5s2_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + const int group = bottom_blob.c; + + const int tailstep = (w - 2 * outw + w) * 4; + + const float* bias = _bias; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + __m128 _bias0 = bias ? (__m128)__lsx_vld(bias + g * 4, 0) : (__m128)__lsx_vreplgr2vr_w(0); + + const float* k0 = kernel.row(g); + + float* outptr0 = out; + + const Mat img0 = bottom_blob.channel(g); + + const float* r0 = img0.row(0); + const float* r1 = img0.row(1); + const float* r2 = img0.row(2); + const float* r3 = img0.row(3); + const float* r4 = img0.row(4); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + __builtin_prefetch(r0 + 32); + __builtin_prefetch(r1 + 32); + __builtin_prefetch(r2 + 32); + __builtin_prefetch(r3 + 32); + __builtin_prefetch(r4 + 32); + + __builtin_prefetch(k0 + 400); + + __m128 _sum0 = _bias0; + + __m128 _r00 = (__m128)__lsx_vld(r0, 0); + __m128 _r01 = (__m128)__lsx_vld(r0 + 4, 0); + __m128 _r02 = (__m128)__lsx_vld(r0 + 4 * 2, 0); + __m128 _r03 = (__m128)__lsx_vld(r0 + 4 * 3, 0); + __m128 _r04 = (__m128)__lsx_vld(r0 + 4 * 4, 0); + + __m128 _k00 = (__m128)__lsx_vld(k0, 0); + __m128 _k01 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k02 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k03 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k04 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r00, _k00, _sum0); + _sum0 = __lsx_vfmadd_s(_r01, _k01, _sum0); + _sum0 = __lsx_vfmadd_s(_r02, _k02, _sum0); + _sum0 = __lsx_vfmadd_s(_r03, _k03, _sum0); + _sum0 = __lsx_vfmadd_s(_r04, _k04, _sum0); + + __m128 _r10 = (__m128)__lsx_vld(r1, 0); + __m128 _r11 = (__m128)__lsx_vld(r1 + 4, 0); + __m128 _r12 = (__m128)__lsx_vld(r1 + 4 * 2, 0); + __m128 _r13 = (__m128)__lsx_vld(r1 + 4 * 3, 0); + __m128 _r14 = (__m128)__lsx_vld(r1 + 4 * 4, 0); + + __m128 _k10 = (__m128)__lsx_vld(k0, 0); + __m128 _k11 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k12 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k13 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k14 = 
(__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r10, _k10, _sum0); + _sum0 = __lsx_vfmadd_s(_r11, _k11, _sum0); + _sum0 = __lsx_vfmadd_s(_r12, _k12, _sum0); + _sum0 = __lsx_vfmadd_s(_r13, _k13, _sum0); + _sum0 = __lsx_vfmadd_s(_r14, _k14, _sum0); + + __m128 _r20 = (__m128)__lsx_vld(r2, 0); + __m128 _r21 = (__m128)__lsx_vld(r2 + 4, 0); + __m128 _r22 = (__m128)__lsx_vld(r2 + 4 * 2, 0); + __m128 _r23 = (__m128)__lsx_vld(r2 + 4 * 3, 0); + __m128 _r24 = (__m128)__lsx_vld(r2 + 4 * 4, 0); + + __m128 _k20 = (__m128)__lsx_vld(k0, 0); + __m128 _k21 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k22 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k23 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k24 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r20, _k20, _sum0); + _sum0 = __lsx_vfmadd_s(_r21, _k21, _sum0); + _sum0 = __lsx_vfmadd_s(_r22, _k22, _sum0); + _sum0 = __lsx_vfmadd_s(_r23, _k23, _sum0); + _sum0 = __lsx_vfmadd_s(_r24, _k24, _sum0); + + __m128 _r30 = (__m128)__lsx_vld(r3, 0); + __m128 _r31 = (__m128)__lsx_vld(r3 + 4, 0); + __m128 _r32 = (__m128)__lsx_vld(r3 + 4 * 2, 0); + __m128 _r33 = (__m128)__lsx_vld(r3 + 4 * 3, 0); + __m128 _r34 = (__m128)__lsx_vld(r3 + 4 * 4, 0); + + __m128 _k30 = (__m128)__lsx_vld(k0, 0); + __m128 _k31 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k32 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k33 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k34 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 += 4 * 5; + + _sum0 = __lsx_vfmadd_s(_r30, _k30, _sum0); + _sum0 = __lsx_vfmadd_s(_r31, _k31, _sum0); + _sum0 = __lsx_vfmadd_s(_r32, _k32, _sum0); + _sum0 = __lsx_vfmadd_s(_r33, _k33, _sum0); + _sum0 = __lsx_vfmadd_s(_r34, _k34, _sum0); + + __m128 _r40 = (__m128)__lsx_vld(r4, 0); + __m128 _r41 = (__m128)__lsx_vld(r4 + 4, 0); + __m128 _r42 = (__m128)__lsx_vld(r4 + 4 * 2, 0); + __m128 _r43 = (__m128)__lsx_vld(r4 + 4 * 3, 0); + __m128 _r44 = (__m128)__lsx_vld(r4 + 4 * 4, 0); + + __m128 _k40 = (__m128)__lsx_vld(k0, 0); + __m128 _k41 = (__m128)__lsx_vld(k0 + 4, 0); + __m128 _k42 = (__m128)__lsx_vld(k0 + 4 * 2, 0); + __m128 _k43 = (__m128)__lsx_vld(k0 + 4 * 3, 0); + __m128 _k44 = (__m128)__lsx_vld(k0 + 4 * 4, 0); + k0 -= 4 * 20; + + _sum0 = __lsx_vfmadd_s(_r40, _k40, _sum0); + _sum0 = __lsx_vfmadd_s(_r41, _k41, _sum0); + _sum0 = __lsx_vfmadd_s(_r42, _k42, _sum0); + _sum0 = __lsx_vfmadd_s(_r43, _k43, _sum0); + _sum0 = __lsx_vfmadd_s(_r44, _k44, _sum0); + + __lsx_vst(_sum0, outptr0, 0); + + outptr0 += 4; + + r0 += 4 * 2; + r1 += 4 * 2; + r2 += 4 * 2; + r3 += 4 * 2; + r4 += 4 * 2; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + r3 += tailstep; + r4 += tailstep; + } + } +} diff --git a/src/layer/loongarch/convolutiondepthwise_loongarch.cpp b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp new file mode 100644 index 000000000000..4d134cc4a39a --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_loongarch.cpp @@ -0,0 +1,966 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolutiondepthwise_loongarch.h" + +#include "layer_type.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +#include "convolutiondepthwise_3x3.h" + +#if __loongarch_sx +#include "convolutiondepthwise_3x3_pack4.h" +#include "convolutiondepthwise_5x5_pack4.h" +#endif // __loongarch_sx + +ConvolutionDepthWise_loongarch::ConvolutionDepthWise_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx + + activation = 0; +} + +int ConvolutionDepthWise_loongarch::create_pipeline(const Option& opt) +{ + if (dynamic_weight) + return 0; + + activation = create_activation_layer(activation_type, activation_params, opt); + +#if NCNN_INT8 + if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + { + return create_pipeline_int8_loongarch(opt); + } +#endif + + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels == group && group == num_output) + { + int elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = channels % 4 == 0 ? 4 : 1; + } +#endif + +#if __loongarch_sx + // pack4 + if (elempack == 4) + { + Mat weight_data_r2 = weight_data.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, 4, opt); + } +#endif // __loongarch_sx + + if (elempack == 1) + { + weight_data_tm = weight_data; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; + } + + // group convolution + create_group_ops(opt); + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int ConvolutionDepthWise_loongarch::create_group_ops(const Option& opt) +{ + // create Convolution op for each group + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + for (int i = 0; i < (int)group_ops.size(); i++) + delete group_ops[i]; + + group_ops.clear(); + + const int channels_g = channels / group; + const int num_output_g = num_output / group; + + group_ops.resize(group); + + for (int g = 0; g < group; g++) + { + Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone(); + Mat bias_data_g; + if (bias_term) + bias_data_g = bias_data.range(num_output_g * g, num_output_g); + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution); + + // set param + ncnn::ParamDict pd; + pd.set(0, num_output_g); // num_output + pd.set(1, kernel_w); + pd.set(11, kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, 0); // pad_w + pd.set(14, 0); // pad_h + pd.set(5, bias_term); + pd.set(6, maxk * channels_g * num_output_g); // weight_data_size + pd.set(8, int8_scale_term); + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + // set weights + if (bias_term) + { + ncnn::Mat weights[5]; + weights[0] = weight_data_g; + weights[1] = bias_data_g; + +#if NCNN_INT8 + if (int8_scale_term) + { + Mat weight_data_int8_scales_g(num_output_g); + weight_data_int8_scales_g.fill(weight_data_int8_scales[g]); + weights[2] = weight_data_int8_scales_g; + weights[3] = bottom_blob_int8_scales.range(g, 1); + } + if (int8_scale_term > 100) + { + weights[4] = top_blob_int8_scales.range(g, 1); + } +#endif + + 
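+            // load the per-group weight and bias mats (plus the int8 scale mats when quantized)
+            // into the child Convolution op created for this group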
op->load_model(ModelBinFromMatArray(weights)); + } + else + { + ncnn::Mat weights[4]; + weights[0] = weight_data_g; + +#if NCNN_INT8 + if (int8_scale_term) + { + Mat weight_data_int8_scales_g(num_output_g); + weight_data_int8_scales_g.fill(weight_data_int8_scales[g]); + weights[1] = weight_data_int8_scales_g; + weights[2] = bottom_blob_int8_scales.range(g, 1); + } + if (int8_scale_term > 100) + { + weights[3] = top_blob_int8_scales.range(g, 1); + } +#endif + + op->load_model(ModelBinFromMatArray(weights)); + } + + op->create_pipeline(opt); + + group_ops[g] = op; + } + + return 0; +} + +int ConvolutionDepthWise_loongarch::destroy_pipeline(const Option& opt) +{ + if (activation) + { + activation->destroy_pipeline(opt); + delete activation; + activation = 0; + } + + for (int i = 0; i < (int)group_ops.size(); i++) + { + group_ops[i]->destroy_pipeline(opt); + delete group_ops[i]; + } + group_ops.clear(); + + return 0; +} + +int ConvolutionDepthWise_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + return forward_int8_loongarch(bottom_blob, top_blob, opt); + } +#endif + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // depth-wise + if (channels * elempack == group && group == num_output) + { +#if __loongarch_sx + if (elempack == 4) + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convdw3x3s1_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convdw3x3s2_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convdw5x5s1_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convdw5x5s2_pack4_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + float* outptr = top_blob.channel(g); + const float* kptr = (const float*)weight_data_tm + maxk * g * 4; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = (__m128)__lsx_vld((const float*)bias_data + g * 4, 0); + } + + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0); + __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convdw3x3s1_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + convdw3x3s2_lsx(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * 
dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + float* outptr = top_blob.channel(g); + const float* kptr = (const float*)weight_data_tm + maxk * g; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[g]; + + const float* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + float val = (float)sptr[space_ofs[k]]; + float w = (float)kptr[k]; + sum += val * w; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } + } + } + + return 0; + } + + // group convolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + g_elempack = channels_g % 4 == 0 ? 4 : 1; + out_g_elempack = num_output_g % 4 == 0 ? 4 : 1; + } +#endif + + // unpacking + Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, 1, opt_p); + } + + Mat top_blob_unpacked = top_blob; + if (out_g_elempack < out_elempack) + { + top_blob_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_unpacked.allocator; + + // forward + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + else + { + top_blob = top_blob_unpacked; + } + + return 0; +} + +int ConvolutionDepthWise_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& _weight_data = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + const int _kernel_w = _weight_data.w; + const int _kernel_h = _weight_data.h; + const int _num_output = _weight_data.c * _weight_data.elempack; + + Mat weight_data_flattened; + flatten(_weight_data, weight_data_flattened, opt); + if (weight_data_flattened.empty()) + return -100; + + // weight_data_flattened as pack1 + weight_data_flattened.w *= weight_data_flattened.elempack; + weight_data_flattened.elemsize /= weight_data_flattened.elempack; + weight_data_flattened.elempack = 1; + + Mat bias_data_flattened; + if (bias_term) + { + const Mat& _bias_data = bottom_blobs[2]; + flatten(_bias_data, bias_data_flattened, opt); + if (bias_data_flattened.empty()) + return -100; + + // bias_data_flattened as pack1 + bias_data_flattened.w *= bias_data_flattened.elempack; + bias_data_flattened.elemsize /= bias_data_flattened.elempack; + 
bias_data_flattened.elempack = 1; + } + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::ConvolutionDepthWise); + + ncnn::ParamDict pd; + pd.set(0, _num_output); + pd.set(1, _kernel_w); + pd.set(11, _kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, pad_left); + pd.set(15, pad_right); + pd.set(14, pad_top); + pd.set(16, pad_bottom); + pd.set(18, pad_value); + pd.set(5, bias_term); + pd.set(6, weight_data_flattened.w); + pd.set(7, group); + pd.set(8, int8_scale_term); + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + ncnn::Mat weights[2]; + weights[0] = weight_data_flattened; + weights[1] = bias_data_flattened; + + op->load_model(ncnn::ModelBinFromMatArray(weights)); + + op->create_pipeline(opt); + + op->forward(bottom_blob, top_blob, opt); + + op->destroy_pipeline(opt); + + delete op; + + return 0; +} + +#if NCNN_INT8 +int ConvolutionDepthWise_loongarch::create_pipeline_int8_loongarch(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels == group && group == num_output) + { + int elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = channels % 8 == 0 ? 8 : 1; + } +#endif // __loongarch_sx + + if (elempack == 8) + { + Mat weight_data_r2 = weight_data.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, 8, opt); + } + + if (elempack == 1) + { + weight_data_tm = weight_data; + } + + return 0; + } + + // group convolution + create_group_ops(opt); + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int ConvolutionDepthWise_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int elempack = bottom_blob.elempack; + + int elembits = bottom_blob.elembits(); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + const int channels_g = channels * elempack / group; + + Mat scales(channels * elempack); + { + float* ps = scales; + for (int g = 0; g < group; g++) + { + float scale = bottom_blob_int8_scales[g]; + for (int q = 0; q < channels_g; q++) + { + *ps++ = scale; + } + } + } + + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q); + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + channels = bottom_blob_bordered.c; + elempack = bottom_blob_bordered.elempack; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + // depth-wise + if (channels * elempack == group && group == num_output) + { + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif // __loongarch_sx + bool use_int8_requantize = int8_scale_term > 100; + size_t out_elemsize = use_int8_requantize ? 
1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 8) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g * 8; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + __m128i _val = __lsx_vld(sptr + space_ofs[k] * 8, 0); + __m128i _val16 = __lsx_vilvl_b(__lsx_vslti_b(_val, 0), _val); + + __m128i _w = __lsx_vld(kptr + k * 8, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val16, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + } + + __m128 _scale_in0; + __m128 _scale_in1; + { + __m128 _bottom_blob_int8_scales0 = (__m128)__lsx_vld((const float*)bottom_blob_int8_scales + g * 8, 0); + __m128 _bottom_blob_int8_scales1 = (__m128)__lsx_vld((const float*)bottom_blob_int8_scales + g * 8 + 4, 0); + __m128 _weight_data_int8_scales0 = (__m128)__lsx_vld((const float*)weight_data_int8_scales + g * 8, 0); + __m128 _weight_data_int8_scales1 = (__m128)__lsx_vld((const float*)weight_data_int8_scales + g * 8 + 4, 0); + _scale_in0 = __lsx_vfrecip_s(__lsx_vfmul_s(_bottom_blob_int8_scales0, _weight_data_int8_scales0)); + _scale_in1 = __lsx_vfrecip_s(__lsx_vfmul_s(_bottom_blob_int8_scales1, _weight_data_int8_scales1)); + + __m128i _m0 = __lsx_vfcmp_cne_s(_weight_data_int8_scales0, __lsx_vreplfr2vr_s(0.f)); + __m128i _m1 = __lsx_vfcmp_cne_s(_weight_data_int8_scales1, __lsx_vreplfr2vr_s(0.f)); + _scale_in0 = (__m128)__lsx_vand_v((__m128i)_scale_in0, (__m128i)_m0); + _scale_in1 = (__m128)__lsx_vand_v((__m128i)_scale_in1, (__m128i)_m1); + } + + __m128 _sumfp32_0 = __lsx_vfmul_s(__lsx_vffint_s_w(_sum0), _scale_in0); + __m128 _sumfp32_1 = __lsx_vfmul_s(__lsx_vffint_s_w(_sum1), _scale_in1); + + if (bias_term) + { + __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + g * 8, 0); + __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + g * 8 + 4, 0); + _sumfp32_0 = __lsx_vfadd_s(_sumfp32_0, _bias0); + _sumfp32_1 = __lsx_vfadd_s(_sumfp32_1, _bias1); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params); + + if (use_int8_requantize) + { + // requantize and relu + __m128 _scale_out0 = (__m128)__lsx_vld((const float*)top_blob_int8_scales + g * 8, 0); + __m128 _scale_out1 = (__m128)__lsx_vld((const float*)top_blob_int8_scales + g * 8 + 4, 0); + _sumfp32_0 = __lsx_vfmul_s(_sumfp32_0, 
_scale_out0); + _sumfp32_1 = __lsx_vfmul_s(_sumfp32_1, _scale_out1); + int64_t _sum8 = float2int8(_sumfp32_0, _sumfp32_1); + + *(int64_t*)outptr_s8 = _sum8; + outptr_s8 += 8; + } + else + { + // dequantize and relu + __lsx_vst(_sumfp32_0, outptr_f32, 0); + __lsx_vst(_sumfp32_1, outptr_f32 + 4, 0); + outptr_f32 += 8; + } + } + } + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int sum = 0; + + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + signed char val = sptr[space_ofs[k]]; + signed char w = kptr[k]; + sum += val * w; + } + + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + float sumfp32 = sum * scale_in; + + if (bias_term) + sumfp32 += bias_data[g]; + + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + if (use_int8_requantize) + { + // requantize + float scale_out = top_blob_int8_scales[g]; + signed char sums8 = float2int8(sumfp32 * scale_out); + outptr_s8[0] = sums8; + outptr_s8 += 1; + } + else + { + // dequantize + outptr_f32[0] = sumfp32; + outptr_f32 += 1; + } + } + } + } + } + } + + return 0; + } + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + if (use_int8_requantize) + out_elempack = num_output % 8 == 0 ? 8 : 1; + else + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __loongarch_sx + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // group convolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + g_elempack = channels_g % 8 == 0 ? 8 : 1; + if (use_int8_requantize) + out_g_elempack = num_output_g % 8 == 0 ? 8 : 1; + else + out_g_elempack = num_output_g % 4 == 0 ? 
4 : 1; + } +#endif // __loongarch_sx + + // unpacking + Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); + } + + Mat top_blob_unpacked = top_blob; + if (out_g_elempack < out_elempack) + { + top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_unpacked.allocator; + + // forward + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + else + { + top_blob = top_blob_unpacked; + } + + return 0; +} +#endif // NCNN_INT8 + +} // namespace ncnn diff --git a/src/layer/loongarch/convolutiondepthwise_loongarch.h b/src/layer/loongarch/convolutiondepthwise_loongarch.h new file mode 100644 index 000000000000..554fe7643049 --- /dev/null +++ b/src/layer/loongarch/convolutiondepthwise_loongarch.h @@ -0,0 +1,50 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
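+// ConvolutionDepthWise_loongarch: LSX pack4 fast paths for 3x3/5x5 stride-1/2 depth-wise
+// convolution, generic packed and unpacked fallbacks, and per-group child Convolution ops
+// for the grouped (non depth-wise) case.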
+ +#ifndef LAYER_CONVOLUTIONDEPTHWISE_LOONGARCH_H +#define LAYER_CONVOLUTIONDEPTHWISE_LOONGARCH_H + +#include "convolutiondepthwise.h" + +namespace ncnn { + +class ConvolutionDepthWise_loongarch : virtual public ConvolutionDepthWise +{ +public: + ConvolutionDepthWise_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + +protected: + int create_group_ops(const Option& opt); +#if NCNN_INT8 + int create_pipeline_int8_loongarch(const Option& opt); + int forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + +public: + Layer* activation; + std::vector group_ops; + + Mat weight_data_tm; +}; + +} // namespace ncnn + +#endif // LAYER_CONVOLUTIONDEPTHWISE_LOONGARCH_H diff --git a/src/layer/loongarch/crop_loongarch.cpp b/src/layer/loongarch/crop_loongarch.cpp new file mode 100644 index 000000000000..e7c588bc4760 --- /dev/null +++ b/src/layer/loongarch/crop_loongarch.cpp @@ -0,0 +1,399 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "crop_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +namespace ncnn { + +Crop_loongarch::Crop_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +#if __loongarch_sx +static void crop_pack4_lsx(const Mat& src, Mat& dst, int top, int left) +{ + int w = dst.w; + int h = dst.h; + int right = src.w - dst.w - left; + + const float* ptr = src.row(top) + left * 4; + float* outptr = dst; + + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } + + ptr += (left + right) * 4; + } +} +#endif // __loongarch_sx + +int Crop_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 4) + { + int _woffset, _hoffset, _doffset, _coffset; + int _outw, _outh, _outd, _outc; + resolve_crop_roi(bottom_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc); + + if (dims == 1) + { + int out_elempack = _outw % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw / out_elempack == w && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_woffset % 4 == 0 && out_elempack == 4) + { + top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + crop_pack4_lsx(bottom_blob, top_blob, 0, _woffset / elempack); + + return 0; + } + } + + if (dims == 2) + { + int out_elempack = _outh % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh / out_elempack == h && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_hoffset % 4 == 0 && out_elempack == 4) + { + top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + crop_pack4_lsx(bottom_blob, top_blob, _hoffset / elempack, _woffset); + + return 0; + } + } + + if (dims == 3) + { + int out_elempack = _outc % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_coffset % 4 == 0 && out_elempack == 4) + { + const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack); + + if (_outw == w && _outh == h) + { + top_blob = bottom_blob_sliced.clone(); + if (top_blob.empty()) + return -100; + } + + top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < top_blob.c; q++) + { + const Mat m = bottom_blob_sliced.channel(q); + Mat borderm = top_blob.channel(q); + + crop_pack4_lsx(m, borderm, _hoffset, _woffset); + } + + return 0; + } + } + + if (dims == 4) + { + int out_elempack = _outc % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_coffset % 4 == 0 && out_elempack == 4) + { + const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack); + + if (_outw == w && _outh == h && _outd == d) + { + top_blob = bottom_blob_sliced.clone(); + if (top_blob.empty()) + return -100; + } + + top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < top_blob.c; q++) + { + for (int z = 0; z < _outd; z++) + { + const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset); + Mat borderm = top_blob.channel(q).depth(z); + + crop_pack4_lsx(m, borderm, _hoffset, _woffset); + } + } + + return 0; + } + } + } +#endif // __loongarch_sx + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + } + + return Crop::forward(bottom_blob_unpacked, top_blob, opt); +} + +int Crop_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& reference_blob = bottom_blobs[1]; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int ref_elempack = reference_blob.elempack; + + Mat& top_blob = top_blobs[0]; + +#if __loongarch_sx + if (elempack == 4) + { + int _woffset, _hoffset, _doffset, _coffset; + int _outw, _outh, _outd, _outc; + if (woffset == -233) + { + resolve_crop_roi(bottom_blob.shape(), (const int*)reference_blob, _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc); + } + else + { + resolve_crop_roi(bottom_blob.shape(), reference_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc); + } + + if (dims == 1) + { + int out_elempack = _outw % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw / out_elempack == w && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_woffset % 4 == 0 && out_elempack == 4) + { + top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + crop_pack4_lsx(bottom_blob, top_blob, 0, _woffset / elempack); + + return 0; + } + } + + if (dims == 2) + { + int out_elempack = _outh % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh / out_elempack == h && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_hoffset % 4 == 0 && out_elempack == 4) + { + top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + crop_pack4_lsx(bottom_blob, top_blob, _hoffset / elempack, _woffset); + + return 0; + } + } + + if (dims == 3) + { + int out_elempack = _outc % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_coffset % 4 == 0 && out_elempack == 4) + { + const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack); + + if (_outw == w && _outh == h) + { + top_blob = bottom_blob_sliced.clone(); + if (top_blob.empty()) + return -100; + } + + top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < top_blob.c; q++) + { + const Mat m = bottom_blob_sliced.channel(q); + Mat borderm = top_blob.channel(q); + + crop_pack4_lsx(m, borderm, _hoffset, _woffset); + } + + return 0; + } + } + + if (dims == 4) + { + int out_elempack = _outc % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 4) + { + top_blob = bottom_blob; + return 0; + } + + if (_coffset % 4 == 0 && out_elempack == 4) + { + const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack); + + if (_outw == w && _outh == h && _outd == d) + { + top_blob = bottom_blob_sliced.clone(); + if (top_blob.empty()) + return -100; + } + + top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < top_blob.c; q++) + { + for (int z = 0; z < _outd; z++) + { + const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset); + Mat borderm = top_blob.channel(q).depth(z); + + crop_pack4_lsx(m, borderm, _hoffset, _woffset); + } + } + + return 0; + } + } + } +#endif // __loongarch_sx + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + } + + Mat reference_blob_unpacked = reference_blob; + if (ref_elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(reference_blob, reference_blob_unpacked, 1, opt_pack1); + } + + std::vector bottom_blobs_unpacked(2); + bottom_blobs_unpacked[0] = bottom_blob_unpacked; + bottom_blobs_unpacked[1] = reference_blob_unpacked; + + return Crop::forward(bottom_blobs_unpacked, top_blobs, opt); +} + +} // namespace ncnn diff --git a/src/layer/loongarch/crop_loongarch.h b/src/layer/loongarch/crop_loongarch.h new file mode 100644 index 000000000000..0ba460256d6a --- /dev/null +++ b/src/layer/loongarch/crop_loongarch.h @@ -0,0 +1,34 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_CROP_LOONGARCH_H +#define LAYER_CROP_LOONGARCH_H + +#include "crop.h" + +namespace ncnn { + +class Crop_loongarch : virtual public Crop +{ +public: + Crop_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_CROP_LOONGARCH_H diff --git a/src/layer/loongarch/deconvolution_loongarch.cpp b/src/layer/loongarch/deconvolution_loongarch.cpp new file mode 100644 index 000000000000..bb913909b551 --- /dev/null +++ b/src/layer/loongarch/deconvolution_loongarch.cpp @@ -0,0 +1,284 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "deconvolution_loongarch.h" + +#include "layer_type.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +#if __loongarch_sx +#include "deconvolution_pack4.h" +#include "deconvolution_pack1to4.h" +#include "deconvolution_pack4to1.h" +#endif // __loongarch_sx + +Deconvolution_loongarch::Deconvolution_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Deconvolution_loongarch::create_pipeline(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + int num_input = weight_data_size / maxk / num_output; + + Mat weight_data_transposed(weight_data.w); + { + float* pt = weight_data_transposed; + const float* p = weight_data; + + for (int i = 0; i < num_input * num_output; i++) + { + for (int k = 0; k < maxk; k++) + { + pt[maxk - 1 - k] = p[k]; + } + + p += maxk; + pt += maxk; + } + } + + int elempack = 1; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + + // src = kw-kh-inch-outch + // dst = pb-pa-kw-kh-inch/pa-outch/pb + { + Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output); + + weight_data_tm.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + float* g00 = weight_data_tm.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < elempack; i++) + { + for (int j = 0; j < out_elempack; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + } + +#if __loongarch_sx + // pack4 + if (elempack == 4 && out_elempack == 4) + { + } + + // pack1ton + if (elempack == 1 && out_elempack == 4) + { + } + + // pack4to1 + if (elempack == 4 && out_elempack == 1) + { + } +#endif // __loongarch_sx + + // pack1 + if (elempack == 1 && out_elempack == 1) + { + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int Deconvolution_loongarch::destroy_pipeline(const Option& opt) +{ + return 0; +} + +int Deconvolution_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + // deconvolv with NxN kernel + // value = value + bias + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + // NCNN_LOGE("Deconvolution input %d x %d pad = %d %d ksize=%d %d stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; + int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat top_blob_bordered; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0)) + { + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator); + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + } + if (top_blob_bordered.empty()) + return -100; + + const int maxk = kernel_w * kernel_h; + +#if __loongarch_sx + if (elempack == 4 && out_elempack == 4) + { + { + deconvolution_pack4_lsx(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 4) + { + { + deconvolution_pack1to4_lsx(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } + + if (elempack == 4 && out_elempack == 1) + { + { + deconvolution_pack4to1_lsx(bottom_blob, top_blob_bordered, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type, activation_params, opt); + } + } +#endif // __loongarch_sx + + if (elempack == 1 && out_elempack == 1) + { + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output; p++) + { + float* outptr = top_blob_bordered.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + const float* kptr = (const float*)weight_data_tm.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + const float* sptr = m.row(sy); + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + float val = sptr[sx]; + + int k = y * kernel_w + x; + + float w = kptr[k]; + + sum += val * w; + } + } + + kptr += maxk; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } + } + } + + cut_padding(top_blob_bordered, top_blob, opt); + if (top_blob.empty()) + return -100; + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/deconvolution_loongarch.h b/src/layer/loongarch/deconvolution_loongarch.h new file mode 100644 index 000000000000..bb7653b563fa --- /dev/null +++ b/src/layer/loongarch/deconvolution_loongarch.h @@ -0,0 +1,38 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DECONVOLUTION_LOONGARCH_H +#define LAYER_DECONVOLUTION_LOONGARCH_H + +#include "deconvolution.h" + +namespace ncnn { + +class Deconvolution_loongarch : virtual public Deconvolution +{ +public: + Deconvolution_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +public: + Mat weight_data_tm; +}; + +} // namespace ncnn + +#endif // LAYER_DECONVOLUTION_LOONGARCH_H diff --git a/src/layer/loongarch/deconvolution_pack1to4.h b/src/layer/loongarch/deconvolution_pack1to4.h new file mode 100644 index 000000000000..ee1f932b57a9 --- /dev/null +++ b/src/layer/loongarch/deconvolution_pack1to4.h @@ -0,0 +1,99 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
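// --- Editorial note (illustrative sketch, not part of the original patch) ---
// The pack1to4 kernel below, the pack4 and pack4to1 kernels that follow, and the
// scalar fallback in deconvolution_loongarch.cpp above all use the same
// output-centric "gather" formulation of deconvolution: for each output pixel
// (i, j) and kernel tap (y, x) the candidate source coordinate is computed and
// the tap is skipped unless it lines up with an actual input sample. The helper
// below (hypothetical name, for reference only) restates that index test in
// scalar form.
static inline bool deconv_gather_index(int out_coord, int k, int dilation, int stride,
                                       int kernel_extent, int in_size, int* src)
{
    // corresponds to sys/sxs = out_coord + k * dilation - (kernel_extent - 1) below
    int s = out_coord + k * dilation - (kernel_extent - 1);
    if (s < 0 || s % stride != 0)
        return false; // this tap does not align with any input sample
    int q = s / stride;
    if (q >= in_size)
        return false; // aligned, but falls outside the input
    *src = q; // the input row/column that contributes to this output pixel
    return true;
}
// The SIMD variants keep exactly this control flow and only vectorize the
// multiply-accumulate across the packed channel lanes.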
+ +static void deconvolution_pack1to4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_data_ptr) + { + _sum = (__m128)__lsx_vld((const float*)bias_data_ptr + p * 4, 0); + } + + const float* kptr = (const float*)weight_data_pack1ton + maxk * channels * p * 4; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + const float* sptr = m.row(sy); + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + float val = sptr[sx]; + + int k = y * kernel_w + x; + + __m128 _val = (__m128)__lsx_vreplfr2vr_s(val); + __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + } + } + + kptr += maxk * 4; + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/deconvolution_pack4.h b/src/layer/loongarch/deconvolution_pack4.h new file mode 100644 index 000000000000..179a410350fb --- /dev/null +++ b/src/layer/loongarch/deconvolution_pack4.h @@ -0,0 +1,106 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
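// --- Editorial note (illustrative sketch, not part of the original patch) ---
// In the pack4 kernel below, every kernel tap stores a 4x4 weight block of
// 16 floats laid out as [input lane][output lane] by create_pipeline. Each of
// the four input lanes is broadcast and multiplied against one 4-wide weight
// row of that block, so one tap costs four __lsx_vfmadd_s. The scalar
// equivalent of the per-tap update is:
static inline void deconv_pack4_tap(const float val[4],  // 4 input lanes at (sy, sx)
                                    const float wblk[16], // 4x4 weight block for this tap
                                    float sum[4])         // 4 output lanes being accumulated
{
    for (int out_lane = 0; out_lane < 4; out_lane++)
    {
        sum[out_lane] += val[0] * wblk[0 * 4 + out_lane]
                       + val[1] * wblk[1 * 4 + out_lane]
                       + val[2] * wblk[2 * 4 + out_lane]
                       + val[3] * wblk[3 * 4 + out_lane];
    }
}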
+ +static void deconvolution_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_data_ptr) + { + _sum = (__m128)__lsx_vld((const float*)bias_data_ptr + p * 4, 0); + } + + const float* kptr = (const float*)weight_data_pack4.channel(p); + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + const float* sptr = m.row(sy) + sx * 4; + + int k = (y * kernel_w + x) * 16; + + __m128 _val0 = (__m128)__lsx_vreplfr2vr_s(*sptr++); + __m128 _val1 = (__m128)__lsx_vreplfr2vr_s(*sptr++); + __m128 _val2 = (__m128)__lsx_vreplfr2vr_s(*sptr++); + __m128 _val3 = (__m128)__lsx_vreplfr2vr_s(*sptr++); + __m128 _w0 = (__m128)__lsx_vld(kptr + k, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + k + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + k + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + k + 12, 0); + _sum = __lsx_vfmadd_s(_w0, _val0, _sum); + _sum = __lsx_vfmadd_s(_w1, _val1, _sum); + _sum = __lsx_vfmadd_s(_w2, _val2, _sum); + _sum = __lsx_vfmadd_s(_w3, _val3, _sum); + } + } + + kptr += maxk * 16; + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } +} diff --git a/src/layer/loongarch/deconvolution_pack4to1.h b/src/layer/loongarch/deconvolution_pack4to1.h new file mode 100644 index 000000000000..e13721c2c35d --- /dev/null +++ b/src/layer/loongarch/deconvolution_pack4to1.h @@ -0,0 +1,101 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
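// --- Editorial note (illustrative sketch, not part of the original patch) ---
// The pack4to1 kernel below produces a single scalar per output pixel but still
// loads 4 packed input lanes per tap, so partial products are accumulated in a
// 4-wide vector and only folded down once per output pixel via
// __lsx_reduce_fadd_s before the bias-initialized scalar sum is finalized.
// The scalar meaning of that horizontal reduction is simply:
static inline float horizontal_sum4(const float v[4])
{
    // fold the 4 per-lane partial sums into one scalar
    return v[0] + v[1] + v[2] + v[3];
}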
+ +static void deconvolution_pack4to1_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack4to1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outch; p++) + { + float* outptr = top_blob.channel(p); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_data_ptr) + { + sum = bias_data_ptr[p]; + } + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + const float* kptr = (const float*)weight_data_pack4to1 + maxk * channels * p * 4; + + // channels + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob.channel(q); + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + const float* sptr = m.row(sy) + sx * 4; + + int k = y * kernel_w + x; + + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + } + } + + kptr += maxk * 4; + } + + sum += __lsx_reduce_fadd_s(_sum); + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } +} diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp new file mode 100644 index 000000000000..a141dd703601 --- /dev/null +++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp @@ -0,0 +1,412 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
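// --- Editorial note (illustrative sketch, not part of the original patch) ---
// Like Deconvolution_loongarch::create_pipeline above, the depthwise
// create_pipeline below first builds weight_data_transposed by reversing the
// taps of every kernel_w x kernel_h kernel (pt[maxk - 1 - k] = p[k]), i.e. a
// 180-degree rotation, so the forward pass can gather with plain correlation
// indexing. A minimal standalone restatement of that flip:
static inline void flip_kernel_taps(const float* src, float* dst, int maxk, int num_kernels)
{
    for (int n = 0; n < num_kernels; n++)
    {
        for (int k = 0; k < maxk; k++)
            dst[maxk - 1 - k] = src[k]; // reverse tap order within one kernel
        src += maxk;
        dst += maxk;
    }
}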
+ +#include "deconvolutiondepthwise_loongarch.h" + +#include "layer_type.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +DeconvolutionDepthWise_loongarch::DeconvolutionDepthWise_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int DeconvolutionDepthWise_loongarch::create_pipeline(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels == group && group == num_output) + { + int elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + elempack = channels % 4 == 0 ? 4 : 1; + } +#endif + + Mat weight_data_transposed(weight_data.w); + { + float* pt = weight_data_transposed; + const float* p = weight_data; + + for (int i = 0; i < (channels / group) * (num_output / group) * group; i++) + { + for (int k = 0; k < maxk; k++) + { + pt[maxk - 1 - k] = p[k]; + } + + p += maxk; + pt += maxk; + } + } + +#if __loongarch_sx + // pack4 + if (elempack == 4) + { + Mat weight_data_r2 = weight_data_transposed.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, 4, opt); + } +#endif // __loongarch_sx + + if (elempack == 1) + { + weight_data_tm = weight_data_transposed; + } + + return 0; + } + + // group convolution + create_group_ops(opt); + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int DeconvolutionDepthWise_loongarch::create_group_ops(const Option& opt) +{ + // create Deconvolution op for each group + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + for (int i = 0; i < (int)group_ops.size(); i++) + delete group_ops[i]; + + group_ops.clear(); + + const int channels_g = channels / group; + const int num_output_g = num_output / group; + + group_ops.resize(group); + + for (int g = 0; g < group; g++) + { + Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g).clone(); + Mat bias_data_g; + if (bias_term) + bias_data_g = bias_data.range(num_output_g * g, num_output_g); + + ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution); + + // set param + ncnn::ParamDict pd; + pd.set(0, num_output_g); // num_output + pd.set(1, kernel_w); + pd.set(11, kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, 0); // pad_w + pd.set(14, 0); // pad_h + pd.set(18, output_pad_right); + pd.set(19, output_pad_bottom); + pd.set(5, bias_term); + pd.set(6, maxk * channels_g * num_output_g); // weight_data_size + pd.set(9, activation_type); + pd.set(10, activation_params); + + op->load_param(pd); + + // set weights + if (bias_term) + { + ncnn::Mat weights[2]; + weights[0] = weight_data_g; + weights[1] = bias_data_g; + + op->load_model(ModelBinFromMatArray(weights)); + } + else + { + ncnn::Mat weights[1]; + weights[0] = weight_data_g; + + op->load_model(ModelBinFromMatArray(weights)); + } + + op->create_pipeline(opt); + + group_ops[g] = op; + } + + return 0; +} + +int DeconvolutionDepthWise_loongarch::destroy_pipeline(const Option& opt) +{ + for (int i = 0; i < (int)group_ops.size(); i++) + { + group_ops[i]->destroy_pipeline(opt); + delete group_ops[i]; + } + group_ops.clear(); + + return 0; +} + +int DeconvolutionDepthWise_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) 
const +{ + // convolv with NxN kernel + // value = value + bias + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - 1) * stride_w + kernel_extent_w + output_pad_right; + int outh = (h - 1) * stride_h + kernel_extent_h + output_pad_bottom; + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat top_blob_bordered; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0 || (output_w > 0 && output_h > 0)) + { + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_allocator); + } + else + { + top_blob_bordered = top_blob; + top_blob_bordered.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + } + if (top_blob_bordered.empty()) + return -100; + + const int maxk = kernel_w * kernel_h; + + // depth-wise + if (channels * elempack == group && group == num_output) + { +#if __loongarch_sx + if (elempack == 4) + { + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + float* outptr = top_blob_bordered.channel(g); + const float* kptr = (const float*)weight_data_tm + maxk * g * 4; + const Mat m = bottom_blob.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = (__m128)__lsx_vld((const float*)bias_data + g * 4, 0); + } + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + const float* sptr = m.row(sy) + sx * 4; + + int k = y * kernel_w + x; + + __m128 _val = (__m128)__lsx_vld(sptr, 0); + __m128 _w = (__m128)__lsx_vld(kptr + k * 4, 0); + _sum = __lsx_vfmadd_s(_w, _val, _sum); + } + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + float* outptr = top_blob_bordered.channel(g); + const float* kptr = (const float*)weight_data_tm + maxk * g; + const Mat m = bottom_blob.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[g]; + } + + for (int y = 0; y < kernel_h; y++) + { + int sys = (i + y * dilation_h - (kernel_extent_h - 1)); + if (sys < 0 || sys % stride_h != 0) + continue; + + int sy = sys / stride_h; + if (sy >= h) + continue; + + const float* sptr = m.row(sy); + + for (int x = 0; x < kernel_w; x++) + { + int sxs = (j + x * dilation_w - (kernel_extent_w - 1)); + if (sxs < 0 || sxs % stride_w != 0) + continue; + + int sx = sxs / stride_w; + if (sx >= w) + continue; + + float val = sptr[sx]; + + int k = y * kernel_w + x; + + float w = 
kptr[k]; + + sum += val * w; + } + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[j] = sum; + } + + outptr += outw; + } + } + } + } + else + { + // group deconvolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + g_elempack = channels_g % 4 == 0 ? 4 : 1; + out_g_elempack = num_output_g % 4 == 0 ? 4 : 1; + } +#endif + + // unpacking + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_p); + } + + Mat top_blob_bordered_unpacked = top_blob_bordered; + if (out_g_elempack < out_elempack) + { + top_blob_bordered_unpacked.create(outw, outh, num_output, out_elemsize / out_elempack, 1, opt.workspace_allocator); + if (top_blob_bordered_unpacked.empty()) + return -100; + } + + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_g = bottom_blob_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_bordered_g = top_blob_bordered_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_bordered_unpacked.allocator; + + // forward + op->forward(bottom_blob_g, top_blob_bordered_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_bordered_unpacked, top_blob_bordered, 4, opt); + } + else + { + top_blob_bordered = top_blob_bordered_unpacked; + } + } + + cut_padding(top_blob_bordered, top_blob, opt); + if (top_blob.empty()) + return -100; + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.h b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h new file mode 100644 index 000000000000..e41e7cac9e18 --- /dev/null +++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h @@ -0,0 +1,43 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
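// --- Editorial note (illustrative sketch, not part of the original patch) ---
// In the group (non-depthwise) branch of DeconvolutionDepthWise_loongarch::forward
// above, packing is decided per group: a group only stays packed if its own
// per-group channel counts are multiples of 4. If the incoming blob is packed
// wider than the group allows it is unpacked to elempack 1 before the per-group
// Deconvolution ops run, and if the per-group output pack is narrower than the
// final out_elempack the results are repacked to 4 afterwards. The decision
// reduces to this arithmetic (assuming use_packing_layout and LSX are enabled):
static inline void group_pack_choice(int channels_per_group, int num_output_per_group,
                                     int* g_elempack, int* out_g_elempack)
{
    *g_elempack = (channels_per_group % 4 == 0) ? 4 : 1;
    *out_g_elempack = (num_output_per_group % 4 == 0) ? 4 : 1;
}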
+ +#ifndef LAYER_DECONVOLUTIONDEPTHWISE_LOONGARCH_H +#define LAYER_DECONVOLUTIONDEPTHWISE_LOONGARCH_H + +#include "deconvolutiondepthwise.h" + +namespace ncnn { + +class DeconvolutionDepthWise_loongarch : virtual public DeconvolutionDepthWise +{ +public: + DeconvolutionDepthWise_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + int create_group_ops(const Option& opt); + +public: + std::vector group_ops; + + Mat weight_data_tm; +}; + +} // namespace ncnn + +#endif // LAYER_DECONVOLUTIONDEPTHWISE_LOONGARCH_H diff --git a/src/layer/loongarch/dequantize_loongarch.cpp b/src/layer/loongarch/dequantize_loongarch.cpp new file mode 100644 index 000000000000..5ee9595f89f0 --- /dev/null +++ b/src/layer/loongarch/dequantize_loongarch.cpp @@ -0,0 +1,838 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "dequantize_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Dequantize_loongarch::Dequantize_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Dequantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + // assert bottom_blob.elembits() == 32 + + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 8) + { + if (dims == 1) + { + int w = bottom_blob.w; + int outw = w * 2; + + top_blob.create(outw, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, 
ptr, 0); + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outw; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int outh = h * 2; + + top_blob.create(w, outh, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 2); + float* ptr1 = top_blob.row(i * 2 + 1); + + __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8, 0); + __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v1, ptr1, 0); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 2); + float* ptr1 = top_blob.row(i * 2 + 1); + + __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8, 0); + __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v1, ptr1, 0); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = channels * 2; + + top_blob.create(w, h, outc, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 2); + float* ptr1 = top_blob.channel(q * 2 + 1); + + __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8, 0); + __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8 + 4, 0); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v2, ptr0 + 4, 0); + __lsx_vst(_v1, ptr1, 0); + __lsx_vst(_v3, ptr1 + 4, 0); + + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v1, ptr1, 0); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 2); + float* ptr1 = top_blob.channel(q * 2 + 1); + + __m128 _scale0 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8, 0); + __m128 _scale1 = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v2, ptr0 + 4, 0); + __lsx_vst(_v1, ptr1, 0); + __lsx_vst(_v3, ptr1 + 4, 0); + + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + __lsx_vst(_v0, ptr0, 0); + __lsx_vst(_v1, ptr1, 0); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + else + { + #pragma omp parallel for 
num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + float* ptr = (float*)top_blob + i * 4; + + __m128 _scale = (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + i * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 4, 0); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + __lsx_vst(_v0, ptr, 0); + __lsx_vst(_v1, ptr + 4, 0); + + intptr += 8; + ptr += 8; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + __m128 _scale = scale_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_data[0]) : (__m128)__lsx_vld((const float*)scale_data + q * 4, 0); + __m128 _bias = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale, _v1, _bias); + __lsx_vst(_v0, ptr, 0); + __lsx_vst(_v1, ptr + 4, 0); + + intptr += 8; + ptr += 8; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } + } + } + } + + return 0; + } +#endif // __loongarch_sx + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + float* ptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias_data[i]; + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i]; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias_data[i]; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + int j = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + for (; j + 3 < w; j += 4) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + *ptr++ = *intptr++ * scale; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const float bias = bias_data_size == 1 ? 
bias_data[0] : bias_data[i]; + + int j = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias); + for (; j + 3 < w; j += 4) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + *ptr++ = *intptr++ * scale + bias; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + + int i = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + __lsx_vst(_v0, ptr, 0); + __lsx_vst(_v1, ptr + 4, 0); + + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr++ = *intptr++ * scale; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + int i = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias); + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale, _v1, _bias); + __lsx_vst(_v0, ptr, 0); + __lsx_vst(_v1, ptr + 4, 0); + + intptr += 8; + ptr += 8; + } + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __lsx_vst(_v, ptr, 0); + + intptr += 4; + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr++ = *intptr++ * scale + bias; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/dequantize_loongarch.h b/src/layer/loongarch/dequantize_loongarch.h new file mode 100644 index 000000000000..61a408d5c505 --- /dev/null +++ b/src/layer/loongarch/dequantize_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DEQUANTIZE_LOONGARCH_H +#define LAYER_DEQUANTIZE_LOONGARCH_H + +#include "dequantize.h" + +namespace ncnn { + +class Dequantize_loongarch : virtual public Dequantize +{ +public: + Dequantize_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_DEQUANTIZE_LOONGARCH_H diff --git a/src/layer/loongarch/dropout_loongarch.cpp b/src/layer/loongarch/dropout_loongarch.cpp new file mode 100644 index 000000000000..04a1f9ea95d8 --- /dev/null +++ b/src/layer/loongarch/dropout_loongarch.cpp @@ -0,0 +1,75 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "dropout_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Dropout_loongarch::Dropout_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Dropout_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + if (scale == 1.f) + { + return 0; + } + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmul_s(_p, _scale); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = *ptr * scale; + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/dropout_loongarch.h b/src/layer/loongarch/dropout_loongarch.h new file mode 100644 index 000000000000..42810050677a --- /dev/null +++ b/src/layer/loongarch/dropout_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DROPOUT_LOONGARCH_H +#define LAYER_DROPOUT_LOONGARCH_H + +#include "dropout.h" + +namespace ncnn { + +class Dropout_loongarch : virtual public Dropout +{ +public: + Dropout_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_DROPOUT_LOONGARCH_H diff --git a/src/layer/loongarch/eltwise_loongarch.cpp b/src/layer/loongarch/eltwise_loongarch.cpp new file mode 100644 index 000000000000..d803fc3db78e --- /dev/null +++ b/src/layer/loongarch/eltwise_loongarch.cpp @@ -0,0 +1,332 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "eltwise_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Eltwise_loongarch::Eltwise_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Eltwise_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int elempack = bottom_blob.elempack; + int size = w * h * elempack; + + Mat& top_blob = top_blobs[0]; + top_blob.create_like(bottom_blob, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (op_type == Operation_PROD) + { + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + _p = __lsx_vfmul_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = *ptr * *ptr1; + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(outptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr, 0); 
+ _p = __lsx_vfmul_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr *= *ptr; + + ptr++; + outptr++; + } + } + } + } + if (op_type == Operation_SUM) + { + if (coeffs.w == 0) + { + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + _p = __lsx_vfadd_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = *ptr + *ptr1; + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(outptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfadd_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr += *ptr; + + ptr++; + outptr++; + } + } + } + } + else + { + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + float coeff0 = coeffs[0]; + float coeff1 = coeffs[1]; +#if __loongarch_sx + __m128 _coeff0 = (__m128)__lsx_vreplfr2vr_s(coeff0); + __m128 _coeff1 = (__m128)__lsx_vreplfr2vr_s(coeff1); +#endif // __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + _p = __lsx_vfmul_s(_p, _coeff0); + _p = __lsx_vfmadd_s(_coeff1, _p1, _p); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = *ptr * coeff0 + *ptr1 * coeff1; + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + float coeff = coeffs[b]; +#if __loongarch_sx + __m128 _coeff = (__m128)__lsx_vreplfr2vr_s(coeff); +#endif // __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(outptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmadd_s(_coeff, _p1, _p); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr += *ptr * coeff; + + ptr++; + outptr++; + } + } + } + } + } + if (op_type == Operation_MAX) + { + // first blob + const Mat& bottom_blob1 = bottom_blobs[1]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = 
bottom_blob.channel(q); + const float* ptr1 = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr1, 0); + _p = __lsx_vfmax_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + ptr1 += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = std::max(*ptr, *ptr1); + + ptr++; + ptr1++; + outptr++; + } + } + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + const Mat& bottom_blob1 = bottom_blobs[b]; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob1.channel(q); + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __m128 _p = (__m128)__lsx_vld(outptr, 0); + __m128 _p1 = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmax_s(_p, _p1); + __lsx_vst(_p, outptr, 0); + + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr = std::max(*ptr, *outptr); + + ptr++; + outptr++; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/eltwise_loongarch.h b/src/layer/loongarch/eltwise_loongarch.h new file mode 100644 index 000000000000..f9715b20cadc --- /dev/null +++ b/src/layer/loongarch/eltwise_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_ELTWISE_LOONGARCH_H +#define LAYER_ELTWISE_LOONGARCH_H + +#include "eltwise.h" + +namespace ncnn { + +class Eltwise_loongarch : virtual public Eltwise +{ +public: + Eltwise_loongarch(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_ELTWISE_LOONGARCH_H diff --git a/src/layer/loongarch/flatten_loongarch.cpp b/src/layer/loongarch/flatten_loongarch.cpp new file mode 100644 index 000000000000..6d9a86362873 --- /dev/null +++ b/src/layer/loongarch/flatten_loongarch.cpp @@ -0,0 +1,370 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
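+
+// Flatten with packing-aware layout handling: when the input is packed
+// (elempack == 4 for fp32, 8 for int8), flattening is not a plain copy,
+// because packed rows/channels interleave values that belong to different
+// output positions. The LSX path below de-interleaves fp32 data with a
+// 4x4 transpose built from __lsx_vilvl_w/__lsx_vilvh_w and
+// __lsx_vilvl_d/__lsx_vilvh_d. A minimal scalar sketch of the same
+// rearrangement (illustration only, not part of the build):
+//
+//   // in:  one packed row, w groups of 4 interleaved values
+//   // out: 4 planar output rows of length w
+//   for (int j = 0; j < w; j++)
+//       for (int k = 0; k < 4; k++)
+//           out[k][j] = in[j * 4 + k];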
+ +#include "flatten_loongarch.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +namespace ncnn { + +Flatten_loongarch::Flatten_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Flatten_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + if (elembits == 8) + return forward_int8(bottom_blob, top_blob, opt); + + int dims = bottom_blob.dims; + + if (dims == 1) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + int size = w * h * d; + + int total = size * channels * elempack; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = total % 4 == 0 ? 4 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (out_elempack == 1) + { + return Flatten::forward(bottom_blob, top_blob, opt); + } + + if (dims == 2 && elempack == 1) // out_elempack == 4 + { + top_blob = bottom_blob; + top_blob.dims = 1; + top_blob.w = total / out_elempack; + top_blob.h = 1; + top_blob.cstep = top_blob.w; + top_blob.elemsize = out_elemsize; + top_blob.elempack = out_elempack; + return 0; + } + + top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (dims == 2) + { +#if __loongarch_sx + if (elempack == 4) // out_elempack == 4 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr = bottom_blob.row(i); + float* outptr0 = (float*)top_blob + w * i * 4; + float* outptr1 = (float*)top_blob + w * (i * 4 + 1); + float* outptr2 = (float*)top_blob + w * (i * 4 + 2); + float* outptr3 = (float*)top_blob + w * (i * 4 + 3); + + int j = 0; + for (; j + 3 < w; j += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(ptr, 0); + __m128i _r1 = __lsx_vld(ptr + 4, 0); + __m128i _r2 = __lsx_vld(ptr + 4 * 2, 0); + __m128i _r3 = __lsx_vld(ptr + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr0, 0); + __lsx_vst(_r0123_1, outptr1, 0); + __lsx_vst(_r0123_2, outptr2, 0); + __lsx_vst(_r0123_3, outptr3, 0); + + ptr += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } + for (; j < w; j++) + { + *outptr0++ = ptr[0]; + *outptr1++ = ptr[1]; + *outptr2++ = ptr[2]; + *outptr3++ = ptr[3]; + + ptr += 4; + } + } + } +#endif // __loongarch_sx + } + + if (dims == 3 || dims == 4) + { +#if __loongarch_sx + if (elempack == 4) // out_elempack == 4 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + float* outptr0 = (float*)top_blob + size * q * 4; + float* outptr1 = (float*)top_blob + size * (q * 4 + 1); + float* outptr2 = (float*)top_blob + size * (q * 4 + 2); + float* outptr3 = (float*)top_blob + size * (q * 4 + 3); + + int i = 0; + for (; i + 3 < size; i += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(ptr, 0); + __m128i _r1 = 
__lsx_vld(ptr + 4, 0); + __m128i _r2 = __lsx_vld(ptr + 4 * 2, 0); + __m128i _r3 = __lsx_vld(ptr + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr0, 0); + __lsx_vst(_r0123_1, outptr1, 0); + __lsx_vst(_r0123_2, outptr2, 0); + __lsx_vst(_r0123_3, outptr3, 0); + + ptr += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } + for (; i < size; i++) + { + *outptr0++ = ptr[0]; + *outptr1++ = ptr[1]; + *outptr2++ = ptr[2]; + *outptr3++ = ptr[3]; + + ptr += 4; + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) // out_elempack == 4 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + float* outptr = (float*)top_blob + size * q; + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __lsx_vst(__lsx_vld(ptr, 0), outptr, 0); + ptr += 4; + outptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr++ = *ptr++; + } + } + } + } + + return 0; +} + +int Flatten_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + + if (dims == 1) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + int size = w * h * d; + + int total = size * channels * elempack; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = total % 8 == 0 ? 
8 : 1; + } +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (out_elempack == 1) + { + return Flatten::forward(bottom_blob, top_blob, opt); + } + + if (dims == 2 && elempack == 1) // out_elempack == 8 + { + top_blob = bottom_blob; + top_blob.dims = 1; + top_blob.w = total / out_elempack; + top_blob.h = 1; + top_blob.cstep = top_blob.w; + top_blob.elemsize = out_elemsize; + top_blob.elempack = out_elempack; + return 0; + } + + top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (dims == 2) + { +#if __loongarch_sx + if (elempack == 8) // out_elempack == 8 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const signed char* ptr = bottom_blob.row(i); + signed char* outptr0 = (signed char*)top_blob + w * i * 8; + signed char* outptr1 = (signed char*)top_blob + w * (i * 8 + 1); + signed char* outptr2 = (signed char*)top_blob + w * (i * 8 + 2); + signed char* outptr3 = (signed char*)top_blob + w * (i * 8 + 3); + signed char* outptr4 = (signed char*)top_blob + w * (i * 8 + 4); + signed char* outptr5 = (signed char*)top_blob + w * (i * 8 + 5); + signed char* outptr6 = (signed char*)top_blob + w * (i * 8 + 6); + signed char* outptr7 = (signed char*)top_blob + w * (i * 8 + 7); + + int j = 0; + for (; j < w; j++) + { + *outptr0++ = ptr[0]; + *outptr1++ = ptr[1]; + *outptr2++ = ptr[2]; + *outptr3++ = ptr[3]; + *outptr4++ = ptr[4]; + *outptr5++ = ptr[5]; + *outptr6++ = ptr[6]; + *outptr7++ = ptr[7]; + + ptr += 8; + } + } + } +#endif // __loongarch_sx + } + + if (dims == 3 || dims == 4) + { +#if __loongarch_sx + if (elempack == 8) // out_elempack == 8 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const signed char* ptr = bottom_blob.channel(q); + signed char* outptr0 = (signed char*)top_blob + size * q * 8; + signed char* outptr1 = (signed char*)top_blob + size * (q * 8 + 1); + signed char* outptr2 = (signed char*)top_blob + size * (q * 8 + 2); + signed char* outptr3 = (signed char*)top_blob + size * (q * 8 + 3); + signed char* outptr4 = (signed char*)top_blob + size * (q * 8 + 4); + signed char* outptr5 = (signed char*)top_blob + size * (q * 8 + 5); + signed char* outptr6 = (signed char*)top_blob + size * (q * 8 + 6); + signed char* outptr7 = (signed char*)top_blob + size * (q * 8 + 7); + + int i = 0; + for (; i < size; i++) + { + *outptr0++ = ptr[0]; + *outptr1++ = ptr[1]; + *outptr2++ = ptr[2]; + *outptr3++ = ptr[3]; + *outptr4++ = ptr[4]; + *outptr5++ = ptr[5]; + *outptr6++ = ptr[6]; + *outptr7++ = ptr[7]; + + ptr += 8; + } + } + } +#endif // __loongarch_sx + + if (elempack == 1) // out_elempack == 8 + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const signed char* ptr = bottom_blob.channel(q); + signed char* outptr = (signed char*)top_blob + size * q; + + int i = 0; + for (; i < size; i++) + { + *outptr++ = *ptr++; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/flatten_loongarch.h b/src/layer/loongarch/flatten_loongarch.h new file mode 100644 index 000000000000..afd35c701f59 --- /dev/null +++ b/src/layer/loongarch/flatten_loongarch.h @@ -0,0 +1,35 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. 
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_FLATTEN_LOONGARCH_H +#define LAYER_FLATTEN_LOONGARCH_H + +#include "flatten.h" + +namespace ncnn { + +class Flatten_loongarch : virtual public Flatten +{ +public: + Flatten_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_FLATTEN_LOONGARCH_H diff --git a/src/layer/loongarch/hardsigmoid_loongarch.cpp b/src/layer/loongarch/hardsigmoid_loongarch.cpp new file mode 100644 index 000000000000..9dfedb689bc5 --- /dev/null +++ b/src/layer/loongarch/hardsigmoid_loongarch.cpp @@ -0,0 +1,79 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
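+
+// HardSigmoid: y = clamp(alpha * x + beta, 0, 1).
+// The LSX loop below fuses this as vfmadd followed by vfmax/vfmin against
+// constant 0 and 1 vectors; the scalar tail uses the lower/upper thresholds
+// provided by the base HardSigmoid layer instead.
+// Scalar equivalent of one vector lane (sketch):
+//
+//   float y = x * alpha + beta;
+//   y = y < 0.f ? 0.f : (y > 1.f ? 1.f : y);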
+ +#include "hardsigmoid_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +HardSigmoid_loongarch::HardSigmoid_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int HardSigmoid_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(alpha); + __m128 _beta = (__m128)__lsx_vreplfr2vr_s(beta); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmadd_s(_alpha, _p, _beta); + _p = __lsx_vfmax_s(_p, _zero); + _p = __lsx_vfmin_s(_p, _one); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < lower) + *ptr = 0.f; + else if (*ptr > upper) + *ptr = 1.f; + else + *ptr = *ptr * alpha + beta; + ++ptr; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/hardsigmoid_loongarch.h b/src/layer/loongarch/hardsigmoid_loongarch.h new file mode 100644 index 000000000000..755ae89ff03e --- /dev/null +++ b/src/layer/loongarch/hardsigmoid_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_HARDSIGMOID_LOONGARCH_H +#define LAYER_HARDSIGMOID_LOONGARCH_H + +#include "hardsigmoid.h" + +namespace ncnn { + +class HardSigmoid_loongarch : virtual public HardSigmoid +{ +public: + HardSigmoid_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_HARDSIGMOID_LOONGARCH_H diff --git a/src/layer/loongarch/hardswish_loongarch.cpp b/src/layer/loongarch/hardswish_loongarch.cpp new file mode 100644 index 000000000000..f1417a7986c9 --- /dev/null +++ b/src/layer/loongarch/hardswish_loongarch.cpp @@ -0,0 +1,80 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "hardswish_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +HardSwish_loongarch::HardSwish_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int HardSwish_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(alpha); + __m128 _beta = (__m128)__lsx_vreplfr2vr_s(beta); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _outp = __lsx_vfmadd_s(_alpha, _p, _beta); + _outp = __lsx_vfmax_s(_outp, _zero); + _outp = __lsx_vfmin_s(_outp, _one); + _outp = __lsx_vfmul_s(_outp, _p); + __lsx_vst(_outp, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < lower) + *ptr = 0.f; + else if (*ptr > upper) + ; + else + *ptr = *ptr * (*ptr * alpha + beta); + ++ptr; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/hardswish_loongarch.h b/src/layer/loongarch/hardswish_loongarch.h new file mode 100644 index 000000000000..e9b0821245c3 --- /dev/null +++ b/src/layer/loongarch/hardswish_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_HARDSWISH_LOONGARCH_H +#define LAYER_HARDSWISH_LOONGARCH_H + +#include "hardswish.h" + +namespace ncnn { + +class HardSwish_loongarch : virtual public HardSwish +{ +public: + HardSwish_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_HARDSWISH_LOONGARCH_H diff --git a/src/layer/loongarch/innerproduct_loongarch.cpp b/src/layer/loongarch/innerproduct_loongarch.cpp new file mode 100644 index 000000000000..3dd6ff35e232 --- /dev/null +++ b/src/layer/loongarch/innerproduct_loongarch.cpp @@ -0,0 +1,1637 @@ +// yala is pleased to support the open source community by making ncnn available. 
+// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "innerproduct_loongarch.h" + +#include "layer_type.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +#include "loongarch_activation.h" + +namespace ncnn { + +InnerProduct_loongarch::InnerProduct_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx + + flatten = 0; +} + +int InnerProduct_loongarch::create_pipeline(const Option& opt) +{ + { + flatten = ncnn::create_layer(ncnn::LayerType::Flatten); + + ncnn::ParamDict pd; + + flatten->load_param(pd); + + flatten->create_pipeline(opt); + } + +#if NCNN_INT8 + if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) + { + return create_pipeline_int8_loongarch(opt); + } +#endif + +#if __loongarch_sx + if (opt.use_fp16_storage) + { + return create_pipeline_fp16s(opt); + } +#endif + + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; + +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __loongarch_sx + + if (out_elempack == 4) + { + // src = inch-outch + // dst = 4-inch-outch/4 + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / 4, (size_t)4u * 4, 4); + + for (int q = 0; q + 3 < num_output; q += 4) + { + float* g0 = weight_data_tm.row(q / 4); + + for (int p = 0; p < num_input; p++) + { + for (int j = 0; j < 4; j++) + { + *g0++ = weight_data_r2.row(q + j)[p]; + } + } + } + } + } + else + { + weight_data_tm = weight_data; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int InnerProduct_loongarch::destroy_pipeline(const Option& opt) +{ + if (flatten) + { + flatten->destroy_pipeline(opt); + delete flatten; + flatten = 0; + } + + return 0; +} + +int InnerProduct_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + return forward_int8_loongarch(bottom_blob, top_blob, opt); + } +#endif + +#if __loongarch_sx + if (opt.use_fp16_storage) + { + return forward_fp16s(bottom_blob, top_blob, opt); + } +#endif + + const int num_input = weight_data_size / num_output; + + if (bottom_blob.dims == 2 && bottom_blob.w == num_input && bottom_blob.h * bottom_blob.elempack > 1) + { + // gemm + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + num_output_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { +#if __loongarch_sx + if (elempack == 4 && num_output_elempack == 4) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const float* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 0]); + _sum1 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 1]); + _sum2 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 2]); + _sum3 = __lsx_vreplfr2vr_s(bias_data[p * 4 + 3]); + } + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 16); + __m128 _val = (__m128)__lsx_vld(m, 0); + __m128i _w = __lsx_vld(kptr, 0); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 3), _val, _sum3); + + m += 4; + kptr += 4; + } + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + _sum1 = activation_ps(_sum1, activation_type, activation_params); + _sum2 = activation_ps(_sum2, activation_type, activation_params); + _sum3 = activation_ps(_sum3, activation_type, activation_params); + + __lsx_vst(_sum0, outptr, 0); + __lsx_vst(_sum1, outptr + 4, 0); + __lsx_vst(_sum2, outptr + 8, 0); + __lsx_vst(_sum3, outptr + 12, 0); + outptr += 16; + } + } + + if (elempack == 1 && num_output_elempack == 4) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const float* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + int i = 0; + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(m, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3); + + m += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = __lsx_vreplfr2vr_s(m[0]); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum0 = __lsx_vfmadd_s(_w, _val, _sum0); + + m += 1; + kptr += 4; + } + + _sum0 = __lsx_vfadd_s(_sum0, _sum1); + _sum2 = __lsx_vfadd_s(_sum2, _sum3); + _sum0 = __lsx_vfadd_s(_sum0, _sum2); + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + + __lsx_vst(_sum0, outptr, 0); + outptr += 4; + } + } + + if (elempack == 4 && num_output_elempack == 1) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const float* kptr = (const float*)weight_data_tm + 
num_input * p; + const float* m = bottom_blob.row(j); + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = __lsx_vreplfr2vr_s(bias_data[p]); + } + + for (int i = 0; i < num_input; i++) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 4); + __m128 _val = (__m128)__lsx_vld(m, 0); + __m128 _k = __lsx_vreplfr2vr_s(kptr[0]); + _sum = __lsx_vfmadd_s(_k, _val, _sum); + + m += 4; + kptr += 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr, 0); + outptr += 4; + } + } +#endif // __loongarch_sx + + if (elempack == 1 && num_output_elempack == 1) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const float* kptr = (const float*)weight_data_tm + num_input * p; + const float* m = bottom_blob.row(j); + + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + int i = 0; +#if __loongarch_sx + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum = __lsx_vfmadd_s(_w, _m, _sum); + + m += 4; + kptr += 4; + } + sum += __lsx_reduce_fadd_s(_sum); +#endif // __loongarch_sx + for (; i < num_input; i++) + { + sum += *m * *kptr; + + m += 1; + kptr += 1; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[0] = sum; + outptr += 1; + } + } + } + + return 0; + } + + // flatten + Mat bottom_blob_flattened = bottom_blob; + if (bottom_blob.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + + flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + } + + size_t elemsize = bottom_blob_flattened.elemsize; + int elempack = bottom_blob_flattened.elempack; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } +#endif // __loongarch_sx + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + const float* kptr = weight_data_tm.row(p); + + const float* sptr = bottom_blob_flattened; + + int i = 0; + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(sptr + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(sptr, 0); + __m128 _w0 = (__m128)__lsx_vld(kptr, 0); + __m128 _w1 = (__m128)__lsx_vld(kptr + 4, 0); + __m128 _w2 = (__m128)__lsx_vld(kptr + 8, 0); + __m128 _w3 = (__m128)__lsx_vld(kptr + 12, 0); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3); + + sptr += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = __lsx_vreplfr2vr_s(sptr[0]); + __m128 _w = (__m128)__lsx_vld(kptr, 0); + _sum0 = __lsx_vfmadd_s(_w, _val, _sum0); + + sptr += 1; + kptr += 4; + } + + _sum0 = __lsx_vfadd_s(_sum0, _sum1); + _sum2 = __lsx_vfadd_s(_sum2, _sum3); + _sum0 = __lsx_vfadd_s(_sum0, _sum2); + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + + float* outptr = top_blob; + __lsx_vst(_sum0, outptr + p * 4, 0); + } + } +#endif // __loongarch_sx + + if (out_elempack == 1) + { + int nn_num_output = num_output / 4; + int remain_num_output_start = nn_num_output * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * 4; + + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + + if (bias_term) + { + sum0 = bias_data[p]; + sum1 = bias_data[p + 1]; + sum2 = bias_data[p + 2]; + sum3 = bias_data[p + 3]; + } + + const float* w0 = (const float*)weight_data_tm + num_input * p; + const float* w1 = (const float*)weight_data_tm + num_input * (p + 1); + const float* w2 = (const float*)weight_data_tm + num_input * (p + 2); + const float* w3 = (const float*)weight_data_tm + num_input * (p + 3); + + const float* m = bottom_blob_flattened; + + int i = 0; +#if __loongarch_sx + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(w0 + 16); + __builtin_prefetch(w1 + 16); + __builtin_prefetch(w2 + 16); + __builtin_prefetch(w3 + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w0 = (__m128)__lsx_vld(w0, 0); + __m128 _w1 = (__m128)__lsx_vld(w1, 0); + __m128 _w2 = (__m128)__lsx_vld(w2, 0); + __m128 _w3 = (__m128)__lsx_vld(w3, 0); + _sum0 = __lsx_vfmadd_s(_w0, _m, _sum0); + _sum1 = __lsx_vfmadd_s(_w1, _m, _sum1); + _sum2 = __lsx_vfmadd_s(_w2, _m, _sum2); + _sum3 = __lsx_vfmadd_s(_w3, _m, _sum3); + + m += 4; + w0 += 4; + w1 += 4; + w2 += 4; + 
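+ // w3 advances in lockstep with m, w0, w1 and w2; each of the four output
+ // neurons keeps its own vector accumulator, reduced horizontally with
+ // __lsx_reduce_fadd_s once the vector loop ends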
w3 += 4; + } +#endif // __loongarch_sx + for (; i < num_input; i++) + { + sum0 += *m * *w0; + sum1 += *m * *w1; + sum2 += *m * *w2; + sum3 += *m * *w3; + + m++; + w0++; + w1++; + w2++; + w3++; + } + +#if __loongarch_sx + sum0 += __lsx_reduce_fadd_s(_sum0); + sum1 += __lsx_reduce_fadd_s(_sum1); + sum2 += __lsx_reduce_fadd_s(_sum2); + sum3 += __lsx_reduce_fadd_s(_sum3); +#endif // __loongarch_sx + + sum0 = activation_ss(sum0, activation_type, activation_params); + sum1 = activation_ss(sum1, activation_type, activation_params); + sum2 = activation_ss(sum2, activation_type, activation_params); + sum3 = activation_ss(sum3, activation_type, activation_params); + + top_blob[p] = sum0; + top_blob[p + 1] = sum1; + top_blob[p + 2] = sum2; + top_blob[p + 3] = sum3; + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_num_output_start; p < num_output; p++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[p]; + + const float* w = (const float*)weight_data_tm + num_input * p; + + const float* m = bottom_blob_flattened; + + int i = 0; +#if __loongarch_sx + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(w + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w = (__m128)__lsx_vld(w, 0); + _sum0 = __lsx_vfmadd_s(_w, _m, _sum0); + + m += 4; + w += 4; + } + sum += __lsx_reduce_fadd_s(_sum0); +#endif // __loongarch_sx + for (; i < num_input; i++) + { + sum += *m * *w; + + m++; + w++; + } + + sum = activation_ss(sum, activation_type, activation_params); + + top_blob[p] = sum; + } + } + + return 0; +} + +#if __loongarch_sx +int InnerProduct_loongarch::create_pipeline_fp16s(const Option& opt) +{ + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } + + // src = inch-outch + // dst = pb-inch-outch/pb + if (out_elempack == 4) + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / 4, (size_t)8u, 4); + + for (int q = 0; q + 3 < num_output; q += 4) + { + unsigned short* g0 = weight_data_tm.row(q / 4); + + const float* k0 = weight_data_r2.row(q); + const float* k1 = weight_data_r2.row(q + 1); + const float* k2 = weight_data_r2.row(q + 2); + const float* k3 = weight_data_r2.row(q + 3); + + int p = 0; + for (; p + 3 < num_input; p += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(k0, 0); + __m128i _r1 = __lsx_vld(k1, 0); + __m128i _r2 = __lsx_vld(k2, 0); + __m128i _r3 = __lsx_vld(k3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __m128i _p0 = __lsx_vfcvt_h_s((__m128)_r0123_1, (__m128)_r0123_0); + __m128i _p1 = __lsx_vfcvt_h_s((__m128)_r0123_3, (__m128)_r0123_2); + + __lsx_vst(_p0, g0, 0); + __lsx_vst(_p1, g0 + 8, 0); + + k0 += 4; + k1 += 4; + k2 += 4; + k3 += 4; + g0 += 16; + } + for (; p < num_input; p++) + { + g0[0] = float32_to_float16(*k0++); + g0[1] = float32_to_float16(*k1++); + g0[2] = float32_to_float16(*k2++); + g0[3] = float32_to_float16(*k3++); + g0 += 4; + } + } + } + + if (out_elempack == 1) + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt); + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int InnerProduct_loongarch::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int num_input = weight_data_size / num_output; + + if (bottom_blob.dims == 2 && bottom_blob.w == num_input && bottom_blob.h * bottom_blob.elempack > 1) + { + // gemm + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; + if (opt.use_packing_layout) + { + num_output_elempack = num_output % 4 == 0 ? 
4 : 1; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { + if (elempack == 4 && num_output_elempack == 4) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const unsigned short* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 0]); + _sum1 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 1]); + _sum2 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 2]); + _sum3 = (__m128)__lsx_vreplfr2vr_s(bias_data[p * 4 + 3]); + } + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 16); + __m128 _val = (__m128)__lsx_vld(m, 0); + __m128i _w = (__m128i)__lsx_vfcvtl_s_h(__lsx_vld(kptr, 0)); + _sum0 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 0), _val, _sum0); + _sum1 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 1), _val, _sum1); + _sum2 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 2), _val, _sum2); + _sum3 = __lsx_vfmadd_s((__m128)__lsx_vreplvei_w(_w, 3), _val, _sum3); + + m += 4; + kptr += 4; + } + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + _sum1 = activation_ps(_sum1, activation_type, activation_params); + _sum2 = activation_ps(_sum2, activation_type, activation_params); + _sum3 = activation_ps(_sum3, activation_type, activation_params); + + __lsx_vst(_sum0, outptr, 0); + __lsx_vst(_sum1, outptr + 4, 0); + __lsx_vst(_sum2, outptr + 8, 0); + __lsx_vst(_sum3, outptr + 12, 0); + outptr += 16; + } + } + + if (elempack == 1 && num_output_elempack == 4) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const unsigned short* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + int i = 0; + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(m, 0); + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 8, 0); + __m128 _w0 = __lsx_vfcvtl_s_h(_w01); + __m128 _w1 = __lsx_vfcvth_s_h(_w01); + __m128 _w2 = __lsx_vfcvtl_s_h(_w23); + __m128 _w3 = __lsx_vfcvth_s_h(_w23); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3); + + m += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = __lsx_vreplfr2vr_s(m[0]); + __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(kptr, 0)); + _sum0 = __lsx_vfmadd_s(_w, _val, _sum0); + + m += 1; + kptr += 4; + } + + _sum0 = __lsx_vfadd_s(_sum0, _sum1); + _sum2 = __lsx_vfadd_s(_sum2, _sum3); + _sum0 = __lsx_vfadd_s(_sum0, _sum2); + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + + __lsx_vst(_sum0, outptr, 0); + outptr += 4; + } + } + + if (elempack == 4 && num_output_elempack == 1) + { + float* outptr = 
top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const unsigned short* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum = __lsx_vreplfr2vr_s(bias_data[p]); + } + + for (int i = 0; i < num_input; i++) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 4); + __m128 _val = (__m128)__lsx_vld(m, 0); + __m128 _k = __lsx_vreplfr2vr_s(float16_to_float32(kptr[0])); + _sum = __lsx_vfmadd_s(_k, _val, _sum); + + m += 4; + kptr += 1; + } + + _sum = activation_ps(_sum, activation_type, activation_params); + + __lsx_vst(_sum, outptr, 0); + outptr += 4; + } + } + + if (elempack == 1 && num_output_elempack == 1) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const unsigned short* kptr = weight_data_tm.row(p); + const float* m = bottom_blob.row(j); + + float sum = 0.f; + + if (bias_term) + { + sum = bias_data[p]; + } + + int i = 0; + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(kptr + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(kptr, 0)); + _sum = __lsx_vfmadd_s(_w, _m, _sum); + + m += 4; + kptr += 4; + } + sum += __lsx_reduce_fadd_s(_sum); + for (; i < num_input; i++) + { + sum += *m * float16_to_float32(*kptr); + + m += 1; + kptr += 1; + } + + sum = activation_ss(sum, activation_type, activation_params); + + outptr[0] = sum; + outptr += 1; + } + } + } + + return 0; + } + + // flatten + Mat bottom_blob_flattened = bottom_blob; + if (bottom_blob.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + + flatten->forward(bottom_blob, bottom_blob_flattened, opt_flatten); + } + + size_t elemsize = bottom_blob_flattened.elemsize; + int elempack = bottom_blob_flattened.elempack; + + int out_elempack = 1; + if (opt.use_packing_layout) + { + out_elempack = num_output % 4 == 0 ? 
4 : 1; + } + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + + if (bias_term) + { + _sum0 = (__m128)__lsx_vld((const float*)bias_data + p * 4, 0); + } + + const unsigned short* kptr = weight_data_tm.row(p); + + const float* sptr = bottom_blob_flattened; + + int i = 0; + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(sptr + 16); + __builtin_prefetch(kptr + 64); + __m128i _val = __lsx_vld(sptr, 0); + __m128i _w01 = __lsx_vld(kptr, 0); + __m128i _w23 = __lsx_vld(kptr + 8, 0); + __m128 _w0 = __lsx_vfcvtl_s_h(_w01); + __m128 _w1 = __lsx_vfcvth_s_h(_w01); + __m128 _w2 = __lsx_vfcvtl_s_h(_w23); + __m128 _w3 = __lsx_vfcvth_s_h(_w23); + _sum0 = __lsx_vfmadd_s(_w0, (__m128)__lsx_vreplvei_w(_val, 0), _sum0); + _sum1 = __lsx_vfmadd_s(_w1, (__m128)__lsx_vreplvei_w(_val, 1), _sum1); + _sum2 = __lsx_vfmadd_s(_w2, (__m128)__lsx_vreplvei_w(_val, 2), _sum2); + _sum3 = __lsx_vfmadd_s(_w3, (__m128)__lsx_vreplvei_w(_val, 3), _sum3); + + sptr += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = __lsx_vreplfr2vr_s(sptr[0]); + __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(kptr, 0)); + _sum0 = __lsx_vfmadd_s(_w, _val, _sum0); + + sptr += 1; + kptr += 4; + } + + _sum0 = __lsx_vfadd_s(_sum0, _sum1); + _sum2 = __lsx_vfadd_s(_sum2, _sum3); + _sum0 = __lsx_vfadd_s(_sum0, _sum2); + + _sum0 = activation_ps(_sum0, activation_type, activation_params); + + float* outptr = top_blob; + __lsx_vst(_sum0, outptr + p * 4, 0); + } + } + + if (out_elempack == 1) + { + int nn_num_output = num_output / 4; + int remain_num_output_start = nn_num_output * 4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_num_output; pp++) + { + int p = pp * 4; + + float sum0 = 0.f; + float sum1 = 0.f; + float sum2 = 0.f; + float sum3 = 0.f; + + if (bias_term) + { + sum0 = bias_data[p]; + sum1 = bias_data[p + 1]; + sum2 = bias_data[p + 2]; + sum3 = bias_data[p + 3]; + } + + const unsigned short* w0 = weight_data_tm.row(p); + const unsigned short* w1 = weight_data_tm.row(p + 1); + const unsigned short* w2 = weight_data_tm.row(p + 2); + const unsigned short* w3 = weight_data_tm.row(p + 3); + + const float* m = bottom_blob_flattened; + + int i = 0; + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum1 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum2 = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _sum3 = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(w0 + 16); + __builtin_prefetch(w1 + 16); + __builtin_prefetch(w2 + 16); + __builtin_prefetch(w3 + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w0 = __lsx_vfcvtl_s_h(__lsx_vld(w0, 0)); + __m128 _w1 = __lsx_vfcvtl_s_h(__lsx_vld(w1, 0)); + __m128 _w2 = __lsx_vfcvtl_s_h(__lsx_vld(w2, 0)); + __m128 _w3 = __lsx_vfcvtl_s_h(__lsx_vld(w3, 0)); + _sum0 = __lsx_vfmadd_s(_w0, _m, _sum0); + _sum1 = __lsx_vfmadd_s(_w1, _m, _sum1); + _sum2 = __lsx_vfmadd_s(_w2, _m, _sum2); + _sum3 = __lsx_vfmadd_s(_w3, _m, _sum3); + + m += 4; + w0 += 4; + w1 += 4; + w2 += 4; + w3 += 4; + } + for (; i < num_input; i++) + { 
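+ // scalar tail: widen each fp16 weight to fp32 with float16_to_float32
+ // before the multiply-accumulate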
+ sum0 += *m * float16_to_float32(*w0); + sum1 += *m * float16_to_float32(*w1); + sum2 += *m * float16_to_float32(*w2); + sum3 += *m * float16_to_float32(*w3); + + m++; + w0++; + w1++; + w2++; + w3++; + } + + sum0 += __lsx_reduce_fadd_s(_sum0); + sum1 += __lsx_reduce_fadd_s(_sum1); + sum2 += __lsx_reduce_fadd_s(_sum2); + sum3 += __lsx_reduce_fadd_s(_sum3); + + sum0 = activation_ss(sum0, activation_type, activation_params); + sum1 = activation_ss(sum1, activation_type, activation_params); + sum2 = activation_ss(sum2, activation_type, activation_params); + sum3 = activation_ss(sum3, activation_type, activation_params); + + top_blob[p] = sum0; + top_blob[p + 1] = sum1; + top_blob[p + 2] = sum2; + top_blob[p + 3] = sum3; + } + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_num_output_start; p < num_output; p++) + { + float sum = 0.f; + + if (bias_term) + sum = bias_data[p]; + + const unsigned short* w = weight_data_tm.row(p); + + const float* m = bottom_blob_flattened; + + int i = 0; + __m128 _sum0 = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < num_input; i += 4) + { + __builtin_prefetch(m + 16); + __builtin_prefetch(w + 16); + __m128 _m = (__m128)__lsx_vld(m, 0); + __m128 _w = __lsx_vfcvtl_s_h(__lsx_vld(w, 0)); + _sum0 = __lsx_vfmadd_s(_w, _m, _sum0); + + m += 4; + w += 4; + } + sum += __lsx_reduce_fadd_s(_sum0); + for (; i < num_input; i++) + { + sum += *m * float16_to_float32(*w); + + m++; + w++; + } + + sum = activation_ss(sum, activation_type, activation_params); + + top_blob[p] = sum; + } + } + + return 0; +} +#endif // __loongarch_sx + +#if NCNN_INT8 +int InnerProduct_loongarch::create_pipeline_int8_loongarch(const Option& opt) +{ + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 
8 : 1; + } +#endif // __loongarch_sx + + // src = inch-outch + // dst = pb-inch-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / out_elempack, (size_t)out_elempack, out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + signed char* g0 = weight_data_tm.row(q / out_elempack); + + for (int p = 0; p < num_input; p++) + { + for (int j = 0; j < out_elempack; j++) + { + *g0++ = weight_data_r2.row(q + j)[p]; + } + } + } + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (opt.lightmode) + { + weight_data.release(); + } + + return 0; +} + +int InnerProduct_loongarch::forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + const int num_input = weight_data_size / num_output; + + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + if (bottom_blob_int8.dims == 2 && bottom_blob_int8.w == num_input && bottom_blob_int8.h * bottom_blob_int8.elempack > 1) + { + // gemm + Mat bottom_blob_int8_unpacked; + Option opt_unpack = opt; + opt_unpack.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_int8, bottom_blob_int8_unpacked, 1, opt_unpack); + + int h = bottom_blob_int8_unpacked.h; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = h % 4 == 0 ? 4 : 1; + } +#endif + + int outh = h / out_elempack; + + top_blob.create(num_output, outh, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + num_output_elempack = num_output % 8 == 0 ? 
8 : 1; + } +#endif + +#if __loongarch_sx + if (num_output_elempack == 8 && out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m0 = bottom_blob_int8_unpacked.row(j * 4); + const signed char* m1 = bottom_blob_int8_unpacked.row(j * 4 + 1); + const signed char* m2 = bottom_blob_int8_unpacked.row(j * 4 + 2); + const signed char* m3 = bottom_blob_int8_unpacked.row(j * 4 + 3); + + __m128i _sum00 = __lsx_vreplgr2vr_w(0); + __m128i _sum01 = __lsx_vreplgr2vr_w(0); + __m128i _sum10 = __lsx_vreplgr2vr_w(0); + __m128i _sum11 = __lsx_vreplgr2vr_w(0); + __m128i _sum20 = __lsx_vreplgr2vr_w(0); + __m128i _sum21 = __lsx_vreplgr2vr_w(0); + __m128i _sum30 = __lsx_vreplgr2vr_w(0); + __m128i _sum31 = __lsx_vreplgr2vr_w(0); + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(m0 + 4); + __builtin_prefetch(m1 + 4); + __builtin_prefetch(m2 + 4); + __builtin_prefetch(m3 + 4); + __builtin_prefetch(kptr + 32); + __m128i _val0 = __lsx_vreplgr2vr_h((short)m0[0]); + __m128i _val1 = __lsx_vreplgr2vr_h((short)m1[0]); + __m128i _val2 = __lsx_vreplgr2vr_h((short)m2[0]); + __m128i _val3 = __lsx_vreplgr2vr_h((short)m3[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val0, _w16); + __m128i _s1 = __lsx_vmul_h(_val1, _w16); + __m128i _s2 = __lsx_vmul_h(_val2, _w16); + __m128i _s3 = __lsx_vmul_h(_val3, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _exts1 = __lsx_vslti_h(_s1, 0); + __m128i _exts2 = __lsx_vslti_h(_s2, 0); + __m128i _exts3 = __lsx_vslti_h(_s3, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + __m128i _s1l = __lsx_vilvl_h(_exts1, _s1); + __m128i _s1h = __lsx_vilvh_h(_exts1, _s1); + __m128i _s2l = __lsx_vilvl_h(_exts2, _s2); + __m128i _s2h = __lsx_vilvh_h(_exts2, _s2); + __m128i _s3l = __lsx_vilvl_h(_exts3, _s3); + __m128i _s3h = __lsx_vilvh_h(_exts3, _s3); + + _sum00 = __lsx_vadd_w(_sum00, _s0l); + _sum01 = __lsx_vadd_w(_sum01, _s0h); + _sum10 = __lsx_vadd_w(_sum10, _s1l); + _sum11 = __lsx_vadd_w(_sum11, _s1h); + _sum20 = __lsx_vadd_w(_sum20, _s2l); + _sum21 = __lsx_vadd_w(_sum21, _s2h); + _sum30 = __lsx_vadd_w(_sum30, _s3l); + _sum31 = __lsx_vadd_w(_sum31, _s3h); + + m0++; + m1++; + m2++; + m3++; + kptr += 8; + } + + // dequantize and relu + __m128 _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8, 0); + __m128 _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8 + 4, 0); + + __m128 _sumfp32_00 = __lsx_vffint_s_w(_sum00); + __m128 _sumfp32_01 = __lsx_vffint_s_w(_sum01); + __m128 _sumfp32_10 = __lsx_vffint_s_w(_sum10); + __m128 _sumfp32_11 = __lsx_vffint_s_w(_sum11); + __m128 _sumfp32_20 = __lsx_vffint_s_w(_sum20); + __m128 _sumfp32_21 = __lsx_vffint_s_w(_sum21); + __m128 _sumfp32_30 = __lsx_vffint_s_w(_sum30); + __m128 _sumfp32_31 = __lsx_vffint_s_w(_sum31); + if (bias_term) + { + __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + p * 8, 0); + __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + p * 8 + 4, 0); + _sumfp32_00 = __lsx_vfmadd_s(_scale_in0, _sumfp32_00, _bias0); + _sumfp32_01 = __lsx_vfmadd_s(_scale_in1, _sumfp32_01, _bias1); + _sumfp32_10 = __lsx_vfmadd_s(_scale_in0, _sumfp32_10, _bias0); + _sumfp32_11 = __lsx_vfmadd_s(_scale_in1, _sumfp32_11, _bias1); + _sumfp32_20 = 
__lsx_vfmadd_s(_scale_in0, _sumfp32_20, _bias0); + _sumfp32_21 = __lsx_vfmadd_s(_scale_in1, _sumfp32_21, _bias1); + _sumfp32_30 = __lsx_vfmadd_s(_scale_in0, _sumfp32_30, _bias0); + _sumfp32_31 = __lsx_vfmadd_s(_scale_in1, _sumfp32_31, _bias1); + } + else + { + _sumfp32_00 = __lsx_vfmul_s(_sumfp32_00, _scale_in0); + _sumfp32_01 = __lsx_vfmul_s(_sumfp32_01, _scale_in1); + _sumfp32_10 = __lsx_vfmul_s(_sumfp32_10, _scale_in0); + _sumfp32_11 = __lsx_vfmul_s(_sumfp32_11, _scale_in1); + _sumfp32_20 = __lsx_vfmul_s(_sumfp32_20, _scale_in0); + _sumfp32_21 = __lsx_vfmul_s(_sumfp32_21, _scale_in1); + _sumfp32_30 = __lsx_vfmul_s(_sumfp32_30, _scale_in0); + _sumfp32_31 = __lsx_vfmul_s(_sumfp32_31, _scale_in1); + } + + _sumfp32_00 = activation_ps(_sumfp32_00, activation_type, activation_params); + _sumfp32_01 = activation_ps(_sumfp32_01, activation_type, activation_params); + _sumfp32_10 = activation_ps(_sumfp32_10, activation_type, activation_params); + _sumfp32_11 = activation_ps(_sumfp32_11, activation_type, activation_params); + _sumfp32_20 = activation_ps(_sumfp32_20, activation_type, activation_params); + _sumfp32_21 = activation_ps(_sumfp32_21, activation_type, activation_params); + _sumfp32_30 = activation_ps(_sumfp32_30, activation_type, activation_params); + _sumfp32_31 = activation_ps(_sumfp32_31, activation_type, activation_params); + + // transpose 4x8 + __m128i _r01r = __lsx_vilvl_w((__m128i)_sumfp32_10, (__m128i)_sumfp32_00); + __m128i _r01l = __lsx_vilvh_w((__m128i)_sumfp32_10, (__m128i)_sumfp32_00); + __m128i _r23r = __lsx_vilvl_w((__m128i)_sumfp32_30, (__m128i)_sumfp32_20); + __m128i _r23l = __lsx_vilvh_w((__m128i)_sumfp32_30, (__m128i)_sumfp32_20); + __m128i _r45r = __lsx_vilvl_w((__m128i)_sumfp32_11, (__m128i)_sumfp32_01); + __m128i _r45l = __lsx_vilvh_w((__m128i)_sumfp32_11, (__m128i)_sumfp32_01); + __m128i _r67r = __lsx_vilvl_w((__m128i)_sumfp32_31, (__m128i)_sumfp32_21); + __m128i _r67l = __lsx_vilvh_w((__m128i)_sumfp32_31, (__m128i)_sumfp32_21); + _sumfp32_00 = (__m128)__lsx_vilvl_d(_r23r, _r01r); + _sumfp32_10 = (__m128)__lsx_vilvh_d(_r23r, _r01r); + _sumfp32_20 = (__m128)__lsx_vilvl_d(_r23l, _r01l); + _sumfp32_30 = (__m128)__lsx_vilvh_d(_r23l, _r01l); + _sumfp32_01 = (__m128)__lsx_vilvl_d(_r67r, _r45r); + _sumfp32_11 = (__m128)__lsx_vilvh_d(_r67r, _r45r); + _sumfp32_21 = (__m128)__lsx_vilvl_d(_r67l, _r45l); + _sumfp32_31 = (__m128)__lsx_vilvh_d(_r67l, _r45l); + + __lsx_vst(_sumfp32_00, outptr, 0); + __lsx_vst(_sumfp32_10, outptr + 4, 0); + __lsx_vst(_sumfp32_20, outptr + 8, 0); + __lsx_vst(_sumfp32_30, outptr + 12, 0); + __lsx_vst(_sumfp32_01, outptr + 16, 0); + __lsx_vst(_sumfp32_11, outptr + 20, 0); + __lsx_vst(_sumfp32_21, outptr + 24, 0); + __lsx_vst(_sumfp32_31, outptr + 28, 0); + + outptr += 32; + } + } + } + + if (num_output_elempack == 1 && out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m0 = bottom_blob_int8_unpacked.row(j * 4); + const signed char* m1 = bottom_blob_int8_unpacked.row(j * 4 + 1); + const signed char* m2 = bottom_blob_int8_unpacked.row(j * 4 + 2); + const signed char* m3 = bottom_blob_int8_unpacked.row(j * 4 + 3); + + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + + int i = 0; + for (; i < num_input; i++) + { + sum0 += *m0++ * kptr[0]; + sum1 += *m1++ * kptr[0]; + sum2 += *m2++ * kptr[0]; + sum3 += *m3++ * kptr[0]; + kptr 
+= 1; + } + + // dequantize and relu + float sumfp32_0 = sum0 * scale_in_data[p]; + float sumfp32_1 = sum1 * scale_in_data[p]; + float sumfp32_2 = sum2 * scale_in_data[p]; + float sumfp32_3 = sum3 * scale_in_data[p]; + + if (bias_term) + { + sumfp32_0 += bias_data[p]; + sumfp32_1 += bias_data[p]; + sumfp32_2 += bias_data[p]; + sumfp32_3 += bias_data[p]; + } + + outptr[0] = activation_ss(sumfp32_0, activation_type, activation_params); + outptr[1] = activation_ss(sumfp32_1, activation_type, activation_params); + outptr[2] = activation_ss(sumfp32_2, activation_type, activation_params); + outptr[3] = activation_ss(sumfp32_3, activation_type, activation_params); + outptr += 4; + } + } + } + + if (num_output_elempack == 8 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(m + 4); + __builtin_prefetch(kptr + 32); + __m128i _val = __lsx_vreplgr2vr_h((short)m[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + + m++; + kptr += 8; + } + + // dequantize and relu + __m128 _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8, 0); + __m128 _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8 + 4, 0); + + __m128 _sumfp32_0 = __lsx_vffint_s_w(_sum0); + __m128 _sumfp32_1 = __lsx_vffint_s_w(_sum1); + + if (bias_term) + { + __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + p * 8, 0); + __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + p * 8 + 4, 0); + _sumfp32_0 = __lsx_vfmadd_s(_scale_in0, _sumfp32_0, _bias0); + _sumfp32_1 = __lsx_vfmadd_s(_scale_in1, _sumfp32_1, _bias1); + } + else + { + _sumfp32_0 = __lsx_vfmul_s(_sumfp32_0, _scale_in0); + _sumfp32_1 = __lsx_vfmul_s(_sumfp32_1, _scale_in1); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params); + + __lsx_vst(_sumfp32_0, outptr, 0); + __lsx_vst(_sumfp32_1, outptr + 4, 0); + outptr += 8; + } + } + } +#endif // __loongarch_sx + + if (num_output_elempack == 1 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + int sum = 0; + + int i = 0; + for (; i < num_input; i++) + { + sum += *m++ * *kptr++; + } + + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + + outptr[0] = activation_ss(sumfp32, activation_type, activation_params); + outptr += 1; + } + } + } + + return 0; + } + + Mat bottom_blob_int8_flattened = bottom_blob_int8; + if (bottom_blob_int8.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + 
flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten); + } + + // int elempack = bottom_blob_int8_flattened.elempack; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif // __loongarch_sx + // size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + __m128i _sum0 = __lsx_vreplgr2vr_w(0); + __m128i _sum1 = __lsx_vreplgr2vr_w(0); + + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + int i = 0; + for (; i < num_input; i++) + { + __builtin_prefetch(sptr + 4); + __builtin_prefetch(kptr + 32); + __m128i _val = __lsx_vreplgr2vr_h((short)sptr[0]); + + __m128i _w = __lsx_vld(kptr, 0); + __m128i _w16 = __lsx_vilvl_b(__lsx_vslti_b(_w, 0), _w); + + __m128i _s0 = __lsx_vmul_h(_val, _w16); + __m128i _exts0 = __lsx_vslti_h(_s0, 0); + __m128i _s0l = __lsx_vilvl_h(_exts0, _s0); + __m128i _s0h = __lsx_vilvh_h(_exts0, _s0); + + _sum0 = __lsx_vadd_w(_sum0, _s0l); + _sum1 = __lsx_vadd_w(_sum1, _s0h); + + sptr += 1; + kptr += 8; + } + + // dequantize and relu + __m128 _scale_in0 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8, 0); + __m128 _scale_in1 = (__m128)__lsx_vld((const float*)scale_in_data + p * 8 + 4, 0); + + __m128 _sumfp32_0 = __lsx_vffint_s_w(_sum0); + __m128 _sumfp32_1 = __lsx_vffint_s_w(_sum1); + + if (bias_term) + { + __m128 _bias0 = (__m128)__lsx_vld((const float*)bias_data + p * 8, 0); + __m128 _bias1 = (__m128)__lsx_vld((const float*)bias_data + p * 8 + 4, 0); + _sumfp32_0 = __lsx_vfmadd_s(_scale_in0, _sumfp32_0, _bias0); + _sumfp32_1 = __lsx_vfmadd_s(_scale_in1, _sumfp32_1, _bias1); + } + else + { + _sumfp32_0 = __lsx_vfmul_s(_sumfp32_0, _scale_in0); + _sumfp32_1 = __lsx_vfmul_s(_sumfp32_1, _scale_in1); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params); + + float* outptr = (float*)top_blob + p * 8; + __lsx_vst(_sumfp32_0, outptr, 0); + __lsx_vst(_sumfp32_1, outptr + 4, 0); + } + } +#endif // __loongarch_sx + + if (out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + int sum = 0; + + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + int i = 0; + for (; i < num_input; i++) + { + signed char val = sptr[0]; + + signed char w = kptr[0]; + + sum += val * w; + + sptr += 1; + kptr += 1; + } + + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + top_blob[p] = sumfp32; + } + } + + return 0; +} +#endif // NCNN_INT8 + +} // namespace ncnn diff --git a/src/layer/loongarch/innerproduct_loongarch.h b/src/layer/loongarch/innerproduct_loongarch.h new file mode 100644 index 000000000000..4d9574ce9192 --- /dev/null +++ b/src/layer/loongarch/innerproduct_loongarch.h @@ -0,0 +1,54 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. 
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_INNERPRODUCT_LOONGARCH_H +#define LAYER_INNERPRODUCT_LOONGARCH_H + +#include "innerproduct.h" + +namespace ncnn { + +class InnerProduct_loongarch : virtual public InnerProduct +{ +public: + InnerProduct_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: +#if __loongarch_sx + int create_pipeline_fp16s(const Option& opt); + int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif +#if NCNN_INT8 + int create_pipeline_int8_loongarch(const Option& opt); + int forward_int8_loongarch(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + +public: + Layer* flatten; + + Mat weight_data_tm; + +#if NCNN_INT8 + Mat scale_in_data; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_INNERPRODUCT_LOONGARCH_H diff --git a/src/layer/loongarch/interp_bicubic.h b/src/layer/loongarch/interp_bicubic.h new file mode 100644 index 000000000000..e52ba81de4f0 --- /dev/null +++ b/src/layer/loongarch/interp_bicubic.h @@ -0,0 +1,261 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
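+
+// Bicubic interpolation below uses the cubic convolution kernel with A = -0.75
+// (the coefficient commonly used for bicubic resampling, e.g. OpenCV's INTER_CUBIC).
+// interpolate_cubic() evaluates that kernel at the four source taps around the
+// sample point, whose distances to the taps are fx+1, fx, 1-fx and 2-fx; the last
+// weight is taken as 1 minus the other three so the four weights always sum to 1.
+// cubic_coeffs() additionally folds taps that fall outside the image back into
+// the valid weights near the borders.
+//
+// Scalar sketch of how the weights are applied (illustrative only):
+//   dst[dx] = w0*src[sx-1] + w1*src[sx] + w2*src[sx+1] + w3*src[sx+2]
+// e.g. fx = 0 gives {w0,w1,w2,w3} = {0, 1, 0, 0} (dst[dx] = src[sx]), while
+// fx = 0.5 gives {-0.09375, 0.59375, 0.59375, -0.09375}.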
+ +static inline void interpolate_cubic(float fx, float* coeffs) +{ + const float A = -0.75f; + + float fx0 = fx + 1; + float fx1 = fx; + float fx2 = 1 - fx; + // float fx3 = 2 - fx; + + coeffs[0] = A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A; + coeffs[1] = (A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1; + coeffs[2] = (A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1; + coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2]; +} + +static void cubic_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner) +{ + double scale = (double)w / outw; + if (align_corner) + { + scale = (double)(w - 1) / (outw - 1); + } + + for (int dx = 0; dx < outw; dx++) + { + float fx = (float)((dx + 0.5) * scale - 0.5); + if (align_corner) + { + fx = (float)(dx * scale); + } + + int sx = static_cast(floor(fx)); + fx -= sx; + + interpolate_cubic(fx, alpha + dx * 4); + + if (sx <= -1) + { + sx = 1; + alpha[dx * 4 + 0] = 1.f - alpha[dx * 4 + 3]; + alpha[dx * 4 + 1] = alpha[dx * 4 + 3]; + alpha[dx * 4 + 2] = 0.f; + alpha[dx * 4 + 3] = 0.f; + } + if (sx == 0) + { + sx = 1; + alpha[dx * 4 + 0] = alpha[dx * 4 + 0] + alpha[dx * 4 + 1]; + alpha[dx * 4 + 1] = alpha[dx * 4 + 2]; + alpha[dx * 4 + 2] = alpha[dx * 4 + 3]; + alpha[dx * 4 + 3] = 0.f; + } + if (sx == w - 2) + { + sx = w - 3; + alpha[dx * 4 + 3] = alpha[dx * 4 + 2] + alpha[dx * 4 + 3]; + alpha[dx * 4 + 2] = alpha[dx * 4 + 1]; + alpha[dx * 4 + 1] = alpha[dx * 4 + 0]; + alpha[dx * 4 + 0] = 0.f; + } + if (sx >= w - 1) + { + sx = w - 3; + alpha[dx * 4 + 3] = 1.f - alpha[dx * 4 + 0]; + alpha[dx * 4 + 2] = alpha[dx * 4 + 0]; + alpha[dx * 4 + 1] = 0.f; + alpha[dx * 4 + 0] = 0.f; + } + + xofs[dx] = sx; + } +} + +static void resize_bicubic_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) +{ + int w = dst.w; + int h = dst.h; + + // loop body + Mat rowsbuf0(w); + Mat rowsbuf1(w); + Mat rowsbuf2(w); + Mat rowsbuf3(w); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + float* rows2 = rowsbuf2; + float* rows3 = rowsbuf3; + + int prev_sy1 = -3; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // reuse all rows + } + else if (sy == prev_sy1 + 1) + { + // hresize one row + float* rows0_old = rows0; + rows0 = rows1; + rows1 = rows2; + rows2 = rows3; + rows3 = rows0_old; + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S3p = S3 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; + + alphap += 4; + } + } + else if (sy == prev_sy1 + 2) + { + // hresize two rows + float* rows0_old = rows0; + float* rows1_old = rows1; + rows0 = rows2; + rows1 = rows3; + rows2 = rows0_old; + rows3 = rows1_old; + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3; + rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; + + alphap += 4; + } + } + else if (sy == prev_sy1 + 3) + { + // hresize three rows + float* rows0_old = rows0; + float* rows1_old = rows1; + 
float* rows2_old = rows2; + rows0 = rows3; + rows1 = rows0_old; + rows2 = rows1_old; + rows3 = rows2_old; + const float* S1 = src.row(sy); + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S1p = S1 + sx; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3; + rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3; + rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; + + alphap += 4; + } + } + else + { + // hresize four rows + const float* S0 = src.row(sy - 1); + const float* S1 = src.row(sy); + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S0p = S0 + sx; + const float* S1p = S1 + sx; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + rows0p[dx] = S0p[-1] * a0 + S0p[0] * a1 + S0p[1] * a2 + S0p[2] * a3; + rows1p[dx] = S1p[-1] * a0 + S1p[0] * a1 + S1p[1] * a2 + S1p[2] * a3; + rows2p[dx] = S2p[-1] * a0 + S2p[0] * a1 + S2p[1] * a2 + S2p[2] * a3; + rows3p[dx] = S3p[-1] * a0 + S3p[0] * a1 + S3p[1] * a2 + S3p[2] * a3; + + alphap += 4; + } + } + + prev_sy1 = sy; + + // vresize + float b0 = beta[0]; + float b1 = beta[1]; + float b2 = beta[2]; + float b3 = beta[3]; + + float* rows0p = rows0; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + float* Dp = dst.row(dy); + for (int dx = 0; dx < w; dx++) + { + // D[x] = rows0[x]*b0 + rows1[x]*b1 + rows2[x]*b2 + rows3[x]*b3; + *Dp++ = *rows0p++ * b0 + *rows1p++ * b1 + *rows2p++ * b2 + *rows3p++ * b3; + } + + beta += 4; + } +} diff --git a/src/layer/loongarch/interp_bicubic_pack4.h b/src/layer/loongarch/interp_bicubic_pack4.h new file mode 100644 index 000000000000..54281691ad79 --- /dev/null +++ b/src/layer/loongarch/interp_bicubic_pack4.h @@ -0,0 +1,286 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
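+
+// Pack4 variant of the bicubic resize: each logical pixel is a packed group of
+// 4 channels, so a single __m128 load/FMA handles all 4 channels of one tap.
+// xofs[] offsets are therefore multiplied by 4 when indexing a source row, and
+// the rows0..rows3 buffers hold w * 4 floats each. The rolling reuse of already
+// horizontally-filtered rows between consecutive output rows (tracked via
+// prev_sy1) mirrors the scalar resize_bicubic_image().
+//
+// Vertical pass per output pixel, one 4-channel lane at a time (sketch):
+//   D = rows0*b0 + rows1*b1 + rows2*b2 + rows3*b3
+// implemented below as one __lsx_vfmul_s followed by three __lsx_vfmadd_s.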
+ +static void resize_bicubic_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) +{ + int w = dst.w; + int h = dst.h; + + // loop body + Mat rowsbuf0(w, (size_t)4 * 4u, 4); + Mat rowsbuf1(w, (size_t)4 * 4u, 4); + Mat rowsbuf2(w, (size_t)4 * 4u, 4); + Mat rowsbuf3(w, (size_t)4 * 4u, 4); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + float* rows2 = rowsbuf2; + float* rows3 = rowsbuf3; + + int prev_sy1 = -3; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // reuse all rows + } + else if (sy == prev_sy1 + 1) + { + // hresize one row + float* rows0_old = rows0; + rows0 = rows1; + rows1 = rows2; + rows2 = rows3; + rows3 = rows0_old; + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S3p = S3 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0); + __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0); + __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0); + __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0); + __m128 _rows3 = __lsx_vfmul_s(_S30, _a0); + _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3); + _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3); + _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3); + __lsx_vst(_rows3, rows3p + dx * 4, 0); + + alphap += 4; + } + } + else if (sy == prev_sy1 + 2) + { + // hresize two rows + float* rows0_old = rows0; + float* rows1_old = rows1; + rows0 = rows2; + rows1 = rows3; + rows2 = rows0_old; + rows3 = rows1_old; + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S20 = (__m128)__lsx_vld(S2p - 4, 0); + __m128 _S21 = (__m128)__lsx_vld(S2p + 0, 0); + __m128 _S22 = (__m128)__lsx_vld(S2p + 4, 0); + __m128 _S23 = (__m128)__lsx_vld(S2p + 8, 0); + __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0); + __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0); + __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0); + __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0); + __m128 _rows2 = __lsx_vfmul_s(_S20, _a0); + __m128 _rows3 = __lsx_vfmul_s(_S30, _a0); + _rows2 = __lsx_vfmadd_s(_a1, _S21, _rows2); + _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3); + _rows2 = __lsx_vfmadd_s(_a2, _S22, _rows2); + _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3); + _rows2 = __lsx_vfmadd_s(_a3, _S23, _rows2); + _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3); + __lsx_vst(_rows2, rows2p + dx * 4, 0); + __lsx_vst(_rows3, rows3p + dx * 4, 0); + + alphap += 4; + } + } + else if (sy == prev_sy1 + 3) + { + // hresize three rows + float* rows0_old = rows0; + float* rows1_old = rows1; + float* rows2_old = rows2; + rows0 = rows3; + rows1 = rows0_old; + rows2 = rows1_old; + rows3 = rows2_old; + const float* S1 = src.row(sy); + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S1p 
= S1 + sx; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S10 = (__m128)__lsx_vld(S1p - 4, 0); + __m128 _S11 = (__m128)__lsx_vld(S1p + 0, 0); + __m128 _S12 = (__m128)__lsx_vld(S1p + 4, 0); + __m128 _S13 = (__m128)__lsx_vld(S1p + 8, 0); + __m128 _S20 = (__m128)__lsx_vld(S2p - 4, 0); + __m128 _S21 = (__m128)__lsx_vld(S2p + 0, 0); + __m128 _S22 = (__m128)__lsx_vld(S2p + 4, 0); + __m128 _S23 = (__m128)__lsx_vld(S2p + 8, 0); + __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0); + __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0); + __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0); + __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0); + __m128 _rows1 = __lsx_vfmul_s(_S10, _a0); + __m128 _rows2 = __lsx_vfmul_s(_S20, _a0); + __m128 _rows3 = __lsx_vfmul_s(_S30, _a0); + _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1); + _rows2 = __lsx_vfmadd_s(_a1, _S21, _rows2); + _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3); + _rows1 = __lsx_vfmadd_s(_a2, _S12, _rows1); + _rows2 = __lsx_vfmadd_s(_a2, _S22, _rows2); + _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3); + _rows1 = __lsx_vfmadd_s(_a3, _S13, _rows1); + _rows2 = __lsx_vfmadd_s(_a3, _S23, _rows2); + _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3); + __lsx_vst(_rows1, rows1p + dx * 4, 0); + __lsx_vst(_rows2, rows2p + dx * 4, 0); + __lsx_vst(_rows3, rows3p + dx * 4, 0); + + alphap += 4; + } + } + else + { + // hresize four rows + const float* S0 = src.row(sy - 1); + const float* S1 = src.row(sy); + const float* S2 = src.row(sy + 1); + const float* S3 = src.row(sy + 2); + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + for (int dx = 0; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S0p = S0 + sx; + const float* S1p = S1 + sx; + const float* S2p = S2 + sx; + const float* S3p = S3 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S00 = (__m128)__lsx_vld(S0p - 4, 0); + __m128 _S01 = (__m128)__lsx_vld(S0p + 0, 0); + __m128 _S02 = (__m128)__lsx_vld(S0p + 4, 0); + __m128 _S03 = (__m128)__lsx_vld(S0p + 8, 0); + __m128 _S10 = (__m128)__lsx_vld(S1p - 4, 0); + __m128 _S11 = (__m128)__lsx_vld(S1p + 0, 0); + __m128 _S12 = (__m128)__lsx_vld(S1p + 4, 0); + __m128 _S13 = (__m128)__lsx_vld(S1p + 8, 0); + __m128 _S20 = (__m128)__lsx_vld(S2p - 4, 0); + __m128 _S21 = (__m128)__lsx_vld(S2p + 0, 0); + __m128 _S22 = (__m128)__lsx_vld(S2p + 4, 0); + __m128 _S23 = (__m128)__lsx_vld(S2p + 8, 0); + __m128 _S30 = (__m128)__lsx_vld(S3p - 4, 0); + __m128 _S31 = (__m128)__lsx_vld(S3p + 0, 0); + __m128 _S32 = (__m128)__lsx_vld(S3p + 4, 0); + __m128 _S33 = (__m128)__lsx_vld(S3p + 8, 0); + __m128 _rows0 = __lsx_vfmul_s(_S00, _a0); + __m128 _rows1 = __lsx_vfmul_s(_S10, _a0); + __m128 _rows2 = __lsx_vfmul_s(_S20, _a0); + __m128 _rows3 = __lsx_vfmul_s(_S30, _a0); + _rows0 = __lsx_vfmadd_s(_a1, _S01, _rows0); + _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1); + _rows2 = __lsx_vfmadd_s(_a1, _S21, _rows2); + _rows3 = __lsx_vfmadd_s(_a1, _S31, _rows3); + _rows0 = __lsx_vfmadd_s(_a2, _S02, _rows0); + _rows1 = __lsx_vfmadd_s(_a2, _S12, _rows1); + _rows2 = __lsx_vfmadd_s(_a2, _S22, _rows2); + _rows3 = __lsx_vfmadd_s(_a2, _S32, _rows3); + _rows0 = __lsx_vfmadd_s(_a3, _S03, _rows0); + _rows1 = 
__lsx_vfmadd_s(_a3, _S13, _rows1); + _rows2 = __lsx_vfmadd_s(_a3, _S23, _rows2); + _rows3 = __lsx_vfmadd_s(_a3, _S33, _rows3); + __lsx_vst(_rows0, rows0p + dx * 4, 0); + __lsx_vst(_rows1, rows1p + dx * 4, 0); + __lsx_vst(_rows2, rows2p + dx * 4, 0); + __lsx_vst(_rows3, rows3p + dx * 4, 0); + + alphap += 4; + } + } + + prev_sy1 = sy; + + // vresize + __m128 _b0 = __lsx_vreplfr2vr_s(beta[0]); + __m128 _b1 = __lsx_vreplfr2vr_s(beta[1]); + __m128 _b2 = __lsx_vreplfr2vr_s(beta[2]); + __m128 _b3 = __lsx_vreplfr2vr_s(beta[3]); + + float* rows0p = rows0; + float* rows1p = rows1; + float* rows2p = rows2; + float* rows3p = rows3; + float* Dp = dst.row(dy); + + for (int dx = 0; dx < w; dx++) + { + __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0); + __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0); + __m128 _rows2 = (__m128)__lsx_vld(rows2p, 0); + __m128 _rows3 = (__m128)__lsx_vld(rows3p, 0); + __m128 _D = __lsx_vfmul_s(_rows0, _b0); + _D = __lsx_vfmadd_s(_b1, _rows1, _D); + _D = __lsx_vfmadd_s(_b2, _rows2, _D); + _D = __lsx_vfmadd_s(_b3, _rows3, _D); + __lsx_vst(_D, Dp, 0); + + Dp += 4; + rows0p += 4; + rows1p += 4; + rows2p += 4; + rows3p += 4; + } + + beta += 4; + } +} diff --git a/src/layer/loongarch/interp_bilinear.h b/src/layer/loongarch/interp_bilinear.h new file mode 100644 index 000000000000..ad5a28672bef --- /dev/null +++ b/src/layer/loongarch/interp_bilinear.h @@ -0,0 +1,172 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
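+
+// linear_coeffs() maps every destination column dx back to a source coordinate.
+// Without align_corner the mapping uses pixel centers, fx = (dx + 0.5) * scale - 0.5,
+// otherwise fx = dx * scale; sx is the left source tap, fx its fractional part,
+// and the two weights are alpha = {1 - fx, fx}. Samples falling outside the image
+// are clamped to the first or last valid pair of columns.
+//
+// Worked example (illustrative): upscaling w = 4 to outw = 8 gives scale = 0.5;
+// dx = 1 -> fx = 1.5 * 0.5 - 0.5 = 0.25, so sx = 0 and alpha = {0.75, 0.25},
+// i.e. dst[1] = 0.75 * src[0] + 0.25 * src[1].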
+ +static void linear_coeffs(int w, int outw, int* xofs, float* alpha, int align_corner) +{ + double scale = (double)w / outw; + if (align_corner) + { + scale = (double)(w - 1) / (outw - 1); + } + + for (int dx = 0; dx < outw; dx++) + { + float fx = (float)((dx + 0.5) * scale - 0.5); + if (align_corner) + { + fx = (float)(dx * scale); + } + + int sx = floor(fx); + fx -= sx; + + if (sx < 0) + { + sx = 0; + fx = 0.f; + } + if (sx >= w - 1) + { + sx = w - 2; + fx = 1.f; + } + + xofs[dx] = sx; + + alpha[dx * 2] = 1.f - fx; + alpha[dx * 2 + 1] = fx; + } +} + +static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) +{ + int w = dst.w; + int h = dst.h; + + // loop body + Mat rowsbuf0(w); + Mat rowsbuf1(w); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + + int prev_sy1 = -2; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // reuse all rows + } + else if (sy == prev_sy1 + 1) + { + // hresize one row + float* rows0_old = rows0; + rows0 = rows1; + rows1 = rows0_old; + const float* S1 = src.row(sy + 1); + + const float* alphap = alpha; + float* rows1p = rows1; + int dx = 0; + for (; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S1p = S1 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; + + alphap += 2; + } + } + else + { + // hresize two rows + const float* S0 = src.row(sy); + const float* S1 = src.row(sy + 1); + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + int dx = 0; + for (; dx < w; dx++) + { + int sx = xofs[dx]; + const float* S0p = S0 + sx; + const float* S1p = S1 + sx; + + float a0 = alphap[0]; + float a1 = alphap[1]; + rows0p[dx] = S0p[0] * a0 + S0p[1] * a1; + rows1p[dx] = S1p[0] * a0 + S1p[1] * a1; + + alphap += 2; + } + } + + prev_sy1 = sy; + + // vresize + float b0 = beta[0]; + float b1 = beta[1]; + + float* rows0p = rows0; + float* rows1p = rows1; + float* Dp = dst.row(dy); + +#if __loongarch_sx + int nn = w >> 3; +#else + int nn = 0; +#endif + int remain = w - (nn << 3); + +#if __loongarch_sx + __m128 _b0 = __lsx_vreplfr2vr_s(b0); + __m128 _b1 = __lsx_vreplfr2vr_s(b1); + for (; nn > 0; nn--) + { + __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0); + __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0); + + __m128 _D = __lsx_vfmul_s(_rows0, _b0); + _D = __lsx_vfmadd_s(_b1, _rows1, _D); + + __lsx_vst(_D, Dp, 0); + + __m128 _rows0n = (__m128)__lsx_vld(rows0p + 4, 0); + __m128 _rows1n = (__m128)__lsx_vld(rows1p + 4, 0); + + __m128 _Dn = __lsx_vfmul_s(_rows0n, _b0); + _Dn = __lsx_vfmadd_s(_b1, _rows1n, _Dn); + + __lsx_vst(_Dn, Dp + 4, 0); + + Dp += 8; + rows0p += 8; + rows1p += 8; + } +#endif // __loongarch_sx + for (; remain; --remain) + { + // D[x] = rows0[x]*b0 + rows1[x]*b1; + *Dp++ = *rows0p++ * b0 + *rows1p++ * b1; + } + + beta += 2; + } +} diff --git a/src/layer/loongarch/interp_bilinear_pack4.h b/src/layer/loongarch/interp_bilinear_pack4.h new file mode 100644 index 000000000000..2cfb138a1cbd --- /dev/null +++ b/src/layer/loongarch/interp_bilinear_pack4.h @@ -0,0 +1,123 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void resize_bilinear_image_pack4(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) +{ + int w = dst.w; + int h = dst.h; + + // loop body + Mat rowsbuf0(w, (size_t)4 * 4u, 4); + Mat rowsbuf1(w, (size_t)4 * 4u, 4); + float* rows0 = rowsbuf0; + float* rows1 = rowsbuf1; + + int prev_sy1 = -2; + + for (int dy = 0; dy < h; dy++) + { + int sy = yofs[dy]; + + if (sy == prev_sy1) + { + // reuse all rows + } + else if (sy == prev_sy1 + 1) + { + // hresize one row + float* rows0_old = rows0; + rows0 = rows1; + rows1 = rows0_old; + const float* S1 = src.row(sy + 1); + + const float* alphap = alpha; + float* rows1p = rows1; + int dx = 0; + for (; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S1p = S1 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + + __m128 _S10 = (__m128)__lsx_vld(S1p, 0); + __m128 _S11 = (__m128)__lsx_vld(S1p + 4, 0); + __m128 _rows1 = __lsx_vfmul_s(_S10, _a0); + _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1); + __lsx_vst(_rows1, rows1p + dx * 4, 0); + + alphap += 2; + } + } + else + { + // hresize two rows + const float* S0 = src.row(sy); + const float* S1 = src.row(sy + 1); + + const float* alphap = alpha; + float* rows0p = rows0; + float* rows1p = rows1; + int dx = 0; + for (; dx < w; dx++) + { + int sx = xofs[dx] * 4; + const float* S0p = S0 + sx; + const float* S1p = S1 + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + + __m128 _S00 = (__m128)__lsx_vld(S0p, 0); + __m128 _S01 = (__m128)__lsx_vld(S0p + 4, 0); + __m128 _S10 = (__m128)__lsx_vld(S1p, 0); + __m128 _S11 = (__m128)__lsx_vld(S1p + 4, 0); + __m128 _rows0 = __lsx_vfmul_s(_S00, _a0); + __m128 _rows1 = __lsx_vfmul_s(_S10, _a0); + _rows0 = __lsx_vfmadd_s(_a1, _S01, _rows0); + _rows1 = __lsx_vfmadd_s(_a1, _S11, _rows1); + __lsx_vst(_rows0, rows0p + dx * 4, 0); + __lsx_vst(_rows1, rows1p + dx * 4, 0); + + alphap += 2; + } + } + + prev_sy1 = sy; + + // vresize + __m128 _b0 = __lsx_vreplfr2vr_s(beta[0]); + __m128 _b1 = __lsx_vreplfr2vr_s(beta[1]); + + float* rows0p = rows0; + float* rows1p = rows1; + float* Dp = dst.row(dy); + + for (int dx = 0; dx < w; dx++) + { + __m128 _rows0 = (__m128)__lsx_vld(rows0p, 0); + __m128 _rows1 = (__m128)__lsx_vld(rows1p, 0); + __m128 _D = __lsx_vfmul_s(_rows0, _b0); + _D = __lsx_vfmadd_s(_b1, _rows1, _D); + __lsx_vst(_D, Dp, 0); + + Dp += 4; + rows0p += 4; + rows1p += 4; + } + + beta += 2; + } +} diff --git a/src/layer/loongarch/interp_loongarch.cpp b/src/layer/loongarch/interp_loongarch.cpp new file mode 100644 index 000000000000..94d25cf005eb --- /dev/null +++ b/src/layer/loongarch/interp_loongarch.cpp @@ -0,0 +1,470 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "interp_loongarch.h" + +#include + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +#include "interp_bicubic.h" +#include "interp_bilinear.h" + +#if __loongarch_sx +#include "interp_bicubic_pack4.h" +#include "interp_bilinear_pack4.h" +#endif + +Interp_loongarch::Interp_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Interp_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& reference_blob = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + + int h = bottom_blob.h; + int w = bottom_blob.w; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = reference_blob.w; + int outh = reference_blob.h; + + if (dims == 1) + { + top_blob.create(outw, outh, w, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < w; q++) + { + Mat top_blob_c = top_blob.channel(q); + __m128 _v = (__m128)__lsx_vld((const float*)bottom_blob + q * 4, 0); + top_blob_c.fill(_v); + } + + return 0; + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < w; q++) + { + Mat top_blob_c = top_blob.channel(q); + const float v = bottom_blob[q]; + top_blob_c.fill(v); + } + + return 0; + } + + if (dims == 2) + { + if (outw == w) + { + top_blob = bottom_blob; + return 0; + } + + top_blob.create(outw, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 4) + { + if (resize_type == 1) // nearest + { + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + + __m128 _p = (__m128)__lsx_vld(ptr + in_x * 4, 0); + __lsx_vst(_p, outptr, 0); + + outptr += 4; + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outw * 2]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + linear_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x] * 4; + const float* Sp = ptr + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + + __m128 _S0 = (__m128)__lsx_vld(Sp, 0); + __m128 _S1 = (__m128)__lsx_vld(Sp + 4, 0); + __m128 _p = __lsx_vfmul_s(_S0, _a0); + _p = __lsx_vfmadd_s(_a1, _S1, _p); + __lsx_vst(_p, outptr, 0); + + alphap += 2; + outptr += 4; + } + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outw * 4]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x] * 4; + const float* Sp = ptr + sx; + + __m128 _a0 = __lsx_vreplfr2vr_s(alphap[0]); + __m128 _a1 = __lsx_vreplfr2vr_s(alphap[1]); + __m128 _a2 = __lsx_vreplfr2vr_s(alphap[2]); + __m128 _a3 = __lsx_vreplfr2vr_s(alphap[3]); + + __m128 _S0 = (__m128)__lsx_vld(Sp - 4, 0); + __m128 _S1 = (__m128)__lsx_vld(Sp + 0, 0); + __m128 _S2 = (__m128)__lsx_vld(Sp + 4, 0); + __m128 _S3 = (__m128)__lsx_vld(Sp + 8, 0); + __m128 _p = __lsx_vfmul_s(_S0, _a0); + _p = __lsx_vfmadd_s(_a1, _S1, _p); + _p = __lsx_vfmadd_s(_a2, _S2, _p); + _p = __lsx_vfmadd_s(_a3, _S3, _p); + __lsx_vst(_p, outptr, 0); + + alphap += 4; + outptr += 4; + } + } + + delete[] buf; + } + + return 0; + } +#endif // __loongarch_sx + + if (resize_type == 1) // nearest + { + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + *outptr++ = ptr[in_x]; + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outw * 2]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + linear_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x]; + const float* Sp = ptr + sx; + float a0 = alphap[0]; + float a1 = alphap[1]; + *outptr++ = Sp[0] * a0 + Sp[1] * a1; + alphap += 2; + } + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outw * 4]; + + int* xofs = buf; + float* alpha = (float*)(buf + outw); + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int y = 0; y < h; y++) + { + const float* ptr = bottom_blob.row(y); + float* outptr = top_blob.row(y); + const float* alphap = alpha; + + for (int x = 0; x < outw; x++) + { + int sx = xofs[x]; + const float* Sp = ptr + sx; + float a0 = alphap[0]; + float a1 = alphap[1]; + float a2 = alphap[2]; + float a3 = alphap[3]; + *outptr++ = Sp[-1] * a0 + Sp[0] * a1 + Sp[1] * a2 + Sp[2] * a3; + alphap += 4; + } + } + + delete[] buf; + } + + return 0; + } + + if (outw == w && outh == h) + { + top_blob = bottom_blob; + return 0; + } + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __loongarch_sx + if (elempack == 4) + { + if (resize_type == 1) // nearest + { + const float hs = output_height ? h / (float)outh : 1.f / height_scale; + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + for (int y = 0; y < outh; y++) + { + int in_y = std::min((int)(y * hs), (h - 1)); + + const float* ptr = src.row(in_y); + float* outptr = dst.row(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + + __m128 _p = (__m128)__lsx_vld(ptr + in_x * 4, 0); + __lsx_vst(_p, outptr, 0); + + outptr += 4; + } + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outh + outw * 2 + outh * 2]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; + float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; + + linear_coeffs(w, outw, xofs, alpha, align_corner); + linear_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bilinear_image_pack4(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outh + outw * 4 + outh * 4]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; + float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + cubic_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bicubic_image_pack4(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + return 0; + } +#endif // __loongarch_sx + + if (resize_type == 1) // nearest + { + const float hs = output_height ? h / (float)outh : 1.f / height_scale; + const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + for (int y = 0; y < outh; y++) + { + int in_y = std::min((int)(y * hs), (h - 1)); + + const float* ptr = src.row(in_y); + float* outptr = dst.row(y); + for (int x = 0; x < outw; x++) + { + int in_x = std::min((int)(x * ws), (w - 1)); + *outptr++ = ptr[in_x]; + } + } + } + } + + if (resize_type == 2) // bilinear + { + int* buf = new int[outw + outh + outw * 2 + outh * 2]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 2]; + float* beta = (float*)(buf + outw + outh + outw * 2); //new float[outh * 2]; + + linear_coeffs(w, outw, xofs, alpha, align_corner); + linear_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bilinear_image(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + if (resize_type == 3) // bicubic + { + int* buf = new int[outw + outh + outw * 4 + outh * 4]; + + int* xofs = buf; //new int[outw]; + int* yofs = buf + outw; //new int[outh]; + + float* alpha = (float*)(buf + outw + outh); //new float[outw * 4]; + float* beta = (float*)(buf + outw + outh + outw * 4); //new float[outh * 4]; + + cubic_coeffs(w, outw, xofs, alpha, align_corner); + cubic_coeffs(h, outh, yofs, beta, align_corner); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat src = bottom_blob.channel(q); + Mat dst = top_blob.channel(q); + + resize_bicubic_image(src, dst, alpha, xofs, beta, yofs); + } + + delete[] buf; + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/interp_loongarch.h b/src/layer/loongarch/interp_loongarch.h new file mode 100644 index 000000000000..4c0e0f3dc86b --- /dev/null +++ b/src/layer/loongarch/interp_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_INTERP_LOONGARCH_H +#define LAYER_INTERP_LOONGARCH_H + +#include "interp.h" + +namespace ncnn { + +class Interp_loongarch : virtual public Interp +{ +public: + Interp_loongarch(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_INTERP_LOONGARCH_H diff --git a/src/layer/loongarch/loongarch_activation.h b/src/layer/loongarch/loongarch_activation.h new file mode 100644 index 000000000000..abb268f4bb6d --- /dev/null +++ b/src/layer/loongarch/loongarch_activation.h @@ -0,0 +1,70 @@ +// yala is pleased to support the open source community by making ncnn available. 
+// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LOONGARCH_ACTIVATION_H +#define LOONGARCH_ACTIVATION_H + +#include "fused_activation.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" + +static inline __m128 activation_ps(__m128 _v, int activation_type, const ncnn::Mat& activation_params) +{ + if (activation_type == 1) + { + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + _v = __lsx_vfmax_s(_v, _zero); + } + else if (activation_type == 2) + { + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(activation_params[0]); + __m128i _lemask = __lsx_vfcmp_cle_s(_v, _zero); + __m128 _ps = __lsx_vfmul_s(_v, _slope); + _v = (__m128)__lsx_vbitsel_v((__m128i)_v, (__m128i)_ps, (__m128i)_lemask); + } + else if (activation_type == 3) + { + __m128 _min = (__m128)__lsx_vreplfr2vr_s(activation_params[0]); + __m128 _max = (__m128)__lsx_vreplfr2vr_s(activation_params[1]); + _v = __lsx_vfmax_s(_v, _min); + _v = __lsx_vfmin_s(_v, _max); + } + else if (activation_type == 4) + { + _v = sigmoid_ps(_v); + } + else if (activation_type == 5) + { + _v = __lsx_vfmul_s(_v, tanh_ps(log_ps(__lsx_vfadd_s(exp_ps(_v), (__m128)__lsx_vreplfr2vr_s(1.f))))); + } + else if (activation_type == 6) + { + __m128 _alpha = (__m128)__lsx_vreplfr2vr_s(activation_params[0]); + __m128 _beta = (__m128)__lsx_vreplfr2vr_s(activation_params[1]); + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + __m128 _outp = __lsx_vfmadd_s(_alpha, _v, _beta); + _outp = __lsx_vfmax_s(_outp, _zero); + _outp = __lsx_vfmin_s(_outp, _one); + _v = __lsx_vfmul_s(_outp, _v); + } + + return _v; +} +#endif // __loongarch_sx + +#endif // LOONGARCH_ACTIVATION_H diff --git a/src/layer/loongarch/loongarch_usability.h b/src/layer/loongarch/loongarch_usability.h new file mode 100644 index 000000000000..d3ae5dec279d --- /dev/null +++ b/src/layer/loongarch/loongarch_usability.h @@ -0,0 +1,236 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
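+
+// Shared helpers for the LoongArch LSX layer implementations. The float2int8*
+// family quantizes floats to signed 8-bit with saturation to [-127, 127]; the
+// lower bound is -127 rather than -128, which keeps the quantized range symmetric.
+// The vector variants emulate round-half-away-from-zero (the behaviour of round())
+// without a dedicated rounding instruction: OR the sign of the input into 0.5,
+// add it, truncate toward zero with __lsx_vftintrz_w_s, then narrow with saturation
+// through __lsx_vsat_w/__lsx_vsat_h and __lsx_vpickev_h/__lsx_vpickev_b.
+//
+// Scalar sketch of the rounding trick (illustrative):
+//   v = -2.5 -> v + copysignf(0.5f, v) = -3.0 -> truncate -> -3
+//   v =  2.3 -> v + 0.5f               =  2.8 -> truncate ->  2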
+ +#ifndef LOONGARCH_USABILITY_H +#define LOONGARCH_USABILITY_H + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include +#include + +namespace ncnn { + +typedef union +{ + int32_t i; + float f; +} FloatInt; + +} // namespace ncnn + +#if __loongarch_sx +/* declare some loongarch constants with union */ +#define _LOONGARCH_FLOAT_CONST(Name, Val) \ + static const ncnn::FloatInt Name = {.f = Val} + +/* float type data load instructions */ +static NCNN_FORCEINLINE __m128 __lsx_vreplfr2vr_s(float val) +{ + ncnn::FloatInt fi_tmpval = {.f = val}; + return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i); +} + +static NCNN_FORCEINLINE float __lsx_reduce_fadd_s(__m128 _v) +{ + // TODO find a more efficient way + float* _v_p = (float*)&_v; + return _v_p[0] + _v_p[1] + _v_p[2] + _v_p[3]; +} + +static NCNN_FORCEINLINE int __lsx_reduce_add_w(__m128i _v) +{ + // TODO find a more efficient way + int* _v_p = (int*)&_v; + return _v_p[0] + _v_p[1] + _v_p[2] + _v_p[3]; +} + +#endif // __loongarch_sx + +static NCNN_FORCEINLINE signed char float2int8(float v) +{ + int int32 = round(v); + if (int32 > 127) return 127; + if (int32 < -127) return -127; + return (signed char)int32; +} + +#if __loongarch_sx +static NCNN_FORCEINLINE __m128i float2int8(__m128 _v) +{ + // simulate round to nearest via +/-0.5 + __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _sign = __lsx_vand_v((__m128i)_v, _signmask); + __m128 _p5s = (__m128)__lsx_vor_v((__m128i)_p5, (__m128i)_sign); + __m128 _v5 = __lsx_vfadd_s(_v, _p5s); + __m128i _v32 = __lsx_vftintrz_w_s(_v5); + + __m128i _v32_16 = __lsx_vsat_w(_v32, 15); + __m128i _v16 = __lsx_vpickev_h(_v32_16, _v32_16); + _v16 = __lsx_vmax_h(_v16, __lsx_vreplgr2vr_h(-127)); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8; +} + +static NCNN_FORCEINLINE int64_t float2int8(__m128 _vlow, __m128 _vhigh) +{ + // simulate round to nearest via +/-0.5 + __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _signlow = __lsx_vand_v((__m128i)_vlow, _signmask); + __m128i _signhigh = __lsx_vand_v((__m128i)_vhigh, _signmask); + __m128 _p5low = (__m128)__lsx_vor_v((__m128i)_p5, _signlow); + __m128 _p5high = (__m128)__lsx_vor_v((__m128i)_p5, _signhigh); + __m128 _vlow5 = __lsx_vfadd_s(_vlow, _p5low); + __m128 _vhigh5 = __lsx_vfadd_s(_vhigh, _p5high); + __m128i _vlow32 = __lsx_vftintrz_w_s(_vlow5); + __m128i _vhigh32 = __lsx_vftintrz_w_s(_vhigh5); + + __m128i _vlow32_16 = __lsx_vsat_w(_vlow32, 15); + __m128i _vhigh32_16 = __lsx_vsat_w(_vhigh32, 15); + __m128i _v16 = __lsx_vpickev_h(_vhigh32_16, _vlow32_16); + _v16 = __lsx_vmax_h(_v16, __lsx_vreplgr2vr_h(-127)); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8[0]; +} + +static NCNN_FORCEINLINE __m128i float2int8relu(__m128 _v) +{ + // simulate round to nearest via +/-0.5 + __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _sign = __lsx_vand_v((__m128i)_v, _signmask); + __m128 _p5s = (__m128)__lsx_vor_v((__m128i)_p5, _sign); + __m128 _v5 = __lsx_vfadd_s(_v, _p5s); + __m128i _v32 = __lsx_vftintrz_w_s(_v5); + + __m128i _v32_16 = __lsx_vsat_w(_v32, 15); + __m128i _v16 = __lsx_vpickev_h(_v32_16, _v32_16); + _v16 = __lsx_vmaxi_h(_v16, 0); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8; +} + +static NCNN_FORCEINLINE 
int64_t float2int8relu(__m128 _vlow, __m128 _vhigh) +{ + // simulate round to nearest via +/-0.5 + __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _signlow = __lsx_vand_v((__m128i)_vlow, _signmask); + __m128i _signhigh = __lsx_vand_v((__m128i)_vhigh, _signmask); + __m128 _p5low = (__m128)__lsx_vor_v((__m128i)_p5, _signlow); + __m128 _p5high = (__m128)__lsx_vor_v((__m128i)_p5, _signhigh); + __m128 _vlow5 = __lsx_vfadd_s(_vlow, _p5low); + __m128 _vhigh5 = __lsx_vfadd_s(_vhigh, _p5high); + __m128i _vlow32 = __lsx_vftintrz_w_s(_vlow5); + __m128i _vhigh32 = __lsx_vftintrz_w_s(_vhigh5); + + __m128i _vlow32_16 = __lsx_vsat_w(_vlow32, 15); + __m128i _vhigh32_16 = __lsx_vsat_w(_vhigh32, 15); + __m128i _v16 = __lsx_vpickev_h(_vhigh32_16, _vlow32_16); + _v16 = __lsx_vmaxi_h(_v16, 0); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8[0]; +} + +static NCNN_FORCEINLINE __m128i float2int8leakyrelu(__m128 _v, __m128 _slope) +{ + __m128 _v_leaky = __lsx_vfmul_s(_v, _slope); + + // simulate round to nearest via +/-0.5 + __m128 _p5 = (__m128)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _sign = __lsx_vand_v((__m128i)_v, _signmask); + __m128 _p5s = (__m128)__lsx_vor_v((__m128i)_p5, _sign); + __m128 _v5 = __lsx_vfadd_s(_v, _p5s); + __m128i _v32 = __lsx_vftintrz_w_s(_v5); + + __m128i _sign_leaky = __lsx_vand_v((__m128i)_v_leaky, _signmask); + __m128 _p5_leaky = (__m128)__lsx_vor_v((__m128i)_p5, _sign_leaky); + __m128 _v5_leaky = __lsx_vfadd_s(_v_leaky, _p5_leaky); + __m128i _v32_leaky = __lsx_vftintrz_w_s(_v5_leaky); + + __m128i _v32_16 = __lsx_vsat_w(_v32, 15); + __m128i _v16 = __lsx_vpickev_h(_v32_16, _v32_16); + + __m128i _v32_16_leaky = __lsx_vsat_w(_v32_leaky, 15); + __m128i _v16_leaky = __lsx_vpickev_h(_v32_16_leaky, _v32_16_leaky); + + _v16 = __lsx_vmax_h(_v16, _v16_leaky); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8; +} + +static NCNN_FORCEINLINE int64_t float2int8leakyrelu(__m128 _vlow, __m128 _vhigh, __m128 _slope) +{ + __m128 _vlow_leaky = __lsx_vfmul_s(_vlow, _slope); + __m128 _vhigh_leaky = __lsx_vfmul_s(_vhigh, _slope); + + // simulate round to nearest via +/-0.5 + __m128i _p5 = (__m128i)__lsx_vreplfr2vr_s(0.5f); + __m128i _signmask = __lsx_vreplgr2vr_w(1 << 31); + + __m128i _signlow = __lsx_vand_v((__m128i)_vlow, _signmask); + __m128i _signhigh = __lsx_vand_v((__m128i)_vhigh, _signmask); + __m128 _p5low = (__m128)__lsx_vor_v(_p5, _signlow); + __m128 _p5high = (__m128)__lsx_vor_v(_p5, _signhigh); + __m128 _vlow5 = __lsx_vfadd_s(_vlow, _p5low); + __m128 _vhigh5 = __lsx_vfadd_s(_vhigh, _p5high); + __m128i _vlow32 = __lsx_vftintrz_w_s(_vlow5); + __m128i _vhigh32 = __lsx_vftintrz_w_s(_vhigh5); + + __m128i _signlow_leaky = __lsx_vand_v((__m128i)_vlow_leaky, _signmask); + __m128i _signhigh_leaky = __lsx_vand_v((__m128i)_vhigh_leaky, _signmask); + __m128 _p5low_leaky = (__m128)__lsx_vor_v(_p5, _signlow_leaky); + __m128 _p5high_leaky = (__m128)__lsx_vor_v(_p5, _signhigh_leaky); + __m128 _vlow5_leaky = __lsx_vfadd_s(_vlow_leaky, _p5low_leaky); + __m128 _vhigh5_leaky = __lsx_vfadd_s(_vhigh_leaky, _p5high_leaky); + __m128i _vlow32_leaky = __lsx_vftintrz_w_s(_vlow5_leaky); + __m128i _vhigh32_leaky = __lsx_vftintrz_w_s(_vhigh5_leaky); + + __m128i _vlow32_16 = __lsx_vsat_w(_vlow32, 15); + __m128i _vhigh32_16 = __lsx_vsat_w(_vhigh32, 15); + __m128i _v16 = __lsx_vpickev_h(_vhigh32_16, 
_vlow32_16); + + __m128i _vlow32_16_leaky = __lsx_vsat_w(_vlow32_leaky, 15); + __m128i _vhigh32_16_leaky = __lsx_vsat_w(_vhigh32_leaky, 15); + __m128i _v16_leaky = __lsx_vpickev_h(_vhigh32_16_leaky, _vlow32_16_leaky); + + _v16 = __lsx_vmax_h(_v16, _v16_leaky); + __m128i _v16_8 = __lsx_vsat_h(_v16, 7); + __m128i _v8 = __lsx_vpickev_b(_v16_8, _v16_8); + + return _v8[0]; +} +#endif // __loongarch_sx + +#endif // LOONGARCH_USABILITY_H diff --git a/src/layer/loongarch/lsx_mathfun.h b/src/layer/loongarch/lsx_mathfun.h new file mode 100644 index 000000000000..ededa5966593 --- /dev/null +++ b/src/layer/loongarch/lsx_mathfun.h @@ -0,0 +1,258 @@ +/* LOONGARCH implementation of exp + * + * Inspired by Intel Approximate Math library, and based on the + * corresponding algorithms of the cephes math library + * Copyright (C) 2022 yala ;. All rights reserved. + */ + +/* + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * (this is the zlib license) + */ + +#ifndef LSX_MATHFUN_H +#define LSX_MATHFUN_H + +#include "loongarch_usability.h" + +#include + +_LOONGARCH_FLOAT_CONST(c_1, 1.0f); +_LOONGARCH_FLOAT_CONST(c_2, 2.0f); +_LOONGARCH_FLOAT_CONST(c_n1, -1.0f); +_LOONGARCH_FLOAT_CONST(c_0p5, 0.5f); + +#define c_inv_mant_mask ~0x7f800000u +_LOONGARCH_FLOAT_CONST(c_cephes_SQRTHF, 0.707106781186547524); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p0, 7.0376836292E-2); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p1, -1.1514610310E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p2, 1.1676998740E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p3, -1.2420140846E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p4, +1.4249322787E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p5, -1.6668057665E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p6, +2.0000714765E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p7, -2.4999993993E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_p8, +3.3333331174E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_log_q1, -2.12194440e-4); +_LOONGARCH_FLOAT_CONST(c_cephes_log_q2, 0.693359375); + +/* natural logarithm computed for 4 simultaneous float + * return NaN for x <= 0 + */ +static inline __m128 log_ps(__m128 x) +{ + __m128 one = (__m128)__lsx_vreplgr2vr_w(c_1.i); + + x = __lsx_vfmax_s(x, (__m128)__lsx_vreplgr2vr_w(0)); /* force flush to zero on denormal values */ + __m128i invalid_mask = __lsx_vfcmp_cle_s(x, (__m128)__lsx_vreplgr2vr_w(0)); + + __m128i ux = (__m128i)(x); + + __m128i emm0 = __lsx_vsrl_w(ux, __lsx_vreplgr2vr_w(23)); + + /* keep only the fractional part */ + ux = __lsx_vand_v(ux, __lsx_vreplgr2vr_w(c_inv_mant_mask)); + ux = __lsx_vor_v(ux, __lsx_vreplgr2vr_w(c_0p5.i)); + x = (__m128)(ux); + + emm0 = __lsx_vsub_w(emm0, __lsx_vreplgr2vr_w(0x7f)); + __m128 e = __lsx_vffint_s_w(emm0); + + e = __lsx_vfadd_s(e, one); + + /* part2: + * if( x < 
SQRTHF ) { + * e -= 1; + * x = x + x - 1.0; + * } else { x = x - 1.0; } + */ + __m128i mask = __lsx_vfcmp_clt_s((__m128)x, (__m128)__lsx_vreplgr2vr_w(c_cephes_SQRTHF.i)); + __m128 tmp = (__m128)(__lsx_vand_v((__m128i)(x), (__m128i)mask)); + x = __lsx_vfsub_s(x, one); + e = __lsx_vfsub_s(e, (__m128)(__lsx_vand_v((__m128i)(one), (__m128i)mask))); + x = __lsx_vfadd_s(x, tmp); + + __m128 z = __lsx_vfmul_s(x, x); + + __m128 y = (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p0.i); + + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p1.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p2.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p3.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p4.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p5.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p6.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p7.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_p8.i)); + y = __lsx_vfmul_s(y, x); + + y = __lsx_vfmul_s(y, z); + + tmp = __lsx_vfmul_s(e, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_q1.i)); + y = __lsx_vfadd_s(y, tmp); + + tmp = __lsx_vfmul_s(z, (__m128)__lsx_vreplgr2vr_w(c_0p5.i)); + y = __lsx_vfsub_s(y, tmp); + + tmp = __lsx_vfmul_s(e, (__m128)__lsx_vreplgr2vr_w(c_cephes_log_q2.i)); + x = __lsx_vfadd_s(x, y); + x = __lsx_vfadd_s(x, tmp); + x = (__m128)(__lsx_vor_v((__m128i)(x), (__m128i)invalid_mask)); // negative arg will be NAN + return x; +} + +_LOONGARCH_FLOAT_CONST(c_exp_hi, 88.3762626647949f); +_LOONGARCH_FLOAT_CONST(c_exp_lo, -88.3762626647949f); + +_LOONGARCH_FLOAT_CONST(c_cephes_LOG2EF, 1.44269504088896341); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_C1, 0.693359375); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_C2, -2.12194440e-4); + +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p0, 1.9875691500E-4); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p1, 1.3981999507E-3); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p2, 8.3334519073E-3); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p3, 4.1665795894E-2); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p4, 1.6666665459E-1); +_LOONGARCH_FLOAT_CONST(c_cephes_exp_p5, 5.0000001201E-1); + +/* exp() computed for 4 float at once */ +static inline __m128 exp_ps(__m128 x) +{ + __m128 tmp, fx; + + __m128 one = (__m128)__lsx_vreplgr2vr_w(c_1.i); + x = __lsx_vfmin_s(x, (__m128)__lsx_vreplgr2vr_w(c_exp_hi.i)); + x = __lsx_vfmax_s(x, (__m128)__lsx_vreplgr2vr_w(c_exp_lo.i)); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = __lsx_vfmul_s(x, (__m128)__lsx_vreplgr2vr_w(c_cephes_LOG2EF.i)); + fx = __lsx_vfadd_s(fx, (__m128)__lsx_vreplgr2vr_w(c_0p5.i)); + + /* perform a floorf */ + tmp = __lsx_vffint_s_w(__lsx_vftint_w_s(fx)); + + /* if greater, substract 1 */ + __m128i mask = __lsx_vfcmp_clt_s(fx, tmp); + mask = __lsx_vand_v(mask, (__m128i)one); + + fx = __lsx_vfsub_s(tmp, (__m128)mask); + + tmp = __lsx_vfmul_s(fx, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_C1.i)); + __m128 z = __lsx_vfmul_s(fx, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_C2.i)); + x = __lsx_vfsub_s(x, tmp); + x = __lsx_vfsub_s(x, z); + + __m128 y = (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p0.i); + + z = __lsx_vfmul_s(x, x); + + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p1.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p2.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p3.i)); + y = __lsx_vfmadd_s(x, y, (__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p4.i)); + y = __lsx_vfmadd_s(x, y, 
(__m128)__lsx_vreplgr2vr_w(c_cephes_exp_p5.i)); + + y = __lsx_vfmul_s(y, z); + y = __lsx_vfadd_s(y, x); + y = __lsx_vfadd_s(y, one); + + /* build 2^n */ + __m128i mm; + mm = __lsx_vftintrz_w_s(fx); + mm = __lsx_vadd_w(mm, __lsx_vreplgr2vr_w(0x7f)); + mm = __lsx_vsll_w(mm, __lsx_vreplgr2vr_w(23)); + + y = __lsx_vfmul_s(y, (__m128)mm); + return y; +} + +_LOONGARCH_FLOAT_CONST(c_tanh_tiny, 1e-4f); +_LOONGARCH_FLOAT_CONST(c_tanh_hi, 9.0f); +// The monomial coefficients of the numerator polynomial (odd). +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_1, 4.89352455891786e-3f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_3, 6.37261928875436e-4f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_5, 1.48572235717979e-5f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_7, 5.12229709037114e-8f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_9, -8.60467152213735e-11f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_11, 2.00018790482477e-13f); +_LOONGARCH_FLOAT_CONST(c_tanh_alpha_13, -2.76076847742355e-16f); +// The monomial coefficients of the denominator polynomial (even). +_LOONGARCH_FLOAT_CONST(c_tanh_beta_0, 4.89352518554385e-3f); +_LOONGARCH_FLOAT_CONST(c_tanh_beta_2, 2.26843463243900e-3f); +_LOONGARCH_FLOAT_CONST(c_tanh_beta_4, 1.18534705686654e-4f); +_LOONGARCH_FLOAT_CONST(c_tanh_beta_6, 1.19825839466702e-6f); + +/* tanh() computed for 4 float at once */ +static inline __m128 tanh_ps(__m128 x) +{ + __m128 x2 = (__m128)__lsx_vbitclri_w((__m128i)x, 31); + __m128i tiny_mask = __lsx_vfcmp_clt_s((__m128)x2, (__m128)(__m128)__lsx_vreplgr2vr_w(c_tanh_tiny.i)); + __m128i sig_mask = __lsx_vreplgr2vr_w(1 << 31); + __m128i sig_save = __lsx_vand_v((__m128i)x, sig_mask); + + // clamp the inputs to the range [-9, 9] since anything outside + // this range is -/+1.0f in single-precision. + x2 = (__m128)__lsx_vbitsel_v((__m128i)x2, (__m128i)__lsx_vreplgr2vr_w(c_tanh_hi.i), (__m128i)__lsx_vfcmp_clt_s((__m128)__lsx_vreplgr2vr_w(c_tanh_hi.i), (__m128)x2)); + + // since the polynomials are odd/even, we need x**2. + __m128 z = __lsx_vfmul_s(x2, x2); + + // evaluate the numerator polynomial y. + __m128 y = (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_13.i); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_11.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_9.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_7.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_5.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_3.i)); + y = __lsx_vfmadd_s(z, y, (__m128)__lsx_vreplgr2vr_w(c_tanh_alpha_1.i)); + y = __lsx_vfmul_s(y, x2); + + // evaluate the denominator polynomial w. + __m128 w = (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_6.i); + w = __lsx_vfmadd_s(z, w, (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_4.i)); + w = __lsx_vfmadd_s(z, w, (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_2.i)); + w = __lsx_vfmadd_s(z, w, (__m128)__lsx_vreplgr2vr_w(c_tanh_beta_0.i)); + + // divide the numerator by the denominator. + y = __lsx_vfdiv_s(y, w); + + // reinstate the sign. + y = (__m128)__lsx_vor_v((__m128i)y, sig_save); + + // when the argument is very small in magnitude it's more accurate to just return it. 
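+    // __lsx_vbitsel_v(a, b, mask) keeps the bits of a where mask is clear and takes the bits of b
+    // where mask is set, so lanes flagged by tiny_mask (|x| < c_tanh_tiny) return the input x unchanged.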
+ y = (__m128)__lsx_vbitsel_v((__m128i)y, (__m128i)x, (__m128i)tiny_mask); + + return y; +} + +static inline __m128 pow_ps(__m128 a, __m128 b) +{ + // pow(x, m) = exp(m * log(x)) + return exp_ps(__lsx_vfmul_s(b, log_ps(a))); +} + +static inline __m128 sigmoid_ps(__m128 _v) +{ + __m128 _one = __lsx_vreplfr2vr_s(1.f); + _v = (__m128)__lsx_vbitrevi_w((__m128i)_v, 31); + _v = exp_ps(_v); + _v = __lsx_vfadd_s(_v, _one); + return __lsx_vfdiv_s(_one, _v); +} + +#endif // LSX_MATHFUN_H diff --git a/src/layer/loongarch/mish_loongarch.cpp b/src/layer/loongarch/mish_loongarch.cpp new file mode 100644 index 000000000000..8558e2f8cb06 --- /dev/null +++ b/src/layer/loongarch/mish_loongarch.cpp @@ -0,0 +1,70 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "mish_loongarch.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +#include + +namespace ncnn { + +Mish_loongarch::Mish_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Mish_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmul_s(_p, tanh_ps(log_ps(__lsx_vfadd_s(exp_ps(_p), _one)))); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = *ptr * tanh(log(exp(*ptr) + 1.f)); + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/mish_loongarch.h b/src/layer/loongarch/mish_loongarch.h new file mode 100644 index 000000000000..97c6f0520f50 --- /dev/null +++ b/src/layer/loongarch/mish_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
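+// Mish_loongarch overrides Mish::forward_inplace with an LSX path that evaluates
+// mish(x) = x * tanh(ln(1 + exp(x))) four floats at a time and falls back to scalar math for the tail.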
+ +#ifndef LAYER_MISH_LOONGARCH_H +#define LAYER_MISH_LOONGARCH_H + +#include "mish.h" + +namespace ncnn { + +class Mish_loongarch : virtual public Mish +{ +public: + Mish_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_MISH_LOONGARCH_H diff --git a/src/layer/loongarch/packing_loongarch.cpp b/src/layer/loongarch/packing_loongarch.cpp new file mode 100644 index 000000000000..cf68b7b34d69 --- /dev/null +++ b/src/layer/loongarch/packing_loongarch.cpp @@ -0,0 +1,569 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "packing_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +namespace ncnn { + +Packing_loongarch::Packing_loongarch() +{ + support_packing = true; +} + +int Packing_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + if (elembits == 8) + return forward_int8(bottom_blob, top_blob, opt); + + if (use_padding) + { + return Packing::forward(bottom_blob, top_blob, opt); + } + + if (elembits != 32) + { + // non-fp32 type + return Packing::forward(bottom_blob, top_blob, opt); + } + + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + if (elempack == out_elempack) + { + top_blob = bottom_blob; + return 0; + } + + bool pack1to4 = elempack == 1 && out_elempack == 4; + bool pack4to1 = elempack == 4 && out_elempack == 1; + + if (!pack1to4 && !pack4to1) + { + return Packing::forward(bottom_blob, top_blob, opt); + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + + if (!use_padding) + { + // identity if use_padding not allowed + if (dims == 1 && w * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if (dims == 2 && h * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if ((dims == 3 || dims == 4) && channels * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + } + + if (dims == 1) + { + top_blob = bottom_blob; + top_blob.w = w * elempack / out_elempack; + top_blob.cstep = w * elempack / out_elempack; + top_blob.elemsize = elemsize / elempack * out_elempack; + top_blob.elempack = out_elempack; + return 0; + } + + if (dims == 2) + { + int outh = h * elempack / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (pack1to4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* r0 = bottom_blob.row(i * 4); + const float* r1 = bottom_blob.row(i * 4 + 1); + const float* r2 = bottom_blob.row(i * 4 + 2); + const float* r3 = bottom_blob.row(i * 4 + 3); + + float* 
outptr = top_blob.row(i); + + int j = 0; +#if __loongarch_sx + for (; j + 3 < w; j += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r3 = __lsx_vld(r3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr, 0); + __lsx_vst(_r0123_1, outptr + 4, 0); + __lsx_vst(_r0123_2, outptr + 4 * 2, 0); + __lsx_vst(_r0123_3, outptr + 4 * 3, 0); + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + outptr += 16; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + outptr[0] = *r0++; + outptr[1] = *r1++; + outptr[2] = *r2++; + outptr[3] = *r3++; + + outptr += 4; + } + } + } + if (pack4to1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* r0 = bottom_blob.row(i); + + float* outptr0 = top_blob.row(i * 4); + float* outptr1 = top_blob.row(i * 4 + 1); + float* outptr2 = top_blob.row(i * 4 + 2); + float* outptr3 = top_blob.row(i * 4 + 3); + + int j = 0; +#if __loongarch_sx + for (; j + 3 < w; j += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr0, 0); + __lsx_vst(_r0123_1, outptr1, 0); + __lsx_vst(_r0123_2, outptr2, 0); + __lsx_vst(_r0123_3, outptr3, 0); + + r0 += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + } + } + + return 0; + } + + if (dims == 3 || dims == 4) + { + int size = w * h * d; + int outc = channels * elempack / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (dims == 3) + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + else // if (dims == 4) + top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (pack1to4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* r0 = bottom_blob.channel(q * 4); + const float* r1 = bottom_blob.channel(q * 4 + 1); + const float* r2 = bottom_blob.channel(q * 4 + 2); + const float* r3 = bottom_blob.channel(q * 4 + 3); + + float* outptr = top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r1, 0); + __m128i _r2 = __lsx_vld(r2, 0); + __m128i _r3 = __lsx_vld(r3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = 
__lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr, 0); + __lsx_vst(_r0123_1, outptr + 4, 0); + __lsx_vst(_r0123_2, outptr + 4 * 2, 0); + __lsx_vst(_r0123_3, outptr + 4 * 3, 0); + + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + outptr += 16; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + outptr[0] = *r0++; + outptr[1] = *r1++; + outptr[2] = *r2++; + outptr[3] = *r3++; + + outptr += 4; + } + } + } + if (pack4to1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* r0 = bottom_blob.channel(q); + + float* outptr0 = top_blob.channel(q * 4); + float* outptr1 = top_blob.channel(q * 4 + 1); + float* outptr2 = top_blob.channel(q * 4 + 2); + float* outptr3 = top_blob.channel(q * 4 + 3); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + // transpose 4x4 + __m128i _r0 = __lsx_vld(r0, 0); + __m128i _r1 = __lsx_vld(r0 + 4, 0); + __m128i _r2 = __lsx_vld(r0 + 4 * 2, 0); + __m128i _r3 = __lsx_vld(r0 + 4 * 3, 0); + + __m128i _r01r = __lsx_vilvl_w(_r1, _r0); + __m128i _r01l = __lsx_vilvh_w(_r1, _r0); + __m128i _r23r = __lsx_vilvl_w(_r3, _r2); + __m128i _r23l = __lsx_vilvh_w(_r3, _r2); + __m128i _r0123_0 = __lsx_vilvl_d(_r23r, _r01r); + __m128i _r0123_1 = __lsx_vilvh_d(_r23r, _r01r); + __m128i _r0123_2 = __lsx_vilvl_d(_r23l, _r01l); + __m128i _r0123_3 = __lsx_vilvh_d(_r23l, _r01l); + + __lsx_vst(_r0123_0, outptr0, 0); + __lsx_vst(_r0123_1, outptr1, 0); + __lsx_vst(_r0123_2, outptr2, 0); + __lsx_vst(_r0123_3, outptr3, 0); + + r0 += 16; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + + r0 += 4; + } + } + } + + return 0; + } + + return 0; +} + +int Packing_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + if (use_padding) + { + return Packing::forward(bottom_blob, top_blob, opt); + } + + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + if (elempack == out_elempack) + { + top_blob = bottom_blob; + return 0; + } + + bool pack1to8 = elempack == 1 && out_elempack == 8; + bool pack8to1 = elempack == 8 && out_elempack == 1; + + if (!pack1to8 && !pack8to1) + { + return Packing::forward(bottom_blob, top_blob, opt); + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + + if (!use_padding) + { + // identity if use_padding not allowed + if (dims == 1 && w * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if (dims == 2 && h * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if ((dims == 3 || dims == 4) && channels * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + } + + if (dims == 1) + { + top_blob = bottom_blob; + top_blob.w = w * elempack / out_elempack; + top_blob.cstep = w * elempack / out_elempack; + top_blob.elemsize = elemsize / elempack * out_elempack; + top_blob.elempack = out_elempack; + return 0; + } + + if (dims == 2) + { + int outh = h * elempack / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if 
(pack1to8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const signed char* r0 = bottom_blob.row(i * 8); + const signed char* r1 = bottom_blob.row(i * 8 + 1); + const signed char* r2 = bottom_blob.row(i * 8 + 2); + const signed char* r3 = bottom_blob.row(i * 8 + 3); + const signed char* r4 = bottom_blob.row(i * 8 + 4); + const signed char* r5 = bottom_blob.row(i * 8 + 5); + const signed char* r6 = bottom_blob.row(i * 8 + 6); + const signed char* r7 = bottom_blob.row(i * 8 + 7); + + signed char* outptr = top_blob.row(i); + + int j = 0; + for (; j < w; j++) + { + outptr[0] = *r0++; + outptr[1] = *r1++; + outptr[2] = *r2++; + outptr[3] = *r3++; + outptr[4] = *r4++; + outptr[5] = *r5++; + outptr[6] = *r6++; + outptr[7] = *r7++; + + outptr += 8; + } + } + } + if (pack8to1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const signed char* r0 = bottom_blob.row(i); + + signed char* outptr0 = top_blob.row(i * 8); + signed char* outptr1 = top_blob.row(i * 8 + 1); + signed char* outptr2 = top_blob.row(i * 8 + 2); + signed char* outptr3 = top_blob.row(i * 8 + 3); + signed char* outptr4 = top_blob.row(i * 8 + 4); + signed char* outptr5 = top_blob.row(i * 8 + 5); + signed char* outptr6 = top_blob.row(i * 8 + 6); + signed char* outptr7 = top_blob.row(i * 8 + 7); + + int j = 0; + for (; j < w; j++) + { + *outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + *outptr4++ = r0[4]; + *outptr5++ = r0[5]; + *outptr6++ = r0[6]; + *outptr7++ = r0[7]; + + r0 += 8; + } + } + } + + return 0; + } + + if (dims == 3 || dims == 4) + { + int size = w * h * d; + int outc = channels * elempack / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (dims == 3) + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + else // if (dims == 4) + top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (pack1to8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const signed char* r0 = bottom_blob.channel(q * 8); + const signed char* r1 = bottom_blob.channel(q * 8 + 1); + const signed char* r2 = bottom_blob.channel(q * 8 + 2); + const signed char* r3 = bottom_blob.channel(q * 8 + 3); + const signed char* r4 = bottom_blob.channel(q * 8 + 4); + const signed char* r5 = bottom_blob.channel(q * 8 + 5); + const signed char* r6 = bottom_blob.channel(q * 8 + 6); + const signed char* r7 = bottom_blob.channel(q * 8 + 7); + + signed char* outptr = top_blob.channel(q); + + int i = 0; + for (; i < size; i++) + { + outptr[0] = *r0++; + outptr[1] = *r1++; + outptr[2] = *r2++; + outptr[3] = *r3++; + outptr[4] = *r4++; + outptr[5] = *r5++; + outptr[6] = *r6++; + outptr[7] = *r7++; + + outptr += 8; + } + } + } + if (pack8to1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const signed char* r0 = bottom_blob.channel(q); + + signed char* outptr0 = top_blob.channel(q * 8); + signed char* outptr1 = top_blob.channel(q * 8 + 1); + signed char* outptr2 = top_blob.channel(q * 8 + 2); + signed char* outptr3 = top_blob.channel(q * 8 + 3); + signed char* outptr4 = top_blob.channel(q * 8 + 4); + signed char* outptr5 = top_blob.channel(q * 8 + 5); + signed char* outptr6 = top_blob.channel(q * 8 + 6); + signed char* outptr7 = top_blob.channel(q * 8 + 7); + + int i = 0; + for (; i < size; i++) + { + 
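+                    // unpack one elempack-8 int8 group into its 8 destination channels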
*outptr0++ = r0[0]; + *outptr1++ = r0[1]; + *outptr2++ = r0[2]; + *outptr3++ = r0[3]; + *outptr4++ = r0[4]; + *outptr5++ = r0[5]; + *outptr6++ = r0[6]; + *outptr7++ = r0[7]; + + r0 += 8; + } + } + } + + return 0; + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/packing_loongarch.h b/src/layer/loongarch/packing_loongarch.h new file mode 100644 index 000000000000..1db215cfee7a --- /dev/null +++ b/src/layer/loongarch/packing_loongarch.h @@ -0,0 +1,35 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_PACKING_LOONGARCH_H +#define LAYER_PACKING_LOONGARCH_H + +#include "packing.h" + +namespace ncnn { + +class Packing_loongarch : virtual public Packing +{ +public: + Packing_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_PACKING_LOONGARCH_H diff --git a/src/layer/loongarch/padding_loongarch.cpp b/src/layer/loongarch/padding_loongarch.cpp new file mode 100644 index 000000000000..1f345ce60532 --- /dev/null +++ b/src/layer/loongarch/padding_loongarch.cpp @@ -0,0 +1,385 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
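+// Padding_loongarch keeps packed blobs packed while padding: elempack-4 float and elempack-8 int8
+// data go through the constant/replicate/reflect pack4/pack8 helpers below, and any case the packed
+// paths cannot handle is unpacked with convert_packing and forwarded to the generic Padding layer.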
+ +#include "padding_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +#if __loongarch_sx +#include "padding_pack4.h" +#include "padding_pack8_int8.h" +#endif // __loongarch_sx + +Padding_loongarch::Padding_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Padding_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + if (top == 0 && bottom == 0 && left == 0 && right == 0 && front == 0 && behind == 0) + { + top_blob = bottom_blob; + return 0; + } + + int elembits = bottom_blob.elembits(); + + if (elembits == 8) + return forward_int8(bottom_blob, top_blob, opt); + + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 4) + { + if (dims == 1) + { + int outw = w * elempack + left + right; + + int out_elempack = outw % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (left % 4 == 0 && out_elempack == 4 && type == 0) + { + top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + __m128 pad_value = __lsx_vreplfr2vr_s(value); + padding_constant_pack4_lsx(bottom_blob, top_blob, 0, 0, left / 4, right / 4, pad_value); + + return 0; + } + } + + if (dims == 2) + { + int outw = w + left + right; + int outh = h * elempack + top + bottom; + + int out_elempack = outh % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (top % 4 == 0 && out_elempack == 4 && type == 0) + { + top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + __m128 pad_value = __lsx_vreplfr2vr_s(value); + padding_constant_pack4_lsx(bottom_blob, top_blob, top / 4, bottom / 4, left, right, pad_value); + + return 0; + } + } + + if (dims == 3) + { + int outw = w + left + right; + int outh = h + top + bottom; + int outc = channels * elempack + front + behind; + + int out_elempack = outc % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (front % 4 == 0 && out_elempack == 4 && !(outc != channels * elempack && type != 0)) + { + top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int front_ = front / elempack; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc / out_elempack; q++) + { + Mat borderm = top_blob.channel(q); + + __m128 pad_value = per_channel_pad_data_size ? 
(__m128)__lsx_vld((const float*)per_channel_pad_data + q * 4, 0) : __lsx_vreplfr2vr_s(value); + //Channel padding + if ((q - front_) < 0 || (q - front_) >= channels) + { + borderm.fill(pad_value); + } + else + { + const Mat m = bottom_blob.channel(q - front_); + if (type == 0) + padding_constant_pack4_lsx(m, borderm, top, bottom, left, right, pad_value); + if (type == 1) + padding_replicate_pack4_lsx(m, borderm, top, bottom, left, right); + if (type == 2) + padding_reflect_pack4_lsx(m, borderm, top, bottom, left, right); + } + } + + return 0; + } + } + + if (dims == 4) + { + int outw = w + left + right; + int outh = h + top + bottom; + int outd = d + front + behind; + + if (type == 0) + { + top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + __m128 pad_value = per_channel_pad_data_size ? (__m128)__lsx_vld((const float*)per_channel_pad_data + q * 4, 0) : __lsx_vreplfr2vr_s(value); + + for (int z = 0; z < outd; z++) + { + Mat borderm = top_blob.channel(q).depth(z); + + // depth padding + if ((z - front) < 0 || (z - front) >= d) + { + borderm.fill(pad_value); + } + else + { + const Mat m = bottom_blob.channel(q).depth(z - front); + padding_constant_pack4_lsx(m, borderm, top, bottom, left, right, pad_value); + } + } + } + + return 0; + } + } + } +#endif // __loongarch_sx + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + } + + Mat top_blob_unpacked; + int ret = Padding::forward(bottom_blob_unpacked, top_blob_unpacked, opt); + if (ret != 0) + return ret; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = top_blob_unpacked.c % 4 == 0 ? 4 : 1; + } +#endif + + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + + return 0; +} + +int Padding_loongarch::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int d = bottom_blob.d; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 8) + { + if (dims == 1) + { + int outw = w * elempack + left + right; + + int out_elempack = outw % 8 == 0 ? 8 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (left % 8 == 0 && out_elempack == 8 && type == 0) + { + top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int64_t v8 = (int64_t)value; + int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56); + padding_constant_pack8_int8_lsx(bottom_blob, top_blob, 0, 0, left / 8, right / 8, pad_value); + + return 0; + } + } + + if (dims == 2) + { + int outw = w + left + right; + int outh = h * elempack + top + bottom; + + int out_elempack = outh % 8 == 0 ? 
8 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (top % 8 == 0 && out_elempack == 8 && type == 0) + { + top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int64_t v8 = (int64_t)value; + int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56); + padding_constant_pack8_int8_lsx(bottom_blob, top_blob, top / 8, bottom / 8, left, right, pad_value); + + return 0; + } + } + + if (dims == 3) + { + int outw = w + left + right; + int outh = h + top + bottom; + int outc = channels * elempack + front + behind; + + int out_elempack = outc % 8 == 0 ? 8 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (front % 8 == 0 && out_elempack == 8 && !(outc != channels * elempack && type != 0)) + { + top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int front_ = front / elempack; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc / out_elempack; q++) + { + Mat borderm = top_blob.channel(q); + + // TODO perchannel + // int64_t pad_value = per_channel_pad_data_size ? vld1_s8(per_channel_pad_data + q * 8) : vdup_n_s8((signed char)value); + int64_t v8 = (int64_t)value; + int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56); + + //Channel padding + if ((q - front_) < 0 || (q - front_) >= channels) + { + borderm.fill(pad_value); + } + else + { + const Mat m = bottom_blob.channel(q - front_); + if (type == 0) + padding_constant_pack8_int8_lsx(m, borderm, top, bottom, left, right, pad_value); + if (type == 1) + padding_replicate_pack8_int8_lsx(m, borderm, top, bottom, left, right); + if (type == 2) + padding_reflect_pack8_int8_lsx(m, borderm, top, bottom, left, right); + } + } + + return 0; + } + } + + if (dims == 4) + { + int outw = w + left + right; + int outh = h + top + bottom; + int outd = d + front + behind; + + if (type == 0) + { + top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + // TODO perchannel + // int64_t pad_value = per_channel_pad_data_size ? vld1_s8(per_channel_pad_data + q * 8) : vdup_n_s8((signed char)value); + int64_t v8 = (int64_t)value; + int64_t pad_value = v8 | (v8 << 8) | (v8 << 16) | (v8 << 24) | (v8 << 32) | (v8 << 40) | (v8 << 48) | (v8 << 56); + + for (int z = 0; z < outd; z++) + { + Mat borderm = top_blob.channel(q).depth(z); + + // depth padding + if ((z - front) < 0 || (z - front) >= d) + { + borderm.fill(pad_value); + } + else + { + const Mat m = bottom_blob.channel(q).depth(z - front); + padding_constant_pack8_int8_lsx(m, borderm, top, bottom, left, right, pad_value); + } + } + } + + return 0; + } + } + } +#endif // __loongarch_sx + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack != 1) + { + Option opt_pack1 = opt; + opt_pack1.blob_allocator = opt.workspace_allocator; + + convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + } + + Mat top_blob_unpacked; + int ret = Padding::forward(bottom_blob_unpacked, top_blob_unpacked, opt); + if (ret != 0) + return ret; + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + { + out_elempack = top_blob_unpacked.c % 8 == 0 ? 
8 : 1; + } +#endif + + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/padding_loongarch.h b/src/layer/loongarch/padding_loongarch.h new file mode 100644 index 000000000000..137fbc4459ec --- /dev/null +++ b/src/layer/loongarch/padding_loongarch.h @@ -0,0 +1,35 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_PADDING_LOONGARCH_H +#define LAYER_PADDING_LOONGARCH_H + +#include "padding.h" + +namespace ncnn { + +class Padding_loongarch : virtual public Padding +{ +public: + Padding_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_PADDING_LOONGARCH_H diff --git a/src/layer/loongarch/padding_pack4.h b/src/layer/loongarch/padding_pack4.h new file mode 100644 index 000000000000..d040ce778b58 --- /dev/null +++ b/src/layer/loongarch/padding_pack4.h @@ -0,0 +1,213 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
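+// These helpers treat every element as a group of 4 floats (elempack = 4), so all pointer
+// arithmetic below advances in steps of 4 and each __lsx_vst writes one whole group.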
+ +static void padding_constant_pack4_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right, __m128 v) +{ + const float* ptr = src; + float* outptr = dst; + int top_size = top * dst.w; + int bottom_size = bottom * dst.w; + + // fill top + for (int y = 0; y < top_size; y++) + { + __lsx_vst(v, outptr, 0); + outptr += 4; + } + // fill center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + __lsx_vst(v, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + __builtin_prefetch(ptr + 32); + __lsx_vst(__lsx_vld(ptr, 0), outptr, 0); + ptr += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __lsx_vst(v, outptr, 0); + outptr += 4; + } + } + // fill top + for (int y = 0; y < bottom_size; y++) + { + __lsx_vst(v, outptr, 0); + outptr += 4; + } +} + +static void padding_replicate_pack4_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right) +{ + const float* ptr = src; + float* outptr = dst; + + // fill top + for (int y = 0; y < top; y++) + { + const float* ptr0 = ptr; + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + for (int x = 0; x < left; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + _p = (__m128)__lsx_vld(ptr0, 0); + __lsx_vst(_p, outptr, 0); + ptr0 += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + } + // fill center + for (int y = 0; y < src.h; y++) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + for (int x = 0; x < left; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + _p = (__m128)__lsx_vld(ptr, 0); + __lsx_vst(_p, outptr, 0); + ptr += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + } + // fill bottom + ptr -= src.w * 4; + for (int y = 0; y < bottom; y++) + { + const float* ptr0 = ptr; + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + for (int x = 0; x < left; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + _p = (__m128)__lsx_vld(ptr0, 0); + __lsx_vst(_p, outptr, 0); + ptr0 += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + } +} + +static void padding_reflect_pack4_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right) +{ + const float* ptr = src; + float* outptr = dst; + + // fill top + ptr += top * src.w * 4; + for (int y = 0; y < top; y++) + { + const float* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0 + (left - x) * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __lsx_vst(_p, outptr, 0); + ptr0 += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0 - 8 - x * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + ptr -= src.w * 4; + } + // fill center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr + (left - x) * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __lsx_vst(_p, outptr, 0); + ptr += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr - 8 - x * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + } + // fill bottom + ptr -= 2 * src.w * 4; + for (int y = 0; y < bottom; y++) + { + const float* ptr0 = 
ptr; + for (int x = 0; x < left; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0 + (left - x) * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + for (int x = 0; x < src.w; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __lsx_vst(_p, outptr, 0); + ptr0 += 4; + outptr += 4; + } + for (int x = 0; x < right; x++) + { + __m128 _p = (__m128)__lsx_vld(ptr0 - 8 - x * 4, 0); + __lsx_vst(_p, outptr, 0); + outptr += 4; + } + ptr -= src.w * 4; + } +} diff --git a/src/layer/loongarch/padding_pack8_int8.h b/src/layer/loongarch/padding_pack8_int8.h new file mode 100644 index 000000000000..4c6586c6ae27 --- /dev/null +++ b/src/layer/loongarch/padding_pack8_int8.h @@ -0,0 +1,171 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void padding_constant_pack8_int8_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int64_t _v) +{ + const int64_t* ptr = src; + int64_t* outptr = dst; + + // fill top + for (int y = 0; y < top; y++) + { + for (int x = 0; x < dst.w; x++) + { + *outptr++ = _v; + } + } + // fill center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + *outptr++ = _v; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = _v; + } + } + // fill bottom + for (int y = 0; y < bottom; y++) + { + for (int x = 0; x < dst.w; x++) + { + *outptr++ = _v; + } + } +} + +static void padding_replicate_pack8_int8_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right) +{ + const int64_t* ptr = src; + int64_t* outptr = dst; + + // fill top + for (int y = 0; y < top; y++) + { + const int64_t* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + *outptr++ = *ptr0; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr0++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr0[-1]; + } + } + // fill center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + *outptr++ = *ptr; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr[-1]; + } + } + // fill bottom + ptr -= src.w; + for (int y = 0; y < bottom; y++) + { + const int64_t* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + *outptr++ = *ptr0; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr0++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr0[-1]; + } + } +} + +static void padding_reflect_pack8_int8_lsx(const Mat& src, Mat& dst, int top, int bottom, int left, int right) +{ + const int64_t* ptr = src; + int64_t* outptr = dst; + + // fill top + ptr += top * src.w; + for (int y = 0; y < top; y++) + { + const int64_t* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + *outptr++ = ptr0[left - x]; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr0++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr0[-2 - x]; + } + ptr -= src.w; + } + // fill 
center + for (int y = 0; y < src.h; y++) + { + for (int x = 0; x < left; x++) + { + *outptr++ = ptr[left - x]; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr[-2 - x]; + } + } + // fill bottom + ptr -= 2 * src.w; + for (int y = 0; y < bottom; y++) + { + const int64_t* ptr0 = ptr; + for (int x = 0; x < left; x++) + { + *outptr++ = ptr0[left - x]; + } + for (int x = 0; x < src.w; x++) + { + *outptr++ = *ptr0++; + } + for (int x = 0; x < right; x++) + { + *outptr++ = ptr0[-2 - x]; + } + ptr -= src.w; + } +} diff --git a/src/layer/loongarch/pooling_loongarch.cpp b/src/layer/loongarch/pooling_loongarch.cpp new file mode 100644 index 000000000000..9d9889713244 --- /dev/null +++ b/src/layer/loongarch/pooling_loongarch.cpp @@ -0,0 +1,291 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pooling_loongarch.h" + +#include + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Pooling_loongarch::Pooling_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Pooling_loongarch::create_pipeline(const Option& /*opt*/) +{ + if (adaptive_pooling) + { + support_packing = false; + + support_bf16_storage = false; + support_fp16_storage = false; + support_int8_storage = false; + support_tensor_storage = false; + } + return 0; +} + +int Pooling_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + if (adaptive_pooling) + { + return Pooling::forward(bottom_blob, top_blob, opt); + } + + // max value in NxN window + // avg value in NxN window + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + // NCNN_LOGE("Pooling input %d x %d pad = %d %d %d %d ksize=%d %d stride=%d %d", w, h, pad_left, pad_right, pad_top, pad_bottom, kernel_w, kernel_h, stride_w, stride_h); + + if (elempack == 4) + { + if (global_pooling) + { + top_blob.create(channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int size = w * h; + + if (pooling_type == PoolMethod_MAX) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + + __m128 _max = (__m128)__lsx_vld(ptr, 0); + for (int i = 0; i < size; i++) + { + __m128 _val = (__m128)__lsx_vld(ptr, 0); + _max = __lsx_vfmax_s(_max, _val); + ptr += 4; + } + + float* outptr = top_blob; + __lsx_vst(_max, outptr + q * 4, 0); + } + } + else if (pooling_type == PoolMethod_AVE) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + for (int i = 0; i < 
size; i++) + { + __m128 _val = (__m128)__lsx_vld(ptr, 0); + _sum = __lsx_vfadd_s(_sum, _val); + ptr += 4; + } + + __m128 _avg = __lsx_vfmul_s(_sum, __lsx_vreplfr2vr_s(1.f / size)); + + float* outptr = top_blob; + __lsx_vst(_avg, outptr + q * 4, 0); + } + } + + return 0; + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + + int outw = (w - kernel_w) / stride_w + 1; + int outh = (h - kernel_h) / stride_h + 1; + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w - kernel_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2++; + } + p2 += gap; + } + } + + if (pooling_type == PoolMethod_MAX) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + float* outptr = top_blob.channel(q); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + __m128 _max = (__m128)__lsx_vld(sptr, 0); + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0); + _max = __lsx_vfmax_s(_max, _val); + } + + __lsx_vst(_max, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + else if (pooling_type == PoolMethod_AVE) + { + if (avgpool_count_include_pad == 0) + { + int wtailpad = 0; + int htailpad = 0; + + if (pad_mode == 0) // full padding + { + wtailpad = bottom_blob_bordered.w - bottom_blob.w - pad_left - pad_right; + htailpad = bottom_blob_bordered.h - bottom_blob.h - pad_top - pad_bottom; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + float* outptr = top_blob.channel(q); + + for (int i = 0; i < outh; i++) + { + int sy0 = i * stride_h; + + for (int j = 0; j < outw; j++) + { + int sx0 = j * stride_w; + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + int area = 0; + + for (int ki = 0; ki < kernel_h; ki++) + { + int sy = sy0 + ki; + + if (sy < pad_top) + continue; + + if (sy >= h - pad_bottom - htailpad) + break; + + for (int kj = 0; kj < kernel_w; kj++) + { + int sx = sx0 + kj; + + if (sx < pad_left) + continue; + + if (sx >= w - pad_right - wtailpad) + break; + + __m128 _val = (__m128)__lsx_vld(m.row(sy) + sx * 4, 0); + _sum = __lsx_vfadd_s(_sum, _val); + area += 1; + } + } + + __m128 _avg = __lsx_vfmul_s(_sum, __lsx_vreplfr2vr_s(1.f / area)); + __lsx_vst(_avg, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + else // if (avgpool_count_include_pad == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const Mat m = bottom_blob_bordered.channel(q); + float* outptr = top_blob.channel(q); + + const float inv_maxk = 1.f / maxk; + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + const float* sptr = m.row(i * stride_h) + j * stride_w * 4; + + __m128 _sum = (__m128)__lsx_vreplgr2vr_w(0); + + for (int k = 0; k < maxk; k++) + { + __m128 _val = (__m128)__lsx_vld(sptr + space_ofs[k] * 4, 0); + _sum = __lsx_vfadd_s(_sum, _val); + } + + __m128 _avg = 
__lsx_vfmul_s(_sum, __lsx_vreplfr2vr_s(inv_maxk)); + __lsx_vst(_avg, outptr + j * 4, 0); + } + + outptr += outw * 4; + } + } + } + } + + return 0; + } +#endif // __loongarch_sx + + return Pooling::forward(bottom_blob, top_blob, opt); +} + +} // namespace ncnn diff --git a/src/layer/loongarch/pooling_loongarch.h b/src/layer/loongarch/pooling_loongarch.h new file mode 100644 index 000000000000..97e0c9ff2f7e --- /dev/null +++ b/src/layer/loongarch/pooling_loongarch.h @@ -0,0 +1,33 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_POOLING_LOONGARCH_H +#define LAYER_POOLING_LOONGARCH_H + +#include "pooling.h" + +namespace ncnn { + +class Pooling_loongarch : virtual public Pooling +{ +public: + Pooling_loongarch(); + + virtual int create_pipeline(const Option& opt); + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_POOLING_LOONGARCH_H diff --git a/src/layer/loongarch/prelu_loongarch.cpp b/src/layer/loongarch/prelu_loongarch.cpp new file mode 100644 index 000000000000..27cc0bc9d446 --- /dev/null +++ b/src/layer/loongarch/prelu_loongarch.cpp @@ -0,0 +1,193 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
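+// PReLU_loongarch computes f(x) = x for x > 0 and slope * x otherwise; the LSX path builds an
+// "x <= 0" mask with __lsx_vfcmp_cle_s and blends x and slope * x with __lsx_vbitsel_v.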
+ +#include "prelu_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +PReLU_loongarch::PReLU_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int PReLU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + int elempack = bottom_top_blob.elempack; + + if (dims == 1) + { + int w = bottom_top_blob.w * elempack; + +#if __loongarch_sx + int nn_w = w / 4; + int remain_w_start = nn_w * 4; +#else + int remain_w_start = 0; +#endif // __loongarch_sx + + float* ptr = bottom_top_blob; + + if (num_slope > 1) + { + const float* slope = slope_data; + +#if __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < nn_w; i++) + { + float* ptr0 = ptr + i * 4; + + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (__m128)__lsx_vld(slope + i * 4, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr0, 0); + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_w_start; i < w; i++) + { + float v = ptr[i]; + if (v < 0.f) + ptr[i] = v * slope[i]; + } + } + else + { + const float slope = slope_data[0]; + +#if __loongarch_sx + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < nn_w; i++) + { + float* ptr0 = ptr + i * 4; + + __m128 _p = (__m128)__lsx_vld(ptr0, 0); + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr0, 0); + } +#endif // __loongarch_sx + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_w_start; i < w; i++) + { + float v = ptr[i]; + if (v < 0.f) + ptr[i] = v * slope; + } + } + } + + if (dims == 2) + { + int w = bottom_top_blob.w * elempack; + int h = bottom_top_blob.h; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + + const float slope = num_slope > 1 ? slope_data[i] : slope_data[0]; + + int j = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (elempack == 4 && num_slope > 1) ? (__m128)__lsx_vld((const float*)slope_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(slope); + + for (; j + 3 < w; j += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; j < w; j++) + { + float v = *ptr; + if (v < 0.f) + *ptr = v * slope; + + ptr++; + } + } + } + + if (dims == 3) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h * elempack; + + const float* slope_data_ptr = slope_data; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float slope = num_slope > 1 ? 
slope_data_ptr[q] : slope_data_ptr[0]; + + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (elempack == 4 && num_slope > 1) ? (__m128)__lsx_vld((const float*)slope_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(slope); + + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < 0) + *ptr *= slope; + + ptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/prelu_loongarch.h b/src/layer/loongarch/prelu_loongarch.h new file mode 100644 index 000000000000..97031bb06016 --- /dev/null +++ b/src/layer/loongarch/prelu_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_PRELU_LOONGARCH_H +#define LAYER_PRELU_LOONGARCH_H + +#include "prelu.h" + +namespace ncnn { + +class PReLU_loongarch : virtual public PReLU +{ +public: + PReLU_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_PRELU_LOONGARCH_H diff --git a/src/layer/loongarch/quantize_loongarch.cpp b/src/layer/loongarch/quantize_loongarch.cpp new file mode 100644 index 000000000000..657ff2d06bf5 --- /dev/null +++ b/src/layer/loongarch/quantize_loongarch.cpp @@ -0,0 +1,494 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "quantize_loongarch.h" + +#include + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +Quantize_loongarch::Quantize_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Quantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 
8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale); + outptr[1] = float2int8(ptr0[1] * scale); + outptr[2] = float2int8(ptr0[2] * scale); + outptr[3] = float2int8(ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]); + outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]); + outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]); + outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(ptr0 + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); + __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); + _vlow = __lsx_vfmul_s(_vlow, _scale); + _vhigh = __lsx_vfmul_s(_vhigh, _scale); + *((int64_t*)outptr) = float2int8(_vlow, _vhigh); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + __m128 _scale0 = (__m128)__lsx_vld((const float*)scale_data + i * 8, 0); + __m128 _scale1 = (__m128)__lsx_vld((const float*)scale_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(ptr0 + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); + __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); + _vlow = __lsx_vfmul_s(_vlow, _scale0); + _vhigh = __lsx_vfmul_s(_vhigh, _scale1); + *((int64_t*)outptr) = float2int8(_vlow, _vhigh); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = 
float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + const float s0 = scale_data[i * 4]; + const float s1 = scale_data[i * 4 + 1]; + const float s2 = scale_data[i * 4 + 2]; + const float s3 = scale_data[i * 4 + 3]; + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + int i = 0; + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(ptr0 + 32); + __builtin_prefetch(ptr1 + 32); + __m128 _v0 = (__m128)__lsx_vld(ptr0, 0); + __m128 _v1 = (__m128)__lsx_vld(ptr0 + 4, 0); + __m128 _v2 = (__m128)__lsx_vld(ptr1, 0); + __m128 _v3 = (__m128)__lsx_vld(ptr1 + 4, 0); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + _v2 = __lsx_vfmul_s(_v2, _scale); + _v3 = __lsx_vfmul_s(_v3, _scale); + *((int64_t*)outptr) = float2int8(_v0, _v2); + *((int64_t*)(outptr + 8)) = float2int8(_v1, _v3); + + ptr0 += 8; + ptr1 += 8; + outptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(ptr0 + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); + __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); + _vlow = __lsx_vfmul_s(_vlow, _scale); + _vhigh = __lsx_vfmul_s(_vhigh, _scale); + *((int64_t*)outptr) = float2int8(_vlow, _vhigh); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + __m128 _scale0 = (__m128)__lsx_vld((const float*)scale_data + q * 8, 0); + __m128 _scale1 = (__m128)__lsx_vld((const float*)scale_data + q * 8 + 4, 0); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(ptr0 + 16); + __builtin_prefetch(ptr1 + 16); + __m128 _vlow = (__m128)__lsx_vld(ptr0, 0); + __m128 _vhigh = (__m128)__lsx_vld(ptr1, 0); + _vlow = __lsx_vfmul_s(_vlow, _scale0); + _vhigh = __lsx_vfmul_s(_vhigh, _scale1); + *((int64_t*)outptr) = float2int8(_vlow, _vhigh); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp 
parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + const float s0 = scale_data[q * 4]; + const float s1 = scale_data[q * 4 + 1]; + const float s2 = scale_data[q * 4 + 2]; + const float s3 = scale_data[q * 4 + 3]; + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + return 0; + } +#endif // __loongarch_sx + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const float* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale_data[i]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + for (int j = 0; j < w; j++) + { + *outptr0++ = float2int8(*ptr0++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + + int i = 0; +#if __loongarch_sx + __m128 _scale = (__m128)__lsx_vreplfr2vr_s(scale); + for (; i + 15 < size; i += 16) + { + __builtin_prefetch(ptr + 64); + __m128 _v0 = (__m128)__lsx_vld(ptr, 0); + __m128 _v1 = (__m128)__lsx_vld(ptr + 4, 0); + __m128 _v2 = (__m128)__lsx_vld(ptr + 8, 0); + __m128 _v3 = (__m128)__lsx_vld(ptr + 12, 0); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + _v2 = __lsx_vfmul_s(_v2, _scale); + _v3 = __lsx_vfmul_s(_v3, _scale); + *((int64_t*)outptr) = float2int8(_v0, _v1); + *((int64_t*)(outptr + 8)) = float2int8(_v2, _v3); + + ptr += 16; + outptr += 16; + } + for (; i + 7 < size; i += 8) + { + __builtin_prefetch(ptr + 32); + __m128 _v0 = (__m128)__lsx_vld(ptr, 0); + __m128 _v1 = (__m128)__lsx_vld(ptr + 4, 0); + _v0 = __lsx_vfmul_s(_v0, _scale); + _v1 = __lsx_vfmul_s(_v1, _scale); + *((int64_t*)outptr) = float2int8(_v0, _v1); + + ptr += 8; + outptr += 8; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *outptr++ = float2int8(*ptr++ * scale); + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/quantize_loongarch.h b/src/layer/loongarch/quantize_loongarch.h new file mode 100644 index 000000000000..cae04aab171f --- /dev/null +++ b/src/layer/loongarch/quantize_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_QUANTIZE_LOONGARCH_H +#define LAYER_QUANTIZE_LOONGARCH_H + +#include "quantize.h" + +namespace ncnn { + +class Quantize_loongarch : virtual public Quantize +{ +public: + Quantize_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_QUANTIZE_LOONGARCH_H diff --git a/src/layer/loongarch/relu_loongarch.cpp b/src/layer/loongarch/relu_loongarch.cpp new file mode 100644 index 000000000000..eb478d3ae9b1 --- /dev/null +++ b/src/layer/loongarch/relu_loongarch.cpp @@ -0,0 +1,98 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
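+
+// ReLU with LSX: when slope == 0 each group of 4 floats is clamped against a
+// zero vector with __lsx_vfmax_s; when slope != 0 (leaky ReLU) the same
+// compare-and-blend sequence as the PReLU layer is used, scaling only the
+// non-positive lanes. Remaining elements fall back to the scalar loops.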
+ +#include "relu_loongarch.h" + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_usability.h" + +namespace ncnn { + +ReLU_loongarch::ReLU_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int ReLU_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + if (slope == 0.f) + { + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = __lsx_vfmax_s(_p, _zero); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < 0) + *ptr = 0; + ptr++; + } + } + else + { + int i = 0; +#if __loongarch_sx + __m128 _zero = (__m128)__lsx_vreplgr2vr_w(0); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128i _lemask = __lsx_vfcmp_cle_s(_p, _zero); + __m128 _ps = __lsx_vfmul_s(_p, _slope); + _p = (__m128)__lsx_vbitsel_v((__m128i)_p, (__m128i)_ps, (__m128i)_lemask); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + if (*ptr < 0) + *ptr *= slope; + ptr++; + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/relu_loongarch.h b/src/layer/loongarch/relu_loongarch.h new file mode 100644 index 000000000000..445c6e8febca --- /dev/null +++ b/src/layer/loongarch/relu_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_RELU_LOONGARCH_H +#define LAYER_RELU_LOONGARCH_H + +#include "relu.h" + +namespace ncnn { + +class ReLU_loongarch : virtual public ReLU +{ +public: + ReLU_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_RELU_LOONGARCH_H diff --git a/src/layer/loongarch/requantize_leakyrelu_pack4.h b/src/layer/loongarch/requantize_leakyrelu_pack4.h new file mode 100644 index 000000000000..d6b499426609 --- /dev/null +++ b/src/layer/loongarch/requantize_leakyrelu_pack4.h @@ -0,0 +1,271 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void requantize_leakyrelu_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = top_blob.c; + int out_elempack = top_blob.elempack; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(leakyrelu(v * scale_in, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out), slope) + + // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr0 + 64); + __builtin_prefetch(intptr1 + 64); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); + __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); + __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); + _v00 = __lsx_vfmul_s(_v00, _scale0); + _v01 = __lsx_vfmul_s(_v01, _scale0); + _v02 = __lsx_vfmul_s(_v02, _scale0); + _v03 = __lsx_vfmul_s(_v03, _scale0); + _v10 = __lsx_vfmul_s(_v10, _scale1); + _v11 = __lsx_vfmul_s(_v11, _scale1); + _v12 = __lsx_vfmul_s(_v12, _scale1); + _v13 = __lsx_vfmul_s(_v13, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); + *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v02, _v12, _slope); + *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v03, _v13, _slope); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr0 + 64); + __builtin_prefetch(intptr1 + 64); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); + __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); + __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); + _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); + _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); + _v02 = __lsx_vfmadd_s(_scale0, _v02, _bias0); + _v03 = __lsx_vfmadd_s(_scale0, _v03, _bias0); + _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); + _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); + _v12 = __lsx_vfmadd_s(_scale1, _v12, _bias1); + _v13 = __lsx_vfmadd_s(_scale1, _v13, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); + *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v02, _v12, _slope); + *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v03, _v13, _slope); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr0 + 32); + __builtin_prefetch(intptr1 + 32); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); + _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); + _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); + _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v00, _v10, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v01, _v11, _slope); + + intptr0 += 8; + intptr1 += 8; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + signed char* vp; + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + + __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __m128i v = float2int8leakyrelu(_v, _slope); + vp = (signed char*)&v; + ptr0[0] = vp[0]; + ptr1[0] = vp[1]; + ptr2[0] = vp[2]; + ptr3[0] = vp[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + signed char* vp; + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + + __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); + _bias = __lsx_vfmul_s(_bias, _scale_out); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __m128i v = float2int8leakyrelu(_v, _slope); + vp = (signed char*)&v; + ptr0[0] = vp[0]; + ptr1[0] = vp[1]; + ptr2[0] = vp[2]; + ptr3[0] = vp[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } +} diff --git a/src/layer/loongarch/requantize_leakyrelu_pack8.h b/src/layer/loongarch/requantize_leakyrelu_pack8.h new file mode 100644 index 000000000000..a2c4faed4f2a --- /dev/null +++ b/src/layer/loongarch/requantize_leakyrelu_pack8.h @@ -0,0 +1,188 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
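+
+// Fused requantize + leaky ReLU for elempack=8 blobs. Per element this is the
+// scalar computation (scale_in, scale_out and bias being the per-channel or
+// shared values):
+//   float v = (float)(*intptr) * scale_in + bias;
+//   v = v > 0.f ? v : v * slope;
+//   *ptr = float2int8(v * scale_out);
+// Since leaky ReLU commutes with multiplication by a positive scale_out, the
+// two multiplies are folded ahead of the loop into scale = scale_in * scale_out
+// and bias = bias * scale_out, so the inner loops need a single __lsx_vfmadd_s
+// (or __lsx_vfmul_s when there is no bias) before the fused int8 leakyrelu
+// conversion. Each packed channel group is processed as two 4-lane vectors.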
+ +static void requantize_leakyrelu_pack8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(leakyrelu(v * scale_in, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out), slope) + + // int8(leakyrelu(v * scale_in + bias, slope) * scale_out) + // int8_leakyrelu(v * (scale_in * scale_out) + (bias * scale_out), slope) + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 128); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); + __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); + __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); + __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + _v4 = __lsx_vfmul_s(_v4, _scale0); + _v5 = __lsx_vfmul_s(_v5, _scale1); + _v6 = __lsx_vfmul_s(_v6, _scale0); + _v7 = __lsx_vfmul_s(_v7, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); + *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v4, _v5, _slope); + *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v6, _v7, _slope); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = 
__lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); + __m128 _slope = (__m128)__lsx_vreplfr2vr_s(slope); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 128); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); + __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); + __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); + __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + _v4 = __lsx_vfmadd_s(_scale0, _v4, _bias0); + _v5 = __lsx_vfmadd_s(_scale1, _v5, _bias1); + _v6 = __lsx_vfmadd_s(_scale0, _v6, _bias0); + _v7 = __lsx_vfmadd_s(_scale1, _v7, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); + *((int64_t*)(ptr + 16)) = float2int8leakyrelu(_v4, _v5, _slope); + *((int64_t*)(ptr + 24)) = float2int8leakyrelu(_v6, _v7, _slope); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + *((int64_t*)(ptr + 8)) = float2int8leakyrelu(_v2, _v3, _slope); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + 
__builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + *((int64_t*)ptr) = float2int8leakyrelu(_v0, _v1, _slope); + + intptr += 8; + ptr += 8; + } + } + } +} diff --git a/src/layer/loongarch/requantize_loongarch.cpp b/src/layer/loongarch/requantize_loongarch.cpp new file mode 100644 index 000000000000..556d20de4f6d --- /dev/null +++ b/src/layer/loongarch/requantize_loongarch.cpp @@ -0,0 +1,1386 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "requantize_loongarch.h" + +#include + +#if __loongarch_sx +#include +#endif // __loongarch_sx + +#include "loongarch_activation.h" +#include "loongarch_usability.h" + +namespace ncnn { + +#if __loongarch_sx +#include "requantize_leakyrelu_pack4.h" +#include "requantize_leakyrelu_pack8.h" +#include "requantize_relu_pack4.h" +#include "requantize_relu_pack8.h" +#endif // __loongarch_sx + +Requantize_loongarch::Requantize_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int Requantize_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __loongarch_sx + if (elempack == 8) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in); + _v1 = __lsx_vfmul_s(_v1, _scale_in); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias); + _v0 = 
activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in); + _v1 = __lsx_vfmul_s(_v1, _scale_in); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out); + _v1 = __lsx_vfmul_s(_v1, _scale_out); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (activation_type == 1) + { + requantize_relu_pack8_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + return 0; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + requantize_leakyrelu_pack8_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + return 0; + } + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + __m128 _scale_in = (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + 
i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + __m128 _scale_out = (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 
_bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else if (bias_data_size == 1) + { + __m128 _bias = (__m128)__lsx_vreplfr2vr_s(bias_data[0]); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + __m128 _scale_in = (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _bias = (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr[0] = v[0]; + ptr[1] = v[1]; + ptr[2] = v[2]; + ptr[3] = v[3]; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 8 + 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr0[0] = v[0]; + ptr1[0] = v[1]; + ptr2[0] = v[2]; + ptr3[0] = v[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + i * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + i * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + i * 4, 0); + + for (int j = 0; j < w; j++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr0[0] = v[0]; + ptr1[0] = v[1]; + ptr2[0] = v[2]; + ptr3[0] = v[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (activation_type == 1) + { + requantize_relu_pack4_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + return 0; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + requantize_leakyrelu_pack4_lsx(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + return 0; + } + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale_in0); + _v1 = __lsx_vfmul_s(_v1, _scale_in1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmadd_s(_scale_in0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale_in1, _v1, _bias1); + _v0 = activation_ps(_v0, activation_type, activation_params); + _v1 = activation_ps(_v1, activation_type, activation_params); + _v0 = __lsx_vfmul_s(_v0, _scale_out0); + _v1 = __lsx_vfmul_s(_v1, _scale_out1); + *((int64_t*)ptr) = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale_in); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr0[0] = v[0]; + ptr1[0] = v[1]; + ptr2[0] = v[2]; + ptr3[0] = v[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + + for (int i = 0; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale_in, _v, _bias); + _v = activation_ps(_v, activation_type, activation_params); + _v = __lsx_vfmul_s(_v, _scale_out); + v16i8 v = (v16i8)float2int8(_v); + ptr0[0] = v[0]; + ptr1[0] = v[1]; + ptr2[0] = v[2]; + ptr3[0] = v[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } + } + + return 0; + } +#endif // __loongarch_sx + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + signed char* ptr = top_blob; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + const float scale_in = scale_in_data[0]; + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + const float scale_in = scale_in_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for 
(int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + const float bias = bias_data_size == 1 ? 
bias_data[0] : bias_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in + bias; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + + for (int i = 0; i < size; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + for (int i = 0; i < size; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/requantize_loongarch.h b/src/layer/loongarch/requantize_loongarch.h new file mode 100644 index 000000000000..8175989959eb --- /dev/null +++ b/src/layer/loongarch/requantize_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_REQUANTIZE_LOONGARCH_H +#define LAYER_REQUANTIZE_LOONGARCH_H + +#include "requantize.h" + +namespace ncnn { + +class Requantize_loongarch : virtual public Requantize +{ +public: + Requantize_loongarch(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_REQUANTIZE_LOONGARCH_H diff --git a/src/layer/loongarch/requantize_relu_pack4.h b/src/layer/loongarch/requantize_relu_pack4.h new file mode 100644 index 000000000000..2fba8dfc2e48 --- /dev/null +++ b/src/layer/loongarch/requantize_relu_pack4.h @@ -0,0 +1,267 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void requantize_relu_pack4_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = top_blob.c; + int out_elempack = top_blob.elempack; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr0 + 64); + __builtin_prefetch(intptr1 + 64); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); + __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); + __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); + _v00 = __lsx_vfmul_s(_v00, _scale0); + _v01 = __lsx_vfmul_s(_v01, _scale0); + _v02 = __lsx_vfmul_s(_v02, _scale0); + _v03 = __lsx_vfmul_s(_v03, _scale0); + _v10 = __lsx_vfmul_s(_v10, _scale1); + _v11 = __lsx_vfmul_s(_v11, _scale1); + _v12 = __lsx_vfmul_s(_v12, _scale1); + _v13 = __lsx_vfmul_s(_v13, _scale1); + *((int64_t*)ptr) = float2int8relu(_v00, _v10); + *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); + *((int64_t*)(ptr + 16)) = float2int8relu(_v02, _v12); + *((int64_t*)(ptr + 24)) = float2int8relu(_v03, _v13); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr0 + 64); + __builtin_prefetch(intptr1 + 64); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v02 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 8, 0)); + __m128 _v03 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 12, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + __m128 _v12 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 8, 0)); + __m128 _v13 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 12, 0)); + _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); + _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); + _v02 = __lsx_vfmadd_s(_scale0, _v02, _bias0); + _v03 = __lsx_vfmadd_s(_scale0, _v03, _bias0); + _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); + _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); + _v12 = __lsx_vfmadd_s(_scale1, _v12, _bias1); + _v13 = __lsx_vfmadd_s(_scale1, _v13, _bias1); + *((int64_t*)ptr) = float2int8relu(_v00, _v10); + *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); + *((int64_t*)(ptr + 16)) = float2int8relu(_v02, _v12); + *((int64_t*)(ptr + 24)) = float2int8relu(_v03, _v13); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr0 + 32); + __builtin_prefetch(intptr1 + 32); + __m128 _v00 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v01 = __lsx_vffint_s_w(__lsx_vld(intptr0 + 4, 0)); + __m128 _v10 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + __m128 _v11 = __lsx_vffint_s_w(__lsx_vld(intptr1 + 4, 0)); + _v00 = __lsx_vfmadd_s(_scale0, _v00, _bias0); + _v01 = __lsx_vfmadd_s(_scale0, _v01, _bias0); + _v10 = __lsx_vfmadd_s(_scale1, _v10, _bias1); + _v11 = __lsx_vfmadd_s(_scale1, _v11, _bias1); + *((int64_t*)ptr) = float2int8relu(_v00, _v10); + *((int64_t*)(ptr + 8)) = float2int8relu(_v01, _v11); + + intptr0 += 8; + intptr1 += 8; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr0 + 16); + __builtin_prefetch(intptr1 + 16); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr0, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr1, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + signed char* vp; + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? 
(__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + + __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmul_s(_v, _scale); + __m128i v = float2int8relu(_v); + vp = (signed char*)&v; + ptr0[0] = vp[0]; + ptr1[0] = vp[1]; + ptr2[0] = vp[2]; + ptr3[0] = vp[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + signed char* vp; + + __m128 _scale_in = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 4, 0); + __m128 _scale_out = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 4, 0); + __m128 _bias = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 4, 0); + + __m128 _scale = __lsx_vfmul_s(_scale_in, _scale_out); + _bias = __lsx_vfmul_s(_bias, _scale_out); + + int i = 0; + for (; i < size; i++) + { + __builtin_prefetch(intptr + 16); + __m128 _v = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + _v = __lsx_vfmadd_s(_scale, _v, _bias); + __m128i v = float2int8relu(_v); + vp = (signed char*)&v; + ptr0[0] = vp[0]; + ptr1[0] = vp[1]; + ptr2[0] = vp[2]; + ptr3[0] = vp[3]; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } +} diff --git a/src/layer/loongarch/requantize_relu_pack8.h b/src/layer/loongarch/requantize_relu_pack8.h new file mode 100644 index 000000000000..3d2a45b45d06 --- /dev/null +++ b/src/layer/loongarch/requantize_relu_pack8.h @@ -0,0 +1,186 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
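For reference, the scale/bias folding stated in the comments at the top of these helpers — int8(relu(v * scale_in + bias) * scale_out) becomes int8_relu(v * (scale_in * scale_out) + bias * scale_out) — works because scale_out is non-negative, so relu(x) * scale_out == relu(x * scale_out). The scalar sketch below is an editorial illustration of that fold, not part of the patch; float2int8relu_ref is a hypothetical stand-in for ncnn's round-and-saturate helper.

// Scalar sketch of the folded requantize + ReLU (illustration only, not part of the patch).
#include <cmath>

// hypothetical stand-in for float2int8relu: round to nearest, clamp to [0, 127]
static inline signed char float2int8relu_ref(float v)
{
    int q = (int)std::lround(v);
    if (q < 0) q = 0;
    if (q > 127) q = 127;
    return (signed char)q;
}

// naive:  int8(relu(v * scale_in + bias) * scale_out)
// folded: int8_relu(v * (scale_in * scale_out) + bias * scale_out)
static void requantize_relu_ref(const int* intptr, signed char* ptr, int size,
                                float scale_in, float scale_out, float bias)
{
    const float scale = scale_in * scale_out; // hoisted out of the loop, as in the LSX code
    const float bias_out = bias * scale_out;
    for (int i = 0; i < size; i++)
    {
        ptr[i] = float2int8relu_ref((float)intptr[i] * scale + bias_out);
    }
}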
+ +static void requantize_relu_pack8_lsx(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 128); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); + __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); + __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); + __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + _v4 = __lsx_vfmul_s(_v4, _scale0); + _v5 = __lsx_vfmul_s(_v5, _scale1); + _v6 = __lsx_vfmul_s(_v6, _scale0); + _v7 = __lsx_vfmul_s(_v7, _scale1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); + *((int64_t*)(ptr + 16)) = float2int8relu(_v4, _v5); + *((int64_t*)(ptr + 24)) = float2int8relu(_v6, _v7); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + _v2 = __lsx_vfmul_s(_v2, _scale0); + _v3 = __lsx_vfmul_s(_v3, _scale1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmul_s(_v0, _scale0); + _v1 = __lsx_vfmul_s(_v1, _scale1); + *((int64_t*)ptr) = 
float2int8relu(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + __m128 _scale_in0 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8, 0); + __m128 _scale_in1 = scale_in_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_in_data[0]) : (__m128)__lsx_vld((const float*)scale_in_data + q * 8 + 4, 0); + __m128 _scale_out0 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8, 0); + __m128 _scale_out1 = scale_out_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(scale_out_data[0]) : (__m128)__lsx_vld((const float*)scale_out_data + q * 8 + 4, 0); + __m128 _bias0 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8, 0); + __m128 _bias1 = bias_data_size == 1 ? (__m128)__lsx_vreplfr2vr_s(bias_data[0]) : (__m128)__lsx_vld((const float*)bias_data + q * 8 + 4, 0); + + __m128 _scale0 = __lsx_vfmul_s(_scale_in0, _scale_out0); + __m128 _scale1 = __lsx_vfmul_s(_scale_in1, _scale_out1); + _bias0 = __lsx_vfmul_s(_bias0, _scale_out0); + _bias1 = __lsx_vfmul_s(_bias1, _scale_out1); + + int i = 0; + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(intptr + 128); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + __m128 _v4 = __lsx_vffint_s_w(__lsx_vld(intptr + 16, 0)); + __m128 _v5 = __lsx_vffint_s_w(__lsx_vld(intptr + 20, 0)); + __m128 _v6 = __lsx_vffint_s_w(__lsx_vld(intptr + 24, 0)); + __m128 _v7 = __lsx_vffint_s_w(__lsx_vld(intptr + 28, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + _v4 = __lsx_vfmadd_s(_scale0, _v4, _bias0); + _v5 = __lsx_vfmadd_s(_scale1, _v5, _bias1); + _v6 = __lsx_vfmadd_s(_scale0, _v6, _bias0); + _v7 = __lsx_vfmadd_s(_scale1, _v7, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); + *((int64_t*)(ptr + 16)) = float2int8relu(_v4, _v5); + *((int64_t*)(ptr + 24)) = float2int8relu(_v6, _v7); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + __builtin_prefetch(intptr + 64); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + __m128 _v2 = __lsx_vffint_s_w(__lsx_vld(intptr + 8, 0)); + __m128 _v3 = __lsx_vffint_s_w(__lsx_vld(intptr + 12, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + _v2 = __lsx_vfmadd_s(_scale0, _v2, _bias0); + _v3 = __lsx_vfmadd_s(_scale1, _v3, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + *((int64_t*)(ptr + 8)) = float2int8relu(_v2, _v3); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + __builtin_prefetch(intptr + 32); + __m128 _v0 = __lsx_vffint_s_w(__lsx_vld(intptr, 0)); + __m128 _v1 = __lsx_vffint_s_w(__lsx_vld(intptr + 4, 0)); + _v0 = __lsx_vfmadd_s(_scale0, _v0, _bias0); + _v1 = __lsx_vfmadd_s(_scale1, _v1, _bias1); + *((int64_t*)ptr) = float2int8relu(_v0, _v1); + + intptr += 8; + ptr += 8; + } + } + } +} diff --git 
a/src/layer/loongarch/sigmoid_loongarch.cpp b/src/layer/loongarch/sigmoid_loongarch.cpp
new file mode 100644
index 000000000000..6d112804f269
--- /dev/null
+++ b/src/layer/loongarch/sigmoid_loongarch.cpp
@@ -0,0 +1,76 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "sigmoid_loongarch.h"
+
+#if __loongarch_sx
+#include <lsxintrin.h>
+#include "lsx_mathfun.h"
+#endif // __loongarch_sx
+
+#include "loongarch_usability.h"
+
+#include <math.h>
+
+namespace ncnn {
+
+Sigmoid_loongarch::Sigmoid_loongarch()
+{
+#if __loongarch_sx
+    support_packing = true;
+#endif
+}
+
+int Sigmoid_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int d = bottom_top_blob.d;
+    int channels = bottom_top_blob.c;
+    int elempack = bottom_top_blob.elempack;
+    int size = w * h * d * elempack;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        float* ptr = bottom_top_blob.channel(q);
+
+        int i = 0;
+#if __loongarch_sx
+        __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f);
+        for (; i + 3 < size; i += 4)
+        {
+            __builtin_prefetch(ptr + 16);
+            __m128 _p = (__m128)__lsx_vld(ptr, 0);
+            _p = (__m128)__lsx_vbitrevi_w((__m128i)_p, 31);
+            _p = exp_ps(_p);
+            _p = __lsx_vfadd_s(_p, _one);
+            __m128 _outp = __lsx_vfdiv_s(_one, _p);
+            __lsx_vst(_outp, ptr, 0);
+
+            ptr += 4;
+        }
+#endif // __loongarch_sx
+        for (; i < size; i++)
+        {
+            *ptr = 1.f / (1.f + exp(-*ptr));
+
+            ptr++;
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/loongarch/sigmoid_loongarch.h b/src/layer/loongarch/sigmoid_loongarch.h
new file mode 100644
index 000000000000..b15aad235db5
--- /dev/null
+++ b/src/layer/loongarch/sigmoid_loongarch.h
@@ -0,0 +1,32 @@
+// yala is pleased to support the open source community by making ncnn available.
+//
+//
+// Copyright (C) 2022 yala ;. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
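A note on the LSX path in Sigmoid_loongarch::forward_inplace above: instead of multiplying by -1, it negates the input by toggling the IEEE-754 sign bit (__lsx_vbitrevi_w(_p, 31)), then computes 1 / (1 + exp(-x)) with exp_ps, a vector add and a vector divide. The snippet below is an illustrative scalar rendering of that sign-bit trick and of the reference formula used by the scalar tail loop; it is not part of the patch.

// Scalar illustration of the sign-bit negation used by the LSX sigmoid path (not part of the patch).
#include <cmath>
#include <cstdint>
#include <cstring>

static inline float negate_via_sign_bit(float x)
{
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits)); // reinterpret the float's bit pattern
    bits ^= 0x80000000u;                  // flip bit 31, the IEEE-754 sign bit
    std::memcpy(&x, &bits, sizeof(bits));
    return x;
}

static inline float sigmoid_ref(float x)
{
    // same formula as the scalar tail loop: 1 / (1 + exp(-x))
    return 1.f / (1.f + std::exp(negate_via_sign_bit(x)));
}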
+ +#ifndef LAYER_SIGMOID_LOONGARCH_H +#define LAYER_SIGMOID_LOONGARCH_H + +#include "sigmoid.h" + +namespace ncnn { + +class Sigmoid_loongarch : virtual public Sigmoid +{ +public: + Sigmoid_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_SIGMOID_LOONGARCH_H diff --git a/src/layer/loongarch/slice_loongarch.cpp b/src/layer/loongarch/slice_loongarch.cpp new file mode 100644 index 000000000000..edd8656a4bb3 --- /dev/null +++ b/src/layer/loongarch/slice_loongarch.cpp @@ -0,0 +1,371 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "slice_loongarch.h" + +namespace ncnn { + +Slice_loongarch::Slice_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Slice_loongarch::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + const int* slices_ptr = slices; + int positive_axis = axis < 0 ? dims + axis : axis; + + if (dims == 1) // positive_axis == 0 + { + // slice vector + int w = bottom_blob.w * elempack; + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (w - q) / (top_blobs.size() - i); + } + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + out_elempack = slice % 4 == 0 ? 4 : 1; +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[i]; + top_blob.create(slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const float* ptr = (const float*)bottom_blob + q; + float* outptr = top_blob; + memcpy(outptr, ptr, top_blob.w * top_blob.elemsize); + + q += slice; + } + } + + if (dims == 2 && positive_axis == 0) + { + // slice image height + int w = bottom_blob.w; + int h = bottom_blob.h * elempack; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (h - q) / (top_blobs.size() - i); + } + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + out_elempack = slice % 4 == 0 ? 
4 : 1; +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[i]; + top_blob.create(w, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + size_t out_elemsize = top_blobs[0].elemsize; + int out_elempack = top_blobs[0].elempack; + for (size_t i = 0; i < top_blobs.size(); i++) + { + out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize); + out_elempack = std::min(out_elempack, top_blobs[i].elempack); + } + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > out_elempack) + { + convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); + } + + const float* ptr = bottom_blob_unpacked; + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + if (out_elempack == 1 && top_blob.elempack == 4) + { + for (int j = 0; j < top_blob.h; j++) + { + const float* r0 = ptr; + const float* r1 = ptr + w; + const float* r2 = ptr + w * 2; + const float* r3 = ptr + w * 3; + + float* outptr0 = top_blob.row(j); + + for (int j = 0; j < w; j++) + { + outptr0[0] = *r0++; + outptr0[1] = *r1++; + outptr0[2] = *r2++; + outptr0[3] = *r3++; + + outptr0 += 4; + } + + ptr += w * 4; + } + } + else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4) + { + int size = w * top_blob.h; + + float* outptr = top_blob; + memcpy(outptr, ptr, size * top_blob.elemsize); + + ptr += size * top_blob.elempack; + } + } + } + + if (dims == 2 && positive_axis == 1) + { + // slice image width + int w = bottom_blob.w; + int h = bottom_blob.h; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (w - q) / (top_blobs.size() - i); + } + + Mat& top_blob = top_blobs[i]; + top_blob.create(slice, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < h; j++) + { + const float* ptr = bottom_blob.row(j); + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + float* outptr = top_blob.row(j); + memcpy(outptr, ptr, top_blob.w * elemsize); + + ptr += top_blob.w * elempack; + } + } + } + + if (dims == 3 && positive_axis == 0) + { + // slice dim channel + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c * elempack; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (channels - q) / (top_blobs.size() - i); + } + + int out_elempack = 1; +#if __loongarch_sx + if (opt.use_packing_layout) + out_elempack = slice % 4 == 0 ? 
4 : 1; +#endif + size_t out_elemsize = elemsize / elempack * out_elempack; + + Mat& top_blob = top_blobs[i]; + top_blob.create(w, h, slice / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + size_t out_elemsize = top_blobs[0].elemsize; + int out_elempack = top_blobs[0].elempack; + for (size_t i = 0; i < top_blobs.size(); i++) + { + out_elemsize = std::min(out_elemsize, top_blobs[i].elemsize); + out_elempack = std::min(out_elempack, top_blobs[i].elempack); + } + + Mat bottom_blob_unpacked = bottom_blob; + if (elempack > out_elempack) + { + convert_packing(bottom_blob, bottom_blob_unpacked, out_elempack, opt); + } + + int p = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + if (out_elempack == 1 && top_blob.elempack == 4) + { + int size = top_blob.w * top_blob.h; + + for (int q = 0; q < top_blob.c; q++) + { + const float* r0 = bottom_blob_unpacked.channel(p); + const float* r1 = bottom_blob_unpacked.channel(p + 1); + const float* r2 = bottom_blob_unpacked.channel(p + 2); + const float* r3 = bottom_blob_unpacked.channel(p + 3); + + float* outptr0 = top_blob.channel(q); + + for (int j = 0; j < size; j++) + { + outptr0[0] = *r0++; + outptr0[1] = *r1++; + outptr0[2] = *r2++; + outptr0[3] = *r3++; + + outptr0 += 4; + } + + p += 4; + } + } + else // if (out_elempack == 1 && top_blob.elempack == 1) if (out_elempack == 4 && top_blob.elempack == 4) + { + int size = top_blob.total(); + + const float* ptr = bottom_blob_unpacked.channel(p); + float* outptr = top_blob; + memcpy(outptr, ptr, size * top_blob.elemsize); + + p += top_blob.c; + } + } + } + + if (dims == 3 && positive_axis == 1) + { + // slice dim height + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (h - q) / (top_blobs.size() - i); + } + + Mat& top_blob = top_blobs[i]; + top_blob.create(w, slice, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* ptr = bottom_blob.channel(p); + + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + int size = top_blob.w * top_blob.h; + + float* outptr = top_blob.channel(p); + memcpy(outptr, ptr, size * elemsize); + + ptr += size * elempack; + } + } + } + + if (dims == 3 && positive_axis == 2) + { + // slice dim width + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + + int q = 0; + for (size_t i = 0; i < top_blobs.size(); i++) + { + int slice = slices_ptr[i]; + if (slice == -233) + { + slice = (w - q) / (top_blobs.size() - i); + } + + Mat& top_blob = top_blobs[i]; + top_blob.create(slice, h, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + q += slice; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const float* ptr = bottom_blob.channel(p); + + for (int j = 0; j < h; j++) + { + for (size_t i = 0; i < top_blobs.size(); i++) + { + Mat& top_blob = top_blobs[i]; + + float* outptr = top_blob.channel(p).row(j); + memcpy(outptr, ptr, top_blob.w * elemsize); + + ptr += top_blob.w * elempack; + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/slice_loongarch.h 
b/src/layer/loongarch/slice_loongarch.h new file mode 100644 index 000000000000..b42138ba4183 --- /dev/null +++ b/src/layer/loongarch/slice_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_SLICE_LOONGARCH_H +#define LAYER_SLICE_LOONGARCH_H + +#include "slice.h" + +namespace ncnn { + +class Slice_loongarch : virtual public Slice +{ +public: + Slice_loongarch(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_SLICE_LOONGARCH_H diff --git a/src/layer/loongarch/softmax_loongarch.cpp b/src/layer/loongarch/softmax_loongarch.cpp new file mode 100644 index 000000000000..88b49559754b --- /dev/null +++ b/src/layer/loongarch/softmax_loongarch.cpp @@ -0,0 +1,175 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "softmax_loongarch.h" + +#include +#include + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +namespace ncnn { + +int Softmax_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + size_t elemsize = bottom_top_blob.elemsize; + int positive_axis = axis < 0 ? 
dims + axis : axis; + + if (dims != 3 || positive_axis != 0) + return Softmax::forward_inplace(bottom_top_blob, opt); + + // value = exp( value - global max value ) + // sum all value + // value = value / sum + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + + Mat max; + max.create(w, h, elemsize, opt.workspace_allocator); + if (max.empty()) + return -100; + max.fill(-FLT_MAX); + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float* maxptr = max; + + for (int i = 0; i < size; i++) + { + maxptr[i] = std::max(maxptr[i], ptr[i]); + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float* maxptr = max; + +#if __loongarch_sx + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __loongarch_sx + +#if __loongarch_sx + for (; nn > 0; nn--) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _max = (__m128)__lsx_vld(maxptr, 0); + + _p = exp_ps(__lsx_vfsub_s(_p, _max)); + + __lsx_vst(_p, ptr, 0); + + ptr += 4; + maxptr += 4; + } +#endif // __loongarch_sx + + for (; remain > 0; remain--) + { + *ptr = exp(*ptr - *maxptr); + + ptr++; + maxptr++; + } + } + + Mat sum; + sum.create(w, h, elemsize, opt.workspace_allocator); + if (sum.empty()) + return -100; + sum.fill(0.f); + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float* sumptr = sum; + +#if __loongarch_sx + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __loongarch_sx + +#if __loongarch_sx + for (; nn > 0; nn--) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _sum = (__m128)__lsx_vld(sumptr, 0); + _sum = __lsx_vfadd_s(_sum, _p); + __lsx_vst(_sum, sumptr, 0); + + ptr += 4; + sumptr += 4; + } +#endif // __loongarch_sx + + for (; remain > 0; remain--) + { + *sumptr += *ptr; + + ptr++; + sumptr++; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + float* sumptr = sum; + +#if __loongarch_sx + int nn = size >> 2; + int remain = size - (nn << 2); +#else + int remain = size; +#endif // __loongarch_sx + +#if __loongarch_sx + for (; nn > 0; nn--) + { + __m128 _p = (__m128)__lsx_vld(ptr, 0); + __m128 _sum = (__m128)__lsx_vld(sumptr, 0); + _p = __lsx_vfdiv_s(_p, _sum); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + sumptr += 4; + } +#endif // __loongarch_sx + + for (; remain > 0; remain--) + { + *ptr /= *sumptr; + + ptr++; + sumptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/softmax_loongarch.h b/src/layer/loongarch/softmax_loongarch.h new file mode 100644 index 000000000000..3c8272a6412f --- /dev/null +++ b/src/layer/loongarch/softmax_loongarch.h @@ -0,0 +1,30 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_SOFTMAX_LOONGARCH_H +#define LAYER_SOFTMAX_LOONGARCH_H + +#include "softmax.h" + +namespace ncnn { + +class Softmax_loongarch : virtual public Softmax +{ +public: + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_SOFTMAX_LOONGARCH_H diff --git a/src/layer/loongarch/swish_loongarch.cpp b/src/layer/loongarch/swish_loongarch.cpp new file mode 100644 index 000000000000..9c9005de6fcc --- /dev/null +++ b/src/layer/loongarch/swish_loongarch.cpp @@ -0,0 +1,70 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "swish_loongarch.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +#include + +namespace ncnn { + +Swish_loongarch::Swish_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +int Swish_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + __m128 _one = (__m128)__lsx_vreplfr2vr_s(1.f); + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128i _p = __lsx_vld(ptr, 0); + _p = (__m128i)__lsx_vfdiv_s((__m128)_p, __lsx_vfadd_s(_one, exp_ps((__m128)__lsx_vbitrevi_w(_p, 31)))); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = *ptr / (1.f + exp(-*ptr)); + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/swish_loongarch.h b/src/layer/loongarch/swish_loongarch.h new file mode 100644 index 000000000000..b8d0b80f01e4 --- /dev/null +++ b/src/layer/loongarch/swish_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
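The Swish kernel above computes x / (1 + exp(-x)) per lane, which equals x * sigmoid(x); the scalar tail uses the same formula. A scalar reference that the packed path can be checked against (illustrative C++ sketch; swish_ref is a placeholder name, not an ncnn symbol):

    #include <cmath>
    static inline float swish_ref(float x)
    {
        return x / (1.f + std::exp(-x)); // equivalently x * sigmoid(x)
    }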
+ +#ifndef LAYER_SWISH_LOONGARCH_H +#define LAYER_SWISH_LOONGARCH_H + +#include "swish.h" + +namespace ncnn { + +class Swish_loongarch : virtual public Swish +{ +public: + Swish_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_SWISH_LOONGARCH_H diff --git a/src/layer/loongarch/tanh_loongarch.cpp b/src/layer/loongarch/tanh_loongarch.cpp new file mode 100644 index 000000000000..13227fa71e34 --- /dev/null +++ b/src/layer/loongarch/tanh_loongarch.cpp @@ -0,0 +1,69 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "tanh_loongarch.h" + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +#include + +namespace ncnn { + +TanH_loongarch::TanH_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif +} + +int TanH_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int d = bottom_top_blob.d; + int channels = bottom_top_blob.c; + int elempack = bottom_top_blob.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = tanh_ps(_p); + __lsx_vst(_p, ptr, 0); + + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = tanh(*ptr); + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/tanh_loongarch.h b/src/layer/loongarch/tanh_loongarch.h new file mode 100644 index 000000000000..ecbab01ec8fe --- /dev/null +++ b/src/layer/loongarch/tanh_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
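Sigmoid, Swish and TanH above share one loop shape: support_packing is enabled and the per-channel element count is w * h * d * elempack, so packed and unpacked layouts both run through a single contiguous loop, an LSX body handling 4 floats per step plus a scalar tail. A stripped-down sketch of that skeleton, with f standing in for any per-element function (illustrative C++ only; apply_channel and f are not ncnn names):

    template<typename F>
    static void apply_channel(float* ptr, int size, F f)
    {
        int i = 0;
        // an LSX body would consume 4 lanes per iteration here:
        // for (; i + 3 < size; i += 4) { load / op / store; ptr += 4; }
        for (; i < size; i++)
        {
            *ptr = f(*ptr); // scalar tail, also the whole loop without __loongarch_sx
            ptr++;
        }
    }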
+ +#ifndef LAYER_TANH_LOONGARCH_H +#define LAYER_TANH_LOONGARCH_H + +#include "tanh.h" + +namespace ncnn { + +class TanH_loongarch : virtual public TanH +{ +public: + TanH_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_TANH_LOONGARCH_H diff --git a/src/layer/loongarch/unaryop_loongarch.cpp b/src/layer/loongarch/unaryop_loongarch.cpp new file mode 100644 index 000000000000..892c4dc42608 --- /dev/null +++ b/src/layer/loongarch/unaryop_loongarch.cpp @@ -0,0 +1,427 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "unaryop_loongarch.h" + +#include + +#if __loongarch_sx +#include +#include "lsx_mathfun.h" +#endif // __loongarch_sx + +namespace ncnn { + +UnaryOp_loongarch::UnaryOp_loongarch() +{ +#if __loongarch_sx + support_packing = true; +#endif // __loongarch_sx +} + +template +static int unary_op_inplace(Mat& a, const Option& opt) +{ + Op op; + + int w = a.w; + int h = a.h; + int d = a.d; + int channels = a.c; + int elempack = a.elempack; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = a.channel(q); + + int i = 0; +#if __loongarch_sx + for (; i + 3 < size; i += 4) + { + __builtin_prefetch(ptr + 16); + __m128 _p = (__m128)__lsx_vld(ptr, 0); + _p = op.func_pack4(_p); + __lsx_vst(_p, ptr, 0); + ptr += 4; + } +#endif // __loongarch_sx + for (; i < size; i++) + { + *ptr = op.func(*ptr); + ptr++; + } + } + + return 0; +} + +namespace UnaryOp_loongarch_functor { + +struct unary_op_abs +{ + float func(const float& x) const + { + return (float)fabs(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return (__m128)__lsx_vbitclri_w((__m128i)x, 31); + } +#endif // __loongarch_sx +}; + +struct unary_op_neg +{ + float func(const float& x) const + { + return -x; + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return (__m128)__lsx_vbitrevi_w((__m128i)x, 31); + } +#endif // __loongarch_sx +}; + +struct unary_op_floor +{ + float func(const float& x) const + { + return (float)floor(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = floor(tmp[0]); + tmp[1] = floor(tmp[1]); + tmp[2] = floor(tmp[2]); + tmp[3] = floor(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_ceil +{ + float func(const float& x) const + { + return (float)ceil(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = ceil(tmp[0]); + tmp[1] = ceil(tmp[1]); + tmp[2] = ceil(tmp[2]); + tmp[3] = ceil(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_square +{ + float func(const 
float& x) const + { + return x * x; + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return __lsx_vfmul_s(x, x); + } +#endif // __loongarch_sx +}; + +struct unary_op_sqrt +{ + float func(const float& x) const + { + return (float)sqrt(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return __lsx_vfsqrt_s(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_rsqrt +{ + float func(const float& x) const + { + return (float)(1.f / sqrt(x)); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return __lsx_vfrsqrt_s(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_exp +{ + float func(const float& x) const + { + return (float)exp(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return exp_ps(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_log +{ + float func(const float& x) const + { + return (float)log(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return log_ps(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_sin +{ + float func(const float& x) const + { + return (float)sin(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = sin(tmp[0]); + tmp[1] = sin(tmp[1]); + tmp[2] = sin(tmp[2]); + tmp[3] = sin(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_cos +{ + float func(const float& x) const + { + return (float)cos(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = cos(tmp[0]); + tmp[1] = cos(tmp[1]); + tmp[2] = cos(tmp[2]); + tmp[3] = cos(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_tan +{ + float func(const float& x) const + { + return (float)tan(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = tan(tmp[0]); + tmp[1] = tan(tmp[1]); + tmp[2] = tan(tmp[2]); + tmp[3] = tan(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_asin +{ + float func(const float& x) const + { + return (float)asin(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = asin(tmp[0]); + tmp[1] = asin(tmp[1]); + tmp[2] = asin(tmp[2]); + tmp[3] = asin(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_acos +{ + float func(const float& x) const + { + return (float)acos(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = acos(tmp[0]); + tmp[1] = acos(tmp[1]); + tmp[2] = acos(tmp[2]); + tmp[3] = acos(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_atan +{ + float func(const float& x) const + { + return (float)atan(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + // TODO msa optimize + float tmp[4]; + __lsx_vst(x, tmp, 0); + tmp[0] = atan(tmp[0]); + tmp[1] = atan(tmp[1]); + tmp[2] = atan(tmp[2]); + tmp[3] = atan(tmp[3]); + return (__m128)__lsx_vld(tmp, 0); + } +#endif // __loongarch_sx +}; + +struct unary_op_reciprocal +{ + float func(const float& x) const + { + return 1.f / x; + } +#if __loongarch_sx + __m128 func_pack4(const 
__m128& x) const + { + return __lsx_vfrecip_s(x); + } +#endif // __loongarch_sx +}; + +struct unary_op_tanh +{ + float func(const float& x) const + { + return (float)tanh(x); + } +#if __loongarch_sx + __m128 func_pack4(const __m128& x) const + { + return tanh_ps(x); + } +#endif // __loongarch_sx +}; + +} // namespace UnaryOp_loongarch_functor + +int UnaryOp_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + using namespace UnaryOp_loongarch_functor; + + if (op_type == Operation_ABS) + return unary_op_inplace<unary_op_abs>(bottom_top_blob, opt); + + if (op_type == Operation_NEG) + return unary_op_inplace<unary_op_neg>(bottom_top_blob, opt); + + if (op_type == Operation_FLOOR) + return unary_op_inplace<unary_op_floor>(bottom_top_blob, opt); + + if (op_type == Operation_CEIL) + return unary_op_inplace<unary_op_ceil>(bottom_top_blob, opt); + + if (op_type == Operation_SQUARE) + return unary_op_inplace<unary_op_square>(bottom_top_blob, opt); + + if (op_type == Operation_SQRT) + return unary_op_inplace<unary_op_sqrt>(bottom_top_blob, opt); + + if (op_type == Operation_RSQRT) + return unary_op_inplace<unary_op_rsqrt>(bottom_top_blob, opt); + + if (op_type == Operation_EXP) + return unary_op_inplace<unary_op_exp>(bottom_top_blob, opt); + + if (op_type == Operation_LOG) + return unary_op_inplace<unary_op_log>(bottom_top_blob, opt); + + if (op_type == Operation_SIN) + return unary_op_inplace<unary_op_sin>(bottom_top_blob, opt); + + if (op_type == Operation_COS) + return unary_op_inplace<unary_op_cos>(bottom_top_blob, opt); + + if (op_type == Operation_TAN) + return unary_op_inplace<unary_op_tan>(bottom_top_blob, opt); + + if (op_type == Operation_ASIN) + return unary_op_inplace<unary_op_asin>(bottom_top_blob, opt); + + if (op_type == Operation_ACOS) + return unary_op_inplace<unary_op_acos>(bottom_top_blob, opt); + + if (op_type == Operation_ATAN) + return unary_op_inplace<unary_op_atan>(bottom_top_blob, opt); + + if (op_type == Operation_RECIPROCAL) + return unary_op_inplace<unary_op_reciprocal>(bottom_top_blob, opt); + + if (op_type == Operation_TANH) + return unary_op_inplace<unary_op_tanh>(bottom_top_blob, opt); + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/loongarch/unaryop_loongarch.h b/src/layer/loongarch/unaryop_loongarch.h new file mode 100644 index 000000000000..8170bec50cf8 --- /dev/null +++ b/src/layer/loongarch/unaryop_loongarch.h @@ -0,0 +1,32 @@ +// yala is pleased to support the open source community by making ncnn available. +// +// +// Copyright (C) 2022 yala ;. All rights reserved. +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License.
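forward_inplace above instantiates the shared unary_op_inplace loop once per operation; each functor contributes a scalar func plus an LSX func_pack4 for the 4-lane body. A minimal sketch of what another operation would look like in this scheme (illustrative only; unary_op_cube and Operation_CUBE are hypothetical, not existing ncnn identifiers):

    struct unary_op_cube
    {
        float func(const float& x) const
        {
            return x * x * x;
        }
    #if __loongarch_sx
        __m128 func_pack4(const __m128& x) const
        {
            return __lsx_vfmul_s(__lsx_vfmul_s(x, x), x); // x^3, 4 lanes at a time
        }
    #endif // __loongarch_sx
    };
    // dispatch entry:
    // if (op_type == Operation_CUBE)
    //     return unary_op_inplace<unary_op_cube>(bottom_top_blob, opt);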
+ +#ifndef LAYER_UNARYOP_LOONGARCH_H +#define LAYER_UNARYOP_LOONGARCH_H + +#include "unaryop.h" + +namespace ncnn { + +class UnaryOp_loongarch : virtual public UnaryOp +{ +public: + UnaryOp_loongarch(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_UNARYOP_LOONGARCH_H diff --git a/src/layer/lstm.cpp b/src/layer/lstm.cpp index 6749f05e019e..a065bcacae3b 100644 --- a/src/layer/lstm.cpp +++ b/src/layer/lstm.cpp @@ -29,6 +29,7 @@ int LSTM::load_param(const ParamDict& pd) num_output = pd.get(0, 0); weight_data_size = pd.get(1, 0); direction = pd.get(2, 0); + hidden_size = pd.get(3, num_output); return 0; } @@ -36,36 +37,52 @@ int LSTM::load_model(const ModelBin& mb) { int num_directions = direction == 2 ? 2 : 1; - int size = weight_data_size / num_directions / num_output / 4; + int size = weight_data_size / num_directions / hidden_size / 4; // raw weight data - weight_xc_data = mb.load(size, num_output * 4, num_directions, 0); + weight_xc_data = mb.load(size, hidden_size * 4, num_directions, 0); if (weight_xc_data.empty()) return -100; - bias_c_data = mb.load(num_output, 4, num_directions, 0); + bias_c_data = mb.load(hidden_size, 4, num_directions, 0); if (bias_c_data.empty()) return -100; - weight_hc_data = mb.load(num_output, num_output * 4, num_directions, 0); + weight_hc_data = mb.load(num_output, hidden_size * 4, num_directions, 0); if (weight_hc_data.empty()) return -100; + if (num_output != hidden_size) + { + weight_hr_data = mb.load(hidden_size, num_output, num_directions, 0); + if (weight_hr_data.empty()) + return -100; + } + return 0; } -static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, Mat& cell_state, const Option& opt) +static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt) { int size = bottom_blob.w; int T = bottom_blob.h; int num_output = top_blob.w; + int hidden_size = cell_state.w; - // 4 x num_output - Mat gates(4, num_output, 4u, opt.workspace_allocator); + // 4 x hidden_size + Mat gates(4, hidden_size, 4u, opt.workspace_allocator); if (gates.empty()) return -100; + Mat tmp_hidden_state; + if (num_output != hidden_size) + { + tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator); + if (tmp_hidden_state.empty()) + return -100; + } + // unroll for (int t = 0; t < T; t++) { @@ -80,7 +97,7 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w const float* x = bottom_blob.row(ti); #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < num_output; q++) + for (int q = 0; q < hidden_size; q++) { const float* bias_c_I = bias_c.row(0); const float* bias_c_F = bias_c.row(1); @@ -90,15 +107,15 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w float* gates_data = gates.row(q); // gate I F O G - const float* weight_xc_I = weight_xc.row(num_output * 0 + q); - const float* weight_xc_F = weight_xc.row(num_output * 1 + q); - const float* weight_xc_O = weight_xc.row(num_output * 2 + q); - const float* weight_xc_G = weight_xc.row(num_output * 3 + q); + const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q); + const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q); + const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q); + const float* weight_xc_G = 
weight_xc.row(hidden_size * 3 + q); - const float* weight_hc_I = weight_hc.row(num_output * 0 + q); - const float* weight_hc_F = weight_hc.row(num_output * 1 + q); - const float* weight_hc_O = weight_hc.row(num_output * 2 + q); - const float* weight_hc_G = weight_hc.row(num_output * 3 + q); + const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q); + const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q); + const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q); + const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q); float I = bias_c_I[q]; float F = bias_c_F[q]; @@ -140,7 +157,7 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w // h_t := o_t .* tanh[c_t] float* output_data = top_blob.row(ti); #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < num_output; q++) + for (int q = 0; q < hidden_size; q++) { const float* gates_data = gates.row(q); @@ -157,8 +174,34 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w float cell2 = F * cell_state[q] + I * G; float H = O * tanh(cell2); cell_state[q] = cell2; - hidden_state[q] = H; - output_data[q] = H; + + if (num_output == hidden_size) + { + hidden_state[q] = H; + output_data[q] = H; + } + else + { + tmp_hidden_state[q] = H; + } + } + + if (num_output != hidden_size) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < num_output; q++) + { + const float* hr = weight_hr.row(q); + + float H = 0; + for (int i = 0; i < hidden_size; i++) + { + H += tmp_hidden_state[i] * hr[i]; + } + + hidden_state[q] = H; + output_data[q] = H; + } } } @@ -177,7 +220,7 @@ int LSTM::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons return -100; hidden.fill(0.f); - Mat cell(num_output, 4u, opt.workspace_allocator); + Mat cell(hidden_size, 4u, opt.workspace_allocator); if (cell.empty()) return -100; cell.fill(0.f); @@ -189,7 +232,7 @@ int LSTM::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons // Uni directional if (direction == 0 || direction == 1) { - int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, cell, opt); + int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret != 0) return ret; } @@ -204,14 +247,14 @@ int LSTM::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons if (top_blob_reverse.empty()) return -100; - int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, cell, opt); + int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret0 != 0) return ret0; hidden.fill(0.0f); cell.fill(0.0f); - int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden, cell, opt); + int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), num_output == hidden_size ? 
Mat() : weight_hr_data.channel(1), hidden, cell, opt); if (ret1 != 0) return ret1; @@ -251,7 +294,7 @@ int LSTM::forward(const std::vector& bottom_blobs, std::vector& top_bl return -100; hidden.fill(0.f); - cell.create(num_output, num_directions, 4u, hidden_cell_allocator); + cell.create(hidden_size, num_directions, 4u, hidden_cell_allocator); if (cell.empty()) return -100; cell.fill(0.f); @@ -265,7 +308,7 @@ int LSTM::forward(const std::vector& bottom_blobs, std::vector& top_bl // Uni directional if (direction == 0 || direction == 1) { - int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, cell, opt); + int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret != 0) return ret; } @@ -282,13 +325,13 @@ int LSTM::forward(const std::vector& bottom_blobs, std::vector& top_bl Mat hidden0 = hidden.row_range(0, 1); Mat cell0 = cell.row_range(0, 1); - int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, cell0, opt); + int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden0, cell0, opt); if (ret0 != 0) return ret0; Mat hidden1 = hidden.row_range(1, 1); Mat cell1 = cell.row_range(1, 1); - int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, cell1, opt); + int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden1, cell1, opt); if (ret1 != 0) return ret1; diff --git a/src/layer/lstm.h b/src/layer/lstm.h index 78d8366a0f96..58bd67f987ab 100644 --- a/src/layer/lstm.h +++ b/src/layer/lstm.h @@ -36,10 +36,12 @@ class LSTM : public Layer int num_output; int weight_data_size; int direction; // 0=forward 1=reverse 2=bidirectional + int hidden_size; Mat weight_hc_data; Mat weight_xc_data; Mat bias_c_data; + Mat weight_hr_data; }; } // namespace ncnn diff --git a/src/layer/multiheadattention.cpp b/src/layer/multiheadattention.cpp index ac26f599f048..966df81d41d0 100644 --- a/src/layer/multiheadattention.cpp +++ b/src/layer/multiheadattention.cpp @@ -27,6 +27,8 @@ int MultiHeadAttention::load_param(const ParamDict& pd) embed_dim = pd.get(0, 0); num_head = pd.get(1, 1); weight_data_size = pd.get(2, 0); + kdim = pd.get(3, embed_dim); + vdim = pd.get(4, embed_dim); return 0; } @@ -41,7 +43,7 @@ int MultiHeadAttention::load_model(const ModelBin& mb) if (q_bias_data.empty()) return -100; - k_weight_data = mb.load(weight_data_size, 0); + k_weight_data = mb.load(embed_dim * kdim, 0); if (k_weight_data.empty()) return -100; @@ -49,7 +51,7 @@ int MultiHeadAttention::load_model(const ModelBin& mb) if (k_bias_data.empty()) return -100; - v_weight_data = mb.load(weight_data_size, 0); + v_weight_data = mb.load(embed_dim * vdim, 0); if (v_weight_data.empty()) return -100; @@ -73,23 +75,26 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto { const Mat& q_blob = bottom_blobs[0]; const Mat& k_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs[1]; - const Mat& v_blob = bottom_blobs.size() == 1 ? 
q_blob : bottom_blobs[2]; + const Mat& v_blob = bottom_blobs.size() == 1 ? q_blob : bottom_blobs.size() == 2 ? k_blob : bottom_blobs[2]; - const int seqlen = q_blob.h; + const int src_seqlen = q_blob.h; + const int dst_seqlen = k_blob.h; const int embed_dim_per_head = embed_dim / num_head; + // assert k_blob.h == v_blob.h + Mat& top_blob = top_blobs[0]; - top_blob.create(embed_dim, seqlen, 4u, opt.blob_allocator); + top_blob.create(embed_dim, src_seqlen, 4u, opt.blob_allocator); if (top_blob.empty()) return -1; - Mat xq(embed_dim_per_head, seqlen, num_head, 4u, opt.workspace_allocator); - Mat xk(embed_dim_per_head, seqlen, num_head, 4u, opt.workspace_allocator); - Mat xv(seqlen, embed_dim_per_head, num_head, 4u, opt.workspace_allocator); + Mat xq(embed_dim_per_head, src_seqlen, num_head, 4u, opt.workspace_allocator); + Mat xk(embed_dim_per_head, dst_seqlen, num_head, 4u, opt.workspace_allocator); + Mat xv(dst_seqlen, embed_dim_per_head, num_head, 4u, opt.workspace_allocator); - Mat xqk(seqlen, seqlen, num_head, 4u, opt.workspace_allocator); + Mat xqk(dst_seqlen, src_seqlen, num_head, 4u, opt.workspace_allocator); - Mat xqkv(embed_dim_per_head, num_head, seqlen, 4u, opt.workspace_allocator); + Mat xqkv(embed_dim_per_head, num_head, src_seqlen, 4u, opt.workspace_allocator); const float inv_sqrt_embed_dim_per_head = 1.f / sqrt(embed_dim_per_head); @@ -100,7 +105,7 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto { Mat outm = xq.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* outptr = outm.row(i); @@ -124,17 +129,17 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto { Mat outm = xk.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < dst_seqlen; i++) { float* outptr = outm.row(i); for (int j = 0; j < embed_dim_per_head; j++) { const float* ptr = k_blob.row(i); - const float* kptr = (const float*)k_weight_data + embed_dim * (q * embed_dim_per_head + j); + const float* kptr = (const float*)k_weight_data + kdim * (q * embed_dim_per_head + j); float sum = k_bias_data[q * embed_dim_per_head + j]; - for (int k = 0; k < embed_dim; k++) + for (int k = 0; k < kdim; k++) { sum += *ptr++ * *kptr++; } @@ -150,13 +155,13 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto for (int i = 0; i < embed_dim_per_head; i++) { - for (int j = 0; j < seqlen; j++) + for (int j = 0; j < dst_seqlen; j++) { const float* ptr = v_blob.row(j); - const float* kptr = (const float*)v_weight_data + embed_dim * (q * embed_dim_per_head + i); + const float* kptr = (const float*)v_weight_data + vdim * (q * embed_dim_per_head + i); float sum = v_bias_data[q * embed_dim_per_head + i]; - for (int k = 0; k < embed_dim; k++) + for (int k = 0; k < vdim; k++) { sum += *ptr++ * *kptr++; } @@ -169,19 +174,19 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto } // xqk = xq * xk - // xq (embed_dim_per_head, seqlen) - // xk (embed_dim_per_head, seqlen) + // xq (embed_dim_per_head, src_seqlen) + // xk (embed_dim_per_head, dst_seqlen) { const Mat xqm = xq.channel(q); const Mat xkm = xk.channel(q); Mat outm = xqk.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* outptr = outm.row(i); - for (int j = 0; j < seqlen; j++) + for (int j = 0; j < dst_seqlen; j++) { const float* qptr = xqm.row(i); const float* kptr = xkm.row(j); @@ -201,24 +206,24 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto { Mat outm = 
xqk.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* ptr = outm.row(i); float max = -FLT_MAX; - for (int j = 0; j < seqlen; j++) + for (int j = 0; j < dst_seqlen; j++) { max = std::max(max, ptr[j]); } float sum = 0.f; - for (int j = 0; j < seqlen; j++) + for (int j = 0; j < dst_seqlen; j++) { ptr[j] = (float)(exp(ptr[j] - max)); sum += ptr[j]; } - for (int j = 0; j < seqlen; j++) + for (int j = 0; j < dst_seqlen; j++) { ptr[j] /= sum; } @@ -226,14 +231,14 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto } // xqkv = xqk * xv - // xqk (seqlen, seqlen) - // xv (seqlen, embed_dim_per_head) - // out (embed_dim_per_head, num_head, seqlen) + // xqk (dst_seqlen, src_seqlen) + // xv (dst_seqlen, embed_dim_per_head) + // out (embed_dim_per_head, num_head, src_seqlen) { const Mat xqkm = xqk.channel(q); const Mat xvm = xv.channel(q); - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* outptr = xqkv.channel(i).row(q); @@ -243,7 +248,7 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto const float* vptr = xvm.row(j); float sum = 0.f; - for (int k = 0; k < seqlen; k++) + for (int k = 0; k < dst_seqlen; k++) { sum += *qkptr++ * *vptr++; } @@ -255,9 +260,9 @@ int MultiHeadAttention::forward(const std::vector& bottom_blobs, std::vecto } // out = affine(xqkv) - // xqkv (embed_dim, seqlen) + // xqkv (embed_dim, src_seqlen) #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < seqlen; i++) + for (int i = 0; i < src_seqlen; i++) { float* outptr = top_blob.row(i); diff --git a/src/layer/multiheadattention.h b/src/layer/multiheadattention.h index b878055385d0..2de5213ca315 100644 --- a/src/layer/multiheadattention.h +++ b/src/layer/multiheadattention.h @@ -34,6 +34,8 @@ class MultiHeadAttention : public Layer int embed_dim; int num_head; int weight_data_size; + int kdim; + int vdim; Mat q_weight_data; Mat q_bias_data; diff --git a/src/layer/riscv/absval_riscv.cpp b/src/layer/riscv/absval_riscv.cpp index 4a41788ec9e9..092a8b5d6b13 100644 --- a/src/layer/riscv/absval_riscv.cpp +++ b/src/layer/riscv/absval_riscv.cpp @@ -66,7 +66,7 @@ int AbsVal_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfabs_v_f32m8_absval(_p, vl); @@ -106,7 +106,7 @@ int AbsVal_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfabs_v_f16m8_absval(_p, vl); diff --git a/src/layer/riscv/batchnorm_riscv.cpp b/src/layer/riscv/batchnorm_riscv.cpp new file mode 100644 index 000000000000..2a8ec0cce582 --- /dev/null +++ b/src/layer/riscv/batchnorm_riscv.cpp @@ -0,0 +1,537 @@ +// Xavier Hsinyuan is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 Xavier Hsinyuan . All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#include "batchnorm_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_usability.h" + +namespace ncnn { + +BatchNorm_riscv::BatchNorm_riscv() +{ +#if __riscv_vector + support_packing = true; +#if __riscv_zfh + support_fp16_storage = true; +#endif +#endif // __riscv_vector +} + +int BatchNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ +#if __riscv_vector + int elembits = bottom_top_blob.elembits(); + +#if __riscv_zfh + if (opt.use_fp16_storage && elembits == 16) + { + if (opt.use_fp16_arithmetic) + return forward_inplace_fp16sa(bottom_top_blob, opt); + else + return forward_inplace_fp16s(bottom_top_blob, opt); + } +#endif + int elempack = bottom_top_blob.elempack; +#endif // __riscv_vector + int dims = bottom_top_blob.dims; + if (dims == 1) + { + float* ptr = bottom_top_blob; +#if __riscv_vector + const float* ptr_a = a_data; + const float* ptr_b = b_data; + int n = bottom_top_blob.w * elempack; + while (n > 0) + { + size_t vl = vsetvl_e32m8(n); + + vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + vfloat32m8_t _a = vle32_v_f32m8(ptr_a, vl); + vfloat32m8_t _b = vle32_v_f32m8(ptr_b, vl); + + _p = vfmadd_vv_f32m8(_p, _b, _a, vl); + + vse32_v_f32m8(ptr, _p, vl); + + ptr += vl; + ptr_a += vl; + ptr_b += vl; + n -= vl; + } +#else + int w = bottom_top_blob.w; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = b_data[i] * ptr[i] + a_data[i]; + } +#endif // __riscv_vector + return 0; + } + +#if __riscv_vector + if (elempack == 1) +#endif + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + if (dims == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + float a = a_data[i]; + float b = b_data[i]; + +#if __riscv_vector + int n = w; + while (n > 0) + { + size_t vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + _p = vfmul_vf_f32m8(_p, b, vl); + _p = vfadd_vf_f32m8(_p, a, vl); + vse32_v_f32m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else + for (int j = 0; j < w; j++) + { + ptr[j] = b * ptr[j] + a; + } +#endif // __riscv_vector + } + } + if (dims == 3 || dims == 4) + { + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + float* ptr = bottom_top_blob.channel(q); + float a = a_data[q]; + float b = b_data[q]; + +#if __riscv_vector + int n = size; + while (n > 0) + { + size_t vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); + _p = vfmul_vf_f32m8(_p, b, vl); + _p = vfadd_vf_f32m8(_p, a, vl); + vse32_v_f32m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } +#else + for (int i = 0; i < size; i++) + { + ptr[i] = b * ptr[i] + a; + } +#endif // __riscv_vector + } + } + return 0; + } + +#if __riscv_vector + const int packn = csrr_vlenb() / 4; + if (elempack == packn) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + + const size_t vl = vsetvl_e32m1(packn); + if (dims == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + float* ptr = bottom_top_blob.row(i); + const float* ptr_a = a_data; + ptr_a += i * elempack; + const float* ptr_b = b_data; + ptr_b += i * elempack; + int n = w * elempack; + + vfloat32m1_t _a = vle32_v_f32m1(ptr_a, vl); + vfloat32m1_t _b = vle32_v_f32m1(ptr_b, vl); + while (n 
> 0) + { + vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); + _p = vfmadd_vv_f32m1(_p, _b, _a, vl); + vse32_v_f32m1(ptr, _p, vl); + + ptr += vl; + n -= vl; + } + } + } + + if (dims == 3 || dims == 4) + { + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + float* ptr = bottom_top_blob.channel(q); + const float* ptr_a = (const float*)a_data + q * elempack; + const float* ptr_b = (const float*)b_data + q * elempack; + + vfloat32m1_t _a = vle32_v_f32m1(ptr_a, vl); + vfloat32m1_t _b = vle32_v_f32m1(ptr_b, vl); + + int n = size; + while (n > 0) + { + vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); + _p = vfmadd_vv_f32m1(_p, _b, _a, vl); + vse32_v_f32m1(ptr, _p, vl); + + ptr += vl; + n -= vl; + } + } + } + } +#endif + return 0; +} + +#if __riscv_vector && __riscv_zfh +int BatchNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + int elempack = bottom_top_blob.elempack; + if (dims == 1) + { + int n = bottom_top_blob.w * elempack; + __fp16* ptr = bottom_top_blob; + const float* ptr_a = a_data; + const float* ptr_b = b_data; + while (n > 0) + { + size_t vl = vsetvl_e16m4(n); + + vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); + vfloat32m8_t _a = vle32_v_f32m8(ptr_a, vl); + vfloat32m8_t _b = vle32_v_f32m8(ptr_b, vl); + + _p = vfmadd_vv_f32m8(_p, _b, _a, vl); + + vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + ptr_a += vl; + ptr_b += vl; + n -= vl; + } + + return 0; + } + + if (elempack == 1) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + if (dims == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + float a = a_data[i]; + float b = b_data[i]; + + int n = w; + while (n > 0) + { + size_t vl = vsetvl_e16m4(n); + vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); + _p = vfmul_vf_f32m8(_p, b, vl); + _p = vfadd_vf_f32m8(_p, a, vl); + vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + n -= vl; + } + } + } + if (dims == 3 || dims == 4) + { + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + float a = a_data[q]; + float b = b_data[q]; + + int n = size; + while (n > 0) + { + size_t vl = vsetvl_e16m4(n); + vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); + ; + _p = vfmul_vf_f32m8(_p, b, vl); + _p = vfadd_vf_f32m8(_p, a, vl); + vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl); + + ptr += vl; + n -= vl; + } + } + } + + return 0; + } + + const int packn = csrr_vlenb() / 2; // fp16 + if (elempack == packn) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + + const size_t vl = vsetvl_e16m1(packn); + if (dims == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + const float* ptr_a = (const float*)a_data + i * elempack; + const float* ptr_b = (const float*)b_data + i * elempack; + int n = w * elempack; + + vfloat32m2_t _a = vle32_v_f32m2(ptr_a, vl); + vfloat32m2_t _b = vle32_v_f32m2(ptr_b, vl); + while (n > 0) + { + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr, vl), vl); + _p = vfmadd_vv_f32m2(_p, _b, _a, vl); + vse16_v_f16m1(ptr, 
vfncvt_f_f_w_f16m1(_p, vl), vl); + + ptr += vl; + n -= vl; + } + } + } + + if (dims == 3 || dims == 4) + { + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + const float* ptr_a = (const float*)a_data + q * elempack; + const float* ptr_b = (const float*)b_data + q * elempack; + + vfloat32m2_t _a = vle32_v_f32m2(ptr_a, vl); + vfloat32m2_t _b = vle32_v_f32m2(ptr_b, vl); + + int n = size; + while (n > 0) + { + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr, vl), vl); + _p = vfmadd_vv_f32m2(_p, _b, _a, vl); + vse16_v_f16m1(ptr, vfncvt_f_f_w_f16m1(_p, vl), vl); + + ptr += vl; + n -= vl; + } + } + } + } + + return 0; +} + +int BatchNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + int elempack = bottom_top_blob.elempack; + if (dims == 1) + { + int n = bottom_top_blob.w * elempack; + __fp16* ptr = bottom_top_blob; + const float* ptr_a = a_data; + const float* ptr_b = b_data; + while (n > 0) + { + size_t vl = vsetvl_e16m4(n); + + vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); + vfloat16m4_t _a = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_a, vl), vl); + vfloat16m4_t _b = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_b, vl), vl); + + _p = vfmadd_vv_f16m4(_p, _b, _a, vl); + + vse16_v_f16m4(ptr, _p, vl); + + ptr += vl; + ptr_a += vl; + ptr_b += vl; + n -= vl; + } + + return 0; + } + + if (elempack == 1) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + if (dims == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + float a = a_data[i]; + float b = b_data[i]; + + int n = w; + while (n > 0) + { + size_t vl = vsetvl_e16m8(n); + vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); + _p = vfmul_vf_f16m8(_p, b, vl); + _p = vfadd_vf_f16m8(_p, a, vl); + vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } + } + } + if (dims == 3 || dims == 4) + { + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + float a = a_data[q]; + float b = b_data[q]; + + int n = size; + while (n > 0) + { + size_t vl = vsetvl_e16m8(n); + vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); + ; + _p = vfmul_vf_f16m8(_p, b, vl); + _p = vfadd_vf_f16m8(_p, a, vl); + vse16_v_f16m8(ptr, _p, vl); + + ptr += vl; + n -= vl; + } + } + } + + return 0; + } + + const int packn = csrr_vlenb() / 2; // fp16 + if (elempack == packn) + { + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + + const size_t vl = vsetvl_e16m1(packn); + if (dims == 2) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + __fp16* ptr = bottom_top_blob.row<__fp16>(i); + const float* ptr_a = (const float*)a_data + i * elempack; + const float* ptr_b = (const float*)b_data + i * elempack; + int n = w * elempack; + + vfloat16m1_t _a = vfncvt_f_f_w_f16m1(vle32_v_f32m2(ptr_a, vl), vl); + vfloat16m1_t _b = vfncvt_f_f_w_f16m1(vle32_v_f32m2(ptr_b, vl), vl); + while (n > 0) + { + vfloat16m1_t _p = vle16_v_f16m1(ptr, vl); + _p = vfmadd_vv_f16m1(_p, _b, _a, vl); + vse16_v_f16m1(ptr, _p, vl); + + ptr += vl; + n -= vl; + } + } + } + + if (dims == 3 || dims == 4) + { + int d = bottom_top_blob.d; + int c = bottom_top_blob.c; + int size = w * h * d * 
elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + const float* ptr_a = (const float*)a_data + q * elempack; + const float* ptr_b = (const float*)b_data + q * elempack; + + vfloat16m1_t _a = vfncvt_f_f_w_f16m1(vle32_v_f32m2(ptr_a, vl), vl); + vfloat16m1_t _b = vfncvt_f_f_w_f16m1(vle32_v_f32m2(ptr_b, vl), vl); + + int n = size; + while (n > 0) + { + vfloat16m1_t _p = vle16_v_f16m1(ptr, vl); + _p = vfmadd_vv_f16m1(_p, _b, _a, vl); + vse16_v_f16m1(ptr, _p, vl); + + ptr += vl; + n -= vl; + } + } + } + } + + return 0; +} + +#endif // __riscv_vector && __riscv_zfh +} // namespace ncnn diff --git a/src/layer/riscv/batchnorm_riscv.h b/src/layer/riscv/batchnorm_riscv.h new file mode 100644 index 000000000000..e2365fa5fcfe --- /dev/null +++ b/src/layer/riscv/batchnorm_riscv.h @@ -0,0 +1,37 @@ +// Xavier Hsinyuan is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 Xavier Hsinyuan . All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_BATCHNORM_RISCV_H +#define LAYER_BATCHNORM_RISCV_H + +#include "batchnorm.h" + +namespace ncnn { +class BatchNorm_riscv : virtual public BatchNorm +{ +public: + BatchNorm_riscv(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +protected: +#if __riscv_vector && __riscv_zfh + int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; + int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; +#endif +}; + +} // namespace ncnn + +#endif // LAYER_BATCHNORM_RISCV_H diff --git a/src/layer/riscv/binaryop_riscv.cpp b/src/layer/riscv/binaryop_riscv.cpp index b4e53a2c8568..9858e654822a 100644 --- a/src/layer/riscv/binaryop_riscv.cpp +++ b/src/layer/riscv/binaryop_riscv.cpp @@ -67,7 +67,7 @@ static int binary_op_2_3_4_20(const Mat& a, const Mat& b, Mat& c, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(a0, _p, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -108,7 +108,7 @@ static int binary_op_6_11_16_25(const Mat& a, const Mat& b, Mat& c, const Option int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, b0, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -149,7 +149,7 @@ static int binary_op_7_13_19_29(const Mat& a, const Mat& b, Mat& c, const Option int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_p, _p1, vl); @@ -217,7 +217,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t 
_outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -252,7 +252,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * h * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -289,7 +289,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -328,7 +328,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -367,7 +367,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -400,7 +400,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, *ptr1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -436,7 +436,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -469,7 +469,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n1 = elempack1; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _p = vfmv_v_f_f32m8(*ptr, vl); vfloat32m8_t _outp = op(_p, _p1, vl); @@ -508,7 +508,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -545,7 +545,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) const float* ptr1_vol = ptr1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1_vol, vl); vfloat32m8_t _outp = op(_p, _p1, vl); @@ -583,7 +583,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -620,7 +620,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) const float* ptr_vol = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, 
vl); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_p, _p1, vl); @@ -662,7 +662,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -699,7 +699,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -736,7 +736,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * h1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -774,7 +774,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -825,7 +825,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -867,7 +867,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -899,7 +899,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -931,7 +931,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -985,7 +985,7 @@ static int binary_op_scalar_rvv(Mat& a, float b, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = op(_p, b, vl); vse32_v_f32m8(ptr, _p, vl); @@ -1000,21 +1000,21 @@ static int binary_op_scalar_rvv(Mat& a, float b, const Option& opt) namespace BinaryOp_riscv_functor { -#define MAKE_FUNCTION(NAME, IMPLVV, IMPLVS, IMPLSV) \ - struct NAME \ - { \ - vfloat32m8_t operator()(const vfloat32m8_t& x, const vfloat32m8_t& y, const word_type vl) const \ - { \ - return IMPLVV; \ - } \ - vfloat32m8_t operator()(const vfloat32m8_t& x, const float y, const word_type vl) const \ - { \ - return IMPLVS; \ - } \ - vfloat32m8_t operator()(const float x, const vfloat32m8_t& y, const word_type vl) const \ - { \ - return IMPLSV; \ - } \ +#define 
MAKE_FUNCTION(NAME, IMPLVV, IMPLVS, IMPLSV) \ + struct NAME \ + { \ + vfloat32m8_t operator()(const vfloat32m8_t& x, const vfloat32m8_t& y, const size_t vl) const \ + { \ + return IMPLVV; \ + } \ + vfloat32m8_t operator()(const vfloat32m8_t& x, const float y, const size_t vl) const \ + { \ + return IMPLVS; \ + } \ + vfloat32m8_t operator()(const float x, const vfloat32m8_t& y, const size_t vl) const \ + { \ + return IMPLSV; \ + } \ }; MAKE_FUNCTION(binary_op_add_rvv, vfadd_vv_f32m8(x, y, vl), vfadd_vf_f32m8(x, y, vl), vfadd_vf_f32m8(y, x, vl)) @@ -1159,7 +1159,7 @@ static int binary_op_2_3_4_20_fp16s(const Mat& a, const Mat& b, Mat& c, const Op int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(a0, _p, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1200,7 +1200,7 @@ static int binary_op_6_11_16_25_fp16s(const Mat& a, const Mat& b, Mat& c, const int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, b0, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1241,7 +1241,7 @@ static int binary_op_7_13_19_29_fp16s(const Mat& a, const Mat& b, Mat& c, const int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_p, _p1, vl); @@ -1309,7 +1309,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1344,7 +1344,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * h * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1381,7 +1381,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1420,7 +1420,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1459,7 +1459,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1492,7 +1492,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, *ptr1, vl); @@ -1527,7 +1527,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n1 = size1 * elempack1; while (n1 > 0) { - 
word_type vl = vsetvl_e16m8(n1); + size_t vl = vsetvl_e16m8(n1); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1560,7 +1560,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n1 = elempack1; while (n1 > 0) { - word_type vl = vsetvl_e16m8(n1); + size_t vl = vsetvl_e16m8(n1); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _p = vfmv_v_f_f16m8(*ptr, vl); vfloat16m8_t _outp = op(_p, _p1, vl); @@ -1598,7 +1598,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1637,7 +1637,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& const __fp16* ptr1_vol = ptr1 + x * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1_vol, vl); vfloat16m8_t _outp = op(_p, _p1, vl); @@ -1676,7 +1676,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1715,7 +1715,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& const __fp16* ptr_vol = ptr + x * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr_vol, vl); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_p, _p1, vl); @@ -1758,7 +1758,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1795,7 +1795,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1832,7 +1832,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * h1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1870,7 +1870,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1921,7 +1921,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1963,7 +1963,7 @@ static int 
binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e16m8(n1); + size_t vl = vsetvl_e16m8(n1); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1995,7 +1995,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e16m8(n1); + size_t vl = vsetvl_e16m8(n1); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -2027,7 +2027,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -2706,7 +2706,7 @@ static int binary_op_scalar_rvv_fp16s(Mat& a, float b, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = op(_p, b, vl); vse16_v_f16m8(ptr, _p, vl); @@ -2721,25 +2721,25 @@ static int binary_op_scalar_rvv_fp16s(Mat& a, float b, const Option& opt) namespace BinaryOp_riscv_functor { -#define MAKE_FUNCTION(NAME, IMPL, IMPLVV, IMPLVS, IMPLSV) \ - struct NAME \ - { \ - __fp16 operator()(const __fp16& x, const __fp16& y) const \ - { \ - return IMPL; \ - } \ - vfloat16m8_t operator()(const vfloat16m8_t& x, const vfloat16m8_t& y, const word_type vl) const \ - { \ - return IMPLVV; \ - } \ - vfloat16m8_t operator()(const vfloat16m8_t& x, const float y, const word_type vl) const \ - { \ - return IMPLVS; \ - } \ - vfloat16m8_t operator()(const float x, const vfloat16m8_t& y, const word_type vl) const \ - { \ - return IMPLSV; \ - } \ +#define MAKE_FUNCTION(NAME, IMPL, IMPLVV, IMPLVS, IMPLSV) \ + struct NAME \ + { \ + __fp16 operator()(const __fp16& x, const __fp16& y) const \ + { \ + return IMPL; \ + } \ + vfloat16m8_t operator()(const vfloat16m8_t& x, const vfloat16m8_t& y, const size_t vl) const \ + { \ + return IMPLVV; \ + } \ + vfloat16m8_t operator()(const vfloat16m8_t& x, const float y, const size_t vl) const \ + { \ + return IMPLVS; \ + } \ + vfloat16m8_t operator()(const float x, const vfloat16m8_t& y, const size_t vl) const \ + { \ + return IMPLSV; \ + } \ }; // clang-format off diff --git a/src/layer/riscv/cast_riscv.cpp b/src/layer/riscv/cast_riscv.cpp index 8ea5d0f05eff..5d0642e7da7d 100644 --- a/src/layer/riscv/cast_riscv.cpp +++ b/src/layer/riscv/cast_riscv.cpp @@ -101,7 +101,7 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat16m4_t _outp = vfncvt_f_f_w_f16m4(_p, vl); @@ -125,7 +125,7 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); vfloat32m8_t _outp = vfwcvt_f_f_v_f32m8(_p, vl); diff --git a/src/layer/riscv/clip_riscv.cpp b/src/layer/riscv/clip_riscv.cpp index 9acff0218f02..8c43e06a4d82 100644 --- a/src/layer/riscv/clip_riscv.cpp +++ b/src/layer/riscv/clip_riscv.cpp @@ -62,7 +62,7 @@ int Clip_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = 
size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfmax_vf_f32m8(_p, min, vl); @@ -107,7 +107,7 @@ int Clip_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = vfmax_vf_f32m8(_p, min, vl); @@ -139,7 +139,7 @@ int Clip_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfmax_vf_f16m8(_p, min, vl); diff --git a/src/layer/riscv/concat_riscv.cpp b/src/layer/riscv/concat_riscv.cpp index d80d9985b479..5736fd25dcd2 100644 --- a/src/layer/riscv/concat_riscv.cpp +++ b/src/layer/riscv/concat_riscv.cpp @@ -143,7 +143,7 @@ int Concat_riscv::forward(const std::vector& bottom_blobs, std::vector #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); for (int i = 0; i < bottom_blob.h; i++) { @@ -266,7 +266,7 @@ int Concat_riscv::forward(const std::vector& bottom_blobs, std::vector #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int size = bottom_blob.w * bottom_blob.h; @@ -487,7 +487,7 @@ int Concat_riscv::forward_bf16s_fp16s(const std::vector& bottom_blobs, std: #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); for (int i = 0; i < bottom_blob.h; i++) { @@ -610,7 +610,7 @@ int Concat_riscv::forward_bf16s_fp16s(const std::vector& bottom_blobs, std: #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int size = bottom_blob.w * bottom_blob.h; diff --git a/src/layer/riscv/convolution1d_riscv.cpp b/src/layer/riscv/convolution1d_riscv.cpp index a956d394f175..483aa511672c 100644 --- a/src/layer/riscv/convolution1d_riscv.cpp +++ b/src/layer/riscv/convolution1d_riscv.cpp @@ -119,7 +119,7 @@ int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -476,7 +476,7 @@ int Convolution1D_riscv::create_pipeline_fp16s(const Option& opt) int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -697,7 +697,7 @@ int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/convolution_1x1_packn.h b/src/layer/riscv/convolution_1x1_packn.h index 8f55d260abc9..31bf72ba3d08 100644 --- a/src/layer/riscv/convolution_1x1_packn.h +++ 
b/src/layer/riscv/convolution_1x1_packn.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_blob, con static void conv1x1s2_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_1x1_packn_fp16s.h b/src/layer/riscv/convolution_1x1_packn_fp16s.h index 110d61dc121e..5ac3f8967cea 100644 --- a/src/layer/riscv/convolution_1x1_packn_fp16s.h +++ b/src/layer/riscv/convolution_1x1_packn_fp16s.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_bl static void conv1x1s2_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_1x1_packnto1.h b/src/layer/riscv/convolution_1x1_packnto1.h index 0cd1747586e9..a3e1204a3254 100644 --- a/src/layer/riscv/convolution_1x1_packnto1.h +++ b/src/layer/riscv/convolution_1x1_packnto1.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, static void conv1x1s2_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_1x1_packnto1_fp16s.h b/src/layer/riscv/convolution_1x1_packnto1_fp16s.h index 04e86f97dca4..10591ab27f29 100644 --- a/src/layer/riscv/convolution_1x1_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_1x1_packnto1_fp16s.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top static void conv1x1s2_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_3x3_pack1ton.h b/src/layer/riscv/convolution_3x3_pack1ton.h index bb123ef8997e..9adcfb1e263c 100644 --- a/src/layer/riscv/convolution_3x3_pack1ton.h +++ b/src/layer/riscv/convolution_3x3_pack1ton.h @@ -15,7 +15,7 @@ static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int inch = bottom_blob.c; int outw = top_blob.w; @@ -290,7 +290,7 @@ static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const static void conv3x3s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h b/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h index e25c7d09097c..bff24a0099f9 
100644 --- a/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int inch = bottom_blob.c; int outw = top_blob.w; @@ -290,7 +290,7 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, static void conv3x3s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_7x7_pack1ton.h b/src/layer/riscv/convolution_7x7_pack1ton.h index 06c4dfe2f6a2..3605ed027cdc 100644 --- a/src/layer/riscv/convolution_7x7_pack1ton.h +++ b/src/layer/riscv/convolution_7x7_pack1ton.h @@ -15,7 +15,7 @@ static void conv7x7s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h b/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h index 91ee1b7d8269..01804bf391da 100644 --- a/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void conv7x7s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_pack1ton.h b/src/layer/riscv/convolution_pack1ton.h index f667f4d5d09f..15eec7badd98 100644 --- a/src/layer/riscv/convolution_pack1ton.h +++ b/src/layer/riscv/convolution_pack1ton.h @@ -15,7 +15,7 @@ static void convolution_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_pack1ton_fp16s.h b/src/layer/riscv/convolution_pack1ton_fp16s.h index fc4861730316..6f8c649e632d 100644 --- a/src/layer/riscv/convolution_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void convolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -95,7 +95,7 @@ static void convolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob static void 
convolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_packn.h b/src/layer/riscv/convolution_packn.h index c9b51d07881a..9d18c1d858e4 100644 --- a/src/layer/riscv/convolution_packn.h +++ b/src/layer/riscv/convolution_packn.h @@ -15,7 +15,7 @@ static void convolution_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packn, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_packn_fp16s.h b/src/layer/riscv/convolution_packn_fp16s.h index 8ae4468495af..1f7b308e846e 100644 --- a/src/layer/riscv/convolution_packn_fp16s.h +++ b/src/layer/riscv/convolution_packn_fp16s.h @@ -15,7 +15,7 @@ static void convolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -100,7 +100,7 @@ static void convolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, c static void convolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_packnto1.h b/src/layer/riscv/convolution_packnto1.h index 7eda38580837..4c66116d20e8 100644 --- a/src/layer/riscv/convolution_packnto1.h +++ b/src/layer/riscv/convolution_packnto1.h @@ -15,7 +15,7 @@ static void convolution_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packnto1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_packnto1_fp16s.h b/src/layer/riscv/convolution_packnto1_fp16s.h index 63aefbb5d5a0..83efd3081f83 100644 --- a/src/layer/riscv/convolution_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_packnto1_fp16s.h @@ -15,7 +15,7 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, 
int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -109,7 +109,7 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob static void convolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_sgemm.h b/src/layer/riscv/convolution_sgemm.h index c62db6c78ee1..801b7cc456f2 100644 --- a/src/layer/riscv/convolution_sgemm.h +++ b/src/layer/riscv/convolution_sgemm.h @@ -16,7 +16,7 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& { #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif // Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_sgemm_fp16s.h b/src/layer/riscv/convolution_sgemm_fp16s.h index 5cd5ea8a31e5..72a621641dbb 100644 --- a/src/layer/riscv/convolution_sgemm_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_fp16s.h @@ -16,7 +16,7 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con { #if __riscv_vector const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); #endif // Mat bottom_im2col(size, maxk, inch, 2u, 1, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_sgemm_pack1ton.h b/src/layer/riscv/convolution_sgemm_pack1ton.h index bc2f558a6d9b..8a3e6ffbc437 100644 --- a/src/layer/riscv/convolution_sgemm_pack1ton.h +++ b/src/layer/riscv/convolution_sgemm_pack1ton.h @@ -15,7 +15,7 @@ static void im2col_sgemm_pack1ton_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); // Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h b/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h index c3590a6ed6b1..0c0b2791a8f8 100644 --- a/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void im2col_sgemm_pack1ton_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); // Mat bottom_im2col(size, maxk, inch, 2u, 1, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_sgemm_packn.h b/src/layer/riscv/convolution_sgemm_packn.h index 88518a23136a..9255c092ae40 100644 --- a/src/layer/riscv/convolution_sgemm_packn.h +++ b/src/layer/riscv/convolution_sgemm_packn.h @@ -15,7 +15,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const 
Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); // Mat bottom_im2col(size, maxk, inch, 4u * packn, packn, opt.workspace_allocator); @@ -78,7 +78,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons vfloat32m1_t _val5 = vle32_v_f32m1(img0 + packn * 5, vl); vfloat32m1_t _val6 = vle32_v_f32m1(img0 + packn * 6, vl); vfloat32m1_t _val7 = vle32_v_f32m1(img0 + packn * 7, vl); - vsseg8e32_v_f32m1x8(tmpptr, vcreate_f32m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); img0 += size * packn; tmpptr += packn * 8; @@ -119,7 +119,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); vfloat32m1_t _val2 = vle32_v_f32m1(img0 + packn * 2, vl); vfloat32m1_t _val3 = vle32_v_f32m1(img0 + packn * 3, vl); - vsseg4e32_v_f32m1x4(tmpptr, vcreate_f32m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, vl); img0 += size * packn; tmpptr += packn * 4; @@ -156,7 +156,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons #else vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); - vsseg2e32_v_f32m1x2(tmpptr, vcreate_f32m1x2(_val0, _val1), vl); + vsseg2e32_v_f32m1(tmpptr, _val0, _val1, vl); img0 += size * packn; tmpptr += packn * 2; @@ -363,7 +363,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons static void convolution_im2col_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_sgemm_packn_fp16s.h b/src/layer/riscv/convolution_sgemm_packn_fp16s.h index 977dc38204a9..cb3b65196ed1 100644 --- a/src/layer/riscv/convolution_sgemm_packn_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_packn_fp16s.h @@ -15,7 +15,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); // Mat bottom_im2col(size, maxk, inch, 2u * packn, packn, opt.workspace_allocator); @@ -109,7 +109,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo vfloat16m1_t _val5 = vle16_v_f16m1(img0 + packn * 5, vl); vfloat16m1_t _val6 = vle16_v_f16m1(img0 + packn * 6, vl); vfloat16m1_t _val7 = vle16_v_f16m1(img0 + packn * 7, vl); - vsseg8e16_v_f16m1x8(tmpptr, vcreate_f16m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); img0 += size * packn; tmpptr += packn * 8; @@ -172,7 +172,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); vfloat16m1_t _val2 = vle16_v_f16m1(img0 + packn * 2, vl); vfloat16m1_t _val3 = vle16_v_f16m1(img0 + packn * 3, vl); - vsseg4e16_v_f16m1x4(tmpptr, vcreate_f16m1x4(_val0, _val1, _val2, 
_val3), vl); + vsseg4e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, vl); img0 += size * packn; tmpptr += packn * 4; @@ -228,7 +228,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo #else vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); - vsseg2e16_v_f16m1x2(tmpptr, vcreate_f16m1x2(_val0, _val1), vl); + vsseg2e16_v_f16m1(tmpptr, _val0, _val1, vl); img0 += size * packn; tmpptr += packn * 2; @@ -435,7 +435,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo static void convolution_im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_sgemm_packnto1.h b/src/layer/riscv/convolution_sgemm_packnto1.h index 212cf98b39b9..2df2c7d76563 100644 --- a/src/layer/riscv/convolution_sgemm_packnto1.h +++ b/src/layer/riscv/convolution_sgemm_packnto1.h @@ -15,7 +15,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); // Mat bottom_im2col(size, maxk, inch, 4u * packn, packn, opt.workspace_allocator); @@ -77,7 +77,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vfloat32m1_t _val5 = vle32_v_f32m1(img0 + packn * 5, vl); vfloat32m1_t _val6 = vle32_v_f32m1(img0 + packn * 6, vl); vfloat32m1_t _val7 = vle32_v_f32m1(img0 + packn * 7, vl); - vsseg8e32_v_f32m1x8(tmpptr, vcreate_f32m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); img0 += size * packn; tmpptr += packn * 8; @@ -118,7 +118,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); vfloat32m1_t _val2 = vle32_v_f32m1(img0 + packn * 2, vl); vfloat32m1_t _val3 = vle32_v_f32m1(img0 + packn * 3, vl); - vsseg4e32_v_f32m1x4(tmpptr, vcreate_f32m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, vl); img0 += size * packn; tmpptr += packn * 4; @@ -155,7 +155,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c #else vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); - vsseg2e32_v_f32m1x2(tmpptr, vcreate_f32m1x2(_val0, _val1), vl); + vsseg2e32_v_f32m1(tmpptr, _val0, _val1, vl); img0 += size * packn; tmpptr += packn * 2; @@ -190,6 +190,14 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c int nn_outch = outch / packn; int remain_outch_start = nn_outch * packn; +#ifdef __clang__ + // clang complains about VLA in the following loop + float* _zero_tmp = new float[packn](); + for (int _zero_clean_idx = 0; _zero_clean_idx < packn; _zero_clean_idx++) + { + _zero_tmp[_zero_clean_idx] = 0.f; + } +#endif // __clang__ #pragma omp parallel for num_threads(opt.num_threads) for (int pp = 0; pp < nn_outch; pp++) { @@ -197,7 +205,11 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c 
float* outptr0 = top_blob.channel(p); +#ifdef __clang__ + const float* zeros = _zero_tmp; +#else const float zeros[packn] = {0.f}; +#endif // __clang__ const float* biasptr = bias ? bias + p : zeros; int i = 0; @@ -250,7 +262,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vsse32_v_f32m1(outptr0 + 6, top_blob.cstep * sizeof(float), _sum6, vl); vsse32_v_f32m1(outptr0 + 7, top_blob.cstep * sizeof(float), _sum7, vl); #else - vssseg8e32_v_f32m1x8(outptr0, top_blob.cstep * sizeof(float), vcreate_f32m1x8(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7), vl); + vssseg8e32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, vl); #endif outptr0 += 8; } @@ -287,7 +299,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vsse32_v_f32m1(outptr0 + 2, top_blob.cstep * sizeof(float), _sum2, vl); vsse32_v_f32m1(outptr0 + 3, top_blob.cstep * sizeof(float), _sum3, vl); #else - vssseg4e32_v_f32m1x4(outptr0, top_blob.cstep * sizeof(float), vcreate_f32m1x4(_sum0, _sum1, _sum2, _sum3), vl); + vssseg4e32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, _sum1, _sum2, _sum3, vl); #endif outptr0 += 4; } @@ -316,7 +328,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl); vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl); #else - vssseg2e32_v_f32m1x2(outptr0, top_blob.cstep * sizeof(float), vcreate_f32m1x2(_sum0, _sum1), vl); + vssseg2e32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, _sum1, vl); #endif outptr0 += 2; } @@ -343,6 +355,9 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c outptr0 += 1; } } +#ifdef __clang__ + delete[] _zero_tmp; +#endif #pragma omp parallel for num_threads(opt.num_threads) for (int p = remain_outch_start; p < outch; p++) @@ -379,16 +394,24 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c for (int j = 0; j < nn; j++) { - vfloat32m1x8_t _val01 = vlseg8e32_v_f32m1x8(tmpptr, vl); + vfloat32m1_t _val0; + vfloat32m1_t _val1; + vfloat32m1_t _val2; + vfloat32m1_t _val3; + vfloat32m1_t _val4; + vfloat32m1_t _val5; + vfloat32m1_t _val6; + vfloat32m1_t _val7; + vlseg8e32_v_f32m1(&_val0, &_val1, &_val2, &_val3, &_val4, &_val5, &_val6, &_val7, tmpptr, vl); vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, vget_f32m1x8_f32m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, vget_f32m1x8_f32m1(_val01, 1), _w0, vl); - _sum2 = vfmacc_vv_f32m1(_sum2, vget_f32m1x8_f32m1(_val01, 2), _w0, vl); - _sum3 = vfmacc_vv_f32m1(_sum3, vget_f32m1x8_f32m1(_val01, 3), _w0, vl); - _sum4 = vfmacc_vv_f32m1(_sum4, vget_f32m1x8_f32m1(_val01, 4), _w0, vl); - _sum5 = vfmacc_vv_f32m1(_sum5, vget_f32m1x8_f32m1(_val01, 5), _w0, vl); - _sum6 = vfmacc_vv_f32m1(_sum6, vget_f32m1x8_f32m1(_val01, 6), _w0, vl); - _sum7 = vfmacc_vv_f32m1(_sum7, vget_f32m1x8_f32m1(_val01, 7), _w0, vl); + _sum0 = vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f32m1(_sum1, _val1, _w0, vl); + _sum2 = vfmacc_vv_f32m1(_sum2, _val2, _w0, vl); + _sum3 = vfmacc_vv_f32m1(_sum3, _val3, _w0, vl); + _sum4 = vfmacc_vv_f32m1(_sum4, _val4, _w0, vl); + _sum5 = vfmacc_vv_f32m1(_sum5, _val5, _w0, vl); + _sum6 = vfmacc_vv_f32m1(_sum6, _val6, _w0, vl); + _sum7 = vfmacc_vv_f32m1(_sum7, _val7, _w0, vl); tmpptr += packn * 8; kptr0 += packn; } @@ -463,12 +486,16 @@ static void 
im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c for (int j = 0; j < nn; j++) { - vfloat32m1x4_t _val01 = vlseg4e32_v_f32m1x4(tmpptr, vl); + vfloat32m1_t _val0; + vfloat32m1_t _val1; + vfloat32m1_t _val2; + vfloat32m1_t _val3; + vlseg4e32_v_f32m1(&_val0, &_val1, &_val2, &_val3, tmpptr, vl); vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, vget_f32m1x4_f32m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, vget_f32m1x4_f32m1(_val01, 1), _w0, vl); - _sum2 = vfmacc_vv_f32m1(_sum2, vget_f32m1x4_f32m1(_val01, 2), _w0, vl); - _sum3 = vfmacc_vv_f32m1(_sum3, vget_f32m1x4_f32m1(_val01, 3), _w0, vl); + _sum0 = vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f32m1(_sum1, _val1, _w0, vl); + _sum2 = vfmacc_vv_f32m1(_sum2, _val2, _w0, vl); + _sum3 = vfmacc_vv_f32m1(_sum3, _val3, _w0, vl); tmpptr += packn * 4; kptr0 += packn; } @@ -519,10 +546,12 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c for (int j = 0; j < nn; j++) { - vfloat32m1x2_t _val01 = vlseg2e32_v_f32m1x2(tmpptr, vl); + vfloat32m1_t _val0; + vfloat32m1_t _val1; + vlseg2e32_v_f32m1(&_val0, &_val1, tmpptr, vl); vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, vget_f32m1x2_f32m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, vget_f32m1x2_f32m1(_val01, 1), _w0, vl); + _sum0 = vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f32m1(_sum1, _val1, _w0, vl); tmpptr += packn * 2; kptr0 += packn; } @@ -648,7 +677,7 @@ static void convolution_im2col_sgemm_transform_kernel_packnto1_rvv(const Mat& _k static void convolution_im2col_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h b/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h index d6dd867397c4..925713d9826b 100644 --- a/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h @@ -15,7 +15,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); // Mat bottom_im2col(size, maxk, inch, 2u * packn, packn, opt.workspace_allocator); @@ -77,7 +77,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vfloat16m1_t _val5 = vle16_v_f16m1(img0 + packn * 5, vl); vfloat16m1_t _val6 = vle16_v_f16m1(img0 + packn * 6, vl); vfloat16m1_t _val7 = vle16_v_f16m1(img0 + packn * 7, vl); - vsseg8e16_v_f16m1x8(tmpptr, vcreate_f16m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); img0 += size * packn; tmpptr += packn * 8; @@ -118,7 +118,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); vfloat16m1_t _val2 = vle16_v_f16m1(img0 + packn * 2, vl); vfloat16m1_t _val3 = vle16_v_f16m1(img0 + packn * 3, vl); - vsseg4e16_v_f16m1x4(tmpptr, vcreate_f16m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e16_v_f16m1(tmpptr, _val0, 
_val1, _val2, _val3, vl); img0 += size * packn; tmpptr += packn * 4; @@ -155,7 +155,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ #else vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); - vsseg2e16_v_f16m1x2(tmpptr, vcreate_f16m1x2(_val0, _val1), vl); + vsseg2e16_v_f16m1(tmpptr, _val0, _val1, vl); img0 += size * packn; tmpptr += packn * 2; @@ -190,6 +190,14 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ int nn_outch = outch / packn; int remain_outch_start = nn_outch * packn; + // make clang happy with the following loop +#ifdef __clang__ + __fp16* _zero_tmp = new __fp16[packn](); + for (int _zero_clean_idx = 0; _zero_clean_idx < packn; _zero_clean_idx++) + { + _zero_tmp[_zero_clean_idx] = 0.f; + } +#endif // __clang__ #pragma omp parallel for num_threads(opt.num_threads) for (int pp = 0; pp < nn_outch; pp++) { @@ -197,7 +205,11 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ __fp16* outptr0 = top_blob.channel(p); +#ifdef __clang__ + const __fp16* zeros = _zero_tmp; +#else const __fp16 zeros[packn] = {0.f}; +#endif // __clang__ const __fp16* biasptr = bias ? bias + p : zeros; int i = 0; @@ -250,7 +262,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vsse16_v_f16m1(outptr0 + 6, top_blob.cstep * sizeof(__fp16), _sum6, vl); vsse16_v_f16m1(outptr0 + 7, top_blob.cstep * sizeof(__fp16), _sum7, vl); #else - vssseg8e16_v_f16m1x8(outptr0, top_blob.cstep * sizeof(__fp16), vcreate_f16m1x8(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7), vl); + vssseg8e16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, vl); #endif outptr0 += 8; } @@ -287,7 +299,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vsse16_v_f16m1(outptr0 + 2, top_blob.cstep * sizeof(__fp16), _sum2, vl); vsse16_v_f16m1(outptr0 + 3, top_blob.cstep * sizeof(__fp16), _sum3, vl); #else - vssseg4e16_v_f16m1x4(outptr0, top_blob.cstep * sizeof(__fp16), vcreate_f16m1x4(_sum0, _sum1, _sum2, _sum3), vl); + vssseg4e16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, _sum1, _sum2, _sum3, vl); #endif outptr0 += 4; } @@ -316,7 +328,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl); vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl); #else - vssseg2e16_v_f16m1x2(outptr0, top_blob.cstep * sizeof(__fp16), vcreate_f16m1x2(_sum0, _sum1), vl); + vssseg2e16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, _sum1, vl); #endif outptr0 += 2; } @@ -343,6 +355,9 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ outptr0 += 1; } } +#ifdef __clang__ + delete[] _zero_tmp; +#endif // __clang__ #pragma omp parallel for num_threads(opt.num_threads) for (int p = remain_outch_start; p < outch; p++) @@ -379,16 +394,24 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ for (int j = 0; j < nn; j++) { - vfloat16m1x8_t _val01 = vlseg8e16_v_f16m1x8(tmpptr, vl); + vfloat16m1_t _val0; + vfloat16m1_t _val1; + vfloat16m1_t _val2; + vfloat16m1_t _val3; + vfloat16m1_t _val4; + vfloat16m1_t _val5; + vfloat16m1_t _val6; + vfloat16m1_t _val7; + vlseg8e16_v_f16m1(&_val0, &_val1, &_val2, &_val3, &_val4, &_val5, &_val6, &_val7, tmpptr, vl); vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = 
vfmacc_vv_f16m1(_sum0, vget_f16m1x8_f16m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, vget_f16m1x8_f16m1(_val01, 1), _w0, vl); - _sum2 = vfmacc_vv_f16m1(_sum2, vget_f16m1x8_f16m1(_val01, 2), _w0, vl); - _sum3 = vfmacc_vv_f16m1(_sum3, vget_f16m1x8_f16m1(_val01, 3), _w0, vl); - _sum4 = vfmacc_vv_f16m1(_sum4, vget_f16m1x8_f16m1(_val01, 4), _w0, vl); - _sum5 = vfmacc_vv_f16m1(_sum5, vget_f16m1x8_f16m1(_val01, 5), _w0, vl); - _sum6 = vfmacc_vv_f16m1(_sum6, vget_f16m1x8_f16m1(_val01, 6), _w0, vl); - _sum7 = vfmacc_vv_f16m1(_sum7, vget_f16m1x8_f16m1(_val01, 7), _w0, vl); + _sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl); + _sum2 = vfmacc_vv_f16m1(_sum2, _val2, _w0, vl); + _sum3 = vfmacc_vv_f16m1(_sum3, _val3, _w0, vl); + _sum4 = vfmacc_vv_f16m1(_sum4, _val4, _w0, vl); + _sum5 = vfmacc_vv_f16m1(_sum5, _val5, _w0, vl); + _sum6 = vfmacc_vv_f16m1(_sum6, _val6, _w0, vl); + _sum7 = vfmacc_vv_f16m1(_sum7, _val7, _w0, vl); tmpptr += packn * 8; kptr0 += packn; } @@ -463,12 +486,17 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ for (int j = 0; j < nn; j++) { - vfloat16m1x4_t _val01 = vlseg4e16_v_f16m1x4(tmpptr, vl); + vfloat16m1_t _val0; + vfloat16m1_t _val1; + vfloat16m1_t _val2; + vfloat16m1_t _val3; + + vlseg4e16_v_f16m1(&_val0, &_val1, &_val2, &_val3, tmpptr, vl); vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, vget_f16m1x4_f16m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, vget_f16m1x4_f16m1(_val01, 1), _w0, vl); - _sum2 = vfmacc_vv_f16m1(_sum2, vget_f16m1x4_f16m1(_val01, 2), _w0, vl); - _sum3 = vfmacc_vv_f16m1(_sum3, vget_f16m1x4_f16m1(_val01, 3), _w0, vl); + _sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl); + _sum2 = vfmacc_vv_f16m1(_sum2, _val2, _w0, vl); + _sum3 = vfmacc_vv_f16m1(_sum3, _val3, _w0, vl); tmpptr += packn * 4; kptr0 += packn; } @@ -519,10 +547,12 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ for (int j = 0; j < nn; j++) { - vfloat16m1x2_t _val01 = vlseg2e16_v_f16m1x2(tmpptr, vl); + vfloat16m1_t _val0; + vfloat16m1_t _val1; + vlseg2e16_v_f16m1(&_val0, &_val1, tmpptr, vl); vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, vget_f16m1x2_f16m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, vget_f16m1x2_f16m1(_val01, 1), _w0, vl); + _sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl); tmpptr += packn * 2; kptr0 += packn; } @@ -648,7 +678,7 @@ static void convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(const static void convolution_im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_winograd_dot.h b/src/layer/riscv/convolution_winograd_dot.h index 8ea6bc9c5761..c0a7b7680f81 100644 --- a/src/layer/riscv/convolution_winograd_dot.h +++ b/src/layer/riscv/convolution_winograd_dot.h @@ -16,7 +16,7 @@ static void convolution_winograd_dot_rvv(Mat& bottom_blob_tm, int outch, const M { #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = 
vsetvl_e32m1(packn); #endif // Mat bottom_blob_tm(tiles, 16/36/64, inch, 4u, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_winograd_dot_packn.h b/src/layer/riscv/convolution_winograd_dot_packn.h index 434eaa00c681..1c505d5c2e18 100644 --- a/src/layer/riscv/convolution_winograd_dot_packn.h +++ b/src/layer/riscv/convolution_winograd_dot_packn.h @@ -15,7 +15,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); // Mat bottom_blob_tm(tiles, 16/36/64, inch, 4u * packn, packn, opt.workspace_allocator); @@ -75,7 +75,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c vfloat32m1_t _val5 = vle32_v_f32m1(r0 + packn * 5, vl); vfloat32m1_t _val6 = vle32_v_f32m1(r0 + packn * 6, vl); vfloat32m1_t _val7 = vle32_v_f32m1(r0 + packn * 7, vl); - vsseg8e32_v_f32m1x8(tmpptr, vcreate_f32m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 8; @@ -108,7 +108,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c vfloat32m1_t _val1 = vle32_v_f32m1(r0 + packn, vl); vfloat32m1_t _val2 = vle32_v_f32m1(r0 + packn * 2, vl); vfloat32m1_t _val3 = vle32_v_f32m1(r0 + packn * 3, vl); - vsseg4e32_v_f32m1x4(tmpptr, vcreate_f32m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 4; @@ -137,7 +137,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c #else vfloat32m1_t _val0 = vle32_v_f32m1(r0, vl); vfloat32m1_t _val1 = vle32_v_f32m1(r0 + packn, vl); - vsseg2e32_v_f32m1x2(tmpptr, vcreate_f32m1x2(_val0, _val1), vl); + vsseg2e32_v_f32m1(tmpptr, _val0, _val1, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 2; diff --git a/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h b/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h index 0b731519426f..ed35ad3e3785 100644 --- a/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h +++ b/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h @@ -15,7 +15,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); // Mat bottom_blob_tm(tiles, 16/36/64, inch, 2u * packn, packn, opt.workspace_allocator); @@ -75,7 +75,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o vfloat16m1_t _val5 = vle16_v_f16m1(r0 + packn * 5, vl); vfloat16m1_t _val6 = vle16_v_f16m1(r0 + packn * 6, vl); vfloat16m1_t _val7 = vle16_v_f16m1(r0 + packn * 7, vl); - vsseg8e16_v_f16m1x8(tmpptr, vcreate_f16m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 8; @@ -108,7 +108,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o vfloat16m1_t _val1 = vle16_v_f16m1(r0 + packn, vl); vfloat16m1_t _val2 = vle16_v_f16m1(r0 + packn * 2, vl); vfloat16m1_t _val3 = vle16_v_f16m1(r0 + packn * 3, vl); - 
vsseg4e16_v_f16m1x4(tmpptr, vcreate_f16m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 4; @@ -137,7 +137,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o #else vfloat16m1_t _val0 = vle16_v_f16m1(r0, vl); vfloat16m1_t _val1 = vle16_v_f16m1(r0 + packn, vl); - vsseg2e16_v_f16m1x2(tmpptr, vcreate_f16m1x2(_val0, _val1), vl); + vsseg2e16_v_f16m1(tmpptr, _val0, _val1, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 2; diff --git a/src/layer/riscv/convolution_winograd_transform_packn.h b/src/layer/riscv/convolution_winograd_transform_packn.h index db3a05aa92f4..f5a529707598 100644 --- a/src/layer/riscv/convolution_winograd_transform_packn.h +++ b/src/layer/riscv/convolution_winograd_transform_packn.h @@ -15,7 +15,7 @@ static void conv3x3s1_winograd63_transform_input_packn_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -180,7 +180,7 @@ static void conv3x3s1_winograd63_transform_input_packn_rvv(const Mat& bottom_blo static void conv3x3s1_winograd63_transform_output_packn_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -323,7 +323,7 @@ static void conv3x3s1_winograd63_transform_output_packn_rvv(const Mat& top_blob_ static void conv3x3s1_winograd43_transform_input_packn_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -436,7 +436,7 @@ static void conv3x3s1_winograd43_transform_input_packn_rvv(const Mat& bottom_blo static void conv3x3s1_winograd43_transform_output_packn_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -553,7 +553,7 @@ static void conv3x3s1_winograd43_transform_output_packn_rvv(const Mat& top_blob_ static void conv3x3s1_winograd23_transform_input_packn_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -646,7 +646,7 @@ static void conv3x3s1_winograd23_transform_input_packn_rvv(const Mat& bottom_blo static void conv3x3s1_winograd23_transform_output_packn_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; diff --git a/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h b/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h index b1b1ad9f54d8..2404a8a40928 100644 --- a/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h +++ b/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h @@ 
-15,7 +15,7 @@ static void conv3x3s1_winograd63_transform_input_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -180,7 +180,7 @@ static void conv3x3s1_winograd63_transform_input_packn_fp16sa_rvv(const Mat& bot static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -323,7 +323,7 @@ static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& to static void conv3x3s1_winograd43_transform_input_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -436,7 +436,7 @@ static void conv3x3s1_winograd43_transform_input_packn_fp16sa_rvv(const Mat& bot static void conv3x3s1_winograd43_transform_output_packn_fp16sa_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -553,7 +553,7 @@ static void conv3x3s1_winograd43_transform_output_packn_fp16sa_rvv(const Mat& to static void conv3x3s1_winograd23_transform_input_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -646,7 +646,7 @@ static void conv3x3s1_winograd23_transform_input_packn_fp16sa_rvv(const Mat& bot static void conv3x3s1_winograd23_transform_output_packn_fp16sa_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; diff --git a/src/layer/riscv/convolutiondepthwise_3x3_packn.h b/src/layer/riscv/convolutiondepthwise_3x3_packn.h index d8aa0ec4ee03..0cab1af0802b 100644 --- a/src/layer/riscv/convolutiondepthwise_3x3_packn.h +++ b/src/layer/riscv/convolutiondepthwise_3x3_packn.h @@ -15,7 +15,7 @@ static void convdw3x3s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; @@ -302,7 +302,7 @@ static void convdw3x3s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M static void convdw3x3s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h b/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h index c3d73053beaf..d479385f6a2c 100644 --- 
a/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h +++ b/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h @@ -15,7 +15,7 @@ static void convdw3x3s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; @@ -302,7 +302,7 @@ static void convdw3x3s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, static void convdw3x3s2_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_5x5_packn.h b/src/layer/riscv/convolutiondepthwise_5x5_packn.h index cd35ef8e816c..2ef2fea74551 100644 --- a/src/layer/riscv/convolutiondepthwise_5x5_packn.h +++ b/src/layer/riscv/convolutiondepthwise_5x5_packn.h @@ -15,7 +15,7 @@ static void convdw5x5s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; @@ -336,7 +336,7 @@ static void convdw5x5s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M static void convdw5x5s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h b/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h index 1647f96db8cc..08270e307c9f 100644 --- a/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h +++ b/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h @@ -15,7 +15,7 @@ static void convdw5x5s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; @@ -336,7 +336,7 @@ static void convdw5x5s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, static void convdw5x5s2_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index e33360e06092..eb39ac0baa7b 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -282,7 +282,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -710,7 +710,7 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); 
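
Most hunks in this patch are the same mechanical substitution: the value returned by vsetvl_* is now held in plain size_t, which is the return type in the current RVV intrinsic API, instead of the retired word_type alias. For reference, the strip-mining pattern these loops all share looks like the following minimal sketch (modeled on the dropout hunk further below; the helper name is illustrative and not part of the patch):

    #include <riscv_vector.h>

    // Scale a float buffer in place, one vector-length chunk per iteration.
    // vsetvl_e32m8() now returns size_t (formerly typedef'd as word_type).
    static void scale_inplace(float* ptr, int n, float scale)
    {
        while (n > 0)
        {
            size_t vl = vsetvl_e32m8(n);              // elements handled this round
            vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); // unit-stride load
            _p = vfmul_vf_f32m8(_p, scale, vl);
            vse32_v_f32m8(ptr, _p, vl);               // unit-stride store
            ptr += vl;
            n -= vl;
        }
    }
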
int w = bottom_blob.w; int h = bottom_blob.h; @@ -920,7 +920,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/crop_riscv.cpp b/src/layer/riscv/crop_riscv.cpp index f7b44efd1a12..80e76fc47b47 100644 --- a/src/layer/riscv/crop_riscv.cpp +++ b/src/layer/riscv/crop_riscv.cpp @@ -43,7 +43,7 @@ static void crop_packn_rvv(const Mat& src, Mat& dst, int top, int left, int pack int h = dst.h; int right = src.w - dst.w - left; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const float* ptr = src.row(top) + left * packn; float* outptr = dst; @@ -69,7 +69,7 @@ static void crop_packn_bf16_fp16s_rvv(const Mat& src, Mat& dst, int top, int lef int h = dst.h; int right = src.w - dst.w - left; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const unsigned short* ptr = src.row(top) + left * packn; unsigned short* outptr = dst; diff --git a/src/layer/riscv/deconvolution_pack1ton.h b/src/layer/riscv/deconvolution_pack1ton.h index dfbe8e01a2d5..ec18f62c1c6f 100644 --- a/src/layer/riscv/deconvolution_pack1ton.h +++ b/src/layer/riscv/deconvolution_pack1ton.h @@ -15,7 +15,7 @@ static void deconvolution_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_pack1ton_fp16s.h b/src/layer/riscv/deconvolution_pack1ton_fp16s.h index a1fcfefc254f..168c709217d2 100644 --- a/src/layer/riscv/deconvolution_pack1ton_fp16s.h +++ b/src/layer/riscv/deconvolution_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void deconvolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -103,7 +103,7 @@ static void deconvolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl static void deconvolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_packn.h b/src/layer/riscv/deconvolution_packn.h index 457e2b95c929..8cab6c3b0a10 100644 --- a/src/layer/riscv/deconvolution_packn.h +++ b/src/layer/riscv/deconvolution_packn.h @@ -15,7 +15,7 @@ static void deconvolution_packn_rvv(const Mat& bottom_blob, Mat& 
top_blob, const Mat& weight_data_packn, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_packn_fp16s.h b/src/layer/riscv/deconvolution_packn_fp16s.h index 46d52470ad04..62fbd2eb731c 100644 --- a/src/layer/riscv/deconvolution_packn_fp16s.h +++ b/src/layer/riscv/deconvolution_packn_fp16s.h @@ -15,7 +15,7 @@ static void deconvolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -105,7 +105,7 @@ static void deconvolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, static void deconvolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_packnto1.h b/src/layer/riscv/deconvolution_packnto1.h index ba81baf3676c..2efa9b154d2e 100644 --- a/src/layer/riscv/deconvolution_packnto1.h +++ b/src/layer/riscv/deconvolution_packnto1.h @@ -15,7 +15,7 @@ static void deconvolution_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packnto1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_packnto1_fp16s.h b/src/layer/riscv/deconvolution_packnto1_fp16s.h index 5cb0a3c49bd1..ab70100fb3bd 100644 --- a/src/layer/riscv/deconvolution_packnto1_fp16s.h +++ b/src/layer/riscv/deconvolution_packnto1_fp16s.h @@ -15,7 +15,7 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -116,7 +116,7 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl static void deconvolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 
2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp index ab20e6c4148a..b53e8962fd26 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp @@ -210,7 +210,7 @@ int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif // convolv with NxN kernel @@ -518,7 +518,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -739,7 +739,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/dropout_riscv.cpp b/src/layer/riscv/dropout_riscv.cpp index fc71db7689aa..461edf2d056d 100644 --- a/src/layer/riscv/dropout_riscv.cpp +++ b/src/layer/riscv/dropout_riscv.cpp @@ -53,7 +53,7 @@ int Dropout_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfmul_vf_f32m8(_p, scale, vl); diff --git a/src/layer/riscv/flatten_riscv.cpp b/src/layer/riscv/flatten_riscv.cpp index 325ab6f175d1..491c051c7fea 100644 --- a/src/layer/riscv/flatten_riscv.cpp +++ b/src/layer/riscv/flatten_riscv.cpp @@ -119,7 +119,7 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = vsetvl_e32m1(n); vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); vsse32_v_f32m1(outptr, w * sizeof(float), _p, vl); @@ -147,7 +147,7 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = vsetvl_e32m1(n); vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); vsse32_v_f32m1(outptr, size * sizeof(float), _p, vl); @@ -172,7 +172,7 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vse32_v_f32m8(outptr, _p, vl); @@ -262,7 +262,7 @@ int Flatten_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); vuint16m1_t _p = vle16_v_u16m1(ptr, vl); vsse16_v_u16m1(outptr, w * sizeof(unsigned short), _p, vl); @@ -290,7 +290,7 @@ int Flatten_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); vuint16m1_t _p = vle16_v_u16m1(ptr, vl); 
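
The Flatten hunks in this file pair a unit-stride load with a strided store (vsse32/vsse16) so that each lane of a packed element group lands in a different output row. Isolated from the layer code, the stride semantics are simply that lane j is written to base + j * byte_stride. A hypothetical stand-alone example (not from the patch) that writes one column of a row-major matrix:

    // mat is h rows of w floats; copy src[0..h) into column `col`.
    static void store_column(float* mat, int w, int h, int col, const float* src)
    {
        float* dst = mat + col;
        int n = h;
        while (n > 0)
        {
            size_t vl = vsetvl_e32m1(n);
            vfloat32m1_t _v = vle32_v_f32m1(src, vl);
            vsse32_v_f32m1(dst, w * sizeof(float), _v, vl); // byte stride = one row
            src += vl;
            dst += vl * w;
            n -= vl;
        }
    }
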
vsse16_v_u16m1(outptr, size * sizeof(unsigned short), _p, vl); @@ -315,7 +315,7 @@ int Flatten_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vuint16m8_t _p = vle16_v_u16m8(ptr, vl); vse16_v_u16m8(outptr, _p, vl); @@ -405,7 +405,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e8m1(n); + size_t vl = vsetvl_e8m1(n); vint8m1_t _p = vle8_v_i8m1(ptr, vl); vsse8_v_i8m1(outptr, w * sizeof(unsigned char), _p, vl); @@ -433,7 +433,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e8m1(n); + size_t vl = vsetvl_e8m1(n); vint8m1_t _p = vle8_v_i8m1(ptr, vl); vsse8_v_i8m1(outptr, size * sizeof(signed char), _p, vl); @@ -458,7 +458,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e8m8(n); + size_t vl = vsetvl_e8m8(n); vint8m8_t _p = vle8_v_i8m8(ptr, vl); vse8_v_i8m8(outptr, _p, vl); diff --git a/src/layer/riscv/gelu_riscv.cpp b/src/layer/riscv/gelu_riscv.cpp index 708e951e5a31..69b374998f32 100644 --- a/src/layer/riscv/gelu_riscv.cpp +++ b/src/layer/riscv/gelu_riscv.cpp @@ -48,7 +48,7 @@ int GELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m4(n); + size_t vl = vsetvl_e32m4(n); vfloat32m4_t _p = vle32_v_f32m4(ptr, vl); @@ -77,7 +77,7 @@ int GELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); auto _p = vle32_v_f32m8(ptr, vl); auto _perfc = vfmul_vf_f32m8(_p, -.70710678f, vl); _p = vfmul_vf_f32m8(_p, .5f, vl); diff --git a/src/layer/riscv/gru_riscv.cpp b/src/layer/riscv/gru_riscv.cpp index e45d37592ef4..28afa5081d06 100644 --- a/src/layer/riscv/gru_riscv.cpp +++ b/src/layer/riscv/gru_riscv.cpp @@ -63,7 +63,7 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_xcu = weight_xc_U; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _x = vle32_v_f32m8(ptr_x, vl); vfloat32m8_t _xcr = vle32_v_f32m8(ptr_xcr, vl); vfloat32m8_t _xcu = vle32_v_f32m8(ptr_xcu, vl); @@ -93,7 +93,7 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_hcu = weight_hc_U; while (n_out > 0) { - word_type vl = vsetvl_e32m8(n_out); + size_t vl = vsetvl_e32m8(n_out); vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc, vl); vfloat32m8_t _hcr = vle32_v_f32m8(ptr_hcr, vl); vfloat32m8_t _hcu = vle32_v_f32m8(ptr_hcu, vl); @@ -136,7 +136,7 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_whc_n = weight_hc_N; while (n_out2 > 0) { - word_type vl = vsetvl_e32m8(n_out2); + size_t vl = vsetvl_e32m8(n_out2); vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc2, vl); vfloat32m8_t _whc_n = vle32_v_f32m8(ptr_whc_n, vl); @@ -160,7 +160,7 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_xcn = weight_xc_N; while (n2 > 0) { - word_type vl = vsetvl_e32m8(n2); + size_t vl = vsetvl_e32m8(n2); vfloat32m8_t _x = vle32_v_f32m8(ptr_x2, vl); vfloat32m8_t _xcn = vle32_v_f32m8(ptr_xcn, vl); @@ -428,7 +428,7 @@ static int 
gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* ptr_xcu = weight_xc_U; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _x = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_x, vl), vl); vfloat32m8_t _xcr = vle32_v_f32m8(ptr_xcr, vl); vfloat32m8_t _xcu = vle32_v_f32m8(ptr_xcu, vl); @@ -458,7 +458,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* ptr_hcu = weight_hc_U; while (n_out > 0) { - word_type vl = vsetvl_e16m4(n_out); + size_t vl = vsetvl_e16m4(n_out); vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc, vl); vfloat32m8_t _hcr = vle32_v_f32m8(ptr_hcr, vl); vfloat32m8_t _hcu = vle32_v_f32m8(ptr_hcu, vl); @@ -501,7 +501,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* ptr_whc_n = weight_hc_N; while (n_out2 > 0) { - word_type vl = vsetvl_e16m4(n_out2); + size_t vl = vsetvl_e16m4(n_out2); vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc2, vl); vfloat32m8_t _whc_n = vle32_v_f32m8(ptr_whc_n, vl); @@ -525,7 +525,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* ptr_xcn = weight_xc_N; while (n2 > 0) { - word_type vl = vsetvl_e16m4(n2); + size_t vl = vsetvl_e16m4(n2); vfloat32m8_t _x = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_x2, vl), vl); vfloat32m8_t _xcn = vle32_v_f32m8(ptr_xcn, vl); @@ -758,7 +758,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* ptr_xcu = weight_xc_U; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _x = vle16_v_f16m8(ptr_x, vl); vfloat16m8_t _xcr = vle16_v_f16m8(ptr_xcr, vl); vfloat16m8_t _xcu = vle16_v_f16m8(ptr_xcu, vl); @@ -785,7 +785,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* ptr_hcu = weight_hc_U; while (n_out > 0) { - word_type vl = vsetvl_e16m4(n_out); + size_t vl = vsetvl_e16m4(n_out); vfloat16m4_t _h_cont = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_hc, vl), vl); vfloat16m4_t _hcr = vle16_v_f16m4(ptr_hcr, vl); vfloat16m4_t _hcu = vle16_v_f16m4(ptr_hcu, vl); @@ -825,7 +825,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* ptr_whc_n = weight_hc_N; while (n_out2 > 0) { - word_type vl = vsetvl_e16m4(n_out2); + size_t vl = vsetvl_e16m4(n_out2); vfloat16m4_t _h_cont = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_hc2, vl), vl); vfloat16m4_t _whc_n = vle16_v_f16m4(ptr_whc_n, vl); @@ -846,7 +846,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* ptr_xcn = weight_xc_N; while (n2 > 0) { - word_type vl = vsetvl_e16m8(n2); + size_t vl = vsetvl_e16m8(n2); vfloat16m8_t _x = vle16_v_f16m8(ptr_x2, vl); vfloat16m8_t _xcn = vle16_v_f16m8(ptr_xcn, vl); diff --git a/src/layer/riscv/hardsigmoid_riscv.cpp b/src/layer/riscv/hardsigmoid_riscv.cpp index 2c3bbec28862..112a1c9c8d29 100644 --- a/src/layer/riscv/hardsigmoid_riscv.cpp +++ b/src/layer/riscv/hardsigmoid_riscv.cpp @@ -60,7 +60,7 @@ int HardSigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, lower, vl); @@ -111,7 +111,7 @@ int HardSigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p 
= vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, lower, vl); diff --git a/src/layer/riscv/hardswish_riscv.cpp b/src/layer/riscv/hardswish_riscv.cpp index b60197115ca5..5d68e07b06a5 100644 --- a/src/layer/riscv/hardswish_riscv.cpp +++ b/src/layer/riscv/hardswish_riscv.cpp @@ -60,7 +60,7 @@ int HardSwish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, lower, vl); @@ -111,7 +111,7 @@ int HardSwish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, lower, vl); diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index 721c6361b8be..30dd74287776 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -198,7 +198,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt #if __riscv_vector if (elempack == packn && num_output_elempack == packn) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); float* outptr = top_blob.row(j); @@ -237,7 +237,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt if (elempack == 1 && num_output_elempack == packn) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); float* outptr = top_blob.row(j); @@ -273,7 +273,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt if (elempack == packn && num_output_elempack == 1) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); float* outptr = top_blob.row(j); @@ -372,7 +372,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); if (bias_term) @@ -414,7 +414,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt { int p = pp * packn; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); if (bias_term) @@ -595,7 +595,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con { if (elempack == packn && num_output_elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -635,7 +635,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con if (elempack == 1 && num_output_elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -672,7 +672,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con if (elempack == packn && num_output_elempack == 1) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -765,7 +765,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con #pragma omp parallel for num_threads(opt.num_threads) 
for (int p = 0; p < num_output / out_elempack; p++) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); if (bias_term) @@ -857,7 +857,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co { if (elempack == packn && num_output_elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -897,7 +897,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co if (elempack == 1 && num_output_elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -934,7 +934,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co if (elempack == packn && num_output_elempack == 1) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -1027,7 +1027,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); if (bias_term) diff --git a/src/layer/riscv/instancenorm_riscv.cpp b/src/layer/riscv/instancenorm_riscv.cpp new file mode 100644 index 000000000000..c13cf261220f --- /dev/null +++ b/src/layer/riscv/instancenorm_riscv.cpp @@ -0,0 +1,514 @@ +// Xavier Hsinyuan is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 Xavier Hsinyuan . All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
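
This new file (together with its header further below) adds a RISC-V InstanceNorm implementation. Stripped of the vector and fp16 variants, the per-channel math is the usual x = (x - mean) / sqrt(var + eps) * gamma + beta, with the variance taken over mean-centered values in a second pass rather than as E[x^2] - E[x]^2, which would lose precision. A condensed scalar sketch of what each path below computes (helper name illustrative only):

    #include <math.h>

    static void instance_norm_channel(float* ptr, int size, float gamma, float beta, float eps)
    {
        float sum = 0.f;
        for (int i = 0; i < size; i++)
            sum += ptr[i];
        const float mean = sum / size;

        float sqsum = 0.f;
        for (int i = 0; i < size; i++)
        {
            const float d = ptr[i] - mean;
            sqsum += d * d;                  // centered second pass
        }
        const float var = sqsum / size;

        const float a = gamma / sqrtf(var + eps);
        const float b = -mean * a + beta;
        for (int i = 0; i < size; i++)
            ptr[i] = ptr[i] * a + b;         // fused scale and shift
    }
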
+ +#include "instancenorm_riscv.h" + +#include + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_usability.h" + +namespace ncnn { +InstanceNorm_riscv::InstanceNorm_riscv() +{ +#if __riscv_vector + support_packing = true; +#if __riscv_zfh + support_fp16_storage = true; +#endif +#endif // __riscv_vector +} + +int InstanceNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ +// x = (x - mean) / (sqrt(var + eps)) * gamma + beta +#if __riscv_vector + int elembits = bottom_top_blob.elembits(); + if (opt.use_fp16_storage && elembits == 16) + { + if (opt.use_fp16_arithmetic) + return forward_inplace_fp16sa(bottom_top_blob, opt); + else + return forward_inplace_fp16s(bottom_top_blob, opt); + } + int elempack = bottom_top_blob.elempack; +#endif // __riscv_vector + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int c = bottom_top_blob.c; + int size = w * h; + + int dims = bottom_top_blob.dims; +#if __riscv_vector + if (elempack == 1) +#endif // __riscv_vector + { +#if __riscv_vector + size = elempack * size; +#endif + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + float* ptr = bottom_top_blob.channel(q); + + // mean and var + float sum = 0.f; + float sqsum = 0.f; +#if __riscv_vector + vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1()); + { + int n = size; + float* ptr_sum = ptr; + while (n > 0) + { + size_t vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_sum, vl); + _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl); + // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + ptr_sum += vl; + n -= vl; + } + } + sum = vfmv_f_s_f32m1_f32(_sum); +#else + for (int i = 0; i < size; i++) + { + sum += ptr[i]; + //sqsum += ptr[i] * ptr[i]; + } +#endif // __riscv_vector + float mean = sum / size; +#if __riscv_vecotr + { + int n = size; + float* ptr_sqsum = ptr; + while (n > 0) + { + size_t vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_sqsum, vl); + _p = vfsub_vf_f32m8(_p, mean, vl); + _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl); + n -= vl; + ptr_sqsum += vl; + } + } + sqsum = vfmv_f_s_f32m1_f32(_sqsum); +#else + float tmp = 0.f; + for (int i = 0; i < size; i++) + { + tmp = ptr[i] - mean; + sqsum += tmp * tmp; + } +#endif // __riscv_vector + float var = sqsum / size; + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + + float a; + float b; + if (affine) + { + float gamma = gamma_data[q]; + float beta = beta_data[q]; + + a = static_cast(gamma / (sqrt(var + eps))); + b = -mean * a + beta; + } + else + { + a = static_cast(1.f / (sqrt(var + eps))); + b = -mean * a; + } +#if __riscv_vector + { + int n = size; + float* ptr_store = ptr; + while (n > 0) + { + size_t vl = vsetvl_e32m8(n); + vfloat32m8_t _p = vle32_v_f32m8(ptr_store, vl); + _p = vfmul_vf_f32m8(_p, a, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vse32_v_f32m8(ptr_store, _p, vl); + n -= vl; + ptr_store += vl; + } + } +#else + for (int i = 0; i < size; i++) + { + ptr[i] = ptr[i] * a + b; + } +#endif // __riscv_vector + } + return 0; + } + +#if __riscv_vector + const int packn = csrr_vlenb() / 4; + if (elempack == packn) + { + const size_t vl = vsetvl_e32m1(packn); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + float* ptr = 
+            vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl);
+            vfloat32m1_t _sqsum = vfmv_v_f_f32m1(0.f, vl);
+
+            for (int i = 0; i < size; i++)
+            {
+                vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl);
+                _sum = vfadd_vv_f32m1(_p, _sum, vl);
+                // _sqsum = vfmadd_vv_f32m1(_p,_p,_sqsum,vl);
+            }
+            vfloat32m1_t _mean = vfdiv_vf_f32m1(_sum, size, vl);
+            for (int i = 0; i < size; i++)
+            {
+                vfloat32m1_t _p = vle32_v_f32m1(ptr + vl * i, vl);
+                _p = vfsub_vv_f32m1(_p, _mean, vl);
+                _sqsum = vfmadd_vv_f32m1(_p, _p, _sqsum, vl);
+            }
+            vfloat32m1_t _var = vfdiv_vf_f32m1(_sqsum, size, vl);
+            // the var maybe minus due to accuracy
+            //float var = sqsum / size - mean * mean;
+
+            vfloat32m1_t _a;
+            vfloat32m1_t _b;
+            if (affine)
+            {
+                vfloat32m1_t _gamma = vle32_v_f32m1((const float*)gamma_data + q * vl, vl);
+                vfloat32m1_t _beta = vle32_v_f32m1((const float*)beta_data + q * vl, vl);
+                _a = vfdiv_vv_f32m1(_gamma, vfsqrt_v_f32m1(vfadd_vf_f32m1(_var, eps, vl), vl), vl);
+                _b = vfnmsub_vv_f32m1(_a, _mean, _beta, vl);
+            }
+            else
+            {
+                _a = vfrdiv_vf_f32m1(vfsqrt_v_f32m1(vfadd_vf_f32m1(_var, eps, vl), vl), 1.f, vl);
+                _b = vfmul_vv_f32m1(_a, _mean, vl);
+                _b = vfsgnjn_vv_f32m1(_b, _b, vl);
+            }
+            for (int i = 0; i < size; i++)
+            {
+                vfloat32m1_t _p = vle32_v_f32m1(ptr + i * vl, vl);
+                _p = vfmadd_vv_f32m1(_p, _a, _b, vl);
+                vse32_v_f32m1(ptr + i * vl, _p, vl);
+            }
+        }
+        return 0;
+    }
+#endif // __riscv_vector
+    return 0;
+}
+
+#if __riscv_vector && __riscv_zfh
+int InstanceNorm_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
+{
+    // x = (x - mean) / (sqrt(var + eps)) * gamma + beta
+
+    int elempack = bottom_top_blob.elempack;
+
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int c = bottom_top_blob.c;
+    int size = w * h;
+
+    int dims = bottom_top_blob.dims;
+    if (elempack == 1)
+    {
+        size = elempack * size;
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < c; q++)
+        {
+            __fp16* ptr = bottom_top_blob.channel(q);
+
+            // mean and var
+            float sum = 0.f;
+            float sqsum = 0.f;
+            vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+            vfloat32m1_t _sqsum = vfmv_s_f_f32m1(vundefined_f32m1(), 0.f, vsetvlmax_e32m1());
+            {
+                int n = size;
+                __fp16* ptr_sum = ptr;
+                while (n > 0)
+                {
+                    size_t vl = vsetvl_e32m8(n);
+                    vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sum, vl), vl);
+                    _sum = vfredusum_vs_f32m8_f32m1(_sum, _p, /* scalar */ _sum, vl);
+                    // _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+                    ptr_sum += vl;
+                    n -= vl;
+                }
+            }
+            sum = vfmv_f_s_f32m1_f32(_sum);
+            float mean = sum / size;
+            {
+                int n = size;
+                __fp16* ptr_sqsum = ptr;
+                while (n > 0)
+                {
+                    size_t vl = vsetvl_e32m8(n);
+                    vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_sqsum, vl), vl);
+                    _p = vfsub_vf_f32m8(_p, mean, vl);
+                    _sqsum = vfredosum_vs_f32m8_f32m1(_sqsum, vfmul_vv_f32m8(_p, _p, vl), /* scalar */ _sqsum, vl);
+                    n -= vl;
+                    ptr_sqsum += vl;
+                }
+            }
+            sqsum = vfmv_f_s_f32m1_f32(_sqsum);
+            float var = sqsum / size;
+            // the var maybe minus due to accuracy
+            //float var = sqsum / size - mean * mean;
+
+            float a;
+            float b;
+            if (affine)
+            {
+                float gamma = gamma_data[q];
+                float beta = beta_data[q];
+
+                a = static_cast<float>(gamma / (sqrt(var + eps)));
+                b = -mean * a + beta;
+            }
+            else
+            {
+                a = static_cast<float>(1.f / (sqrt(var + eps)));
+                b = -mean * a;
+            }
+            {
+                int n = size;
+                __fp16* ptr_store = ptr;
+                while (n > 0)
+                {
+                    size_t vl = vsetvl_e32m8(n);
+                    vfloat32m8_t _p =
vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_store, vl), vl); + _p = vfmul_vf_f32m8(_p, a, vl); + _p = vfadd_vf_f32m8(_p, b, vl); + vse16_v_f16m4(ptr_store, vfncvt_f_f_w_f16m4(_p, vl), vl); + n -= vl; + ptr_store += vl; + } + } + } + return 0; + } + + const int packn = csrr_vlenb() / 2; + if (elempack == packn) + { + const size_t vl = vsetvl_e16m1(packn); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); + vfloat32m2_t _sqsum = vfmv_v_f_f32m2(0.f, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl); + _sum = vfadd_vv_f32m2(_p, _sum, vl); + // _sqsum = vfmadd_vv_f32m2(_p,_p,_sqsum,vl); + } + vfloat32m2_t _mean = vfdiv_vf_f32m2(_sum, size, vl); + for (int i = 0; i < size; i++) + { + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + vl * i, vl), vl); + _p = vfsub_vv_f32m2(_p, _mean, vl); + _sqsum = vfmadd_vv_f32m2(_p, _p, _sqsum, vl); + } + vfloat32m2_t _var = vfdiv_vf_f32m2(_sqsum, size, vl); + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + + vfloat32m2_t _a; + vfloat32m2_t _b; + if (affine) + { + vfloat32m2_t _gamma = vle32_v_f32m2((const float*)gamma_data + q * vl, vl); + vfloat32m2_t _beta = vle32_v_f32m2((const float*)beta_data + q * vl, vl); + _a = vfdiv_vv_f32m2(_gamma, vfsqrt_v_f32m2(vfadd_vf_f32m2(_var, eps, vl), vl), vl); + _b = vfnmsub_vv_f32m2(_a, _mean, _beta, vl); + } + else + { + _a = vfrdiv_vf_f32m2(vfsqrt_v_f32m2(vfadd_vf_f32m2(_var, eps, vl), vl), 1.f, vl); + _b = vfmul_vv_f32m2(_a, _mean, vl); + _b = vfsgnjn_vv_f32m2(_b, _b, vl); + } + for (int i = 0; i < size; i++) + { + vfloat32m2_t _p = vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr + i * vl, vl), vl); + _p = vfmadd_vv_f32m2(_p, _a, _b, vl); + vse16_v_f16m1(ptr + i * vl, vfncvt_f_f_w_f16m1(_p, vl), vl); + } + } + return 0; + } + return 0; +} + +int InstanceNorm_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const +{ + // x = (x - mean) / (sqrt(var + eps)) * gamma + beta + int elempack = bottom_top_blob.elempack; + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int c = bottom_top_blob.c; + int size = w * h; + + int dims = bottom_top_blob.dims; + if (elempack == 1) + { + size = elempack * size; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + + // mean and var + __fp16 sum = 0.f; + __fp16 sqsum = 0.f; + vfloat16m1_t _sum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1()); + vfloat16m1_t _sqsum = vfmv_s_f_f16m1(vundefined_f16m1(), 0.f, vsetvlmax_e32m1()); + { + int n = size; + __fp16* ptr_sum = ptr; + while (n > 0) + { + size_t vl = vsetvl_e16m8(n); + vfloat16m8_t _p = vle16_v_f16m8(ptr_sum, vl); + _sum = vfredusum_vs_f16m8_f16m1(_sum, _p, /* scalar */ _sum, vl); + // _sqsum = vfredosum_vs_f16m8_f16m1(_sqsum, vfmul_vv_f16m8(_p, _p, vl), /* scalar */ _sqsum, vl); + ptr_sum += vl; + n -= vl; + } + } + sum = vfmv_f_s_f16m1_f16(_sum); + __fp16 mean = sum / size; + { + int n = size; + __fp16* ptr_sqsum = ptr; + while (n > 0) + { + size_t vl = vsetvl_e16m8(n); + vfloat16m8_t _p = vle16_v_f16m8(ptr_sqsum, vl); + _p = vfsub_vf_f16m8(_p, mean, vl); + _sqsum = vfredosum_vs_f16m8_f16m1(_sqsum, vfmul_vv_f16m8(_p, _p, vl), /* scalar */ _sqsum, vl); + n -= vl; + ptr_sqsum += vl; + } + } + sqsum = vfmv_f_s_f16m1_f16(_sqsum); + __fp16 var = sqsum / size; + // the var maybe minus due to 
accuracy + //float var = sqsum / size - mean * mean; + + __fp16 a; + __fp16 b; + if (affine) + { + float gamma = gamma_data[q]; + float beta = beta_data[q]; + + a = static_cast<__fp16>(gamma / (sqrt(var + eps))); + b = static_cast<__fp16>(-mean * a + beta); + } + else + { + a = static_cast<__fp16>(1.f / (sqrt(var + eps))); + b = static_cast<__fp16>(-mean * a); + } + { + int n = size; + __fp16* ptr_store = ptr; + while (n > 0) + { + size_t vl = vsetvl_e32m8(n); + vfloat16m8_t _p = vle16_v_f16m8(ptr_store, vl); + _p = vfmul_vf_f16m8(_p, a, vl); + _p = vfadd_vf_f16m8(_p, b, vl); + vse16_v_f16m8(ptr_store, _p, vl); + n -= vl; + ptr_store += vl; + } + } + } + return 0; + } + + const int packn = csrr_vlenb() / 2; + if (elempack == packn) + { + const size_t vl = vsetvl_e16m1(packn); + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < c; q++) + { + __fp16* ptr = bottom_top_blob.channel(q); + vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); + vfloat16m1_t _sqsum = vfmv_v_f_f16m1(0.f, vl); + + for (int i = 0; i < size; i++) + { + vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl); + _sum = vfadd_vv_f16m1(_p, _sum, vl); + // _sqsum = vfmadd_vv_f16m1(_p,_p,_sqsum,vl); + } + vfloat16m1_t _mean = vfdiv_vf_f16m1(_sum, size, vl); + for (int i = 0; i < size; i++) + { + vfloat16m1_t _p = vle16_v_f16m1(ptr + vl * i, vl); + _p = vfsub_vv_f16m1(_p, _mean, vl); + _sqsum = vfmadd_vv_f16m1(_p, _p, _sqsum, vl); + } + vfloat16m1_t _var = vfdiv_vf_f16m1(_sqsum, size, vl); + // the var maybe minus due to accuracy + //float var = sqsum / size - mean * mean; + + vfloat16m1_t _a; + vfloat16m1_t _b; + if (affine) + { + vfloat16m1_t _gamma = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)gamma_data + q * vl, vl), vl); + vfloat16m1_t _beta = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)beta_data + q * vl, vl), vl); + _a = vfdiv_vv_f16m1(_gamma, vfsqrt_v_f16m1(vfadd_vf_f16m1(_var, eps, vl), vl), vl); + _b = vfnmsub_vv_f16m1(_a, _mean, _beta, vl); + } + else + { + _a = vfrdiv_vf_f16m1(vfsqrt_v_f16m1(vfadd_vf_f16m1(_var, eps, vl), vl), 1.f, vl); + _b = vfmul_vv_f16m1(_a, _mean, vl); + _b = vfsgnjn_vv_f16m1(_b, _b, vl); + } + for (int i = 0; i < size; i++) + { + vfloat16m1_t _p = vle16_v_f16m1(ptr + i * vl, vl); + _p = vfmadd_vv_f16m1(_p, _a, _b, vl); + vse16_v_f16m1(ptr + i * vl, _p, vl); + } + } + return 0; + } + return 0; +} + +#endif // __riscv_vector && __riscv_zfh + +} // namespace ncnn \ No newline at end of file diff --git a/src/layer/riscv/instancenorm_riscv.h b/src/layer/riscv/instancenorm_riscv.h new file mode 100644 index 000000000000..80583cc2c89f --- /dev/null +++ b/src/layer/riscv/instancenorm_riscv.h @@ -0,0 +1,36 @@ +// Xavier Hsinyuan is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 Xavier Hsinyuan . All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
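
Besides the word_type change, the interpolation and packing hunks further below also migrate off the tuple-typed segment intrinsics: a two-field segment load now writes directly into two vector variables passed by pointer, instead of returning a vfloat32m4x2_t that had to be unpacked with vget_f32m4x2_f32m4. A self-contained illustration of the new form (hypothetical helper, using the same intrinsics as the patch):

    // Multiply the two fields of interleaved (a0, a1) pairs: out[i] = pairs[2*i] * pairs[2*i+1].
    static void pair_products(const float* pairs, float* out, int n)
    {
        while (n > 0)
        {
            size_t vl = vsetvl_e32m4(n);
            vfloat32m4_t _a0;
            vfloat32m4_t _a1;
            // field 0 of every pair goes to _a0, field 1 to _a1
            vlseg2e32_v_f32m4(&_a0, &_a1, pairs, vl);
            vse32_v_f32m4(out, vfmul_vv_f32m4(_a0, _a1, vl), vl);
            pairs += vl * 2;
            out += vl;
            n -= vl;
        }
    }
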
+ +#ifndef LAYER_INSTANCENORM_RISCV_H +#define LAYER_INSTANCENORM_RISCV_H + +#include "instancenorm.h" + +namespace ncnn { +class InstanceNorm_riscv : virtual public InstanceNorm +{ +public: + InstanceNorm_riscv(); + + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; + +protected: +#if __riscv_vector && __riscv_zfh + int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const; + int forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const; +#endif +}; +} // namespace ncnn + +#endif // LAYER_INSTANCENORM_RISCV_H \ No newline at end of file diff --git a/src/layer/riscv/interp_bicubic_packn.h b/src/layer/riscv/interp_bicubic_packn.h index 16ed365ff536..4c4eb869c43d 100644 --- a/src/layer/riscv/interp_bicubic_packn.h +++ b/src/layer/riscv/interp_bicubic_packn.h @@ -15,7 +15,7 @@ static void resize_bicubic_image_packn(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = dst.w; int h = dst.h; diff --git a/src/layer/riscv/interp_bicubic_packn_fp16s.h b/src/layer/riscv/interp_bicubic_packn_fp16s.h index b83a9eba3c63..ff2284552b7f 100644 --- a/src/layer/riscv/interp_bicubic_packn_fp16s.h +++ b/src/layer/riscv/interp_bicubic_packn_fp16s.h @@ -15,7 +15,7 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; @@ -244,7 +244,7 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al static void resize_bicubic_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* alpha, int* xofs, __fp16* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; diff --git a/src/layer/riscv/interp_bilinear.h b/src/layer/riscv/interp_bilinear.h index 1742626017ac..0f6338d73109 100644 --- a/src/layer/riscv/interp_bilinear.h +++ b/src/layer/riscv/interp_bilinear.h @@ -86,16 +86,17 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x int n = w; while (n > 0) { - word_type vl = vsetvl_e32m4(n); + size_t vl = vsetvl_e32m4(n); vuint32m4_t _sx = vmul_vx_u32m4(vle32_v_u32m4(pxofs, vl), sizeof(float), vl); - vfloat32m4x2_t _S1p = vloxseg2ei32_v_f32m4x2(S1, _sx, vl); - vfloat32m4_t _S1p0 = vget_f32m4x2_f32m4(_S1p, 0); - vfloat32m4_t _S1p1 = vget_f32m4x2_f32m4(_S1p, 1); - vfloat32m4x2_t _a0a1 = vlseg2e32_v_f32m4x2(alphap, vl); - vfloat32m4_t _a0 = vget_f32m4x2_f32m4(_a0a1, 0); - vfloat32m4_t _a1 = vget_f32m4x2_f32m4(_a0a1, 1); + vfloat32m4_t _S1p0; + vfloat32m4_t _S1p1; + vloxseg2ei32_v_f32m4(&_S1p0, &_S1p1, S1, _sx, vl); + + vfloat32m4_t _a0; + vfloat32m4_t _a1; + vlseg2e32_v_f32m4(&_a0, &_a1, alphap, vl); vfloat32m4_t _rows1 = vfmacc_vv_f32m4(vfmul_vv_f32m4(_S1p0, _a0, vl), _S1p1, _a1, vl); @@ -135,19 +136,21 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x int n = w; while (n > 0) { - word_type vl = vsetvl_e32m4(n); + size_t vl = vsetvl_e32m4(n); vuint32m4_t _sx = vmul_vx_u32m4(vle32_v_u32m4(pxofs, vl), sizeof(float), vl); - vfloat32m4x2_t _S0p = vloxseg2ei32_v_f32m4x2(S0, _sx, vl); - vfloat32m4x2_t _S1p = vloxseg2ei32_v_f32m4x2(S1, _sx, vl); - vfloat32m4_t _S0p0 = vget_f32m4x2_f32m4(_S0p, 0); - vfloat32m4_t 
_S0p1 = vget_f32m4x2_f32m4(_S0p, 1); - vfloat32m4_t _S1p0 = vget_f32m4x2_f32m4(_S1p, 0); - vfloat32m4_t _S1p1 = vget_f32m4x2_f32m4(_S1p, 1); - vfloat32m4x2_t _a0a1 = vlseg2e32_v_f32m4x2(alphap, vl); - vfloat32m4_t _a0 = vget_f32m4x2_f32m4(_a0a1, 0); - vfloat32m4_t _a1 = vget_f32m4x2_f32m4(_a0a1, 1); + vfloat32m4_t _S0p0; + vfloat32m4_t _S0p1; + vfloat32m4_t _S1p0; + vfloat32m4_t _S1p1; + + vloxseg2ei32_v_f32m4(&_S0p0, &_S0p1, S0, _sx, vl); + vloxseg2ei32_v_f32m4(&_S1p0, &_S1p1, S1, _sx, vl); + + vfloat32m4_t _a0; + vfloat32m4_t _a1; + vlseg2e32_v_f32m4(&_a0, &_a1, alphap, vl); vfloat32m4_t _rows0 = vfmacc_vv_f32m4(vfmul_vv_f32m4(_S0p0, _a0, vl), _S0p1, _a1, vl); vfloat32m4_t _rows1 = vfmacc_vv_f32m4(vfmul_vv_f32m4(_S1p0, _a0, vl), _S1p1, _a1, vl); @@ -192,7 +195,7 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x int n = w; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _rows0 = vle32_v_f32m8(rows0p, vl); vfloat32m8_t _rows1 = vle32_v_f32m8(rows1p, vl); diff --git a/src/layer/riscv/interp_bilinear_fp16s.h b/src/layer/riscv/interp_bilinear_fp16s.h index 091e86b7301b..cd61af6efac3 100644 --- a/src/layer/riscv/interp_bilinear_fp16s.h +++ b/src/layer/riscv/interp_bilinear_fp16s.h @@ -131,7 +131,7 @@ static void resize_bilinear_image_fp16s(const Mat& src, Mat& dst, float* alpha, int n = w; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _rows0 = vle32_v_f32m8(rows0p, vl); vfloat32m8_t _rows1 = vle32_v_f32m8(rows1p, vl); @@ -232,7 +232,7 @@ static void resize_bilinear_image_fp16sa(const Mat& src, Mat& dst, __fp16* alpha int n = w; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _rows0 = vle16_v_f16m8(rows0p, vl); vfloat16m8_t _rows1 = vle16_v_f16m8(rows1p, vl); diff --git a/src/layer/riscv/interp_bilinear_packn.h b/src/layer/riscv/interp_bilinear_packn.h index 0d800e324cba..9dffc01bf300 100644 --- a/src/layer/riscv/interp_bilinear_packn.h +++ b/src/layer/riscv/interp_bilinear_packn.h @@ -15,7 +15,7 @@ static void resize_bilinear_image_packn(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = dst.w; int h = dst.h; diff --git a/src/layer/riscv/interp_bilinear_packn_fp16s.h b/src/layer/riscv/interp_bilinear_packn_fp16s.h index b48fd8431a4a..dfe02c00d1be 100644 --- a/src/layer/riscv/interp_bilinear_packn_fp16s.h +++ b/src/layer/riscv/interp_bilinear_packn_fp16s.h @@ -15,7 +15,7 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; @@ -122,7 +122,7 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* a static void resize_bilinear_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* alpha, int* xofs, __fp16* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; diff --git a/src/layer/riscv/interp_riscv.cpp b/src/layer/riscv/interp_riscv.cpp index b72cfd00280f..ea8344985edf 100644 --- a/src/layer/riscv/interp_riscv.cpp +++ b/src/layer/riscv/interp_riscv.cpp @@ -88,7 +88,7 @@ int Interp_riscv::forward(const std::vector& 
bottom_blobs, std::vector #if __riscv_vector if (elempack == packn) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < w; q++) @@ -130,7 +130,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector { if (resize_type == 1) // nearest { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const float ws = output_width ? w / (float)outw : 1.f / width_scale; @@ -153,7 +153,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector if (resize_type == 2) // bilinear { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int* buf = new int[outw + outw * packn]; @@ -190,7 +190,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector if (resize_type == 3) // bicubic { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int* buf = new int[outw + outw * packn]; @@ -328,7 +328,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector { if (resize_type == 1) // nearest { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const float hs = output_height ? h / (float)outh : 1.f / height_scale; const float ws = output_width ? w / (float)outw : 1.f / width_scale; @@ -518,7 +518,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < w; q++) @@ -558,7 +558,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto { if (resize_type == 1) // nearest { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const float ws = output_width ? w / (float)outw : 1.f / width_scale; @@ -581,7 +581,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (resize_type == 2) // bilinear { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int* buf = new int[outw + outw * packn]; @@ -618,7 +618,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (resize_type == 3) // bicubic { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int* buf = new int[outw + outw * packn]; @@ -754,7 +754,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto { if (resize_type == 1) // nearest { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const float hs = output_height ? h / (float)outh : 1.f / height_scale; const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; @@ -955,7 +955,7 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect { if (resize_type == 2) // bilinear { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int* buf = new int[outw + outw * packn]; @@ -992,7 +992,7 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect if (resize_type == 3) // bicubic { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int* buf = new int[outw + outw * packn]; diff --git a/src/layer/riscv/mish_riscv.cpp b/src/layer/riscv/mish_riscv.cpp index abee1ec37489..4ddb14700061 100644 --- a/src/layer/riscv/mish_riscv.cpp +++ b/src/layer/riscv/mish_riscv.cpp @@ -64,7 +64,7 @@ int Mish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfmul_vv_f32m8(_p, tanh_ps(log_ps(vfadd_vf_f32m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); @@ -103,7 +103,7 @@ int Mish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = vfmul_vv_f32m8(_p, tanh_ps(log_ps(vfadd_vf_f32m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); @@ -134,7 +134,7 @@ int Mish_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfmul_vv_f16m8(_p, tanh_ps(log_ps(vfadd_vf_f16m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); diff --git a/src/layer/riscv/packing_riscv.cpp b/src/layer/riscv/packing_riscv.cpp index 1805c2469eb7..5c298da522dd 100644 --- a/src/layer/riscv/packing_riscv.cpp +++ b/src/layer/riscv/packing_riscv.cpp @@ -18,6 +18,8 @@ #include #endif // __riscv_vector +#include "riscv_usability.h" + namespace ncnn { Packing_riscv::Packing_riscv() @@ -137,13 +139,13 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m2(n); + size_t vl = vsetvl_e32m2(n); vfloat32m2_t _p0 = vle32_v_f32m2(r0, vl); vfloat32m2_t _p1 = vle32_v_f32m2(r1, vl); vfloat32m2_t _p2 = vle32_v_f32m2(r2, vl); vfloat32m2_t _p3 = vle32_v_f32m2(r3, vl); - vsseg4e32_v_f32m2x4(outptr, vcreate_f32m2x4(_p0, _p1, _p2, _p3), vl); + vsseg4e32_v_f32m2(outptr, _p0, _p1, _p2, _p3, vl); r0 += vl; r1 += vl; @@ -181,13 +183,18 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m2(n); + size_t vl = vsetvl_e32m2(n); + + vfloat32m2_t _p0; + vfloat32m2_t _p1; + vfloat32m2_t _p2; + vfloat32m2_t _p3; + vlseg4e32_v_f32m2(&_p0, &_p1, &_p2, &_p3, r0, vl); - vfloat32m2x4_t _p = vlseg4e32_v_f32m2x4(r0, vl); - vse32_v_f32m2(outptr0, vget_f32m2x4_f32m2(_p, 0), vl); - vse32_v_f32m2(outptr1, vget_f32m2x4_f32m2(_p, 1), vl); - vse32_v_f32m2(outptr2, vget_f32m2x4_f32m2(_p, 2), vl); - vse32_v_f32m2(outptr3, vget_f32m2x4_f32m2(_p, 3), vl); + vse32_v_f32m2(outptr0, _p0, vl); + vse32_v_f32m2(outptr1, _p1, vl); + vse32_v_f32m2(outptr2, _p2, vl); + vse32_v_f32m2(outptr3, _p3, vl); r0 += vl * 4; outptr0 += vl; @@ -229,7 +236,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl 
= vsetvl_e32m1(n); vfloat32m1_t _p0 = vle32_v_f32m1(r0, vl); vfloat32m1_t _p1 = vle32_v_f32m1(r1, vl); @@ -239,7 +246,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& vfloat32m1_t _p5 = vle32_v_f32m1(r5, vl); vfloat32m1_t _p6 = vle32_v_f32m1(r6, vl); vfloat32m1_t _p7 = vle32_v_f32m1(r7, vl); - vsseg8e32_v_f32m1x8(outptr, vcreate_f32m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); + vsseg8e32_v_f32m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); r0 += vl; r1 += vl; @@ -289,17 +296,25 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x8_t _p = vlseg8e32_v_f32m1x8(r0, vl); - vse32_v_f32m1(outptr0, vget_f32m1x8_f32m1(_p, 0), vl); - vse32_v_f32m1(outptr1, vget_f32m1x8_f32m1(_p, 1), vl); - vse32_v_f32m1(outptr2, vget_f32m1x8_f32m1(_p, 2), vl); - vse32_v_f32m1(outptr3, vget_f32m1x8_f32m1(_p, 3), vl); - vse32_v_f32m1(outptr4, vget_f32m1x8_f32m1(_p, 4), vl); - vse32_v_f32m1(outptr5, vget_f32m1x8_f32m1(_p, 5), vl); - vse32_v_f32m1(outptr6, vget_f32m1x8_f32m1(_p, 6), vl); - vse32_v_f32m1(outptr7, vget_f32m1x8_f32m1(_p, 7), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p0; + vfloat32m1_t _p1; + vfloat32m1_t _p2; + vfloat32m1_t _p3; + vfloat32m1_t _p4; + vfloat32m1_t _p5; + vfloat32m1_t _p6; + vfloat32m1_t _p7; + vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vse32_v_f32m1(outptr0, _p0, vl); + vse32_v_f32m1(outptr1, _p1, vl); + vse32_v_f32m1(outptr2, _p2, vl); + vse32_v_f32m1(outptr3, _p3, vl); + vse32_v_f32m1(outptr4, _p4, vl); + vse32_v_f32m1(outptr5, _p5, vl); + vse32_v_f32m1(outptr6, _p6, vl); + vse32_v_f32m1(outptr7, _p7, vl); r0 += vl * 8; outptr0 += vl; @@ -343,19 +358,21 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x4_t _p0 = vlseg4e32_v_f32m1x4(r0, vl); - vfloat32m1x4_t _p1 = vlseg4e32_v_f32m1x4(r1, vl); - vfloat32m1_t _p00 = vget_f32m1x4_f32m1(_p0, 0); - vfloat32m1_t _p01 = vget_f32m1x4_f32m1(_p0, 1); - vfloat32m1_t _p02 = vget_f32m1x4_f32m1(_p0, 2); - vfloat32m1_t _p03 = vget_f32m1x4_f32m1(_p0, 3); - vfloat32m1_t _p10 = vget_f32m1x4_f32m1(_p1, 0); - vfloat32m1_t _p11 = vget_f32m1x4_f32m1(_p1, 1); - vfloat32m1_t _p12 = vget_f32m1x4_f32m1(_p1, 2); - vfloat32m1_t _p13 = vget_f32m1x4_f32m1(_p1, 3); - vsseg8e32_v_f32m1x8(outptr, vcreate_f32m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p00; + vfloat32m1_t _p01; + vfloat32m1_t _p02; + vfloat32m1_t _p03; + vlseg4e32_v_f32m1(&_p00, &_p01, &_p02, &_p03, r0, vl); + + vfloat32m1_t _p10; + vfloat32m1_t _p11; + vfloat32m1_t _p12; + vfloat32m1_t _p13; + vlseg4e32_v_f32m1(&_p10, &_p11, &_p12, &_p13, r1, vl); + + vsseg8e32_v_f32m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); r0 += vl * 4; r1 += vl * 4; @@ -395,19 +412,19 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x8_t _p = vlseg8e32_v_f32m1x8(r0, vl); - vfloat32m1_t _p0 = vget_f32m1x8_f32m1(_p, 0); - vfloat32m1_t _p1 = vget_f32m1x8_f32m1(_p, 1); - vfloat32m1_t _p2 = vget_f32m1x8_f32m1(_p, 2); - vfloat32m1_t _p3 = vget_f32m1x8_f32m1(_p, 3); - vfloat32m1_t _p4 = vget_f32m1x8_f32m1(_p, 4); - vfloat32m1_t _p5 = vget_f32m1x8_f32m1(_p, 5); - vfloat32m1_t _p6 = vget_f32m1x8_f32m1(_p, 6); - vfloat32m1_t _p7 = 
vget_f32m1x8_f32m1(_p, 7); - vsseg4e32_v_f32m1x4(outptr0, vcreate_f32m1x4(_p0, _p1, _p2, _p3), vl); - vsseg4e32_v_f32m1x4(outptr1, vcreate_f32m1x4(_p4, _p5, _p6, _p7), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p0; + vfloat32m1_t _p1; + vfloat32m1_t _p2; + vfloat32m1_t _p3; + vfloat32m1_t _p4; + vfloat32m1_t _p5; + vfloat32m1_t _p6; + vfloat32m1_t _p7; + vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vsseg4e32_v_f32m1(outptr0, _p0, _p1, _p2, _p3, vl); + vsseg4e32_v_f32m1(outptr1, _p4, _p5, _p6, _p7, vl); r0 += vl * 8; outptr0 += vl * 4; @@ -466,13 +483,13 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m2(n); + size_t vl = vsetvl_e32m2(n); vfloat32m2_t _p0 = vle32_v_f32m2(r0, vl); vfloat32m2_t _p1 = vle32_v_f32m2(r1, vl); vfloat32m2_t _p2 = vle32_v_f32m2(r2, vl); vfloat32m2_t _p3 = vle32_v_f32m2(r3, vl); - vsseg4e32_v_f32m2x4(outptr, vcreate_f32m2x4(_p0, _p1, _p2, _p3), vl); + vsseg4e32_v_f32m2(outptr, _p0, _p1, _p2, _p3, vl); r0 += vl; r1 += vl; @@ -510,13 +527,16 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m2(n); - - vfloat32m2x4_t _p = vlseg4e32_v_f32m2x4(r0, vl); - vse32_v_f32m2(outptr0, vget_f32m2x4_f32m2(_p, 0), vl); - vse32_v_f32m2(outptr1, vget_f32m2x4_f32m2(_p, 1), vl); - vse32_v_f32m2(outptr2, vget_f32m2x4_f32m2(_p, 2), vl); - vse32_v_f32m2(outptr3, vget_f32m2x4_f32m2(_p, 3), vl); + size_t vl = vsetvl_e32m2(n); + vfloat32m2_t _p0; + vfloat32m2_t _p1; + vfloat32m2_t _p2; + vfloat32m2_t _p3; + vlseg4e32_v_f32m2(&_p0, &_p1, &_p2, &_p3, r0, vl); + vse32_v_f32m2(outptr0, _p0, vl); + vse32_v_f32m2(outptr1, _p1, vl); + vse32_v_f32m2(outptr2, _p2, vl); + vse32_v_f32m2(outptr3, _p3, vl); r0 += vl * 4; outptr0 += vl; @@ -558,7 +578,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = vsetvl_e32m1(n); vfloat32m1_t _p0 = vle32_v_f32m1(r0, vl); vfloat32m1_t _p1 = vle32_v_f32m1(r1, vl); @@ -568,7 +588,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& vfloat32m1_t _p5 = vle32_v_f32m1(r5, vl); vfloat32m1_t _p6 = vle32_v_f32m1(r6, vl); vfloat32m1_t _p7 = vle32_v_f32m1(r7, vl); - vsseg8e32_v_f32m1x8(outptr, vcreate_f32m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); + vsseg8e32_v_f32m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); r0 += vl; r1 += vl; @@ -618,17 +638,26 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x8_t _p = vlseg8e32_v_f32m1x8(r0, vl); - vse32_v_f32m1(outptr0, vget_f32m1x8_f32m1(_p, 0), vl); - vse32_v_f32m1(outptr1, vget_f32m1x8_f32m1(_p, 1), vl); - vse32_v_f32m1(outptr2, vget_f32m1x8_f32m1(_p, 2), vl); - vse32_v_f32m1(outptr3, vget_f32m1x8_f32m1(_p, 3), vl); - vse32_v_f32m1(outptr4, vget_f32m1x8_f32m1(_p, 4), vl); - vse32_v_f32m1(outptr5, vget_f32m1x8_f32m1(_p, 5), vl); - vse32_v_f32m1(outptr6, vget_f32m1x8_f32m1(_p, 6), vl); - vse32_v_f32m1(outptr7, vget_f32m1x8_f32m1(_p, 7), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p0; + vfloat32m1_t _p1; + vfloat32m1_t _p2; + vfloat32m1_t _p3; + vfloat32m1_t _p4; + vfloat32m1_t _p5; + vfloat32m1_t _p6; + vfloat32m1_t _p7; + vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + + vse32_v_f32m1(outptr0, _p0, vl); + 
vse32_v_f32m1(outptr1, _p1, vl); + vse32_v_f32m1(outptr2, _p2, vl); + vse32_v_f32m1(outptr3, _p3, vl); + vse32_v_f32m1(outptr4, _p4, vl); + vse32_v_f32m1(outptr5, _p5, vl); + vse32_v_f32m1(outptr6, _p6, vl); + vse32_v_f32m1(outptr7, _p7, vl); r0 += vl * 8; outptr0 += vl; @@ -672,20 +701,21 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = vsetvl_e32m1(n); - vfloat32m1x4_t _p0 = vlseg4e32_v_f32m1x4(r0, vl); - vfloat32m1x4_t _p1 = vlseg4e32_v_f32m1x4(r1, vl); + vfloat32m1_t _p00; + vfloat32m1_t _p01; + vfloat32m1_t _p02; + vfloat32m1_t _p03; + vlseg4e32_v_f32m1(&_p00, &_p01, &_p02, &_p03, r0, vl); - vfloat32m1_t _p00 = vget_f32m1x4_f32m1(_p0, 0); - vfloat32m1_t _p01 = vget_f32m1x4_f32m1(_p0, 1); - vfloat32m1_t _p02 = vget_f32m1x4_f32m1(_p0, 2); - vfloat32m1_t _p03 = vget_f32m1x4_f32m1(_p0, 3); - vfloat32m1_t _p10 = vget_f32m1x4_f32m1(_p1, 0); - vfloat32m1_t _p11 = vget_f32m1x4_f32m1(_p1, 1); - vfloat32m1_t _p12 = vget_f32m1x4_f32m1(_p1, 2); - vfloat32m1_t _p13 = vget_f32m1x4_f32m1(_p1, 3); - vsseg8e32_v_f32m1x8(outptr, vcreate_f32m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); + vfloat32m1_t _p10; + vfloat32m1_t _p11; + vfloat32m1_t _p12; + vfloat32m1_t _p13; + vlseg4e32_v_f32m1(&_p10, &_p11, &_p12, &_p13, r1, vl); + + vsseg8e32_v_f32m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); r0 += vl * 4; r1 += vl * 4; @@ -725,19 +755,19 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x8_t _p = vlseg8e32_v_f32m1x8(r0, vl); - vfloat32m1_t _p0 = vget_f32m1x8_f32m1(_p, 0); - vfloat32m1_t _p1 = vget_f32m1x8_f32m1(_p, 1); - vfloat32m1_t _p2 = vget_f32m1x8_f32m1(_p, 2); - vfloat32m1_t _p3 = vget_f32m1x8_f32m1(_p, 3); - vfloat32m1_t _p4 = vget_f32m1x8_f32m1(_p, 4); - vfloat32m1_t _p5 = vget_f32m1x8_f32m1(_p, 5); - vfloat32m1_t _p6 = vget_f32m1x8_f32m1(_p, 6); - vfloat32m1_t _p7 = vget_f32m1x8_f32m1(_p, 7); - vsseg4e32_v_f32m1x4(outptr0, vcreate_f32m1x4(_p0, _p1, _p2, _p3), vl); - vsseg4e32_v_f32m1x4(outptr1, vcreate_f32m1x4(_p4, _p5, _p6, _p7), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p0; + vfloat32m1_t _p1; + vfloat32m1_t _p2; + vfloat32m1_t _p3; + vfloat32m1_t _p4; + vfloat32m1_t _p5; + vfloat32m1_t _p6; + vfloat32m1_t _p7; + vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vsseg4e32_v_f32m1(outptr0, _p0, _p1, _p2, _p3, vl); + vsseg4e32_v_f32m1(outptr1, _p4, _p5, _p6, _p7, vl); r0 += vl * 8; outptr0 += vl * 4; @@ -859,13 +889,13 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m2(n); + size_t vl = vsetvl_e16m2(n); vuint16m2_t _p0 = vle16_v_u16m2(r0, vl); vuint16m2_t _p1 = vle16_v_u16m2(r1, vl); vuint16m2_t _p2 = vle16_v_u16m2(r2, vl); vuint16m2_t _p3 = vle16_v_u16m2(r3, vl); - vsseg4e16_v_u16m2x4(outptr, vcreate_u16m2x4(_p0, _p1, _p2, _p3), vl); + vsseg4e16_v_u16m2(outptr, _p0, _p1, _p2, _p3, vl); r0 += vl; r1 += vl; @@ -903,13 +933,17 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m2(n); - - vuint16m2x4_t _p = vlseg4e16_v_u16m2x4(r0, vl); - vse16_v_u16m2(outptr0, vget_u16m2x4_u16m2(_p, 0), vl); - vse16_v_u16m2(outptr1, vget_u16m2x4_u16m2(_p, 1), vl); - vse16_v_u16m2(outptr2, vget_u16m2x4_u16m2(_p, 2), vl); - vse16_v_u16m2(outptr3, 
vget_u16m2x4_u16m2(_p, 3), vl); + size_t vl = vsetvl_e16m2(n); + + vuint16m2_t _p0; + vuint16m2_t _p1; + vuint16m2_t _p2; + vuint16m2_t _p3; + vlseg4e16_v_u16m2(&_p0, &_p1, &_p2, &_p3, r0, vl); + vse16_v_u16m2(outptr0, _p0, vl); + vse16_v_u16m2(outptr1, _p1, vl); + vse16_v_u16m2(outptr2, _p2, vl); + vse16_v_u16m2(outptr3, _p3, vl); r0 += vl * 4; outptr0 += vl; @@ -951,7 +985,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); vuint16m1_t _p0 = vle16_v_u16m1(r0, vl); vuint16m1_t _p1 = vle16_v_u16m1(r1, vl); @@ -961,7 +995,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co vuint16m1_t _p5 = vle16_v_u16m1(r5, vl); vuint16m1_t _p6 = vle16_v_u16m1(r6, vl); vuint16m1_t _p7 = vle16_v_u16m1(r7, vl); - vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); + vsseg8e16_v_u16m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); r0 += vl; r1 += vl; @@ -1011,17 +1045,26 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl); - vse16_v_u16m1(outptr0, vget_u16m1x8_u16m1(_p, 0), vl); - vse16_v_u16m1(outptr1, vget_u16m1x8_u16m1(_p, 1), vl); - vse16_v_u16m1(outptr2, vget_u16m1x8_u16m1(_p, 2), vl); - vse16_v_u16m1(outptr3, vget_u16m1x8_u16m1(_p, 3), vl); - vse16_v_u16m1(outptr4, vget_u16m1x8_u16m1(_p, 4), vl); - vse16_v_u16m1(outptr5, vget_u16m1x8_u16m1(_p, 5), vl); - vse16_v_u16m1(outptr6, vget_u16m1x8_u16m1(_p, 6), vl); - vse16_v_u16m1(outptr7, vget_u16m1x8_u16m1(_p, 7), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p0; + vuint16m1_t _p1; + vuint16m1_t _p2; + vuint16m1_t _p3; + vuint16m1_t _p4; + vuint16m1_t _p5; + vuint16m1_t _p6; + vuint16m1_t _p7; + vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + + vse16_v_u16m1(outptr0, _p0, vl); + vse16_v_u16m1(outptr1, _p1, vl); + vse16_v_u16m1(outptr2, _p2, vl); + vse16_v_u16m1(outptr3, _p3, vl); + vse16_v_u16m1(outptr4, _p4, vl); + vse16_v_u16m1(outptr5, _p5, vl); + vse16_v_u16m1(outptr6, _p6, vl); + vse16_v_u16m1(outptr7, _p7, vl); r0 += vl * 8; outptr0 += vl; @@ -1065,19 +1108,21 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x4_t _p0 = vlseg4e16_v_u16m1x4(r0, vl); - vuint16m1x4_t _p1 = vlseg4e16_v_u16m1x4(r1, vl); - vuint16m1_t _p00 = vget_u16m1x4_u16m1(_p0, 0); - vuint16m1_t _p01 = vget_u16m1x4_u16m1(_p0, 1); - vuint16m1_t _p02 = vget_u16m1x4_u16m1(_p0, 2); - vuint16m1_t _p03 = vget_u16m1x4_u16m1(_p0, 3); - vuint16m1_t _p10 = vget_u16m1x4_u16m1(_p1, 0); - vuint16m1_t _p11 = vget_u16m1x4_u16m1(_p1, 1); - vuint16m1_t _p12 = vget_u16m1x4_u16m1(_p1, 2); - vuint16m1_t _p13 = vget_u16m1x4_u16m1(_p1, 3); - vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p00; + vuint16m1_t _p01; + vuint16m1_t _p02; + vuint16m1_t _p03; + vlseg4e16_v_u16m1(&_p00, &_p01, &_p02, &_p03, r0, vl); + + vuint16m1_t _p10; + vuint16m1_t _p11; + vuint16m1_t _p12; + vuint16m1_t _p13; + vlseg4e16_v_u16m1(&_p10, &_p11, &_p12, &_p13, r1, vl); + + vsseg8e16_v_u16m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); r0 += vl * 4; r1 += vl * 4; @@ -1117,19 +1162,20 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& 
bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl); - vuint16m1_t _p0 = vget_u16m1x8_u16m1(_p, 0); - vuint16m1_t _p1 = vget_u16m1x8_u16m1(_p, 1); - vuint16m1_t _p2 = vget_u16m1x8_u16m1(_p, 2); - vuint16m1_t _p3 = vget_u16m1x8_u16m1(_p, 3); - vuint16m1_t _p4 = vget_u16m1x8_u16m1(_p, 4); - vuint16m1_t _p5 = vget_u16m1x8_u16m1(_p, 5); - vuint16m1_t _p6 = vget_u16m1x8_u16m1(_p, 6); - vuint16m1_t _p7 = vget_u16m1x8_u16m1(_p, 7); - vsseg4e16_v_u16m1x4(outptr0, vcreate_u16m1x4(_p0, _p1, _p2, _p3), vl); - vsseg4e16_v_u16m1x4(outptr1, vcreate_u16m1x4(_p4, _p5, _p6, _p7), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p0; + vuint16m1_t _p1; + vuint16m1_t _p2; + vuint16m1_t _p3; + vuint16m1_t _p4; + vuint16m1_t _p5; + vuint16m1_t _p6; + vuint16m1_t _p7; + vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + + vsseg4e16_v_u16m1(outptr0, _p0, _p1, _p2, _p3, vl); + vsseg4e16_v_u16m1(outptr1, _p4, _p5, _p6, _p7, vl); r0 += vl * 8; outptr0 += vl * 4; @@ -1188,13 +1234,13 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m2(n); + size_t vl = vsetvl_e16m2(n); vuint16m2_t _p0 = vle16_v_u16m2(r0, vl); vuint16m2_t _p1 = vle16_v_u16m2(r1, vl); vuint16m2_t _p2 = vle16_v_u16m2(r2, vl); vuint16m2_t _p3 = vle16_v_u16m2(r3, vl); - vsseg4e16_v_u16m2x4(outptr, vcreate_u16m2x4(_p0, _p1, _p2, _p3), vl); + vsseg4e16_v_u16m2(outptr, _p0, _p1, _p2, _p3, vl); r0 += vl; r1 += vl; @@ -1232,13 +1278,17 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m2(n); - - vuint16m2x4_t _p = vlseg4e16_v_u16m2x4(r0, vl); - vse16_v_u16m2(outptr0, vget_u16m2x4_u16m2(_p, 0), vl); - vse16_v_u16m2(outptr1, vget_u16m2x4_u16m2(_p, 1), vl); - vse16_v_u16m2(outptr2, vget_u16m2x4_u16m2(_p, 2), vl); - vse16_v_u16m2(outptr3, vget_u16m2x4_u16m2(_p, 3), vl); + size_t vl = vsetvl_e16m2(n); + + vuint16m2_t _p0; + vuint16m2_t _p1; + vuint16m2_t _p2; + vuint16m2_t _p3; + vlseg4e16_v_u16m2(&_p0, &_p1, &_p2, &_p3, r0, vl); + vse16_v_u16m2(outptr0, _p0, vl); + vse16_v_u16m2(outptr1, _p1, vl); + vse16_v_u16m2(outptr2, _p2, vl); + vse16_v_u16m2(outptr3, _p3, vl); r0 += vl * 4; outptr0 += vl; @@ -1280,7 +1330,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); vuint16m1_t _p0 = vle16_v_u16m1(r0, vl); vuint16m1_t _p1 = vle16_v_u16m1(r1, vl); @@ -1290,7 +1340,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co vuint16m1_t _p5 = vle16_v_u16m1(r5, vl); vuint16m1_t _p6 = vle16_v_u16m1(r6, vl); vuint16m1_t _p7 = vle16_v_u16m1(r7, vl); - vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); + vsseg8e16_v_u16m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); r0 += vl; r1 += vl; @@ -1340,17 +1390,25 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl); - vse16_v_u16m1(outptr0, vget_u16m1x8_u16m1(_p, 0), vl); - vse16_v_u16m1(outptr1, vget_u16m1x8_u16m1(_p, 1), vl); - vse16_v_u16m1(outptr2, vget_u16m1x8_u16m1(_p, 2), vl); - vse16_v_u16m1(outptr3, vget_u16m1x8_u16m1(_p, 3), vl); - vse16_v_u16m1(outptr4, vget_u16m1x8_u16m1(_p, 4), 
vl); - vse16_v_u16m1(outptr5, vget_u16m1x8_u16m1(_p, 5), vl); - vse16_v_u16m1(outptr6, vget_u16m1x8_u16m1(_p, 6), vl); - vse16_v_u16m1(outptr7, vget_u16m1x8_u16m1(_p, 7), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p0; + vuint16m1_t _p1; + vuint16m1_t _p2; + vuint16m1_t _p3; + vuint16m1_t _p4; + vuint16m1_t _p5; + vuint16m1_t _p6; + vuint16m1_t _p7; + vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vse16_v_u16m1(outptr0, _p0, vl); + vse16_v_u16m1(outptr1, _p1, vl); + vse16_v_u16m1(outptr2, _p2, vl); + vse16_v_u16m1(outptr3, _p3, vl); + vse16_v_u16m1(outptr4, _p4, vl); + vse16_v_u16m1(outptr5, _p5, vl); + vse16_v_u16m1(outptr6, _p6, vl); + vse16_v_u16m1(outptr7, _p7, vl); r0 += vl * 8; outptr0 += vl; @@ -1394,20 +1452,21 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p00; + vuint16m1_t _p01; + vuint16m1_t _p02; + vuint16m1_t _p03; + vlseg4e16_v_u16m1(&_p00, &_p01, &_p02, &_p03, r0, vl); - vuint16m1x4_t _p0 = vlseg4e16_v_u16m1x4(r0, vl); - vuint16m1x4_t _p1 = vlseg4e16_v_u16m1x4(r1, vl); + vuint16m1_t _p10; + vuint16m1_t _p11; + vuint16m1_t _p12; + vuint16m1_t _p13; + vlseg4e16_v_u16m1(&_p10, &_p11, &_p12, &_p13, r1, vl); - vuint16m1_t _p00 = vget_u16m1x4_u16m1(_p0, 0); - vuint16m1_t _p01 = vget_u16m1x4_u16m1(_p0, 1); - vuint16m1_t _p02 = vget_u16m1x4_u16m1(_p0, 2); - vuint16m1_t _p03 = vget_u16m1x4_u16m1(_p0, 3); - vuint16m1_t _p10 = vget_u16m1x4_u16m1(_p1, 0); - vuint16m1_t _p11 = vget_u16m1x4_u16m1(_p1, 1); - vuint16m1_t _p12 = vget_u16m1x4_u16m1(_p1, 2); - vuint16m1_t _p13 = vget_u16m1x4_u16m1(_p1, 3); - vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); + vsseg8e16_v_u16m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); r0 += vl * 4; r1 += vl * 4; @@ -1447,19 +1506,20 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl); - vuint16m1_t _p0 = vget_u16m1x8_u16m1(_p, 0); - vuint16m1_t _p1 = vget_u16m1x8_u16m1(_p, 1); - vuint16m1_t _p2 = vget_u16m1x8_u16m1(_p, 2); - vuint16m1_t _p3 = vget_u16m1x8_u16m1(_p, 3); - vuint16m1_t _p4 = vget_u16m1x8_u16m1(_p, 4); - vuint16m1_t _p5 = vget_u16m1x8_u16m1(_p, 5); - vuint16m1_t _p6 = vget_u16m1x8_u16m1(_p, 6); - vuint16m1_t _p7 = vget_u16m1x8_u16m1(_p, 7); - vsseg4e16_v_u16m1x4(outptr0, vcreate_u16m1x4(_p0, _p1, _p2, _p3), vl); - vsseg4e16_v_u16m1x4(outptr1, vcreate_u16m1x4(_p4, _p5, _p6, _p7), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p0; + vuint16m1_t _p1; + vuint16m1_t _p2; + vuint16m1_t _p3; + vuint16m1_t _p4; + vuint16m1_t _p5; + vuint16m1_t _p6; + vuint16m1_t _p7; + vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + + vsseg4e16_v_u16m1(outptr0, _p0, _p1, _p2, _p3, vl); + vsseg4e16_v_u16m1(outptr1, _p4, _p5, _p6, _p7, vl); r0 += vl * 8; outptr0 += vl * 4; diff --git a/src/layer/riscv/padding_packn.h b/src/layer/riscv/padding_packn.h index 1f93ecfe92dd..50f5efe1216d 100644 --- a/src/layer/riscv/padding_packn.h +++ b/src/layer/riscv/padding_packn.h @@ -16,7 +16,7 @@ static void padding_constant_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right, v##VT##m##LMUL##_t v) \ { \ const int packn = csrr_vlenb() / sizeof(T); \ - const word_type vl = vsetvl_e##SEW##m##LMUL(packn); \ + const size_t vl = 
vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ T* outptr = dst; \ @@ -65,7 +65,7 @@ static void padding_replicate_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ const int packn = csrr_vlenb() / sizeof(T); \ - const word_type vl = vsetvl_e##SEW##m##LMUL(packn); \ + const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ T* outptr = dst; \ @@ -144,7 +144,7 @@ static void padding_reflect_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ const int packn = csrr_vlenb() / sizeof(T); \ - const word_type vl = vsetvl_e##SEW##m##LMUL(packn); \ + const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ T* outptr = dst; \ diff --git a/src/layer/riscv/padding_riscv.cpp b/src/layer/riscv/padding_riscv.cpp index de29af0f6bf5..8f4b54da5904 100644 --- a/src/layer/riscv/padding_riscv.cpp +++ b/src/layer/riscv/padding_riscv.cpp @@ -91,7 +91,7 @@ int Padding_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -261,7 +261,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co { #if __riscv_vector const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); #endif int w = bottom_blob.w; @@ -511,7 +511,7 @@ int Padding_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt { #if __riscv_vector const int packn = csrr_vlenb() / 1; - const word_type vl = vsetvl_e8m1(packn); + const size_t vl = vsetvl_e8m1(packn); #endif int w = bottom_blob.w; diff --git a/src/layer/riscv/pooling_riscv.cpp b/src/layer/riscv/pooling_riscv.cpp index 0ca4e3d894c9..1b4c1f0ed8ad 100644 --- a/src/layer/riscv/pooling_riscv.cpp +++ b/src/layer/riscv/pooling_riscv.cpp @@ -72,7 +72,7 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -315,7 +315,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op // avg value in NxN window const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -721,7 +721,7 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O } const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/prelu_riscv.cpp b/src/layer/riscv/prelu_riscv.cpp index c25223461a10..32cb77023b45 100644 --- a/src/layer/riscv/prelu_riscv.cpp +++ b/src/layer/riscv/prelu_riscv.cpp @@ -63,7 +63,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -84,7 +84,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - 
word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -115,7 +115,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); @@ -135,7 +135,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -170,7 +170,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const const float* slope_ptr = (const float*)slope_data + q * elempack; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _slope = vle32_v_f32m8(slope_ptr, vl); @@ -191,7 +191,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const float slope = num_slope > 1 ? slope_data[q] : slope_data[0]; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -303,7 +303,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); @@ -324,7 +324,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -355,7 +355,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); @@ -375,7 +375,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -410,7 +410,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const float* slope_ptr = (const float*)slope_data + q * elempack; while (n1 > 0) { - word_type vl = vsetvl_e16m4(n1); + size_t vl = vsetvl_e16m4(n1); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = vle32_v_f32m8(slope_ptr, vl); @@ -431,7 +431,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float slope = num_slope > 1 ? 
slope_data[q] : slope_data[0]; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -468,7 +468,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_slope, vl), vl); vbool4_t _lower = vmflt_vf_f16m4_b4(_p, .0f, vl); @@ -489,7 +489,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl); @@ -520,7 +520,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_slope, vl), vl); @@ -540,7 +540,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl); @@ -575,7 +575,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const float* slope_ptr = (const float*)slope_data + q * elempack; while (n1 > 0) { - word_type vl = vsetvl_e16m4(n1); + size_t vl = vsetvl_e16m4(n1); vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(slope_ptr, vl), vl); @@ -596,7 +596,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) float slope = num_slope > 1 ? 
slope_data[q] : slope_data[0]; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl); diff --git a/src/layer/riscv/relu_riscv.cpp b/src/layer/riscv/relu_riscv.cpp index 6b23ebc3a634..cf2d40570690 100644 --- a/src/layer/riscv/relu_riscv.cpp +++ b/src/layer/riscv/relu_riscv.cpp @@ -58,10 +58,10 @@ int ReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - _p = vfmax_vf_f32m8(_p, (float32_t)0.f, vl); + _p = vfmax_vf_f32m8(_p, 0.f, vl); vse32_v_f32m8(ptr, _p, vl); ptr += vl; @@ -82,7 +82,7 @@ int ReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfmul_vf_f32m8_m(vmflt_vf_f32m8_b4(_p, .0f, vl), _p, _p, slope, vl); //slope: float(float32_t) @@ -124,10 +124,10 @@ int ReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - _p = vfmax_vf_f16m8(_p, (float16_t)0.f, vl); + _p = vfmax_vf_f16m8(_p, (__fp16)0.f, vl); vse16_v_f16m8(ptr, _p, vl); ptr += vl; @@ -137,10 +137,10 @@ int ReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c else { int n = size; - float16_t _slope = (float16_t)slope; + __fp16 _slope = (__fp16)slope; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfmul_vf_f16m8_m(vmflt_vf_f16m8_b2(_p, .0f, vl), _p, _p, _slope, vl); diff --git a/src/layer/riscv/riscv_activation.h b/src/layer/riscv/riscv_activation.h index 763e719b15d6..d5f114f3aaac 100644 --- a/src/layer/riscv/riscv_activation.h +++ b/src/layer/riscv/riscv_activation.h @@ -22,49 +22,49 @@ #include "rvv_mathfun.h" #include "rvv_mathfun_fp16s.h" -#define _RVV_FLOAT_ACTIVATION_PS(SEW, LMUL, MLEN) \ - static inline vfloat##SEW##m##LMUL##_t activation_ps(vfloat##SEW##m##LMUL##_t _v, int activation_type, const ncnn::Mat& activation_params, word_type vl) \ - { \ - if (activation_type == 1) \ - { \ - _v = vfmax_vf_f##SEW##m##LMUL(_v, 0.f, vl); \ - } \ - else if (activation_type == 2) \ - { \ - vbool##MLEN##_t _lemask = vmfle_vf_f##SEW##m##LMUL##_b##MLEN(_v, 0.f, vl); \ - _v = vfmul_vf_f##SEW##m##LMUL##_m(_lemask, _v, _v, activation_params[0], vl); \ - } \ - else if (activation_type == 3) \ - { \ - _v = vfmax_vf_f##SEW##m##LMUL(_v, activation_params[0], vl); \ - _v = vfmin_vf_f##SEW##m##LMUL(_v, activation_params[1], vl); \ - } \ - else if (activation_type == 4) \ - { \ - _v = sigmoid_ps(_v, vl); \ - } \ - else if (activation_type == 5) \ - { \ - _v = vfmul_vv_f##SEW##m##LMUL(_v, tanh_ps(log_ps(vfadd_vf_f##SEW##m##LMUL(exp_ps(_v, vl), 1.f, vl), vl), vl), vl); \ - } \ - else if (activation_type == 6) \ - { \ - const float alpha = activation_params[0]; \ - const float beta = activation_params[1]; \ - const float lower = -beta / alpha; \ - const float upper = (1.f / alpha) + lower; \ - vbool##MLEN##_t _lower = vmflt_vf_f##SEW##m##LMUL##_b##MLEN(_v, lower, vl); \ - vbool##MLEN##_t _higher = vmfgt_vf_f##SEW##m##LMUL##_b##MLEN(_v, upper, vl); \ - vbool##MLEN##_t _apply = vmnor_mm_b##MLEN(_lower, _higher, vl); \ - _v = 
vfmerge_vfm_f##SEW##m##LMUL(_lower, _v, .0f, vl); \ - \ - vfloat##SEW##m##LMUL##_t _p0 = vfadd_vf_f##SEW##m##LMUL##_m( \ - _apply, _v, /*op1*/ vfmul_vf_f##SEW##m##LMUL##_m(_apply, _v, _v, alpha, vl), beta, \ - vl); \ - _v = vfmul_vv_f##SEW##m##LMUL##_m(_apply, _v, /*op1*/ _v, _p0, vl); \ - } \ - \ - return _v; \ +#define _RVV_FLOAT_ACTIVATION_PS(SEW, LMUL, MLEN) \ + static inline vfloat##SEW##m##LMUL##_t activation_ps(vfloat##SEW##m##LMUL##_t _v, int activation_type, const ncnn::Mat& activation_params, size_t vl) \ + { \ + if (activation_type == 1) \ + { \ + _v = vfmax_vf_f##SEW##m##LMUL(_v, 0.f, vl); \ + } \ + else if (activation_type == 2) \ + { \ + vbool##MLEN##_t _lemask = vmfle_vf_f##SEW##m##LMUL##_b##MLEN(_v, 0.f, vl); \ + _v = vfmul_vf_f##SEW##m##LMUL##_m(_lemask, _v, _v, activation_params[0], vl); \ + } \ + else if (activation_type == 3) \ + { \ + _v = vfmax_vf_f##SEW##m##LMUL(_v, activation_params[0], vl); \ + _v = vfmin_vf_f##SEW##m##LMUL(_v, activation_params[1], vl); \ + } \ + else if (activation_type == 4) \ + { \ + _v = sigmoid_ps(_v, vl); \ + } \ + else if (activation_type == 5) \ + { \ + _v = vfmul_vv_f##SEW##m##LMUL(_v, tanh_ps(log_ps(vfadd_vf_f##SEW##m##LMUL(exp_ps(_v, vl), 1.f, vl), vl), vl), vl); \ + } \ + else if (activation_type == 6) \ + { \ + const float alpha = activation_params[0]; \ + const float beta = activation_params[1]; \ + const float lower = -beta / alpha; \ + const float upper = (1.f / alpha) + lower; \ + vbool##MLEN##_t _lower = vmflt_vf_f##SEW##m##LMUL##_b##MLEN(_v, lower, vl); \ + vbool##MLEN##_t _higher = vmfgt_vf_f##SEW##m##LMUL##_b##MLEN(_v, upper, vl); \ + vbool##MLEN##_t _apply = vmnor_mm_b##MLEN(_lower, _higher, vl); \ + _v = vfmerge_vfm_f##SEW##m##LMUL(_lower, _v, .0f, vl); \ + \ + vfloat##SEW##m##LMUL##_t _p0 = vfadd_vf_f##SEW##m##LMUL##_m( \ + _apply, _v, /*op1*/ vfmul_vf_f##SEW##m##LMUL##_m(_apply, _v, _v, alpha, vl), beta, \ + vl); \ + _v = vfmul_vv_f##SEW##m##LMUL##_m(_apply, _v, /*op1*/ _v, _p0, vl); \ + } \ + \ + return _v; \ } _RVV_FLOAT_ACTIVATION_PS(16, 1, 16) diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index f60faad50f72..596bf4435c64 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -53,7 +53,7 @@ static inline int csrr_vlenb() static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m8(packn * 8); + const size_t vl = vsetvl_e32m8(packn * 8); // NOTE vloxei8_v_f32m8 gets illegal instruction on d1 --- nihui @@ -90,7 +90,7 @@ static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m8(packn * 8); + const size_t vl = vsetvl_e16m8(packn * 8); // NOTE vloxei8_v_f16m8 gets illegal instruction on d1 --- nihui @@ -125,4 +125,278 @@ static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr) #endif // __riscv_zfh #endif // __riscv_vector +#if __riscv_vector && __rvv_tuple + +// f32m1, vsseg.v +static inline void vsseg8e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl) +{ + vfloat32m1x8_t _tmp = vcreate_f32m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vsseg8e32_v_f32m1x8(base, _tmp, vl); +} + +static inline void vsseg4e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl) 
+{ + vfloat32m1x4_t _tmp = vcreate_f32m1x4(v0, v1, v2, v3); + vsseg4e32_v_f32m1x4(base, _tmp, vl); +} + +static inline void vsseg2e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl) +{ + vfloat32m1x2_t _tmp = vcreate_f32m1x2(v0, v1); + vsseg2e32_v_f32m1x2(base, _tmp, vl); +} + +// f32m1, vssseg.v, 8/4/2 +static inline void vssseg8e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl) +{ + vfloat32m1x8_t _tmp = vcreate_f32m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vssseg8e32_v_f32m1x8(base, bstride, _tmp, vl); +} + +static inline void vssseg4e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl) +{ + vfloat32m1x4_t _tmp = vcreate_f32m1x4(v0, v1, v2, v3); + vssseg4e32_v_f32m1x4(base, bstride, _tmp, vl); +} + +static inline void vssseg2e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl) +{ + vfloat32m1x2_t _tmp = vcreate_f32m1x2(v0, v1); + vssseg2e32_v_f32m1x2(base, bstride, _tmp, vl); +} + +// f32m2, vsseg.v, 4/2 +static inline void vsseg4e32_v_f32m2(float32_t* base, vfloat32m2_t v0, vfloat32m2_t v1, vfloat32m2_t v2, vfloat32m2_t v3, size_t vl) +{ + vfloat32m2x4_t _tmp = vcreate_f32m2x4(v0, v1, v2, v3); + vsseg4e32_v_f32m2x4(base, _tmp, vl); +} + +static inline void vsseg2e32_v_f32m2(float32_t* base, vfloat32m2_t v0, vfloat32m2_t v1, size_t vl) +{ + vfloat32m2x2_t _tmp = vcreate_f32m2x2(v0, v1); + vsseg2e32_v_f32m2x2(base, _tmp, vl); +} + +// u16m1, vsseg.v, 8/4 +static inline void vsseg8e16_v_u16m1(uint16_t* base, vuint16m1_t v0, vuint16m1_t v1, vuint16m1_t v2, vuint16m1_t v3, vuint16m1_t v4, vuint16m1_t v5, vuint16m1_t v6, vuint16m1_t v7, size_t vl) +{ + vuint16m1x8_t _tmp = vcreate_u16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vsseg8e16_v_u16m1x8(base, _tmp, vl); +} + +static inline void vsseg4e16_v_u16m1(uint16_t* base, vuint16m1_t v0, vuint16m1_t v1, vuint16m1_t v2, vuint16m1_t v3, size_t vl) +{ + vuint16m1x4_t _tmp = vcreate_u16m1x4(v0, v1, v2, v3); + vsseg4e16_v_u16m1x4(base, _tmp, vl); +} + +// u16m2, vsseg.v, 4/2 +static inline void vsseg4e16_v_u16m2(uint16_t* base, vuint16m2_t v0, vuint16m2_t v1, vuint16m2_t v2, vuint16m2_t v3, size_t vl) +{ + vuint16m2x4_t _tmp = vcreate_u16m2x4(v0, v1, v2, v3); + vsseg4e16_v_u16m2x4(base, _tmp, vl); +} + +static inline void vsseg2e16_v_u16m2(uint16_t* base, vuint16m2_t v0, vuint16m2_t v1, size_t vl) +{ + vuint16m2x2_t _tmp = vcreate_u16m2x2(v0, v1); + vsseg2e16_v_u16m2x2(base, _tmp, vl); +} + +// f32m1, vlseg.v 8/4/2 +static inline void vlseg8e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, vfloat32m1_t* v4, vfloat32m1_t* v5, vfloat32m1_t* v6, vfloat32m1_t* v7, const float32_t* base, size_t vl) +{ + vfloat32m1x8_t _tmp = vlseg8e32_v_f32m1x8(base, vl); + *v0 = vget_f32m1x8_f32m1(_tmp, 0); + *v1 = vget_f32m1x8_f32m1(_tmp, 1); + *v2 = vget_f32m1x8_f32m1(_tmp, 2); + *v3 = vget_f32m1x8_f32m1(_tmp, 3); + *v4 = vget_f32m1x8_f32m1(_tmp, 4); + *v5 = vget_f32m1x8_f32m1(_tmp, 5); + *v6 = vget_f32m1x8_f32m1(_tmp, 6); + *v7 = vget_f32m1x8_f32m1(_tmp, 7); +} + +static inline void vlseg4e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, const float32_t* base, size_t vl) +{ + vfloat32m1x4_t _tmp = vlseg4e32_v_f32m1x4(base, vl); + *v0 = vget_f32m1x4_f32m1(_tmp, 0); + *v1 = vget_f32m1x4_f32m1(_tmp, 1); + *v2 = 
vget_f32m1x4_f32m1(_tmp, 2); + *v3 = vget_f32m1x4_f32m1(_tmp, 3); +} + +static inline void vlseg2e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, const float32_t* base, size_t vl) +{ + vfloat32m1x2_t _tmp = vlseg2e32_v_f32m1x2(base, vl); + *v0 = vget_f32m1x2_f32m1(_tmp, 0); + *v1 = vget_f32m1x2_f32m1(_tmp, 1); +} + +// f32m2, vlseg.v, 4 +static inline void vlseg4e32_v_f32m2(vfloat32m2_t* v0, vfloat32m2_t* v1, vfloat32m2_t* v2, vfloat32m2_t* v3, const float32_t* base, size_t vl) +{ + vfloat32m2x4_t _tmp = vlseg4e32_v_f32m2x4(base, vl); + *v0 = vget_f32m2x4_f32m2(_tmp, 0); + *v1 = vget_f32m2x4_f32m2(_tmp, 1); + *v2 = vget_f32m2x4_f32m2(_tmp, 2); + *v3 = vget_f32m2x4_f32m2(_tmp, 3); +} + +// f32m4, vlseg.v, 2 +static inline void vlseg2e32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float32_t* base, size_t vl) +{ + vfloat32m4x2_t _tmp = vlseg2e32_v_f32m4x2(base, vl); + *v0 = vget_f32m4x2_f32m4(_tmp, 0); + *v1 = vget_f32m4x2_f32m4(_tmp, 1); +} + +// f32m4, vloxseg.v +static inline void vloxseg2ei32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float32_t* base, vuint32m4_t bindex, size_t vl) +{ + vfloat32m4x2_t _tmp = vloxseg2ei32_v_f32m4x2(base, bindex, vl); + *v0 = vget_f32m4x2_f32m4(_tmp, 0); + *v1 = vget_f32m4x2_f32m4(_tmp, 1); +} + +// u16m1, vlseg.v 8/4/2 +static inline void vlseg8e16_v_u16m1(vuint16m1_t* v0, vuint16m1_t* v1, vuint16m1_t* v2, vuint16m1_t* v3, vuint16m1_t* v4, vuint16m1_t* v5, vuint16m1_t* v6, vuint16m1_t* v7, const uint16_t* base, size_t vl) +{ + vuint16m1x8_t _tmp = vlseg8e16_v_u16m1x8(base, vl); + *v0 = vget_u16m1x8_u16m1(_tmp, 0); + *v1 = vget_u16m1x8_u16m1(_tmp, 1); + *v2 = vget_u16m1x8_u16m1(_tmp, 2); + *v3 = vget_u16m1x8_u16m1(_tmp, 3); + *v4 = vget_u16m1x8_u16m1(_tmp, 4); + *v5 = vget_u16m1x8_u16m1(_tmp, 5); + *v6 = vget_u16m1x8_u16m1(_tmp, 6); + *v7 = vget_u16m1x8_u16m1(_tmp, 7); +} + +static inline void vlseg4e16_v_u16m1(vuint16m1_t* v0, vuint16m1_t* v1, vuint16m1_t* v2, vuint16m1_t* v3, const uint16_t* base, size_t vl) +{ + vuint16m1x4_t _tmp = vlseg4e16_v_u16m1x4(base, vl); + *v0 = vget_u16m1x4_u16m1(_tmp, 0); + *v1 = vget_u16m1x4_u16m1(_tmp, 1); + *v2 = vget_u16m1x4_u16m1(_tmp, 2); + *v3 = vget_u16m1x4_u16m1(_tmp, 3); +} + +static inline void vlseg2e16_v_u16m1(vuint16m1_t* v0, vuint16m1_t* v1, const uint16_t* base, size_t vl) +{ + vuint16m1x2_t _tmp = vlseg2e16_v_u16m1x2(base, vl); + *v0 = vget_u16m1x2_u16m1(_tmp, 0); + *v1 = vget_u16m1x2_u16m1(_tmp, 1); +} + +// u16m2, vlseg.v, 4 +static inline void vlseg4e16_v_u16m2(vuint16m2_t* v0, vuint16m2_t* v1, vuint16m2_t* v2, vuint16m2_t* v3, const uint16_t* base, size_t vl) +{ + vuint16m2x4_t _tmp = vlseg4e16_v_u16m2x4(base, vl); + *v0 = vget_u16m2x4_u16m2(_tmp, 0); + *v1 = vget_u16m2x4_u16m2(_tmp, 1); + *v2 = vget_u16m2x4_u16m2(_tmp, 2); + *v3 = vget_u16m2x4_u16m2(_tmp, 3); +} + +// u16m4, vlseg.v, 2 +static inline void vlseg2e16_v_u16m4(vuint16m4_t* v0, vuint16m4_t* v1, const uint16_t* base, size_t vl) +{ + vuint16m4x2_t _tmp = vlseg2e16_v_u16m4x2(base, vl); + *v0 = vget_u16m4x2_u16m4(_tmp, 0); + *v1 = vget_u16m4x2_u16m4(_tmp, 1); +} + +#if __riscv_zfh + +// f16m1, vsseg.v, 8/4/2 +static inline void vsseg8e16_v_f16m1(float16_t* base, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, vfloat16m1_t v4, vfloat16m1_t v5, vfloat16m1_t v6, vfloat16m1_t v7, size_t vl) +{ + vfloat16m1x8_t _tmp = vcreate_f16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vsseg8e16_v_f16m1x8(base, _tmp, vl); +} + +static inline void vsseg4e16_v_f16m1(float16_t* base, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, 
vfloat16m1_t v3, size_t vl) +{ + vfloat16m1x4_t _tmp = vcreate_f16m1x4(v0, v1, v2, v3); + vsseg4e16_v_f16m1x4(base, _tmp, vl); +} + +static inline void vsseg2e16_v_f16m1(float16_t* base, vfloat16m1_t v0, vfloat16m1_t v1, size_t vl) +{ + vfloat16m1x2_t _tmp = vcreate_f16m1x2(v0, v1); + vsseg2e16_v_f16m1x2(base, _tmp, vl); +} + +// f16m1, vssseg.v, 8/4/2 +static inline void vssseg8e16_v_f16m1(float16_t* base, ptrdiff_t bstride, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, vfloat16m1_t v4, vfloat16m1_t v5, vfloat16m1_t v6, vfloat16m1_t v7, size_t vl) +{ + vfloat16m1x8_t _tmp = vcreate_f16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vssseg8e16_v_f16m1x8(base, bstride, _tmp, vl); +} + +static inline void vssseg4e16_v_f16m1(float16_t* base, ptrdiff_t bstride, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, size_t vl) +{ + vfloat16m1x4_t _tmp = vcreate_f16m1x4(v0, v1, v2, v3); + vssseg4e16_v_f16m1x4(base, bstride, _tmp, vl); +} + +static inline void vssseg2e16_v_f16m1(float16_t* base, ptrdiff_t bstride, vfloat16m1_t v0, vfloat16m1_t v1, size_t vl) +{ + vfloat16m1x2_t _tmp = vcreate_f16m1x2(v0, v1); + vssseg2e16_v_f16m1x2(base, bstride, _tmp, vl); +} + +// f16m1, vlseg.v 8/4/2 +static inline void vlseg8e16_v_f16m1(vfloat16m1_t* v0, vfloat16m1_t* v1, vfloat16m1_t* v2, vfloat16m1_t* v3, vfloat16m1_t* v4, vfloat16m1_t* v5, vfloat16m1_t* v6, vfloat16m1_t* v7, const float16_t* base, size_t vl) +{ + vfloat16m1x8_t _tmp = vlseg8e16_v_f16m1x8(base, vl); + *v0 = vget_f16m1x8_f16m1(_tmp, 0); + *v1 = vget_f16m1x8_f16m1(_tmp, 1); + *v2 = vget_f16m1x8_f16m1(_tmp, 2); + *v3 = vget_f16m1x8_f16m1(_tmp, 3); + *v4 = vget_f16m1x8_f16m1(_tmp, 4); + *v5 = vget_f16m1x8_f16m1(_tmp, 5); + *v6 = vget_f16m1x8_f16m1(_tmp, 6); + *v7 = vget_f16m1x8_f16m1(_tmp, 7); +} + +static inline void vlseg4e16_v_f16m1(vfloat16m1_t* v0, vfloat16m1_t* v1, vfloat16m1_t* v2, vfloat16m1_t* v3, const float16_t* base, size_t vl) +{ + vfloat16m1x4_t _tmp = vlseg4e16_v_f16m1x4(base, vl); + *v0 = vget_f16m1x4_f16m1(_tmp, 0); + *v1 = vget_f16m1x4_f16m1(_tmp, 1); + *v2 = vget_f16m1x4_f16m1(_tmp, 2); + *v3 = vget_f16m1x4_f16m1(_tmp, 3); +} + +static inline void vlseg2e16_v_f16m1(vfloat16m1_t* v0, vfloat16m1_t* v1, const float16_t* base, size_t vl) +{ + vfloat16m1x2_t _tmp = vlseg2e16_v_f16m1x2(base, vl); + *v0 = vget_f16m1x2_f16m1(_tmp, 0); + *v1 = vget_f16m1x2_f16m1(_tmp, 1); +} + +// f16m2, vlseg.v, 4 +static inline void vlseg4e16_v_f16m2(vfloat16m2_t* v0, vfloat16m2_t* v1, vfloat16m2_t* v2, vfloat16m2_t* v3, const float16_t* base, size_t vl) +{ + vfloat16m2x4_t _tmp = vlseg4e16_v_f16m2x4(base, vl); + *v0 = vget_f16m2x4_f16m2(_tmp, 0); + *v1 = vget_f16m2x4_f16m2(_tmp, 1); + *v2 = vget_f16m2x4_f16m2(_tmp, 2); + *v3 = vget_f16m2x4_f16m2(_tmp, 3); +} + +// f16m4, vlseg.v, 2 +static inline void vlseg2e16_v_f16m4(vfloat16m4_t* v0, vfloat16m4_t* v1, const float16_t* base, size_t vl) +{ + vfloat16m4x2_t _tmp = vlseg2e16_v_f16m4x2(base, vl); + *v0 = vget_f16m4x2_f16m4(_tmp, 0); + *v1 = vget_f16m4x2_f16m4(_tmp, 1); +} + +#endif // __riscv_zfh +#endif // __riscv_vector + #endif // RISCV_USABILITY_H diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h index 8993b5ad8e69..aa966de6c86a 100644 --- a/src/layer/riscv/rvv_mathfun.h +++ b/src/layer/riscv/rvv_mathfun.h @@ -32,7 +32,7 @@ #define c_cephes_log_q2 0.693359375 #define _RVV_FLOAT32_LOG_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t log_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t log_ps(vfloat32m##LMUL##_t x, 
size_t vl) \ { \ x = vfmax_vf_f32m##LMUL(x, 0.f, vl); /* force flush to zero on denormal values */ \ vbool##MLEN##_t invalid_mask = vmfle_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl); \ @@ -118,7 +118,7 @@ _RVV_FLOAT32_LOG_OP(8, 4) #define c_cephes_exp_p5 5.0000001201E-1 #define _RVV_FLOAT32_EXP_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t exp_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t exp_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ vfloat32m##LMUL##_t tmp, fx; \ \ @@ -184,7 +184,7 @@ _RVV_FLOAT32_EXP_OP(8, 4) #define c_cephes_FOPI 1.27323954473516 // 4 / M_PI #define _RVV_FLOAT32_SINCOS_OP(LMUL, MLEN) \ - static inline void sincos_ps(vfloat32m##LMUL##_t x, vfloat32m##LMUL##_t* ysin, vfloat32m##LMUL##_t* ycos, word_type vl) \ + static inline void sincos_ps(vfloat32m##LMUL##_t x, vfloat32m##LMUL##_t* ysin, vfloat32m##LMUL##_t* ycos, size_t vl) \ { \ /* any x */ \ vfloat32m##LMUL##_t xmm1, xmm2, xmm3, y; \ @@ -257,12 +257,12 @@ _RVV_FLOAT32_SINCOS_OP(2, 16) _RVV_FLOAT32_SINCOS_OP(4, 8) _RVV_FLOAT32_SINCOS_OP(8, 4) -#define _RVV_FLOAT32_SIN_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t sin_ps(vfloat32m##LMUL##_t x, word_type vl) \ - { \ - vfloat32m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ysin; \ +#define _RVV_FLOAT32_SIN_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t sin_ps(vfloat32m##LMUL##_t x, size_t vl) \ + { \ + vfloat32m##LMUL##_t ysin, ycos; \ + sincos_ps(x, &ysin, &ycos, vl); \ + return ysin; \ } _RVV_FLOAT32_SIN_OP(1, 32) @@ -270,12 +270,12 @@ _RVV_FLOAT32_SIN_OP(2, 16) _RVV_FLOAT32_SIN_OP(4, 8) _RVV_FLOAT32_SIN_OP(8, 4) -#define _RVV_FLOAT32_COS_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t cos_ps(vfloat32m##LMUL##_t x, word_type vl) \ - { \ - vfloat32m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ycos; \ +#define _RVV_FLOAT32_COS_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t cos_ps(vfloat32m##LMUL##_t x, size_t vl) \ + { \ + vfloat32m##LMUL##_t ysin, ycos; \ + sincos_ps(x, &ysin, &ycos, vl); \ + return ycos; \ } _RVV_FLOAT32_COS_OP(1, 32) @@ -293,7 +293,7 @@ _RVV_FLOAT32_COS_OP(8, 4) #define c_cephes_tanh_p4 -3.33332819422E-1 #define _RVV_FLOAT32_TANH_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t tanh_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t tanh_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ vfloat32m##LMUL##_t x2 = vfsgnj_vf_f32m##LMUL(x, 1.f, vl); \ \ @@ -341,11 +341,11 @@ _RVV_FLOAT32_TANH_OP(2, 16) _RVV_FLOAT32_TANH_OP(4, 8) _RVV_FLOAT32_TANH_OP(8, 4) -#define _RVV_FLOAT32_POW_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t pow_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, word_type vl) \ - { \ - /* pow(x, m) = exp(m * log(x)) */ \ - return exp_ps(vfmul_vv_f32m##LMUL(b, log_ps(a, vl), vl), vl); \ +#define _RVV_FLOAT32_POW_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t pow_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, size_t vl) \ + { \ + /* pow(x, m) = exp(m * log(x)) */ \ + return exp_ps(vfmul_vv_f32m##LMUL(b, log_ps(a, vl), vl), vl); \ } _RVV_FLOAT32_POW_OP(1, 32) @@ -354,7 +354,7 @@ _RVV_FLOAT32_POW_OP(4, 8) _RVV_FLOAT32_POW_OP(8, 4) #define _RVV_FLOAT32_SIGMOID_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t sigmoid_ps(vfloat32m##LMUL##_t _v, word_type vl) \ + static inline vfloat32m##LMUL##_t sigmoid_ps(vfloat32m##LMUL##_t _v, size_t vl) \ { \ _v = vfneg_v_f32m##LMUL(_v, vl); \ _v = exp_ps(_v, vl); \ @@ -447,8 +447,8 @@ _RVV_FLOAT32_SIGMOID_OP(8, 4) #define c_erfc_sb7 -2.2440952301e+01f /* 0xc1b38712 
*/ #define _RVV_FLOAT32_FMA_HELPER(LMUL) \ - static inline vfloat32m##LMUL##_t vfmadd_vff_f32m##LMUL(vfloat32m##LMUL##_t a, float32_t b, \ - float32_t c, word_type vl) \ + static inline vfloat32m##LMUL##_t vfmadd_vff_f32m##LMUL(vfloat32m##LMUL##_t a, float b, \ + float c, size_t vl) \ { \ vfloat32m##LMUL##_t ret = vfmul_vf_f32m##LMUL(a, b, vl); \ ret = vfadd_vf_f32m##LMUL(ret, c, vl); \ @@ -456,7 +456,7 @@ _RVV_FLOAT32_SIGMOID_OP(8, 4) } \ \ static inline vfloat32m##LMUL##_t vfmadd_vvf_f32m##LMUL(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, \ - float32_t c, word_type vl) \ + float c, size_t vl) \ { \ vfloat32m##LMUL##_t ret = vfmul_vv_f32m##LMUL(a, b, vl); \ ret = vfadd_vf_f32m##LMUL(ret, c, vl); \ @@ -469,7 +469,7 @@ _RVV_FLOAT32_FMA_HELPER(2) _RVV_FLOAT32_FMA_HELPER(1) #define _RVV_FLOAT32_ERFC_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t erfc_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t erfc_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ /* Argument for polys */ \ vfloat32m##LMUL##_t absx = vfsgnjx_vv_f32m##LMUL(x, x, vl); \ diff --git a/src/layer/riscv/rvv_mathfun_fp16s.h b/src/layer/riscv/rvv_mathfun_fp16s.h index 129a4f940378..e7f18b961ae1 100644 --- a/src/layer/riscv/rvv_mathfun_fp16s.h +++ b/src/layer/riscv/rvv_mathfun_fp16s.h @@ -32,7 +32,7 @@ #define c_cephes_log_q2 0.693359375 #define _RVV_FLOAT16_LOG_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t log_ps(vfloat16m##LMUL##_t x, word_type vl) \ + static inline vfloat16m##LMUL##_t log_ps(vfloat16m##LMUL##_t x, size_t vl) \ { \ x = vfmax_vf_f16m##LMUL(x, 0.f, vl); /* force flush to zero on denormal values */ \ vbool##MLEN##_t invalid_mask = vmfle_vf_f16m##LMUL##_b##MLEN(x, 0.f, vl); \ @@ -118,7 +118,7 @@ _RVV_FLOAT16_LOG_OP(8, 2) #define c_cephes_exp_p5 5.0000001201E-1 #define _RVV_FLOAT16_EXP_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t exp_ps(vfloat16m##LMUL##_t x, word_type vl) \ + static inline vfloat16m##LMUL##_t exp_ps(vfloat16m##LMUL##_t x, size_t vl) \ { \ vfloat16m##LMUL##_t tmp, fx; \ \ @@ -184,7 +184,7 @@ _RVV_FLOAT16_EXP_OP(8, 2) #define c_cephes_FOPI 1.27323954473516 // 4 / M_PI #define _RVV_FLOAT16_SINCOS_OP(LMUL, MLEN) \ - static inline void sincos_ps(vfloat16m##LMUL##_t x, vfloat16m##LMUL##_t* ysin, vfloat16m##LMUL##_t* ycos, word_type vl) \ + static inline void sincos_ps(vfloat16m##LMUL##_t x, vfloat16m##LMUL##_t* ysin, vfloat16m##LMUL##_t* ycos, size_t vl) \ { \ /* any x */ \ vfloat16m##LMUL##_t xmm1, xmm2, xmm3, y; \ @@ -257,12 +257,12 @@ _RVV_FLOAT16_SINCOS_OP(2, 8) _RVV_FLOAT16_SINCOS_OP(4, 4) _RVV_FLOAT16_SINCOS_OP(8, 2) -#define _RVV_FLOAT16_SIN_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t sin_ps(vfloat16m##LMUL##_t x, word_type vl) \ - { \ - vfloat16m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ysin; \ +#define _RVV_FLOAT16_SIN_OP(LMUL, MLEN) \ + static inline vfloat16m##LMUL##_t sin_ps(vfloat16m##LMUL##_t x, size_t vl) \ + { \ + vfloat16m##LMUL##_t ysin, ycos; \ + sincos_ps(x, &ysin, &ycos, vl); \ + return ysin; \ } _RVV_FLOAT16_SIN_OP(1, 16) @@ -270,12 +270,12 @@ _RVV_FLOAT16_SIN_OP(2, 8) _RVV_FLOAT16_SIN_OP(4, 4) _RVV_FLOAT16_SIN_OP(8, 2) -#define _RVV_FLOAT16_COS_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t cos_ps(vfloat16m##LMUL##_t x, word_type vl) \ - { \ - vfloat16m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ycos; \ +#define _RVV_FLOAT16_COS_OP(LMUL, MLEN) \ + static inline vfloat16m##LMUL##_t cos_ps(vfloat16m##LMUL##_t x, size_t vl) \ + { \ + vfloat16m##LMUL##_t ysin, ycos; \ + 
sincos_ps(x, &ysin, &ycos, vl); \ + return ycos; \ } _RVV_FLOAT16_COS_OP(1, 16) @@ -293,7 +293,7 @@ _RVV_FLOAT16_COS_OP(8, 2) #define c_cephes_tanh_p4 -3.33332819422E-1 #define _RVV_FLOAT16_TANH_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t tanh_ps(vfloat16m##LMUL##_t x, word_type vl) \ + static inline vfloat16m##LMUL##_t tanh_ps(vfloat16m##LMUL##_t x, size_t vl) \ { \ vfloat16m##LMUL##_t x2 = vfsgnj_vf_f16m##LMUL(x, 1.f, vl); \ \ @@ -341,11 +341,11 @@ _RVV_FLOAT16_TANH_OP(2, 8) _RVV_FLOAT16_TANH_OP(4, 4) _RVV_FLOAT16_TANH_OP(8, 2) -#define _RVV_FLOAT16_POW_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t pow_ps(vfloat16m##LMUL##_t a, vfloat16m##LMUL##_t b, word_type vl) \ - { \ - /* pow(x, m) = exp(m * log(x)) */ \ - return exp_ps(vfmul_vv_f16m##LMUL(b, log_ps(a, vl), vl), vl); \ +#define _RVV_FLOAT16_POW_OP(LMUL, MLEN) \ + static inline vfloat16m##LMUL##_t pow_ps(vfloat16m##LMUL##_t a, vfloat16m##LMUL##_t b, size_t vl) \ + { \ + /* pow(x, m) = exp(m * log(x)) */ \ + return exp_ps(vfmul_vv_f16m##LMUL(b, log_ps(a, vl), vl), vl); \ } _RVV_FLOAT16_POW_OP(1, 16) @@ -354,7 +354,7 @@ _RVV_FLOAT16_POW_OP(4, 4) _RVV_FLOAT16_POW_OP(8, 2) #define _RVV_FLOAT16_SIGMOID_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t sigmoid_ps(vfloat16m##LMUL##_t _v, word_type vl) \ + static inline vfloat16m##LMUL##_t sigmoid_ps(vfloat16m##LMUL##_t _v, size_t vl) \ { \ _v = vfneg_v_f16m##LMUL(_v, vl); \ _v = exp_ps(_v, vl); \ diff --git a/src/layer/riscv/selu_riscv.cpp b/src/layer/riscv/selu_riscv.cpp index 9a4939c84211..932db355cc20 100644 --- a/src/layer/riscv/selu_riscv.cpp +++ b/src/layer/riscv/selu_riscv.cpp @@ -39,7 +39,7 @@ int SELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, 0.f, vl); vbool4_t _higher = vmnot_m_b4(_lower, vl); diff --git a/src/layer/riscv/sigmoid_riscv.cpp b/src/layer/riscv/sigmoid_riscv.cpp index afd07ea2b383..6c10582c668b 100644 --- a/src/layer/riscv/sigmoid_riscv.cpp +++ b/src/layer/riscv/sigmoid_riscv.cpp @@ -64,7 +64,7 @@ int Sigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = sigmoid_ps(_p, vl); @@ -104,7 +104,7 @@ int Sigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = sigmoid_ps(_p, vl); @@ -135,7 +135,7 @@ int Sigmoid_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = sigmoid_ps(_p, vl); diff --git a/src/layer/riscv/softmax_riscv.cpp b/src/layer/riscv/softmax_riscv.cpp index 7a93e5de18dc..ca910c3d3c09 100644 --- a/src/layer/riscv/softmax_riscv.cpp +++ b/src/layer/riscv/softmax_riscv.cpp @@ -44,7 +44,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_vol = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); vfloat32m1_t _max = vfmv_s_f_f32m1(vundefined_f32m1(), max, vl); @@ -61,7 +61,7 @@ int Softmax_riscv::forward_inplace(Mat& 
bottom_top_blob, const Option& opt) cons ptr_vol = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl); vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); @@ -80,7 +80,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons ptr_vol = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); _p = vfdiv_vf_f32m8(_p, sum, vl); @@ -112,7 +112,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); @@ -141,7 +141,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl); @@ -168,7 +168,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl); @@ -198,7 +198,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr1 = ptr; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p = vle32_v_f32m8(ptr1, vl); vfloat32m1_t _m = vfmv_s_f_f32m1(vundefined_f32m1(), m, vl); @@ -215,7 +215,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr2 = ptr; while (n2 > 0) { - word_type vl = vsetvl_e32m8(n2); + size_t vl = vsetvl_e32m8(n2); vfloat32m8_t _p = vle32_v_f32m8(ptr2, vl); vfloat32m1_t _s = vfmv_s_f_f32m1(vundefined_f32m1(), s, vl); @@ -233,7 +233,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr3 = ptr; while (n3 > 0) { - word_type vl = vsetvl_e32m8(n3); + size_t vl = vsetvl_e32m8(n3); vfloat32m8_t _p = vle32_v_f32m8(ptr3, vl); @@ -269,7 +269,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _max = vle32_v_f32m8(max, vl); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); @@ -295,7 +295,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl); vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl); @@ -319,7 +319,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl); @@ -358,7 +358,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _maxptr = vle32_v_f32m8(maxptr_vol, vl); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); @@ -392,7 +392,7 @@ int Softmax_riscv::forward_inplace(Mat& 
bottom_top_blob, const Option& opt) cons while (n) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _maxptr = vle32_v_f32m8(maxptr_vol, vl); vfloat32m8_t _sumptr = vle32_v_f32m8(sumptr_vol, vl); @@ -422,7 +422,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _sumptr = vle32_v_f32m8(sumptr_vol, vl); @@ -457,7 +457,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_1 = ptr; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p = vle32_v_f32m8(ptr_1, vl); vfloat32m1_t _scalar_max = vfmv_s_f_f32m1(vundefined_f32m1(), max, vl); _scalar_max = vfredmax_vs_f32m8_f32m1(_scalar_max, _p, _scalar_max, vl); @@ -473,7 +473,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_2 = ptr; while (n2 > 0) { - word_type vl = vsetvl_e32m8(n2); + size_t vl = vsetvl_e32m8(n2); vfloat32m8_t _p = vle32_v_f32m8(ptr_2, vl); vfloat32m1_t _scalar_sum = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl); @@ -491,7 +491,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_3 = ptr; while (n3 > 0) { - word_type vl = vsetvl_e32m8(n3); + size_t vl = vsetvl_e32m8(n3); vfloat32m8_t _p = vle32_v_f32m8(ptr_3, vl); _p = vfdiv_vf_f32m8(_p, sum, vl); diff --git a/src/layer/riscv/swish_riscv.cpp b/src/layer/riscv/swish_riscv.cpp index f12ab157ae9c..17493d7db69a 100644 --- a/src/layer/riscv/swish_riscv.cpp +++ b/src/layer/riscv/swish_riscv.cpp @@ -64,7 +64,7 @@ int Swish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfdiv_vv_f32m8(_p, vfadd_vf_f32m8(exp_ps(vfneg_v_f32m8(_p, vl), vl), 1.f, vl), vl); @@ -103,7 +103,7 @@ int Swish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = vfdiv_vv_f32m8(_p, vfadd_vf_f32m8(exp_ps(vfneg_v_f32m8(_p, vl), vl), 1.f, vl), vl); @@ -134,7 +134,7 @@ int Swish_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfdiv_vv_f16m8(_p, vfadd_vf_f16m8(exp_ps(vfneg_v_f16m8(_p, vl), vl), 1.f, vl), vl); diff --git a/src/layer/riscv/tanh_riscv.cpp b/src/layer/riscv/tanh_riscv.cpp index b0f0cafe7d70..d47de61dc59c 100644 --- a/src/layer/riscv/tanh_riscv.cpp +++ b/src/layer/riscv/tanh_riscv.cpp @@ -64,7 +64,7 @@ int TanH_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = tanh_ps(_p, vl); @@ -103,7 +103,7 @@ int TanH_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = tanh_ps(_p, vl); @@ -134,7 +134,7 @@ int 
TanH_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = tanh_ps(_p, vl); diff --git a/src/layer/riscv/unaryop_riscv.cpp b/src/layer/riscv/unaryop_riscv.cpp index 62c6a52740b5..e5eb80151b17 100644 --- a/src/layer/riscv/unaryop_riscv.cpp +++ b/src/layer/riscv/unaryop_riscv.cpp @@ -55,7 +55,7 @@ static int unary_op_inplace(Mat& a, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = op(_p, vl); @@ -73,7 +73,7 @@ namespace UnaryOp_riscv_functor { struct unary_op_abs { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return vfsgnj_vf_f32m8(x, 1.f, vl); } @@ -81,7 +81,7 @@ struct unary_op_abs struct unary_op_neg { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return vfneg_v_f32m8(x, vl); } @@ -89,7 +89,7 @@ struct unary_op_neg struct unary_op_floor { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl); vbool4_t _mask = vmfgt_vv_f32m8_b4(vfcvt_f_x_v_f32m8(_xi, vl), x, vl); @@ -99,7 +99,7 @@ struct unary_op_floor struct unary_op_ceil { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl); vbool4_t _mask = vmflt_vv_f32m8_b4(vfcvt_f_x_v_f32m8(_xi, vl), x, vl); @@ -109,7 +109,7 @@ struct unary_op_ceil struct unary_op_square { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return vfmul_vv_f32m8(x, x, vl); } @@ -117,7 +117,7 @@ struct unary_op_square struct unary_op_sqrt { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return vfsqrt_v_f32m8(x, vl); } @@ -125,7 +125,7 @@ struct unary_op_sqrt struct unary_op_rsqrt { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { vfloat32m8_t _reciprocal = vfrsqrt7_v_f32m8(x, vl); _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(vfmul_vf_f32m8(x, 0.5f, vl), vfmul_vv_f32m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); @@ -136,7 +136,7 @@ struct unary_op_rsqrt struct unary_op_exp { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return exp_ps(x, vl); } @@ -144,7 +144,7 @@ struct unary_op_exp struct unary_op_log { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return log_ps(x, vl); } @@ -152,7 +152,7 @@ struct unary_op_log struct unary_op_sin { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return sin_ps(x, vl); } @@ -160,7 +160,7 @@ struct unary_op_sin struct unary_op_cos { - 
vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return cos_ps(x, vl); } @@ -168,7 +168,7 @@ struct unary_op_cos struct unary_op_tan { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector tmp(vl); @@ -183,7 +183,7 @@ struct unary_op_tan struct unary_op_asin { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector tmp(vl); @@ -198,7 +198,7 @@ struct unary_op_asin struct unary_op_acos { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector tmp(vl); @@ -213,7 +213,7 @@ struct unary_op_acos struct unary_op_atan { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector tmp(vl); @@ -228,7 +228,7 @@ struct unary_op_atan struct unary_op_reciprocal { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { vfloat32m8_t _reciprocal = vfrec7_v_f32m8(x, vl); _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); @@ -239,7 +239,7 @@ struct unary_op_reciprocal struct unary_op_tanh { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return tanh_ps(x, vl); } @@ -338,7 +338,7 @@ static int unary_op_inplace_fp16s(Mat& a, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = op(_p, vl); @@ -356,7 +356,7 @@ namespace UnaryOp_riscv_functor { struct unary_op_abs_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return vfsgnj_vf_f16m8(x, 1.f, vl); } @@ -364,7 +364,7 @@ struct unary_op_abs_fp16s struct unary_op_neg_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return vfneg_v_f16m8(x, vl); } @@ -372,7 +372,7 @@ struct unary_op_neg_fp16s struct unary_op_floor_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { vint16m8_t _xi = vfcvt_x_f_v_i16m8(x, vl); vbool2_t _mask = vmfgt_vv_f16m8_b2(vfcvt_f_x_v_f16m8(_xi, vl), x, vl); @@ -382,7 +382,7 @@ struct unary_op_floor_fp16s struct unary_op_ceil_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { vint16m8_t _xi = vfcvt_x_f_v_i16m8(x, vl); vbool2_t _mask = vmflt_vv_f16m8_b2(vfcvt_f_x_v_f16m8(_xi, vl), x, vl); @@ -392,7 +392,7 @@ struct unary_op_ceil_fp16s struct unary_op_square_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return vfmul_vv_f16m8(x, x, vl); } @@ -400,7 
+400,7 @@ struct unary_op_square_fp16s struct unary_op_sqrt_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return vfsqrt_v_f16m8(x, vl); } @@ -408,7 +408,7 @@ struct unary_op_sqrt_fp16s struct unary_op_rsqrt_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { vfloat16m8_t _reciprocal = vfrsqrt7_v_f16m8(x, vl); _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(vfmul_vf_f16m8(x, 0.5f, vl), vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); @@ -419,7 +419,7 @@ struct unary_op_rsqrt_fp16s struct unary_op_exp_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return exp_ps(x, vl); } @@ -427,7 +427,7 @@ struct unary_op_exp_fp16s struct unary_op_log_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return log_ps(x, vl); } @@ -435,7 +435,7 @@ struct unary_op_log_fp16s struct unary_op_sin_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return sin_ps(x, vl); } @@ -443,7 +443,7 @@ struct unary_op_sin_fp16s struct unary_op_cos_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return cos_ps(x, vl); } @@ -451,7 +451,7 @@ struct unary_op_cos_fp16s struct unary_op_tan_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector<__fp16> tmp(vl); @@ -466,7 +466,7 @@ struct unary_op_tan_fp16s struct unary_op_asin_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector<__fp16> tmp(vl); @@ -481,7 +481,7 @@ struct unary_op_asin_fp16s struct unary_op_acos_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector<__fp16> tmp(vl); @@ -496,7 +496,7 @@ struct unary_op_acos_fp16s struct unary_op_atan_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector<__fp16> tmp(vl); @@ -511,7 +511,7 @@ struct unary_op_atan_fp16s struct unary_op_reciprocal_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { vfloat16m8_t _reciprocal = vfrec7_v_f16m8(x, vl); _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); @@ -522,7 +522,7 @@ struct unary_op_reciprocal_fp16s struct unary_op_tanh_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return tanh_ps(x, vl); } diff --git a/src/layer/squeeze.cpp b/src/layer/squeeze.cpp index 14840a8215ef..7f9de8933cdf 100644 
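The unary_op_rsqrt and unary_op_reciprocal functors in the unaryop hunks above refine the low-precision vfrsqrt7 / vfrec7 estimates with a single Newton-Raphson step. A scalar C++ sketch of that step, illustrative only and not part of the patch (the helper names are mine):

// One Newton-Raphson iteration roughly doubles the number of correct bits
// of the hardware estimate.
static inline float refine_rsqrt(float x, float r) // r ~ 1/sqrt(x)
{
    return r * (1.5f - 0.5f * x * r * r); // same arithmetic as the vfmul/vfrsub chain
}

static inline float refine_recip(float x, float r) // r ~ 1/x
{
    return r * (2.f - x * r); // matches the vfrsub_vf_f32m8(..., 2.f, ...) step
}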
--- a/src/layer/squeeze.cpp +++ b/src/layer/squeeze.cpp @@ -26,6 +26,7 @@ int Squeeze::load_param(const ParamDict& pd) { squeeze_w = pd.get(0, 0); squeeze_h = pd.get(1, 0); + squeeze_d = pd.get(11, 0); squeeze_c = pd.get(2, 0); axes = pd.get(3, Mat()); @@ -36,17 +37,20 @@ int Squeeze::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c { int w = bottom_blob.w; int h = bottom_blob.h; + int d = bottom_blob.d; int channels = bottom_blob.c; int dims = bottom_blob.dims; bool _squeeze_w = false; bool _squeeze_h = false; + bool _squeeze_d = false; bool _squeeze_c = false; if (axes.empty()) { _squeeze_w = w == 1 && squeeze_w; _squeeze_h = h == 1 && squeeze_h; + _squeeze_d = d == 1 && squeeze_d; _squeeze_c = channels == 1 && squeeze_c; } else @@ -82,6 +86,22 @@ int Squeeze::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c { _squeeze_w = w == 1; } + if (dims == 4 && axis == 0) + { + _squeeze_c = channels == 1; + } + if (dims == 4 && axis == 1) + { + _squeeze_d = d == 1; + } + if (dims == 4 && axis == 2) + { + _squeeze_h = h == 1; + } + if (dims == 4 && axis == 3) + { + _squeeze_w = w == 1; + } } } @@ -143,6 +163,70 @@ int Squeeze::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) c } } + if (dims == 4) + { + if (_squeeze_w && _squeeze_h && _squeeze_d && _squeeze_c) + { + top_blob = bottom_blob.reshape(1, opt.blob_allocator); + } + else if (_squeeze_w && _squeeze_h && _squeeze_d) + { + top_blob = bottom_blob.reshape(channels, opt.blob_allocator); + } + else if (_squeeze_h && _squeeze_d && _squeeze_c) + { + top_blob = bottom_blob.reshape(w, opt.blob_allocator); + } + else if (_squeeze_w && _squeeze_d && _squeeze_c) + { + top_blob = bottom_blob.reshape(h, opt.blob_allocator); + } + else if (_squeeze_w && _squeeze_h && _squeeze_c) + { + top_blob = bottom_blob.reshape(d, opt.blob_allocator); + } + else if (_squeeze_w && _squeeze_h) + { + top_blob = bottom_blob.reshape(d, channels, opt.blob_allocator); + } + else if (_squeeze_w && _squeeze_d) + { + top_blob = bottom_blob.reshape(h, channels, opt.blob_allocator); + } + else if (_squeeze_h && _squeeze_d) + { + top_blob = bottom_blob.reshape(w, channels, opt.blob_allocator); + } + else if (_squeeze_h && _squeeze_c) + { + top_blob = bottom_blob.reshape(w, d, opt.blob_allocator); + } + else if (_squeeze_w && _squeeze_c) + { + top_blob = bottom_blob.reshape(h, d, opt.blob_allocator); + } + else if (_squeeze_d && _squeeze_c) + { + top_blob = bottom_blob.reshape(w, h, opt.blob_allocator); + } + else if (_squeeze_w) + { + top_blob = bottom_blob.reshape(h, d, channels, opt.blob_allocator); + } + else if (_squeeze_h) + { + top_blob = bottom_blob.reshape(w, d, channels, opt.blob_allocator); + } + else if (_squeeze_d) + { + top_blob = bottom_blob.reshape(w, h, channels, opt.blob_allocator); + } + else if (_squeeze_c) + { + top_blob = bottom_blob.reshape(w, h, d, opt.blob_allocator); + } + } + if (top_blob.empty()) return -100; diff --git a/src/layer/squeeze.h b/src/layer/squeeze.h index cea5a413cc29..536a3b9769ee 100644 --- a/src/layer/squeeze.h +++ b/src/layer/squeeze.h @@ -31,6 +31,7 @@ class Squeeze : public Layer public: int squeeze_w; int squeeze_h; + int squeeze_d; int squeeze_c; Mat axes; }; diff --git a/src/layer/unfold.cpp b/src/layer/unfold.cpp new file mode 100644 index 000000000000..f747a169cedd --- /dev/null +++ b/src/layer/unfold.cpp @@ -0,0 +1,146 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. 
All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "unfold.h" + +namespace ncnn { + +Unfold::Unfold() +{ + one_blob_only = true; +} + +int Unfold::load_param(const ParamDict& pd) +{ + kernel_w = pd.get(1, 0); + kernel_h = pd.get(11, kernel_w); + dilation_w = pd.get(2, 1); + dilation_h = pd.get(12, dilation_w); + stride_w = pd.get(3, 1); + stride_h = pd.get(13, stride_w); + pad_left = pd.get(4, 0); + pad_right = pd.get(15, pad_left); + pad_top = pd.get(14, pad_left); + pad_bottom = pd.get(16, pad_top); + pad_value = pd.get(18, 0.f); + + return 0; +} + +int Unfold::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + Mat bottom_blob_bordered; + { + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + opt_b.use_packing_layout = false; + make_padding(bottom_blob, bottom_blob_bordered, opt_b); + if (bottom_blob_bordered.empty()) + return -100; + } + + const int w = bottom_blob_bordered.w; + const int h = bottom_blob_bordered.h; + const int channels = bottom_blob_bordered.c; + const size_t elemsize = bottom_blob_bordered.elemsize; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + const int outw = (w - kernel_extent_w) / stride_w + 1; + const int outh = (h - kernel_extent_h) / stride_h + 1; + + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + top_blob.create(size, maxk * channels, elemsize, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // im2col + const int gap = w * stride_h - outw * stride_w; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < channels; p++) + { + const Mat img = bottom_blob_bordered.channel(p); + float* ptr = top_blob.row(p * maxk); + + for (int u = 0; u < kernel_h; u++) + { + for (int v = 0; v < kernel_w; v++) + { + const float* sptr = img.row(dilation_h * u) + dilation_w * v; + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + ptr[0] = sptr[0]; + + sptr += stride_w; + ptr += 1; + } + + sptr += gap; + } + } + } + } + + return 0; +} + +void Unfold::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + bottom_blob_bordered = bottom_blob; + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) + { + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt_b); + } + else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233) + { + // tensorflow padding=SAME or onnx padding=SAME_UPPER + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; + if (wpad > 0 || hpad > 0) 
+ { + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt_b); + } + } + else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234) + { + // onnx padding=SAME_LOWER + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h; + if (wpad > 0 || hpad > 0) + { + Option opt_b = opt; + opt_b.blob_allocator = opt.workspace_allocator; + copy_make_border(bottom_blob, bottom_blob_bordered, hpad - hpad / 2, hpad / 2, wpad - wpad / 2, wpad / 2, BORDER_CONSTANT, pad_value, opt_b); + } + } +} + +} // namespace ncnn diff --git a/src/layer/unfold.h b/src/layer/unfold.h new file mode 100644 index 000000000000..ff7860b7f72a --- /dev/null +++ b/src/layer/unfold.h @@ -0,0 +1,50 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_UNFOLD_H +#define LAYER_UNFOLD_H + +#include "layer.h" + +namespace ncnn { + +class Unfold : public Layer +{ +public: + Unfold(); + + virtual int load_param(const ParamDict& pd); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +protected: + void make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const; + +public: + int kernel_w; + int kernel_h; + int dilation_w; + int dilation_h; + int stride_w; + int stride_h; + int pad_left; // -233=SAME_UPPER -234=SAME_LOWER + int pad_right; + int pad_top; + int pad_bottom; + float pad_value; +}; + +} // namespace ncnn + +#endif // LAYER_UNFOLD_H diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp index 5a73695e7c13..22e817d34e65 100644 --- a/src/layer/vulkan/convolution_vulkan.cpp +++ b/src/layer/vulkan/convolution_vulkan.cpp @@ -794,7 +794,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) convert_packing(bias_data, bias_data_packed, out_elempack, opt); } - if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && num_input >= 16 && num_output >= 16) + if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && is_conv3x3s1d1 && num_input >= 16 && num_output >= 16) + { + // pass + } + else if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && num_input >= 16 && num_output >= 16) { bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; @@ -872,7 +876,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } pipeline_convolution_gemm->create(shader_type_index, opt, specializations); } - if (is_conv1x1s1d1) + else if (is_conv1x1s1d1) { bool use_cooperative_matrix = 
vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; @@ -1221,13 +1225,16 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; bool pre_winograd43 = opt.use_winograd43_convolution; - if (vkdev->info.type() == 0 && ((w <= 18 && h <= 18) || ((w >= 23 && w <= 24) && (h >= 23 && h <= 24)))) - pre_winograd43 = false; - if (vkdev->info.type() != 0 && (w <= 12 && h <= 12)) - pre_winograd43 = false; + if (opt.use_winograd23_convolution) + { + if (vkdev->info.type() == 0 && ((w <= 18 && h <= 18) || ((w >= 23 && w <= 24) && (h >= 23 && h <= 24)))) + pre_winograd43 = false; + if (vkdev->info.type() != 0 && (w <= 12 && h <= 12)) + pre_winograd43 = false; - if (use_cooperative_matrix && (w <= 18 && h <= 18)) - pre_winograd43 = false; + if (use_cooperative_matrix && (w <= 18 && h <= 18)) + pre_winograd43 = false; + } if (pre_winograd43) { @@ -1660,10 +1667,13 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && is_conv3x3s1d1 && channels * elempack >= 16 && num_output >= 16) { bool pre_winograd43 = opt.use_winograd43_convolution; - if (vkdev->info.type() == 0 && ((w <= 18 && h <= 18) || ((w >= 23 && w <= 24) && (h >= 23 && h <= 24)))) - pre_winograd43 = false; - if (vkdev->info.type() != 0 && (w <= 12 && h <= 12)) - pre_winograd43 = false; + if (opt.use_winograd23_convolution) + { + if (vkdev->info.type() == 0 && ((w <= 18 && h <= 18) || ((w >= 23 && w <= 24) && (h >= 23 && h <= 24)))) + pre_winograd43 = false; + if (vkdev->info.type() != 0 && (w <= 12 && h <= 12)) + pre_winograd43 = false; + } if (pre_winograd43) { diff --git a/src/layer/vulkan/elu_vulkan.cpp b/src/layer/vulkan/elu_vulkan.cpp new file mode 100644 index 000000000000..cf35b5b666d3 --- /dev/null +++ b/src/layer/vulkan/elu_vulkan.cpp @@ -0,0 +1,182 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "elu_vulkan.h" + +#include "layer_shader_type.h" + +namespace ncnn { + +ELU_vulkan::ELU_vulkan() +{ + support_vulkan = true; + support_image_storage = true; + + pipeline_elu = 0; + pipeline_elu_pack4 = 0; + pipeline_elu_pack8 = 0; +} + +int ELU_vulkan::create_pipeline(const Option& opt) +{ + const Mat& shape = top_shapes.empty() ? Mat() : top_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 
4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3 || shape.dims == 4) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + } + else + { + elemsize = elempack * 4u; + } + + Mat shape_packed; + if (shape.dims == 1) shape_packed = Mat(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Mat(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Mat(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 4) shape_packed = Mat(shape.w, shape.h, shape.d, shape.c / elempack, (void*)0, elemsize, elempack); + + std::vector specializations(1 + 5); + specializations[0].f = alpha; + specializations[1 + 0].i = shape_packed.dims; + specializations[1 + 1].i = shape_packed.w; + specializations[1 + 2].i = shape_packed.h * shape_packed.d; + specializations[1 + 3].i = shape_packed.c; + specializations[1 + 4].i = shape_packed.cstep; + + Mat local_size_xyz; + if (shape_packed.dims == 1) + { + local_size_xyz.w = std::min(64, shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, shape_packed.w); + local_size_xyz.h = std::min(8, shape_packed.h); + local_size_xyz.c = 1; + } + if (shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, shape_packed.w); + local_size_xyz.h = std::min(4, shape_packed.h); + local_size_xyz.c = std::min(4, shape_packed.c); + } + if (shape_packed.dims == 4) + { + local_size_xyz.w = std::min(4, shape_packed.w); + local_size_xyz.h = std::min(4, shape_packed.h * shape_packed.d); + local_size_xyz.c = std::min(4, shape_packed.c); + } + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_elu = new Pipeline(vkdev); + pipeline_elu->set_optimal_local_size_xyz(local_size_xyz); + pipeline_elu->create(LayerShaderType::elu, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_elu_pack4 = new Pipeline(vkdev); + pipeline_elu_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_elu_pack4->create(LayerShaderType::elu_pack4, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) + { + pipeline_elu_pack8 = new Pipeline(vkdev); + pipeline_elu_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_elu_pack8->create(LayerShaderType::elu_pack8, opt, specializations); + } + + return 0; +} + +int ELU_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_elu; + pipeline_elu = 0; + + delete pipeline_elu_pack4; + pipeline_elu_pack4 = 0; + + delete pipeline_elu_pack8; + pipeline_elu_pack8 = 0; + + return 0; +} + +int ELU_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& /*opt*/) const +{ + int elempack = bottom_top_blob.elempack; + + std::vector bindings(1); + bindings[0] = bottom_top_blob; + + std::vector constants(5); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h * bottom_top_blob.d; + constants[3].i = bottom_top_blob.c; + constants[4].i = bottom_top_blob.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_elu_pack8 + : elempack == 4 ? 
pipeline_elu_pack4 + : pipeline_elu; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); + + return 0; +} + +int ELU_vulkan::forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& /*opt*/) const +{ + int elempack = bottom_top_blob.elempack; + + std::vector bindings(2); + bindings[0] = bottom_top_blob; + bindings[1] = bottom_top_blob; + + std::vector constants(5); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h * bottom_top_blob.d; + constants[3].i = bottom_top_blob.c; + constants[4].i = 0; //bottom_top_blob.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_elu_pack8 + : elempack == 4 ? pipeline_elu_pack4 + : pipeline_elu; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/vulkan/elu_vulkan.h b/src/layer/vulkan/elu_vulkan.h new file mode 100644 index 000000000000..62da80a00c55 --- /dev/null +++ b/src/layer/vulkan/elu_vulkan.h @@ -0,0 +1,42 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_ELU_VULKAN_H +#define LAYER_ELU_VULKAN_H + +#include "elu.h" + +namespace ncnn { + +class ELU_vulkan : virtual public ELU +{ +public: + ELU_vulkan(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + using ELU::forward_inplace; + virtual int forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + virtual int forward_inplace(VkImageMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_elu; + Pipeline* pipeline_elu_pack4; + Pipeline* pipeline_elu_pack8; +}; + +} // namespace ncnn + +#endif // LAYER_ELU_VULKAN_H diff --git a/src/layer/vulkan/shader/elu.comp b/src/layer/vulkan/shader/elu.comp new file mode 100644 index 000000000000..319606a012dc --- /dev/null +++ b/src/layer/vulkan/shader/elu.comp @@ -0,0 +1,73 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
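As a reading aid for the ELU_vulkan::create_pipeline hunk above, here is the packing decision it makes, restated as a scalar C++ sketch; illustrative only, not part of the patch, and the helper names are mine:

#include <cstddef>

// elempack: how many elements are interleaved per packed lane group,
// chosen from the extent along the packing axis (w, h or c depending on dims).
static int pick_elempack(int extent, bool use_shader_pack8)
{
    if (use_shader_pack8 && extent % 8 == 0) return 8;
    if (extent % 4 == 0) return 4;
    return 1;
}

// elemsize: bytes per packed element, depending on the fp16 options.
static size_t pick_elemsize(int elempack, bool use_fp16_storage, bool use_fp16_packed)
{
    if (use_fp16_storage) return elempack * 2u;                     // fp16 everywhere
    if (use_fp16_packed) return elempack == 1 ? 4u : elempack * 2u; // fp16 only when packed
    return elempack * 4u;                                           // fp32
}

The packed shape built from these values is what feeds the specialization constants and the local workgroup size selection in the hunk above.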
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float alpha = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afp v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afp v = buffer_ld1(bottom_top_blob_data, gi); +#endif + + v = v > afp(0.0f) ? v : afp(alpha * (exp(v) - 1.0f)); + +#if NCNN_image_shader + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); +#else + buffer_st1(bottom_top_blob_data, gi, v); +#endif +} diff --git a/src/layer/vulkan/shader/elu_pack4.comp b/src/layer/vulkan/shader/elu_pack4.comp new file mode 100644 index 000000000000..6d02f11bd4a9 --- /dev/null +++ b/src/layer/vulkan/shader/elu_pack4.comp @@ -0,0 +1,73 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
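The elu.comp shader above applies the ELU activation element-wise; a scalar C++ reference of the same formula, illustrative only and not part of the patch:

#include <cmath>

// f(x) = x for x > 0, and alpha * (exp(x) - 1) otherwise, which is exactly the
// ternary the shader evaluates per element, with alpha supplied as
// specialization constant 0.
static inline float elu_ref(float x, float alpha)
{
    return x > 0.f ? x : alpha * (std::exp(x) - 1.f);
}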
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float alpha = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec4 v = buffer_ld4(bottom_top_blob_data, gi); +#endif + + v = mix(afpvec4(alpha) * afpvec4(exp(v) - afpvec4(1.0f)), v, greaterThan(v, afpvec4(0.0f))); + +#if NCNN_image_shader + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); +#else + buffer_st4(bottom_top_blob_data, gi, v); +#endif +} diff --git a/src/layer/vulkan/shader/elu_pack8.comp b/src/layer/vulkan/shader/elu_pack8.comp new file mode 100644 index 000000000000..0b8831d61a48 --- /dev/null +++ b/src/layer/vulkan/shader/elu_pack8.comp @@ -0,0 +1,75 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
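Both the pack1 and pack4 shaders address the storage buffer through the same flattened index; a scalar C++ restatement of that addressing, illustrative only and not part of the patch (the function name is mine):

// gi = gz * cstep + gy * w + gx, where the h constant already carries the
// collapsed h * d extent set up by the host code, and invocations with
// gx >= w, gy >= h or gz >= c return before touching the buffer.
static inline int flat_index(int gx, int gy, int gz, int w, int cstep)
{
    return gz * cstep + gy * w + gx;
}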
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float alpha = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec8 v = buffer_ld8(bottom_top_blob_data, gi); +#endif + + v[0] = mix(afpvec4(alpha) * afpvec4(exp(v[0]) - afpvec4(1.0f)), v[0], greaterThan(v[0], afpvec4(0.0f))); + v[1] = mix(afpvec4(alpha) * afpvec4(exp(v[1]) - afpvec4(1.0f)), v[1], greaterThan(v[1], afpvec4(0.0f))); + +#if NCNN_image_shader + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); +#else + buffer_st8(bottom_top_blob_data, gi, v); +#endif +} diff --git a/src/layer/x86/avx512_mathfun.h b/src/layer/x86/avx512_mathfun.h index 2892e3d2bf75..0513d5e1be1b 100644 --- a/src/layer/x86/avx512_mathfun.h +++ b/src/layer/x86/avx512_mathfun.h @@ -182,6 +182,48 @@ static NCNN_FORCEINLINE __m512 exp512_ps(__m512 x) return y; } +_PS512_CONST(tanh_hi, 9.0f); +_PS512_CONST(tanh_lo, -9.0f); + +_PS512_CONST(cephes_tanh_p0, -2.76076847742355E-16f); +_PS512_CONST(cephes_tanh_p1, 2.00018790482477E-13f); +_PS512_CONST(cephes_tanh_p2, -8.60467152213735E-11f); +_PS512_CONST(cephes_tanh_p3, 5.12229709037114E-08f); +_PS512_CONST(cephes_tanh_p4, 1.48572235717979E-05f); +_PS512_CONST(cephes_tanh_p5, 6.37261928875436E-04f); +_PS512_CONST(cephes_tanh_p6, 4.89352455891786E-03f); + +_PS512_CONST(cephes_tanh_p7, 1.19825839466702e-06f); +_PS512_CONST(cephes_tanh_p8, 1.18534705686654e-04f); +_PS512_CONST(cephes_tanh_p9, 2.26843463243900e-03f); + +// an approximation of tanh +static inline __m512 tanh512_ps(const __m512 x) +{ + __m512 value = x; + value = _mm512_max_ps(*(__m512*)_ps512_tanh_lo, value); + value = _mm512_min_ps(*(__m512*)_ps512_tanh_hi, value); + + __m512 value_squared = _mm512_mul_ps(value, value); + + __m512 p; + p = _mm512_fmadd_ps(value_squared, *(__m512*)_ps512_cephes_tanh_p0, *(__m512*)_ps512_cephes_tanh_p1); + p = _mm512_fmadd_ps(p, value_squared, *(__m512*)_ps512_cephes_tanh_p2); + p = _mm512_fmadd_ps(p, value_squared, *(__m512*)_ps512_cephes_tanh_p3); + p = _mm512_fmadd_ps(p, value_squared, *(__m512*)_ps512_cephes_tanh_p4); + p = _mm512_fmadd_ps(p, value_squared, *(__m512*)_ps512_cephes_tanh_p5); + p = _mm512_fmadd_ps(p, value_squared, *(__m512*)_ps512_cephes_tanh_p6); + p = _mm512_mul_ps(p, value); + + __m512 q; + q = _mm512_fmadd_ps(value_squared, 
*(__m512*)_ps512_cephes_tanh_p7, *(__m512*)_ps512_cephes_tanh_p8); + q = _mm512_fmadd_ps(q, value_squared, *(__m512*)_ps512_cephes_tanh_p9); + q = _mm512_fmadd_ps(q, value_squared, *(__m512*)_ps512_cephes_tanh_p6); + + __m512 dst = _mm512_div_ps(p, q); + return dst; +} + _PS512_CONST(minus_cephes_DP1, -0.78515625f); _PS512_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f); _PS512_CONST(minus_cephes_DP3, -3.77489497744594108e-8f); diff --git a/src/layer/x86/avx_mathfun.h b/src/layer/x86/avx_mathfun.h index db28691344a8..645c399e4eba 100644 --- a/src/layer/x86/avx_mathfun.h +++ b/src/layer/x86/avx_mathfun.h @@ -295,6 +295,48 @@ static NCNN_FORCEINLINE __m256 exp256_ps(__m256 x) return y; } +_PS256_CONST(tanh_hi, 9.0f); +_PS256_CONST(tanh_lo, -9.0f); + +_PS256_CONST(cephes_tanh_p0, -2.76076847742355E-16f); +_PS256_CONST(cephes_tanh_p1, 2.00018790482477E-13f); +_PS256_CONST(cephes_tanh_p2, -8.60467152213735E-11f); +_PS256_CONST(cephes_tanh_p3, 5.12229709037114E-08f); +_PS256_CONST(cephes_tanh_p4, 1.48572235717979E-05f); +_PS256_CONST(cephes_tanh_p5, 6.37261928875436E-04f); +_PS256_CONST(cephes_tanh_p6, 4.89352455891786E-03f); + +_PS256_CONST(cephes_tanh_p7, 1.19825839466702e-06f); +_PS256_CONST(cephes_tanh_p8, 1.18534705686654e-04f); +_PS256_CONST(cephes_tanh_p9, 2.26843463243900e-03f); + +// an approximation of tanh +static inline __m256 tanh256_ps(const __m256 x) +{ + __m256 value = x; + value = _mm256_max_ps(*(__m256*)_ps256_tanh_lo, value); + value = _mm256_min_ps(*(__m256*)_ps256_tanh_hi, value); + + __m256 value_squared = _mm256_mul_ps(value, value); + + __m256 p; + p = _mm256_comp_fmadd_ps(value_squared, *(__m256*)_ps256_cephes_tanh_p0, *(__m256*)_ps256_cephes_tanh_p1); + p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256*)_ps256_cephes_tanh_p2); + p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256*)_ps256_cephes_tanh_p3); + p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256*)_ps256_cephes_tanh_p4); + p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256*)_ps256_cephes_tanh_p5); + p = _mm256_comp_fmadd_ps(p, value_squared, *(__m256*)_ps256_cephes_tanh_p6); + p = _mm256_mul_ps(p, value); + + __m256 q; + q = _mm256_comp_fmadd_ps(value_squared, *(__m256*)_ps256_cephes_tanh_p7, *(__m256*)_ps256_cephes_tanh_p8); + q = _mm256_comp_fmadd_ps(q, value_squared, *(__m256*)_ps256_cephes_tanh_p9); + q = _mm256_comp_fmadd_ps(q, value_squared, *(__m256*)_ps256_cephes_tanh_p6); + + __m256 dst = _mm256_div_ps(p, q); + return dst; +} + _PS256_CONST(minus_cephes_DP1, -0.78515625f); _PS256_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f); _PS256_CONST(minus_cephes_DP3, -3.77489497744594108e-8f); diff --git a/src/layer/x86/convolution_3x3_pack16to1.h b/src/layer/x86/convolution_3x3_pack16to1.h index e01383c8204f..0faefbbc4192 100644 --- a/src/layer/x86/convolution_3x3_pack16to1.h +++ b/src/layer/x86/convolution_3x3_pack16to1.h @@ -290,7 +290,7 @@ static void conv3x3s1_winograd63_pack16to1_avx512(const Mat& bottom_blob, Mat& t __m512 _re = _mm512_loadu_ps(r0 + 16 * 14); __m512 _rf = _mm512_loadu_ps(r0 + 16 * 15); - transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); @@ -333,41 +333,7 @@ static void conv3x3s1_winograd63_pack16to1_avx512(const Mat& bottom_blob, Mat& t __m512 _r6 = _mm512_load_ps(r0 + 16 * 6); __m512 _r7 = _mm512_load_ps(r0 + 16 * 7); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); 
- __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); - __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5); - __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); - __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); - - __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm512_store_ps(tmpptr, _r0); _mm512_store_ps(tmpptr + 16, _r1); diff --git a/src/layer/x86/convolution_3x3_pack8to1_int8.h b/src/layer/x86/convolution_3x3_pack8to1_int8.h index 4f687ac256d4..d5957faf6d89 100644 --- a/src/layer/x86/convolution_3x3_pack8to1_int8.h +++ b/src/layer/x86/convolution_3x3_pack8to1_int8.h @@ -125,11 +125,6 @@ static void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse(const Mat& k int p = 0; for (; p + 3 < outch; p += 4) { - const Mat k0 = kernel_tm.channel(p); - const Mat k1 = kernel_tm.channel(p + 1); - const Mat k2 = kernel_tm.channel(p + 2); - const Mat k3 = kernel_tm.channel(p + 3); - Mat g0 = kernel_tm_pack8to1.channel(p / 4); for (int k = 0; k < 36; k++) @@ -138,41 +133,15 @@ static void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse(const Mat& k for (int q = 0; q + 7 < inch; q += 8) { -#if __AVXVNNI__ || __AVX512VNNI__ || __XOP__ for (int i = 0; i < 4; i++) { - const short* k00 = k0.row(q + i * 2); - const short* k10 = k1.row(q + i * 2); - const short* k20 = k2.row(q + i * 2); - const short* k30 = k3.row(q + i * 2); - - const short* k01 = k0.row(q + i * 2 + 1); - const short* k11 = k1.row(q + i * 2 + 1); - const short* k21 = k2.row(q + i * 2 + 1); - const short* k31 = k3.row(q + i * 2 + 1); - - g00[0] = k00[k]; - g00[1] = k01[k]; - g00[2] = k10[k]; - g00[3] = k11[k]; - g00[4] = k20[k]; - g00[5] = k21[k]; - g00[6] = k30[k]; - g00[7] = k31[k]; - - g00 += 8; - } -#else 
- for (int i = 0; i < 8; i++) - { - g00[0] = k0.row(q + i)[k]; - g00[1] = k1.row(q + i)[k]; - g00[2] = k2.row(q + i)[k]; - g00[3] = k3.row(q + i)[k]; - - g00 += 4; + for (int j = 0; j < 8; j++) + { + const short* k00 = kernel_tm.channel(p + i).row(q + j); + g00[0] = k00[k]; + g00 += 1; + } } -#endif } } } @@ -508,114 +477,97 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& int nn = inch; // inch always > 0 - __m256i _sum0_1 = _mm256_setzero_si256(); - __m256i _sum2_3 = _mm256_setzero_si256(); - __m256i _sum4_5 = _mm256_setzero_si256(); - __m256i _sum6_7 = _mm256_setzero_si256(); + __m256i _sum00_11 = _mm256_setzero_si256(); + __m256i _sum10_01 = _mm256_setzero_si256(); + __m256i _sum02_13 = _mm256_setzero_si256(); + __m256i _sum12_03 = _mm256_setzero_si256(); + + __m256i _sum04_15 = _mm256_setzero_si256(); + __m256i _sum14_05 = _mm256_setzero_si256(); + __m256i _sum06_17 = _mm256_setzero_si256(); + __m256i _sum16_07 = _mm256_setzero_si256(); for (int j = 0; j < nn; j++) { // 0 1 2 3 4 5 6 7 8 9 a b c d e f - __m256i _val0 = _mm256_loadu_si256((const __m256i*)r0); + __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); + __m256i _val10 = _mm256_permute4x64_epi64(_val01, 78); + #if __AVXVNNI__ || __AVX512VNNI__ - __m256i _val0_0123 = _mm256_permutevar8x32_epi32(_val0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); - __m256i _val0_4567 = _mm256_permutevar8x32_epi32(_val0, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); - __m256i _val0_89ab = _mm256_permutevar8x32_epi32(_val0, _mm256_set_epi32(5, 5, 5, 5, 4, 4, 4, 4)); - __m256i _val0_cdef = _mm256_permutevar8x32_epi32(_val0, _mm256_set_epi32(7, 7, 7, 7, 6, 6, 6, 6)); - - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w01, _val0_0123); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _w01, _val0_89ab); - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w23, _val0_4567); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _w23, _val0_cdef); + _sum00_11 = _mm256_dpwssd_epi32(_sum00_11, _val01, _w01); + _sum10_01 = _mm256_dpwssd_epi32(_sum10_01, _val10, _w01); + _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01, _w23); + _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10, _w23); #else - // 0 0 1 1 2 2 3 3 8 8 9 9 a a b b - // 4 4 5 5 6 6 7 7 c c d d e e f f - __m256i _val0_0123_89ab = _mm256_unpacklo_epi16(_val0, _val0); - __m256i _val0_4567_cdef = _mm256_unpackhi_epi16(_val0, _val0); - - __m256i _val0_0123 = _mm256_permutevar8x32_epi32(_val0_0123_89ab, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val0_4567 = _mm256_permutevar8x32_epi32(_val0_4567_cdef, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val0_89ab = _mm256_permutevar8x32_epi32(_val0_0123_89ab, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - __m256i _val0_cdef = _mm256_permutevar8x32_epi32(_val0_4567_cdef, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - - __m256i _sl00_01 = _mm256_mullo_epi16(_w01, _val0_0123); - __m256i _sh00_01 = _mm256_mulhi_epi16(_w01, _val0_0123); - __m256i _sl10_11 = _mm256_mullo_epi16(_w01, _val0_89ab); - __m256i _sh10_11 = _mm256_mulhi_epi16(_w01, _val0_89ab); - __m256i _sl02_03 = _mm256_mullo_epi16(_w23, _val0_4567); - __m256i _sh02_03 = _mm256_mulhi_epi16(_w23, _val0_4567); - __m256i _sl12_13 = _mm256_mullo_epi16(_w23, _val0_cdef); - __m256i _sh12_13 = _mm256_mulhi_epi16(_w23, _val0_cdef); - - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl00_01, _sh00_01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl10_11, 
_sh10_11)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl02_03, _sh02_03)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl12_13, _sh12_13)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl00_01, _sh00_01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl10_11, _sh10_11)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl02_03, _sh02_03)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl12_13, _sh12_13)); + _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01, _w01)); + _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10, _w01)); + _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01, _w23)); + _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10, _w23)); #endif - __m256i _val1 = _mm256_loadu_si256((const __m256i*)(r0 + 16)); + __m256i _val23 = _mm256_loadu_si256((const __m256i*)(r0 + 16)); + + __m256i _val32 = _mm256_permute4x64_epi64(_val23, 78); #if __AVXVNNI__ || __AVX512VNNI__ - __m256i _val1_0123 = _mm256_permutevar8x32_epi32(_val1, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); - __m256i _val1_4567 = _mm256_permutevar8x32_epi32(_val1, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); - __m256i _val1_89ab = _mm256_permutevar8x32_epi32(_val1, _mm256_set_epi32(5, 5, 5, 5, 4, 4, 4, 4)); - __m256i _val1_cdef = _mm256_permutevar8x32_epi32(_val1, _mm256_set_epi32(7, 7, 7, 7, 6, 6, 6, 6)); - - _sum4_5 = _mm256_dpwssd_epi32(_sum4_5, _w01, _val1_0123); - _sum6_7 = _mm256_dpwssd_epi32(_sum6_7, _w01, _val1_89ab); - _sum4_5 = _mm256_dpwssd_epi32(_sum4_5, _w23, _val1_4567); - _sum6_7 = _mm256_dpwssd_epi32(_sum6_7, _w23, _val1_cdef); + _sum04_15 = _mm256_dpwssd_epi32(_sum04_15, _val23, _w01); + _sum14_05 = _mm256_dpwssd_epi32(_sum14_05, _val32, _w01); + _sum06_17 = _mm256_dpwssd_epi32(_sum06_17, _val23, _w23); + _sum16_07 = _mm256_dpwssd_epi32(_sum16_07, _val32, _w23); #else - __m256i _val1_0123_89ab = _mm256_unpacklo_epi16(_val1, _val1); - __m256i _val1_4567_cdef = _mm256_unpackhi_epi16(_val1, _val1); - - __m256i _val1_0123 = _mm256_permutevar8x32_epi32(_val1_0123_89ab, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val1_4567 = _mm256_permutevar8x32_epi32(_val1_4567_cdef, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val1_89ab = _mm256_permutevar8x32_epi32(_val1_0123_89ab, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - __m256i _val1_cdef = _mm256_permutevar8x32_epi32(_val1_4567_cdef, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - - __m256i _sl04_05 = _mm256_mullo_epi16(_w01, _val1_0123); - __m256i _sh04_05 = _mm256_mulhi_epi16(_w01, _val1_0123); - __m256i _sl14_15 = _mm256_mullo_epi16(_w01, _val1_89ab); - __m256i _sh14_15 = _mm256_mulhi_epi16(_w01, _val1_89ab); - __m256i _sl06_07 = _mm256_mullo_epi16(_w23, _val1_4567); - __m256i _sh06_07 = _mm256_mulhi_epi16(_w23, _val1_4567); - __m256i _sl16_17 = _mm256_mullo_epi16(_w23, _val1_cdef); - __m256i _sh16_17 = _mm256_mulhi_epi16(_w23, _val1_cdef); - - _sum4_5 = _mm256_add_epi32(_sum4_5, _mm256_unpacklo_epi16(_sl04_05, _sh04_05)); - _sum6_7 = _mm256_add_epi32(_sum6_7, _mm256_unpacklo_epi16(_sl14_15, _sh14_15)); - _sum4_5 = _mm256_add_epi32(_sum4_5, _mm256_unpacklo_epi16(_sl06_07, _sh06_07)); - _sum6_7 = _mm256_add_epi32(_sum6_7, _mm256_unpacklo_epi16(_sl16_17, _sh16_17)); - _sum4_5 = _mm256_add_epi32(_sum4_5, _mm256_unpackhi_epi16(_sl04_05, _sh04_05)); - _sum6_7 = _mm256_add_epi32(_sum6_7, _mm256_unpackhi_epi16(_sl14_15, _sh14_15)); - _sum4_5 = _mm256_add_epi32(_sum4_5, _mm256_unpackhi_epi16(_sl06_07, 
_sh06_07)); - _sum6_7 = _mm256_add_epi32(_sum6_7, _mm256_unpackhi_epi16(_sl16_17, _sh16_17)); + _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_madd_epi16(_val23, _w01)); + _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_madd_epi16(_val32, _w01)); + _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_madd_epi16(_val23, _w23)); + _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_madd_epi16(_val32, _w23)); #endif r0 += 32; k0 += 32; } - __m256i _sum0_2 = _mm256_permute2x128_si256(_sum0_1, _sum2_3, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i _sum1_3 = _mm256_permute2x128_si256(_sum0_1, _sum2_3, _MM_SHUFFLE(0, 3, 0, 1)); - _sum0_2 = _mm256_add_epi32(_sum0_2, _sum1_3); + // transpose 4x8 + { + __m256i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm256_unpacklo_epi32(_sum00_11, _sum10_01); + _tmp1 = _mm256_unpacklo_epi32(_sum02_13, _sum12_03); + _tmp2 = _mm256_unpackhi_epi32(_sum00_11, _sum10_01); + _tmp3 = _mm256_unpackhi_epi32(_sum02_13, _sum12_03); + _sum00_11 = _mm256_unpacklo_epi64(_tmp0, _tmp1); + _sum10_01 = _mm256_unpackhi_epi64(_tmp0, _tmp1); + _sum02_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); + _sum12_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); + } + { + __m256i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm256_unpacklo_epi32(_sum04_15, _sum14_05); + _tmp1 = _mm256_unpacklo_epi32(_sum06_17, _sum16_07); + _tmp2 = _mm256_unpackhi_epi32(_sum04_15, _sum14_05); + _tmp3 = _mm256_unpackhi_epi32(_sum06_17, _sum16_07); + _sum04_15 = _mm256_unpacklo_epi64(_tmp0, _tmp1); + _sum14_05 = _mm256_unpackhi_epi64(_tmp0, _tmp1); + _sum06_17 = _mm256_unpacklo_epi64(_tmp2, _tmp3); + _sum16_07 = _mm256_unpackhi_epi64(_tmp2, _tmp3); + } + + _sum00_11 = _mm256_add_epi32(_sum00_11, _sum10_01); + _sum02_13 = _mm256_add_epi32(_sum02_13, _sum12_03); + _sum00_11 = _mm256_add_epi32(_sum00_11, _sum02_13); - __m256i _sum4_6 = _mm256_permute2x128_si256(_sum4_5, _sum6_7, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i _sum5_7 = _mm256_permute2x128_si256(_sum4_5, _sum6_7, _MM_SHUFFLE(0, 3, 0, 1)); - _sum4_6 = _mm256_add_epi32(_sum4_6, _sum5_7); + _sum04_15 = _mm256_add_epi32(_sum04_15, _sum14_05); + _sum06_17 = _mm256_add_epi32(_sum06_17, _sum16_07); + _sum04_15 = _mm256_add_epi32(_sum04_15, _sum06_17); + + __m256i _perm_mask = _mm256_set_epi32(6, 3, 4, 1, 7, 2, 5, 0); + _sum00_11 = _mm256_permutevar8x32_epi32(_sum00_11, _perm_mask); + _sum04_15 = _mm256_permutevar8x32_epi32(_sum04_15, _perm_mask); int sum[16]; - _mm256_storeu_si256((__m256i*)sum, _sum0_2); - _mm256_storeu_si256((__m256i*)(sum + 8), _sum4_6); + _mm256_storeu_si256((__m256i*)sum, _sum00_11); + _mm256_storeu_si256((__m256i*)(sum + 8), _sum04_15); output0_tm[0] = sum[0]; output1_tm[0] = sum[1]; @@ -651,60 +603,42 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& int nn = inch; // inch always > 0 #if __AVX2__ - __m256i _sum0_1 = _mm256_setzero_si256(); - __m256i _sum2_3 = _mm256_setzero_si256(); + __m256i _sum00_11 = _mm256_setzero_si256(); + __m256i _sum10_01 = _mm256_setzero_si256(); + __m256i _sum02_13 = _mm256_setzero_si256(); + __m256i _sum12_03 = _mm256_setzero_si256(); #else - __m128i _sum0 = _mm_setzero_si128(); - __m128i _sum1 = _mm_setzero_si128(); - __m128i _sum2 = _mm_setzero_si128(); - __m128i _sum3 = _mm_setzero_si128(); + __m128i _sum00 = _mm_setzero_si128(); + __m128i _sum01 = _mm_setzero_si128(); + __m128i _sum02 = _mm_setzero_si128(); + __m128i _sum03 = _mm_setzero_si128(); + __m128i _sum10 = _mm_setzero_si128(); + __m128i _sum11 = _mm_setzero_si128(); + __m128i _sum12 = _mm_setzero_si128(); + __m128i _sum13 = _mm_setzero_si128(); #endif for (int j = 
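Two AVX2 building blocks recur in the rewritten accumulation loops above: _mm256_permute4x64_epi64(v, 78), where 78 == _MM_SHUFFLE(1, 0, 3, 2), swaps the low and high 128-bit halves of v (this is how _val10/_val32 are derived from _val01/_val23), and _mm256_madd_epi16(a, b) multiplies adjacent int16 pairs and adds each pair into one int32, replacing the old mullo/mulhi + unpack sequences; on AVX-VNNI, _mm256_dpwssd_epi32 fuses the same pairwise product with the accumulate. A small self-contained check of both facts (assumes an AVX2 build; illustrative only):

#include <immintrin.h>
#include <stdio.h>

int main()
{
    short a[16], b[16];
    for (int i = 0; i < 16; i++) { a[i] = (short)(i + 1); b[i] = (short)(2 * i - 5); }

    __m256i va = _mm256_loadu_si256((const __m256i*)a);
    __m256i vb = _mm256_loadu_si256((const __m256i*)b);

    // madd: each int32 lane holds a[2i]*b[2i] + a[2i+1]*b[2i+1]
    int madd[8];
    _mm256_storeu_si256((__m256i*)madd, _mm256_madd_epi16(va, vb));
    for (int i = 0; i < 8; i++)
    {
        int ref = a[2 * i] * b[2 * i] + a[2 * i + 1] * b[2 * i + 1];
        printf("pair %d: simd=%d scalar=%d\n", i, madd[i], ref);
    }

    // permute4x64 with imm 78 swaps the two 128-bit halves
    short swapped[16];
    _mm256_storeu_si256((__m256i*)swapped, _mm256_permute4x64_epi64(va, 78));
    printf("swapped[0]=%d (== a[8]), swapped[8]=%d (== a[0])\n", swapped[0], swapped[8]);
    return 0;
}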
0; j < nn; j++) { #if __AVX2__ // 0 1 2 3 4 5 6 7 8 9 a b c d e f - __m256i _val = _mm256_loadu_si256((const __m256i*)r0); + __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); + __m256i _val10 = _mm256_permute4x64_epi64(_val01, 78); + #if __AVXVNNI__ || __AVX512VNNI__ - __m256i _val_0123 = _mm256_permutevar8x32_epi32(_val, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); - __m256i _val_4567 = _mm256_permutevar8x32_epi32(_val, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); - __m256i _val_89ab = _mm256_permutevar8x32_epi32(_val, _mm256_set_epi32(5, 5, 5, 5, 4, 4, 4, 4)); - __m256i _val_cdef = _mm256_permutevar8x32_epi32(_val, _mm256_set_epi32(7, 7, 7, 7, 6, 6, 6, 6)); - - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w01, _val_0123); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _w01, _val_89ab); - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w23, _val_4567); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _w23, _val_cdef); + _sum00_11 = _mm256_dpwssd_epi32(_sum00_11, _val01, _w01); + _sum10_01 = _mm256_dpwssd_epi32(_sum10_01, _val10, _w01); + _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01, _w23); + _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10, _w23); #else - __m256i _val_0123_89ab = _mm256_unpacklo_epi16(_val, _val); - __m256i _val_4567_cdef = _mm256_unpackhi_epi16(_val, _val); - - __m256i _val_0123 = _mm256_permutevar8x32_epi32(_val_0123_89ab, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val_4567 = _mm256_permutevar8x32_epi32(_val_4567_cdef, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val_89ab = _mm256_permutevar8x32_epi32(_val_0123_89ab, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - __m256i _val_cdef = _mm256_permutevar8x32_epi32(_val_4567_cdef, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - - __m256i _sl00_01 = _mm256_mullo_epi16(_w01, _val_0123); - __m256i _sh00_01 = _mm256_mulhi_epi16(_w01, _val_0123); - __m256i _sl10_11 = _mm256_mullo_epi16(_w01, _val_89ab); - __m256i _sh10_11 = _mm256_mulhi_epi16(_w01, _val_89ab); - __m256i _sl02_03 = _mm256_mullo_epi16(_w23, _val_4567); - __m256i _sh02_03 = _mm256_mulhi_epi16(_w23, _val_4567); - __m256i _sl12_13 = _mm256_mullo_epi16(_w23, _val_cdef); - __m256i _sh12_13 = _mm256_mulhi_epi16(_w23, _val_cdef); - - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl00_01, _sh00_01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl10_11, _sh10_11)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl02_03, _sh02_03)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl12_13, _sh12_13)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl00_01, _sh00_01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl10_11, _sh10_11)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl02_03, _sh02_03)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl12_13, _sh12_13)); + _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01, _w01)); + _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10, _w01)); + _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01, _w23)); + _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10, _w23)); #endif #else // 0 1 2 3 4 5 6 7 @@ -717,75 +651,23 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& __m128i _w3 = _mm_loadu_si128((const __m128i*)(k0 + 24)); #if __XOP__ - __m128i _val0_01 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i 
_val0_23 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i _val0_45 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i _val0_67 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(3, 3, 3, 3)); - __m128i _val1_01 = _mm_shuffle_epi32(_val1, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i _val1_23 = _mm_shuffle_epi32(_val1, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i _val1_45 = _mm_shuffle_epi32(_val1, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i _val1_67 = _mm_shuffle_epi32(_val1, _MM_SHUFFLE(3, 3, 3, 3)); - - _sum0 = _mm_maddd_epi16(_val0_01, _w0, _sum0); - _sum1 = _mm_maddd_epi16(_val0_23, _w1, _sum1); - _sum2 = _mm_maddd_epi16(_val1_01, _w0, _sum2); - _sum3 = _mm_maddd_epi16(_val1_23, _w1, _sum3); - _sum0 = _mm_maddd_epi16(_val0_45, _w2, _sum0); - _sum1 = _mm_maddd_epi16(_val0_67, _w3, _sum1); - _sum2 = _mm_maddd_epi16(_val1_45, _w2, _sum2); - _sum3 = _mm_maddd_epi16(_val1_67, _w3, _sum3); + _sum00 = _mm_maddd_epi16(_val0, _w0, _sum00); + _sum01 = _mm_maddd_epi16(_val0, _w1, _sum01); + _sum02 = _mm_maddd_epi16(_val0, _w2, _sum02); + _sum03 = _mm_maddd_epi16(_val0, _w3, _sum03); + _sum10 = _mm_maddd_epi16(_val1, _w0, _sum10); + _sum11 = _mm_maddd_epi16(_val1, _w1, _sum11); + _sum12 = _mm_maddd_epi16(_val1, _w2, _sum12); + _sum13 = _mm_maddd_epi16(_val1, _w3, _sum13); #else - // 0 0 1 1 2 2 3 3 - // 4 4 5 5 6 6 7 7 - __m128i _val0_0123 = _mm_unpacklo_epi16(_val0, _val0); - __m128i _val0_4567 = _mm_unpackhi_epi16(_val0, _val0); - - __m128i _val1_0123 = _mm_unpacklo_epi16(_val1, _val1); - __m128i _val1_4567 = _mm_unpackhi_epi16(_val1, _val1); - - __m128i _val0_01 = _mm_unpacklo_epi32(_val0_0123, _val0_0123); - __m128i _val0_23 = _mm_unpackhi_epi32(_val0_0123, _val0_0123); - __m128i _val0_45 = _mm_unpacklo_epi32(_val0_4567, _val0_4567); - __m128i _val0_67 = _mm_unpackhi_epi32(_val0_4567, _val0_4567); - - __m128i _val1_01 = _mm_unpacklo_epi32(_val1_0123, _val1_0123); - __m128i _val1_23 = _mm_unpackhi_epi32(_val1_0123, _val1_0123); - __m128i _val1_45 = _mm_unpacklo_epi32(_val1_4567, _val1_4567); - __m128i _val1_67 = _mm_unpackhi_epi32(_val1_4567, _val1_4567); - - __m128i _sl00 = _mm_mullo_epi16(_w0, _val0_01); - __m128i _sh00 = _mm_mulhi_epi16(_w0, _val0_01); - __m128i _sl10 = _mm_mullo_epi16(_w0, _val1_01); - __m128i _sh10 = _mm_mulhi_epi16(_w0, _val1_01); - __m128i _sl01 = _mm_mullo_epi16(_w1, _val0_23); - __m128i _sh01 = _mm_mulhi_epi16(_w1, _val0_23); - __m128i _sl11 = _mm_mullo_epi16(_w1, _val1_23); - __m128i _sh11 = _mm_mulhi_epi16(_w1, _val1_23); - __m128i _sl02 = _mm_mullo_epi16(_w2, _val0_45); - __m128i _sh02 = _mm_mulhi_epi16(_w2, _val0_45); - __m128i _sl12 = _mm_mullo_epi16(_w2, _val1_45); - __m128i _sh12 = _mm_mulhi_epi16(_w2, _val1_45); - __m128i _sl03 = _mm_mullo_epi16(_w3, _val0_67); - __m128i _sh03 = _mm_mulhi_epi16(_w3, _val0_67); - __m128i _sl13 = _mm_mullo_epi16(_w3, _val1_67); - __m128i _sh13 = _mm_mulhi_epi16(_w3, _val1_67); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl00, _sh00)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl00, _sh00)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl10, _sh10)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl10, _sh10)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl01, _sh01)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl01, _sh01)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl11, _sh11)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl11, _sh11)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl02, _sh02)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl02, _sh02)); - _sum2 = 
_mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl12, _sh12)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl12, _sh12)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl03, _sh03)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl03, _sh03)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl13, _sh13)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl13, _sh13)); + _sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00); + _sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01); + _sum02 = _mm_add_epi32(_mm_madd_epi16(_val0, _w2), _sum02); + _sum03 = _mm_add_epi32(_mm_madd_epi16(_val0, _w3), _sum03); + _sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10); + _sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11); + _sum12 = _mm_add_epi32(_mm_madd_epi16(_val1, _w2), _sum12); + _sum13 = _mm_add_epi32(_mm_madd_epi16(_val1, _w3), _sum13); #endif #endif @@ -794,19 +676,64 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& } #if __AVX2__ - __m256i _sum0_2 = _mm256_permute2x128_si256(_sum0_1, _sum2_3, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i _sum1_3 = _mm256_permute2x128_si256(_sum0_1, _sum2_3, _MM_SHUFFLE(0, 3, 0, 1)); - _sum0_2 = _mm256_add_epi32(_sum0_2, _sum1_3); + // transpose 4x8 + { + __m256i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm256_unpacklo_epi32(_sum00_11, _sum10_01); + _tmp1 = _mm256_unpacklo_epi32(_sum02_13, _sum12_03); + _tmp2 = _mm256_unpackhi_epi32(_sum00_11, _sum10_01); + _tmp3 = _mm256_unpackhi_epi32(_sum02_13, _sum12_03); + _sum00_11 = _mm256_unpacklo_epi64(_tmp0, _tmp1); + _sum10_01 = _mm256_unpackhi_epi64(_tmp0, _tmp1); + _sum02_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); + _sum12_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); + } + + _sum00_11 = _mm256_add_epi32(_sum00_11, _sum10_01); + _sum02_13 = _mm256_add_epi32(_sum02_13, _sum12_03); + _sum00_11 = _mm256_add_epi32(_sum00_11, _sum02_13); + + __m256i _perm_mask = _mm256_set_epi32(6, 3, 4, 1, 7, 2, 5, 0); + _sum00_11 = _mm256_permutevar8x32_epi32(_sum00_11, _perm_mask); int sum[8]; - _mm256_storeu_si256((__m256i*)sum, _sum0_2); + _mm256_storeu_si256((__m256i*)sum, _sum00_11); #else - _sum0 = _mm_add_epi32(_sum0, _sum1); - _sum2 = _mm_add_epi32(_sum2, _sum3); + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm_unpacklo_epi32(_sum00, _sum01); + _tmp1 = _mm_unpacklo_epi32(_sum02, _sum03); + _tmp2 = _mm_unpackhi_epi32(_sum00, _sum01); + _tmp3 = _mm_unpackhi_epi32(_sum02, _sum03); + _sum00 = _mm_unpacklo_epi64(_tmp0, _tmp1); + _sum01 = _mm_unpackhi_epi64(_tmp0, _tmp1); + _sum02 = _mm_unpacklo_epi64(_tmp2, _tmp3); + _sum03 = _mm_unpackhi_epi64(_tmp2, _tmp3); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm_unpacklo_epi32(_sum10, _sum11); + _tmp1 = _mm_unpacklo_epi32(_sum12, _sum13); + _tmp2 = _mm_unpackhi_epi32(_sum10, _sum11); + _tmp3 = _mm_unpackhi_epi32(_sum12, _sum13); + _sum10 = _mm_unpacklo_epi64(_tmp0, _tmp1); + _sum11 = _mm_unpackhi_epi64(_tmp0, _tmp1); + _sum12 = _mm_unpacklo_epi64(_tmp2, _tmp3); + _sum13 = _mm_unpackhi_epi64(_tmp2, _tmp3); + } + + _sum00 = _mm_add_epi32(_sum00, _sum01); + _sum02 = _mm_add_epi32(_sum02, _sum03); + _sum10 = _mm_add_epi32(_sum10, _sum11); + _sum12 = _mm_add_epi32(_sum12, _sum13); + + _sum00 = _mm_add_epi32(_sum00, _sum02); + _sum10 = _mm_add_epi32(_sum10, _sum12); int sum[8]; - _mm_storeu_si128((__m128i*)sum, _sum0); - _mm_storeu_si128((__m128i*)(sum + 4), _sum2); + _mm_storeu_si128((__m128i*)sum, _sum00); + _mm_storeu_si128((__m128i*)(sum + 4), _sum10); #endif output0_tm[0] = sum[0]; @@ -835,9 
+762,12 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& #if __AVX2__ __m256i _sum0_1 = _mm256_setzero_si256(); + __m256i _sum2_3 = _mm256_setzero_si256(); #else __m128i _sum0 = _mm_setzero_si128(); __m128i _sum1 = _mm_setzero_si128(); + __m128i _sum2 = _mm_setzero_si128(); + __m128i _sum3 = _mm_setzero_si128(); #endif for (int j = 0; j < nn; j++) @@ -849,37 +779,14 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); + __m256i _valval = _mm256_inserti128_si256(_mm256_castsi128_si256(_val), _val, 1); + #if __AVXVNNI__ || __AVX512VNNI__ - // 0 1 0 1 x x x x - // 0 1 0 1 0 1 0 1 - __m128i _val_01 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i _val_23 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i _val_45 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i _val_67 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(3, 3, 3, 3)); - - __m256i _val_0123 = _mm256_inserti128_si256(_mm256_castsi128_si256(_val_01), _val_23, 1); - __m256i _val_4567 = _mm256_inserti128_si256(_mm256_castsi128_si256(_val_45), _val_67, 1); - - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w01, _val_0123); - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w23, _val_4567); + _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _valval, _w01); + _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _valval, _w23); #else - // 0 0 1 1 2 2 3 3 - // 4 4 5 5 6 6 7 7 - __m256i _val_0123 = _mm256_castsi128_si256(_mm_unpacklo_epi16(_val, _val)); - __m256i _val_4567 = _mm256_castsi128_si256(_mm_unpackhi_epi16(_val, _val)); - - _val_0123 = _mm256_permutevar8x32_epi32(_val_0123, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - _val_4567 = _mm256_permutevar8x32_epi32(_val_4567, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - - __m256i _sl00_01 = _mm256_mullo_epi16(_w01, _val_0123); - __m256i _sh00_01 = _mm256_mulhi_epi16(_w01, _val_0123); - __m256i _sl02_03 = _mm256_mullo_epi16(_w23, _val_4567); - __m256i _sh02_03 = _mm256_mulhi_epi16(_w23, _val_4567); - - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl00_01, _sh00_01)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl02_03, _sh02_03)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl00_01, _sh00_01)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl02_03, _sh02_03)); + _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_madd_epi16(_valval, _w01)); + _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_madd_epi16(_valval, _w23)); #endif #else __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); @@ -888,43 +795,15 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& __m128i _w3 = _mm_loadu_si128((const __m128i*)(k0 + 24)); #if __XOP__ - __m128i _val01 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i _val23 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i _val45 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i _val67 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(3, 3, 3, 3)); - - _sum0 = _mm_maddd_epi16(_val01, _w0, _sum0); - _sum1 = _mm_maddd_epi16(_val23, _w1, _sum1); - _sum0 = _mm_maddd_epi16(_val45, _w2, _sum0); - _sum1 = _mm_maddd_epi16(_val67, _w3, _sum1); + _sum0 = _mm_maddd_epi16(_val, _w0, _sum0); + _sum1 = _mm_maddd_epi16(_val, _w1, _sum1); + _sum2 = _mm_maddd_epi16(_val, _w2, _sum2); + _sum3 = _mm_maddd_epi16(_val, _w3, _sum3); #else - // 0 0 1 1 2 2 3 3 - // 4 4 5 5 6 6 7 7 - __m128i _val_0123 = _mm_unpacklo_epi16(_val, 
_val); - __m128i _val_4567 = _mm_unpackhi_epi16(_val, _val); - - __m128i _val01 = _mm_unpacklo_epi32(_val_0123, _val_0123); - __m128i _val23 = _mm_unpackhi_epi32(_val_0123, _val_0123); - __m128i _val45 = _mm_unpacklo_epi32(_val_4567, _val_4567); - __m128i _val67 = _mm_unpackhi_epi32(_val_4567, _val_4567); - - __m128i _sl0 = _mm_mullo_epi16(_w0, _val01); - __m128i _sh0 = _mm_mulhi_epi16(_w0, _val01); - __m128i _sl1 = _mm_mullo_epi16(_w1, _val23); - __m128i _sh1 = _mm_mulhi_epi16(_w1, _val23); - __m128i _sl2 = _mm_mullo_epi16(_w2, _val45); - __m128i _sh2 = _mm_mulhi_epi16(_w2, _val45); - __m128i _sl3 = _mm_mullo_epi16(_w3, _val67); - __m128i _sh3 = _mm_mulhi_epi16(_w3, _val67); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl0, _sh0)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl1, _sh1)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl1, _sh1)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl2, _sh2)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl2, _sh2)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl3, _sh3)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl3, _sh3)); + _sum0 = _mm_add_epi32(_mm_madd_epi16(_val, _w0), _sum0); + _sum1 = _mm_add_epi32(_mm_madd_epi16(_val, _w1), _sum1); + _sum2 = _mm_add_epi32(_mm_madd_epi16(_val, _w2), _sum2); + _sum3 = _mm_add_epi32(_mm_madd_epi16(_val, _w3), _sum3); #endif #endif @@ -935,8 +814,27 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& #if __AVX2__ __m128i _sum0 = _mm256_extracti128_si256(_sum0_1, 0); __m128i _sum1 = _mm256_extracti128_si256(_sum0_1, 1); + __m128i _sum2 = _mm256_extracti128_si256(_sum2_3, 0); + __m128i _sum3 = _mm256_extracti128_si256(_sum2_3, 1); #endif + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm_unpacklo_epi32(_sum0, _sum1); + _tmp1 = _mm_unpacklo_epi32(_sum2, _sum3); + _tmp2 = _mm_unpackhi_epi32(_sum0, _sum1); + _tmp3 = _mm_unpackhi_epi32(_sum2, _sum3); + _sum0 = _mm_unpacklo_epi64(_tmp0, _tmp1); + _sum1 = _mm_unpackhi_epi64(_tmp0, _tmp1); + _sum2 = _mm_unpacklo_epi64(_tmp2, _tmp3); + _sum3 = _mm_unpackhi_epi64(_tmp2, _tmp3); + } + _sum0 = _mm_add_epi32(_sum0, _sum1); + _sum2 = _mm_add_epi32(_sum2, _sum3); + + _sum0 = _mm_add_epi32(_sum0, _sum2); int sum[4]; _mm_storeu_si128((__m128i*)sum, _sum0); @@ -973,55 +871,38 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& const short* r0 = bb2.row(i / 4); const short* k0 = kernel0_tm.row(r); - __m128i _sum0 = _mm_setzero_si128(); - __m128i _sum1 = _mm_setzero_si128(); - __m128i _sum2 = _mm_setzero_si128(); - __m128i _sum3 = _mm_setzero_si128(); - __m128i _sum4 = _mm_setzero_si128(); - __m128i _sum5 = _mm_setzero_si128(); - __m128i _sum6 = _mm_setzero_si128(); - __m128i _sum7 = _mm_setzero_si128(); + __m256i _sum01 = _mm256_setzero_si256(); + __m256i _sum23 = _mm256_setzero_si256(); for (int q = 0; q < inch; q++) { - __m128i _val0 = _mm_loadu_si128((const __m128i*)r0); - __m128i _val1 = _mm_loadu_si128((const __m128i*)(r0 + 8)); - __m128i _val2 = _mm_loadu_si128((const __m128i*)(r0 + 16)); - __m128i _val3 = _mm_loadu_si128((const __m128i*)(r0 + 24)); + __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); + __m256i _val23 = _mm256_loadu_si256((const __m256i*)(r0 + 16)); __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); + __m256i _w01 = _mm256_inserti128_si256(_mm256_castsi128_si256(_w0), _w0, 1); - __m128i _sl0 = _mm_mullo_epi16(_val0, _w0); - __m128i _sh0 = 
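The "transpose 4x4" blocks above use the classic SSE2 pattern: _mm_unpacklo/unpackhi_epi32 interleaves pairs of rows, then _mm_unpacklo/unpackhi_epi64 reassembles the columns, so the per-output-channel partial sums can be added vertically instead of needing horizontal reductions. A standalone sketch of the same shuffle sequence, checked against a scalar transpose (illustrative, SSE2 only):

#include <emmintrin.h>
#include <stdio.h>

int main()
{
    int m[4][4];
    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            m[i][j] = i * 4 + j;

    __m128i r0 = _mm_loadu_si128((const __m128i*)m[0]);
    __m128i r1 = _mm_loadu_si128((const __m128i*)m[1]);
    __m128i r2 = _mm_loadu_si128((const __m128i*)m[2]);
    __m128i r3 = _mm_loadu_si128((const __m128i*)m[3]);

    // interleave 32-bit elements of row pairs, then 64-bit halves
    __m128i t0 = _mm_unpacklo_epi32(r0, r1); // 00 10 01 11
    __m128i t1 = _mm_unpacklo_epi32(r2, r3); // 20 30 21 31
    __m128i t2 = _mm_unpackhi_epi32(r0, r1); // 02 12 03 13
    __m128i t3 = _mm_unpackhi_epi32(r2, r3); // 22 32 23 33
    r0 = _mm_unpacklo_epi64(t0, t1);         // column 0
    r1 = _mm_unpackhi_epi64(t0, t1);         // column 1
    r2 = _mm_unpacklo_epi64(t2, t3);         // column 2
    r3 = _mm_unpackhi_epi64(t2, t3);         // column 3

    int out[4][4];
    _mm_storeu_si128((__m128i*)out[0], r0);
    _mm_storeu_si128((__m128i*)out[1], r1);
    _mm_storeu_si128((__m128i*)out[2], r2);
    _mm_storeu_si128((__m128i*)out[3], r3);

    int ok = 1;
    for (int i = 0; i < 4; i++)
        for (int j = 0; j < 4; j++)
            if (out[i][j] != m[j][i]) ok = 0;
    printf(ok ? "transpose ok\n" : "transpose mismatch\n");
    return 0;
}

The 8-lane "transpose 4x8" variants above apply the same epi32/epi64 unpack pattern per 128-bit half of each __m256i, then fix up the cross-lane order with the permutevar8x32 mask.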
_mm_mulhi_epi16(_val0, _w0); - __m128i _sl1 = _mm_mullo_epi16(_val1, _w0); - __m128i _sh1 = _mm_mulhi_epi16(_val1, _w0); - __m128i _sl2 = _mm_mullo_epi16(_val2, _w0); - __m128i _sh2 = _mm_mulhi_epi16(_val2, _w0); - __m128i _sl3 = _mm_mullo_epi16(_val3, _w0); - __m128i _sh3 = _mm_mulhi_epi16(_val3, _w0); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl0, _sh0)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl1, _sh1)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl1, _sh1)); - _sum4 = _mm_add_epi32(_sum4, _mm_unpacklo_epi16(_sl2, _sh2)); - _sum5 = _mm_add_epi32(_sum5, _mm_unpackhi_epi16(_sl2, _sh2)); - _sum6 = _mm_add_epi32(_sum6, _mm_unpacklo_epi16(_sl3, _sh3)); - _sum7 = _mm_add_epi32(_sum7, _mm_unpackhi_epi16(_sl3, _sh3)); +#if __AVXVNNI__ || __AVX512VNNI__ + _sum01 = _mm256_dpwssd_epi32(_sum01, _val01, _w01); + _sum23 = _mm256_dpwssd_epi32(_sum23, _val23, _w01); +#else + _sum01 = _mm256_add_epi32(_sum01, _mm256_madd_epi16(_val01, _w01)); + _sum23 = _mm256_add_epi32(_sum23, _mm256_madd_epi16(_val23, _w01)); +#endif k0 += 8; r0 += 32; } - _sum0 = _mm_add_epi32(_sum0, _sum1); - _sum2 = _mm_add_epi32(_sum2, _sum3); - _sum4 = _mm_add_epi32(_sum4, _sum5); - _sum6 = _mm_add_epi32(_sum6, _sum7); + __m128i _sum0 = _mm256_extracti128_si256(_sum01, 0); + __m128i _sum1 = _mm256_extracti128_si256(_sum01, 1); + __m128i _sum2 = _mm256_extracti128_si256(_sum23, 0); + __m128i _sum3 = _mm256_extracti128_si256(_sum23, 1); output0_tm[0] = _mm_reduce_add_epi32(_sum0); - output0_tm[1] = _mm_reduce_add_epi32(_sum2); - output0_tm[2] = _mm_reduce_add_epi32(_sum4); - output0_tm[3] = _mm_reduce_add_epi32(_sum6); + output0_tm[1] = _mm_reduce_add_epi32(_sum1); + output0_tm[2] = _mm_reduce_add_epi32(_sum2); + output0_tm[3] = _mm_reduce_add_epi32(_sum3); output0_tm += 4; } #endif @@ -1034,37 +915,52 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& #endif const short* k0 = kernel0_tm.row(r); +#if __AVX2__ + __m256i _sum01 = _mm256_setzero_si256(); +#else __m128i _sum0 = _mm_setzero_si128(); __m128i _sum1 = _mm_setzero_si128(); - __m128i _sum2 = _mm_setzero_si128(); - __m128i _sum3 = _mm_setzero_si128(); +#endif for (int q = 0; q < inch; q++) { +#if __AVX2__ + __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); + + __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); + __m256i _w01 = _mm256_inserti128_si256(_mm256_castsi128_si256(_w0), _w0, 1); + +#if __AVXVNNI__ || __AVX512VNNI__ + _sum01 = _mm256_dpwssd_epi32(_sum01, _val01, _w01); +#else + _sum01 = _mm256_add_epi32(_sum01, _mm256_madd_epi16(_val01, _w01)); +#endif +#else __m128i _val0 = _mm_loadu_si128((const __m128i*)r0); __m128i _val1 = _mm_loadu_si128((const __m128i*)(r0 + 8)); __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); - __m128i _sl0 = _mm_mullo_epi16(_val0, _w0); - __m128i _sh0 = _mm_mulhi_epi16(_val0, _w0); - __m128i _sl1 = _mm_mullo_epi16(_val1, _w0); - __m128i _sh1 = _mm_mulhi_epi16(_val1, _w0); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl0, _sh0)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl1, _sh1)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl1, _sh1)); +#if __XOP__ + _sum0 = _mm_maddd_epi16(_val0, _w0, _sum0); + _sum1 = _mm_maddd_epi16(_val1, _w0, _sum1); +#else + _sum0 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum0); + _sum1 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum1); +#endif +#endif k0 += 8; r0 += 16; } - _sum0 = 
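The simplified kernel repacking above (both the pack8to1 and pack8to4 variants) drops the per-ISA branches: for every transform position k it now copies, for each of the 4 output channels in the group, the 8 input-channel weights contiguously. A plain-array sketch of the resulting layout, assuming kernel_tm is indexed as [outch][inch][36]; the flat buffers and sizes here are hypothetical stand-ins for ncnn's Mat, for illustration only:

#include <cstdio>
#include <vector>

int main()
{
    const int outch = 8, inch = 16; // multiples of 4 and 8 for this sketch
    std::vector<short> ktm(outch * inch * 36);
    for (size_t i = 0; i < ktm.size(); i++)
        ktm[i] = (short)i;

    // packed layout: [outch/4 group][36][inch/8 block][4 outch][8 inch]
    std::vector<short> packed(outch * inch * 36);
    short* g00 = packed.data();
    for (int p = 0; p + 3 < outch; p += 4)
        for (int k = 0; k < 36; k++)
            for (int q = 0; q + 7 < inch; q += 8)
                for (int i = 0; i < 4; i++)
                    for (int j = 0; j < 8; j++)
                        // mirrors: kernel_tm.channel(p + i).row(q + j)[k]
                        *g00++ = ktm[((p + i) * inch + (q + j)) * 36 + k];

    printf("packed %zu weights\n", packed.size());
    return 0;
}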
_mm_add_epi32(_sum0, _sum1); - _sum2 = _mm_add_epi32(_sum2, _sum3); +#if __AVX2__ + __m128i _sum0 = _mm256_extracti128_si256(_sum01, 0); + __m128i _sum1 = _mm256_extracti128_si256(_sum01, 1); +#endif output0_tm[0] = _mm_reduce_add_epi32(_sum0); - output0_tm[1] = _mm_reduce_add_epi32(_sum2); + output0_tm[1] = _mm_reduce_add_epi32(_sum1); output0_tm += 2; } for (; i < tiles; i++) @@ -1077,26 +973,23 @@ static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& const short* k0 = kernel0_tm.row(r); __m128i _sum0 = _mm_setzero_si128(); - __m128i _sum1 = _mm_setzero_si128(); for (int q = 0; q < inch; q++) { - __m128i _val = _mm_loadu_si128((const __m128i*)r0); + __m128i _val0 = _mm_loadu_si128((const __m128i*)r0); __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); - __m128i _sl0 = _mm_mullo_epi16(_val, _w0); - __m128i _sh0 = _mm_mulhi_epi16(_val, _w0); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl0, _sh0)); +#if __XOP__ + _sum0 = _mm_maddd_epi16(_val0, _w0, _sum0); +#else + _sum0 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum0); +#endif k0 += 8; r0 += 8; } - _sum0 = _mm_add_epi32(_sum0, _sum1); - output0_tm[0] = _mm_reduce_add_epi32(_sum0); output0_tm++; } diff --git a/src/layer/x86/convolution_3x3_pack8to4_int8.h b/src/layer/x86/convolution_3x3_pack8to4_int8.h index 547de8774a64..2bb48ce1903a 100644 --- a/src/layer/x86/convolution_3x3_pack8to4_int8.h +++ b/src/layer/x86/convolution_3x3_pack8to4_int8.h @@ -125,59 +125,23 @@ static void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse(const Mat& k int q = 0; for (; q + 3 < outch; q += 4) { - const Mat k0 = kernel_tm.channel(q); - const Mat k1 = kernel_tm.channel(q + 1); - const Mat k2 = kernel_tm.channel(q + 2); - const Mat k3 = kernel_tm.channel(q + 3); - - Mat kernel_tm = kernel_tm_pack8.channel(q / 4); + Mat g0 = kernel_tm_pack8.channel(q / 4); for (int k = 0; k < 36; k++) { - short* g00 = kernel_tm.row(k); + short* g00 = g0.row(k); for (int p = 0; p + 7 < inch; p += 8) { -#if __AVXVNNI__ || __AVX512VNNI__ || __XOP__ for (int i = 0; i < 4; i++) { - const short* k00 = k0.row(p + i * 2); - const short* k10 = k1.row(p + i * 2); - const short* k20 = k2.row(p + i * 2); - const short* k30 = k3.row(p + i * 2); - - const short* k01 = k0.row(p + i * 2 + 1); - const short* k11 = k1.row(p + i * 2 + 1); - const short* k21 = k2.row(p + i * 2 + 1); - const short* k31 = k3.row(p + i * 2 + 1); - - g00[0] = k00[k]; - g00[1] = k01[k]; - g00[2] = k10[k]; - g00[3] = k11[k]; - g00[4] = k20[k]; - g00[5] = k21[k]; - g00[6] = k30[k]; - g00[7] = k31[k]; - - g00 += 8; - } -#else - for (int i = 0; i < 8; i++) - { - const short* k00 = k0.row(p + i); - const short* k10 = k1.row(p + i); - const short* k20 = k2.row(p + i); - const short* k30 = k3.row(p + i); - - g00[0] = k00[k]; - g00[1] = k10[k]; - g00[2] = k20[k]; - g00[3] = k30[k]; - - g00 += 4; + for (int j = 0; j < 8; j++) + { + const short* k00 = kernel_tm.channel(q + i).row(p + j); + g00[0] = k00[k]; + g00 += 1; + } } -#endif } } } @@ -482,113 +446,96 @@ static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& int nn = inch; // inch always > 0 - __m256i _sum0_1 = _mm256_setzero_si256(); - __m256i _sum2_3 = _mm256_setzero_si256(); - __m256i _sum4_5 = _mm256_setzero_si256(); - __m256i _sum6_7 = _mm256_setzero_si256(); + __m256i _sum00_11 = _mm256_setzero_si256(); + __m256i _sum10_01 = _mm256_setzero_si256(); + __m256i _sum02_13 = _mm256_setzero_si256(); + __m256i _sum12_03 = 
_mm256_setzero_si256(); + + __m256i _sum04_15 = _mm256_setzero_si256(); + __m256i _sum14_05 = _mm256_setzero_si256(); + __m256i _sum06_17 = _mm256_setzero_si256(); + __m256i _sum16_07 = _mm256_setzero_si256(); for (int j = 0; j < nn; j++) { // 0 1 2 3 4 5 6 7 8 9 a b c d e f - __m256i _val0 = _mm256_loadu_si256((const __m256i*)r0); + __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); + __m256i _val10 = _mm256_permute4x64_epi64(_val01, 78); + #if __AVXVNNI__ || __AVX512VNNI__ - __m256i _val0_0123 = _mm256_permutevar8x32_epi32(_val0, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); - __m256i _val0_4567 = _mm256_permutevar8x32_epi32(_val0, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); - __m256i _val0_89ab = _mm256_permutevar8x32_epi32(_val0, _mm256_set_epi32(5, 5, 5, 5, 4, 4, 4, 4)); - __m256i _val0_cdef = _mm256_permutevar8x32_epi32(_val0, _mm256_set_epi32(7, 7, 7, 7, 6, 6, 6, 6)); - - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w01, _val0_0123); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _w01, _val0_89ab); - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w23, _val0_4567); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _w23, _val0_cdef); + _sum00_11 = _mm256_dpwssd_epi32(_sum00_11, _val01, _w01); + _sum10_01 = _mm256_dpwssd_epi32(_sum10_01, _val10, _w01); + _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01, _w23); + _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10, _w23); #else - // 0 0 1 1 2 2 3 3 8 8 9 9 a a b b - // 4 4 5 5 6 6 7 7 c c d d e e f f - __m256i _val0_0123_89ab = _mm256_unpacklo_epi16(_val0, _val0); - __m256i _val0_4567_cdef = _mm256_unpackhi_epi16(_val0, _val0); - - __m256i _val0_0123 = _mm256_permutevar8x32_epi32(_val0_0123_89ab, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val0_4567 = _mm256_permutevar8x32_epi32(_val0_4567_cdef, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val0_89ab = _mm256_permutevar8x32_epi32(_val0_0123_89ab, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - __m256i _val0_cdef = _mm256_permutevar8x32_epi32(_val0_4567_cdef, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - - __m256i _sl00_01 = _mm256_mullo_epi16(_w01, _val0_0123); - __m256i _sh00_01 = _mm256_mulhi_epi16(_w01, _val0_0123); - __m256i _sl10_11 = _mm256_mullo_epi16(_w01, _val0_89ab); - __m256i _sh10_11 = _mm256_mulhi_epi16(_w01, _val0_89ab); - __m256i _sl02_03 = _mm256_mullo_epi16(_w23, _val0_4567); - __m256i _sh02_03 = _mm256_mulhi_epi16(_w23, _val0_4567); - __m256i _sl12_13 = _mm256_mullo_epi16(_w23, _val0_cdef); - __m256i _sh12_13 = _mm256_mulhi_epi16(_w23, _val0_cdef); - - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl00_01, _sh00_01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl10_11, _sh10_11)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl02_03, _sh02_03)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl12_13, _sh12_13)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl00_01, _sh00_01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl10_11, _sh10_11)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl02_03, _sh02_03)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl12_13, _sh12_13)); + _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01, _w01)); + _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10, _w01)); + _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01, _w23)); + _sum12_03 = _mm256_add_epi32(_sum12_03, 
_mm256_madd_epi16(_val10, _w23)); #endif - __m256i _val1 = _mm256_loadu_si256((const __m256i*)(r0 + 16)); + __m256i _val23 = _mm256_loadu_si256((const __m256i*)(r0 + 16)); + + __m256i _val32 = _mm256_permute4x64_epi64(_val23, 78); #if __AVXVNNI__ || __AVX512VNNI__ - __m256i _val1_0123 = _mm256_permutevar8x32_epi32(_val1, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); - __m256i _val1_4567 = _mm256_permutevar8x32_epi32(_val1, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); - __m256i _val1_89ab = _mm256_permutevar8x32_epi32(_val1, _mm256_set_epi32(5, 5, 5, 5, 4, 4, 4, 4)); - __m256i _val1_cdef = _mm256_permutevar8x32_epi32(_val1, _mm256_set_epi32(7, 7, 7, 7, 6, 6, 6, 6)); - - _sum4_5 = _mm256_dpwssd_epi32(_sum4_5, _w01, _val1_0123); - _sum6_7 = _mm256_dpwssd_epi32(_sum6_7, _w01, _val1_89ab); - _sum4_5 = _mm256_dpwssd_epi32(_sum4_5, _w23, _val1_4567); - _sum6_7 = _mm256_dpwssd_epi32(_sum6_7, _w23, _val1_cdef); + _sum04_15 = _mm256_dpwssd_epi32(_sum04_15, _val23, _w01); + _sum14_05 = _mm256_dpwssd_epi32(_sum14_05, _val32, _w01); + _sum06_17 = _mm256_dpwssd_epi32(_sum06_17, _val23, _w23); + _sum16_07 = _mm256_dpwssd_epi32(_sum16_07, _val32, _w23); #else - __m256i _val1_0123_89ab = _mm256_unpacklo_epi16(_val1, _val1); - __m256i _val1_4567_cdef = _mm256_unpackhi_epi16(_val1, _val1); - - __m256i _val1_0123 = _mm256_permutevar8x32_epi32(_val1_0123_89ab, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val1_4567 = _mm256_permutevar8x32_epi32(_val1_4567_cdef, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val1_89ab = _mm256_permutevar8x32_epi32(_val1_0123_89ab, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - __m256i _val1_cdef = _mm256_permutevar8x32_epi32(_val1_4567_cdef, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - - __m256i _sl04_05 = _mm256_mullo_epi16(_w01, _val1_0123); - __m256i _sh04_05 = _mm256_mulhi_epi16(_w01, _val1_0123); - __m256i _sl14_15 = _mm256_mullo_epi16(_w01, _val1_89ab); - __m256i _sh14_15 = _mm256_mulhi_epi16(_w01, _val1_89ab); - __m256i _sl06_07 = _mm256_mullo_epi16(_w23, _val1_4567); - __m256i _sh06_07 = _mm256_mulhi_epi16(_w23, _val1_4567); - __m256i _sl16_17 = _mm256_mullo_epi16(_w23, _val1_cdef); - __m256i _sh16_17 = _mm256_mulhi_epi16(_w23, _val1_cdef); - - _sum4_5 = _mm256_add_epi32(_sum4_5, _mm256_unpacklo_epi16(_sl04_05, _sh04_05)); - _sum6_7 = _mm256_add_epi32(_sum6_7, _mm256_unpacklo_epi16(_sl14_15, _sh14_15)); - _sum4_5 = _mm256_add_epi32(_sum4_5, _mm256_unpacklo_epi16(_sl06_07, _sh06_07)); - _sum6_7 = _mm256_add_epi32(_sum6_7, _mm256_unpacklo_epi16(_sl16_17, _sh16_17)); - _sum4_5 = _mm256_add_epi32(_sum4_5, _mm256_unpackhi_epi16(_sl04_05, _sh04_05)); - _sum6_7 = _mm256_add_epi32(_sum6_7, _mm256_unpackhi_epi16(_sl14_15, _sh14_15)); - _sum4_5 = _mm256_add_epi32(_sum4_5, _mm256_unpackhi_epi16(_sl06_07, _sh06_07)); - _sum6_7 = _mm256_add_epi32(_sum6_7, _mm256_unpackhi_epi16(_sl16_17, _sh16_17)); + _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_madd_epi16(_val23, _w01)); + _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_madd_epi16(_val32, _w01)); + _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_madd_epi16(_val23, _w23)); + _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_madd_epi16(_val32, _w23)); #endif r0 += 32; k0 += 32; } - __m256i _sum0_2 = _mm256_permute2x128_si256(_sum0_1, _sum2_3, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i _sum1_3 = _mm256_permute2x128_si256(_sum0_1, _sum2_3, _MM_SHUFFLE(0, 3, 0, 1)); - _sum0_2 = _mm256_add_epi32(_sum0_2, _sum1_3); + // transpose 4x8 + { + __m256i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm256_unpacklo_epi32(_sum00_11, _sum10_01); + _tmp1 = 
_mm256_unpacklo_epi32(_sum02_13, _sum12_03); + _tmp2 = _mm256_unpackhi_epi32(_sum00_11, _sum10_01); + _tmp3 = _mm256_unpackhi_epi32(_sum02_13, _sum12_03); + _sum00_11 = _mm256_unpacklo_epi64(_tmp0, _tmp1); + _sum10_01 = _mm256_unpackhi_epi64(_tmp0, _tmp1); + _sum02_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); + _sum12_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); + } + { + __m256i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm256_unpacklo_epi32(_sum04_15, _sum14_05); + _tmp1 = _mm256_unpacklo_epi32(_sum06_17, _sum16_07); + _tmp2 = _mm256_unpackhi_epi32(_sum04_15, _sum14_05); + _tmp3 = _mm256_unpackhi_epi32(_sum06_17, _sum16_07); + _sum04_15 = _mm256_unpacklo_epi64(_tmp0, _tmp1); + _sum14_05 = _mm256_unpackhi_epi64(_tmp0, _tmp1); + _sum06_17 = _mm256_unpacklo_epi64(_tmp2, _tmp3); + _sum16_07 = _mm256_unpackhi_epi64(_tmp2, _tmp3); + } + + _sum00_11 = _mm256_add_epi32(_sum00_11, _sum10_01); + _sum02_13 = _mm256_add_epi32(_sum02_13, _sum12_03); + _sum00_11 = _mm256_add_epi32(_sum00_11, _sum02_13); - __m256i _sum4_6 = _mm256_permute2x128_si256(_sum4_5, _sum6_7, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i _sum5_7 = _mm256_permute2x128_si256(_sum4_5, _sum6_7, _MM_SHUFFLE(0, 3, 0, 1)); - _sum4_6 = _mm256_add_epi32(_sum4_6, _sum5_7); + _sum04_15 = _mm256_add_epi32(_sum04_15, _sum14_05); + _sum06_17 = _mm256_add_epi32(_sum06_17, _sum16_07); + _sum04_15 = _mm256_add_epi32(_sum04_15, _sum06_17); - _mm256_storeu_si256((__m256i*)output0_tm, _sum0_2); - _mm256_storeu_si256((__m256i*)(output0_tm + 8), _sum4_6); + __m256i _perm_mask = _mm256_set_epi32(6, 3, 4, 1, 7, 2, 5, 0); + _sum00_11 = _mm256_permutevar8x32_epi32(_sum00_11, _perm_mask); + _sum04_15 = _mm256_permutevar8x32_epi32(_sum04_15, _perm_mask); + + _mm256_storeu_si256((__m256i*)output0_tm, _sum00_11); + _mm256_storeu_si256((__m256i*)(output0_tm + 8), _sum04_15); output0_tm += 16; } #endif @@ -604,60 +551,42 @@ static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& int nn = inch; // inch always > 0 #if __AVX2__ - __m256i _sum0_1 = _mm256_setzero_si256(); - __m256i _sum2_3 = _mm256_setzero_si256(); + __m256i _sum00_11 = _mm256_setzero_si256(); + __m256i _sum10_01 = _mm256_setzero_si256(); + __m256i _sum02_13 = _mm256_setzero_si256(); + __m256i _sum12_03 = _mm256_setzero_si256(); #else - __m128i _sum0 = _mm_setzero_si128(); - __m128i _sum1 = _mm_setzero_si128(); - __m128i _sum2 = _mm_setzero_si128(); - __m128i _sum3 = _mm_setzero_si128(); + __m128i _sum00 = _mm_setzero_si128(); + __m128i _sum01 = _mm_setzero_si128(); + __m128i _sum02 = _mm_setzero_si128(); + __m128i _sum03 = _mm_setzero_si128(); + __m128i _sum10 = _mm_setzero_si128(); + __m128i _sum11 = _mm_setzero_si128(); + __m128i _sum12 = _mm_setzero_si128(); + __m128i _sum13 = _mm_setzero_si128(); #endif for (int j = 0; j < nn; j++) { #if __AVX2__ // 0 1 2 3 4 5 6 7 8 9 a b c d e f - __m256i _val = _mm256_loadu_si256((const __m256i*)r0); + __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); + __m256i _val10 = _mm256_permute4x64_epi64(_val01, 78); + #if __AVXVNNI__ || __AVX512VNNI__ - __m256i _val_0123 = _mm256_permutevar8x32_epi32(_val, _mm256_set_epi32(1, 1, 1, 1, 0, 0, 0, 0)); - __m256i _val_4567 = _mm256_permutevar8x32_epi32(_val, _mm256_set_epi32(3, 3, 3, 3, 2, 2, 2, 2)); - __m256i _val_89ab = _mm256_permutevar8x32_epi32(_val, _mm256_set_epi32(5, 5, 5, 5, 4, 4, 4, 4)); - __m256i _val_cdef = _mm256_permutevar8x32_epi32(_val, _mm256_set_epi32(7, 7, 7, 7, 6, 6, 6, 
6)); - - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w01, _val_0123); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _w01, _val_89ab); - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w23, _val_4567); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _w23, _val_cdef); + _sum00_11 = _mm256_dpwssd_epi32(_sum00_11, _val01, _w01); + _sum10_01 = _mm256_dpwssd_epi32(_sum10_01, _val10, _w01); + _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01, _w23); + _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10, _w23); #else - __m256i _val_0123_89ab = _mm256_unpacklo_epi16(_val, _val); - __m256i _val_4567_cdef = _mm256_unpackhi_epi16(_val, _val); - - __m256i _val_0123 = _mm256_permutevar8x32_epi32(_val_0123_89ab, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val_4567 = _mm256_permutevar8x32_epi32(_val_4567_cdef, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - __m256i _val_89ab = _mm256_permutevar8x32_epi32(_val_0123_89ab, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - __m256i _val_cdef = _mm256_permutevar8x32_epi32(_val_4567_cdef, _mm256_set_epi32(7, 7, 6, 6, 5, 5, 4, 4)); - - __m256i _sl00_01 = _mm256_mullo_epi16(_w01, _val_0123); - __m256i _sh00_01 = _mm256_mulhi_epi16(_w01, _val_0123); - __m256i _sl10_11 = _mm256_mullo_epi16(_w01, _val_89ab); - __m256i _sh10_11 = _mm256_mulhi_epi16(_w01, _val_89ab); - __m256i _sl02_03 = _mm256_mullo_epi16(_w23, _val_4567); - __m256i _sh02_03 = _mm256_mulhi_epi16(_w23, _val_4567); - __m256i _sl12_13 = _mm256_mullo_epi16(_w23, _val_cdef); - __m256i _sh12_13 = _mm256_mulhi_epi16(_w23, _val_cdef); - - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl00_01, _sh00_01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl10_11, _sh10_11)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl02_03, _sh02_03)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl12_13, _sh12_13)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl00_01, _sh00_01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl10_11, _sh10_11)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl02_03, _sh02_03)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl12_13, _sh12_13)); + _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01, _w01)); + _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10, _w01)); + _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01, _w23)); + _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10, _w23)); #endif #else // 0 1 2 3 4 5 6 7 @@ -670,75 +599,23 @@ static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& __m128i _w3 = _mm_loadu_si128((const __m128i*)(k0 + 24)); #if __XOP__ - __m128i _val0_01 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i _val0_23 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i _val0_45 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i _val0_67 = _mm_shuffle_epi32(_val0, _MM_SHUFFLE(3, 3, 3, 3)); - __m128i _val1_01 = _mm_shuffle_epi32(_val1, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i _val1_23 = _mm_shuffle_epi32(_val1, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i _val1_45 = _mm_shuffle_epi32(_val1, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i _val1_67 = _mm_shuffle_epi32(_val1, _MM_SHUFFLE(3, 3, 3, 3)); - - _sum0 = _mm_maddd_epi16(_val0_01, _w0, _sum0); - _sum1 = _mm_maddd_epi16(_val0_23, _w1, _sum1); - _sum2 = _mm_maddd_epi16(_val1_01, _w0, _sum2); - _sum3 = _mm_maddd_epi16(_val1_23, _w1, _sum3); - _sum0 = _mm_maddd_epi16(_val0_45, _w2, _sum0); - _sum1 = _mm_maddd_epi16(_val0_67, _w3, _sum1); 
- _sum2 = _mm_maddd_epi16(_val1_45, _w2, _sum2); - _sum3 = _mm_maddd_epi16(_val1_67, _w3, _sum3); + _sum00 = _mm_maddd_epi16(_val0, _w0, _sum00); + _sum01 = _mm_maddd_epi16(_val0, _w1, _sum01); + _sum02 = _mm_maddd_epi16(_val0, _w2, _sum02); + _sum03 = _mm_maddd_epi16(_val0, _w3, _sum03); + _sum10 = _mm_maddd_epi16(_val1, _w0, _sum10); + _sum11 = _mm_maddd_epi16(_val1, _w1, _sum11); + _sum12 = _mm_maddd_epi16(_val1, _w2, _sum12); + _sum13 = _mm_maddd_epi16(_val1, _w3, _sum13); #else - // 0 0 1 1 2 2 3 3 - // 4 4 5 5 6 6 7 7 - __m128i _val0_0123 = _mm_unpacklo_epi16(_val0, _val0); - __m128i _val0_4567 = _mm_unpackhi_epi16(_val0, _val0); - - __m128i _val1_0123 = _mm_unpacklo_epi16(_val1, _val1); - __m128i _val1_4567 = _mm_unpackhi_epi16(_val1, _val1); - - __m128i _val0_01 = _mm_unpacklo_epi32(_val0_0123, _val0_0123); - __m128i _val0_23 = _mm_unpackhi_epi32(_val0_0123, _val0_0123); - __m128i _val0_45 = _mm_unpacklo_epi32(_val0_4567, _val0_4567); - __m128i _val0_67 = _mm_unpackhi_epi32(_val0_4567, _val0_4567); - - __m128i _val1_01 = _mm_unpacklo_epi32(_val1_0123, _val1_0123); - __m128i _val1_23 = _mm_unpackhi_epi32(_val1_0123, _val1_0123); - __m128i _val1_45 = _mm_unpacklo_epi32(_val1_4567, _val1_4567); - __m128i _val1_67 = _mm_unpackhi_epi32(_val1_4567, _val1_4567); - - __m128i _sl00 = _mm_mullo_epi16(_w0, _val0_01); - __m128i _sh00 = _mm_mulhi_epi16(_w0, _val0_01); - __m128i _sl10 = _mm_mullo_epi16(_w0, _val1_01); - __m128i _sh10 = _mm_mulhi_epi16(_w0, _val1_01); - __m128i _sl01 = _mm_mullo_epi16(_w1, _val0_23); - __m128i _sh01 = _mm_mulhi_epi16(_w1, _val0_23); - __m128i _sl11 = _mm_mullo_epi16(_w1, _val1_23); - __m128i _sh11 = _mm_mulhi_epi16(_w1, _val1_23); - __m128i _sl02 = _mm_mullo_epi16(_w2, _val0_45); - __m128i _sh02 = _mm_mulhi_epi16(_w2, _val0_45); - __m128i _sl12 = _mm_mullo_epi16(_w2, _val1_45); - __m128i _sh12 = _mm_mulhi_epi16(_w2, _val1_45); - __m128i _sl03 = _mm_mullo_epi16(_w3, _val0_67); - __m128i _sh03 = _mm_mulhi_epi16(_w3, _val0_67); - __m128i _sl13 = _mm_mullo_epi16(_w3, _val1_67); - __m128i _sh13 = _mm_mulhi_epi16(_w3, _val1_67); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl00, _sh00)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl00, _sh00)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl10, _sh10)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl10, _sh10)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl01, _sh01)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl01, _sh01)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl11, _sh11)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl11, _sh11)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl02, _sh02)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl02, _sh02)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl12, _sh12)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl12, _sh12)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl03, _sh03)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl03, _sh03)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl13, _sh13)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl13, _sh13)); + _sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00); + _sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01); + _sum02 = _mm_add_epi32(_mm_madd_epi16(_val0, _w2), _sum02); + _sum03 = _mm_add_epi32(_mm_madd_epi16(_val0, _w3), _sum03); + _sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10); + _sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11); + _sum12 = 
_mm_add_epi32(_mm_madd_epi16(_val1, _w2), _sum12); + _sum13 = _mm_add_epi32(_mm_madd_epi16(_val1, _w3), _sum13); #endif #endif @@ -747,17 +624,62 @@ static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& } #if __AVX2__ - __m256i _sum0_2 = _mm256_permute2x128_si256(_sum0_1, _sum2_3, _MM_SHUFFLE(0, 2, 0, 0)); - __m256i _sum1_3 = _mm256_permute2x128_si256(_sum0_1, _sum2_3, _MM_SHUFFLE(0, 3, 0, 1)); - _sum0_2 = _mm256_add_epi32(_sum0_2, _sum1_3); + // transpose 4x8 + { + __m256i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm256_unpacklo_epi32(_sum00_11, _sum10_01); + _tmp1 = _mm256_unpacklo_epi32(_sum02_13, _sum12_03); + _tmp2 = _mm256_unpackhi_epi32(_sum00_11, _sum10_01); + _tmp3 = _mm256_unpackhi_epi32(_sum02_13, _sum12_03); + _sum00_11 = _mm256_unpacklo_epi64(_tmp0, _tmp1); + _sum10_01 = _mm256_unpackhi_epi64(_tmp0, _tmp1); + _sum02_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); + _sum12_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); + } - _mm256_storeu_si256((__m256i*)output0_tm, _sum0_2); + _sum00_11 = _mm256_add_epi32(_sum00_11, _sum10_01); + _sum02_13 = _mm256_add_epi32(_sum02_13, _sum12_03); + _sum00_11 = _mm256_add_epi32(_sum00_11, _sum02_13); + + __m256i _perm_mask = _mm256_set_epi32(6, 3, 4, 1, 7, 2, 5, 0); + _sum00_11 = _mm256_permutevar8x32_epi32(_sum00_11, _perm_mask); + + _mm256_storeu_si256((__m256i*)output0_tm, _sum00_11); #else - _sum0 = _mm_add_epi32(_sum0, _sum1); - _sum2 = _mm_add_epi32(_sum2, _sum3); + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm_unpacklo_epi32(_sum00, _sum01); + _tmp1 = _mm_unpacklo_epi32(_sum02, _sum03); + _tmp2 = _mm_unpackhi_epi32(_sum00, _sum01); + _tmp3 = _mm_unpackhi_epi32(_sum02, _sum03); + _sum00 = _mm_unpacklo_epi64(_tmp0, _tmp1); + _sum01 = _mm_unpackhi_epi64(_tmp0, _tmp1); + _sum02 = _mm_unpacklo_epi64(_tmp2, _tmp3); + _sum03 = _mm_unpackhi_epi64(_tmp2, _tmp3); + } + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm_unpacklo_epi32(_sum10, _sum11); + _tmp1 = _mm_unpacklo_epi32(_sum12, _sum13); + _tmp2 = _mm_unpackhi_epi32(_sum10, _sum11); + _tmp3 = _mm_unpackhi_epi32(_sum12, _sum13); + _sum10 = _mm_unpacklo_epi64(_tmp0, _tmp1); + _sum11 = _mm_unpackhi_epi64(_tmp0, _tmp1); + _sum12 = _mm_unpacklo_epi64(_tmp2, _tmp3); + _sum13 = _mm_unpackhi_epi64(_tmp2, _tmp3); + } - _mm_storeu_si128((__m128i*)output0_tm, _sum0); - _mm_storeu_si128((__m128i*)(output0_tm + 4), _sum2); + _sum00 = _mm_add_epi32(_sum00, _sum01); + _sum02 = _mm_add_epi32(_sum02, _sum03); + _sum10 = _mm_add_epi32(_sum10, _sum11); + _sum12 = _mm_add_epi32(_sum12, _sum13); + + _sum00 = _mm_add_epi32(_sum00, _sum02); + _sum10 = _mm_add_epi32(_sum10, _sum12); + + _mm_storeu_si128((__m128i*)output0_tm, _sum00); + _mm_storeu_si128((__m128i*)(output0_tm + 4), _sum10); #endif output0_tm += 8; } @@ -774,9 +696,12 @@ static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& #if __AVX2__ __m256i _sum0_1 = _mm256_setzero_si256(); + __m256i _sum2_3 = _mm256_setzero_si256(); #else __m128i _sum0 = _mm_setzero_si128(); __m128i _sum1 = _mm_setzero_si128(); + __m128i _sum2 = _mm_setzero_si128(); + __m128i _sum3 = _mm_setzero_si128(); #endif for (int j = 0; j < nn; j++) @@ -787,37 +712,14 @@ static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); + __m256i _valval = _mm256_inserti128_si256(_mm256_castsi128_si256(_val), _val, 1); + #if __AVXVNNI__ || __AVX512VNNI__ - // 0 1 0 1 x x x x - // 0 1 0 1 0 1 
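In the remainder loops above, _mm256_inserti128_si256(_mm256_castsi128_si256(_val), _val, 1) duplicates one 128-bit register into both halves of a 256-bit register (the cast leaves the upper half undefined, the insert then fills it), so one load of eight int16 values can be multiplied against two weight registers at once; _mm256_broadcastsi128_si256(_val) expresses the same thing on AVX2. A tiny check (assumes an AVX2 build; illustrative only):

#include <immintrin.h>
#include <stdio.h>

int main()
{
    short v[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    __m128i val = _mm_loadu_si128((const __m128i*)v);

    // duplicate the 128-bit lane into both halves of a 256-bit register
    __m256i valval = _mm256_inserti128_si256(_mm256_castsi128_si256(val), val, 1);

    short out[16];
    _mm256_storeu_si256((__m256i*)out, valval);
    for (int i = 0; i < 16; i++)
        printf("%d ", out[i]); // prints 1..8 twice
    printf("\n");
    return 0;
}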
0 1 - __m128i _val_01 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i _val_23 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i _val_45 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i _val_67 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(3, 3, 3, 3)); - - __m256i _val_0123 = _mm256_inserti128_si256(_mm256_castsi128_si256(_val_01), _val_23, 1); - __m256i _val_4567 = _mm256_inserti128_si256(_mm256_castsi128_si256(_val_45), _val_67, 1); - - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w01, _val_0123); - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _w23, _val_4567); + _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _valval, _w01); + _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _valval, _w23); #else - // 0 0 1 1 2 2 3 3 - // 4 4 5 5 6 6 7 7 - __m256i _val_0123 = _mm256_castsi128_si256(_mm_unpacklo_epi16(_val, _val)); - __m256i _val_4567 = _mm256_castsi128_si256(_mm_unpackhi_epi16(_val, _val)); - - _val_0123 = _mm256_permutevar8x32_epi32(_val_0123, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - _val_4567 = _mm256_permutevar8x32_epi32(_val_4567, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); - - __m256i _sl00_01 = _mm256_mullo_epi16(_w01, _val_0123); - __m256i _sh00_01 = _mm256_mulhi_epi16(_w01, _val_0123); - __m256i _sl02_03 = _mm256_mullo_epi16(_w23, _val_4567); - __m256i _sh02_03 = _mm256_mulhi_epi16(_w23, _val_4567); - - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl00_01, _sh00_01)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl02_03, _sh02_03)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl00_01, _sh00_01)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl02_03, _sh02_03)); + _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_madd_epi16(_valval, _w01)); + _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_madd_epi16(_valval, _w23)); #endif #else __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); @@ -826,43 +728,15 @@ static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& __m128i _w3 = _mm_loadu_si128((const __m128i*)(k0 + 24)); #if __XOP__ - __m128i _val01 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(0, 0, 0, 0)); - __m128i _val23 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(1, 1, 1, 1)); - __m128i _val45 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(2, 2, 2, 2)); - __m128i _val67 = _mm_shuffle_epi32(_val, _MM_SHUFFLE(3, 3, 3, 3)); - - _sum0 = _mm_maddd_epi16(_val01, _w0, _sum0); - _sum1 = _mm_maddd_epi16(_val23, _w1, _sum1); - _sum0 = _mm_maddd_epi16(_val45, _w2, _sum0); - _sum1 = _mm_maddd_epi16(_val67, _w3, _sum1); + _sum0 = _mm_maddd_epi16(_val, _w0, _sum0); + _sum1 = _mm_maddd_epi16(_val, _w1, _sum1); + _sum2 = _mm_maddd_epi16(_val, _w2, _sum2); + _sum3 = _mm_maddd_epi16(_val, _w3, _sum3); #else - // 0 0 1 1 2 2 3 3 - // 4 4 5 5 6 6 7 7 - __m128i _val_0123 = _mm_unpacklo_epi16(_val, _val); - __m128i _val_4567 = _mm_unpackhi_epi16(_val, _val); - - __m128i _val01 = _mm_unpacklo_epi32(_val_0123, _val_0123); - __m128i _val23 = _mm_unpackhi_epi32(_val_0123, _val_0123); - __m128i _val45 = _mm_unpacklo_epi32(_val_4567, _val_4567); - __m128i _val67 = _mm_unpackhi_epi32(_val_4567, _val_4567); - - __m128i _sl0 = _mm_mullo_epi16(_w0, _val01); - __m128i _sh0 = _mm_mulhi_epi16(_w0, _val01); - __m128i _sl1 = _mm_mullo_epi16(_w1, _val23); - __m128i _sh1 = _mm_mulhi_epi16(_w1, _val23); - __m128i _sl2 = _mm_mullo_epi16(_w2, _val45); - __m128i _sh2 = _mm_mulhi_epi16(_w2, _val45); - __m128i _sl3 = _mm_mullo_epi16(_w3, _val67); - __m128i _sh3 = _mm_mulhi_epi16(_w3, _val67); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0)); - _sum1 = 
_mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl0, _sh0)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl1, _sh1)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl1, _sh1)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl2, _sh2)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl2, _sh2)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl3, _sh3)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl3, _sh3)); + _sum0 = _mm_add_epi32(_mm_madd_epi16(_val, _w0), _sum0); + _sum1 = _mm_add_epi32(_mm_madd_epi16(_val, _w1), _sum1); + _sum2 = _mm_add_epi32(_mm_madd_epi16(_val, _w2), _sum2); + _sum3 = _mm_add_epi32(_mm_madd_epi16(_val, _w3), _sum3); #endif #endif @@ -873,8 +747,27 @@ static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& #if __AVX2__ __m128i _sum0 = _mm256_extracti128_si256(_sum0_1, 0); __m128i _sum1 = _mm256_extracti128_si256(_sum0_1, 1); + __m128i _sum2 = _mm256_extracti128_si256(_sum2_3, 0); + __m128i _sum3 = _mm256_extracti128_si256(_sum2_3, 1); #endif + + // transpose 4x4 + { + __m128i _tmp0, _tmp1, _tmp2, _tmp3; + _tmp0 = _mm_unpacklo_epi32(_sum0, _sum1); + _tmp1 = _mm_unpacklo_epi32(_sum2, _sum3); + _tmp2 = _mm_unpackhi_epi32(_sum0, _sum1); + _tmp3 = _mm_unpackhi_epi32(_sum2, _sum3); + _sum0 = _mm_unpacklo_epi64(_tmp0, _tmp1); + _sum1 = _mm_unpackhi_epi64(_tmp0, _tmp1); + _sum2 = _mm_unpacklo_epi64(_tmp2, _tmp3); + _sum3 = _mm_unpackhi_epi64(_tmp2, _tmp3); + } + _sum0 = _mm_add_epi32(_sum0, _sum1); + _sum2 = _mm_add_epi32(_sum2, _sum3); + + _sum0 = _mm_add_epi32(_sum0, _sum2); _mm_storeu_si128((__m128i*)output0_tm, _sum0); output0_tm += 4; diff --git a/src/layer/x86/convolution_sgemm_int8.h b/src/layer/x86/convolution_sgemm_int8.h index a533ce79d02d..34097f057b72 100644 --- a/src/layer/x86/convolution_sgemm_int8.h +++ b/src/layer/x86/convolution_sgemm_int8.h @@ -338,17 +338,8 @@ static void im2col_sgemm_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const if (nn4 > 0) { -#if __AVXVNNI__ || __AVX512VNNI__ - __m256i _sum10_02 = _mm256_setzero_si256(); - __m256i _sum30_22 = _mm256_setzero_si256(); -#else __m256i _sum10_02 = _mm256_setzero_si256(); - __m256i _sum01_13 = _mm256_setzero_si256(); - __m256i _sum11_03 = _mm256_setzero_si256(); __m256i _sum30_22 = _mm256_setzero_si256(); - __m256i _sum21_33 = _mm256_setzero_si256(); - __m256i _sum31_23 = _mm256_setzero_si256(); -#endif int j = 0; for (; j < nn4; j++) @@ -371,72 +362,21 @@ static void im2col_sgemm_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const _sum20_32 = _mm256_dpwssd_epi32(_sum20_32, _val23_16, _w01_16); _sum30_22 = _mm256_dpwssd_epi32(_sum30_22, _val32_16, _w01_16); #else - __m256i _sl00_11 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_11 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl10_01 = _mm256_mullo_epi16(_val10_16, _w01_16); - __m256i _sh10_01 = _mm256_mulhi_epi16(_val10_16, _w01_16); - __m256i _sl20_31 = _mm256_mullo_epi16(_val23_16, _w01_16); - __m256i _sh20_31 = _mm256_mulhi_epi16(_val23_16, _w01_16); - __m256i _sl30_21 = _mm256_mullo_epi16(_val32_16, _w01_16); - __m256i _sh30_21 = _mm256_mulhi_epi16(_val32_16, _w01_16); - - _sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_unpacklo_epi16(_sl00_11, _sh00_11)); - _sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_unpacklo_epi16(_sl10_01, _sh10_01)); - _sum01_13 = _mm256_add_epi32(_sum01_13, _mm256_unpackhi_epi16(_sl00_11, _sh00_11)); - _sum11_03 = _mm256_add_epi32(_sum11_03, _mm256_unpackhi_epi16(_sl10_01, _sh10_01)); - _sum20_32 = _mm256_add_epi32(_sum20_32, 
_mm256_unpacklo_epi16(_sl20_31, _sh20_31)); - _sum30_22 = _mm256_add_epi32(_sum30_22, _mm256_unpacklo_epi16(_sl30_21, _sh30_21)); - _sum21_33 = _mm256_add_epi32(_sum21_33, _mm256_unpackhi_epi16(_sl20_31, _sh20_31)); - _sum31_23 = _mm256_add_epi32(_sum31_23, _mm256_unpackhi_epi16(_sl30_21, _sh30_21)); + _sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_madd_epi16(_val10_16, _w01_16)); + _sum20_32 = _mm256_add_epi32(_sum20_32, _mm256_madd_epi16(_val23_16, _w01_16)); + _sum30_22 = _mm256_add_epi32(_sum30_22, _mm256_madd_epi16(_val32_16, _w01_16)); #endif tmpptr += 16; kptr0 += 16; } -#if __AVXVNNI__ || __AVX512VNNI__ _sum00_12 = _mm256_hadd_epi32(_sum00_12, _sum10_02); _sum20_32 = _mm256_hadd_epi32(_sum20_32, _sum30_22); _sum00_12 = _mm256_permute4x64_epi64(_sum00_12, _MM_SHUFFLE(2, 1, 3, 0)); _sum20_32 = _mm256_permute4x64_epi64(_sum20_32, _MM_SHUFFLE(2, 1, 3, 0)); -#else - // transpose 4x8 - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum00_12, _sum10_02); - _tmp1 = _mm256_unpacklo_epi32(_sum01_13, _sum11_03); - _tmp2 = _mm256_unpackhi_epi32(_sum00_12, _sum10_02); - _tmp3 = _mm256_unpackhi_epi32(_sum01_13, _sum11_03); - _sum00_12 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum10_02 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum01_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum11_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum20_32, _sum30_22); - _tmp1 = _mm256_unpacklo_epi32(_sum21_33, _sum31_23); - _tmp2 = _mm256_unpackhi_epi32(_sum20_32, _sum30_22); - _tmp3 = _mm256_unpackhi_epi32(_sum21_33, _sum31_23); - _sum20_32 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum30_22 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum21_33 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum31_23 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - - _sum00_12 = _mm256_add_epi32(_sum00_12, _sum10_02); - _sum01_13 = _mm256_add_epi32(_sum01_13, _sum11_03); - _sum00_12 = _mm256_add_epi32(_sum00_12, _sum01_13); - - _sum20_32 = _mm256_add_epi32(_sum20_32, _sum30_22); - _sum21_33 = _mm256_add_epi32(_sum21_33, _sum31_23); - _sum20_32 = _mm256_add_epi32(_sum20_32, _sum21_33); - - __m256i _perm_mask = _mm256_set_epi32(6, 4, 3, 1, 7, 5, 2, 0); - _sum00_12 = _mm256_permutevar8x32_epi32(_sum00_12, _perm_mask); - _sum20_32 = _mm256_permutevar8x32_epi32(_sum20_32, _perm_mask); -#endif } __m128i _sum00 = _mm256_extracti128_si256(_sum00_12, 0); @@ -532,25 +472,10 @@ static void im2col_sgemm_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const if (nn4 > 0) { #if __AVX2__ -#if __AVXVNNI__ || __AVX512VNNI__ - __m256i _sum10_02 = _mm256_setzero_si256(); -#else __m256i _sum10_02 = _mm256_setzero_si256(); - __m256i _sum01_13 = _mm256_setzero_si256(); - __m256i _sum11_03 = _mm256_setzero_si256(); -#endif -#else -#if __XOP__ - __m128i _sum01 = _mm_setzero_si128(); - __m128i _sum11 = _mm_setzero_si128(); #else __m128i _sum01 = _mm_setzero_si128(); - __m128i _sum02 = _mm_setzero_si128(); - __m128i _sum03 = _mm_setzero_si128(); __m128i _sum11 = _mm_setzero_si128(); - __m128i _sum12 = _mm_setzero_si128(); - __m128i _sum13 = _mm_setzero_si128(); -#endif #endif int j = 0; @@ -571,15 +496,8 @@ static void im2col_sgemm_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const _sum00_12 = _mm256_dpwssd_epi32(_sum00_12, _val01_16, _w01_16); _sum10_02 = _mm256_dpwssd_epi32(_sum10_02, _val10_16, _w01_16); #else - __m256i _sl00_11 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i 
_sh00_11 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl10_01 = _mm256_mullo_epi16(_val10_16, _w01_16); - __m256i _sh10_01 = _mm256_mulhi_epi16(_val10_16, _w01_16); - - _sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_unpacklo_epi16(_sl00_11, _sh00_11)); - _sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_unpacklo_epi16(_sl10_01, _sh10_01)); - _sum01_13 = _mm256_add_epi32(_sum01_13, _mm256_unpackhi_epi16(_sl00_11, _sh00_11)); - _sum11_03 = _mm256_add_epi32(_sum11_03, _mm256_unpackhi_epi16(_sl10_01, _sh10_01)); + _sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_madd_epi16(_val10_16, _w01_16)); #endif #else __m128i _val01 = _mm_loadl_epi64((const __m128i*)tmpptr); @@ -604,23 +522,10 @@ static void im2col_sgemm_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const _sum10 = _mm_maddd_epi16(_val1, _w0, _sum10); _sum11 = _mm_maddd_epi16(_val1, _w1, _sum11); #else - __m128i _sl00 = _mm_mullo_epi16(_val0, _w0); - __m128i _sh00 = _mm_mulhi_epi16(_val0, _w0); - __m128i _sl01 = _mm_mullo_epi16(_val0, _w1); - __m128i _sh01 = _mm_mulhi_epi16(_val0, _w1); - __m128i _sl10 = _mm_mullo_epi16(_val1, _w0); - __m128i _sh10 = _mm_mulhi_epi16(_val1, _w0); - __m128i _sl11 = _mm_mullo_epi16(_val1, _w1); - __m128i _sh11 = _mm_mulhi_epi16(_val1, _w1); - - _sum00 = _mm_add_epi32(_sum00, _mm_unpacklo_epi16(_sl00, _sh00)); - _sum01 = _mm_add_epi32(_sum01, _mm_unpackhi_epi16(_sl00, _sh00)); - _sum02 = _mm_add_epi32(_sum02, _mm_unpacklo_epi16(_sl01, _sh01)); - _sum03 = _mm_add_epi32(_sum03, _mm_unpackhi_epi16(_sl01, _sh01)); - _sum10 = _mm_add_epi32(_sum10, _mm_unpacklo_epi16(_sl10, _sh10)); - _sum11 = _mm_add_epi32(_sum11, _mm_unpackhi_epi16(_sl10, _sh10)); - _sum12 = _mm_add_epi32(_sum12, _mm_unpacklo_epi16(_sl11, _sh11)); - _sum13 = _mm_add_epi32(_sum13, _mm_unpackhi_epi16(_sl11, _sh11)); + _sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00); + _sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01); + _sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10); + _sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11); #endif #endif @@ -629,67 +534,26 @@ static void im2col_sgemm_int8_sse(const Mat& bottom_im2col, Mat& top_blob, const } #if __AVX2__ -#if __AVXVNNI__ || __AVX512VNNI__ _sum00_12 = _mm256_hadd_epi32(_sum00_12, _sum10_02); _sum00_12 = _mm256_permute4x64_epi64(_sum00_12, _MM_SHUFFLE(2, 1, 3, 0)); #else - // transpose 4x8 - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum00_12, _sum10_02); - _tmp1 = _mm256_unpacklo_epi32(_sum01_13, _sum11_03); - _tmp2 = _mm256_unpackhi_epi32(_sum00_12, _sum10_02); - _tmp3 = _mm256_unpackhi_epi32(_sum01_13, _sum11_03); - _sum00_12 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum10_02 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum01_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum11_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - - _sum00_12 = _mm256_add_epi32(_sum00_12, _sum10_02); - _sum01_13 = _mm256_add_epi32(_sum01_13, _sum11_03); - _sum00_12 = _mm256_add_epi32(_sum00_12, _sum01_13); - - __m256i _perm_mask = _mm256_set_epi32(6, 4, 3, 1, 7, 5, 2, 0); - _sum00_12 = _mm256_permutevar8x32_epi32(_sum00_12, _perm_mask); -#endif -#else -#if __XOP__ +#if __SSSE3__ _sum00 = _mm_hadd_epi32(_sum00, _sum01); _sum10 = _mm_hadd_epi32(_sum10, _sum11); #else - // transpose 4x4 - { - __m128i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm_unpacklo_epi32(_sum00, _sum01); - _tmp1 = _mm_unpacklo_epi32(_sum02, _sum03); - _tmp2 = _mm_unpackhi_epi32(_sum00, _sum01); - _tmp3 = 
_mm_unpackhi_epi32(_sum02, _sum03); - _sum00 = _mm_unpacklo_epi64(_tmp0, _tmp1); - _sum01 = _mm_unpackhi_epi64(_tmp0, _tmp1); - _sum02 = _mm_unpacklo_epi64(_tmp2, _tmp3); - _sum03 = _mm_unpackhi_epi64(_tmp2, _tmp3); - } - { - __m128i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm_unpacklo_epi32(_sum10, _sum11); - _tmp1 = _mm_unpacklo_epi32(_sum12, _sum13); - _tmp2 = _mm_unpackhi_epi32(_sum10, _sum11); - _tmp3 = _mm_unpackhi_epi32(_sum12, _sum13); - _sum10 = _mm_unpacklo_epi64(_tmp0, _tmp1); - _sum11 = _mm_unpackhi_epi64(_tmp0, _tmp1); - _sum12 = _mm_unpacklo_epi64(_tmp2, _tmp3); - _sum13 = _mm_unpackhi_epi64(_tmp2, _tmp3); - } + __m128i _sum00_sh = _mm_shuffle_epi32(_sum00, 216); + __m128i _sum01_sh = _mm_shuffle_epi32(_sum01, 216); + __m128i _sum10_sh = _mm_shuffle_epi32(_sum10, 216); + __m128i _sum11_sh = _mm_shuffle_epi32(_sum11, 216); + + _sum00 = _mm_unpacklo_epi64(_sum00_sh, _sum01_sh); + _sum01 = _mm_unpackhi_epi64(_sum00_sh, _sum01_sh); + _sum10 = _mm_unpacklo_epi64(_sum10_sh, _sum11_sh); + _sum11 = _mm_unpackhi_epi64(_sum10_sh, _sum11_sh); _sum00 = _mm_add_epi32(_sum00, _sum01); - _sum02 = _mm_add_epi32(_sum02, _sum03); _sum10 = _mm_add_epi32(_sum10, _sum11); - _sum12 = _mm_add_epi32(_sum12, _sum13); - - _sum00 = _mm_add_epi32(_sum00, _sum02); - _sum10 = _mm_add_epi32(_sum10, _sum12); #endif #endif } diff --git a/src/layer/x86/convolution_sgemm_pack16.h b/src/layer/x86/convolution_sgemm_pack16.h index 07ea17ce26cf..2612c4005086 100644 --- a/src/layer/x86/convolution_sgemm_pack16.h +++ b/src/layer/x86/convolution_sgemm_pack16.h @@ -67,57 +67,7 @@ static void im2col_sgemm_pack16_avx512(const Mat& bottom_im2col, Mat& top_blob, __m512 _ra = _mm512_loadu_ps(img0 + 16 * 10); __m512 _rb = _mm512_loadu_ps(img0 + 16 * 11); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); - __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5); - __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); - __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); - __m512 _tmp8 = _mm512_unpacklo_ps(_r8, _r9); - __m512 _tmp9 = _mm512_unpackhi_ps(_r8, _r9); - __m512 _tmpa = _mm512_unpacklo_ps(_ra, _rb); - __m512 _tmpb = _mm512_unpackhi_ps(_ra, _rb); - - __m512 _tmpc = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpg = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmph = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpi = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpj = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpk = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpl = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpm = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpn = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmpm, _tmpf, 
_MM_SHUFFLE(2, 0, 2, 0)); - _tmp5 = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp6 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp8 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp9 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(3, 1, 3, 1)); - _tmpa = _mm512_shuffle_f32x4(_tmpm, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - _tmpb = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _r4 = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(2, 0, 2, 0)); - _r5 = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(2, 0, 2, 0)); - _r6 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r7 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _r8 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _r9 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); - _ra = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(3, 1, 3, 1)); - _rb = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x12_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); @@ -164,41 +114,7 @@ static void im2col_sgemm_pack16_avx512(const Mat& bottom_im2col, Mat& top_blob, __m512 _r6 = _mm512_loadu_ps(img0 + 16 * 6); __m512 _r7 = _mm512_loadu_ps(img0 + 16 * 7); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); - __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5); - __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); - __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); - - __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _r4 = _mm512_shuffle_f32x4(_tmp0, 
_tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); @@ -237,25 +153,7 @@ static void im2col_sgemm_pack16_avx512(const Mat& bottom_im2col, Mat& top_blob, __m512 _r2 = _mm512_loadu_ps(img0 + 16 * 2); __m512 _r3 = _mm512_loadu_ps(img0 + 16 * 3); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - - __m512 _tmp4 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp5 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmp6 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp7 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x4_ps(_r0, _r1, _r2, _r3); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); @@ -288,14 +186,7 @@ static void im2col_sgemm_pack16_avx512(const Mat& bottom_im2col, Mat& top_blob, __m512 _r0 = _mm512_loadu_ps(img0); __m512 _r1 = _mm512_loadu_ps(img0 + 16); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - - __m512 _tmp2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - __m512 _tmp3 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x2_ps(_r0, _r1); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); diff --git a/src/layer/x86/convolution_sgemm_pack16to1.h b/src/layer/x86/convolution_sgemm_pack16to1.h index 7b26ef27a404..a8a823a34b3c 100644 --- a/src/layer/x86/convolution_sgemm_pack16to1.h +++ b/src/layer/x86/convolution_sgemm_pack16to1.h @@ -66,7 +66,7 @@ static void im2col_sgemm_pack16to1_avx512(const Mat& bottom_im2col, Mat& top_blo __m512 _re = _mm512_loadu_ps(img0 + 16 * 14); __m512 _rf = _mm512_loadu_ps(img0 + 16 * 15); - transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); @@ -117,41 +117,7 @@ static void im2col_sgemm_pack16to1_avx512(const Mat& bottom_im2col, Mat& top_blo __m512 _r6 = _mm512_loadu_ps(img0 + 16 * 6); __m512 _r7 = _mm512_loadu_ps(img0 + 16 * 7); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); - __m512 _tmp5 = _mm512_unpackhi_ps(_r4, 
_r5); - __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); - __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); - - __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); diff --git a/src/layer/x86/convolution_sgemm_pack16to4.h b/src/layer/x86/convolution_sgemm_pack16to4.h index 1930128c3d2b..e35cc149244a 100644 --- a/src/layer/x86/convolution_sgemm_pack16to4.h +++ b/src/layer/x86/convolution_sgemm_pack16to4.h @@ -59,41 +59,7 @@ static void im2col_sgemm_pack16to4_avx512(const Mat& bottom_im2col, Mat& top_blo __m512 _r6 = _mm512_loadu_ps(img0 + 16 * 6); __m512 _r7 = _mm512_loadu_ps(img0 + 16 * 7); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); - __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5); - __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); - __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); - - __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, 
_MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); @@ -132,25 +98,7 @@ static void im2col_sgemm_pack16to4_avx512(const Mat& bottom_im2col, Mat& top_blo __m512 _r2 = _mm512_loadu_ps(img0 + 16 * 2); __m512 _r3 = _mm512_loadu_ps(img0 + 16 * 3); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - - __m512 _tmp4 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp5 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmp6 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp7 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x4_ps(_r0, _r1, _r2, _r3); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); diff --git a/src/layer/x86/convolution_sgemm_pack16to8.h b/src/layer/x86/convolution_sgemm_pack16to8.h index ec293efb9f69..397fa0296ba4 100644 --- a/src/layer/x86/convolution_sgemm_pack16to8.h +++ b/src/layer/x86/convolution_sgemm_pack16to8.h @@ -59,41 +59,7 @@ static void im2col_sgemm_pack16to8_avx512(const Mat& bottom_im2col, Mat& top_blo __m512 _r6 = _mm512_loadu_ps(img0 + 16 * 6); __m512 _r7 = _mm512_loadu_ps(img0 + 16 * 7); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); - __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5); - __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); - __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); - - __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpc = 
_mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); @@ -132,25 +98,7 @@ static void im2col_sgemm_pack16to8_avx512(const Mat& bottom_im2col, Mat& top_blo __m512 _r2 = _mm512_loadu_ps(img0 + 16 * 2); __m512 _r3 = _mm512_loadu_ps(img0 + 16 * 3); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - - __m512 _tmp4 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp5 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmp6 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp7 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x4_ps(_r0, _r1, _r2, _r3); _mm512_storeu_ps(tmpptr, _r0); _mm512_storeu_ps(tmpptr + 16, _r1); diff --git a/src/layer/x86/convolution_sgemm_pack1to4_int8.h b/src/layer/x86/convolution_sgemm_pack1to4_int8.h index ba567ce3354a..fd084987277f 100644 --- a/src/layer/x86/convolution_sgemm_pack1to4_int8.h +++ b/src/layer/x86/convolution_sgemm_pack1to4_int8.h @@ -301,17 +301,8 @@ static void im2col_sgemm_pack1to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl if (nn4 > 0) { -#if __AVXVNNI__ || __AVX512VNNI__ - __m256i _sum10_02 = _mm256_setzero_si256(); - __m256i _sum30_22 = _mm256_setzero_si256(); -#else __m256i _sum10_02 = _mm256_setzero_si256(); - __m256i _sum01_13 = _mm256_setzero_si256(); - __m256i _sum11_03 = _mm256_setzero_si256(); __m256i _sum30_22 = 
_mm256_setzero_si256(); - __m256i _sum21_33 = _mm256_setzero_si256(); - __m256i _sum31_23 = _mm256_setzero_si256(); -#endif int j = 0; for (; j < nn4; j++) @@ -334,72 +325,21 @@ static void im2col_sgemm_pack1to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum20_32 = _mm256_dpwssd_epi32(_sum20_32, _val23_16, _w01_16); _sum30_22 = _mm256_dpwssd_epi32(_sum30_22, _val32_16, _w01_16); #else - __m256i _sl00_11 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_11 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl10_01 = _mm256_mullo_epi16(_val10_16, _w01_16); - __m256i _sh10_01 = _mm256_mulhi_epi16(_val10_16, _w01_16); - __m256i _sl20_31 = _mm256_mullo_epi16(_val23_16, _w01_16); - __m256i _sh20_31 = _mm256_mulhi_epi16(_val23_16, _w01_16); - __m256i _sl30_21 = _mm256_mullo_epi16(_val32_16, _w01_16); - __m256i _sh30_21 = _mm256_mulhi_epi16(_val32_16, _w01_16); - - _sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_unpacklo_epi16(_sl00_11, _sh00_11)); - _sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_unpacklo_epi16(_sl10_01, _sh10_01)); - _sum01_13 = _mm256_add_epi32(_sum01_13, _mm256_unpackhi_epi16(_sl00_11, _sh00_11)); - _sum11_03 = _mm256_add_epi32(_sum11_03, _mm256_unpackhi_epi16(_sl10_01, _sh10_01)); - _sum20_32 = _mm256_add_epi32(_sum20_32, _mm256_unpacklo_epi16(_sl20_31, _sh20_31)); - _sum30_22 = _mm256_add_epi32(_sum30_22, _mm256_unpacklo_epi16(_sl30_21, _sh30_21)); - _sum21_33 = _mm256_add_epi32(_sum21_33, _mm256_unpackhi_epi16(_sl20_31, _sh20_31)); - _sum31_23 = _mm256_add_epi32(_sum31_23, _mm256_unpackhi_epi16(_sl30_21, _sh30_21)); + _sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_madd_epi16(_val10_16, _w01_16)); + _sum20_32 = _mm256_add_epi32(_sum20_32, _mm256_madd_epi16(_val23_16, _w01_16)); + _sum30_22 = _mm256_add_epi32(_sum30_22, _mm256_madd_epi16(_val32_16, _w01_16)); #endif tmpptr += 16; kptr0 += 16; } -#if __AVXVNNI__ || __AVX512VNNI__ _sum00_12 = _mm256_hadd_epi32(_sum00_12, _sum10_02); _sum20_32 = _mm256_hadd_epi32(_sum20_32, _sum30_22); _sum00_12 = _mm256_permute4x64_epi64(_sum00_12, _MM_SHUFFLE(2, 1, 3, 0)); _sum20_32 = _mm256_permute4x64_epi64(_sum20_32, _MM_SHUFFLE(2, 1, 3, 0)); -#else - // transpose 4x8 - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum00_12, _sum10_02); - _tmp1 = _mm256_unpacklo_epi32(_sum01_13, _sum11_03); - _tmp2 = _mm256_unpackhi_epi32(_sum00_12, _sum10_02); - _tmp3 = _mm256_unpackhi_epi32(_sum01_13, _sum11_03); - _sum00_12 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum10_02 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum01_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum11_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum20_32, _sum30_22); - _tmp1 = _mm256_unpacklo_epi32(_sum21_33, _sum31_23); - _tmp2 = _mm256_unpackhi_epi32(_sum20_32, _sum30_22); - _tmp3 = _mm256_unpackhi_epi32(_sum21_33, _sum31_23); - _sum20_32 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum30_22 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum21_33 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum31_23 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - - _sum00_12 = _mm256_add_epi32(_sum00_12, _sum10_02); - _sum01_13 = _mm256_add_epi32(_sum01_13, _sum11_03); - _sum00_12 = _mm256_add_epi32(_sum00_12, _sum01_13); - - _sum20_32 = _mm256_add_epi32(_sum20_32, _sum30_22); - _sum21_33 = _mm256_add_epi32(_sum21_33, _sum31_23); - _sum20_32 = _mm256_add_epi32(_sum20_32, _sum21_33); - - __m256i _perm_mask = 
_mm256_set_epi32(6, 4, 3, 1, 7, 5, 2, 0); - _sum00_12 = _mm256_permutevar8x32_epi32(_sum00_12, _perm_mask); - _sum20_32 = _mm256_permutevar8x32_epi32(_sum20_32, _perm_mask); -#endif } __m128i _sum00 = _mm256_extracti128_si256(_sum00_12, 0); @@ -458,25 +398,10 @@ static void im2col_sgemm_pack1to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl if (nn4 > 0) { #if __AVX2__ -#if __AVXVNNI__ || __AVX512VNNI__ - __m256i _sum10_02 = _mm256_setzero_si256(); -#else __m256i _sum10_02 = _mm256_setzero_si256(); - __m256i _sum01_13 = _mm256_setzero_si256(); - __m256i _sum11_03 = _mm256_setzero_si256(); -#endif -#else -#if __XOP__ - __m128i _sum01 = _mm_setzero_si128(); - __m128i _sum11 = _mm_setzero_si128(); #else __m128i _sum01 = _mm_setzero_si128(); - __m128i _sum02 = _mm_setzero_si128(); - __m128i _sum03 = _mm_setzero_si128(); __m128i _sum11 = _mm_setzero_si128(); - __m128i _sum12 = _mm_setzero_si128(); - __m128i _sum13 = _mm_setzero_si128(); -#endif #endif int j = 0; @@ -497,15 +422,8 @@ static void im2col_sgemm_pack1to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum00_12 = _mm256_dpwssd_epi32(_sum00_12, _val01_16, _w01_16); _sum10_02 = _mm256_dpwssd_epi32(_sum10_02, _val10_16, _w01_16); #else - __m256i _sl00_11 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_11 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl10_01 = _mm256_mullo_epi16(_val10_16, _w01_16); - __m256i _sh10_01 = _mm256_mulhi_epi16(_val10_16, _w01_16); - - _sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_unpacklo_epi16(_sl00_11, _sh00_11)); - _sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_unpacklo_epi16(_sl10_01, _sh10_01)); - _sum01_13 = _mm256_add_epi32(_sum01_13, _mm256_unpackhi_epi16(_sl00_11, _sh00_11)); - _sum11_03 = _mm256_add_epi32(_sum11_03, _mm256_unpackhi_epi16(_sl10_01, _sh10_01)); + _sum00_12 = _mm256_add_epi32(_sum00_12, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum10_02 = _mm256_add_epi32(_sum10_02, _mm256_madd_epi16(_val10_16, _w01_16)); #endif #else __m128i _val01 = _mm_loadl_epi64((const __m128i*)tmpptr); @@ -530,23 +448,10 @@ static void im2col_sgemm_pack1to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum10 = _mm_maddd_epi16(_val1, _w0, _sum10); _sum11 = _mm_maddd_epi16(_val1, _w1, _sum11); #else - __m128i _sl00 = _mm_mullo_epi16(_val0, _w0); - __m128i _sh00 = _mm_mulhi_epi16(_val0, _w0); - __m128i _sl01 = _mm_mullo_epi16(_val0, _w1); - __m128i _sh01 = _mm_mulhi_epi16(_val0, _w1); - __m128i _sl10 = _mm_mullo_epi16(_val1, _w0); - __m128i _sh10 = _mm_mulhi_epi16(_val1, _w0); - __m128i _sl11 = _mm_mullo_epi16(_val1, _w1); - __m128i _sh11 = _mm_mulhi_epi16(_val1, _w1); - - _sum00 = _mm_add_epi32(_sum00, _mm_unpacklo_epi16(_sl00, _sh00)); - _sum01 = _mm_add_epi32(_sum01, _mm_unpackhi_epi16(_sl00, _sh00)); - _sum02 = _mm_add_epi32(_sum02, _mm_unpacklo_epi16(_sl01, _sh01)); - _sum03 = _mm_add_epi32(_sum03, _mm_unpackhi_epi16(_sl01, _sh01)); - _sum10 = _mm_add_epi32(_sum10, _mm_unpacklo_epi16(_sl10, _sh10)); - _sum11 = _mm_add_epi32(_sum11, _mm_unpackhi_epi16(_sl10, _sh10)); - _sum12 = _mm_add_epi32(_sum12, _mm_unpacklo_epi16(_sl11, _sh11)); - _sum13 = _mm_add_epi32(_sum13, _mm_unpackhi_epi16(_sl11, _sh11)); + _sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00); + _sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01); + _sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10); + _sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11); #endif #endif @@ -555,67 +460,26 @@ static void im2col_sgemm_pack1to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl } #if __AVX2__ -#if __AVXVNNI__ 
|| __AVX512VNNI__ _sum00_12 = _mm256_hadd_epi32(_sum00_12, _sum10_02); _sum00_12 = _mm256_permute4x64_epi64(_sum00_12, _MM_SHUFFLE(2, 1, 3, 0)); #else - // transpose 4x8 - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum00_12, _sum10_02); - _tmp1 = _mm256_unpacklo_epi32(_sum01_13, _sum11_03); - _tmp2 = _mm256_unpackhi_epi32(_sum00_12, _sum10_02); - _tmp3 = _mm256_unpackhi_epi32(_sum01_13, _sum11_03); - _sum00_12 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum10_02 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum01_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum11_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - - _sum00_12 = _mm256_add_epi32(_sum00_12, _sum10_02); - _sum01_13 = _mm256_add_epi32(_sum01_13, _sum11_03); - _sum00_12 = _mm256_add_epi32(_sum00_12, _sum01_13); - - __m256i _perm_mask = _mm256_set_epi32(6, 4, 3, 1, 7, 5, 2, 0); - _sum00_12 = _mm256_permutevar8x32_epi32(_sum00_12, _perm_mask); -#endif -#else -#if __XOP__ +#if __SSSE3__ _sum00 = _mm_hadd_epi32(_sum00, _sum01); _sum10 = _mm_hadd_epi32(_sum10, _sum11); #else - // transpose 4x4 - { - __m128i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm_unpacklo_epi32(_sum00, _sum01); - _tmp1 = _mm_unpacklo_epi32(_sum02, _sum03); - _tmp2 = _mm_unpackhi_epi32(_sum00, _sum01); - _tmp3 = _mm_unpackhi_epi32(_sum02, _sum03); - _sum00 = _mm_unpacklo_epi64(_tmp0, _tmp1); - _sum01 = _mm_unpackhi_epi64(_tmp0, _tmp1); - _sum02 = _mm_unpacklo_epi64(_tmp2, _tmp3); - _sum03 = _mm_unpackhi_epi64(_tmp2, _tmp3); - } - { - __m128i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm_unpacklo_epi32(_sum10, _sum11); - _tmp1 = _mm_unpacklo_epi32(_sum12, _sum13); - _tmp2 = _mm_unpackhi_epi32(_sum10, _sum11); - _tmp3 = _mm_unpackhi_epi32(_sum12, _sum13); - _sum10 = _mm_unpacklo_epi64(_tmp0, _tmp1); - _sum11 = _mm_unpackhi_epi64(_tmp0, _tmp1); - _sum12 = _mm_unpacklo_epi64(_tmp2, _tmp3); - _sum13 = _mm_unpackhi_epi64(_tmp2, _tmp3); - } + __m128i _sum00_sh = _mm_shuffle_epi32(_sum00, 216); + __m128i _sum01_sh = _mm_shuffle_epi32(_sum01, 216); + __m128i _sum10_sh = _mm_shuffle_epi32(_sum10, 216); + __m128i _sum11_sh = _mm_shuffle_epi32(_sum11, 216); + + _sum00 = _mm_unpacklo_epi64(_sum00_sh, _sum01_sh); + _sum01 = _mm_unpackhi_epi64(_sum00_sh, _sum01_sh); + _sum10 = _mm_unpacklo_epi64(_sum10_sh, _sum11_sh); + _sum11 = _mm_unpackhi_epi64(_sum10_sh, _sum11_sh); _sum00 = _mm_add_epi32(_sum00, _sum01); - _sum02 = _mm_add_epi32(_sum02, _sum03); _sum10 = _mm_add_epi32(_sum10, _sum11); - _sum12 = _mm_add_epi32(_sum12, _sum13); - - _sum00 = _mm_add_epi32(_sum00, _sum02); - _sum10 = _mm_add_epi32(_sum10, _sum12); #endif #endif } diff --git a/src/layer/x86/convolution_sgemm_pack8.h b/src/layer/x86/convolution_sgemm_pack8.h index af5b9cddf2ad..64f0dc2fa4cd 100644 --- a/src/layer/x86/convolution_sgemm_pack8.h +++ b/src/layer/x86/convolution_sgemm_pack8.h @@ -67,42 +67,7 @@ static void im2col_sgemm_pack8_avx(const Mat& bottom_im2col, Mat& top_blob, cons __m256 _ra = _mm256_load_ps(img0 + 8 * 10); __m256 _rb = _mm256_load_ps(img0 + 8 * 11); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); - __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); - __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); - __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); - __m256 _tmp8 = _mm256_unpacklo_ps(_r8, _r9); - __m256 _tmp9 = _mm256_unpackhi_ps(_r8, _r9); - __m256 _tmpa = _mm256_unpacklo_ps(_ra, _rb); - __m256 _tmpb = 
_mm256_unpackhi_ps(_ra, _rb); - __m256 _tmpc = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpd = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpe = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpf = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpg = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmph = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpi = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpj = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpk = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpl = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpm = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpn = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmpc, _tmpg, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmpk, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2f128_ps(_tmpe, _tmpi, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2f128_ps(_tmpm, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); - _r5 = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 2, 0, 0)); - _r6 = _mm256_permute2f128_ps(_tmpc, _tmpg, _MM_SHUFFLE(0, 3, 0, 1)); - _r7 = _mm256_permute2f128_ps(_tmpk, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); - _r8 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 3, 0, 1)); - _r9 = _mm256_permute2f128_ps(_tmpe, _tmpi, _MM_SHUFFLE(0, 3, 0, 1)); - _ra = _mm256_permute2f128_ps(_tmpm, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); - _rb = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x12_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); @@ -149,30 +114,7 @@ static void im2col_sgemm_pack8_avx(const Mat& bottom_im2col, Mat& top_blob, cons __m256 _r6 = _mm256_load_ps(img0 + 8 * 6); __m256 _r7 = _mm256_load_ps(img0 + 8 * 7); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); - __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); - __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); - __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); - __m256 _tmp8 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp9 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpa = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpb = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpc = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpd = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpe = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpf = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 3, 0, 1)); - _r5 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); - _r6 = _mm256_permute2f128_ps(_tmpa, 
_tmpe, _MM_SHUFFLE(0, 3, 0, 1)); - _r7 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); @@ -211,18 +153,7 @@ static void im2col_sgemm_pack8_avx(const Mat& bottom_im2col, Mat& top_blob, cons __m256 _r2 = _mm256_load_ps(img0 + 8 * 2); __m256 _r3 = _mm256_load_ps(img0 + 8 * 3); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp5 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmp6 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp7 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 3, 0, 1)); - _r3 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x4_ps(_r0, _r1, _r2, _r3); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); @@ -255,10 +186,7 @@ static void im2col_sgemm_pack8_avx(const Mat& bottom_im2col, Mat& top_blob, cons __m256 _r0 = _mm256_load_ps(img0); __m256 _r1 = _mm256_load_ps(img0 + 8); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - _r0 = _mm256_permute2f128_ps(_tmp0, _tmp1, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x2_ps(_r0, _r1); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); diff --git a/src/layer/x86/convolution_sgemm_pack8to1.h b/src/layer/x86/convolution_sgemm_pack8to1.h index e23e13363f1c..c554599b6125 100644 --- a/src/layer/x86/convolution_sgemm_pack8to1.h +++ b/src/layer/x86/convolution_sgemm_pack8to1.h @@ -56,30 +56,7 @@ static void im2col_sgemm_pack8to1_avx(const Mat& bottom_im2col, Mat& top_blob, c __m256 _r6 = _mm256_load_ps(img0 + 8 * 6); __m256 _r7 = _mm256_load_ps(img0 + 8 * 7); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); - __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); - __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); - __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); - __m256 _tmp8 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp9 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpa = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpb = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpc = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpd = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpe = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpf = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 3, 0, 1)); - 
_r5 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); - _r6 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 3, 0, 1)); - _r7 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); diff --git a/src/layer/x86/convolution_sgemm_pack8to16.h b/src/layer/x86/convolution_sgemm_pack8to16.h index 03e9633142f4..9e1fbe8f348f 100644 --- a/src/layer/x86/convolution_sgemm_pack8to16.h +++ b/src/layer/x86/convolution_sgemm_pack8to16.h @@ -57,7 +57,7 @@ static void im2col_sgemm_pack8to16_avx512(const Mat& bottom_im2col, Mat& top_blo __m256 _r6 = _mm256_load_ps(img0 + 8 * 6); __m256 _r7 = _mm256_load_ps(img0 + 8 * 7); - transpose8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); + transpose8x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); diff --git a/src/layer/x86/convolution_sgemm_pack8to1_int8.h b/src/layer/x86/convolution_sgemm_pack8to1_int8.h index 9c080ffd5b9d..b76b6e26f182 100644 --- a/src/layer/x86/convolution_sgemm_pack8to1_int8.h +++ b/src/layer/x86/convolution_sgemm_pack8to1_int8.h @@ -225,23 +225,10 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01_16, _w23_16); _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10_16, _w23_16); #else - __m256i _sl00_11 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_11 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl10_01 = _mm256_mullo_epi16(_val10_16, _w01_16); - __m256i _sh10_01 = _mm256_mulhi_epi16(_val10_16, _w01_16); - __m256i _sl02_13 = _mm256_mullo_epi16(_val01_16, _w23_16); - __m256i _sh02_13 = _mm256_mulhi_epi16(_val01_16, _w23_16); - __m256i _sl12_03 = _mm256_mullo_epi16(_val10_16, _w23_16); - __m256i _sh12_03 = _mm256_mulhi_epi16(_val10_16, _w23_16); - - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_unpacklo_epi16(_sl00_11, _sh00_11)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_unpacklo_epi16(_sl10_01, _sh10_01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_unpacklo_epi16(_sl02_13, _sh02_13)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_unpacklo_epi16(_sl12_03, _sh12_03)); - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_unpackhi_epi16(_sl00_11, _sh00_11)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_unpackhi_epi16(_sl10_01, _sh10_01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_unpackhi_epi16(_sl02_13, _sh02_13)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_unpackhi_epi16(_sl12_03, _sh12_03)); + _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10_16, _w01_16)); + _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01_16, _w23_16)); + _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10_16, _w23_16)); #endif __m128i _val23 = _mm_loadu_si128((const __m128i*)(tmpptr + 16)); @@ -254,23 +241,10 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum06_17 = _mm256_dpwssd_epi32(_sum06_17, _val23_16, _w23_16); _sum16_07 = _mm256_dpwssd_epi32(_sum16_07, _val32_16, _w23_16); #else - __m256i _sl04_15 = _mm256_mullo_epi16(_val23_16, _w01_16); - __m256i _sh04_15 = _mm256_mulhi_epi16(_val23_16, _w01_16); - __m256i _sl14_05 = _mm256_mullo_epi16(_val32_16, _w01_16); - __m256i _sh14_05 = _mm256_mulhi_epi16(_val32_16, _w01_16); - __m256i _sl06_17 = _mm256_mullo_epi16(_val23_16, 
_w23_16); - __m256i _sh06_17 = _mm256_mulhi_epi16(_val23_16, _w23_16); - __m256i _sl16_07 = _mm256_mullo_epi16(_val32_16, _w23_16); - __m256i _sh16_07 = _mm256_mulhi_epi16(_val32_16, _w23_16); - - _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_unpacklo_epi16(_sl04_15, _sh04_15)); - _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_unpacklo_epi16(_sl14_05, _sh14_05)); - _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_unpacklo_epi16(_sl06_17, _sh06_17)); - _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_unpacklo_epi16(_sl16_07, _sh16_07)); - _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_unpackhi_epi16(_sl04_15, _sh04_15)); - _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_unpackhi_epi16(_sl14_05, _sh14_05)); - _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_unpackhi_epi16(_sl06_17, _sh06_17)); - _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_unpackhi_epi16(_sl16_07, _sh16_07)); + _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_madd_epi16(_val23_16, _w01_16)); + _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_madd_epi16(_val32_16, _w01_16)); + _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_madd_epi16(_val23_16, _w23_16)); + _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_madd_epi16(_val32_16, _w23_16)); #endif tmpptr += 32; @@ -386,23 +360,10 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01_16, _w23_16); _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10_16, _w23_16); #else - __m256i _sl00_11 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_11 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl10_01 = _mm256_mullo_epi16(_val10_16, _w01_16); - __m256i _sh10_01 = _mm256_mulhi_epi16(_val10_16, _w01_16); - __m256i _sl02_13 = _mm256_mullo_epi16(_val01_16, _w23_16); - __m256i _sh02_13 = _mm256_mulhi_epi16(_val01_16, _w23_16); - __m256i _sl12_03 = _mm256_mullo_epi16(_val10_16, _w23_16); - __m256i _sh12_03 = _mm256_mulhi_epi16(_val10_16, _w23_16); - - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_unpacklo_epi16(_sl00_11, _sh00_11)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_unpacklo_epi16(_sl10_01, _sh10_01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_unpacklo_epi16(_sl02_13, _sh02_13)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_unpacklo_epi16(_sl12_03, _sh12_03)); - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_unpackhi_epi16(_sl00_11, _sh00_11)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_unpackhi_epi16(_sl10_01, _sh10_01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_unpackhi_epi16(_sl02_13, _sh02_13)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_unpackhi_epi16(_sl12_03, _sh12_03)); + _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10_16, _w01_16)); + _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01_16, _w23_16)); + _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10_16, _w23_16)); #endif #else __m128i _val01 = _mm_loadu_si128((const __m128i*)tmpptr); @@ -429,39 +390,14 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum12 = _mm_maddd_epi16(_val1, _w2, _sum12); _sum13 = _mm_maddd_epi16(_val1, _w3, _sum13); #else - __m128i _sl00 = _mm_mullo_epi16(_val0, _w0); - __m128i _sh00 = _mm_mulhi_epi16(_val0, _w0); - __m128i _sl01 = _mm_mullo_epi16(_val0, _w1); - __m128i _sh01 = _mm_mulhi_epi16(_val0, _w1); - __m128i _sl02 = _mm_mullo_epi16(_val0, _w2); - __m128i _sh02 = _mm_mulhi_epi16(_val0, _w2); - __m128i _sl03 = 
_mm_mullo_epi16(_val0, _w3); - __m128i _sh03 = _mm_mulhi_epi16(_val0, _w3); - __m128i _sl10 = _mm_mullo_epi16(_val1, _w0); - __m128i _sh10 = _mm_mulhi_epi16(_val1, _w0); - __m128i _sl11 = _mm_mullo_epi16(_val1, _w1); - __m128i _sh11 = _mm_mulhi_epi16(_val1, _w1); - __m128i _sl12 = _mm_mullo_epi16(_val1, _w2); - __m128i _sh12 = _mm_mulhi_epi16(_val1, _w2); - __m128i _sl13 = _mm_mullo_epi16(_val1, _w3); - __m128i _sh13 = _mm_mulhi_epi16(_val1, _w3); - - _sum00 = _mm_add_epi32(_sum00, _mm_unpacklo_epi16(_sl00, _sh00)); - _sum01 = _mm_add_epi32(_sum01, _mm_unpacklo_epi16(_sl01, _sh01)); - _sum02 = _mm_add_epi32(_sum02, _mm_unpacklo_epi16(_sl02, _sh02)); - _sum03 = _mm_add_epi32(_sum03, _mm_unpacklo_epi16(_sl03, _sh03)); - _sum00 = _mm_add_epi32(_sum00, _mm_unpackhi_epi16(_sl00, _sh00)); - _sum01 = _mm_add_epi32(_sum01, _mm_unpackhi_epi16(_sl01, _sh01)); - _sum02 = _mm_add_epi32(_sum02, _mm_unpackhi_epi16(_sl02, _sh02)); - _sum03 = _mm_add_epi32(_sum03, _mm_unpackhi_epi16(_sl03, _sh03)); - _sum10 = _mm_add_epi32(_sum10, _mm_unpacklo_epi16(_sl10, _sh10)); - _sum11 = _mm_add_epi32(_sum11, _mm_unpacklo_epi16(_sl11, _sh11)); - _sum12 = _mm_add_epi32(_sum12, _mm_unpacklo_epi16(_sl12, _sh12)); - _sum13 = _mm_add_epi32(_sum13, _mm_unpacklo_epi16(_sl13, _sh13)); - _sum10 = _mm_add_epi32(_sum10, _mm_unpackhi_epi16(_sl10, _sh10)); - _sum11 = _mm_add_epi32(_sum11, _mm_unpackhi_epi16(_sl11, _sh11)); - _sum12 = _mm_add_epi32(_sum12, _mm_unpackhi_epi16(_sl12, _sh12)); - _sum13 = _mm_add_epi32(_sum13, _mm_unpackhi_epi16(_sl13, _sh13)); + _sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00); + _sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01); + _sum02 = _mm_add_epi32(_mm_madd_epi16(_val0, _w2), _sum02); + _sum03 = _mm_add_epi32(_mm_madd_epi16(_val0, _w3), _sum03); + _sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10); + _sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11); + _sum12 = _mm_add_epi32(_mm_madd_epi16(_val1, _w2), _sum12); + _sum13 = _mm_add_epi32(_mm_madd_epi16(_val1, _w3), _sum13); #endif #endif @@ -582,15 +518,8 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _valval, _w01_16); _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _valval, _w23_16); #else - __m256i _sl0_1 = _mm256_mullo_epi16(_valval, _w01_16); - __m256i _sh0_1 = _mm256_mulhi_epi16(_valval, _w01_16); - __m256i _sl2_3 = _mm256_mullo_epi16(_valval, _w23_16); - __m256i _sh2_3 = _mm256_mulhi_epi16(_valval, _w23_16); - - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl0_1, _sh0_1)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl2_3, _sh2_3)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl0_1, _sh0_1)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl2_3, _sh2_3)); + _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_madd_epi16(_valval, _w01_16)); + _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_madd_epi16(_valval, _w23_16)); #endif #else __m128i _val = _mm_loadl_epi64((const __m128i*)tmpptr); @@ -615,23 +544,10 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum2 = _mm_maddd_epi16(_val, _w2, _sum2); _sum3 = _mm_maddd_epi16(_val, _w3, _sum3); #else - __m128i _sl0 = _mm_mullo_epi16(_val, _w0); - __m128i _sh0 = _mm_mulhi_epi16(_val, _w0); - __m128i _sl1 = _mm_mullo_epi16(_val, _w1); - __m128i _sh1 = _mm_mulhi_epi16(_val, _w1); - __m128i _sl2 = _mm_mullo_epi16(_val, _w2); - __m128i _sh2 = _mm_mulhi_epi16(_val, _w2); - __m128i _sl3 = _mm_mullo_epi16(_val, _w3); - 
__m128i _sh3 = _mm_mulhi_epi16(_val, _w3); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpacklo_epi16(_sl1, _sh1)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl2, _sh2)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpacklo_epi16(_sl3, _sh3)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpackhi_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl1, _sh1)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpackhi_epi16(_sl2, _sh2)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl3, _sh3)); + _sum0 = _mm_add_epi32(_mm_madd_epi16(_val, _w0), _sum0); + _sum1 = _mm_add_epi32(_mm_madd_epi16(_val, _w1), _sum1); + _sum2 = _mm_add_epi32(_mm_madd_epi16(_val, _w2), _sum2); + _sum3 = _mm_add_epi32(_mm_madd_epi16(_val, _w3), _sum3); #endif #endif @@ -694,10 +610,8 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl int nn = inch * maxk; // inch always > 0 - __m256i _sum0_2 = _mm256_setzero_si256(); - __m256i _sum1_3 = _mm256_setzero_si256(); - __m256i _sum4_6 = _mm256_setzero_si256(); - __m256i _sum5_7 = _mm256_setzero_si256(); + __m256i _sum01 = _mm256_setzero_si256(); + __m256i _sum23 = _mm256_setzero_si256(); int j = 0; for (; j < nn; j++) @@ -711,31 +625,27 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl __m256i _w01_16 = _mm256_cvtepi8_epi16(_w01); _w01_16 = _mm256_permute4x64_epi64(_w01_16, _MM_SHUFFLE(1, 0, 1, 0)); - __m256i _sl00_10 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_10 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl20_30 = _mm256_mullo_epi16(_val23_16, _w01_16); - __m256i _sh20_30 = _mm256_mulhi_epi16(_val23_16, _w01_16); - - _sum0_2 = _mm256_add_epi32(_sum0_2, _mm256_unpacklo_epi16(_sl00_10, _sh00_10)); - _sum1_3 = _mm256_add_epi32(_sum1_3, _mm256_unpackhi_epi16(_sl00_10, _sh00_10)); - _sum4_6 = _mm256_add_epi32(_sum4_6, _mm256_unpacklo_epi16(_sl20_30, _sh20_30)); - _sum5_7 = _mm256_add_epi32(_sum5_7, _mm256_unpackhi_epi16(_sl20_30, _sh20_30)); +#if __AVXVNNI__ || __AVX512VNNI__ + _sum01 = _mm256_dpwssd_epi32(_sum01, _val01_16, _w01_16); + _sum23 = _mm256_dpwssd_epi32(_sum23, _val23_16, _w01_16); +#else + _sum01 = _mm256_add_epi32(_sum01, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum23 = _mm256_add_epi32(_sum23, _mm256_madd_epi16(_val23_16, _w01_16)); +#endif tmpptr += 32; kptr0 += 8; } - _sum0_2 = _mm256_add_epi32(_sum0_2, _sum1_3); - _sum4_6 = _mm256_add_epi32(_sum4_6, _sum5_7); - __m128i _sum0 = _mm256_extracti128_si256(_sum0_2, 0); - __m128i _sum2 = _mm256_extracti128_si256(_sum0_2, 1); - __m128i _sum4 = _mm256_extracti128_si256(_sum4_6, 0); - __m128i _sum6 = _mm256_extracti128_si256(_sum4_6, 1); + __m128i _sum0 = _mm256_extracti128_si256(_sum01, 0); + __m128i _sum1 = _mm256_extracti128_si256(_sum01, 1); + __m128i _sum2 = _mm256_extracti128_si256(_sum23, 0); + __m128i _sum3 = _mm256_extracti128_si256(_sum23, 1); outptr0[0] = _mm_reduce_add_epi32(_sum0); - outptr0[1] = _mm_reduce_add_epi32(_sum2); - outptr0[2] = _mm_reduce_add_epi32(_sum4); - outptr0[3] = _mm_reduce_add_epi32(_sum6); + outptr0[1] = _mm_reduce_add_epi32(_sum1); + outptr0[2] = _mm_reduce_add_epi32(_sum2); + outptr0[3] = _mm_reduce_add_epi32(_sum3); outptr0 += 4; } #endif @@ -751,13 +661,10 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl int nn = inch * maxk; // inch always > 0 #if __AVX2__ - __m256i _sum0_2 = _mm256_setzero_si256(); - __m256i _sum1_3 = _mm256_setzero_si256(); + __m256i _sum01 = _mm256_setzero_si256(); #else 
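The int8 paths above now keep a single 32-bit accumulator per output and finish with a horizontal sum of its lanes through _mm_reduce_add_epi32, which appears to be ncnn's own helper (presumably from x86_usability.h) rather than a standard intrinsic. A standalone sketch of such a reduction, SSE2 only and with illustrative names, assuming nothing beyond what the stores into outptr0 imply:

#include <emmintrin.h>
#include <cstdio>

// Horizontally add the four signed 32-bit lanes of v.
static int reduce_add_epi32(__m128i v)
{
    __m128i hi64 = _mm_unpackhi_epi64(v, v);      // [c d c d]
    __m128i sum64 = _mm_add_epi32(v, hi64);       // [a+c b+d . .]
    __m128i hi32 = _mm_shuffle_epi32(sum64, _MM_SHUFFLE(1, 1, 1, 1));
    __m128i sum32 = _mm_add_epi32(sum64, hi32);   // lane 0 = a+b+c+d
    return _mm_cvtsi128_si32(sum32);
}

int main()
{
    __m128i v = _mm_set_epi32(4, 3, 2, 1);
    printf("%d\n", reduce_add_epi32(v)); // prints 10
    return 0;
}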
__m128i _sum0 = _mm_setzero_si128(); __m128i _sum1 = _mm_setzero_si128(); - __m128i _sum2 = _mm_setzero_si128(); - __m128i _sum3 = _mm_setzero_si128(); #endif int j = 0; @@ -771,11 +678,11 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl __m256i _w01_16 = _mm256_cvtepi8_epi16(_w01); _w01_16 = _mm256_permute4x64_epi64(_w01_16, _MM_SHUFFLE(1, 0, 1, 0)); - __m256i _sl00_10 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_10 = _mm256_mulhi_epi16(_val01_16, _w01_16); - - _sum0_2 = _mm256_add_epi32(_sum0_2, _mm256_unpacklo_epi16(_sl00_10, _sh00_10)); - _sum1_3 = _mm256_add_epi32(_sum1_3, _mm256_unpackhi_epi16(_sl00_10, _sh00_10)); +#if __AVXVNNI__ || __AVX512VNNI__ + _sum01 = _mm256_dpwssd_epi32(_sum01, _val01_16, _w01_16); +#else + _sum01 = _mm256_add_epi32(_sum01, _mm256_madd_epi16(_val01_16, _w01_16)); +#endif #else __m128i _val01 = _mm_loadu_si128((const __m128i*)tmpptr); __m128i _extval01 = _mm_cmpgt_epi8(_mm_setzero_si128(), _val01); @@ -790,15 +697,13 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl __m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01); #endif - __m128i _sl00 = _mm_mullo_epi16(_val0, _w0); - __m128i _sh00 = _mm_mulhi_epi16(_val0, _w0); - __m128i _sl10 = _mm_mullo_epi16(_val1, _w0); - __m128i _sh10 = _mm_mulhi_epi16(_val1, _w0); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl00, _sh00)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl00, _sh00)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl10, _sh10)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl10, _sh10)); +#if __XOP__ + _sum0 = _mm_maddd_epi16(_val0, _w0, _sum0); + _sum1 = _mm_maddd_epi16(_val1, _w0, _sum1); +#else + _sum0 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum0); + _sum1 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum1); +#endif #endif tmpptr += 16; @@ -806,16 +711,12 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl } #if __AVX2__ - _sum0_2 = _mm256_add_epi32(_sum0_2, _sum1_3); - __m128i _sum0 = _mm256_extracti128_si256(_sum0_2, 0); - __m128i _sum2 = _mm256_extracti128_si256(_sum0_2, 1); -#else - _sum0 = _mm_add_epi32(_sum0, _sum1); - _sum2 = _mm_add_epi32(_sum2, _sum3); + __m128i _sum0 = _mm256_extracti128_si256(_sum01, 0); + __m128i _sum1 = _mm256_extracti128_si256(_sum01, 1); #endif outptr0[0] = _mm_reduce_add_epi32(_sum0); - outptr0[1] = _mm_reduce_add_epi32(_sum2); + outptr0[1] = _mm_reduce_add_epi32(_sum1); outptr0 += 2; } for (; i < size; i++) @@ -830,7 +731,6 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl int nn = inch * maxk; // inch always > 0 __m128i _sum0 = _mm_setzero_si128(); - __m128i _sum1 = _mm_setzero_si128(); int j = 0; for (; j < nn; j++) @@ -851,18 +751,16 @@ static void im2col_sgemm_pack8to1_int8_sse(const Mat& bottom_im2col, Mat& top_bl __m128i _w0 = _mm_unpacklo_epi8(_w01, _extw01); #endif - __m128i _sl00 = _mm_mullo_epi16(_val0, _w0); - __m128i _sh00 = _mm_mulhi_epi16(_val0, _w0); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl00, _sh00)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl00, _sh00)); +#if __XOP__ + _sum0 = _mm_maddd_epi16(_val0, _w0, _sum0); +#else + _sum0 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum0); +#endif tmpptr += 8; kptr0 += 8; } - _sum0 = _mm_add_epi32(_sum0, _sum1); - outptr0[0] = _mm_reduce_add_epi32(_sum0); outptr0 += 1; } diff --git a/src/layer/x86/convolution_sgemm_pack8to4.h b/src/layer/x86/convolution_sgemm_pack8to4.h index 5fec80fea323..a98c816c5dec 100644 --- 
a/src/layer/x86/convolution_sgemm_pack8to4.h +++ b/src/layer/x86/convolution_sgemm_pack8to4.h @@ -59,30 +59,7 @@ static void im2col_sgemm_pack8to4_avx(const Mat& bottom_im2col, Mat& top_blob, c __m256 _r6 = _mm256_load_ps(img0 + 8 * 6); __m256 _r7 = _mm256_load_ps(img0 + 8 * 7); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); - __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); - __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); - __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); - __m256 _tmp8 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp9 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpa = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpb = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpc = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpd = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpe = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpf = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 3, 0, 1)); - _r5 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); - _r6 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 3, 0, 1)); - _r7 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); @@ -121,18 +98,7 @@ static void im2col_sgemm_pack8to4_avx(const Mat& bottom_im2col, Mat& top_blob, c __m256 _r2 = _mm256_load_ps(img0 + 8 * 2); __m256 _r3 = _mm256_load_ps(img0 + 8 * 3); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp5 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmp6 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp7 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 3, 0, 1)); - _r3 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x4_ps(_r0, _r1, _r2, _r3); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); diff --git a/src/layer/x86/convolution_sgemm_pack8to4_int8.h b/src/layer/x86/convolution_sgemm_pack8to4_int8.h index 5e650c654056..8fdaece96520 100644 --- a/src/layer/x86/convolution_sgemm_pack8to4_int8.h +++ b/src/layer/x86/convolution_sgemm_pack8to4_int8.h @@ -215,23 +215,10 @@ static void im2col_sgemm_pack8to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01_16, _w23_16); _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10_16, _w23_16); #else - __m256i _sl00_11 = 
_mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_11 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl10_01 = _mm256_mullo_epi16(_val10_16, _w01_16); - __m256i _sh10_01 = _mm256_mulhi_epi16(_val10_16, _w01_16); - __m256i _sl02_13 = _mm256_mullo_epi16(_val01_16, _w23_16); - __m256i _sh02_13 = _mm256_mulhi_epi16(_val01_16, _w23_16); - __m256i _sl12_03 = _mm256_mullo_epi16(_val10_16, _w23_16); - __m256i _sh12_03 = _mm256_mulhi_epi16(_val10_16, _w23_16); - - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_unpacklo_epi16(_sl00_11, _sh00_11)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_unpacklo_epi16(_sl10_01, _sh10_01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_unpacklo_epi16(_sl02_13, _sh02_13)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_unpacklo_epi16(_sl12_03, _sh12_03)); - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_unpackhi_epi16(_sl00_11, _sh00_11)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_unpackhi_epi16(_sl10_01, _sh10_01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_unpackhi_epi16(_sl02_13, _sh02_13)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_unpackhi_epi16(_sl12_03, _sh12_03)); + _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10_16, _w01_16)); + _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01_16, _w23_16)); + _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10_16, _w23_16)); #endif __m128i _val23 = _mm_loadu_si128((const __m128i*)(tmpptr + 16)); @@ -244,23 +231,10 @@ static void im2col_sgemm_pack8to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum06_17 = _mm256_dpwssd_epi32(_sum06_17, _val23_16, _w23_16); _sum16_07 = _mm256_dpwssd_epi32(_sum16_07, _val32_16, _w23_16); #else - __m256i _sl04_15 = _mm256_mullo_epi16(_val23_16, _w01_16); - __m256i _sh04_15 = _mm256_mulhi_epi16(_val23_16, _w01_16); - __m256i _sl14_05 = _mm256_mullo_epi16(_val32_16, _w01_16); - __m256i _sh14_05 = _mm256_mulhi_epi16(_val32_16, _w01_16); - __m256i _sl06_17 = _mm256_mullo_epi16(_val23_16, _w23_16); - __m256i _sh06_17 = _mm256_mulhi_epi16(_val23_16, _w23_16); - __m256i _sl16_07 = _mm256_mullo_epi16(_val32_16, _w23_16); - __m256i _sh16_07 = _mm256_mulhi_epi16(_val32_16, _w23_16); - - _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_unpacklo_epi16(_sl04_15, _sh04_15)); - _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_unpacklo_epi16(_sl14_05, _sh14_05)); - _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_unpacklo_epi16(_sl06_17, _sh06_17)); - _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_unpacklo_epi16(_sl16_07, _sh16_07)); - _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_unpackhi_epi16(_sl04_15, _sh04_15)); - _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_unpackhi_epi16(_sl14_05, _sh14_05)); - _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_unpackhi_epi16(_sl06_17, _sh06_17)); - _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_unpackhi_epi16(_sl16_07, _sh16_07)); + _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_madd_epi16(_val23_16, _w01_16)); + _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_madd_epi16(_val32_16, _w01_16)); + _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_madd_epi16(_val23_16, _w23_16)); + _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_madd_epi16(_val32_16, _w23_16)); #endif tmpptr += 32; @@ -355,23 +329,10 @@ static void im2col_sgemm_pack8to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01_16, _w23_16); _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10_16, _w23_16); 
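These hunks replace the hand-rolled widen-and-add pattern (_mm256_mullo_epi16/_mm256_mulhi_epi16 followed by unpacklo/unpackhi adds) with _mm256_madd_epi16, which multiplies adjacent signed 16-bit elements and sums each pair into a 32-bit lane in one instruction; on AVX-VNNI/AVX512-VNNI the _mm256_dpwssd_epi32 branch additionally fuses the accumulate. A minimal sketch of what the madd form computes, assuming only an AVX2 machine (compile with -mavx2); the data values are arbitrary:

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main()
{
    int16_t a[16], b[16];
    for (int i = 0; i < 16; i++)
    {
        a[i] = (int16_t)(i * 37 - 100);
        b[i] = (int16_t)(300 - i * 21);
    }

    __m256i _a = _mm256_loadu_si256((const __m256i*)a);
    __m256i _b = _mm256_loadu_si256((const __m256i*)b);

    // each 32-bit lane k = a[2k]*b[2k] + a[2k+1]*b[2k+1]
    __m256i _madd = _mm256_madd_epi16(_a, _b);

    int32_t out[8];
    _mm256_storeu_si256((__m256i*)out, _madd);

    for (int k = 0; k < 8; k++)
    {
        int32_t ref = (int32_t)a[2 * k] * b[2 * k] + (int32_t)a[2 * k + 1] * b[2 * k + 1];
        printf("lane %d: %d (ref %d)\n", k, (int)out[k], (int)ref);
    }
    return 0;
}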
#else - __m256i _sl00_11 = _mm256_mullo_epi16(_val01_16, _w01_16); - __m256i _sh00_11 = _mm256_mulhi_epi16(_val01_16, _w01_16); - __m256i _sl10_01 = _mm256_mullo_epi16(_val10_16, _w01_16); - __m256i _sh10_01 = _mm256_mulhi_epi16(_val10_16, _w01_16); - __m256i _sl02_13 = _mm256_mullo_epi16(_val01_16, _w23_16); - __m256i _sh02_13 = _mm256_mulhi_epi16(_val01_16, _w23_16); - __m256i _sl12_03 = _mm256_mullo_epi16(_val10_16, _w23_16); - __m256i _sh12_03 = _mm256_mulhi_epi16(_val10_16, _w23_16); - - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_unpacklo_epi16(_sl00_11, _sh00_11)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_unpacklo_epi16(_sl10_01, _sh10_01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_unpacklo_epi16(_sl02_13, _sh02_13)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_unpacklo_epi16(_sl12_03, _sh12_03)); - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_unpackhi_epi16(_sl00_11, _sh00_11)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_unpackhi_epi16(_sl10_01, _sh10_01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_unpackhi_epi16(_sl02_13, _sh02_13)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_unpackhi_epi16(_sl12_03, _sh12_03)); + _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01_16, _w01_16)); + _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10_16, _w01_16)); + _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01_16, _w23_16)); + _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10_16, _w23_16)); #endif #else __m128i _val01 = _mm_loadu_si128((const __m128i*)tmpptr); @@ -398,39 +359,14 @@ static void im2col_sgemm_pack8to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum12 = _mm_maddd_epi16(_val1, _w2, _sum12); _sum13 = _mm_maddd_epi16(_val1, _w3, _sum13); #else - __m128i _sl00 = _mm_mullo_epi16(_val0, _w0); - __m128i _sh00 = _mm_mulhi_epi16(_val0, _w0); - __m128i _sl01 = _mm_mullo_epi16(_val0, _w1); - __m128i _sh01 = _mm_mulhi_epi16(_val0, _w1); - __m128i _sl02 = _mm_mullo_epi16(_val0, _w2); - __m128i _sh02 = _mm_mulhi_epi16(_val0, _w2); - __m128i _sl03 = _mm_mullo_epi16(_val0, _w3); - __m128i _sh03 = _mm_mulhi_epi16(_val0, _w3); - __m128i _sl10 = _mm_mullo_epi16(_val1, _w0); - __m128i _sh10 = _mm_mulhi_epi16(_val1, _w0); - __m128i _sl11 = _mm_mullo_epi16(_val1, _w1); - __m128i _sh11 = _mm_mulhi_epi16(_val1, _w1); - __m128i _sl12 = _mm_mullo_epi16(_val1, _w2); - __m128i _sh12 = _mm_mulhi_epi16(_val1, _w2); - __m128i _sl13 = _mm_mullo_epi16(_val1, _w3); - __m128i _sh13 = _mm_mulhi_epi16(_val1, _w3); - - _sum00 = _mm_add_epi32(_sum00, _mm_unpacklo_epi16(_sl00, _sh00)); - _sum01 = _mm_add_epi32(_sum01, _mm_unpacklo_epi16(_sl01, _sh01)); - _sum02 = _mm_add_epi32(_sum02, _mm_unpacklo_epi16(_sl02, _sh02)); - _sum03 = _mm_add_epi32(_sum03, _mm_unpacklo_epi16(_sl03, _sh03)); - _sum00 = _mm_add_epi32(_sum00, _mm_unpackhi_epi16(_sl00, _sh00)); - _sum01 = _mm_add_epi32(_sum01, _mm_unpackhi_epi16(_sl01, _sh01)); - _sum02 = _mm_add_epi32(_sum02, _mm_unpackhi_epi16(_sl02, _sh02)); - _sum03 = _mm_add_epi32(_sum03, _mm_unpackhi_epi16(_sl03, _sh03)); - _sum10 = _mm_add_epi32(_sum10, _mm_unpacklo_epi16(_sl10, _sh10)); - _sum11 = _mm_add_epi32(_sum11, _mm_unpacklo_epi16(_sl11, _sh11)); - _sum12 = _mm_add_epi32(_sum12, _mm_unpacklo_epi16(_sl12, _sh12)); - _sum13 = _mm_add_epi32(_sum13, _mm_unpacklo_epi16(_sl13, _sh13)); - _sum10 = _mm_add_epi32(_sum10, _mm_unpackhi_epi16(_sl10, _sh10)); - _sum11 = _mm_add_epi32(_sum11, _mm_unpackhi_epi16(_sl11, _sh11)); - _sum12 = _mm_add_epi32(_sum12, _mm_unpackhi_epi16(_sl12, 
_sh12)); - _sum13 = _mm_add_epi32(_sum13, _mm_unpackhi_epi16(_sl13, _sh13)); + _sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00); + _sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01); + _sum02 = _mm_add_epi32(_mm_madd_epi16(_val0, _w2), _sum02); + _sum03 = _mm_add_epi32(_mm_madd_epi16(_val0, _w3), _sum03); + _sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10); + _sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11); + _sum12 = _mm_add_epi32(_mm_madd_epi16(_val1, _w2), _sum12); + _sum13 = _mm_add_epi32(_mm_madd_epi16(_val1, _w3), _sum13); #endif #endif @@ -537,15 +473,8 @@ static void im2col_sgemm_pack8to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _valval, _w01_16); _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _valval, _w23_16); #else - __m256i _sl0_1 = _mm256_mullo_epi16(_valval, _w01_16); - __m256i _sh0_1 = _mm256_mulhi_epi16(_valval, _w01_16); - __m256i _sl2_3 = _mm256_mullo_epi16(_valval, _w23_16); - __m256i _sh2_3 = _mm256_mulhi_epi16(_valval, _w23_16); - - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpacklo_epi16(_sl0_1, _sh0_1)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpacklo_epi16(_sl2_3, _sh2_3)); - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_unpackhi_epi16(_sl0_1, _sh0_1)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_unpackhi_epi16(_sl2_3, _sh2_3)); + _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_madd_epi16(_valval, _w01_16)); + _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_madd_epi16(_valval, _w23_16)); #endif #else __m128i _val = _mm_loadl_epi64((const __m128i*)tmpptr); @@ -570,23 +499,10 @@ static void im2col_sgemm_pack8to4_int8_sse(const Mat& bottom_im2col, Mat& top_bl _sum2 = _mm_maddd_epi16(_val, _w2, _sum2); _sum3 = _mm_maddd_epi16(_val, _w3, _sum3); #else - __m128i _sl0 = _mm_mullo_epi16(_val, _w0); - __m128i _sh0 = _mm_mulhi_epi16(_val, _w0); - __m128i _sl1 = _mm_mullo_epi16(_val, _w1); - __m128i _sh1 = _mm_mulhi_epi16(_val, _w1); - __m128i _sl2 = _mm_mullo_epi16(_val, _w2); - __m128i _sh2 = _mm_mulhi_epi16(_val, _w2); - __m128i _sl3 = _mm_mullo_epi16(_val, _w3); - __m128i _sh3 = _mm_mulhi_epi16(_val, _w3); - - _sum0 = _mm_add_epi32(_sum0, _mm_unpacklo_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpacklo_epi16(_sl1, _sh1)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpacklo_epi16(_sl2, _sh2)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpacklo_epi16(_sl3, _sh3)); - _sum0 = _mm_add_epi32(_sum0, _mm_unpackhi_epi16(_sl0, _sh0)); - _sum1 = _mm_add_epi32(_sum1, _mm_unpackhi_epi16(_sl1, _sh1)); - _sum2 = _mm_add_epi32(_sum2, _mm_unpackhi_epi16(_sl2, _sh2)); - _sum3 = _mm_add_epi32(_sum3, _mm_unpackhi_epi16(_sl3, _sh3)); + _sum0 = _mm_add_epi32(_mm_madd_epi16(_val, _w0), _sum0); + _sum1 = _mm_add_epi32(_mm_madd_epi16(_val, _w1), _sum1); + _sum2 = _mm_add_epi32(_mm_madd_epi16(_val, _w2), _sum2); + _sum3 = _mm_add_epi32(_mm_madd_epi16(_val, _w3), _sum3); #endif #endif diff --git a/src/layer/x86/convolution_winograd_dot_pack16.h b/src/layer/x86/convolution_winograd_dot_pack16.h index acbd3191e431..fec99b1bb4c6 100644 --- a/src/layer/x86/convolution_winograd_dot_pack16.h +++ b/src/layer/x86/convolution_winograd_dot_pack16.h @@ -65,57 +65,7 @@ static void convolution_winograd_dot_pack16_avx512(Mat& bottom_blob_tm, int outc __m512 _ra = _mm512_load_ps(r0 + 16 * 10); __m512 _rb = _mm512_load_ps(r0 + 16 * 11); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - __m512 _tmp4 = 
_mm512_unpacklo_ps(_r4, _r5); - __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5); - __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); - __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); - __m512 _tmp8 = _mm512_unpacklo_ps(_r8, _r9); - __m512 _tmp9 = _mm512_unpackhi_ps(_r8, _r9); - __m512 _tmpa = _mm512_unpacklo_ps(_ra, _rb); - __m512 _tmpb = _mm512_unpackhi_ps(_ra, _rb); - - __m512 _tmpc = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpg = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmph = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpi = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpj = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpk = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpl = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpm = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpn = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmpm, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp5 = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp6 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp8 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp9 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(3, 1, 3, 1)); - _tmpa = _mm512_shuffle_f32x4(_tmpm, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - _tmpb = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _r4 = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(2, 0, 2, 0)); - _r5 = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(2, 0, 2, 0)); - _r6 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r7 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _r8 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _r9 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); - _ra = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(3, 1, 3, 1)); - _rb = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x12_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb); _mm512_store_ps(tmpptr, _r0); _mm512_store_ps(tmpptr + 16, _r1); @@ -154,41 +104,7 @@ static void convolution_winograd_dot_pack16_avx512(Mat& bottom_blob_tm, int outc __m512 _r6 = _mm512_load_ps(r0 + 16 * 6); __m512 _r7 = _mm512_load_ps(r0 + 16 * 7); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); - __m512 _tmp5 = _mm512_unpackhi_ps(_r4, 
_r5); - __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); - __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); - - __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm512_store_ps(tmpptr, _r0); _mm512_store_ps(tmpptr + 16, _r1); @@ -219,25 +135,7 @@ static void convolution_winograd_dot_pack16_avx512(Mat& bottom_blob_tm, int outc __m512 _r2 = _mm512_load_ps(r0 + 16 * 2); __m512 _r3 = _mm512_load_ps(r0 + 16 * 3); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); - __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); - __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); - - __m512 _tmp4 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp5 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmp6 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp7 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _r3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x4_ps(_r0, _r1, _r2, _r3); _mm512_store_ps(tmpptr, _r0); _mm512_store_ps(tmpptr + 16, _r1); @@ -262,14 +160,7 @@ static void convolution_winograd_dot_pack16_avx512(Mat& bottom_blob_tm, int outc __m512 _r0 = _mm512_load_ps(r0); __m512 _r1 = _mm512_load_ps(r0 + 16); - __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); - __m512 _tmp1 = 
_mm512_unpackhi_ps(_r0, _r1); - - __m512 _tmp2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - __m512 _tmp3 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - - _r0 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x2_ps(_r0, _r1); _mm512_store_ps(tmpptr, _r0); _mm512_store_ps(tmpptr + 16, _r1); diff --git a/src/layer/x86/convolution_winograd_dot_pack8.h b/src/layer/x86/convolution_winograd_dot_pack8.h index eaa56e30adef..2855ca23cab9 100644 --- a/src/layer/x86/convolution_winograd_dot_pack8.h +++ b/src/layer/x86/convolution_winograd_dot_pack8.h @@ -65,42 +65,7 @@ static void convolution_winograd_dot_pack8_avx(Mat& bottom_blob_tm, int outch, c __m256 _ra = _mm256_load_ps(r0 + 8 * 10); __m256 _rb = _mm256_load_ps(r0 + 8 * 11); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); - __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); - __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); - __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); - __m256 _tmp8 = _mm256_unpacklo_ps(_r8, _r9); - __m256 _tmp9 = _mm256_unpackhi_ps(_r8, _r9); - __m256 _tmpa = _mm256_unpacklo_ps(_ra, _rb); - __m256 _tmpb = _mm256_unpackhi_ps(_ra, _rb); - __m256 _tmpc = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpd = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpe = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpf = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpg = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmph = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpi = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpj = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpk = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpl = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpm = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpn = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmpc, _tmpg, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmpk, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2f128_ps(_tmpe, _tmpi, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2f128_ps(_tmpm, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); - _r5 = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 2, 0, 0)); - _r6 = _mm256_permute2f128_ps(_tmpc, _tmpg, _MM_SHUFFLE(0, 3, 0, 1)); - _r7 = _mm256_permute2f128_ps(_tmpk, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); - _r8 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 3, 0, 1)); - _r9 = _mm256_permute2f128_ps(_tmpe, _tmpi, _MM_SHUFFLE(0, 3, 0, 1)); - _ra = _mm256_permute2f128_ps(_tmpm, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); - _rb = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x12_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); @@ -139,30 +104,7 @@ static void convolution_winograd_dot_pack8_avx(Mat& bottom_blob_tm, int outch, c __m256 _r6 = _mm256_load_ps(r0 + 8 * 6); __m256 _r7 = _mm256_load_ps(r0 + 8 * 7); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - 
__m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); - __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); - __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); - __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); - __m256 _tmp8 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp9 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpa = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpb = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpc = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpd = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmpe = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmpf = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 3, 0, 1)); - _r5 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); - _r6 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 3, 0, 1)); - _r7 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); @@ -193,18 +135,7 @@ static void convolution_winograd_dot_pack8_avx(Mat& bottom_blob_tm, int outch, c __m256 _r2 = _mm256_load_ps(r0 + 8 * 2); __m256 _r3 = _mm256_load_ps(r0 + 8 * 3); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); - __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); - __m256 _tmp4 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp5 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmp6 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp7 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - _r0 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 3, 0, 1)); - _r3 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x4_ps(_r0, _r1, _r2, _r3); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); @@ -229,10 +160,7 @@ static void convolution_winograd_dot_pack8_avx(Mat& bottom_blob_tm, int outch, c __m256 _r0 = _mm256_load_ps(r0); __m256 _r1 = _mm256_load_ps(r0 + 8); - __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); - __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); - _r0 = _mm256_permute2f128_ps(_tmp0, _tmp1, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2f128_ps(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x2_ps(_r0, _r1); _mm256_store_ps(tmpptr, _r0); _mm256_store_ps(tmpptr + 8, _r1); diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp index 60522b04eaa4..9acceb28854e 100644 --- a/src/layer/x86/convolution_x86.cpp +++ b/src/layer/x86/convolution_x86.cpp @@ -16,12 +16,15 @@ #if __SSE2__ #include +#if __SSSE3__ +#include #if __SSE4_1__ #include #if __AVX__ #include #endif #endif // __SSE4_1__ +#endif // __SSSE3__ #endif // 
__SSE2__ #include "x86_activation.h" #include "x86_usability.h" diff --git a/src/layer/x86/deformableconv2d_pack16.h b/src/layer/x86/deformableconv2d_pack16.h new file mode 100644 index 000000000000..42f260f6e96c --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack16.h @@ -0,0 +1,435 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 16; + const int out_elempack = 16; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m512 _sum = _mm512_setzero_ps(); + if (bias_data_ptr) + _sum = _mm512_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * 
mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val_channel0 = _mm512_loadu_ps(zeros_ptr); + __m512 _val_channel1 = _val_channel0; + __m512 _val_channel2 = _val_channel0; + __m512 _val_channel3 = _val_channel0; + __m512 _val_channel4 = _val_channel0; + __m512 _val_channel5 = _val_channel0; + __m512 _val_channel6 = _val_channel0; + __m512 _val_channel7 = _val_channel0; + __m512 _val_channel8 = _val_channel0; + __m512 _val_channel9 = _val_channel0; + __m512 _val_channela = _val_channel0; + __m512 _val_channelb = _val_channel0; + __m512 _val_channelc = _val_channel0; + __m512 _val_channeld = _val_channel0; + __m512 _val_channele = _val_channel0; + __m512 _val_channelf = _val_channel0; + if (cond) + { + __m512 _v1_channel0 = _val_channel0; + __m512 _v1_channel1 = _val_channel0; + __m512 _v1_channel2 = _val_channel0; + __m512 _v1_channel3 = _val_channel0; + __m512 _v1_channel4 = _val_channel0; + __m512 _v1_channel5 = _val_channel0; + __m512 _v1_channel6 = _val_channel0; + __m512 _v1_channel7 = _val_channel0; + __m512 _v1_channel8 = _val_channel0; + __m512 _v1_channel9 = _val_channel0; + __m512 _v1_channela = _val_channel0; + __m512 _v1_channelb = _val_channel0; + __m512 _v1_channelc = _val_channel0; + __m512 _v1_channeld = _val_channel0; + __m512 _v1_channele = _val_channel0; + __m512 _v1_channelf = _val_channel0; + __m512 _v2_channel0 = _val_channel0; + __m512 _v2_channel1 = _val_channel0; + __m512 _v2_channel2 = _val_channel0; + __m512 _v2_channel3 = _val_channel0; + __m512 _v2_channel4 = _val_channel0; + __m512 _v2_channel5 = _val_channel0; + __m512 _v2_channel6 = _val_channel0; + __m512 _v2_channel7 = _val_channel0; + __m512 
_v2_channel8 = _val_channel0; + __m512 _v2_channel9 = _val_channel0; + __m512 _v2_channela = _val_channel0; + __m512 _v2_channelb = _val_channel0; + __m512 _v2_channelc = _val_channel0; + __m512 _v2_channeld = _val_channel0; + __m512 _v2_channele = _val_channel0; + __m512 _v2_channelf = _val_channel0; + __m512 _v3_channel0 = _val_channel0; + __m512 _v3_channel1 = _val_channel0; + __m512 _v3_channel2 = _val_channel0; + __m512 _v3_channel3 = _val_channel0; + __m512 _v3_channel4 = _val_channel0; + __m512 _v3_channel5 = _val_channel0; + __m512 _v3_channel6 = _val_channel0; + __m512 _v3_channel7 = _val_channel0; + __m512 _v3_channel8 = _val_channel0; + __m512 _v3_channel9 = _val_channel0; + __m512 _v3_channela = _val_channel0; + __m512 _v3_channelb = _val_channel0; + __m512 _v3_channelc = _val_channel0; + __m512 _v3_channeld = _val_channel0; + __m512 _v3_channele = _val_channel0; + __m512 _v3_channelf = _val_channel0; + __m512 _v4_channel0 = _val_channel0; + __m512 _v4_channel1 = _val_channel0; + __m512 _v4_channel2 = _val_channel0; + __m512 _v4_channel3 = _val_channel0; + __m512 _v4_channel4 = _val_channel0; + __m512 _v4_channel5 = _val_channel0; + __m512 _v4_channel6 = _val_channel0; + __m512 _v4_channel7 = _val_channel0; + __m512 _v4_channel8 = _val_channel0; + __m512 _v4_channel9 = _val_channel0; + __m512 _v4_channela = _val_channel0; + __m512 _v4_channelb = _val_channel0; + __m512 _v4_channelc = _val_channel0; + __m512 _v4_channeld = _val_channel0; + __m512 _v4_channele = _val_channel0; + __m512 _v4_channelf = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack]); + _v1_channel1 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 1]); + _v1_channel2 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 2]); + _v1_channel3 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 3]); + _v1_channel4 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 4]); + _v1_channel5 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 5]); + _v1_channel6 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 6]); + _v1_channel7 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 7]); + _v1_channel8 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 8]); + _v1_channel9 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 9]); + _v1_channela = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 10]); + _v1_channelb = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 11]); + _v1_channelc = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 12]); + _v1_channeld = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 13]); + _v1_channele = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 14]); + _v1_channelf = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 15]); + } + if (v2_cond) + { + _v2_channel0 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack]); + _v2_channel1 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 1]); + _v2_channel2 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 2]); + _v2_channel3 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 3]); + _v2_channel4 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 4]); + _v2_channel5 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 5]); + _v2_channel6 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 6]); + _v2_channel7 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 7]); + _v2_channel8 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 8]); + _v2_channel9 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 9]); + _v2_channela = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 10]); + _v2_channelb = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 11]); + _v2_channelc = 
_mm512_set1_ps(data_im_ptr[v2_pos * elempack + 12]); + _v2_channeld = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 13]); + _v2_channele = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 14]); + _v2_channelf = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 15]); + } + if (v3_cond) + { + _v3_channel0 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack]); + _v3_channel1 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 1]); + _v3_channel2 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 2]); + _v3_channel3 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 3]); + _v3_channel4 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 4]); + _v3_channel5 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 5]); + _v3_channel6 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 6]); + _v3_channel7 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 7]); + _v3_channel8 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 8]); + _v3_channel9 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 9]); + _v3_channela = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 10]); + _v3_channelb = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 11]); + _v3_channelc = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 12]); + _v3_channeld = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 13]); + _v3_channele = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 14]); + _v3_channelf = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 15]); + } + if (v4_cond) + { + _v4_channel0 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack]); + _v4_channel1 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 1]); + _v4_channel2 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 2]); + _v4_channel3 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 3]); + _v4_channel4 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 4]); + _v4_channel5 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 5]); + _v4_channel6 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 6]); + _v4_channel7 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 7]); + _v4_channel8 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 8]); + _v4_channel9 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 9]); + _v4_channela = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 10]); + _v4_channelb = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 11]); + _v4_channelc = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 12]); + _v4_channeld = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 13]); + _v4_channele = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 14]); + _v4_channelf = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 15]); + } + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val_channel0 = _mm512_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm512_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm512_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = 
_mm512_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm512_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm512_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm512_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm512_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v4_channel7, _w4, _val_channel7); + _val_channel8 = _mm512_fmadd_ps(_v1_channel8, _w1, _val_channel8); + _val_channel8 = _mm512_fmadd_ps(_v2_channel8, _w2, _val_channel8); + _val_channel8 = _mm512_fmadd_ps(_v3_channel8, _w3, _val_channel8); + _val_channel8 = _mm512_fmadd_ps(_v4_channel8, _w4, _val_channel8); + _val_channel9 = _mm512_fmadd_ps(_v1_channel9, _w1, _val_channel9); + _val_channel9 = _mm512_fmadd_ps(_v2_channel9, _w2, _val_channel9); + _val_channel9 = _mm512_fmadd_ps(_v3_channel9, _w3, _val_channel9); + _val_channel9 = _mm512_fmadd_ps(_v4_channel9, _w4, _val_channel9); + _val_channela = _mm512_fmadd_ps(_v1_channela, _w1, _val_channela); + _val_channela = _mm512_fmadd_ps(_v2_channela, _w2, _val_channela); + _val_channela = _mm512_fmadd_ps(_v3_channela, _w3, _val_channela); + _val_channela = _mm512_fmadd_ps(_v4_channela, _w4, _val_channela); + _val_channelb = _mm512_fmadd_ps(_v1_channelb, _w1, _val_channelb); + _val_channelb = _mm512_fmadd_ps(_v2_channelb, _w2, _val_channelb); + _val_channelb = _mm512_fmadd_ps(_v3_channelb, _w3, _val_channelb); + _val_channelb = _mm512_fmadd_ps(_v4_channelb, _w4, _val_channelb); + _val_channelc = _mm512_fmadd_ps(_v1_channelc, _w1, _val_channelc); + _val_channelc = _mm512_fmadd_ps(_v2_channelc, _w2, _val_channelc); + _val_channelc = _mm512_fmadd_ps(_v3_channelc, _w3, _val_channelc); + _val_channelc = _mm512_fmadd_ps(_v4_channelc, _w4, _val_channelc); + _val_channeld = _mm512_fmadd_ps(_v1_channeld, _w1, _val_channeld); + _val_channeld = _mm512_fmadd_ps(_v2_channeld, _w2, _val_channeld); + _val_channeld = _mm512_fmadd_ps(_v3_channeld, _w3, _val_channeld); + _val_channeld = _mm512_fmadd_ps(_v4_channeld, _w4, _val_channeld); + _val_channele = _mm512_fmadd_ps(_v1_channele, _w1, _val_channele); + _val_channele = _mm512_fmadd_ps(_v2_channele, _w2, _val_channele); + _val_channele = _mm512_fmadd_ps(_v3_channele, _w3, _val_channele); + _val_channele = _mm512_fmadd_ps(_v4_channele, _w4, _val_channele); + _val_channelf = _mm512_fmadd_ps(_v1_channelf, _w1, _val_channelf); + _val_channelf = _mm512_fmadd_ps(_v2_channelf, _w2, _val_channelf); + _val_channelf = _mm512_fmadd_ps(_v3_channelf, _w3, _val_channelf); + _val_channelf = _mm512_fmadd_ps(_v4_channelf, _w4, 
_val_channelf); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val_channel0 = _mm512_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm512_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm512_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm512_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm512_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm512_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm512_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm512_mul_ps(_val_channel7, _mask); + _val_channel8 = _mm512_mul_ps(_val_channel8, _mask); + _val_channel9 = _mm512_mul_ps(_val_channel9, _mask); + _val_channela = _mm512_mul_ps(_val_channela, _mask); + _val_channelb = _mm512_mul_ps(_val_channelb, _mask); + _val_channelc = _mm512_mul_ps(_val_channelc, _mask); + _val_channeld = _mm512_mul_ps(_val_channeld, _mask); + _val_channele = _mm512_mul_ps(_val_channele, _mask); + _val_channelf = _mm512_mul_ps(_val_channelf, _mask); + } + __m512 _conv_w0 = _mm512_load_ps(kptr); + __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm512_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m512 _conv_w2 = _mm512_load_ps(kptr + 32); // 2 * out_elempack + __m512 _conv_w3 = _mm512_load_ps(kptr + 48); // 3 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm512_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m512 _conv_w4 = _mm512_load_ps(kptr + 64); // 4 * out_elempack + __m512 _conv_w5 = _mm512_load_ps(kptr + 80); // 5 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm512_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m512 _conv_w6 = _mm512_load_ps(kptr + 96); // 6 * out_elempack + __m512 _conv_w7 = _mm512_load_ps(kptr + 112); // 7 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm512_fmadd_ps(_val_channel7, _conv_w7, _sum); + __m512 _conv_w8 = _mm512_load_ps(kptr + 128); // 8 * out_elempack + __m512 _conv_w9 = _mm512_load_ps(kptr + 144); // 9 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel8, _conv_w8, _sum); + _sum = _mm512_fmadd_ps(_val_channel9, _conv_w9, _sum); + __m512 _conv_wa = _mm512_load_ps(kptr + 160); // 10 * out_elempack + __m512 _conv_wb = _mm512_load_ps(kptr + 176); // 11 * out_elempack + _sum = _mm512_fmadd_ps(_val_channela, _conv_wa, _sum); + _sum = _mm512_fmadd_ps(_val_channelb, _conv_wb, _sum); + __m512 _conv_wc = _mm512_load_ps(kptr + 192); // 12 * out_elempack + __m512 _conv_wd = _mm512_load_ps(kptr + 208); // 13 * out_elempack + _sum = _mm512_fmadd_ps(_val_channelc, _conv_wc, _sum); + _sum = _mm512_fmadd_ps(_val_channeld, _conv_wd, _sum); + __m512 _conv_we = _mm512_load_ps(kptr + 224); // 14 * out_elempack + __m512 _conv_wf = _mm512_load_ps(kptr + 240); // 15 * out_elempack + _sum = _mm512_fmadd_ps(_val_channele, _conv_we, _sum); + _sum = _mm512_fmadd_ps(_val_channelf, _conv_wf, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx512(_sum, activation_type, activation_params); + _mm512_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack16to1.h b/src/layer/x86/deformableconv2d_pack16to1.h new file mode 100644 index 000000000000..c721f5c52334 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack16to1.h @@ -0,0 +1,370 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
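The new deformableconv2d kernels sample each displaced input position bilinearly: the four corner weights w1..w4 are built from the fractional parts (hh, hw, lh, lw), and any corner that falls outside the feature map contributes zero. A scalar sketch of the same sampling rule for a single-channel h x w map, with variable names mirroring the packed code; the function itself is illustrative, not part of ncnn:

#include <cmath>
#include <cstdio>

static float bilinear_sample(const float* data, int h, int w, float h_im, float w_im)
{
    // same validity test as the kernels: fully outside -> zero
    if (!(h_im > -1 && w_im > -1 && h_im < h && w_im < w))
        return 0.f;

    int h_low = (int)std::floor(h_im);
    int w_low = (int)std::floor(w_im);
    int h_high = h_low + 1;
    int w_high = w_low + 1;

    float lh = h_im - h_low, lw = w_im - w_low;
    float hh = 1 - lh, hw = 1 - lw;

    // corner values, zero when a corner lies outside the map
    float v1 = (h_low >= 0 && w_low >= 0) ? data[h_low * w + w_low] : 0.f;
    float v2 = (h_low >= 0 && w_high <= w - 1) ? data[h_low * w + w_high] : 0.f;
    float v3 = (h_high <= h - 1 && w_low >= 0) ? data[h_high * w + w_low] : 0.f;
    float v4 = (h_high <= h - 1 && w_high <= w - 1) ? data[h_high * w + w_high] : 0.f;

    // w1..w4 match the weights computed in the packed kernels
    return hh * hw * v1 + hh * lw * v2 + lh * hw * v3 + lh * lw * v4;
}

int main()
{
    const float img[4] = {1.f, 2.f, 3.f, 4.f}; // 2x2 map
    printf("%f\n", bilinear_sample(img, 2, 2, 0.5f, 0.5f)); // prints 2.5
    return 0;
}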
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack16to1_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 16; + const int out_elempack = 1; + const int wstep = out_elempack * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + float _sum = 0.f; + if (bias_data_ptr) + _sum = *(bias_data_ptr + oc); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + 
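+ // lh/lw are the fractional offsets of the sampling point below/right of the top-left integer sample; their complements hh/hw (computed next) combine into the four bilinear corner weights w1..w4 used for every input channel of this tap.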
float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + float _val_channel0 = 0.f; + float _val_channel1 = _val_channel0; + float _val_channel2 = _val_channel0; + float _val_channel3 = _val_channel0; + float _val_channel4 = _val_channel0; + float _val_channel5 = _val_channel0; + float _val_channel6 = _val_channel0; + float _val_channel7 = _val_channel0; + float _val_channel8 = _val_channel0; + float _val_channel9 = _val_channel0; + float _val_channela = _val_channel0; + float _val_channelb = _val_channel0; + float _val_channelc = _val_channel0; + float _val_channeld = _val_channel0; + float _val_channele = _val_channel0; + float _val_channelf = _val_channel0; + if (cond) + { + float _v1_channel0 = _val_channel0; + float _v1_channel1 = _val_channel0; + float _v1_channel2 = _val_channel0; + float _v1_channel3 = _val_channel0; + float _v1_channel4 = _val_channel0; + float _v1_channel5 = _val_channel0; + float _v1_channel6 = _val_channel0; + float _v1_channel7 = _val_channel0; + float _v1_channel8 = _val_channel0; + float _v1_channel9 = _val_channel0; + float _v1_channela = _val_channel0; + float _v1_channelb = _val_channel0; + float _v1_channelc = _val_channel0; + float _v1_channeld = _val_channel0; + float _v1_channele = _val_channel0; + float _v1_channelf = _val_channel0; + float _v2_channel0 = _val_channel0; + float _v2_channel1 = _val_channel0; + float _v2_channel2 = _val_channel0; + float _v2_channel3 = _val_channel0; + float _v2_channel4 = _val_channel0; + float _v2_channel5 = _val_channel0; + float _v2_channel6 = _val_channel0; + float _v2_channel7 = _val_channel0; + float _v2_channel8 = _val_channel0; + float _v2_channel9 = _val_channel0; + float _v2_channela = _val_channel0; + float _v2_channelb = _val_channel0; + float _v2_channelc = _val_channel0; + float _v2_channeld = _val_channel0; + float _v2_channele = _val_channel0; + float _v2_channelf = _val_channel0; + float _v3_channel0 = _val_channel0; + float _v3_channel1 = _val_channel0; + float _v3_channel2 = _val_channel0; + float _v3_channel3 = _val_channel0; + float _v3_channel4 = _val_channel0; + float _v3_channel5 = _val_channel0; + float _v3_channel6 = _val_channel0; + float _v3_channel7 = _val_channel0; + float _v3_channel8 = _val_channel0; + float _v3_channel9 = _val_channel0; + float _v3_channela = _val_channel0; + float _v3_channelb = _val_channel0; + float _v3_channelc = _val_channel0; + float _v3_channeld = _val_channel0; + float _v3_channele = _val_channel0; + float _v3_channelf = _val_channel0; + float _v4_channel0 = _val_channel0; + float _v4_channel1 = _val_channel0; + float _v4_channel2 = _val_channel0; + float _v4_channel3 = _val_channel0; + float _v4_channel4 = _val_channel0; + float _v4_channel5 = _val_channel0; + float _v4_channel6 = _val_channel0; + float _v4_channel7 = _val_channel0; + float _v4_channel8 = _val_channel0; + float _v4_channel9 = _val_channel0; + float _v4_channela = _val_channel0; + float _v4_channelb = _val_channel0; + float _v4_channelc = _val_channel0; + float _v4_channeld = 
_val_channel0; + float _v4_channele = _val_channel0; + float _v4_channelf = _val_channel0; + if (v1_cond) + { + _v1_channel0 = *(data_im_ptr + v1_pos * elempack); + _v1_channel1 = *(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = *(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = *(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = *(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = *(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = *(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = *(data_im_ptr + v1_pos * elempack + 7); + _v1_channel8 = *(data_im_ptr + v1_pos * elempack + 8); + _v1_channel9 = *(data_im_ptr + v1_pos * elempack + 9); + _v1_channela = *(data_im_ptr + v1_pos * elempack + 10); + _v1_channelb = *(data_im_ptr + v1_pos * elempack + 11); + _v1_channelc = *(data_im_ptr + v1_pos * elempack + 12); + _v1_channeld = *(data_im_ptr + v1_pos * elempack + 13); + _v1_channele = *(data_im_ptr + v1_pos * elempack + 14); + _v1_channelf = *(data_im_ptr + v1_pos * elempack + 15); + } + if (v2_cond) + { + _v2_channel0 = *(data_im_ptr + v2_pos * elempack); + _v2_channel1 = *(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = *(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = *(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = *(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = *(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = *(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = *(data_im_ptr + v2_pos * elempack + 7); + _v2_channel8 = *(data_im_ptr + v2_pos * elempack + 8); + _v2_channel9 = *(data_im_ptr + v2_pos * elempack + 9); + _v2_channela = *(data_im_ptr + v2_pos * elempack + 10); + _v2_channelb = *(data_im_ptr + v2_pos * elempack + 11); + _v2_channelc = *(data_im_ptr + v2_pos * elempack + 12); + _v2_channeld = *(data_im_ptr + v2_pos * elempack + 13); + _v2_channele = *(data_im_ptr + v2_pos * elempack + 14); + _v2_channelf = *(data_im_ptr + v2_pos * elempack + 15); + } + if (v3_cond) + { + _v3_channel0 = *(data_im_ptr + v3_pos * elempack); + _v3_channel1 = *(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = *(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = *(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = *(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = *(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = *(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = *(data_im_ptr + v3_pos * elempack + 7); + _v3_channel8 = *(data_im_ptr + v3_pos * elempack + 8); + _v3_channel9 = *(data_im_ptr + v3_pos * elempack + 9); + _v3_channela = *(data_im_ptr + v3_pos * elempack + 10); + _v3_channelb = *(data_im_ptr + v3_pos * elempack + 11); + _v3_channelc = *(data_im_ptr + v3_pos * elempack + 12); + _v3_channeld = *(data_im_ptr + v3_pos * elempack + 13); + _v3_channele = *(data_im_ptr + v3_pos * elempack + 14); + _v3_channelf = *(data_im_ptr + v3_pos * elempack + 15); + } + if (v4_cond) + { + _v4_channel0 = *(data_im_ptr + v4_pos * elempack); + _v4_channel1 = *(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = *(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = *(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = *(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = *(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = *(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = *(data_im_ptr + v4_pos * elempack + 7); + _v4_channel8 = *(data_im_ptr + v4_pos * elempack + 8); + _v4_channel9 = *(data_im_ptr + v4_pos * elempack + 9); + _v4_channela = *(data_im_ptr + v4_pos * elempack + 10); + 
_v4_channelb = *(data_im_ptr + v4_pos * elempack + 11); + _v4_channelc = *(data_im_ptr + v4_pos * elempack + 12); + _v4_channeld = *(data_im_ptr + v4_pos * elempack + 13); + _v4_channele = *(data_im_ptr + v4_pos * elempack + 14); + _v4_channelf = *(data_im_ptr + v4_pos * elempack + 15); + } + _val_channel0 = w1 * _v1_channel0 + w2 * _v2_channel0 + w3 * _v3_channel0 + w4 * _v4_channel0; + _val_channel1 = w1 * _v1_channel1 + w2 * _v2_channel1 + w3 * _v3_channel1 + w4 * _v4_channel1; + _val_channel2 = w1 * _v1_channel2 + w2 * _v2_channel2 + w3 * _v3_channel2 + w4 * _v4_channel2; + _val_channel3 = w1 * _v1_channel3 + w2 * _v2_channel3 + w3 * _v3_channel3 + w4 * _v4_channel3; + _val_channel4 = w1 * _v1_channel4 + w2 * _v2_channel4 + w3 * _v3_channel4 + w4 * _v4_channel4; + _val_channel5 = w1 * _v1_channel5 + w2 * _v2_channel5 + w3 * _v3_channel5 + w4 * _v4_channel5; + _val_channel6 = w1 * _v1_channel6 + w2 * _v2_channel6 + w3 * _v3_channel6 + w4 * _v4_channel6; + _val_channel7 = w1 * _v1_channel7 + w2 * _v2_channel7 + w3 * _v3_channel7 + w4 * _v4_channel7; + _val_channel8 = w1 * _v1_channel8 + w2 * _v2_channel8 + w3 * _v3_channel8 + w4 * _v4_channel8; + _val_channel9 = w1 * _v1_channel9 + w2 * _v2_channel9 + w3 * _v3_channel9 + w4 * _v4_channel9; + _val_channela = w1 * _v1_channela + w2 * _v2_channela + w3 * _v3_channela + w4 * _v4_channela; + _val_channelb = w1 * _v1_channelb + w2 * _v2_channelb + w3 * _v3_channelb + w4 * _v4_channelb; + _val_channelc = w1 * _v1_channelc + w2 * _v2_channelc + w3 * _v3_channelc + w4 * _v4_channelc; + _val_channeld = w1 * _v1_channeld + w2 * _v2_channeld + w3 * _v3_channeld + w4 * _v4_channeld; + _val_channele = w1 * _v1_channele + w2 * _v2_channele + w3 * _v3_channele + w4 * _v4_channele; + _val_channelf = w1 * _v1_channelf + w2 * _v2_channelf + w3 * _v3_channelf + w4 * _v4_channelf; + } + if (has_mask) + { + _val_channel0 *= mask_; + _val_channel1 *= mask_; + _val_channel2 *= mask_; + _val_channel3 *= mask_; + _val_channel4 *= mask_; + _val_channel5 *= mask_; + _val_channel6 *= mask_; + _val_channel7 *= mask_; + _val_channel8 *= mask_; + _val_channel9 *= mask_; + _val_channela *= mask_; + _val_channelb *= mask_; + _val_channelc *= mask_; + _val_channeld *= mask_; + _val_channele *= mask_; + _val_channelf *= mask_; + } + float _conv_w0 = *(kptr); + float _conv_w1 = *(kptr + out_elempack); // 1 * out_elempack + _sum += (_val_channel0 * _conv_w0); + _sum += (_val_channel1 * _conv_w1); + float _conv_w2 = *(kptr + 2); // 2 * out_elempack + float _conv_w3 = *(kptr + 3); // 3 * out_elempack + _sum += (_val_channel2 * _conv_w2); + _sum += (_val_channel3 * _conv_w3); + float _conv_w4 = *(kptr + 4); // 4 * out_elempack + float _conv_w5 = *(kptr + 5); // 5 * out_elempack + _sum += (_val_channel4 * _conv_w4); + _sum += (_val_channel5 * _conv_w5); + float _conv_w6 = *(kptr + 6); // 6 * out_elempack + float _conv_w7 = *(kptr + 7); // 7 * out_elempack + _sum += (_val_channel6 * _conv_w6); + _sum += (_val_channel7 * _conv_w7); + float _conv_w8 = *(kptr + 8); // 8 * out_elempack + float _conv_w9 = *(kptr + 9); // 9 * out_elempack + _sum += (_val_channel8 * _conv_w8); + _sum += (_val_channel9 * _conv_w9); + float _conv_wa = *(kptr + 10); // 10 * out_elempack + float _conv_wb = *(kptr + 11); // 11 * out_elempack + _sum += (_val_channela * _conv_wa); + _sum += (_val_channelb * _conv_wb); + float _conv_wc = *(kptr + 12); // 12 * out_elempack + float _conv_wd = *(kptr + 13); // 13 * out_elempack + _sum += (_val_channelc * _conv_wc); + _sum += (_val_channeld * _conv_wd); + float 
_conv_we = *(kptr + 14); // 14 * out_elempack + float _conv_wf = *(kptr + 15); // 15 * out_elempack + _sum += (_val_channele * _conv_we); + _sum += (_val_channelf * _conv_wf); + kptr += wstep; + } + } + } + _sum = activation_ss(_sum, activation_type, activation_params); + *(outptr + h_col * outw + w_col) = _sum; + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack16to4.h b/src/layer/x86/deformableconv2d_pack16to4.h new file mode 100644 index 000000000000..a75e26ec8cf4 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack16to4.h @@ -0,0 +1,435 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack16to4_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 16; + const int out_elempack = 4; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m128 _sum = _mm_setzero_ps(); + if (bias_data_ptr) + _sum = _mm_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = 
bottom_blob.channel(ic); + __m128 _val_channel0 = _mm_loadu_ps(zeros_ptr); + __m128 _val_channel1 = _val_channel0; + __m128 _val_channel2 = _val_channel0; + __m128 _val_channel3 = _val_channel0; + __m128 _val_channel4 = _val_channel0; + __m128 _val_channel5 = _val_channel0; + __m128 _val_channel6 = _val_channel0; + __m128 _val_channel7 = _val_channel0; + __m128 _val_channel8 = _val_channel0; + __m128 _val_channel9 = _val_channel0; + __m128 _val_channela = _val_channel0; + __m128 _val_channelb = _val_channel0; + __m128 _val_channelc = _val_channel0; + __m128 _val_channeld = _val_channel0; + __m128 _val_channele = _val_channel0; + __m128 _val_channelf = _val_channel0; + if (cond) + { + __m128 _v1_channel0 = _val_channel0; + __m128 _v1_channel1 = _val_channel0; + __m128 _v1_channel2 = _val_channel0; + __m128 _v1_channel3 = _val_channel0; + __m128 _v1_channel4 = _val_channel0; + __m128 _v1_channel5 = _val_channel0; + __m128 _v1_channel6 = _val_channel0; + __m128 _v1_channel7 = _val_channel0; + __m128 _v1_channel8 = _val_channel0; + __m128 _v1_channel9 = _val_channel0; + __m128 _v1_channela = _val_channel0; + __m128 _v1_channelb = _val_channel0; + __m128 _v1_channelc = _val_channel0; + __m128 _v1_channeld = _val_channel0; + __m128 _v1_channele = _val_channel0; + __m128 _v1_channelf = _val_channel0; + __m128 _v2_channel0 = _val_channel0; + __m128 _v2_channel1 = _val_channel0; + __m128 _v2_channel2 = _val_channel0; + __m128 _v2_channel3 = _val_channel0; + __m128 _v2_channel4 = _val_channel0; + __m128 _v2_channel5 = _val_channel0; + __m128 _v2_channel6 = _val_channel0; + __m128 _v2_channel7 = _val_channel0; + __m128 _v2_channel8 = _val_channel0; + __m128 _v2_channel9 = _val_channel0; + __m128 _v2_channela = _val_channel0; + __m128 _v2_channelb = _val_channel0; + __m128 _v2_channelc = _val_channel0; + __m128 _v2_channeld = _val_channel0; + __m128 _v2_channele = _val_channel0; + __m128 _v2_channelf = _val_channel0; + __m128 _v3_channel0 = _val_channel0; + __m128 _v3_channel1 = _val_channel0; + __m128 _v3_channel2 = _val_channel0; + __m128 _v3_channel3 = _val_channel0; + __m128 _v3_channel4 = _val_channel0; + __m128 _v3_channel5 = _val_channel0; + __m128 _v3_channel6 = _val_channel0; + __m128 _v3_channel7 = _val_channel0; + __m128 _v3_channel8 = _val_channel0; + __m128 _v3_channel9 = _val_channel0; + __m128 _v3_channela = _val_channel0; + __m128 _v3_channelb = _val_channel0; + __m128 _v3_channelc = _val_channel0; + __m128 _v3_channeld = _val_channel0; + __m128 _v3_channele = _val_channel0; + __m128 _v3_channelf = _val_channel0; + __m128 _v4_channel0 = _val_channel0; + __m128 _v4_channel1 = _val_channel0; + __m128 _v4_channel2 = _val_channel0; + __m128 _v4_channel3 = _val_channel0; + __m128 _v4_channel4 = _val_channel0; + __m128 _v4_channel5 = _val_channel0; + __m128 _v4_channel6 = _val_channel0; + __m128 _v4_channel7 = _val_channel0; + __m128 _v4_channel8 = _val_channel0; + __m128 _v4_channel9 = _val_channel0; + __m128 _v4_channela = _val_channel0; + __m128 _v4_channelb = _val_channel0; + __m128 _v4_channelc = _val_channel0; + __m128 _v4_channeld = _val_channel0; + __m128 _v4_channele = _val_channel0; + __m128 _v4_channelf = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm_load1_ps(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 4); + 
_v1_channel5 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 7); + _v1_channel8 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 8); + _v1_channel9 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 9); + _v1_channela = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 10); + _v1_channelb = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 11); + _v1_channelc = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 12); + _v1_channeld = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 13); + _v1_channele = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 14); + _v1_channelf = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 15); + } + if (v2_cond) + { + _v2_channel0 = _mm_load1_ps(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 7); + _v2_channel8 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 8); + _v2_channel9 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 9); + _v2_channela = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 10); + _v2_channelb = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 11); + _v2_channelc = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 12); + _v2_channeld = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 13); + _v2_channele = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 14); + _v2_channelf = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 15); + } + if (v3_cond) + { + _v3_channel0 = _mm_load1_ps(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 7); + _v3_channel8 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 8); + _v3_channel9 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 9); + _v3_channela = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 10); + _v3_channelb = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 11); + _v3_channelc = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 12); + _v3_channeld = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 13); + _v3_channele = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 14); + _v3_channelf = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 15); + } + if (v4_cond) + { + _v4_channel0 = _mm_load1_ps(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 7); + 
_v4_channel8 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 8); + _v4_channel9 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 9); + _v4_channela = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 10); + _v4_channelb = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 11); + _v4_channelc = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 12); + _v4_channeld = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 13); + _v4_channele = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 14); + _v4_channelf = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 15); + } + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val_channel0 = _mm_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm_comp_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm_comp_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm_comp_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm_comp_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v4_channel7, _w4, _val_channel7); + _val_channel8 = _mm_comp_fmadd_ps(_v1_channel8, _w1, _val_channel8); + _val_channel8 = _mm_comp_fmadd_ps(_v2_channel8, _w2, _val_channel8); + _val_channel8 = _mm_comp_fmadd_ps(_v3_channel8, _w3, _val_channel8); + _val_channel8 = _mm_comp_fmadd_ps(_v4_channel8, _w4, _val_channel8); + _val_channel9 = _mm_comp_fmadd_ps(_v1_channel9, _w1, _val_channel9); + _val_channel9 = _mm_comp_fmadd_ps(_v2_channel9, _w2, _val_channel9); + _val_channel9 = _mm_comp_fmadd_ps(_v3_channel9, _w3, _val_channel9); + _val_channel9 = _mm_comp_fmadd_ps(_v4_channel9, _w4, _val_channel9); + 
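+ // the remaining input channels a..f below pick up the same four-tap blend w1*v1 + w2*v2 + w3*v3 + w4*v4 via fused multiply-adds.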
_val_channela = _mm_comp_fmadd_ps(_v1_channela, _w1, _val_channela); + _val_channela = _mm_comp_fmadd_ps(_v2_channela, _w2, _val_channela); + _val_channela = _mm_comp_fmadd_ps(_v3_channela, _w3, _val_channela); + _val_channela = _mm_comp_fmadd_ps(_v4_channela, _w4, _val_channela); + _val_channelb = _mm_comp_fmadd_ps(_v1_channelb, _w1, _val_channelb); + _val_channelb = _mm_comp_fmadd_ps(_v2_channelb, _w2, _val_channelb); + _val_channelb = _mm_comp_fmadd_ps(_v3_channelb, _w3, _val_channelb); + _val_channelb = _mm_comp_fmadd_ps(_v4_channelb, _w4, _val_channelb); + _val_channelc = _mm_comp_fmadd_ps(_v1_channelc, _w1, _val_channelc); + _val_channelc = _mm_comp_fmadd_ps(_v2_channelc, _w2, _val_channelc); + _val_channelc = _mm_comp_fmadd_ps(_v3_channelc, _w3, _val_channelc); + _val_channelc = _mm_comp_fmadd_ps(_v4_channelc, _w4, _val_channelc); + _val_channeld = _mm_comp_fmadd_ps(_v1_channeld, _w1, _val_channeld); + _val_channeld = _mm_comp_fmadd_ps(_v2_channeld, _w2, _val_channeld); + _val_channeld = _mm_comp_fmadd_ps(_v3_channeld, _w3, _val_channeld); + _val_channeld = _mm_comp_fmadd_ps(_v4_channeld, _w4, _val_channeld); + _val_channele = _mm_comp_fmadd_ps(_v1_channele, _w1, _val_channele); + _val_channele = _mm_comp_fmadd_ps(_v2_channele, _w2, _val_channele); + _val_channele = _mm_comp_fmadd_ps(_v3_channele, _w3, _val_channele); + _val_channele = _mm_comp_fmadd_ps(_v4_channele, _w4, _val_channele); + _val_channelf = _mm_comp_fmadd_ps(_v1_channelf, _w1, _val_channelf); + _val_channelf = _mm_comp_fmadd_ps(_v2_channelf, _w2, _val_channelf); + _val_channelf = _mm_comp_fmadd_ps(_v3_channelf, _w3, _val_channelf); + _val_channelf = _mm_comp_fmadd_ps(_v4_channelf, _w4, _val_channelf); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val_channel0 = _mm_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm_mul_ps(_val_channel7, _mask); + _val_channel8 = _mm_mul_ps(_val_channel8, _mask); + _val_channel9 = _mm_mul_ps(_val_channel9, _mask); + _val_channela = _mm_mul_ps(_val_channela, _mask); + _val_channelb = _mm_mul_ps(_val_channelb, _mask); + _val_channelc = _mm_mul_ps(_val_channelc, _mask); + _val_channeld = _mm_mul_ps(_val_channeld, _mask); + _val_channele = _mm_mul_ps(_val_channele, _mask); + _val_channelf = _mm_mul_ps(_val_channelf, _mask); + } + __m128 _conv_w0 = _mm_load_ps(kptr); + __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m128 _conv_w2 = _mm_load_ps(kptr + 8); // 2 * out_elempack + __m128 _conv_w3 = _mm_load_ps(kptr + 12); // 3 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m128 _conv_w4 = _mm_load_ps(kptr + 16); // 4 * out_elempack + __m128 _conv_w5 = _mm_load_ps(kptr + 20); // 5 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m128 _conv_w6 = _mm_load_ps(kptr + 24); // 6 * out_elempack + __m128 _conv_w7 = _mm_load_ps(kptr + 28); // 7 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel7, 
_conv_w7, _sum); + __m128 _conv_w8 = _mm_load_ps(kptr + 32); // 8 * out_elempack + __m128 _conv_w9 = _mm_load_ps(kptr + 36); // 9 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel8, _conv_w8, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel9, _conv_w9, _sum); + __m128 _conv_wa = _mm_load_ps(kptr + 40); // 10 * out_elempack + __m128 _conv_wb = _mm_load_ps(kptr + 44); // 11 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channela, _conv_wa, _sum); + _sum = _mm_comp_fmadd_ps(_val_channelb, _conv_wb, _sum); + __m128 _conv_wc = _mm_load_ps(kptr + 48); // 12 * out_elempack + __m128 _conv_wd = _mm_load_ps(kptr + 52); // 13 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channelc, _conv_wc, _sum); + _sum = _mm_comp_fmadd_ps(_val_channeld, _conv_wd, _sum); + __m128 _conv_we = _mm_load_ps(kptr + 56); // 14 * out_elempack + __m128 _conv_wf = _mm_load_ps(kptr + 60); // 15 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channele, _conv_we, _sum); + _sum = _mm_comp_fmadd_ps(_val_channelf, _conv_wf, _sum); + kptr += wstep; + } + } + } + _sum = activation_sse(_sum, activation_type, activation_params); + _mm_storeu_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack16to8.h b/src/layer/x86/deformableconv2d_pack16to8.h new file mode 100644 index 000000000000..f44fc9ad0c8e --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack16to8.h @@ -0,0 +1,435 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack16to8_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 16; + const int out_elempack = 8; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m256 _sum = _mm256_setzero_ps(); + if (bias_data_ptr) + _sum = _mm256_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + 
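+ // the scalar bilinear weights w1..w4 and the modulation mask are replicated into 8-wide arrays here so the per-channel loop can apply them with 256-bit multiply-adds.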
const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val_channel0 = _mm256_loadu_ps(zeros_ptr); + __m256 _val_channel1 = _val_channel0; + __m256 _val_channel2 = _val_channel0; + __m256 _val_channel3 = _val_channel0; + __m256 _val_channel4 = _val_channel0; + __m256 _val_channel5 = _val_channel0; + __m256 _val_channel6 = _val_channel0; + __m256 _val_channel7 = _val_channel0; + __m256 _val_channel8 = _val_channel0; + __m256 _val_channel9 = _val_channel0; + __m256 _val_channela = _val_channel0; + __m256 _val_channelb = _val_channel0; + __m256 _val_channelc = _val_channel0; + __m256 _val_channeld = _val_channel0; + __m256 _val_channele = _val_channel0; + __m256 _val_channelf = _val_channel0; + if (cond) + { + __m256 _v1_channel0 = _val_channel0; + __m256 _v1_channel1 = _val_channel0; + __m256 _v1_channel2 = _val_channel0; + __m256 _v1_channel3 = _val_channel0; + __m256 _v1_channel4 = _val_channel0; + __m256 _v1_channel5 = _val_channel0; + __m256 _v1_channel6 = _val_channel0; + __m256 _v1_channel7 = _val_channel0; + __m256 _v1_channel8 = _val_channel0; + __m256 _v1_channel9 = _val_channel0; + __m256 _v1_channela = _val_channel0; + __m256 _v1_channelb = _val_channel0; + __m256 _v1_channelc = _val_channel0; + __m256 _v1_channeld = _val_channel0; + __m256 _v1_channele = _val_channel0; + __m256 _v1_channelf = _val_channel0; + __m256 _v2_channel0 = _val_channel0; + __m256 _v2_channel1 = _val_channel0; + __m256 _v2_channel2 = _val_channel0; + __m256 _v2_channel3 = _val_channel0; + __m256 _v2_channel4 = _val_channel0; + __m256 _v2_channel5 = _val_channel0; + __m256 _v2_channel6 = _val_channel0; + __m256 _v2_channel7 = _val_channel0; + __m256 _v2_channel8 = _val_channel0; + __m256 _v2_channel9 = _val_channel0; + __m256 _v2_channela = _val_channel0; + __m256 _v2_channelb = _val_channel0; + __m256 _v2_channelc = _val_channel0; + __m256 _v2_channeld = _val_channel0; + __m256 _v2_channele = _val_channel0; + __m256 _v2_channelf = _val_channel0; + __m256 _v3_channel0 = _val_channel0; + __m256 _v3_channel1 = _val_channel0; + __m256 _v3_channel2 = _val_channel0; + __m256 _v3_channel3 = _val_channel0; + __m256 _v3_channel4 = _val_channel0; + __m256 _v3_channel5 = _val_channel0; + __m256 _v3_channel6 = _val_channel0; + __m256 _v3_channel7 = _val_channel0; + __m256 _v3_channel8 = _val_channel0; + __m256 _v3_channel9 = _val_channel0; + __m256 _v3_channela = _val_channel0; + __m256 _v3_channelb = _val_channel0; + __m256 _v3_channelc = _val_channel0; + __m256 _v3_channeld = _val_channel0; + __m256 _v3_channele = _val_channel0; + __m256 _v3_channelf = _val_channel0; + __m256 _v4_channel0 = _val_channel0; + __m256 _v4_channel1 = _val_channel0; + __m256 _v4_channel2 = _val_channel0; + __m256 _v4_channel3 = _val_channel0; + __m256 _v4_channel4 = _val_channel0; + __m256 _v4_channel5 = _val_channel0; + __m256 _v4_channel6 = _val_channel0; + __m256 _v4_channel7 = _val_channel0; + __m256 _v4_channel8 = _val_channel0; + __m256 _v4_channel9 = _val_channel0; + __m256 _v4_channela = _val_channel0; + __m256 _v4_channelb = _val_channel0; + __m256 _v4_channelc = _val_channel0; + __m256 _v4_channeld = _val_channel0; + __m256 _v4_channele = _val_channel0; + __m256 _v4_channelf = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = 
_mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 7); + _v1_channel8 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 8); + _v1_channel9 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 9); + _v1_channela = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 10); + _v1_channelb = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 11); + _v1_channelc = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 12); + _v1_channeld = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 13); + _v1_channele = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 14); + _v1_channelf = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 15); + } + if (v2_cond) + { + _v2_channel0 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 7); + _v2_channel8 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 8); + _v2_channel9 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 9); + _v2_channela = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 10); + _v2_channelb = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 11); + _v2_channelc = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 12); + _v2_channeld = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 13); + _v2_channele = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 14); + _v2_channelf = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 15); + } + if (v3_cond) + { + _v3_channel0 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 7); + _v3_channel8 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 8); + _v3_channel9 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 9); + _v3_channela = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 10); + _v3_channelb = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 11); + _v3_channelc = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 12); + _v3_channeld = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 13); + _v3_channele = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 14); + _v3_channelf = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 15); + } + if (v4_cond) + { + _v4_channel0 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack); + _v4_channel1 = 
_mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 7); + _v4_channel8 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 8); + _v4_channel9 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 9); + _v4_channela = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 10); + _v4_channelb = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 11); + _v4_channelc = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 12); + _v4_channeld = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 13); + _v4_channele = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 14); + _v4_channelf = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 15); + } + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val_channel0 = _mm256_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm256_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm256_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm256_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm256_comp_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm256_comp_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm256_comp_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm256_comp_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v3_channel7, 
_w3, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v4_channel7, _w4, _val_channel7); + _val_channel8 = _mm256_comp_fmadd_ps(_v1_channel8, _w1, _val_channel8); + _val_channel8 = _mm256_comp_fmadd_ps(_v2_channel8, _w2, _val_channel8); + _val_channel8 = _mm256_comp_fmadd_ps(_v3_channel8, _w3, _val_channel8); + _val_channel8 = _mm256_comp_fmadd_ps(_v4_channel8, _w4, _val_channel8); + _val_channel9 = _mm256_comp_fmadd_ps(_v1_channel9, _w1, _val_channel9); + _val_channel9 = _mm256_comp_fmadd_ps(_v2_channel9, _w2, _val_channel9); + _val_channel9 = _mm256_comp_fmadd_ps(_v3_channel9, _w3, _val_channel9); + _val_channel9 = _mm256_comp_fmadd_ps(_v4_channel9, _w4, _val_channel9); + _val_channela = _mm256_comp_fmadd_ps(_v1_channela, _w1, _val_channela); + _val_channela = _mm256_comp_fmadd_ps(_v2_channela, _w2, _val_channela); + _val_channela = _mm256_comp_fmadd_ps(_v3_channela, _w3, _val_channela); + _val_channela = _mm256_comp_fmadd_ps(_v4_channela, _w4, _val_channela); + _val_channelb = _mm256_comp_fmadd_ps(_v1_channelb, _w1, _val_channelb); + _val_channelb = _mm256_comp_fmadd_ps(_v2_channelb, _w2, _val_channelb); + _val_channelb = _mm256_comp_fmadd_ps(_v3_channelb, _w3, _val_channelb); + _val_channelb = _mm256_comp_fmadd_ps(_v4_channelb, _w4, _val_channelb); + _val_channelc = _mm256_comp_fmadd_ps(_v1_channelc, _w1, _val_channelc); + _val_channelc = _mm256_comp_fmadd_ps(_v2_channelc, _w2, _val_channelc); + _val_channelc = _mm256_comp_fmadd_ps(_v3_channelc, _w3, _val_channelc); + _val_channelc = _mm256_comp_fmadd_ps(_v4_channelc, _w4, _val_channelc); + _val_channeld = _mm256_comp_fmadd_ps(_v1_channeld, _w1, _val_channeld); + _val_channeld = _mm256_comp_fmadd_ps(_v2_channeld, _w2, _val_channeld); + _val_channeld = _mm256_comp_fmadd_ps(_v3_channeld, _w3, _val_channeld); + _val_channeld = _mm256_comp_fmadd_ps(_v4_channeld, _w4, _val_channeld); + _val_channele = _mm256_comp_fmadd_ps(_v1_channele, _w1, _val_channele); + _val_channele = _mm256_comp_fmadd_ps(_v2_channele, _w2, _val_channele); + _val_channele = _mm256_comp_fmadd_ps(_v3_channele, _w3, _val_channele); + _val_channele = _mm256_comp_fmadd_ps(_v4_channele, _w4, _val_channele); + _val_channelf = _mm256_comp_fmadd_ps(_v1_channelf, _w1, _val_channelf); + _val_channelf = _mm256_comp_fmadd_ps(_v2_channelf, _w2, _val_channelf); + _val_channelf = _mm256_comp_fmadd_ps(_v3_channelf, _w3, _val_channelf); + _val_channelf = _mm256_comp_fmadd_ps(_v4_channelf, _w4, _val_channelf); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val_channel0 = _mm256_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm256_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm256_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm256_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm256_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm256_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm256_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm256_mul_ps(_val_channel7, _mask); + _val_channel8 = _mm256_mul_ps(_val_channel8, _mask); + _val_channel9 = _mm256_mul_ps(_val_channel9, _mask); + _val_channela = _mm256_mul_ps(_val_channela, _mask); + _val_channelb = _mm256_mul_ps(_val_channelb, _mask); + _val_channelc = _mm256_mul_ps(_val_channelc, _mask); + _val_channeld = _mm256_mul_ps(_val_channeld, _mask); + _val_channele = _mm256_mul_ps(_val_channele, _mask); + _val_channelf = _mm256_mul_ps(_val_channelf, _mask); + } + __m256 _conv_w0 = _mm256_load_ps(kptr); + __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = 
_mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m256 _conv_w2 = _mm256_load_ps(kptr + 16); // 2 * out_elempack + __m256 _conv_w3 = _mm256_load_ps(kptr + 24); // 3 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m256 _conv_w4 = _mm256_load_ps(kptr + 32); // 4 * out_elempack + __m256 _conv_w5 = _mm256_load_ps(kptr + 40); // 5 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m256 _conv_w6 = _mm256_load_ps(kptr + 48); // 6 * out_elempack + __m256 _conv_w7 = _mm256_load_ps(kptr + 56); // 7 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel7, _conv_w7, _sum); + __m256 _conv_w8 = _mm256_load_ps(kptr + 64); // 8 * out_elempack + __m256 _conv_w9 = _mm256_load_ps(kptr + 72); // 9 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel8, _conv_w8, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel9, _conv_w9, _sum); + __m256 _conv_wa = _mm256_load_ps(kptr + 80); // 10 * out_elempack + __m256 _conv_wb = _mm256_load_ps(kptr + 88); // 11 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channela, _conv_wa, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channelb, _conv_wb, _sum); + __m256 _conv_wc = _mm256_load_ps(kptr + 96); // 12 * out_elempack + __m256 _conv_wd = _mm256_load_ps(kptr + 104); // 13 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channelc, _conv_wc, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channeld, _conv_wd, _sum); + __m256 _conv_we = _mm256_load_ps(kptr + 112); // 14 * out_elempack + __m256 _conv_wf = _mm256_load_ps(kptr + 120); // 15 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channele, _conv_we, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channelf, _conv_wf, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx(_sum, activation_type, activation_params); + _mm256_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack1to16.h b/src/layer/x86/deformableconv2d_pack1to16.h new file mode 100644 index 000000000000..b50e787e9c8e --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack1to16.h @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
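+// Illustrative reference only -- a scalar sketch of the per-tap bilinear sampling that the
+// deformableconv2d_pack* kernels in this patch perform lane by lane. It is not called by the
+// kernel below, its name is invented for documentation, and it assumes the plain row-major
+// pack-1 input layout that this pack1to16 kernel reads.
+// static inline float deformableconv2d_bilinear_sample_ref(const float* im, int w, int h, float h_im, float w_im)
+// {
+//     // outside the (-1, h) x (-1, w) window the tap contributes nothing
+//     if (!(h_im > -1 && w_im > -1 && h_im < h && w_im < w))
+//         return 0.f;
+//     int h_low = (int)floor(h_im);
+//     int w_low = (int)floor(w_im);
+//     int h_high = h_low + 1;
+//     int w_high = w_low + 1;
+//     float lh = h_im - h_low;
+//     float lw = w_im - w_low;
+//     float hh = 1.f - lh;
+//     float hw = 1.f - lw;
+//     // corner samples are zero when they fall off the image, matching the v*_cond checks in the kernels
+//     float v1 = (h_low >= 0 && w_low >= 0) ? im[h_low * w + w_low] : 0.f;
+//     float v2 = (h_low >= 0 && w_high <= w - 1) ? im[h_low * w + w_high] : 0.f;
+//     float v3 = (h_high <= h - 1 && w_low >= 0) ? im[h_high * w + w_low] : 0.f;
+//     float v4 = (h_high <= h - 1 && w_high <= w - 1) ? im[h_high * w + w_high] : 0.f;
+//     return hh * hw * v1 + hh * lw * v2 + lh * hw * v3 + lh * lw * v4;
+// }
+// The kernel below inlines this logic, guarding each corner load with the v*_cond flags and folding
+// the weights into SIMD multiply-adds across the packed output lanes.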
+ +static void deformableconv2d_pack1to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 1; + const int out_elempack = 16; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m512 _sum = _mm512_setzero_ps(); + if (bias_data_ptr) + _sum = _mm512_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; 
+ if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val_channel0 = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1_channel0 = _val_channel0; + __m512 _v2_channel0 = _val_channel0; + __m512 _v3_channel0 = _val_channel0; + __m512 _v4_channel0 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm512_set1_ps(data_im_ptr[v1_pos]); + } + if (v2_cond) + { + _v2_channel0 = _mm512_set1_ps(data_im_ptr[v2_pos]); + } + if (v3_cond) + { + _v3_channel0 = _mm512_set1_ps(data_im_ptr[v3_pos]); + } + if (v4_cond) + { + _v4_channel0 = _mm512_set1_ps(data_im_ptr[v4_pos]); + } + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val_channel0 = _mm512_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v4_channel0, _w4, _val_channel0); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val_channel0 = _mm512_mul_ps(_val_channel0, _mask); + } + __m512 _conv_w0 = _mm512_load_ps(kptr); + _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx512(_sum, activation_type, activation_params); + _mm512_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack1to4.h b/src/layer/x86/deformableconv2d_pack1to4.h new file mode 100644 index 000000000000..0388111306fa --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack1to4.h @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
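Every kernel in this patch repeats the same bilinear sampling with zero padding before the packed multiply-accumulate: the sampling point is `h_in + i * dilation_h + offset_h` / `w_in + j * dilation_w + offset_w`, anything outside the open band (-1, h) x (-1, w) reads as zero, and each of the four corner taps is dropped individually when it falls off the image. A scalar restatement, offered only as a sketch (`im` is a hypothetical w x h single-channel plane, not a name from this patch):

    #include <math.h>

    // Scalar reference for the sampling logic that the v1..v4 / w1..w4 blocks above vectorize.
    static float bilinear_sample_zeropad(const float* im, int w, int h, float w_im, float h_im)
    {
        // outside the (-1, w) x (-1, h) band the whole tap is zero
        if (!(h_im > -1 && w_im > -1 && h_im < h && w_im < w))
            return 0.f;

        int h_low = (int)floorf(h_im);
        int w_low = (int)floorf(w_im);
        int h_high = h_low + 1;
        int w_high = w_low + 1;

        float lh = h_im - h_low;
        float lw = w_im - w_low;
        float hh = 1.f - lh;
        float hw = 1.f - lw;

        // corners that fall off the image contribute zero (the v1_cond..v4_cond checks above)
        float v1 = (h_low >= 0 && w_low >= 0) ? im[h_low * w + w_low] : 0.f;
        float v2 = (h_low >= 0 && w_high <= w - 1) ? im[h_low * w + w_high] : 0.f;
        float v3 = (h_high <= h - 1 && w_low >= 0) ? im[h_high * w + w_low] : 0.f;
        float v4 = (h_high <= h - 1 && w_high <= w - 1) ? im[h_high * w + w_high] : 0.f;

        return hh * hw * v1 + hh * lw * v2 + lh * hw * v3 + lh * lw * v4;
    }

The optional modulation mask then scales this sampled value, which is why the `_mm*_mul_ps(_val_channelN, _mask)` step sits between the sampling and the weight FMA in each kernel.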
+ +static void deformableconv2d_pack1to4_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 1; + const int out_elempack = 4; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m128 _sum = _mm_setzero_ps(); + if (bias_data_ptr) + _sum = _mm_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 
= hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val_channel0 = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1_channel0 = _val_channel0; + __m128 _v2_channel0 = _val_channel0; + __m128 _v3_channel0 = _val_channel0; + __m128 _v4_channel0 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm_load1_ps(data_im_ptr + v1_pos); + } + if (v2_cond) + { + _v2_channel0 = _mm_load1_ps(data_im_ptr + v2_pos); + } + if (v3_cond) + { + _v3_channel0 = _mm_load1_ps(data_im_ptr + v3_pos); + } + if (v4_cond) + { + _v4_channel0 = _mm_load1_ps(data_im_ptr + v4_pos); + } + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val_channel0 = _mm_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val_channel0 = _mm_mul_ps(_val_channel0, _mask); + } + __m128 _conv_w0 = _mm_load_ps(kptr); + _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + kptr += wstep; + } + } + } + _sum = activation_sse(_sum, activation_type, activation_params); + _mm_storeu_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack1to8.h b/src/layer/x86/deformableconv2d_pack1to8.h new file mode 100644 index 000000000000..fe1e0c8c0a68 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack1to8.h @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack1to8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 1; + const int out_elempack = 8; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m256 _sum = _mm256_setzero_ps(); + if (bias_data_ptr) + _sum = _mm256_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const 
float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val_channel0 = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1_channel0 = _val_channel0; + __m256 _v2_channel0 = _val_channel0; + __m256 _v3_channel0 = _val_channel0; + __m256 _v4_channel0 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm256_broadcast_ss(data_im_ptr + v1_pos); + } + if (v2_cond) + { + _v2_channel0 = _mm256_broadcast_ss(data_im_ptr + v2_pos); + } + if (v3_cond) + { + _v3_channel0 = _mm256_broadcast_ss(data_im_ptr + v3_pos); + } + if (v4_cond) + { + _v4_channel0 = _mm256_broadcast_ss(data_im_ptr + v4_pos); + } + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val_channel0 = _mm256_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val_channel0 = _mm256_mul_ps(_val_channel0, _mask); + } + __m256 _conv_w0 = _mm256_load_ps(kptr); + _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx(_sum, activation_type, activation_params); + _mm256_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack4.h b/src/layer/x86/deformableconv2d_pack4.h new file mode 100644 index 000000000000..32b27963fb16 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack4.h @@ -0,0 +1,243 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack4_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 4; + const int out_elempack = 4; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m128 _sum = _mm_setzero_ps(); + if (bias_data_ptr) + _sum = _mm_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = 
bottom_blob.channel(ic); + __m128 _val_channel0 = _mm_loadu_ps(zeros_ptr); + __m128 _val_channel1 = _val_channel0; + __m128 _val_channel2 = _val_channel0; + __m128 _val_channel3 = _val_channel0; + if (cond) + { + __m128 _v1_channel0 = _val_channel0; + __m128 _v1_channel1 = _val_channel0; + __m128 _v1_channel2 = _val_channel0; + __m128 _v1_channel3 = _val_channel0; + __m128 _v2_channel0 = _val_channel0; + __m128 _v2_channel1 = _val_channel0; + __m128 _v2_channel2 = _val_channel0; + __m128 _v2_channel3 = _val_channel0; + __m128 _v3_channel0 = _val_channel0; + __m128 _v3_channel1 = _val_channel0; + __m128 _v3_channel2 = _val_channel0; + __m128 _v3_channel3 = _val_channel0; + __m128 _v4_channel0 = _val_channel0; + __m128 _v4_channel1 = _val_channel0; + __m128 _v4_channel2 = _val_channel0; + __m128 _v4_channel3 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm_load1_ps(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 3); + } + if (v2_cond) + { + _v2_channel0 = _mm_load1_ps(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 3); + } + if (v3_cond) + { + _v3_channel0 = _mm_load1_ps(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 3); + } + if (v4_cond) + { + _v4_channel0 = _mm_load1_ps(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 3); + } + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val_channel0 = _mm_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val_channel0 = _mm_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm_mul_ps(_val_channel3, 
_mask); + } + __m128 _conv_w0 = _mm_load_ps(kptr); + __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m128 _conv_w2 = _mm_load_ps(kptr + 8); // 2 * out_elempack + __m128 _conv_w3 = _mm_load_ps(kptr + 12); // 3 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + kptr += wstep; + } + } + } + _sum = activation_sse(_sum, activation_type, activation_params); + _mm_storeu_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack4to1.h b/src/layer/x86/deformableconv2d_pack4to1.h new file mode 100644 index 000000000000..7ee073a91cb5 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack4to1.h @@ -0,0 +1,211 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack4to1_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 4; + const int out_elempack = 1; + const int wstep = out_elempack * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + float _sum = 0.f; + if (bias_data_ptr) + _sum = *(bias_data_ptr + oc); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + float _val_channel0 = 0.f; + float _val_channel1 = _val_channel0; + float _val_channel2 = _val_channel0; + float _val_channel3 = _val_channel0; + if (cond) + { + float _v1_channel0 = _val_channel0; + float _v1_channel1 = _val_channel0; + float _v1_channel2 = _val_channel0; + float _v1_channel3 = _val_channel0; + float _v2_channel0 = _val_channel0; + float _v2_channel1 = _val_channel0; + float _v2_channel2 = _val_channel0; + float _v2_channel3 = _val_channel0; + float _v3_channel0 = _val_channel0; + float _v3_channel1 = _val_channel0; + float 
_v3_channel2 = _val_channel0; + float _v3_channel3 = _val_channel0; + float _v4_channel0 = _val_channel0; + float _v4_channel1 = _val_channel0; + float _v4_channel2 = _val_channel0; + float _v4_channel3 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = *(data_im_ptr + v1_pos * elempack); + _v1_channel1 = *(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = *(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = *(data_im_ptr + v1_pos * elempack + 3); + } + if (v2_cond) + { + _v2_channel0 = *(data_im_ptr + v2_pos * elempack); + _v2_channel1 = *(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = *(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = *(data_im_ptr + v2_pos * elempack + 3); + } + if (v3_cond) + { + _v3_channel0 = *(data_im_ptr + v3_pos * elempack); + _v3_channel1 = *(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = *(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = *(data_im_ptr + v3_pos * elempack + 3); + } + if (v4_cond) + { + _v4_channel0 = *(data_im_ptr + v4_pos * elempack); + _v4_channel1 = *(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = *(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = *(data_im_ptr + v4_pos * elempack + 3); + } + _val_channel0 = w1 * _v1_channel0 + w2 * _v2_channel0 + w3 * _v3_channel0 + w4 * _v4_channel0; + _val_channel1 = w1 * _v1_channel1 + w2 * _v2_channel1 + w3 * _v3_channel1 + w4 * _v4_channel1; + _val_channel2 = w1 * _v1_channel2 + w2 * _v2_channel2 + w3 * _v3_channel2 + w4 * _v4_channel2; + _val_channel3 = w1 * _v1_channel3 + w2 * _v2_channel3 + w3 * _v3_channel3 + w4 * _v4_channel3; + } + if (has_mask) + { + _val_channel0 *= mask_; + _val_channel1 *= mask_; + _val_channel2 *= mask_; + _val_channel3 *= mask_; + } + float _conv_w0 = *(kptr); + float _conv_w1 = *(kptr + out_elempack); // 1 * out_elempack + float _conv_w2 = *(kptr + 2); // 2 * out_elempack + float _conv_w3 = *(kptr + 3); // 3 * out_elempack + _sum += (_val_channel0 * _conv_w0 + _val_channel1 * _conv_w1 + _val_channel2 * _conv_w2 + _val_channel3 * _conv_w3); + kptr += wstep; + } + } + } + _sum = activation_ss(_sum, activation_type, activation_params); + *(outptr + h_col * outw + w_col) = _sum; + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack4to16.h b/src/layer/x86/deformableconv2d_pack4to16.h new file mode 100644 index 000000000000..809bb7cb2b50 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack4to16.h @@ -0,0 +1,243 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
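One more pattern shared by all of these kernels is the offset/mask addressing: when the offset or mask blob arrives packed (`elempack != 1`), logical channel `c` is read from packed channel `c / elempack` at lane `c % elempack` of the element at column `w_col`. A small helper sketch of that lookup; `read_logical_channel` is a hypothetical name, and it assumes ncnn's `Mat::channel()` / `row()` accessors exactly as they are used above:

    #include "mat.h" // ncnn::Mat, path as seen from ncnn's src/ tree

    // Read element (y, x) of logical channel c from a blob that may or may not be packed,
    // mirroring the offset_not_pack / mask_not_pack branches in the kernels above.
    static float read_logical_channel(const ncnn::Mat& blob, int c, int y, int x)
    {
        if (blob.elempack == 1)
            return blob.channel(c).row(y)[x];

        // packed case: lane c % elempack of the x-th packed element in row y
        return blob.channel(c / blob.elempack).row(y)[x * blob.elempack + c % blob.elempack];
    }

With such a helper, the two branches above reduce to `offset_h = read_logical_channel(offset, (i * kernel_w + j) * 2, h_col, w_col)`, and similarly for `offset_w` and the modulation mask.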
+ +static void deformableconv2d_pack4to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 4; + const int out_elempack = 16; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m512 _sum = _mm512_setzero_ps(); + if (bias_data_ptr) + _sum = _mm512_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; 
+ if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val_channel0 = _mm512_loadu_ps(zeros_ptr); + __m512 _val_channel1 = _val_channel0; + __m512 _val_channel2 = _val_channel0; + __m512 _val_channel3 = _val_channel0; + if (cond) + { + __m512 _v1_channel0 = _val_channel0; + __m512 _v1_channel1 = _val_channel0; + __m512 _v1_channel2 = _val_channel0; + __m512 _v1_channel3 = _val_channel0; + __m512 _v2_channel0 = _val_channel0; + __m512 _v2_channel1 = _val_channel0; + __m512 _v2_channel2 = _val_channel0; + __m512 _v2_channel3 = _val_channel0; + __m512 _v3_channel0 = _val_channel0; + __m512 _v3_channel1 = _val_channel0; + __m512 _v3_channel2 = _val_channel0; + __m512 _v3_channel3 = _val_channel0; + __m512 _v4_channel0 = _val_channel0; + __m512 _v4_channel1 = _val_channel0; + __m512 _v4_channel2 = _val_channel0; + __m512 _v4_channel3 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack]); + _v1_channel1 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 1]); + _v1_channel2 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 2]); + _v1_channel3 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 3]); + } + if (v2_cond) + { + _v2_channel0 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack]); + _v2_channel1 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 1]); + _v2_channel2 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 2]); + _v2_channel3 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 3]); + } + if (v3_cond) + { + _v3_channel0 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack]); + _v3_channel1 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 1]); + _v3_channel2 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 2]); + _v3_channel3 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 3]); + } + if (v4_cond) + { + _v4_channel0 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack]); + _v4_channel1 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 1]); + _v4_channel2 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 2]); + _v4_channel3 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 3]); + } + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val_channel0 = _mm512_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm512_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v3_channel1, _w3, _val_channel1); + 
_val_channel1 = _mm512_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm512_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm512_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v4_channel3, _w4, _val_channel3); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val_channel0 = _mm512_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm512_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm512_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm512_mul_ps(_val_channel3, _mask); + } + __m512 _conv_w0 = _mm512_load_ps(kptr); + __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm512_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m512 _conv_w2 = _mm512_load_ps(kptr + 32); // 2 * out_elempack + __m512 _conv_w3 = _mm512_load_ps(kptr + 48); // 3 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm512_fmadd_ps(_val_channel3, _conv_w3, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx512(_sum, activation_type, activation_params); + _mm512_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack4to8.h b/src/layer/x86/deformableconv2d_pack4to8.h new file mode 100644 index 000000000000..84099691826b --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack4to8.h @@ -0,0 +1,243 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack4to8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 4; + const int out_elempack = 8; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m256 _sum = _mm256_setzero_ps(); + if (bias_data_ptr) + _sum = _mm256_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const 
float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val_channel0 = _mm256_loadu_ps(zeros_ptr); + __m256 _val_channel1 = _val_channel0; + __m256 _val_channel2 = _val_channel0; + __m256 _val_channel3 = _val_channel0; + if (cond) + { + __m256 _v1_channel0 = _val_channel0; + __m256 _v1_channel1 = _val_channel0; + __m256 _v1_channel2 = _val_channel0; + __m256 _v1_channel3 = _val_channel0; + __m256 _v2_channel0 = _val_channel0; + __m256 _v2_channel1 = _val_channel0; + __m256 _v2_channel2 = _val_channel0; + __m256 _v2_channel3 = _val_channel0; + __m256 _v3_channel0 = _val_channel0; + __m256 _v3_channel1 = _val_channel0; + __m256 _v3_channel2 = _val_channel0; + __m256 _v3_channel3 = _val_channel0; + __m256 _v4_channel0 = _val_channel0; + __m256 _v4_channel1 = _val_channel0; + __m256 _v4_channel2 = _val_channel0; + __m256 _v4_channel3 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 3); + } + if (v2_cond) + { + _v2_channel0 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 3); + } + if (v3_cond) + { + _v3_channel0 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 3); + } + if (v4_cond) + { + _v4_channel0 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 3); + } + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val_channel0 = _mm256_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm256_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm256_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm256_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); 
+ } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val_channel0 = _mm256_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm256_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm256_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm256_mul_ps(_val_channel3, _mask); + } + __m256 _conv_w0 = _mm256_load_ps(kptr); + __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m256 _conv_w2 = _mm256_load_ps(kptr + 16); // 2 * out_elempack + __m256 _conv_w3 = _mm256_load_ps(kptr + 24); // 3 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx(_sum, activation_type, activation_params); + _mm256_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack8.h b/src/layer/x86/deformableconv2d_pack8.h new file mode 100644 index 000000000000..277817e39482 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack8.h @@ -0,0 +1,307 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 8; + const int out_elempack = 8; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m256 _sum = _mm256_setzero_ps(); + if (bias_data_ptr) + _sum = _mm256_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const 
float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val_channel0 = _mm256_loadu_ps(zeros_ptr); + __m256 _val_channel1 = _val_channel0; + __m256 _val_channel2 = _val_channel0; + __m256 _val_channel3 = _val_channel0; + __m256 _val_channel4 = _val_channel0; + __m256 _val_channel5 = _val_channel0; + __m256 _val_channel6 = _val_channel0; + __m256 _val_channel7 = _val_channel0; + if (cond) + { + __m256 _v1_channel0 = _val_channel0; + __m256 _v1_channel1 = _val_channel0; + __m256 _v1_channel2 = _val_channel0; + __m256 _v1_channel3 = _val_channel0; + __m256 _v1_channel4 = _val_channel0; + __m256 _v1_channel5 = _val_channel0; + __m256 _v1_channel6 = _val_channel0; + __m256 _v1_channel7 = _val_channel0; + __m256 _v2_channel0 = _val_channel0; + __m256 _v2_channel1 = _val_channel0; + __m256 _v2_channel2 = _val_channel0; + __m256 _v2_channel3 = _val_channel0; + __m256 _v2_channel4 = _val_channel0; + __m256 _v2_channel5 = _val_channel0; + __m256 _v2_channel6 = _val_channel0; + __m256 _v2_channel7 = _val_channel0; + __m256 _v3_channel0 = _val_channel0; + __m256 _v3_channel1 = _val_channel0; + __m256 _v3_channel2 = _val_channel0; + __m256 _v3_channel3 = _val_channel0; + __m256 _v3_channel4 = _val_channel0; + __m256 _v3_channel5 = _val_channel0; + __m256 _v3_channel6 = _val_channel0; + __m256 _v3_channel7 = _val_channel0; + __m256 _v4_channel0 = _val_channel0; + __m256 _v4_channel1 = _val_channel0; + __m256 _v4_channel2 = _val_channel0; + __m256 _v4_channel3 = _val_channel0; + __m256 _v4_channel4 = _val_channel0; + __m256 _v4_channel5 = _val_channel0; + __m256 _v4_channel6 = _val_channel0; + __m256 _v4_channel7 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = _mm256_broadcast_ss(data_im_ptr + v1_pos * elempack + 7); + } + if (v2_cond) + { + _v2_channel0 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = _mm256_broadcast_ss(data_im_ptr + v2_pos * elempack + 7); + } + if (v3_cond) + { + _v3_channel0 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = _mm256_broadcast_ss(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = _mm256_broadcast_ss(data_im_ptr + v3_pos 
* elempack + 7); + } + if (v4_cond) + { + _v4_channel0 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = _mm256_broadcast_ss(data_im_ptr + v4_pos * elempack + 7); + } + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val_channel0 = _mm256_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm256_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm256_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm256_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm256_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm256_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm256_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm256_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm256_comp_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm256_comp_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm256_comp_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm256_comp_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm256_comp_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm256_comp_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm256_comp_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm256_comp_fmadd_ps(_v4_channel7, _w4, _val_channel7); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val_channel0 = _mm256_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm256_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm256_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm256_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm256_mul_ps(_val_channel4, _mask); + _val_channel5 = 
_mm256_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm256_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm256_mul_ps(_val_channel7, _mask); + } + __m256 _conv_w0 = _mm256_load_ps(kptr); + __m256 _conv_w1 = _mm256_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m256 _conv_w2 = _mm256_load_ps(kptr + 16); // 2 * out_elempack + __m256 _conv_w3 = _mm256_load_ps(kptr + 24); // 3 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m256 _conv_w4 = _mm256_load_ps(kptr + 32); // 4 * out_elempack + __m256 _conv_w5 = _mm256_load_ps(kptr + 40); // 5 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m256 _conv_w6 = _mm256_load_ps(kptr + 48); // 6 * out_elempack + __m256 _conv_w7 = _mm256_load_ps(kptr + 56); // 7 * out_elempack + _sum = _mm256_comp_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm256_comp_fmadd_ps(_val_channel7, _conv_w7, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx(_sum, activation_type, activation_params); + _mm256_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack8to1.h b/src/layer/x86/deformableconv2d_pack8to1.h new file mode 100644 index 000000000000..c4b97b40f062 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack8to1.h @@ -0,0 +1,259 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack8to1_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 8; + const int out_elempack = 1; + const int wstep = out_elempack * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + float _sum = 0.f; + if (bias_data_ptr) + _sum = *(bias_data_ptr + oc); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + float _val_channel0 = 0.f; + float _val_channel1 = _val_channel0; + float _val_channel2 = _val_channel0; + float _val_channel3 = _val_channel0; + float _val_channel4 = _val_channel0; + float _val_channel5 = _val_channel0; + float _val_channel6 = _val_channel0; + float _val_channel7 = _val_channel0; + if (cond) + { + float _v1_channel0 = _val_channel0; + float _v1_channel1 = _val_channel0; + float _v1_channel2 = _val_channel0; + float _v1_channel3 = _val_channel0; + float _v1_channel4 = _val_channel0; + float _v1_channel5 = _val_channel0; + 
float _v1_channel6 = _val_channel0; + float _v1_channel7 = _val_channel0; + float _v2_channel0 = _val_channel0; + float _v2_channel1 = _val_channel0; + float _v2_channel2 = _val_channel0; + float _v2_channel3 = _val_channel0; + float _v2_channel4 = _val_channel0; + float _v2_channel5 = _val_channel0; + float _v2_channel6 = _val_channel0; + float _v2_channel7 = _val_channel0; + float _v3_channel0 = _val_channel0; + float _v3_channel1 = _val_channel0; + float _v3_channel2 = _val_channel0; + float _v3_channel3 = _val_channel0; + float _v3_channel4 = _val_channel0; + float _v3_channel5 = _val_channel0; + float _v3_channel6 = _val_channel0; + float _v3_channel7 = _val_channel0; + float _v4_channel0 = _val_channel0; + float _v4_channel1 = _val_channel0; + float _v4_channel2 = _val_channel0; + float _v4_channel3 = _val_channel0; + float _v4_channel4 = _val_channel0; + float _v4_channel5 = _val_channel0; + float _v4_channel6 = _val_channel0; + float _v4_channel7 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = *(data_im_ptr + v1_pos * elempack); + _v1_channel1 = *(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = *(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = *(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = *(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = *(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = *(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = *(data_im_ptr + v1_pos * elempack + 7); + } + if (v2_cond) + { + _v2_channel0 = *(data_im_ptr + v2_pos * elempack); + _v2_channel1 = *(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = *(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = *(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = *(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = *(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = *(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = *(data_im_ptr + v2_pos * elempack + 7); + } + if (v3_cond) + { + _v3_channel0 = *(data_im_ptr + v3_pos * elempack); + _v3_channel1 = *(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = *(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = *(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = *(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = *(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = *(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = *(data_im_ptr + v3_pos * elempack + 7); + } + if (v4_cond) + { + _v4_channel0 = *(data_im_ptr + v4_pos * elempack); + _v4_channel1 = *(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = *(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = *(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = *(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = *(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = *(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = *(data_im_ptr + v4_pos * elempack + 7); + } + _val_channel0 = w1 * _v1_channel0 + w2 * _v2_channel0 + w3 * _v3_channel0 + w4 * _v4_channel0; + _val_channel1 = w1 * _v1_channel1 + w2 * _v2_channel1 + w3 * _v3_channel1 + w4 * _v4_channel1; + _val_channel2 = w1 * _v1_channel2 + w2 * _v2_channel2 + w3 * _v3_channel2 + w4 * _v4_channel2; + _val_channel3 = w1 * _v1_channel3 + w2 * _v2_channel3 + w3 * _v3_channel3 + w4 * _v4_channel3; + _val_channel4 = w1 * _v1_channel4 + w2 * _v2_channel4 + w3 * _v3_channel4 + w4 * _v4_channel4; + _val_channel5 = w1 * _v1_channel5 + w2 * _v2_channel5 + w3 * _v3_channel5 + w4 * _v4_channel5; + _val_channel6 = w1 * _v1_channel6 + w2 * _v2_channel6 + w3 * _v3_channel6 + w4 * 
_v4_channel6; + _val_channel7 = w1 * _v1_channel7 + w2 * _v2_channel7 + w3 * _v3_channel7 + w4 * _v4_channel7; + } + if (has_mask) + { + _val_channel0 *= mask_; + _val_channel1 *= mask_; + _val_channel2 *= mask_; + _val_channel3 *= mask_; + _val_channel4 *= mask_; + _val_channel5 *= mask_; + _val_channel6 *= mask_; + _val_channel7 *= mask_; + } + float _conv_w0 = *(kptr); + float _conv_w1 = *(kptr + out_elempack); // 1 * out_elempack + float _conv_w2 = *(kptr + 2); // 2 * out_elempack + float _conv_w3 = *(kptr + 3); // 3 * out_elempack + float _conv_w4 = *(kptr + 4); // 4 * out_elempack + float _conv_w5 = *(kptr + 5); // 5 * out_elempack + float _conv_w6 = *(kptr + 6); // 6 * out_elempack + float _conv_w7 = *(kptr + 7); // 7 * out_elempack + _sum += (_val_channel0 * _conv_w0 + _val_channel1 * _conv_w1 + _val_channel2 * _conv_w2 + _val_channel3 * _conv_w3 + _val_channel4 * _conv_w4 + _val_channel5 * _conv_w5 + _val_channel6 * _conv_w6 + _val_channel7 * _conv_w7); + kptr += wstep; + } + } + } + _sum = activation_ss(_sum, activation_type, activation_params); + *(outptr + h_col * outw + w_col) = _sum; + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack8to16.h b/src/layer/x86/deformableconv2d_pack8to16.h new file mode 100644 index 000000000000..15e5ed076e64 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack8to16.h @@ -0,0 +1,307 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack8to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 8; + const int out_elempack = 16; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m512 _sum = _mm512_setzero_ps(); + if (bias_data_ptr) + _sum = _mm512_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4, w4, w4, 
w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val_channel0 = _mm512_loadu_ps(zeros_ptr); + __m512 _val_channel1 = _val_channel0; + __m512 _val_channel2 = _val_channel0; + __m512 _val_channel3 = _val_channel0; + __m512 _val_channel4 = _val_channel0; + __m512 _val_channel5 = _val_channel0; + __m512 _val_channel6 = _val_channel0; + __m512 _val_channel7 = _val_channel0; + if (cond) + { + __m512 _v1_channel0 = _val_channel0; + __m512 _v1_channel1 = _val_channel0; + __m512 _v1_channel2 = _val_channel0; + __m512 _v1_channel3 = _val_channel0; + __m512 _v1_channel4 = _val_channel0; + __m512 _v1_channel5 = _val_channel0; + __m512 _v1_channel6 = _val_channel0; + __m512 _v1_channel7 = _val_channel0; + __m512 _v2_channel0 = _val_channel0; + __m512 _v2_channel1 = _val_channel0; + __m512 _v2_channel2 = _val_channel0; + __m512 _v2_channel3 = _val_channel0; + __m512 _v2_channel4 = _val_channel0; + __m512 _v2_channel5 = _val_channel0; + __m512 _v2_channel6 = _val_channel0; + __m512 _v2_channel7 = _val_channel0; + __m512 _v3_channel0 = _val_channel0; + __m512 _v3_channel1 = _val_channel0; + __m512 _v3_channel2 = _val_channel0; + __m512 _v3_channel3 = _val_channel0; + __m512 _v3_channel4 = _val_channel0; + __m512 _v3_channel5 = _val_channel0; + __m512 _v3_channel6 = _val_channel0; + __m512 _v3_channel7 = _val_channel0; + __m512 _v4_channel0 = _val_channel0; + __m512 _v4_channel1 = _val_channel0; + __m512 _v4_channel2 = _val_channel0; + __m512 _v4_channel3 = _val_channel0; + __m512 _v4_channel4 = _val_channel0; + __m512 _v4_channel5 = _val_channel0; + __m512 _v4_channel6 = _val_channel0; + __m512 _v4_channel7 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack]); + _v1_channel1 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 1]); + _v1_channel2 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 2]); + _v1_channel3 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 3]); + _v1_channel4 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 4]); + _v1_channel5 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 5]); + _v1_channel6 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 6]); + _v1_channel7 = _mm512_set1_ps(data_im_ptr[v1_pos * elempack + 7]); + } + if (v2_cond) + { + _v2_channel0 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack]); + _v2_channel1 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 1]); + _v2_channel2 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 2]); + _v2_channel3 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 3]); + _v2_channel4 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 4]); + _v2_channel5 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 5]); + _v2_channel6 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 6]); + _v2_channel7 = _mm512_set1_ps(data_im_ptr[v2_pos * elempack + 7]); + } + if (v3_cond) + { + _v3_channel0 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack]); + _v3_channel1 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 1]); + _v3_channel2 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 2]); + _v3_channel3 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 3]); + _v3_channel4 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 4]); + _v3_channel5 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 5]); + _v3_channel6 = 
_mm512_set1_ps(data_im_ptr[v3_pos * elempack + 6]); + _v3_channel7 = _mm512_set1_ps(data_im_ptr[v3_pos * elempack + 7]); + } + if (v4_cond) + { + _v4_channel0 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack]); + _v4_channel1 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 1]); + _v4_channel2 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 2]); + _v4_channel3 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 3]); + _v4_channel4 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 4]); + _v4_channel5 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 5]); + _v4_channel6 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 6]); + _v4_channel7 = _mm512_set1_ps(data_im_ptr[v4_pos * elempack + 7]); + } + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val_channel0 = _mm512_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm512_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm512_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm512_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm512_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm512_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm512_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm512_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm512_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm512_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm512_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm512_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm512_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm512_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm512_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm512_fmadd_ps(_v4_channel7, _w4, _val_channel7); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val_channel0 = _mm512_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm512_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm512_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm512_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm512_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm512_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm512_mul_ps(_val_channel6, _mask); + _val_channel7 = 
_mm512_mul_ps(_val_channel7, _mask); + } + __m512 _conv_w0 = _mm512_load_ps(kptr); + __m512 _conv_w1 = _mm512_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm512_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m512 _conv_w2 = _mm512_load_ps(kptr + 32); // 2 * out_elempack + __m512 _conv_w3 = _mm512_load_ps(kptr + 48); // 3 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm512_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m512 _conv_w4 = _mm512_load_ps(kptr + 64); // 4 * out_elempack + __m512 _conv_w5 = _mm512_load_ps(kptr + 80); // 5 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm512_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m512 _conv_w6 = _mm512_load_ps(kptr + 96); // 6 * out_elempack + __m512 _conv_w7 = _mm512_load_ps(kptr + 112); // 7 * out_elempack + _sum = _mm512_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm512_fmadd_ps(_val_channel7, _conv_w7, _sum); + kptr += wstep; + } + } + } + _sum = activation_avx512(_sum, activation_type, activation_params); + _mm512_store_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_pack8to4.h b/src/layer/x86/deformableconv2d_pack8to4.h new file mode 100644 index 000000000000..85aa06aaa036 --- /dev/null +++ b/src/layer/x86/deformableconv2d_pack8to4.h @@ -0,0 +1,307 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void deformableconv2d_pack8to4_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& weight_data_packed, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, int activation_type, const Mat& activation_params, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + const int size = outw * outh; + const int maxk = kernel_w * kernel_h; + + const float* bias_data_ptr = bias_data; + const int elempack = 8; + const int out_elempack = 4; + const int wstep = out_elempack * elempack; + const float zeros[out_elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < outch; oc++) + { + const float* kptr = weight_data_packed.channel(oc); + float* outptr = top_blob.channel(oc); + __m128 _sum = _mm_setzero_ps(); + if (bias_data_ptr) + _sum = _mm_loadu_ps(bias_data_ptr + oc * out_elempack); + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[out_elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[out_elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[out_elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[out_elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[out_elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = 
bottom_blob.channel(ic); + __m128 _val_channel0 = _mm_loadu_ps(zeros_ptr); + __m128 _val_channel1 = _val_channel0; + __m128 _val_channel2 = _val_channel0; + __m128 _val_channel3 = _val_channel0; + __m128 _val_channel4 = _val_channel0; + __m128 _val_channel5 = _val_channel0; + __m128 _val_channel6 = _val_channel0; + __m128 _val_channel7 = _val_channel0; + if (cond) + { + __m128 _v1_channel0 = _val_channel0; + __m128 _v1_channel1 = _val_channel0; + __m128 _v1_channel2 = _val_channel0; + __m128 _v1_channel3 = _val_channel0; + __m128 _v1_channel4 = _val_channel0; + __m128 _v1_channel5 = _val_channel0; + __m128 _v1_channel6 = _val_channel0; + __m128 _v1_channel7 = _val_channel0; + __m128 _v2_channel0 = _val_channel0; + __m128 _v2_channel1 = _val_channel0; + __m128 _v2_channel2 = _val_channel0; + __m128 _v2_channel3 = _val_channel0; + __m128 _v2_channel4 = _val_channel0; + __m128 _v2_channel5 = _val_channel0; + __m128 _v2_channel6 = _val_channel0; + __m128 _v2_channel7 = _val_channel0; + __m128 _v3_channel0 = _val_channel0; + __m128 _v3_channel1 = _val_channel0; + __m128 _v3_channel2 = _val_channel0; + __m128 _v3_channel3 = _val_channel0; + __m128 _v3_channel4 = _val_channel0; + __m128 _v3_channel5 = _val_channel0; + __m128 _v3_channel6 = _val_channel0; + __m128 _v3_channel7 = _val_channel0; + __m128 _v4_channel0 = _val_channel0; + __m128 _v4_channel1 = _val_channel0; + __m128 _v4_channel2 = _val_channel0; + __m128 _v4_channel3 = _val_channel0; + __m128 _v4_channel4 = _val_channel0; + __m128 _v4_channel5 = _val_channel0; + __m128 _v4_channel6 = _val_channel0; + __m128 _v4_channel7 = _val_channel0; + if (v1_cond) + { + _v1_channel0 = _mm_load1_ps(data_im_ptr + v1_pos * elempack); + _v1_channel1 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 1); + _v1_channel2 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 2); + _v1_channel3 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 3); + _v1_channel4 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 4); + _v1_channel5 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 5); + _v1_channel6 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 6); + _v1_channel7 = _mm_load1_ps(data_im_ptr + v1_pos * elempack + 7); + } + if (v2_cond) + { + _v2_channel0 = _mm_load1_ps(data_im_ptr + v2_pos * elempack); + _v2_channel1 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 1); + _v2_channel2 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 2); + _v2_channel3 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 3); + _v2_channel4 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 4); + _v2_channel5 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 5); + _v2_channel6 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 6); + _v2_channel7 = _mm_load1_ps(data_im_ptr + v2_pos * elempack + 7); + } + if (v3_cond) + { + _v3_channel0 = _mm_load1_ps(data_im_ptr + v3_pos * elempack); + _v3_channel1 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 1); + _v3_channel2 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 2); + _v3_channel3 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 3); + _v3_channel4 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 4); + _v3_channel5 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 5); + _v3_channel6 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 6); + _v3_channel7 = _mm_load1_ps(data_im_ptr + v3_pos * elempack + 7); + } + if (v4_cond) + { + _v4_channel0 = _mm_load1_ps(data_im_ptr + v4_pos * elempack); + _v4_channel1 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 1); + _v4_channel2 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 2); + _v4_channel3 = 
_mm_load1_ps(data_im_ptr + v4_pos * elempack + 3); + _v4_channel4 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 4); + _v4_channel5 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 5); + _v4_channel6 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 6); + _v4_channel7 = _mm_load1_ps(data_im_ptr + v4_pos * elempack + 7); + } + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val_channel0 = _mm_comp_fmadd_ps(_v1_channel0, _w1, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v2_channel0, _w2, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v3_channel0, _w3, _val_channel0); + _val_channel0 = _mm_comp_fmadd_ps(_v4_channel0, _w4, _val_channel0); + _val_channel1 = _mm_comp_fmadd_ps(_v1_channel1, _w1, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v2_channel1, _w2, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v3_channel1, _w3, _val_channel1); + _val_channel1 = _mm_comp_fmadd_ps(_v4_channel1, _w4, _val_channel1); + _val_channel2 = _mm_comp_fmadd_ps(_v1_channel2, _w1, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v2_channel2, _w2, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v3_channel2, _w3, _val_channel2); + _val_channel2 = _mm_comp_fmadd_ps(_v4_channel2, _w4, _val_channel2); + _val_channel3 = _mm_comp_fmadd_ps(_v1_channel3, _w1, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v2_channel3, _w2, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v3_channel3, _w3, _val_channel3); + _val_channel3 = _mm_comp_fmadd_ps(_v4_channel3, _w4, _val_channel3); + _val_channel4 = _mm_comp_fmadd_ps(_v1_channel4, _w1, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v2_channel4, _w2, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v3_channel4, _w3, _val_channel4); + _val_channel4 = _mm_comp_fmadd_ps(_v4_channel4, _w4, _val_channel4); + _val_channel5 = _mm_comp_fmadd_ps(_v1_channel5, _w1, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v2_channel5, _w2, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v3_channel5, _w3, _val_channel5); + _val_channel5 = _mm_comp_fmadd_ps(_v4_channel5, _w4, _val_channel5); + _val_channel6 = _mm_comp_fmadd_ps(_v1_channel6, _w1, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v2_channel6, _w2, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v3_channel6, _w3, _val_channel6); + _val_channel6 = _mm_comp_fmadd_ps(_v4_channel6, _w4, _val_channel6); + _val_channel7 = _mm_comp_fmadd_ps(_v1_channel7, _w1, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v2_channel7, _w2, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v3_channel7, _w3, _val_channel7); + _val_channel7 = _mm_comp_fmadd_ps(_v4_channel7, _w4, _val_channel7); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val_channel0 = _mm_mul_ps(_val_channel0, _mask); + _val_channel1 = _mm_mul_ps(_val_channel1, _mask); + _val_channel2 = _mm_mul_ps(_val_channel2, _mask); + _val_channel3 = _mm_mul_ps(_val_channel3, _mask); + _val_channel4 = _mm_mul_ps(_val_channel4, _mask); + _val_channel5 = _mm_mul_ps(_val_channel5, _mask); + _val_channel6 = _mm_mul_ps(_val_channel6, _mask); + _val_channel7 = _mm_mul_ps(_val_channel7, _mask); + } + __m128 _conv_w0 = _mm_load_ps(kptr); + __m128 _conv_w1 = _mm_load_ps(kptr + out_elempack); // 1 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel0, _conv_w0, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel1, _conv_w1, _sum); + __m128 _conv_w2 = _mm_load_ps(kptr + 8); // 2 * out_elempack + __m128 _conv_w3 = 
_mm_load_ps(kptr + 12); // 3 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel2, _conv_w2, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel3, _conv_w3, _sum); + __m128 _conv_w4 = _mm_load_ps(kptr + 16); // 4 * out_elempack + __m128 _conv_w5 = _mm_load_ps(kptr + 20); // 5 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel4, _conv_w4, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel5, _conv_w5, _sum); + __m128 _conv_w6 = _mm_load_ps(kptr + 24); // 6 * out_elempack + __m128 _conv_w7 = _mm_load_ps(kptr + 28); // 7 * out_elempack + _sum = _mm_comp_fmadd_ps(_val_channel6, _conv_w6, _sum); + _sum = _mm_comp_fmadd_ps(_val_channel7, _conv_w7, _sum); + kptr += wstep; + } + } + } + _sum = activation_sse(_sum, activation_type, activation_params); + _mm_storeu_ps(outptr + (h_col * outw + w_col) * out_elempack, _sum); + } + } + } +} diff --git a/src/layer/x86/deformableconv2d_sgemm.h b/src/layer/x86/deformableconv2d_sgemm.h new file mode 100644 index 000000000000..648af448b128 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm.h @@ -0,0 +1,136 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm.h" + +static void deformableconv2d_im2col_sgemm_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + for (int ic = 0; ic < inch; ic++) + { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + bottom_im2col.channel(ic).row(i * kernel_w + j)[h_col * outw + w_col] = val * mask_; + } + } + } + } + } + } + + im2col_sgemm_sse(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16.h b/src/layer/x86/deformableconv2d_sgemm_pack16.h new file mode 100644 index 000000000000..37aab40f1e45 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack16.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack16.h" + +static void deformableconv2d_im2col_sgemm_pack16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 16; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high 
<= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1 = _val; + __m512 _v2 = _val; + __m512 _v3 = _val; + __m512 _v4 = _val; + if (v1_cond) + _v1 = _mm512_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm512_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm512_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm512_load_ps(data_im_ptr + v4_pos * elempack); + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val = _mm512_fmadd_ps(_v1, _w1, _val); + _val = _mm512_fmadd_ps(_v2, _w2, _val); + _val = _mm512_fmadd_ps(_v3, _w3, _val); + _val = _mm512_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val = _mm512_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm512_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack16_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16to1.h b/src/layer/x86/deformableconv2d_sgemm_pack16to1.h new file mode 100644 index 000000000000..686333e6ee4c --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack16to1.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
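+
+// Deformable im2col: for each output location and kernel tap, the learned offset
+// shifts the sampling point to (h_im, w_im); the input is read there by bilinear
+// interpolation over the four neighbouring pixels, out-of-bounds taps contribute
+// zero, and the optional third bottom blob is applied as a per-tap modulation mask.
+// The gathered pack16 columns are then fed to the pack16to1 sgemm kernel.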
+ +#include "convolution_sgemm_pack16to1.h" + +static void deformableconv2d_im2col_sgemm_pack16to1_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 16; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, 
w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m512 _val = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1 = _val; + __m512 _v2 = _val; + __m512 _v3 = _val; + __m512 _v4 = _val; + if (v1_cond) + _v1 = _mm512_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm512_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm512_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm512_load_ps(data_im_ptr + v4_pos * elempack); + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val = _mm512_fmadd_ps(_v1, _w1, _val); + _val = _mm512_fmadd_ps(_v2, _w2, _val); + _val = _mm512_fmadd_ps(_v3, _w3, _val); + _val = _mm512_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val = _mm512_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm512_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack16to1_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16to4.h b/src/layer/x86/deformableconv2d_sgemm_pack16to4.h new file mode 100644 index 000000000000..a7438d1f983a --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack16to4.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack16to4.h" + +static void deformableconv2d_im2col_sgemm_pack16to4_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 16; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* 
data_im_ptr = bottom_blob.channel(ic); + __m512 _val = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1 = _val; + __m512 _v2 = _val; + __m512 _v3 = _val; + __m512 _v4 = _val; + if (v1_cond) + _v1 = _mm512_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm512_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm512_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm512_load_ps(data_im_ptr + v4_pos * elempack); + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val = _mm512_fmadd_ps(_v1, _w1, _val); + _val = _mm512_fmadd_ps(_v2, _w2, _val); + _val = _mm512_fmadd_ps(_v3, _w3, _val); + _val = _mm512_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val = _mm512_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm512_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack16to4_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack16to8.h b/src/layer/x86/deformableconv2d_sgemm_pack16to8.h new file mode 100644 index 000000000000..d441d2549404 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack16to8.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack16to8.h" + +static void deformableconv2d_im2col_sgemm_pack16to8_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 16; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* 
data_im_ptr = bottom_blob.channel(ic); + __m512 _val = _mm512_loadu_ps(zeros_ptr); + if (cond) + { + __m512 _v1 = _val; + __m512 _v2 = _val; + __m512 _v3 = _val; + __m512 _v4 = _val; + if (v1_cond) + _v1 = _mm512_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm512_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm512_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm512_load_ps(data_im_ptr + v4_pos * elempack); + __m512 _w1 = _mm512_loadu_ps(w1_ptr); + __m512 _w2 = _mm512_loadu_ps(w2_ptr); + __m512 _w3 = _mm512_loadu_ps(w3_ptr); + __m512 _w4 = _mm512_loadu_ps(w4_ptr); + _val = _mm512_fmadd_ps(_v1, _w1, _val); + _val = _mm512_fmadd_ps(_v2, _w2, _val); + _val = _mm512_fmadd_ps(_v3, _w3, _val); + _val = _mm512_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m512 _mask = _mm512_loadu_ps(mask_ptr); + _val = _mm512_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm512_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack16to8_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack1to16.h b/src/layer/x86/deformableconv2d_sgemm_pack1to16.h new file mode 100644 index 000000000000..d30c11926fdb --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack1to16.h @@ -0,0 +1,136 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack1to16.h" + +static void deformableconv2d_im2col_sgemm_pack1to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + for (int ic = 0; ic < inch; ic++) + { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + bottom_im2col.channel(ic).row(i * kernel_w + j)[h_col * outw + w_col] = val * mask_; + } + } + } + } + } + } + + im2col_sgemm_pack1to16_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack1to4.h b/src/layer/x86/deformableconv2d_sgemm_pack1to4.h new file mode 100644 index 000000000000..0070999c05cd --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack1to4.h @@ -0,0 +1,136 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack1to4.h" + +static void deformableconv2d_im2col_sgemm_pack1to4_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + for (int ic = 0; ic < inch; ic++) 
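+ // note: descriptive comment added for clarity, not part of the original patch. + // For each input channel, bilinear-interpolate the sample at (h_im, w_im) from its + // four in-bounds neighbours using weights w1..w4, apply the modulation mask, and + // store the value into the im2col buffer at row (i * kernel_w + j), column (h_col * outw + w_col).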
+ { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + bottom_im2col.channel(ic).row(i * kernel_w + j)[h_col * outw + w_col] = val * mask_; + } + } + } + } + } + } + + im2col_sgemm_pack1to4_sse(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack1to8.h b/src/layer/x86/deformableconv2d_sgemm_pack1to8.h new file mode 100644 index 000000000000..d02c4245d7c2 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack1to8.h @@ -0,0 +1,136 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack1to8.h" + +static void deformableconv2d_im2col_sgemm_pack1to8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + for (int ic = 0; ic < inch; ic++) + { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + bottom_im2col.channel(ic).row(i * kernel_w + j)[h_col * outw + w_col] = val * mask_; + } + } + } + } + } + } + + im2col_sgemm_pack1to8_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4.h b/src/layer/x86/deformableconv2d_sgemm_pack4.h new file mode 100644 index 000000000000..140fa78e5228 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack4.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack4.h" + +static void deformableconv2d_im2col_sgemm_pack4_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 4; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = 
(h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1 = _val; + __m128 _v2 = _val; + __m128 _v3 = _val; + __m128 _v4 = _val; + if (v1_cond) + _v1 = _mm_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm_load_ps(data_im_ptr + v4_pos * elempack); + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val = _mm_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val = _mm_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack4_sse(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4to1.h b/src/layer/x86/deformableconv2d_sgemm_pack4to1.h new file mode 100644 index 000000000000..d5d7b57cab51 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack4to1.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack4to1.h" + +static void deformableconv2d_im2col_sgemm_pack4to1_sse(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 4; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1 = _val; + __m128 _v2 = _val; + __m128 _v3 = _val; + __m128 _v4 = _val; + if (v1_cond) + _v1 = _mm_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = 
_mm_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm_load_ps(data_im_ptr + v4_pos * elempack); + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val = _mm_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val = _mm_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack4to1_sse(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4to16.h b/src/layer/x86/deformableconv2d_sgemm_pack4to16.h new file mode 100644 index 000000000000..7eef68bb01a8 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack4to16.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack4to16.h" + +static void deformableconv2d_im2col_sgemm_pack4to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 4; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1 = _val; + __m128 _v2 = _val; + __m128 _v3 = _val; + __m128 _v4 = _val; + if (v1_cond) + _v1 = _mm_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = 
_mm_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm_load_ps(data_im_ptr + v4_pos * elempack); + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val = _mm_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val = _mm_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack4to16_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack4to8.h b/src/layer/x86/deformableconv2d_sgemm_pack4to8.h new file mode 100644 index 000000000000..1096d5dc8343 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack4to8.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack4to8.h" + +static void deformableconv2d_im2col_sgemm_pack4to8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 4; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m128 _val = _mm_loadu_ps(zeros_ptr); + if (cond) + { + __m128 _v1 = _val; + __m128 _v2 = _val; + __m128 _v3 = _val; + __m128 _v4 = _val; + if (v1_cond) + _v1 = _mm_load_ps(data_im_ptr + v1_pos * elempack); + if (v2_cond) + _v2 = _mm_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = 
_mm_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm_load_ps(data_im_ptr + v4_pos * elempack); + __m128 _w1 = _mm_loadu_ps(w1_ptr); + __m128 _w2 = _mm_loadu_ps(w2_ptr); + __m128 _w3 = _mm_loadu_ps(w3_ptr); + __m128 _w4 = _mm_loadu_ps(w4_ptr); + _val = _mm_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m128 _mask = _mm_loadu_ps(mask_ptr); + _val = _mm_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack4to8_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8.h b/src/layer/x86/deformableconv2d_sgemm_pack8.h new file mode 100644 index 000000000000..fce556068598 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack8.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack8.h" + +static void deformableconv2d_im2col_sgemm_pack8_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 8; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1 = _val; + __m256 _v2 = _val; + __m256 _v3 = _val; + __m256 _v4 = _val; + if (v1_cond) + _v1 = _mm256_load_ps(data_im_ptr 
+ v1_pos * elempack); + if (v2_cond) + _v2 = _mm256_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm256_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm256_load_ps(data_im_ptr + v4_pos * elempack); + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val = _mm256_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm256_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm256_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm256_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val = _mm256_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm256_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack8_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8to1.h b/src/layer/x86/deformableconv2d_sgemm_pack8to1.h new file mode 100644 index 000000000000..635c08625ab1 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack8to1.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack8to1.h" + +static void deformableconv2d_im2col_sgemm_pack8to1_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 8; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1 = _val; + __m256 _v2 = _val; + __m256 _v3 = _val; + __m256 _v4 = _val; + if (v1_cond) + _v1 = _mm256_load_ps(data_im_ptr 
+ v1_pos * elempack); + if (v2_cond) + _v2 = _mm256_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm256_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm256_load_ps(data_im_ptr + v4_pos * elempack); + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val = _mm256_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm256_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm256_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm256_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val = _mm256_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm256_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack8to1_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8to16.h b/src/layer/x86/deformableconv2d_sgemm_pack8to16.h new file mode 100644 index 000000000000..161e983f1a01 --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack8to16.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack8to16.h" + +static void deformableconv2d_im2col_sgemm_pack8to16_avx512(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 8; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1 = _val; + __m256 _v2 = _val; + __m256 _v3 = _val; + __m256 _v4 = _val; + if (v1_cond) + _v1 = _mm256_load_ps(data_im_ptr 
+ v1_pos * elempack); + if (v2_cond) + _v2 = _mm256_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm256_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm256_load_ps(data_im_ptr + v4_pos * elempack); + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val = _mm256_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm256_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm256_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm256_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val = _mm256_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm256_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack8to16_avx512(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_sgemm_pack8to4.h b/src/layer/x86/deformableconv2d_sgemm_pack8to4.h new file mode 100644 index 000000000000..45c853d2262c --- /dev/null +++ b/src/layer/x86/deformableconv2d_sgemm_pack8to4.h @@ -0,0 +1,180 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution_sgemm_pack8to4.h" + +static void deformableconv2d_im2col_sgemm_pack8to4_avx(const std::vector& bottom_blobs, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_left, int pad_top, const Option& opt) +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& offset = bottom_blobs[1]; + const bool has_mask = (bottom_blobs.size() == 3); + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? 
bottom_blobs[2].elempack == 1 : true; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + const int size = outw * outh; + + const int maxk = kernel_w * kernel_h; + const int elempack = 8; + const float zeros[elempack] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + const float* zeros_ptr = zeros; + + // im2col + Mat bottom_im2col(size, maxk, inch, 4u * elempack, elempack, opt.workspace_allocator); + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < outh; h_col++) + { + for (int w_col = 0; w_col < outw; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + int v1_pos = 0; + int v2_pos = 0; + int v3_pos = 0; + int v4_pos = 0; + if (cond) + { + int h_low = floor(h_im); + int w_low = floor(w_im); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + if (v1_cond) + v1_pos = h_low * w + w_low; + if (v2_cond) + v2_pos = h_low * w + w_high; + if (v3_cond) + v3_pos = h_high * w + w_low; + if (v4_cond) + v4_pos = h_high * w + w_high; + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + const float w1s[elempack] = {w1, w1, w1, w1, w1, w1, w1, w1}; + const float* w1_ptr = w1s; + const float w2s[elempack] = {w2, w2, w2, w2, w2, w2, w2, w2}; + const float* w2_ptr = w2s; + const float w3s[elempack] = {w3, w3, w3, w3, w3, w3, w3, w3}; + const float* w3_ptr = w3s; + const float w4s[elempack] = {w4, w4, w4, w4, w4, w4, w4, w4}; + const float* w4_ptr = w4s; + const float masks[elempack] = {mask_, mask_, mask_, mask_, mask_, mask_, mask_, mask_}; + const float* mask_ptr = masks; + + for (int ic = 0; ic < inch; ic++) + { + const float* data_im_ptr = bottom_blob.channel(ic); + __m256 _val = _mm256_loadu_ps(zeros_ptr); + if (cond) + { + __m256 _v1 = _val; + __m256 _v2 = _val; + __m256 _v3 = _val; + __m256 _v4 = _val; + if (v1_cond) + _v1 = _mm256_load_ps(data_im_ptr 
+ v1_pos * elempack); + if (v2_cond) + _v2 = _mm256_load_ps(data_im_ptr + v2_pos * elempack); + if (v3_cond) + _v3 = _mm256_load_ps(data_im_ptr + v3_pos * elempack); + if (v4_cond) + _v4 = _mm256_load_ps(data_im_ptr + v4_pos * elempack); + __m256 _w1 = _mm256_loadu_ps(w1_ptr); + __m256 _w2 = _mm256_loadu_ps(w2_ptr); + __m256 _w3 = _mm256_loadu_ps(w3_ptr); + __m256 _w4 = _mm256_loadu_ps(w4_ptr); + _val = _mm256_comp_fmadd_ps(_v1, _w1, _val); + _val = _mm256_comp_fmadd_ps(_v2, _w2, _val); + _val = _mm256_comp_fmadd_ps(_v3, _w3, _val); + _val = _mm256_comp_fmadd_ps(_v4, _w4, _val); + } + if (has_mask) + { + __m256 _mask = _mm256_loadu_ps(mask_ptr); + _val = _mm256_mul_ps(_val, _mask); + } + float* ptr = bottom_im2col.channel(ic); + _mm256_store_ps(ptr + ((i * kernel_w + j) * size + h_col * outw + w_col) * elempack, _val); + } + } + } + } + } + } + + im2col_sgemm_pack8to4_avx(bottom_im2col, top_blob, kernel, _bias, opt); +} diff --git a/src/layer/x86/deformableconv2d_x86.cpp b/src/layer/x86/deformableconv2d_x86.cpp index 869815283d9e..caff2e17d06f 100644 --- a/src/layer/x86/deformableconv2d_x86.cpp +++ b/src/layer/x86/deformableconv2d_x86.cpp @@ -14,63 +14,352 @@ #include "deformableconv2d_x86.h" +#if __SSE2__ +#include +#if __SSE4_1__ +#include +#if __AVX__ +#include +#endif +#endif // __SSE4_1__ +#endif // __SSE2__ +#include "x86_activation.h" +#include "x86_usability.h" + +#include "benchmark.h" +#include "cpu.h" #include "layer_type.h" namespace ncnn { +#include "deformableconv2d_sgemm.h" + +#if __SSE2__ +#include "deformableconv2d_pack4.h" +#include "deformableconv2d_pack1to4.h" +#include "deformableconv2d_pack4to1.h" + +#include "deformableconv2d_sgemm_pack4.h" +#include "deformableconv2d_sgemm_pack1to4.h" +#include "deformableconv2d_sgemm_pack4to1.h" + +#if __AVX__ +#include "deformableconv2d_pack8.h" +#include "deformableconv2d_pack4to8.h" +#include "deformableconv2d_pack1to8.h" +#include "deformableconv2d_pack8to4.h" +#include "deformableconv2d_pack8to1.h" + +#include "deformableconv2d_sgemm_pack8.h" +#include "deformableconv2d_sgemm_pack4to8.h" +#include "deformableconv2d_sgemm_pack1to8.h" +#include "deformableconv2d_sgemm_pack8to4.h" +#include "deformableconv2d_sgemm_pack8to1.h" + +#if __AVX512F__ +#include "deformableconv2d_pack16.h" +#include "deformableconv2d_pack8to16.h" +#include "deformableconv2d_pack4to16.h" +#include "deformableconv2d_pack1to16.h" +#include "deformableconv2d_pack16to8.h" +#include "deformableconv2d_pack16to4.h" +#include "deformableconv2d_pack16to1.h" + +#include "deformableconv2d_sgemm_pack16.h" +#include "deformableconv2d_sgemm_pack8to16.h" +#include "deformableconv2d_sgemm_pack4to16.h" +#include "deformableconv2d_sgemm_pack1to16.h" +#include "deformableconv2d_sgemm_pack16to8.h" +#include "deformableconv2d_sgemm_pack16to4.h" +#include "deformableconv2d_sgemm_pack16to1.h" +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ + DeformableConv2D_x86::DeformableConv2D_x86() { - one_blob_only = false; - support_inplace = false; +#if __SSE2__ + support_packing = true; +#endif // __SSE2__ - inner_product = 0; - permute = 0; + activation = 0; } -int DeformableConv2D_x86::create_pipeline(const Option& opt) +static int _4Dindex_to_1Dindex(int i0, int i1, int i2, int i3, int l1, int l2, int l3) +{ + return ((i0 * l1 + i1) * l2 + i2) * l3 + i3; +} + +static int _6Dindex_to_1Dindex(int i0, int i1, int i2, int i3, int i4, int i5, int l1, int l2, int l3, int l4, int l5) +{ + return ((((i0 * l1 + i1) * l2 + i2) * l3 + i3) * l4 + i4) * l5 + i5; +} + +static void 
deformableconv2d_transform_kernel_packed_sse(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, int kernel_w, int kernel_h, int elempack, int out_elempack) { - const int in_c = weight_data_size / (num_output * kernel_h * kernel_w); + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pb-pa-inch/pa-kw-kh-outch/pb { - Mat weight_3d = weight_data.reshape(kernel_w * kernel_h, in_c, num_output); - weight_data_t.create(in_c, kernel_w * kernel_h, num_output); - if (weight_data_t.empty()) - return -100; - for (int q = 0; q < num_output; q++) - { - const Mat m = weight_3d.channel(q); - float* outptr = weight_data_t.channel(q); + const float* weight_ptr = weight_data; - for (int i = 0; i < kernel_w * kernel_h; i++) + weight_data_tm.create(num_input * maxk * num_output / (elempack * out_elempack), (size_t)4u * elempack * out_elempack, elempack * out_elempack); + float* ptr = weight_data_tm; + for (int oc = 0; oc < num_output; oc++) + { + for (int i = 0; i < kernel_h; i++) { - for (int j = 0; j < in_c; j++) + for (int j = 0; j < kernel_w; j++) { - *outptr++ = m.row(j)[i]; + for (int ic = 0; ic < num_input; ic++) + { + ptr[_6Dindex_to_1Dindex(oc / out_elempack, i, j, ic / elempack, ic % elempack, oc % out_elempack, kernel_h, kernel_w, num_input / elempack, elempack, out_elempack)] = weight_ptr[_4Dindex_to_1Dindex(oc, ic, i, j, num_input, kernel_h, kernel_w)]; + } } } } - weight_3d.release(); - weight_data_t = weight_data_t.reshape(in_c * kernel_w * kernel_h, num_output); + weight_data_tm = weight_data_tm.reshape(num_input / elempack, maxk, num_output / out_elempack); + } +} + +int DeformableConv2D_x86::create_pipeline(const Option& opt) +{ + activation = create_activation_layer(activation_type, activation_params, opt); + + int kernel_size = kernel_w * kernel_h; + int num_input = weight_data_size / kernel_size / num_output; + + int elempack = 1; + int out_elempack = 1; + +#if __SSE2__ + if (opt.use_packing_layout) + { +#if __AVX512F__ + elempack = num_input % 16 == 0 ? 16 : num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#elif __AVX__ + elempack = num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#else + elempack = num_input % 4 == 0 ? 4 : 1; + out_elempack = num_output % 4 == 0 ? 
4 : 1; +#endif + } +#endif // __SSE2__ + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack16_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 8 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to16_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 16 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack16to8_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 4 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4to16_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 16 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack16to4_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 1 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack1to16_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + if (elempack == 16 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack16to1_avx512(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + +#endif // __AVX512F__ + + // pack8 + if (elempack == 8 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack4to8 + if (elempack == 4 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4to8_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, 
num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack1to8 + if (elempack == 1 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack1to8_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack8to4 + if (elempack == 8 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to4_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack8to1 + if (elempack == 8 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack8to1_avx(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif // __AVX__ + + // pack4 + if (elempack == 4 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack1to4 + if (elempack == 1 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack1to4_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } + + // pack4to1 + if (elempack == 4 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_pack4to1_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + deformableconv2d_transform_kernel_packed_sse(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h, elempack, out_elempack); + } + } +#endif // __SSE2__ - inner_product = ncnn::create_layer(ncnn::LayerType::InnerProduct); - ncnn::ParamDict pd; - pd.set(0, num_output); - pd.set(1, bias_term); - pd.set(2, weight_data_size); - pd.set(9, activation_type); - pd.set(10, activation_params); - inner_product->load_param(pd); - ncnn::Mat weights[2]; - weights[0] = weight_data_t; - if (bias_term) - weights[1] = bias_data; - inner_product->load_model(ncnn::ModelBinFromMatArray(weights)); - inner_product->create_pipeline(opt); + // pack1 + if (elempack == 1 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + convolution_im2col_sgemm_transform_kernel_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h); + } + else + { + weight_data_tm = weight_data; + } + } - permute = ncnn::create_layer(ncnn::LayerType::Permute); - ncnn::ParamDict permute_pd; - permute_pd.set(0, 1); - permute->load_param(permute_pd); - permute->create_pipeline(opt); + if (opt.lightmode) + { + weight_data.release(); } return 0; @@ -78,17 +367,11 @@ int 
DeformableConv2D_x86::create_pipeline(const Option& opt) int DeformableConv2D_x86::destroy_pipeline(const Option& opt) { - if (inner_product) + if (activation) { - inner_product->destroy_pipeline(opt); - delete inner_product; - inner_product = 0; - } - if (permute) - { - permute->destroy_pipeline(opt); - delete permute; - permute = 0; + activation->destroy_pipeline(opt); + delete activation; + activation = 0; } return 0; @@ -98,134 +381,427 @@ int DeformableConv2D_x86::forward(const std::vector& bottom_blobs, std::vec { const Mat& bottom_blob = bottom_blobs[0]; const Mat& offset = bottom_blobs[1]; - const bool has_mask = (bottom_blobs.size() == 3); + Mat& top_blob = top_blobs[0]; - const int w = bottom_blob.w; - const int h = bottom_blob.h; - const int in_c = bottom_blob.c; - const size_t elemsize = bottom_blob.elemsize; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; - const int out_w = (w + pad_left + pad_right - kernel_extent_w) / stride_w + 1; const int out_h = (h + pad_top + pad_bottom - kernel_extent_h) / stride_h + 1; - // output = im2col matmul weight_t, im2col.shape is [out_h * out_w, kernel_h * kernel_w * in_c] (in python), - // weight_t.shape is [num_output, kernel_h * kernel_w * in_c] (in python), - // output.shape is [out_h * out_w, num_output] (in python). - Mat im2col; - im2col.create(kernel_h * kernel_w * in_c * out_h * out_w, elemsize, opt.blob_allocator); - if (im2col.empty()) - return -100; + int out_elempack = 1; +#if __SSE2__ + if (opt.use_packing_layout) + { +#if __AVX512F__ + out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#elif __AVX__ + out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#else + out_elempack = num_output % 4 == 0 ? 
4 : 1; +#endif + } +#endif // __SSE2__ + size_t out_elemsize = elemsize / elempack * out_elempack; - Mat& output = top_blobs[0]; - output.create(num_output, out_h * out_w, elemsize, opt.blob_allocator); - if (output.empty()) + top_blob.create(out_w, out_h, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) return -100; - Mat bottom_blob_flatten = bottom_blob.reshape(w * h * in_c); - Mat offset_flatten = offset.reshape(offset.w * offset.h * offset.c); - const float* data_im_ptr = bottom_blob_flatten; - const float* data_offset_ptr = offset_flatten; - float* im2col_ptr = im2col; + const int num_input = channels * elempack; - // im2col - #pragma omp parallel for num_threads(opt.num_threads) - for (int h_col = 0; h_col < out_h; h_col++) +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16 && out_elempack == 16) { - for (int w_col = 0; w_col < out_w; w_col++) + if (opt.use_sgemm_convolution) { - int h_in = h_col * stride_h - pad_top; - int w_in = w_col * stride_w - pad_left; - float* data_col_ptr = im2col_ptr + (h_col * out_w + w_col) * kernel_h * kernel_w * in_c; - for (int i = 0; i < kernel_h; i++) + deformableconv2d_im2col_sgemm_pack16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) { - for (int j = 0; j < kernel_w; j++) - { - const int data_offset_h_ptr = (((i * kernel_w + j) * 2) * out_h + h_col) * out_w + w_col; - const int data_offset_w_ptr = (((i * kernel_w + j) * 2 + 1) * out_h + h_col) * out_w + w_col; - - const float offset_h = data_offset_ptr[data_offset_h_ptr]; - const float offset_w = data_offset_ptr[data_offset_w_ptr]; - const float mask_ = has_mask ? bottom_blobs[2].channel(i * kernel_w + j).row(h_col)[w_col] : 1.f; - const float h_im = h_in + i * dilation_h + offset_h; - const float w_im = w_in + j * dilation_w + offset_w; - - // Bilinear - const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; - float w1 = 0.f; - float w2 = 0.f; - float w3 = 0.f; - float w4 = 0.f; - bool v1_cond = false; - bool v2_cond = false; - bool v3_cond = false; - bool v4_cond = false; - int v1_pos = 0; - int v2_pos = 0; - int v3_pos = 0; - int v4_pos = 0; - if (cond) - { - int h_low = floor(h_im); - int w_low = floor(w_im); - int h_high = h_low + 1; - int w_high = w_low + 1; - - float lh = h_im - h_low; - float lw = w_im - w_low; - float hh = 1 - lh; - float hw = 1 - lw; - - v1_cond = (h_low >= 0 && w_low >= 0); - v2_cond = (h_low >= 0 && w_high <= w - 1); - v3_cond = (h_high <= h - 1 && w_low >= 0); - v4_cond = (h_high <= h - 1 && w_high <= w - 1); - if (v1_cond) - v1_pos = h_low * w + w_low; - if (v2_cond) - v2_pos = h_low * w + w_high; - if (v3_cond) - v3_pos = h_high * w + w_low; - if (v4_cond) - v4_pos = h_high * w + w_high; - - w1 = hh * hw; - w2 = hh * lw; - w3 = lh * hw; - w4 = lh * lw; - } + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack16_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 8 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack8to16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, 
opt); + } + } + else + { + deformableconv2d_pack8to16_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 16 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack16to8_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack16to8_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 4 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack4to16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack4to16_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 16 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack16to4_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack16to4_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 16) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack1to16_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack1to16_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 16 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack16to1_avx512(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack16to1_avx512(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + +#endif // __AVX512F__ + + if (elempack == 8 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack8_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + 
else + { + deformableconv2d_pack8_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack1to8_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack1to8_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 4 && out_elempack == 8) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack4to8_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack4to8_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 8 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack8to1_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack8to1_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 8 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack8to4_avx(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack8to4_avx(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } +#endif // __AVX__ + + if (elempack == 4 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack4_sse(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack4_sse(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 1 && out_elempack == 4) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack1to4_sse(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack1to4_sse(bottom_blobs, top_blob, 
weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } + + if (elempack == 4 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_pack4to1_sse(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + deformableconv2d_pack4to1_sse(bottom_blobs, top_blob, weight_data_tm, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, activation_type, activation_params, opt); + } + } +#endif // __SSE2__ + + if (elempack == 1 && out_elempack == 1) + { + if (opt.use_sgemm_convolution) + { + deformableconv2d_im2col_sgemm_sse(bottom_blobs, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_left, pad_top, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else + { + const bool offset_not_pack = offset.elempack == 1; + const bool mask_not_pack = has_mask ? bottom_blobs[2].elempack == 1 : true; + const float* weight_ptr = weight_data_tm; - const float* data_im_channel_ptr = data_im_ptr; - for (int c_im = 0; c_im < in_c; c_im++) + // naive deformable conv + #pragma omp parallel for num_threads(opt.num_threads) + for (int h_col = 0; h_col < out_h; h_col++) + { + for (int w_col = 0; w_col < out_w; w_col++) + { + int h_in = h_col * stride_h - pad_top; + int w_in = w_col * stride_w - pad_left; + for (int oc = 0; oc < num_output; oc++) { - float val = 0.f; - if (cond) + float sum = 0.f; + if (bias_term) + sum = bias_data[oc]; + for (int i = 0; i < kernel_h; i++) { - float v1 = v1_cond ? data_im_channel_ptr[v1_pos] : 0.f; - float v2 = v2_cond ? data_im_channel_ptr[v2_pos] : 0.f; - float v3 = v3_cond ? data_im_channel_ptr[v3_pos] : 0.f; - float v4 = v4_cond ? 
data_im_channel_ptr[v4_pos] : 0.f; - val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + for (int j = 0; j < kernel_w; j++) + { + float offset_h = 0.f; + float offset_w = 0.f; + float mask_ = 1.f; + if (offset_not_pack) + { + offset_h = offset.channel((i * kernel_w + j) * 2).row(h_col)[w_col]; + offset_w = offset.channel((i * kernel_w + j) * 2 + 1).row(h_col)[w_col]; + } + else + { + const int y_c = (i * kernel_w + j) * 2; + const int x_c = (i * kernel_w + j) * 2 + 1; + offset_h = offset.channel(y_c / offset.elempack).row(h_col)[w_col * offset.elempack + y_c % offset.elempack]; + offset_w = offset.channel(x_c / offset.elempack).row(h_col)[w_col * offset.elempack + x_c % offset.elempack]; + } + if (has_mask) + { + const Mat& mask = bottom_blobs[2]; + if (mask_not_pack) + { + mask_ = mask.channel(i * kernel_w + j).row(h_col)[w_col]; + } + else + { + const int m_c = i * kernel_w + j; + mask_ = mask.channel(m_c / mask.elempack).row(h_col)[w_col * mask.elempack + m_c % mask.elempack]; + } + } + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + + // Bilinear + const bool cond = h_im > -1 && w_im > -1 && h_im < h && w_im < w; + int h_low = 0; + int w_low = 0; + int h_high = 0; + int w_high = 0; + float w1 = 0.f; + float w2 = 0.f; + float w3 = 0.f; + float w4 = 0.f; + bool v1_cond = false; + bool v2_cond = false; + bool v3_cond = false; + bool v4_cond = false; + if (cond) + { + h_low = floor(h_im); + w_low = floor(w_im); + h_high = h_low + 1; + w_high = w_low + 1; + + float lh = h_im - h_low; + float lw = w_im - w_low; + float hh = 1 - lh; + float hw = 1 - lw; + + v1_cond = (h_low >= 0 && w_low >= 0); + v2_cond = (h_low >= 0 && w_high <= w - 1); + v3_cond = (h_high <= h - 1 && w_low >= 0); + v4_cond = (h_high <= h - 1 && w_high <= w - 1); + + w1 = hh * hw; + w2 = hh * lw; + w3 = lh * hw; + w4 = lh * lw; + } + + for (int ic = 0; ic < channels; ic++) + { + float val = 0.f; + if (cond) + { + float v1 = v1_cond ? bottom_blob.channel(ic).row(h_low)[w_low] : 0.f; + float v2 = v2_cond ? bottom_blob.channel(ic).row(h_low)[w_high] : 0.f; + float v3 = v3_cond ? bottom_blob.channel(ic).row(h_high)[w_low] : 0.f; + float v4 = v4_cond ? 
bottom_blob.channel(ic).row(h_high)[w_high] : 0.f; + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + } + sum += val * mask_ * weight_ptr[((oc * channels + ic) * kernel_h + i) * kernel_w + j]; + } + } } - *data_col_ptr = val * mask_; - data_col_ptr += 1; - data_im_channel_ptr += h * w; + top_blob.channel(oc).row(h_col)[w_col] = activation_ss(sum, activation_type, activation_params); } } } } } - im2col = im2col.reshape(kernel_h * kernel_w * in_c, out_h * out_w); - // call InnerProduct - inner_product->forward(im2col, output, opt); - ncnn::Mat output_t; - // call Permute - permute->forward(output, output_t, opt); - output_t = output_t.reshape(out_w, out_h, num_output); - top_blobs[0] = output_t; + return 0; } diff --git a/src/layer/x86/deformableconv2d_x86.h b/src/layer/x86/deformableconv2d_x86.h index 0e21c9392af4..a4f02f8fccb0 100644 --- a/src/layer/x86/deformableconv2d_x86.h +++ b/src/layer/x86/deformableconv2d_x86.h @@ -30,10 +30,10 @@ class DeformableConv2D_x86 : virtual public DeformableConv2D virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: - Mat weight_data_t; + Layer* activation; - Layer* inner_product; - Layer* permute; + Mat weight_data_tm; + Mat weight_sgemm_data; }; } // namespace ncnn diff --git a/src/layer/x86/flatten_x86.cpp b/src/layer/x86/flatten_x86.cpp index 19e663197a5e..7c2ae662d6a2 100644 --- a/src/layer/x86/flatten_x86.cpp +++ b/src/layer/x86/flatten_x86.cpp @@ -141,7 +141,7 @@ int Flatten_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m512 _re = _mm512_loadu_ps(ptr + 16 * 14); __m512 _rf = _mm512_loadu_ps(ptr + 16 * 15); - transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); _mm512_storeu_ps(outptr0, _r0); _mm512_storeu_ps(outptr1, _r1); @@ -230,7 +230,7 @@ int Flatten_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m256 _row6 = _mm256_loadu_ps(ptr + 48); __m256 _row7 = _mm256_loadu_ps(ptr + 56); - transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + transpose8x8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); _mm256_storeu_ps(outptr0, _row0); _mm256_storeu_ps(outptr1, _row1); @@ -362,7 +362,7 @@ int Flatten_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m512 _re = _mm512_loadu_ps(ptr + 16 * 14); __m512 _rf = _mm512_loadu_ps(ptr + 16 * 15); - transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); _mm512_storeu_ps(outptr0, _r0); _mm512_storeu_ps(outptr1, _r1); @@ -451,7 +451,7 @@ int Flatten_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m256 _row6 = _mm256_loadu_ps(ptr + 48); __m256 _row7 = _mm256_loadu_ps(ptr + 56); - transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + transpose8x8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); _mm256_storeu_ps(outptr0, _row0); _mm256_storeu_ps(outptr1, _row1); diff --git a/src/layer/x86/gelu_x86.cpp b/src/layer/x86/gelu_x86.cpp new file mode 100644 index 000000000000..352d330b8777 --- /dev/null +++ b/src/layer/x86/gelu_x86.cpp @@ -0,0 +1,154 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "gelu_x86.h" + +#if __SSE2__ +#include +#include "sse_mathfun.h" +#if __AVX__ +#include +#include "avx_mathfun.h" +#if __AVX512F__ +#include "avx512_mathfun.h" +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ + +namespace ncnn { + +GELU_x86::GELU_x86() +{ +#if __SSE2__ + support_packing = true; +#endif // __SSE2__ +} + +int GELU_x86::create_pipeline(const Option& /*opt*/) +{ + if (!fast_gelu) + { + support_packing = false; + } + return 0; +} + +int GELU_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const +{ + if (!fast_gelu) + { + return GELU::forward_inplace(bottom_top_blob, opt); + } + + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int elempack = bottom_top_blob.elempack; + int channels = bottom_top_blob.c; + int size = w * h * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + + int i = 0; + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + __m512 _half512 = _mm512_set1_ps(0.5f); + __m512 _one512 = _mm512_set1_ps(1.f); + __m512 _fast1c512 = _mm512_set1_ps(0.79788452f); + __m512 _fast2c512 = _mm512_set1_ps(0.044715f); + for (; i + 15 < size; i += 16) + { + __m512 _pLoad = _mm512_loadu_ps(ptr); + + __m512 _cube = _mm512_mul_ps(_pLoad, _pLoad); + _cube = _mm512_mul_ps(_pLoad, _cube); + + __m512 _blob = _mm512_mul_ps(_fast2c512, _cube); + _blob = _mm512_add_ps(_pLoad, _blob); + _blob = _mm512_mul_ps(_fast1c512, _blob); + _blob = tanh512_ps(_blob); + _blob = _mm512_add_ps(_one512, _blob); + + _blob = _mm512_mul_ps(_half512, _mm512_mul_ps(_blob, _pLoad)); + + _mm512_storeu_ps(ptr, _blob); + + ptr += 16; + } +#endif // __AVX512F__ + __m256 _half256 = _mm256_set1_ps(0.5f); + __m256 _one256 = _mm256_set1_ps(1.f); + __m256 _fast1c256 = _mm256_set1_ps(0.79788452f); + __m256 _fast2c256 = _mm256_set1_ps(0.044715f); + for (; i + 7 < size; i += 8) + { + __m256 _pLoad = _mm256_loadu_ps(ptr); + + __m256 _cube = _mm256_mul_ps(_pLoad, _pLoad); + _cube = _mm256_mul_ps(_pLoad, _cube); + + __m256 _blob = _mm256_mul_ps(_fast2c256, _cube); + _blob = _mm256_add_ps(_pLoad, _blob); + _blob = _mm256_mul_ps(_fast1c256, _blob); + _blob = tanh256_ps(_blob); + _blob = _mm256_add_ps(_one256, _blob); + + _blob = _mm256_mul_ps(_half256, _mm256_mul_ps(_blob, _pLoad)); + + _mm256_storeu_ps(ptr, _blob); + + ptr += 8; + } +#endif // __AVX__ + __m128 _half128 = _mm_set1_ps(0.5f); + __m128 _one128 = _mm_set1_ps(1.f); + __m128 _fast1c128 = _mm_set1_ps(0.79788452f); + __m128 _fast2c128 = _mm_set1_ps(0.044715f); + for (; i + 3 < size; i += 4) + { + __m128 _pLoad = _mm_loadu_ps(ptr); + + __m128 _cube = _mm_mul_ps(_pLoad, _pLoad); + _cube = _mm_mul_ps(_pLoad, _cube); + + __m128 _blob = _mm_mul_ps(_fast2c128, _cube); + _blob = _mm_add_ps(_pLoad, _blob); + _blob = _mm_mul_ps(_fast1c128, _blob); + _blob = tanh_ps(_blob); + _blob = _mm_add_ps(_one128, _blob); + + _blob = _mm_mul_ps(_half128, _mm_mul_ps(_blob, _pLoad)); + + _mm_storeu_ps(ptr, 
_blob); + + ptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + // y = 0.5x * (1 + tanh(sqrt(2/Pi) * (x + 0.044715x^3))) + *ptr = 0.5f * *ptr * (1.0f + tanhf(0.79788452f * (*ptr + 0.044715f * *ptr * *ptr * *ptr))); + + ptr++; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/x86/gelu_x86.h b/src/layer/x86/gelu_x86.h new file mode 100644 index 000000000000..75d821bfd45d --- /dev/null +++ b/src/layer/x86/gelu_x86.h @@ -0,0 +1,33 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_GELU_X86_H +#define LAYER_GELU_X86_H + +#include "gelu.h" + +namespace ncnn { + +class GELU_x86 : virtual public GELU +{ +public: + GELU_x86(); + + virtual int create_pipeline(const Option& opt); + virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_GELU_X86_H diff --git a/src/layer/x86/innerproduct_fp.h b/src/layer/x86/innerproduct_fp.h new file mode 100644 index 000000000000..6edcd74dcd91 --- /dev/null +++ b/src/layer/x86/innerproduct_fp.h @@ -0,0 +1,1401 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#if NCNN_RUNTIME_CPU && NCNN_F16C && __AVX__ && !__F16C__ +void innerproduct_fp16s_sse_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); +void innerproduct_transform_kernel_fp16s_sse_f16c(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt); +#endif + +#if NCNN_IMPL_FP16S +static void innerproduct_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) +#else +static void innerproduct_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) +#endif +{ +#if NCNN_RUNTIME_CPU && NCNN_IMPL_FP16S && NCNN_F16C && __AVX__ && !__F16C__ + if (ncnn::cpu_support_x86_f16c()) + { + innerproduct_fp16s_sse_f16c(bottom_blob, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); + return; + } +#else // NCNN_RUNTIME_CPU + + const int num_input = bottom_blob.w * bottom_blob.elempack; + const int outw = top_blob.w; + const int out_elempack = top_blob.elempack; + + const float* bias_data_ptr = bias_data; + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (out_elempack == 16) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outw; p++) + { + __m512 _sum0 = _mm512_setzero_ps(); + __m512 _sum1 = _mm512_setzero_ps(); + __m512 _sum2 = _mm512_setzero_ps(); + __m512 _sum3 = _mm512_setzero_ps(); + __m512 _sum4 = _mm512_setzero_ps(); + __m512 _sum5 = _mm512_setzero_ps(); + __m512 _sum6 = _mm512_setzero_ps(); + __m512 _sum7 = _mm512_setzero_ps(); + + if (bias_data_ptr) + { + _sum0 = _mm512_loadu_ps(bias_data_ptr + p * 16); + } + +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif + const float* sptr = bottom_blob; + + int i = 0; + for (; i + 7 < num_input; i += 8) + { + __m512 _val0 = _mm512_set1_ps(sptr[0]); + __m512 _val1 = _mm512_set1_ps(sptr[1]); + __m512 _val2 = _mm512_set1_ps(sptr[2]); + __m512 _val3 = _mm512_set1_ps(sptr[3]); +#if NCNN_IMPL_FP16S + __m512i _w01 = _mm512_loadu_si512(kptr); + __m512i _w23 = _mm512_loadu_si512(kptr + 32); + __m512 _w0 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w01, 0)); + __m512 _w1 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w01, 1)); + __m512 _w2 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w23, 0)); + __m512 _w3 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w23, 1)); +#else + __m512 _w0 = _mm512_loadu_ps(kptr + 16 * 0); + __m512 _w1 = _mm512_loadu_ps(kptr + 16 * 1); + __m512 _w2 = _mm512_loadu_ps(kptr + 16 * 2); + __m512 _w3 = _mm512_loadu_ps(kptr + 16 * 3); +#endif + + _sum0 = _mm512_fmadd_ps(_val0, _w0, _sum0); + _sum1 = _mm512_fmadd_ps(_val1, _w1, _sum1); + _sum2 = _mm512_fmadd_ps(_val2, _w2, _sum2); + _sum3 = _mm512_fmadd_ps(_val3, _w3, _sum3); + + __m512 _val4 = _mm512_set1_ps(sptr[4]); + __m512 _val5 = _mm512_set1_ps(sptr[5]); + __m512 _val6 = _mm512_set1_ps(sptr[6]); + __m512 _val7 = _mm512_set1_ps(sptr[7]); +#if NCNN_IMPL_FP16S + __m512i _w45 = _mm512_loadu_si512(kptr + 64); + __m512i _w67 = _mm512_loadu_si512(kptr + 96); + __m512 _w4 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w45, 0)); + __m512 _w5 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w45, 1)); + __m512 _w6 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w67, 0)); + __m512 _w7 = 
_mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w67, 1)); +#else + __m512 _w4 = _mm512_loadu_ps(kptr + 16 * 4); + __m512 _w5 = _mm512_loadu_ps(kptr + 16 * 5); + __m512 _w6 = _mm512_loadu_ps(kptr + 16 * 6); + __m512 _w7 = _mm512_loadu_ps(kptr + 16 * 7); +#endif + + _sum4 = _mm512_fmadd_ps(_val4, _w4, _sum4); + _sum5 = _mm512_fmadd_ps(_val5, _w5, _sum5); + _sum6 = _mm512_fmadd_ps(_val6, _w6, _sum6); + _sum7 = _mm512_fmadd_ps(_val7, _w7, _sum7); + + sptr += 8; + kptr += 128; + } + for (; i + 3 < num_input; i += 4) + { + __m512 _val0 = _mm512_set1_ps(sptr[0]); + __m512 _val1 = _mm512_set1_ps(sptr[1]); + __m512 _val2 = _mm512_set1_ps(sptr[2]); + __m512 _val3 = _mm512_set1_ps(sptr[3]); +#if NCNN_IMPL_FP16S + __m512i _w01 = _mm512_loadu_si512(kptr); + __m512i _w23 = _mm512_loadu_si512(kptr + 32); + __m512 _w0 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w01, 0)); + __m512 _w1 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w01, 1)); + __m512 _w2 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w23, 0)); + __m512 _w3 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w23, 1)); +#else + __m512 _w0 = _mm512_loadu_ps(kptr); + __m512 _w1 = _mm512_loadu_ps(kptr + 16); + __m512 _w2 = _mm512_loadu_ps(kptr + 32); + __m512 _w3 = _mm512_loadu_ps(kptr + 48); +#endif + + _sum0 = _mm512_fmadd_ps(_val0, _w0, _sum0); + _sum1 = _mm512_fmadd_ps(_val1, _w1, _sum1); + _sum2 = _mm512_fmadd_ps(_val2, _w2, _sum2); + _sum3 = _mm512_fmadd_ps(_val3, _w3, _sum3); + + sptr += 4; + kptr += 64; + } + for (; i < num_input; i++) + { + __m512 _val = _mm512_set1_ps(sptr[0]); +#if NCNN_IMPL_FP16S + __m512 _w = _mm512_cvtph_ps(_mm256_lddqu_si256((const __m256i*)kptr)); +#else + __m512 _w = _mm512_loadu_ps(kptr); +#endif + _sum0 = _mm512_fmadd_ps(_val, _w, _sum0); + + sptr += 1; + kptr += 16; + } + + _sum0 = _mm512_add_ps(_sum0, _sum1); + _sum2 = _mm512_add_ps(_sum2, _sum3); + _sum4 = _mm512_add_ps(_sum4, _sum5); + _sum6 = _mm512_add_ps(_sum6, _sum7); + _sum0 = _mm512_add_ps(_sum0, _sum2); + _sum4 = _mm512_add_ps(_sum4, _sum6); + _sum0 = _mm512_add_ps(_sum0, _sum4); + + _sum0 = activation_avx512(_sum0, activation_type, activation_params); + + float* outptr = top_blob; + _mm512_storeu_ps(outptr + p * 16, _sum0); + } + } +#endif // __AVX512F__ + + if (out_elempack == 8) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outw; p++) + { + __m256 _sum0 = _mm256_setzero_ps(); + __m256 _sum1 = _mm256_setzero_ps(); + __m256 _sum2 = _mm256_setzero_ps(); + __m256 _sum3 = _mm256_setzero_ps(); + __m256 _sum4 = _mm256_setzero_ps(); + __m256 _sum5 = _mm256_setzero_ps(); + __m256 _sum6 = _mm256_setzero_ps(); + __m256 _sum7 = _mm256_setzero_ps(); + + if (bias_data_ptr) + { + _sum0 = _mm256_loadu_ps(bias_data_ptr + p * 8); + } + +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif + const float* sptr = bottom_blob; + + int i = 0; + for (; i + 7 < num_input; i += 8) + { + __m256 _val0 = _mm256_broadcast_ss(sptr); + __m256 _val1 = _mm256_broadcast_ss(sptr + 1); + __m256 _val2 = _mm256_broadcast_ss(sptr + 2); + __m256 _val3 = _mm256_broadcast_ss(sptr + 3); +#if NCNN_IMPL_FP16S + __m256i _w01 = _mm256_lddqu_si256((const __m256i*)kptr); + __m256i _w23 = _mm256_lddqu_si256((const __m256i*)(kptr + 16)); + __m256 _w0 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 0)); + __m256 _w1 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 1)); + __m256 _w2 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 0)); + __m256 _w3 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 
1)); +#else + __m256 _w0 = _mm256_loadu_ps(kptr); + __m256 _w1 = _mm256_loadu_ps(kptr + 8); + __m256 _w2 = _mm256_loadu_ps(kptr + 16); + __m256 _w3 = _mm256_loadu_ps(kptr + 24); +#endif + + _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); + _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_val2, _w2, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); + + __m256 _val4 = _mm256_broadcast_ss(sptr + 4); + __m256 _val5 = _mm256_broadcast_ss(sptr + 5); + __m256 _val6 = _mm256_broadcast_ss(sptr + 6); + __m256 _val7 = _mm256_broadcast_ss(sptr + 7); +#if NCNN_IMPL_FP16S + __m256i _w45 = _mm256_lddqu_si256((const __m256i*)(kptr + 32)); + __m256i _w67 = _mm256_lddqu_si256((const __m256i*)(kptr + 48)); + __m256 _w4 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w45, 0)); + __m256 _w5 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w45, 1)); + __m256 _w6 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w67, 0)); + __m256 _w7 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w67, 1)); +#else + __m256 _w4 = _mm256_loadu_ps(kptr + 32); + __m256 _w5 = _mm256_loadu_ps(kptr + 40); + __m256 _w6 = _mm256_loadu_ps(kptr + 48); + __m256 _w7 = _mm256_loadu_ps(kptr + 56); +#endif + + _sum4 = _mm256_comp_fmadd_ps(_val4, _w4, _sum4); + _sum5 = _mm256_comp_fmadd_ps(_val5, _w5, _sum5); + _sum6 = _mm256_comp_fmadd_ps(_val6, _w6, _sum6); + _sum7 = _mm256_comp_fmadd_ps(_val7, _w7, _sum7); + + sptr += 8; + kptr += 64; + } + for (; i + 3 < num_input; i += 4) + { + __m256 _val0 = _mm256_broadcast_ss(sptr); + __m256 _val1 = _mm256_broadcast_ss(sptr + 1); + __m256 _val2 = _mm256_broadcast_ss(sptr + 2); + __m256 _val3 = _mm256_broadcast_ss(sptr + 3); +#if NCNN_IMPL_FP16S + __m256i _w01 = _mm256_lddqu_si256((const __m256i*)kptr); + __m256i _w23 = _mm256_lddqu_si256((const __m256i*)(kptr + 16)); + __m256 _w0 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 0)); + __m256 _w1 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 1)); + __m256 _w2 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 0)); + __m256 _w3 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 1)); +#else + __m256 _w0 = _mm256_loadu_ps(kptr); + __m256 _w1 = _mm256_loadu_ps(kptr + 8); + __m256 _w2 = _mm256_loadu_ps(kptr + 16); + __m256 _w3 = _mm256_loadu_ps(kptr + 24); +#endif + + _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); + _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_val2, _w2, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); + + sptr += 4; + kptr += 32; + } + for (; i < num_input; i++) + { + __m256 _val = _mm256_set1_ps(sptr[0]); +#if NCNN_IMPL_FP16S + __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)kptr)); +#else + __m256 _w = _mm256_loadu_ps(kptr); +#endif + _sum0 = _mm256_comp_fmadd_ps(_val, _w, _sum0); + + sptr += 1; + kptr += 8; + } + + _sum0 = _mm256_add_ps(_sum0, _sum1); + _sum2 = _mm256_add_ps(_sum2, _sum3); + _sum4 = _mm256_add_ps(_sum4, _sum5); + _sum6 = _mm256_add_ps(_sum6, _sum7); + _sum0 = _mm256_add_ps(_sum0, _sum2); + _sum4 = _mm256_add_ps(_sum4, _sum6); + _sum0 = _mm256_add_ps(_sum0, _sum4); + + _sum0 = activation_avx(_sum0, activation_type, activation_params); + + float* outptr = top_blob; + _mm256_storeu_ps(outptr + p * 8, _sum0); + } + } +#endif // __AVX__ + + if (out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < outw; p++) + { + __m128 _sum0 = _mm_setzero_ps(); +#if __AVX__ + __m256 _sum01 = _mm256_setzero_ps(); + __m256 _sum23 = _mm256_setzero_ps(); + __m256 _sum45 = _mm256_setzero_ps(); + __m256 _sum67 = 
_mm256_setzero_ps(); +#else + __m128 _sum1 = _mm_setzero_ps(); + __m128 _sum2 = _mm_setzero_ps(); + __m128 _sum3 = _mm_setzero_ps(); +#endif + + if (bias_data_ptr) + { + _sum0 = _mm_loadu_ps(bias_data_ptr + p * 4); + } + +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif + const float* sptr = bottom_blob; + + int i = 0; +#if __AVX__ + for (; i + 7 < num_input; i += 8) + { + __m128 _val0 = _mm_broadcast_ss(sptr); + __m128 _val1 = _mm_broadcast_ss(sptr + 1); + __m128 _val2 = _mm_broadcast_ss(sptr + 2); + __m128 _val3 = _mm_broadcast_ss(sptr + 3); + __m128 _val4 = _mm_broadcast_ss(sptr + 4); + __m128 _val5 = _mm_broadcast_ss(sptr + 5); + __m128 _val6 = _mm_broadcast_ss(sptr + 6); + __m128 _val7 = _mm_broadcast_ss(sptr + 7); + + __m256 _val01 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val0), _val1, 1); + __m256 _val23 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val2), _val3, 1); + __m256 _val45 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val4), _val5, 1); + __m256 _val67 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val6), _val7, 1); + +#if NCNN_IMPL_FP16S + __m256i _w0123 = _mm256_lddqu_si256((const __m256i*)kptr); + __m256i _w4567 = _mm256_lddqu_si256((const __m256i*)(kptr + 16)); + __m256 _w01 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w0123, 0)); + __m256 _w23 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w0123, 1)); + __m256 _w45 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w4567, 0)); + __m256 _w67 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w4567, 1)); +#else + __m256 _w01 = _mm256_loadu_ps(kptr); + __m256 _w23 = _mm256_loadu_ps(kptr + 8); + __m256 _w45 = _mm256_loadu_ps(kptr + 16); + __m256 _w67 = _mm256_loadu_ps(kptr + 24); +#endif + + _sum01 = _mm256_comp_fmadd_ps(_val01, _w01, _sum01); + _sum23 = _mm256_comp_fmadd_ps(_val23, _w23, _sum23); + _sum45 = _mm256_comp_fmadd_ps(_val45, _w45, _sum45); + _sum67 = _mm256_comp_fmadd_ps(_val67, _w67, _sum67); + + sptr += 8; + kptr += 32; + } +#endif + for (; i + 3 < num_input; i += 4) + { +#if __AVX__ + __m128 _val0 = _mm_broadcast_ss(sptr); + __m128 _val1 = _mm_broadcast_ss(sptr + 1); + __m128 _val2 = _mm_broadcast_ss(sptr + 2); + __m128 _val3 = _mm_broadcast_ss(sptr + 3); + + __m256 _val01 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val0), _val1, 1); + __m256 _val23 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val2), _val3, 1); + +#if NCNN_IMPL_FP16S + __m256i _w0123 = _mm256_lddqu_si256((const __m256i*)kptr); + __m256 _w01 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w0123, 0)); + __m256 _w23 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w0123, 1)); +#else + __m256 _w01 = _mm256_loadu_ps(kptr); + __m256 _w23 = _mm256_loadu_ps(kptr + 8); +#endif + + _sum01 = _mm256_comp_fmadd_ps(_val01, _w01, _sum01); + _sum23 = _mm256_comp_fmadd_ps(_val23, _w23, _sum23); +#else + __m128 _val0 = _mm_set1_ps(sptr[0]); + __m128 _val1 = _mm_set1_ps(sptr[1]); + __m128 _val2 = _mm_set1_ps(sptr[2]); + __m128 _val3 = _mm_set1_ps(sptr[3]); + + __m128 _w0 = _mm_loadu_ps(kptr); + __m128 _w1 = _mm_loadu_ps(kptr + 4); + __m128 _w2 = _mm_loadu_ps(kptr + 8); + __m128 _w3 = _mm_loadu_ps(kptr + 12); + + _sum0 = _mm_comp_fmadd_ps(_val0, _w0, _sum0); + _sum1 = _mm_comp_fmadd_ps(_val1, _w1, _sum1); + _sum2 = _mm_comp_fmadd_ps(_val2, _w2, _sum2); + _sum3 = _mm_comp_fmadd_ps(_val3, _w3, _sum3); +#endif + + sptr += 4; + kptr += 16; + } + for (; i < num_input; i++) + { + __m128 _val = _mm_set1_ps(sptr[0]); +#if NCNN_IMPL_FP16S + __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const 
__m128i*)kptr)); +#else + __m128 _w = _mm_loadu_ps(kptr); +#endif + _sum0 = _mm_comp_fmadd_ps(_val, _w, _sum0); + + sptr += 1; + kptr += 4; + } + +#if __AVX__ + _sum01 = _mm256_add_ps(_sum01, _sum23); + _sum45 = _mm256_add_ps(_sum45, _sum67); + _sum01 = _mm256_add_ps(_sum01, _sum45); + + _sum0 = _mm_add_ps(_sum0, _mm256_extractf128_ps(_sum01, 0)); + _sum0 = _mm_add_ps(_sum0, _mm256_extractf128_ps(_sum01, 1)); +#else + _sum0 = _mm_add_ps(_sum0, _sum1); + _sum2 = _mm_add_ps(_sum2, _sum3); + _sum0 = _mm_add_ps(_sum0, _sum2); +#endif + + _sum0 = activation_sse(_sum0, activation_type, activation_params); + + float* outptr = top_blob; + _mm_storeu_ps(outptr + p * 4, _sum0); + } + } +#endif // __SSE2__ + + if (out_elempack == 1) + { +#if __SSE2__ +#if __AVX__ + int remain_outw_start = 0; + int nn_outw = outw >> 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outw; pp++) + { + int p = pp * 8; + + float sums[8] = {0.0f}; + if (bias_data_ptr) + { + sums[0] = bias_data_ptr[p]; + sums[1] = bias_data_ptr[p + 1]; + sums[2] = bias_data_ptr[p + 2]; + sums[3] = bias_data_ptr[p + 3]; + sums[4] = bias_data_ptr[p + 4]; + sums[5] = bias_data_ptr[p + 5]; + sums[6] = bias_data_ptr[p + 6]; + sums[7] = bias_data_ptr[p + 7]; + } + +#if NCNN_IMPL_FP16S + const unsigned short* w0 = weight_data_tm.row(p); + const unsigned short* w1 = weight_data_tm.row(p + 1); + const unsigned short* w2 = weight_data_tm.row(p + 2); + const unsigned short* w3 = weight_data_tm.row(p + 3); + const unsigned short* w4 = weight_data_tm.row(p + 4); + const unsigned short* w5 = weight_data_tm.row(p + 5); + const unsigned short* w6 = weight_data_tm.row(p + 6); + const unsigned short* w7 = weight_data_tm.row(p + 7); +#else + const float* w0 = (const float*)weight_data_tm + num_input * p; + const float* w1 = (const float*)weight_data_tm + num_input * (p + 1); + const float* w2 = (const float*)weight_data_tm + num_input * (p + 2); + const float* w3 = (const float*)weight_data_tm + num_input * (p + 3); + const float* w4 = (const float*)weight_data_tm + num_input * (p + 4); + const float* w5 = (const float*)weight_data_tm + num_input * (p + 5); + const float* w6 = (const float*)weight_data_tm + num_input * (p + 6); + const float* w7 = (const float*)weight_data_tm + num_input * (p + 7); +#endif + const float* m = bottom_blob; + + __m256 _sum0 = _mm256_setzero_ps(); + __m256 _sum1 = _mm256_setzero_ps(); + __m256 _sum2 = _mm256_setzero_ps(); + __m256 _sum3 = _mm256_setzero_ps(); + __m256 _sum4 = _mm256_setzero_ps(); + __m256 _sum5 = _mm256_setzero_ps(); + __m256 _sum6 = _mm256_setzero_ps(); + __m256 _sum7 = _mm256_setzero_ps(); + + int i = 0; + for (; i + 7 < num_input; i += 8) + { + __m256 _m = _mm256_loadu_ps(m); + +#if NCNN_IMPL_FP16S + __m256 _w0 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w0)); + __m256 _w1 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w1)); + __m256 _w2 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w2)); + __m256 _w3 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w3)); +#else + __m256 _w0 = _mm256_loadu_ps(w0); + __m256 _w1 = _mm256_loadu_ps(w1); + __m256 _w2 = _mm256_loadu_ps(w2); + __m256 _w3 = _mm256_loadu_ps(w3); +#endif + + _sum0 = _mm256_comp_fmadd_ps(_m, _w0, _sum0); + _sum1 = _mm256_comp_fmadd_ps(_m, _w1, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_m, _w2, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); + +#if NCNN_IMPL_FP16S + __m256 _w4 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w4)); + __m256 _w5 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w5)); 
+ __m256 _w6 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w6)); + __m256 _w7 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w7)); +#else + __m256 _w4 = _mm256_loadu_ps(w4); + __m256 _w5 = _mm256_loadu_ps(w5); + __m256 _w6 = _mm256_loadu_ps(w6); + __m256 _w7 = _mm256_loadu_ps(w7); +#endif + + _sum4 = _mm256_comp_fmadd_ps(_m, _w4, _sum4); + _sum5 = _mm256_comp_fmadd_ps(_m, _w5, _sum5); + _sum6 = _mm256_comp_fmadd_ps(_m, _w6, _sum6); + _sum7 = _mm256_comp_fmadd_ps(_m, _w7, _sum7); + + m += 8; + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + w4 += 8; + w5 += 8; + w6 += 8; + w7 += 8; + } + for (; i < num_input; i++) + { +#if NCNN_IMPL_FP16S + sums[0] += *m * float16_to_float32(*w0); + sums[1] += *m * float16_to_float32(*w1); + sums[2] += *m * float16_to_float32(*w2); + sums[3] += *m * float16_to_float32(*w3); + sums[4] += *m * float16_to_float32(*w4); + sums[5] += *m * float16_to_float32(*w5); + sums[6] += *m * float16_to_float32(*w6); + sums[7] += *m * float16_to_float32(*w7); +#else + sums[0] += *m * *w0; + sums[1] += *m * *w1; + sums[2] += *m * *w2; + sums[3] += *m * *w3; + sums[4] += *m * *w4; + sums[5] += *m * *w5; + sums[6] += *m * *w6; + sums[7] += *m * *w7; +#endif + + m++; + w0++; + w1++; + w2++; + w3++; + w4++; + w5++; + w6++; + w7++; + } + + __m256 _sums = HorizontalSums(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7); + __m256 _sums_f = _mm256_loadu_ps(sums); + _sums = _mm256_add_ps(_sums_f, _sums); + _sums = activation_avx(_sums, activation_type, activation_params); + + float* outptr = top_blob; + _mm256_storeu_ps(outptr + p, _sums); + } + + remain_outw_start += (nn_outw << 3); + nn_outw = (outw - remain_outw_start) >> 2; +#else + int remain_outw_start = 0; + int nn_outw = outw >> 2; +#endif // __AVX__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outw; pp++) + { + int p = remain_outw_start + (pp * 4); + + float sums[4] = {0.0f}; + if (bias_data_ptr) + { + sums[0] = bias_data_ptr[p]; + sums[1] = bias_data_ptr[p + 1]; + sums[2] = bias_data_ptr[p + 2]; + sums[3] = bias_data_ptr[p + 3]; + } + +#if NCNN_IMPL_FP16S + const unsigned short* w0 = weight_data_tm.row(p); + const unsigned short* w1 = weight_data_tm.row(p + 1); + const unsigned short* w2 = weight_data_tm.row(p + 2); + const unsigned short* w3 = weight_data_tm.row(p + 3); +#else + const float* w0 = (const float*)weight_data_tm + num_input * p; + const float* w1 = (const float*)weight_data_tm + num_input * (p + 1); + const float* w2 = (const float*)weight_data_tm + num_input * (p + 2); + const float* w3 = (const float*)weight_data_tm + num_input * (p + 3); +#endif + const float* m = bottom_blob; + + int i = 0; +#if __AVX__ + __m256 _sum0 = _mm256_setzero_ps(); + __m256 _sum1 = _mm256_setzero_ps(); + __m256 _sum2 = _mm256_setzero_ps(); + __m256 _sum3 = _mm256_setzero_ps(); + for (; i + 7 < num_input; i += 8) + { + __m256 _m = _mm256_loadu_ps(m); + +#if NCNN_IMPL_FP16S + __m256 _w0 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w0)); + __m256 _w1 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w1)); + __m256 _w2 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w2)); + __m256 _w3 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w3)); +#else + __m256 _w0 = _mm256_loadu_ps(w0); + __m256 _w1 = _mm256_loadu_ps(w1); + __m256 _w2 = _mm256_loadu_ps(w2); + __m256 _w3 = _mm256_loadu_ps(w3); +#endif + + _sum0 = _mm256_comp_fmadd_ps(_m, _w0, _sum0); + _sum1 = _mm256_comp_fmadd_ps(_m, _w1, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_m, _w2, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_m, _w3, 
_sum3); + + m += 8; + w0 += 8; + w1 += 8; + w2 += 8; + w3 += 8; + } +#endif // __AVX__ + + __m128 _sum0l = _mm_setzero_ps(); + __m128 _sum1l = _mm_setzero_ps(); + __m128 _sum2l = _mm_setzero_ps(); + __m128 _sum3l = _mm_setzero_ps(); + for (; i + 3 < num_input; i += 4) + { + __m128 _m = _mm_loadu_ps(m); + +#if NCNN_IMPL_FP16S + __m128 _w0 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w0)); + __m128 _w1 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w1)); + __m128 _w2 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w2)); + __m128 _w3 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w3)); +#else + __m128 _w0 = _mm_loadu_ps(w0); + __m128 _w1 = _mm_loadu_ps(w1); + __m128 _w2 = _mm_loadu_ps(w2); + __m128 _w3 = _mm_loadu_ps(w3); +#endif + + _sum0l = _mm_comp_fmadd_ps(_m, _w0, _sum0l); + _sum1l = _mm_comp_fmadd_ps(_m, _w1, _sum1l); + _sum2l = _mm_comp_fmadd_ps(_m, _w2, _sum2l); + _sum3l = _mm_comp_fmadd_ps(_m, _w3, _sum3l); + + m += 4; + w0 += 4; + w1 += 4; + w2 += 4; + w3 += 4; + } + for (; i < num_input; i++) + { +#if NCNN_IMPL_FP16S + sums[0] += *m * float16_to_float32(*w0); + sums[1] += *m * float16_to_float32(*w1); + sums[2] += *m * float16_to_float32(*w2); + sums[3] += *m * float16_to_float32(*w3); +#else + sums[0] += *m * *w0; + sums[1] += *m * *w1; + sums[2] += *m * *w2; + sums[3] += *m * *w3; +#endif + + m++; + w0++; + w1++; + w2++; + w3++; + } + + __m128 _sums = _mm_loadu_ps(sums); +#if __AVX__ + _sums = _mm_add_ps(HorizontalSums(_sum0, _sum1, _sum2, _sum3), _sums); +#endif + _MM_TRANSPOSE4_PS(_sum0l, _sum1l, _sum2l, _sum3l); + _sums = _mm_add_ps(_sum0l, _sums); + _sums = _mm_add_ps(_sum1l, _sums); + _sums = _mm_add_ps(_sum2l, _sums); + _sums = _mm_add_ps(_sum3l, _sums); + _sums = activation_sse(_sums, activation_type, activation_params); + + float* outptr = top_blob; + _mm_storeu_ps(outptr + p, _sums); + } + + remain_outw_start += (nn_outw << 2); +#else + int remain_outw_start = 0; +#endif // __SSE2__ + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outw_start; p < outw; p++) + { + float sum = 0.f; + + if (bias_data_ptr) + sum = bias_data_ptr[p]; + +#if NCNN_IMPL_FP16S + const unsigned short* w = weight_data_tm.row(p); +#else + const float* w = (const float*)weight_data_tm + num_input * p; +#endif + const float* m = bottom_blob; + + int i = 0; +#if __SSE2__ +#if __AVX__ + __m256 _sum = _mm256_setzero_ps(); + for (; i + 7 < num_input; i += 8) + { + __m256 _m = _mm256_loadu_ps(m); +#if NCNN_IMPL_FP16S + __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w)); +#else + __m256 _w = _mm256_loadu_ps(w); +#endif + _sum = _mm256_comp_fmadd_ps(_m, _w, _sum); + + m += 8; + w += 8; + } +#endif // __AVX__ + __m128 _suml = _mm_setzero_ps(); + for (; i + 3 < num_input; i += 4) + { + __m128 _m = _mm_loadu_ps(m); +#if NCNN_IMPL_FP16S + __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w)); +#else + __m128 _w = _mm_loadu_ps(w); +#endif + _suml = _mm_comp_fmadd_ps(_m, _w, _suml); + + m += 4; + w += 4; + } +#endif // __SSE2__ + for (; i < num_input; i++) + { +#if NCNN_IMPL_FP16S + sum += *m * float16_to_float32(*w); +#else + sum += *m * *w; +#endif + m++; + w++; + } + +#if __SSE2__ +#if __AVX__ + _suml = _mm_add_ps(_suml, _mm256_extractf128_ps(_sum, 1)); + _suml = _mm_add_ps(_suml, _mm256_castps256_ps128(_sum)); +#endif // __AVX__ + sum += _mm_reduce_add_ps(_suml); +#endif // __SSE2__ + + sum = activation_ss(sum, activation_type, activation_params); + + float* outptr = top_blob; + outptr[p] = sum; + } + } +#endif // NCNN_RUNTIME_CPU +} + +#if NCNN_IMPL_FP16S 
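+// fp16s flavor: the repacked weights below are stored as fp16
+// (float32_to_float16 / _mm512_cvtps_ph), halving weight memory versus fp32.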
+static void innerproduct_transform_kernel_fp16s_sse(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt) +#else +static void innerproduct_transform_kernel_sse(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt) +#endif +{ +#if NCNN_RUNTIME_CPU && NCNN_IMPL_FP16S && NCNN_F16C && __AVX__ && !__F16C__ + if (ncnn::cpu_support_x86_f16c()) + { + innerproduct_transform_kernel_fp16s_sse_f16c(weight_data, weight_data_tm, num_input, num_output, opt); + return; + } +#else // NCNN_RUNTIME_CPU + + int out_elempack = 1; +#if __SSE2__ + if (opt.use_packing_layout) + { +#if __AVX512F__ + out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#elif __AVX__ + out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#else + out_elempack = num_output % 4 == 0 ? 4 : 1; +#endif + } +#endif // __SSE2__ + + // src = inch-outch + // dst = pb-inch-outch/pb +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (out_elempack == 16) + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + +#if NCNN_IMPL_FP16S + weight_data_tm.create(num_input, num_output / 16, (size_t)32u, 16); +#else + weight_data_tm.create(num_input, num_output / 16, (size_t)64u, 16); +#endif + + for (int q = 0; q + 15 < num_output; q += 16) + { +#if NCNN_IMPL_FP16S + unsigned short* g0 = weight_data_tm.row(q / 16); +#else + float* g0 = weight_data_tm.row(q / 16); +#endif + + const float* k0 = weight_data_r2.row(q); + const float* k1 = weight_data_r2.row(q + 1); + const float* k2 = weight_data_r2.row(q + 2); + const float* k3 = weight_data_r2.row(q + 3); + const float* k4 = weight_data_r2.row(q + 4); + const float* k5 = weight_data_r2.row(q + 5); + const float* k6 = weight_data_r2.row(q + 6); + const float* k7 = weight_data_r2.row(q + 7); + const float* k8 = weight_data_r2.row(q + 8); + const float* k9 = weight_data_r2.row(q + 9); + const float* ka = weight_data_r2.row(q + 10); + const float* kb = weight_data_r2.row(q + 11); + const float* kc = weight_data_r2.row(q + 12); + const float* kd = weight_data_r2.row(q + 13); + const float* ke = weight_data_r2.row(q + 14); + const float* kf = weight_data_r2.row(q + 15); + + int p = 0; + for (; p + 15 < num_input; p += 16) + { + // transpose 16x16 +#if NCNN_IMPL_FP16S + __m256i _r0 = _mm512_cvtps_ph(_mm512_loadu_ps(k0), _MM_FROUND_TRUNC); + __m256i _r1 = _mm512_cvtps_ph(_mm512_loadu_ps(k1), _MM_FROUND_TRUNC); + __m256i _r2 = _mm512_cvtps_ph(_mm512_loadu_ps(k2), _MM_FROUND_TRUNC); + __m256i _r3 = _mm512_cvtps_ph(_mm512_loadu_ps(k3), _MM_FROUND_TRUNC); + __m256i _r4 = _mm512_cvtps_ph(_mm512_loadu_ps(k4), _MM_FROUND_TRUNC); + __m256i _r5 = _mm512_cvtps_ph(_mm512_loadu_ps(k5), _MM_FROUND_TRUNC); + __m256i _r6 = _mm512_cvtps_ph(_mm512_loadu_ps(k6), _MM_FROUND_TRUNC); + __m256i _r7 = _mm512_cvtps_ph(_mm512_loadu_ps(k7), _MM_FROUND_TRUNC); + __m256i _r8 = _mm512_cvtps_ph(_mm512_loadu_ps(k8), _MM_FROUND_TRUNC); + __m256i _r9 = _mm512_cvtps_ph(_mm512_loadu_ps(k9), _MM_FROUND_TRUNC); + __m256i _ra = _mm512_cvtps_ph(_mm512_loadu_ps(ka), _MM_FROUND_TRUNC); + __m256i _rb = _mm512_cvtps_ph(_mm512_loadu_ps(kb), _MM_FROUND_TRUNC); + __m256i _rc = _mm512_cvtps_ph(_mm512_loadu_ps(kc), _MM_FROUND_TRUNC); + __m256i _rd = _mm512_cvtps_ph(_mm512_loadu_ps(kd), _MM_FROUND_TRUNC); + __m256i _re = _mm512_cvtps_ph(_mm512_loadu_ps(ke), _MM_FROUND_TRUNC); + __m256i _rf = _mm512_cvtps_ph(_mm512_loadu_ps(kf), _MM_FROUND_TRUNC); + + transpose16x16_epi16(_r0, _r1, _r2, _r3, _r4, _r5, 
_r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + + _mm256_storeu_si256((__m256i*)g0, _r0); + _mm256_storeu_si256((__m256i*)(g0 + 16), _r1); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 2), _r2); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 3), _r3); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 4), _r4); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 5), _r5); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 6), _r6); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 7), _r7); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 8), _r8); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 9), _r9); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 10), _ra); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 11), _rb); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 12), _rc); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 13), _rd); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 14), _re); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 15), _rf); +#else + __m512 _r0 = _mm512_loadu_ps(k0); + __m512 _r1 = _mm512_loadu_ps(k1); + __m512 _r2 = _mm512_loadu_ps(k2); + __m512 _r3 = _mm512_loadu_ps(k3); + __m512 _r4 = _mm512_loadu_ps(k4); + __m512 _r5 = _mm512_loadu_ps(k5); + __m512 _r6 = _mm512_loadu_ps(k6); + __m512 _r7 = _mm512_loadu_ps(k7); + __m512 _r8 = _mm512_loadu_ps(k8); + __m512 _r9 = _mm512_loadu_ps(k9); + __m512 _ra = _mm512_loadu_ps(ka); + __m512 _rb = _mm512_loadu_ps(kb); + __m512 _rc = _mm512_loadu_ps(kc); + __m512 _rd = _mm512_loadu_ps(kd); + __m512 _re = _mm512_loadu_ps(ke); + __m512 _rf = _mm512_loadu_ps(kf); + + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + + _mm512_storeu_ps(g0, _r0); + _mm512_storeu_ps(g0 + 16, _r1); + _mm512_storeu_ps(g0 + 16 * 2, _r2); + _mm512_storeu_ps(g0 + 16 * 3, _r3); + _mm512_storeu_ps(g0 + 16 * 4, _r4); + _mm512_storeu_ps(g0 + 16 * 5, _r5); + _mm512_storeu_ps(g0 + 16 * 6, _r6); + _mm512_storeu_ps(g0 + 16 * 7, _r7); + _mm512_storeu_ps(g0 + 16 * 8, _r8); + _mm512_storeu_ps(g0 + 16 * 9, _r9); + _mm512_storeu_ps(g0 + 16 * 10, _ra); + _mm512_storeu_ps(g0 + 16 * 11, _rb); + _mm512_storeu_ps(g0 + 16 * 12, _rc); + _mm512_storeu_ps(g0 + 16 * 13, _rd); + _mm512_storeu_ps(g0 + 16 * 14, _re); + _mm512_storeu_ps(g0 + 16 * 15, _rf); +#endif + + k0 += 16; + k1 += 16; + k2 += 16; + k3 += 16; + k4 += 16; + k5 += 16; + k6 += 16; + k7 += 16; + k8 += 16; + k9 += 16; + ka += 16; + kb += 16; + kc += 16; + kd += 16; + ke += 16; + kf += 16; + g0 += 256; + } + for (; p + 7 < num_input; p += 8) + { + // transpose 8x16 +#if NCNN_IMPL_FP16S + __m128i _r0 = _mm256_cvtps_ph(_mm256_loadu_ps(k0), _MM_FROUND_TRUNC); + __m128i _r1 = _mm256_cvtps_ph(_mm256_loadu_ps(k1), _MM_FROUND_TRUNC); + __m128i _r2 = _mm256_cvtps_ph(_mm256_loadu_ps(k2), _MM_FROUND_TRUNC); + __m128i _r3 = _mm256_cvtps_ph(_mm256_loadu_ps(k3), _MM_FROUND_TRUNC); + __m128i _r4 = _mm256_cvtps_ph(_mm256_loadu_ps(k4), _MM_FROUND_TRUNC); + __m128i _r5 = _mm256_cvtps_ph(_mm256_loadu_ps(k5), _MM_FROUND_TRUNC); + __m128i _r6 = _mm256_cvtps_ph(_mm256_loadu_ps(k6), _MM_FROUND_TRUNC); + __m128i _r7 = _mm256_cvtps_ph(_mm256_loadu_ps(k7), _MM_FROUND_TRUNC); + __m128i _r8 = _mm256_cvtps_ph(_mm256_loadu_ps(k8), _MM_FROUND_TRUNC); + __m128i _r9 = _mm256_cvtps_ph(_mm256_loadu_ps(k9), _MM_FROUND_TRUNC); + __m128i _ra = _mm256_cvtps_ph(_mm256_loadu_ps(ka), _MM_FROUND_TRUNC); + __m128i _rb = _mm256_cvtps_ph(_mm256_loadu_ps(kb), _MM_FROUND_TRUNC); + __m128i _rc = _mm256_cvtps_ph(_mm256_loadu_ps(kc), _MM_FROUND_TRUNC); + __m128i _rd = _mm256_cvtps_ph(_mm256_loadu_ps(kd), _MM_FROUND_TRUNC); + __m128i _re = _mm256_cvtps_ph(_mm256_loadu_ps(ke), 
_MM_FROUND_TRUNC); + __m128i _rf = _mm256_cvtps_ph(_mm256_loadu_ps(kf), _MM_FROUND_TRUNC); + + __m256i _r08 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r0), _r8, 1); + __m256i _r19 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r1), _r9, 1); + __m256i _r2a = _mm256_inserti128_si256(_mm256_castsi128_si256(_r2), _ra, 1); + __m256i _r3b = _mm256_inserti128_si256(_mm256_castsi128_si256(_r3), _rb, 1); + __m256i _r4c = _mm256_inserti128_si256(_mm256_castsi128_si256(_r4), _rc, 1); + __m256i _r5d = _mm256_inserti128_si256(_mm256_castsi128_si256(_r5), _rd, 1); + __m256i _r6e = _mm256_inserti128_si256(_mm256_castsi128_si256(_r6), _re, 1); + __m256i _r7f = _mm256_inserti128_si256(_mm256_castsi128_si256(_r7), _rf, 1); + + __m256i _tmp0 = _mm256_unpacklo_epi16(_r08, _r19); + __m256i _tmp1 = _mm256_unpackhi_epi16(_r08, _r19); + __m256i _tmp2 = _mm256_unpacklo_epi16(_r2a, _r3b); + __m256i _tmp3 = _mm256_unpackhi_epi16(_r2a, _r3b); + __m256i _tmp4 = _mm256_unpacklo_epi16(_r4c, _r5d); + __m256i _tmp5 = _mm256_unpackhi_epi16(_r4c, _r5d); + __m256i _tmp6 = _mm256_unpacklo_epi16(_r6e, _r7f); + __m256i _tmp7 = _mm256_unpackhi_epi16(_r6e, _r7f); + + __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, _tmp2); + __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2); + __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3); + __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3); + __m256i _tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6); + __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6); + __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7); + __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7); + + _r08 = _mm256_unpacklo_epi64(_tmpg, _tmpk); + _r19 = _mm256_unpackhi_epi64(_tmpg, _tmpk); + _r2a = _mm256_unpacklo_epi64(_tmph, _tmpl); + _r3b = _mm256_unpackhi_epi64(_tmph, _tmpl); + _r4c = _mm256_unpacklo_epi64(_tmpi, _tmpm); + _r5d = _mm256_unpackhi_epi64(_tmpi, _tmpm); + _r6e = _mm256_unpacklo_epi64(_tmpj, _tmpn); + _r7f = _mm256_unpackhi_epi64(_tmpj, _tmpn); + + _mm256_storeu_si256((__m256i*)g0, _r08); + _mm256_storeu_si256((__m256i*)(g0 + 16), _r19); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 2), _r2a); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 3), _r3b); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 4), _r4c); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 5), _r5d); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 6), _r6e); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 7), _r7f); +#else + __m256 _r0 = _mm256_loadu_ps(k0); + __m256 _r1 = _mm256_loadu_ps(k1); + __m256 _r2 = _mm256_loadu_ps(k2); + __m256 _r3 = _mm256_loadu_ps(k3); + __m256 _r4 = _mm256_loadu_ps(k4); + __m256 _r5 = _mm256_loadu_ps(k5); + __m256 _r6 = _mm256_loadu_ps(k6); + __m256 _r7 = _mm256_loadu_ps(k7); + __m256 _r8 = _mm256_loadu_ps(k8); + __m256 _r9 = _mm256_loadu_ps(k9); + __m256 _ra = _mm256_loadu_ps(ka); + __m256 _rb = _mm256_loadu_ps(kb); + __m256 _rc = _mm256_loadu_ps(kc); + __m256 _rd = _mm256_loadu_ps(kd); + __m256 _re = _mm256_loadu_ps(ke); + __m256 _rf = _mm256_loadu_ps(kf); + + transpose8x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + + _mm256_storeu_ps(g0, _r0); + _mm256_storeu_ps(g0 + 8, _r1); + _mm256_storeu_ps(g0 + 8 * 2, _r2); + _mm256_storeu_ps(g0 + 8 * 3, _r3); + _mm256_storeu_ps(g0 + 8 * 4, _r4); + _mm256_storeu_ps(g0 + 8 * 5, _r5); + _mm256_storeu_ps(g0 + 8 * 6, _r6); + _mm256_storeu_ps(g0 + 8 * 7, _r7); + _mm256_storeu_ps(g0 + 8 * 8, _r8); + _mm256_storeu_ps(g0 + 8 * 9, _r9); + _mm256_storeu_ps(g0 + 8 * 10, _ra); + _mm256_storeu_ps(g0 + 8 * 11, _rb); + _mm256_storeu_ps(g0 + 8 * 12, _rc); + 
_mm256_storeu_ps(g0 + 8 * 13, _rd); + _mm256_storeu_ps(g0 + 8 * 14, _re); + _mm256_storeu_ps(g0 + 8 * 15, _rf); +#endif + + k0 += 8; + k1 += 8; + k2 += 8; + k3 += 8; + k4 += 8; + k5 += 8; + k6 += 8; + k7 += 8; + k8 += 8; + k9 += 8; + ka += 8; + kb += 8; + kc += 8; + kd += 8; + ke += 8; + kf += 8; + g0 += 128; + } + for (; p < num_input; p++) + { +#if NCNN_IMPL_FP16S + g0[0] = float32_to_float16(*k0++); + g0[1] = float32_to_float16(*k1++); + g0[2] = float32_to_float16(*k2++); + g0[3] = float32_to_float16(*k3++); + g0[4] = float32_to_float16(*k4++); + g0[5] = float32_to_float16(*k5++); + g0[6] = float32_to_float16(*k6++); + g0[7] = float32_to_float16(*k7++); + g0[8] = float32_to_float16(*k8++); + g0[9] = float32_to_float16(*k9++); + g0[10] = float32_to_float16(*ka++); + g0[11] = float32_to_float16(*kb++); + g0[12] = float32_to_float16(*kc++); + g0[13] = float32_to_float16(*kd++); + g0[14] = float32_to_float16(*ke++); + g0[15] = float32_to_float16(*kf++); +#else + g0[0] = *k0++; + g0[1] = *k1++; + g0[2] = *k2++; + g0[3] = *k3++; + g0[4] = *k4++; + g0[5] = *k5++; + g0[6] = *k6++; + g0[7] = *k7++; + g0[8] = *k8++; + g0[9] = *k9++; + g0[10] = *ka++; + g0[11] = *kb++; + g0[12] = *kc++; + g0[13] = *kd++; + g0[14] = *ke++; + g0[15] = *kf++; +#endif + g0 += 16; + } + } + } +#endif // __AVX512F__ + + if (out_elempack == 8) + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + +#if NCNN_IMPL_FP16S + weight_data_tm.create(num_input, num_output / 8, (size_t)16u, 8); +#else + weight_data_tm.create(num_input, num_output / 8, (size_t)32u, 8); +#endif + + for (int q = 0; q + 7 < num_output; q += 8) + { +#if NCNN_IMPL_FP16S + unsigned short* g0 = weight_data_tm.row(q / 8); +#else + float* g0 = weight_data_tm.row(q / 8); +#endif + + const float* k0 = weight_data_r2.row(q); + const float* k1 = weight_data_r2.row(q + 1); + const float* k2 = weight_data_r2.row(q + 2); + const float* k3 = weight_data_r2.row(q + 3); + const float* k4 = weight_data_r2.row(q + 4); + const float* k5 = weight_data_r2.row(q + 5); + const float* k6 = weight_data_r2.row(q + 6); + const float* k7 = weight_data_r2.row(q + 7); + + int p = 0; +#if __AVX512F__ + for (; p + 15 < num_input; p += 16) + { + // transpose 16x8 +#if NCNN_IMPL_FP16S + __m256i _r0 = _mm512_cvtps_ph(_mm512_loadu_ps(k0), _MM_FROUND_TRUNC); + __m256i _r1 = _mm512_cvtps_ph(_mm512_loadu_ps(k1), _MM_FROUND_TRUNC); + __m256i _r2 = _mm512_cvtps_ph(_mm512_loadu_ps(k2), _MM_FROUND_TRUNC); + __m256i _r3 = _mm512_cvtps_ph(_mm512_loadu_ps(k3), _MM_FROUND_TRUNC); + __m256i _r4 = _mm512_cvtps_ph(_mm512_loadu_ps(k4), _MM_FROUND_TRUNC); + __m256i _r5 = _mm512_cvtps_ph(_mm512_loadu_ps(k5), _MM_FROUND_TRUNC); + __m256i _r6 = _mm512_cvtps_ph(_mm512_loadu_ps(k6), _MM_FROUND_TRUNC); + __m256i _r7 = _mm512_cvtps_ph(_mm512_loadu_ps(k7), _MM_FROUND_TRUNC); + + transpose16x8_epi16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); + + _mm256_storeu_si256((__m256i*)g0, _r0); + _mm256_storeu_si256((__m256i*)(g0 + 16), _r1); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 2), _r2); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 3), _r3); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 4), _r4); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 5), _r5); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 6), _r6); + _mm256_storeu_si256((__m256i*)(g0 + 16 * 7), _r7); +#else + __m512 _r0 = _mm512_loadu_ps(k0); + __m512 _r1 = _mm512_loadu_ps(k1); + __m512 _r2 = _mm512_loadu_ps(k2); + __m512 _r3 = _mm512_loadu_ps(k3); + __m512 _r4 = _mm512_loadu_ps(k4); + __m512 _r5 = _mm512_loadu_ps(k5); + __m512 _r6 = 
_mm512_loadu_ps(k6); + __m512 _r7 = _mm512_loadu_ps(k7); + + transpose16x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); + + _mm512_storeu_ps(g0, _r0); + _mm512_storeu_ps(g0 + 16, _r1); + _mm512_storeu_ps(g0 + 16 * 2, _r2); + _mm512_storeu_ps(g0 + 16 * 3, _r3); + _mm512_storeu_ps(g0 + 16 * 4, _r4); + _mm512_storeu_ps(g0 + 16 * 5, _r5); + _mm512_storeu_ps(g0 + 16 * 6, _r6); + _mm512_storeu_ps(g0 + 16 * 7, _r7); +#endif + + k0 += 16; + k1 += 16; + k2 += 16; + k3 += 16; + k4 += 16; + k5 += 16; + k6 += 16; + k7 += 16; + g0 += 128; + } +#endif // __AVX512F__ + for (; p + 7 < num_input; p += 8) + { + // transpose 8x8 +#if NCNN_IMPL_FP16S + __m128i _r0 = _mm256_cvtps_ph(_mm256_loadu_ps(k0), _MM_FROUND_TRUNC); + __m128i _r1 = _mm256_cvtps_ph(_mm256_loadu_ps(k1), _MM_FROUND_TRUNC); + __m128i _r2 = _mm256_cvtps_ph(_mm256_loadu_ps(k2), _MM_FROUND_TRUNC); + __m128i _r3 = _mm256_cvtps_ph(_mm256_loadu_ps(k3), _MM_FROUND_TRUNC); + __m128i _r4 = _mm256_cvtps_ph(_mm256_loadu_ps(k4), _MM_FROUND_TRUNC); + __m128i _r5 = _mm256_cvtps_ph(_mm256_loadu_ps(k5), _MM_FROUND_TRUNC); + __m128i _r6 = _mm256_cvtps_ph(_mm256_loadu_ps(k6), _MM_FROUND_TRUNC); + __m128i _r7 = _mm256_cvtps_ph(_mm256_loadu_ps(k7), _MM_FROUND_TRUNC); + + transpose8x8_epi16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); + + _mm_storeu_si128((__m128i*)g0, _r0); + _mm_storeu_si128((__m128i*)(g0 + 8), _r1); + _mm_storeu_si128((__m128i*)(g0 + 16), _r2); + _mm_storeu_si128((__m128i*)(g0 + 24), _r3); + _mm_storeu_si128((__m128i*)(g0 + 32), _r4); + _mm_storeu_si128((__m128i*)(g0 + 40), _r5); + _mm_storeu_si128((__m128i*)(g0 + 48), _r6); + _mm_storeu_si128((__m128i*)(g0 + 56), _r7); +#else + __m256 _r0 = _mm256_loadu_ps(k0); + __m256 _r1 = _mm256_loadu_ps(k1); + __m256 _r2 = _mm256_loadu_ps(k2); + __m256 _r3 = _mm256_loadu_ps(k3); + __m256 _r4 = _mm256_loadu_ps(k4); + __m256 _r5 = _mm256_loadu_ps(k5); + __m256 _r6 = _mm256_loadu_ps(k6); + __m256 _r7 = _mm256_loadu_ps(k7); + + transpose8x8_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7); + + _mm256_storeu_ps(g0, _r0); + _mm256_storeu_ps(g0 + 8, _r1); + _mm256_storeu_ps(g0 + 16, _r2); + _mm256_storeu_ps(g0 + 24, _r3); + _mm256_storeu_ps(g0 + 32, _r4); + _mm256_storeu_ps(g0 + 40, _r5); + _mm256_storeu_ps(g0 + 48, _r6); + _mm256_storeu_ps(g0 + 56, _r7); +#endif + + k0 += 8; + k1 += 8; + k2 += 8; + k3 += 8; + k4 += 8; + k5 += 8; + k6 += 8; + k7 += 8; + g0 += 64; + } + for (; p < num_input; p++) + { +#if NCNN_IMPL_FP16S + g0[0] = float32_to_float16(*k0++); + g0[1] = float32_to_float16(*k1++); + g0[2] = float32_to_float16(*k2++); + g0[3] = float32_to_float16(*k3++); + g0[4] = float32_to_float16(*k4++); + g0[5] = float32_to_float16(*k5++); + g0[6] = float32_to_float16(*k6++); + g0[7] = float32_to_float16(*k7++); +#else + g0[0] = *k0++; + g0[1] = *k1++; + g0[2] = *k2++; + g0[3] = *k3++; + g0[4] = *k4++; + g0[5] = *k5++; + g0[6] = *k6++; + g0[7] = *k7++; +#endif + g0 += 8; + } + } + } +#endif // __AVX__ + + if (out_elempack == 4) + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + +#if NCNN_IMPL_FP16S + weight_data_tm.create(num_input, num_output / 4, (size_t)8u, 4); +#else + weight_data_tm.create(num_input, num_output / 4, (size_t)16u, 4); +#endif + + for (int q = 0; q + 3 < num_output; q += 4) + { +#if NCNN_IMPL_FP16S + unsigned short* g0 = weight_data_tm.row(q / 4); +#else + float* g0 = weight_data_tm.row(q / 4); +#endif + + const float* k0 = weight_data_r2.row(q); + const float* k1 = weight_data_r2.row(q + 1); + const float* k2 = weight_data_r2.row(q + 2); + const float* k3 = weight_data_r2.row(q + 
3); + + int p = 0; + for (; p + 3 < num_input; p += 4) + { + // transpose 4x4 + __m128 _r0 = _mm_loadu_ps(k0); + __m128 _r1 = _mm_loadu_ps(k1); + __m128 _r2 = _mm_loadu_ps(k2); + __m128 _r3 = _mm_loadu_ps(k3); + _MM_TRANSPOSE4_PS(_r0, _r1, _r2, _r3); +#if NCNN_IMPL_FP16S + __m256 _r01 = _mm256_insertf128_ps(_mm256_castps128_ps256(_r0), _r1, 1); + __m256 _r23 = _mm256_insertf128_ps(_mm256_castps128_ps256(_r2), _r3, 1); + __m128i _r01_fp16 = _mm256_cvtps_ph(_r01, _MM_FROUND_TRUNC); + __m128i _r23_fp16 = _mm256_cvtps_ph(_r23, _MM_FROUND_TRUNC); + _mm_storeu_si128((__m128i*)g0, _r01_fp16); + _mm_storeu_si128((__m128i*)(g0 + 8), _r23_fp16); +#else + _mm_storeu_ps(g0, _r0); + _mm_storeu_ps(g0 + 4, _r1); + _mm_storeu_ps(g0 + 8, _r2); + _mm_storeu_ps(g0 + 12, _r3); +#endif + + k0 += 4; + k1 += 4; + k2 += 4; + k3 += 4; + g0 += 16; + } + for (; p < num_input; p++) + { +#if NCNN_IMPL_FP16S + g0[0] = float32_to_float16(*k0++); + g0[1] = float32_to_float16(*k1++); + g0[2] = float32_to_float16(*k2++); + g0[3] = float32_to_float16(*k3++); +#else + g0[0] = *k0++; + g0[1] = *k1++; + g0[2] = *k2++; + g0[3] = *k3++; +#endif + g0 += 4; + } + } + } +#endif // __SSE2__ + + if (out_elempack == 1) + { +#if NCNN_IMPL_FP16S + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt); +#else + weight_data_tm = weight_data; +#endif + } +#endif // NCNN_RUNTIME_CPU +} diff --git a/src/layer/x86/innerproduct_fp16s.h b/src/layer/x86/innerproduct_fp16s.h deleted file mode 100644 index acf22ec2dd33..000000000000 --- a/src/layer/x86/innerproduct_fp16s.h +++ /dev/null @@ -1,1200 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
- -#if __AVX512F__ -static void innerproduct_fp16s_pack16_avx512(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) -{ - const int num_input = bottom_blob.w * bottom_blob.elempack; - const int num_output = top_blob.w; - - const float* bias_data_ptr = bias_data; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output; p++) - { - __m512 _sum0 = _mm512_setzero_ps(); - __m512 _sum1 = _mm512_setzero_ps(); - __m512 _sum2 = _mm512_setzero_ps(); - __m512 _sum3 = _mm512_setzero_ps(); - __m512 _sum4 = _mm512_setzero_ps(); - __m512 _sum5 = _mm512_setzero_ps(); - __m512 _sum6 = _mm512_setzero_ps(); - __m512 _sum7 = _mm512_setzero_ps(); - - if (bias_data_ptr) - { - _sum0 = _mm512_loadu_ps(bias_data_ptr + p * 16); - } - - const unsigned short* kptr = weight_data_fp16.row(p); - - const float* sptr = bottom_blob; - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m512i _w01 = _mm512_loadu_si512(kptr); - __m512 _val0 = _mm512_set1_ps(sptr[0]); - __m512 _w0 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w01, 0)); - _sum0 = _mm512_fmadd_ps(_val0, _w0, _sum0); - - __m512 _val1 = _mm512_set1_ps(sptr[1]); - __m512 _w1 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w01, 1)); - _sum1 = _mm512_fmadd_ps(_val1, _w1, _sum1); - - __m512i _w23 = _mm512_loadu_si512(kptr + 32); - __m512 _val2 = _mm512_set1_ps(sptr[2]); - __m512 _w2 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w23, 0)); - _sum2 = _mm512_fmadd_ps(_val2, _w2, _sum2); - - __m512 _val3 = _mm512_set1_ps(sptr[3]); - __m512 _w3 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w23, 1)); - _sum3 = _mm512_fmadd_ps(_val3, _w3, _sum3); - - __m512i _w45 = _mm512_loadu_si512(kptr + 64); - __m512 _val4 = _mm512_set1_ps(sptr[4]); - __m512 _w4 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w45, 0)); - _sum4 = _mm512_fmadd_ps(_val4, _w4, _sum4); - - __m512 _val5 = _mm512_set1_ps(sptr[5]); - __m512 _w5 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w45, 1)); - _sum5 = _mm512_fmadd_ps(_val5, _w5, _sum5); - - __m512i _w67 = _mm512_loadu_si512(kptr + 96); - __m512 _val6 = _mm512_set1_ps(sptr[6]); - __m512 _w6 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w67, 0)); - _sum6 = _mm512_fmadd_ps(_val6, _w6, _sum6); - - __m512 _val7 = _mm512_set1_ps(sptr[7]); - __m512 _w7 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w67, 1)); - _sum7 = _mm512_fmadd_ps(_val7, _w7, _sum7); - - sptr += 8; - kptr += 128; - } - for (; i + 3 < num_input; i += 4) - { - __m512i _w01 = _mm512_loadu_si512(kptr); - __m512 _val0 = _mm512_set1_ps(sptr[0]); - __m512 _w0 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w01, 0)); - _sum0 = _mm512_fmadd_ps(_val0, _w0, _sum0); - - __m512 _val1 = _mm512_set1_ps(sptr[1]); - __m512 _w1 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w01, 1)); - _sum1 = _mm512_fmadd_ps(_val1, _w1, _sum1); - - __m512i _w23 = _mm512_loadu_si512(kptr + 32); - __m512 _val2 = _mm512_set1_ps(sptr[2]); - __m512 _w2 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w23, 0)); - _sum2 = _mm512_fmadd_ps(_val2, _w2, _sum2); - - __m512 _val3 = _mm512_set1_ps(sptr[3]); - __m512 _w3 = _mm512_cvtph_ps(_mm512_extracti32x8_epi32(_w23, 1)); - _sum3 = _mm512_fmadd_ps(_val3, _w3, _sum3); - - sptr += 4; - kptr += 64; - } - for (; i < num_input; i++) - { - __m512 _val = _mm512_set1_ps(sptr[0]); - __m512 _w = _mm512_cvtph_ps(_mm256_lddqu_si256((const __m256i*)kptr)); - _sum0 = _mm512_fmadd_ps(_val, _w, _sum0); - - sptr += 1; - kptr += 16; - } - - _sum0 = _mm512_add_ps(_sum0, _sum1); - 
_sum2 = _mm512_add_ps(_sum2, _sum3); - _sum4 = _mm512_add_ps(_sum4, _sum5); - _sum6 = _mm512_add_ps(_sum6, _sum7); - _sum0 = _mm512_add_ps(_sum0, _sum2); - _sum4 = _mm512_add_ps(_sum4, _sum6); - _sum0 = _mm512_add_ps(_sum0, _sum4); - - _sum0 = activation_avx512(_sum0, activation_type, activation_params); - - float* outptr = top_blob; - _mm512_storeu_ps(outptr + p * 16, _sum0); - } -} -#endif // __AVX512F__ - -#if NCNN_RUNTIME_CPU && NCNN_F16C && __AVX__ && !__F16C__ -void innerproduct_fp16s_pack8_avx_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); -void innerproduct_fp16s_pack4_sse_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); -void innerproduct_fp16s_sse_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); -void innerproduct_transform_kernel_fp16s_sse_f16c(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt); -#endif - -static void innerproduct_fp16s_pack8_avx(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) -{ -#if NCNN_RUNTIME_CPU && NCNN_F16C && __AVX__ && !__F16C__ - if (ncnn::cpu_support_x86_f16c()) - { - innerproduct_fp16s_pack8_avx_f16c(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt); - return; - } -#endif - -#if __F16C__ - const int num_input = bottom_blob.w * bottom_blob.elempack; - const int num_output = top_blob.w; - - const float* bias_data_ptr = bias_data; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output; p++) - { - __m256 _sum0 = _mm256_setzero_ps(); - __m256 _sum1 = _mm256_setzero_ps(); - __m256 _sum2 = _mm256_setzero_ps(); - __m256 _sum3 = _mm256_setzero_ps(); - __m256 _sum4 = _mm256_setzero_ps(); - __m256 _sum5 = _mm256_setzero_ps(); - __m256 _sum6 = _mm256_setzero_ps(); - __m256 _sum7 = _mm256_setzero_ps(); - - if (bias_data_ptr) - { - _sum0 = _mm256_loadu_ps(bias_data_ptr + p * 8); - } - - const unsigned short* kptr = weight_data_fp16.row(p); - - const float* sptr = bottom_blob; - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m256i _w01 = _mm256_lddqu_si256((const __m256i*)kptr); - __m256 _val0 = _mm256_broadcast_ss(sptr); - __m128i _w0_fp16 = _mm256_extractf128_si256(_w01, 0); - __m256 _w0 = _mm256_cvtph_ps(_w0_fp16); - _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); - - __m256 _val1 = _mm256_broadcast_ss(sptr + 1); - __m128i _w1_fp16 = _mm256_extractf128_si256(_w01, 1); - __m256 _w1 = _mm256_cvtph_ps(_w1_fp16); - _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); - - __m256i _w23 = _mm256_lddqu_si256((const __m256i*)(kptr + 16)); - __m256 _val2 = _mm256_broadcast_ss(sptr + 2); - __m128i _w2_fp16 = _mm256_extractf128_si256(_w23, 0); - __m256 _w2 = _mm256_cvtph_ps(_w2_fp16); - _sum2 = _mm256_comp_fmadd_ps(_val2, _w2, _sum2); - - __m256 _val3 = _mm256_broadcast_ss(sptr + 3); - __m128i _w3_fp16 = _mm256_extractf128_si256(_w23, 1); - __m256 _w3 = _mm256_cvtph_ps(_w3_fp16); - _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); - - __m256i _w45 = _mm256_lddqu_si256((const __m256i*)(kptr + 32)); - __m256 _val4 = _mm256_broadcast_ss(sptr + 4); - __m128i _w4_fp16 = 
_mm256_extractf128_si256(_w45, 0); - __m256 _w4 = _mm256_cvtph_ps(_w4_fp16); - _sum4 = _mm256_comp_fmadd_ps(_val4, _w4, _sum4); - - __m256 _val5 = _mm256_broadcast_ss(sptr + 5); - __m128i _w5_fp16 = _mm256_extractf128_si256(_w45, 1); - __m256 _w5 = _mm256_cvtph_ps(_w5_fp16); - _sum5 = _mm256_comp_fmadd_ps(_val5, _w5, _sum5); - - __m256i _w67 = _mm256_lddqu_si256((const __m256i*)(kptr + 48)); - __m256 _val6 = _mm256_broadcast_ss(sptr + 6); - __m128i _w6_fp16 = _mm256_extractf128_si256(_w67, 0); - __m256 _w6 = _mm256_cvtph_ps(_w6_fp16); - _sum6 = _mm256_comp_fmadd_ps(_val6, _w6, _sum6); - - __m256 _val7 = _mm256_broadcast_ss(sptr + 7); - __m128i _w7_fp16 = _mm256_extractf128_si256(_w67, 1); - __m256 _w7 = _mm256_cvtph_ps(_w7_fp16); - _sum7 = _mm256_comp_fmadd_ps(_val7, _w7, _sum7); - - sptr += 8; - kptr += 64; - } - for (; i + 3 < num_input; i += 4) - { - __m256i _w01 = _mm256_lddqu_si256((const __m256i*)kptr); - __m256 _val0 = _mm256_broadcast_ss(sptr); - __m256 _w0 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 0)); - _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); - - __m256 _val1 = _mm256_broadcast_ss(sptr + 1); - __m256 _w1 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 1)); - _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); - - __m256i _w23 = _mm256_lddqu_si256((const __m256i*)(kptr + 16)); - __m256 _val2 = _mm256_broadcast_ss(sptr + 2); - __m256 _w2 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 0)); - _sum2 = _mm256_comp_fmadd_ps(_val2, _w2, _sum2); - - __m256 _val3 = _mm256_broadcast_ss(sptr + 3); - __m256 _w3 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 1)); - _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); - - sptr += 4; - kptr += 32; - } - for (; i < num_input; i++) - { - __m256 _val = _mm256_set1_ps(sptr[0]); - __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)kptr)); - _sum0 = _mm256_comp_fmadd_ps(_val, _w, _sum0); - - sptr += 1; - kptr += 8; - } - - _sum0 = _mm256_add_ps(_sum0, _sum1); - _sum2 = _mm256_add_ps(_sum2, _sum3); - _sum4 = _mm256_add_ps(_sum4, _sum5); - _sum6 = _mm256_add_ps(_sum6, _sum7); - _sum0 = _mm256_add_ps(_sum0, _sum2); - _sum4 = _mm256_add_ps(_sum4, _sum6); - _sum0 = _mm256_add_ps(_sum0, _sum4); - - _sum0 = activation_avx(_sum0, activation_type, activation_params); - - float* outptr = top_blob; - _mm256_storeu_ps(outptr + p * 8, _sum0); - } -#else // __F16C__ - (void)bottom_blob; - (void)top_blob; - (void)weight_data_fp16; - (void)bias_data; - (void)activation_type; - (void)activation_params; - (void)opt; -#endif // __F16C__ -} - -static void innerproduct_fp16s_pack4_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) -{ -#if NCNN_RUNTIME_CPU && NCNN_F16C && __AVX__ && !__F16C__ - if (ncnn::cpu_support_x86_f16c()) - { - innerproduct_fp16s_pack4_sse_f16c(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt); - return; - } -#endif - -#if __F16C__ - const int num_input = bottom_blob.w * bottom_blob.elempack; - const int num_output = top_blob.w; - - const float* bias_data_ptr = bias_data; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output; p++) - { - __m128 _sum0 = _mm_setzero_ps(); - - __m256 _sum01 = _mm256_setzero_ps(); - __m256 _sum23 = _mm256_setzero_ps(); - __m256 _sum45 = _mm256_setzero_ps(); - __m256 _sum67 = _mm256_setzero_ps(); - - if (bias_data_ptr) - { - _sum0 = _mm_loadu_ps(bias_data_ptr + p * 4); - } - - const unsigned short* kptr = 
weight_data_fp16.row(p); - - const float* sptr = bottom_blob; - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m128 _val0 = _mm_broadcast_ss(sptr); - __m128 _val1 = _mm_broadcast_ss(sptr + 1); - __m128 _val2 = _mm_broadcast_ss(sptr + 2); - __m128 _val3 = _mm_broadcast_ss(sptr + 3); - __m128 _val4 = _mm_broadcast_ss(sptr + 4); - __m128 _val5 = _mm_broadcast_ss(sptr + 5); - __m128 _val6 = _mm_broadcast_ss(sptr + 6); - __m128 _val7 = _mm_broadcast_ss(sptr + 7); - - __m256 _val01 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val0), _val1, 1); - __m256 _val23 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val2), _val3, 1); - __m256 _val45 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val4), _val5, 1); - __m256 _val67 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val6), _val7, 1); - - __m256i _w0123 = _mm256_lddqu_si256((const __m256i*)kptr); - __m256i _w4567 = _mm256_lddqu_si256((const __m256i*)(kptr + 16)); - - __m256 _w01 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w0123, 0)); - __m256 _w23 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w0123, 1)); - __m256 _w45 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w4567, 0)); - __m256 _w67 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w4567, 1)); - - _sum01 = _mm256_comp_fmadd_ps(_val01, _w01, _sum01); - _sum23 = _mm256_comp_fmadd_ps(_val23, _w23, _sum23); - _sum45 = _mm256_comp_fmadd_ps(_val45, _w45, _sum45); - _sum67 = _mm256_comp_fmadd_ps(_val67, _w67, _sum67); - - sptr += 8; - kptr += 32; - } - for (; i + 3 < num_input; i += 4) - { - __m128 _val0 = _mm_set1_ps(sptr[0]); - __m128 _val1 = _mm_set1_ps(sptr[1]); - __m128 _val2 = _mm_set1_ps(sptr[2]); - __m128 _val3 = _mm_set1_ps(sptr[3]); - - __m256 _val01 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val0), _val1, 1); - __m256 _val23 = _mm256_insertf128_ps(_mm256_castps128_ps256(_val2), _val3, 1); - - __m256i _w0123 = _mm256_lddqu_si256((const __m256i*)kptr); - __m256 _w01 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w0123, 0)); - __m256 _w23 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w0123, 1)); - - _sum01 = _mm256_comp_fmadd_ps(_val01, _w01, _sum01); - _sum23 = _mm256_comp_fmadd_ps(_val23, _w23, _sum23); - - sptr += 4; - kptr += 16; - } - for (; i < num_input; i++) - { - __m128 _val = _mm_set1_ps(sptr[0]); - __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); - _sum0 = _mm_comp_fmadd_ps(_val, _w, _sum0); - - sptr += 1; - kptr += 4; - } - - _sum01 = _mm256_add_ps(_sum01, _sum23); - _sum45 = _mm256_add_ps(_sum45, _sum67); - _sum01 = _mm256_add_ps(_sum01, _sum45); - - _sum0 = _mm_add_ps(_sum0, _mm256_extractf128_ps(_sum01, 0)); - _sum0 = _mm_add_ps(_sum0, _mm256_extractf128_ps(_sum01, 1)); - - _sum0 = activation_sse(_sum0, activation_type, activation_params); - - float* outptr = top_blob; - _mm_storeu_ps(outptr + p * 4, _sum0); - } -#else // __F16C__ - (void)bottom_blob; - (void)top_blob; - (void)weight_data_fp16; - (void)bias_data; - (void)activation_type; - (void)activation_params; - (void)opt; -#endif // __F16C__ -} - -static void innerproduct_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) -{ -#if NCNN_RUNTIME_CPU && NCNN_F16C && __AVX__ && !__F16C__ - if (ncnn::cpu_support_x86_f16c()) - { - innerproduct_fp16s_sse_f16c(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt); - return; - } -#endif - -#if __F16C__ - const int num_input = bottom_blob.w * bottom_blob.elempack; - const int num_output = top_blob.w; - - const 
float* bias_data_ptr = bias_data; - - int remain_num_output_start = 0; - int nn_num_output = num_output >> 3; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp = 0; pp < nn_num_output; pp++) - { - int p = pp * 8; - - float sums[8] = {0.0f}; - if (bias_data_ptr) - { - sums[0] = bias_data_ptr[p]; - sums[1] = bias_data_ptr[p + 1]; - sums[2] = bias_data_ptr[p + 2]; - sums[3] = bias_data_ptr[p + 3]; - sums[4] = bias_data_ptr[p + 4]; - sums[5] = bias_data_ptr[p + 5]; - sums[6] = bias_data_ptr[p + 6]; - sums[7] = bias_data_ptr[p + 7]; - } - - const unsigned short* w0 = weight_data_fp16.row(p); - const unsigned short* w1 = weight_data_fp16.row(p + 1); - const unsigned short* w2 = weight_data_fp16.row(p + 2); - const unsigned short* w3 = weight_data_fp16.row(p + 3); - const unsigned short* w4 = weight_data_fp16.row(p + 4); - const unsigned short* w5 = weight_data_fp16.row(p + 5); - const unsigned short* w6 = weight_data_fp16.row(p + 6); - const unsigned short* w7 = weight_data_fp16.row(p + 7); - - const float* m = bottom_blob; - - __m256 _sum0 = _mm256_setzero_ps(); - __m256 _sum1 = _mm256_setzero_ps(); - __m256 _sum2 = _mm256_setzero_ps(); - __m256 _sum3 = _mm256_setzero_ps(); - __m256 _sum4 = _mm256_setzero_ps(); - __m256 _sum5 = _mm256_setzero_ps(); - __m256 _sum6 = _mm256_setzero_ps(); - __m256 _sum7 = _mm256_setzero_ps(); - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m256 _m = _mm256_loadu_ps(m); - - __m256 _w0 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w0)); - __m256 _w1 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w1)); - __m256 _w2 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w2)); - __m256 _w3 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w3)); - __m256 _w4 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w4)); - __m256 _w5 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w5)); - __m256 _w6 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w6)); - __m256 _w7 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w7)); - - _sum0 = _mm256_comp_fmadd_ps(_m, _w0, _sum0); - _sum1 = _mm256_comp_fmadd_ps(_m, _w1, _sum1); - _sum2 = _mm256_comp_fmadd_ps(_m, _w2, _sum2); - _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); - _sum4 = _mm256_comp_fmadd_ps(_m, _w4, _sum4); - _sum5 = _mm256_comp_fmadd_ps(_m, _w5, _sum5); - _sum6 = _mm256_comp_fmadd_ps(_m, _w6, _sum6); - _sum7 = _mm256_comp_fmadd_ps(_m, _w7, _sum7); - - m += 8; - w0 += 8; - w1 += 8; - w2 += 8; - w3 += 8; - w4 += 8; - w5 += 8; - w6 += 8; - w7 += 8; - } - for (; i < num_input; i++) - { - sums[0] += *m * float16_to_float32(*w0); - sums[1] += *m * float16_to_float32(*w1); - sums[2] += *m * float16_to_float32(*w2); - sums[3] += *m * float16_to_float32(*w3); - sums[4] += *m * float16_to_float32(*w4); - sums[5] += *m * float16_to_float32(*w5); - sums[6] += *m * float16_to_float32(*w6); - sums[7] += *m * float16_to_float32(*w7); - - m++; - w0++; - w1++; - w2++; - w3++; - w4++; - w5++; - w6++; - w7++; - } - - __m256 _sums = HorizontalSums(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7); - __m256 _sums_f = _mm256_loadu_ps(sums); - _sums = _mm256_add_ps(_sums_f, _sums); - _sums = activation_avx(_sums, activation_type, activation_params); - - float* outptr = top_blob; - _mm256_storeu_ps(outptr + p, _sums); - } - - remain_num_output_start += (nn_num_output << 3); - nn_num_output = (num_output - remain_num_output_start) >> 2; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp = 0; pp < nn_num_output; pp++) - { - int p = remain_num_output_start + (pp * 4); - - float sums[4] = 
{0.0f}; - if (bias_data_ptr) - { - sums[0] = bias_data_ptr[p]; - sums[1] = bias_data_ptr[p + 1]; - sums[2] = bias_data_ptr[p + 2]; - sums[3] = bias_data_ptr[p + 3]; - } - - const unsigned short* w0 = weight_data_fp16.row(p); - const unsigned short* w1 = weight_data_fp16.row(p + 1); - const unsigned short* w2 = weight_data_fp16.row(p + 2); - const unsigned short* w3 = weight_data_fp16.row(p + 3); - - const float* m = bottom_blob; - - int i = 0; - - __m256 _sum0 = _mm256_setzero_ps(); - __m256 _sum1 = _mm256_setzero_ps(); - __m256 _sum2 = _mm256_setzero_ps(); - __m256 _sum3 = _mm256_setzero_ps(); - for (; i + 7 < num_input; i += 8) - { - __m256 _m = _mm256_loadu_ps(m); - - __m256 _w0 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w0)); - __m256 _w1 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w1)); - __m256 _w2 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w2)); - __m256 _w3 = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w3)); - - _sum0 = _mm256_comp_fmadd_ps(_m, _w0, _sum0); - _sum1 = _mm256_comp_fmadd_ps(_m, _w1, _sum1); - _sum2 = _mm256_comp_fmadd_ps(_m, _w2, _sum2); - _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); - - m += 8; - w0 += 8; - w1 += 8; - w2 += 8; - w3 += 8; - } - - __m128 _sum0l = _mm_setzero_ps(); - __m128 _sum1l = _mm_setzero_ps(); - __m128 _sum2l = _mm_setzero_ps(); - __m128 _sum3l = _mm_setzero_ps(); - for (; i + 3 < num_input; i += 4) - { - __m128 _m = _mm_loadu_ps(m); - - __m128 _w0 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w0)); - __m128 _w1 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w1)); - __m128 _w2 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w2)); - __m128 _w3 = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w3)); - - _sum0l = _mm_comp_fmadd_ps(_m, _w0, _sum0l); - _sum1l = _mm_comp_fmadd_ps(_m, _w1, _sum1l); - _sum2l = _mm_comp_fmadd_ps(_m, _w2, _sum2l); - _sum3l = _mm_comp_fmadd_ps(_m, _w3, _sum3l); - - m += 4; - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - } - for (; i < num_input; i++) - { - sums[0] += *m * float16_to_float32(*w0); - sums[1] += *m * float16_to_float32(*w1); - sums[2] += *m * float16_to_float32(*w2); - sums[3] += *m * float16_to_float32(*w3); - - m++; - w0++; - w1++; - w2++; - w3++; - } - - __m128 _sums = _mm_loadu_ps(sums); - - _sums = _mm_add_ps(HorizontalSums(_sum0, _sum1, _sum2, _sum3), _sums); - - _MM_TRANSPOSE4_PS(_sum0l, _sum1l, _sum2l, _sum3l); - _sums = _mm_add_ps(_sum0l, _sums); - _sums = _mm_add_ps(_sum1l, _sums); - _sums = _mm_add_ps(_sum2l, _sums); - _sums = _mm_add_ps(_sum3l, _sums); - _sums = activation_sse(_sums, activation_type, activation_params); - - float* outptr = top_blob; - _mm_storeu_ps(outptr + p, _sums); - } - - remain_num_output_start += (nn_num_output << 2); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = remain_num_output_start; p < num_output; p++) - { - float sum = 0.f; - - if (bias_data_ptr) - sum = bias_data_ptr[p]; - - const unsigned short* w = weight_data_fp16.row(p); - - const float* m = bottom_blob; - - int i = 0; - - __m256 _sum = _mm256_setzero_ps(); - for (; i + 7 < num_input; i += 8) - { - __m256 _m = _mm256_loadu_ps(m); - __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)w)); - _sum = _mm256_comp_fmadd_ps(_m, _w, _sum); - - m += 8; - w += 8; - } - - __m128 _suml = _mm_setzero_ps(); - for (; i + 3 < num_input; i += 4) - { - __m128 _m = _mm_loadu_ps(m); - __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)w)); - _suml = _mm_comp_fmadd_ps(_m, _w, _suml); - - m += 4; - w += 4; - } - for (; i < num_input; i++) - { - sum += *m * 
float16_to_float32(*w); - m++; - w++; - } - - sum += _mm256_reduce_add_ps(_sum); - - sum += _mm_reduce_add_ps(_suml); - - sum = activation_ss(sum, activation_type, activation_params); - - float* outptr = top_blob; - outptr[p] = sum; - } -#else // __F16C__ - (void)bottom_blob; - (void)top_blob; - (void)weight_data_fp16; - (void)bias_data; - (void)activation_type; - (void)activation_params; - (void)opt; -#endif // __F16C__ -} - -static void innerproduct_transform_kernel_fp16s_sse(const Mat& weight_data, Mat& weight_data_tm, int num_input, int num_output, const Option& opt) -{ -#if NCNN_RUNTIME_CPU && NCNN_F16C && __AVX__ && !__F16C__ - if (ncnn::cpu_support_x86_f16c()) - { - innerproduct_transform_kernel_fp16s_sse_f16c(weight_data, weight_data_tm, num_input, num_output, opt); - return; - } -#endif - -#if __F16C__ - int out_elempack = 1; - if (opt.use_packing_layout) - { -#if __AVX512F__ - out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; -#else - out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; -#endif - } - - // src = inch-outch - // dst = pb-inch-outch/pb -#if __AVX512F__ - if (out_elempack == 16) - { - Mat weight_data_r2 = weight_data.reshape(num_input, num_output); - - weight_data_tm.create(num_input, num_output / 16, (size_t)32u, 16); - - for (int q = 0; q + 15 < num_output; q += 16) - { - unsigned short* g0 = weight_data_tm.row(q / 16); - - const float* k0 = weight_data_r2.row(q); - const float* k1 = weight_data_r2.row(q + 1); - const float* k2 = weight_data_r2.row(q + 2); - const float* k3 = weight_data_r2.row(q + 3); - const float* k4 = weight_data_r2.row(q + 4); - const float* k5 = weight_data_r2.row(q + 5); - const float* k6 = weight_data_r2.row(q + 6); - const float* k7 = weight_data_r2.row(q + 7); - const float* k8 = weight_data_r2.row(q + 8); - const float* k9 = weight_data_r2.row(q + 9); - const float* ka = weight_data_r2.row(q + 10); - const float* kb = weight_data_r2.row(q + 11); - const float* kc = weight_data_r2.row(q + 12); - const float* kd = weight_data_r2.row(q + 13); - const float* ke = weight_data_r2.row(q + 14); - const float* kf = weight_data_r2.row(q + 15); - - int p = 0; - for (; p + 15 < num_input; p += 16) - { - // transpose 16x16 - __m256i _r0 = _mm512_cvtps_ph(_mm512_loadu_ps(k0), _MM_FROUND_TRUNC); - __m256i _r1 = _mm512_cvtps_ph(_mm512_loadu_ps(k1), _MM_FROUND_TRUNC); - __m256i _r2 = _mm512_cvtps_ph(_mm512_loadu_ps(k2), _MM_FROUND_TRUNC); - __m256i _r3 = _mm512_cvtps_ph(_mm512_loadu_ps(k3), _MM_FROUND_TRUNC); - __m256i _r4 = _mm512_cvtps_ph(_mm512_loadu_ps(k4), _MM_FROUND_TRUNC); - __m256i _r5 = _mm512_cvtps_ph(_mm512_loadu_ps(k5), _MM_FROUND_TRUNC); - __m256i _r6 = _mm512_cvtps_ph(_mm512_loadu_ps(k6), _MM_FROUND_TRUNC); - __m256i _r7 = _mm512_cvtps_ph(_mm512_loadu_ps(k7), _MM_FROUND_TRUNC); - __m256i _r8 = _mm512_cvtps_ph(_mm512_loadu_ps(k8), _MM_FROUND_TRUNC); - __m256i _r9 = _mm512_cvtps_ph(_mm512_loadu_ps(k9), _MM_FROUND_TRUNC); - __m256i _ra = _mm512_cvtps_ph(_mm512_loadu_ps(ka), _MM_FROUND_TRUNC); - __m256i _rb = _mm512_cvtps_ph(_mm512_loadu_ps(kb), _MM_FROUND_TRUNC); - __m256i _rc = _mm512_cvtps_ph(_mm512_loadu_ps(kc), _MM_FROUND_TRUNC); - __m256i _rd = _mm512_cvtps_ph(_mm512_loadu_ps(kd), _MM_FROUND_TRUNC); - __m256i _re = _mm512_cvtps_ph(_mm512_loadu_ps(ke), _MM_FROUND_TRUNC); - __m256i _rf = _mm512_cvtps_ph(_mm512_loadu_ps(kf), _MM_FROUND_TRUNC); - - __m256i _tmp0 = _mm256_unpacklo_epi16(_r0, _r1); - __m256i _tmp1 = _mm256_unpackhi_epi16(_r0, _r1); - __m256i _tmp2 = 
_mm256_unpacklo_epi16(_r2, _r3); - __m256i _tmp3 = _mm256_unpackhi_epi16(_r2, _r3); - __m256i _tmp4 = _mm256_unpacklo_epi16(_r4, _r5); - __m256i _tmp5 = _mm256_unpackhi_epi16(_r4, _r5); - __m256i _tmp6 = _mm256_unpacklo_epi16(_r6, _r7); - __m256i _tmp7 = _mm256_unpackhi_epi16(_r6, _r7); - __m256i _tmp8 = _mm256_unpacklo_epi16(_r8, _r9); - __m256i _tmp9 = _mm256_unpackhi_epi16(_r8, _r9); - __m256i _tmpa = _mm256_unpacklo_epi16(_ra, _rb); - __m256i _tmpb = _mm256_unpackhi_epi16(_ra, _rb); - __m256i _tmpc = _mm256_unpacklo_epi16(_rc, _rd); - __m256i _tmpd = _mm256_unpackhi_epi16(_rc, _rd); - __m256i _tmpe = _mm256_unpacklo_epi16(_re, _rf); - __m256i _tmpf = _mm256_unpackhi_epi16(_re, _rf); - - __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, _tmp2); - __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2); - __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3); - __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3); - __m256i _tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6); - __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6); - __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7); - __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7); - __m256i _tmpo = _mm256_unpacklo_epi32(_tmp8, _tmpa); - __m256i _tmpp = _mm256_unpackhi_epi32(_tmp8, _tmpa); - __m256i _tmpq = _mm256_unpacklo_epi32(_tmp9, _tmpb); - __m256i _tmpr = _mm256_unpackhi_epi32(_tmp9, _tmpb); - __m256i _tmps = _mm256_unpacklo_epi32(_tmpc, _tmpe); - __m256i _tmpt = _mm256_unpackhi_epi32(_tmpc, _tmpe); - __m256i _tmpu = _mm256_unpacklo_epi32(_tmpd, _tmpf); - __m256i _tmpv = _mm256_unpackhi_epi32(_tmpd, _tmpf); - - _tmp0 = _mm256_unpacklo_epi64(_tmpg, _tmpk); - _tmp1 = _mm256_unpackhi_epi64(_tmpg, _tmpk); - _tmp2 = _mm256_unpacklo_epi64(_tmph, _tmpl); - _tmp3 = _mm256_unpackhi_epi64(_tmph, _tmpl); - _tmp4 = _mm256_unpacklo_epi64(_tmpi, _tmpm); - _tmp5 = _mm256_unpackhi_epi64(_tmpi, _tmpm); - _tmp6 = _mm256_unpacklo_epi64(_tmpj, _tmpn); - _tmp7 = _mm256_unpackhi_epi64(_tmpj, _tmpn); - _tmp8 = _mm256_unpacklo_epi64(_tmpo, _tmps); - _tmp9 = _mm256_unpackhi_epi64(_tmpo, _tmps); - _tmpa = _mm256_unpacklo_epi64(_tmpp, _tmpt); - _tmpb = _mm256_unpackhi_epi64(_tmpp, _tmpt); - _tmpc = _mm256_unpacklo_epi64(_tmpq, _tmpu); - _tmpd = _mm256_unpackhi_epi64(_tmpq, _tmpu); - _tmpe = _mm256_unpacklo_epi64(_tmpr, _tmpv); - _tmpf = _mm256_unpackhi_epi64(_tmpr, _tmpv); - - _r0 = _mm256_permute2x128_si256(_tmp0, _tmp8, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2x128_si256(_tmp1, _tmp9, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2x128_si256(_tmp2, _tmpa, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2x128_si256(_tmp3, _tmpb, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2x128_si256(_tmp4, _tmpc, _MM_SHUFFLE(0, 2, 0, 0)); - _r5 = _mm256_permute2x128_si256(_tmp5, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); - _r6 = _mm256_permute2x128_si256(_tmp6, _tmpe, _MM_SHUFFLE(0, 2, 0, 0)); - _r7 = _mm256_permute2x128_si256(_tmp7, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); - _r8 = _mm256_permute2x128_si256(_tmp0, _tmp8, _MM_SHUFFLE(0, 3, 0, 1)); - _r9 = _mm256_permute2x128_si256(_tmp1, _tmp9, _MM_SHUFFLE(0, 3, 0, 1)); - _ra = _mm256_permute2x128_si256(_tmp2, _tmpa, _MM_SHUFFLE(0, 3, 0, 1)); - _rb = _mm256_permute2x128_si256(_tmp3, _tmpb, _MM_SHUFFLE(0, 3, 0, 1)); - _rc = _mm256_permute2x128_si256(_tmp4, _tmpc, _MM_SHUFFLE(0, 3, 0, 1)); - _rd = _mm256_permute2x128_si256(_tmp5, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); - _re = _mm256_permute2x128_si256(_tmp6, _tmpe, _MM_SHUFFLE(0, 3, 0, 1)); - _rf = _mm256_permute2x128_si256(_tmp7, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); - - 
_mm256_storeu_si256((__m256i*)g0, _r0); - _mm256_storeu_si256((__m256i*)(g0 + 16), _r1); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 2), _r2); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 3), _r3); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 4), _r4); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 5), _r5); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 6), _r6); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 7), _r7); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 8), _r8); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 9), _r9); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 10), _ra); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 11), _rb); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 12), _rc); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 13), _rd); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 14), _re); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 15), _rf); - - k0 += 16; - k1 += 16; - k2 += 16; - k3 += 16; - k4 += 16; - k5 += 16; - k6 += 16; - k7 += 16; - k8 += 16; - k9 += 16; - ka += 16; - kb += 16; - kc += 16; - kd += 16; - ke += 16; - kf += 16; - g0 += 256; - } - for (; p + 7 < num_input; p += 8) - { - // transpose 16x8 - __m128i _r0 = _mm256_cvtps_ph(_mm256_loadu_ps(k0), _MM_FROUND_TRUNC); - __m128i _r1 = _mm256_cvtps_ph(_mm256_loadu_ps(k1), _MM_FROUND_TRUNC); - __m128i _r2 = _mm256_cvtps_ph(_mm256_loadu_ps(k2), _MM_FROUND_TRUNC); - __m128i _r3 = _mm256_cvtps_ph(_mm256_loadu_ps(k3), _MM_FROUND_TRUNC); - __m128i _r4 = _mm256_cvtps_ph(_mm256_loadu_ps(k4), _MM_FROUND_TRUNC); - __m128i _r5 = _mm256_cvtps_ph(_mm256_loadu_ps(k5), _MM_FROUND_TRUNC); - __m128i _r6 = _mm256_cvtps_ph(_mm256_loadu_ps(k6), _MM_FROUND_TRUNC); - __m128i _r7 = _mm256_cvtps_ph(_mm256_loadu_ps(k7), _MM_FROUND_TRUNC); - __m128i _r8 = _mm256_cvtps_ph(_mm256_loadu_ps(k8), _MM_FROUND_TRUNC); - __m128i _r9 = _mm256_cvtps_ph(_mm256_loadu_ps(k9), _MM_FROUND_TRUNC); - __m128i _ra = _mm256_cvtps_ph(_mm256_loadu_ps(ka), _MM_FROUND_TRUNC); - __m128i _rb = _mm256_cvtps_ph(_mm256_loadu_ps(kb), _MM_FROUND_TRUNC); - __m128i _rc = _mm256_cvtps_ph(_mm256_loadu_ps(kc), _MM_FROUND_TRUNC); - __m128i _rd = _mm256_cvtps_ph(_mm256_loadu_ps(kd), _MM_FROUND_TRUNC); - __m128i _re = _mm256_cvtps_ph(_mm256_loadu_ps(ke), _MM_FROUND_TRUNC); - __m128i _rf = _mm256_cvtps_ph(_mm256_loadu_ps(kf), _MM_FROUND_TRUNC); - - __m256i _r08 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r0), _r8, 1); - __m256i _r19 = _mm256_inserti128_si256(_mm256_castsi128_si256(_r1), _r9, 1); - __m256i _r2a = _mm256_inserti128_si256(_mm256_castsi128_si256(_r2), _ra, 1); - __m256i _r3b = _mm256_inserti128_si256(_mm256_castsi128_si256(_r3), _rb, 1); - __m256i _r4c = _mm256_inserti128_si256(_mm256_castsi128_si256(_r4), _rc, 1); - __m256i _r5d = _mm256_inserti128_si256(_mm256_castsi128_si256(_r5), _rd, 1); - __m256i _r6e = _mm256_inserti128_si256(_mm256_castsi128_si256(_r6), _re, 1); - __m256i _r7f = _mm256_inserti128_si256(_mm256_castsi128_si256(_r7), _rf, 1); - - __m256i _tmp0 = _mm256_unpacklo_epi16(_r08, _r19); - __m256i _tmp1 = _mm256_unpackhi_epi16(_r08, _r19); - __m256i _tmp2 = _mm256_unpacklo_epi16(_r2a, _r3b); - __m256i _tmp3 = _mm256_unpackhi_epi16(_r2a, _r3b); - __m256i _tmp4 = _mm256_unpacklo_epi16(_r4c, _r5d); - __m256i _tmp5 = _mm256_unpackhi_epi16(_r4c, _r5d); - __m256i _tmp6 = _mm256_unpacklo_epi16(_r6e, _r7f); - __m256i _tmp7 = _mm256_unpackhi_epi16(_r6e, _r7f); - - __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, _tmp2); - __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2); - __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3); - __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3); - __m256i 
_tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6); - __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6); - __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7); - __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7); - - _r08 = _mm256_unpacklo_epi64(_tmpg, _tmpk); - _r19 = _mm256_unpackhi_epi64(_tmpg, _tmpk); - _r2a = _mm256_unpacklo_epi64(_tmph, _tmpl); - _r3b = _mm256_unpackhi_epi64(_tmph, _tmpl); - _r4c = _mm256_unpacklo_epi64(_tmpi, _tmpm); - _r5d = _mm256_unpackhi_epi64(_tmpi, _tmpm); - _r6e = _mm256_unpacklo_epi64(_tmpj, _tmpn); - _r7f = _mm256_unpackhi_epi64(_tmpj, _tmpn); - - _mm256_storeu_si256((__m256i*)g0, _r08); - _mm256_storeu_si256((__m256i*)(g0 + 16), _r19); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 2), _r2a); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 3), _r3b); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 4), _r4c); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 5), _r5d); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 6), _r6e); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 7), _r7f); - - k0 += 8; - k1 += 8; - k2 += 8; - k3 += 8; - k4 += 8; - k5 += 8; - k6 += 8; - k7 += 8; - k8 += 8; - k9 += 8; - ka += 8; - kb += 8; - kc += 8; - kd += 8; - ke += 8; - kf += 8; - g0 += 128; - } - for (; p < num_input; p++) - { - g0[0] = float32_to_float16(*k0++); - g0[1] = float32_to_float16(*k1++); - g0[2] = float32_to_float16(*k2++); - g0[3] = float32_to_float16(*k3++); - g0[4] = float32_to_float16(*k4++); - g0[5] = float32_to_float16(*k5++); - g0[6] = float32_to_float16(*k6++); - g0[7] = float32_to_float16(*k7++); - g0[8] = float32_to_float16(*k8++); - g0[9] = float32_to_float16(*k9++); - g0[10] = float32_to_float16(*ka++); - g0[11] = float32_to_float16(*kb++); - g0[12] = float32_to_float16(*kc++); - g0[13] = float32_to_float16(*kd++); - g0[14] = float32_to_float16(*ke++); - g0[15] = float32_to_float16(*kf++); - g0 += 16; - } - } - } -#endif // __AVX512F__ - - if (out_elempack == 8) - { - Mat weight_data_r2 = weight_data.reshape(num_input, num_output); - - weight_data_tm.create(num_input, num_output / 8, (size_t)16u, 8); - - for (int q = 0; q + 7 < num_output; q += 8) - { - unsigned short* g0 = weight_data_tm.row(q / 8); - - const float* k0 = weight_data_r2.row(q); - const float* k1 = weight_data_r2.row(q + 1); - const float* k2 = weight_data_r2.row(q + 2); - const float* k3 = weight_data_r2.row(q + 3); - const float* k4 = weight_data_r2.row(q + 4); - const float* k5 = weight_data_r2.row(q + 5); - const float* k6 = weight_data_r2.row(q + 6); - const float* k7 = weight_data_r2.row(q + 7); - - int p = 0; -#if __AVX512F__ - for (; p + 15 < num_input; p += 16) - { - // transpose 8x16 - __m256i _r0 = _mm512_cvtps_ph(_mm512_loadu_ps(k0), _MM_FROUND_TRUNC); - __m256i _r1 = _mm512_cvtps_ph(_mm512_loadu_ps(k1), _MM_FROUND_TRUNC); - __m256i _r2 = _mm512_cvtps_ph(_mm512_loadu_ps(k2), _MM_FROUND_TRUNC); - __m256i _r3 = _mm512_cvtps_ph(_mm512_loadu_ps(k3), _MM_FROUND_TRUNC); - __m256i _r4 = _mm512_cvtps_ph(_mm512_loadu_ps(k4), _MM_FROUND_TRUNC); - __m256i _r5 = _mm512_cvtps_ph(_mm512_loadu_ps(k5), _MM_FROUND_TRUNC); - __m256i _r6 = _mm512_cvtps_ph(_mm512_loadu_ps(k6), _MM_FROUND_TRUNC); - __m256i _r7 = _mm512_cvtps_ph(_mm512_loadu_ps(k7), _MM_FROUND_TRUNC); - - __m256i _tmp0 = _mm256_unpacklo_epi16(_r0, _r1); - __m256i _tmp1 = _mm256_unpackhi_epi16(_r0, _r1); - __m256i _tmp2 = _mm256_unpacklo_epi16(_r2, _r3); - __m256i _tmp3 = _mm256_unpackhi_epi16(_r2, _r3); - __m256i _tmp4 = _mm256_unpacklo_epi16(_r4, _r5); - __m256i _tmp5 = _mm256_unpackhi_epi16(_r4, _r5); - __m256i _tmp6 = _mm256_unpacklo_epi16(_r6, _r7); - __m256i 
_tmp7 = _mm256_unpackhi_epi16(_r6, _r7); - - __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, _tmp2); - __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2); - __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3); - __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3); - __m256i _tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6); - __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6); - __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7); - __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7); - - _tmp0 = _mm256_unpacklo_epi64(_tmpg, _tmpk); - _tmp1 = _mm256_unpackhi_epi64(_tmpg, _tmpk); - _tmp2 = _mm256_unpacklo_epi64(_tmph, _tmpl); - _tmp3 = _mm256_unpackhi_epi64(_tmph, _tmpl); - _tmp4 = _mm256_unpacklo_epi64(_tmpi, _tmpm); - _tmp5 = _mm256_unpackhi_epi64(_tmpi, _tmpm); - _tmp6 = _mm256_unpacklo_epi64(_tmpj, _tmpn); - _tmp7 = _mm256_unpackhi_epi64(_tmpj, _tmpn); - - _r0 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2x128_si256(_tmp2, _tmp3, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2x128_si256(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1)); - _r5 = _mm256_permute2x128_si256(_tmp2, _tmp3, _MM_SHUFFLE(0, 3, 0, 1)); - _r6 = _mm256_permute2x128_si256(_tmp4, _tmp5, _MM_SHUFFLE(0, 3, 0, 1)); - _r7 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); - - _mm256_storeu_si256((__m256i*)g0, _r0); - _mm256_storeu_si256((__m256i*)(g0 + 16), _r1); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 2), _r2); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 3), _r3); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 4), _r4); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 5), _r5); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 6), _r6); - _mm256_storeu_si256((__m256i*)(g0 + 16 * 7), _r7); - - k0 += 16; - k1 += 16; - k2 += 16; - k3 += 16; - k4 += 16; - k5 += 16; - k6 += 16; - k7 += 16; - g0 += 128; - } -#endif // __AVX512F__ - for (; p + 7 < num_input; p += 8) - { - // transpose 8x8 - __m128i _r0 = _mm256_cvtps_ph(_mm256_loadu_ps(k0), _MM_FROUND_TRUNC); - __m128i _r1 = _mm256_cvtps_ph(_mm256_loadu_ps(k1), _MM_FROUND_TRUNC); - __m128i _r2 = _mm256_cvtps_ph(_mm256_loadu_ps(k2), _MM_FROUND_TRUNC); - __m128i _r3 = _mm256_cvtps_ph(_mm256_loadu_ps(k3), _MM_FROUND_TRUNC); - __m128i _r4 = _mm256_cvtps_ph(_mm256_loadu_ps(k4), _MM_FROUND_TRUNC); - __m128i _r5 = _mm256_cvtps_ph(_mm256_loadu_ps(k5), _MM_FROUND_TRUNC); - __m128i _r6 = _mm256_cvtps_ph(_mm256_loadu_ps(k6), _MM_FROUND_TRUNC); - __m128i _r7 = _mm256_cvtps_ph(_mm256_loadu_ps(k7), _MM_FROUND_TRUNC); - - __m128i _tmp0 = _mm_unpacklo_epi16(_r0, _r1); - __m128i _tmp1 = _mm_unpackhi_epi16(_r0, _r1); - __m128i _tmp2 = _mm_unpacklo_epi16(_r2, _r3); - __m128i _tmp3 = _mm_unpackhi_epi16(_r2, _r3); - __m128i _tmp4 = _mm_unpacklo_epi16(_r4, _r5); - __m128i _tmp5 = _mm_unpackhi_epi16(_r4, _r5); - __m128i _tmp6 = _mm_unpacklo_epi16(_r6, _r7); - __m128i _tmp7 = _mm_unpackhi_epi16(_r6, _r7); - - __m128i _tmp8 = _mm_unpacklo_epi32(_tmp0, _tmp2); - __m128i _tmp9 = _mm_unpackhi_epi32(_tmp0, _tmp2); - __m128i _tmpa = _mm_unpacklo_epi32(_tmp1, _tmp3); - __m128i _tmpb = _mm_unpackhi_epi32(_tmp1, _tmp3); - __m128i _tmpc = _mm_unpacklo_epi32(_tmp4, _tmp6); - __m128i _tmpd = _mm_unpackhi_epi32(_tmp4, _tmp6); - __m128i _tmpe = _mm_unpacklo_epi32(_tmp5, _tmp7); - __m128i _tmpf = _mm_unpackhi_epi32(_tmp5, _tmp7); - - _r0 = _mm_unpacklo_epi64(_tmp8, _tmpc); - _r1 = _mm_unpackhi_epi64(_tmp8, _tmpc); - _r2 = 
_mm_unpacklo_epi64(_tmp9, _tmpd); - _r3 = _mm_unpackhi_epi64(_tmp9, _tmpd); - _r4 = _mm_unpacklo_epi64(_tmpa, _tmpe); - _r5 = _mm_unpackhi_epi64(_tmpa, _tmpe); - _r6 = _mm_unpacklo_epi64(_tmpb, _tmpf); - _r7 = _mm_unpackhi_epi64(_tmpb, _tmpf); - - _mm_storeu_si128((__m128i*)g0, _r0); - _mm_storeu_si128((__m128i*)(g0 + 8), _r1); - _mm_storeu_si128((__m128i*)(g0 + 16), _r2); - _mm_storeu_si128((__m128i*)(g0 + 24), _r3); - _mm_storeu_si128((__m128i*)(g0 + 32), _r4); - _mm_storeu_si128((__m128i*)(g0 + 40), _r5); - _mm_storeu_si128((__m128i*)(g0 + 48), _r6); - _mm_storeu_si128((__m128i*)(g0 + 56), _r7); - - k0 += 8; - k1 += 8; - k2 += 8; - k3 += 8; - k4 += 8; - k5 += 8; - k6 += 8; - k7 += 8; - g0 += 64; - } - for (; p < num_input; p++) - { - g0[0] = float32_to_float16(*k0++); - g0[1] = float32_to_float16(*k1++); - g0[2] = float32_to_float16(*k2++); - g0[3] = float32_to_float16(*k3++); - g0[4] = float32_to_float16(*k4++); - g0[5] = float32_to_float16(*k5++); - g0[6] = float32_to_float16(*k6++); - g0[7] = float32_to_float16(*k7++); - g0 += 8; - } - } - } - - if (out_elempack == 4) - { - Mat weight_data_r2 = weight_data.reshape(num_input, num_output); - - weight_data_tm.create(num_input, num_output / 4, (size_t)8u, 4); - - for (int q = 0; q + 3 < num_output; q += 4) - { - unsigned short* g0 = weight_data_tm.row(q / 4); - - const float* k0 = weight_data_r2.row(q); - const float* k1 = weight_data_r2.row(q + 1); - const float* k2 = weight_data_r2.row(q + 2); - const float* k3 = weight_data_r2.row(q + 3); - - int p = 0; - for (; p + 3 < num_input; p += 4) - { - // transpose 4x4 - __m128 _r0 = _mm_loadu_ps(k0); - __m128 _r1 = _mm_loadu_ps(k1); - __m128 _r2 = _mm_loadu_ps(k2); - __m128 _r3 = _mm_loadu_ps(k3); - _MM_TRANSPOSE4_PS(_r0, _r1, _r2, _r3); - __m256 _r01 = _mm256_insertf128_ps(_mm256_castps128_ps256(_r0), _r1, 1); - __m256 _r23 = _mm256_insertf128_ps(_mm256_castps128_ps256(_r2), _r3, 1); - __m128i _r01_fp16 = _mm256_cvtps_ph(_r01, _MM_FROUND_TRUNC); - __m128i _r23_fp16 = _mm256_cvtps_ph(_r23, _MM_FROUND_TRUNC); - _mm_storeu_si128((__m128i*)g0, _r01_fp16); - _mm_storeu_si128((__m128i*)(g0 + 8), _r23_fp16); - - k0 += 4; - k1 += 4; - k2 += 4; - k3 += 4; - g0 += 16; - } - for (; p < num_input; p++) - { - g0[0] = float32_to_float16(*k0++); - g0[1] = float32_to_float16(*k1++); - g0[2] = float32_to_float16(*k2++); - g0[3] = float32_to_float16(*k3++); - g0 += 4; - } - } - } - - if (out_elempack == 1) - { - Mat weight_data_r2 = weight_data.reshape(num_input, num_output); - ncnn::cast_float32_to_float16(weight_data_r2, weight_data_tm, opt); - } -#else // __F16C__ - (void)weight_data; - (void)weight_data_tm; - (void)num_input; - (void)num_output; - (void)opt; -#endif // __F16C__ -} diff --git a/src/layer/x86/innerproduct_gemm_fp16s.h b/src/layer/x86/innerproduct_gemm_fp.h similarity index 79% rename from src/layer/x86/innerproduct_gemm_fp16s.h rename to src/layer/x86/innerproduct_gemm_fp.h index 02b8d66c8c11..68a1a37d75f0 100644 --- a/src/layer/x86/innerproduct_gemm_fp16s.h +++ b/src/layer/x86/innerproduct_gemm_fp.h @@ -13,20 +13,23 @@ // specific language governing permissions and limitations under the License. 
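// --- editorial sketch, not part of the patch ---
// The innerproduct_transform_kernel_fp16s_sse code above packs the fp32 weight
// matrix (num_output rows of num_input floats) into fp16 interleaved by
// out_elempack, so the kernels in this header can load one packed row per group
// of outputs. A scalar reference of that layout for the pb > 1 case (the
// pb == 1 case above simply casts with cast_float32_to_float16), assuming a
// float32_to_float16() helper equivalent to the one used in the patch:
static void pack_weights_fp16_ref(const float* src, unsigned short* dst,
                                  int num_input, int num_output, int pb)
{
    // dst row q/pb stores, for every input p, the pb weights of outputs q..q+pb-1
    for (int q = 0; q + pb - 1 < num_output; q += pb)
    {
        unsigned short* g0 = dst + (q / pb) * num_input * pb;
        for (int p = 0; p < num_input; p++)
        {
            for (int j = 0; j < pb; j++)
            {
                *g0++ = float32_to_float16(src[(q + j) * num_input + p]);
            }
        }
    }
}
// The AVX-512 / AVX fast paths above produce exactly this layout through the
// 16x16, 16x8 and 8x8 transposes instead of the per-element loop.
// --- end editorial sketch ---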
#if NCNN_RUNTIME_CPU && NCNN_F16C && __AVX__ && !__F16C__ -void innerproduct_gemm_fp16s_sse_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); +void innerproduct_gemm_fp16s_sse_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt); #endif -static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) +#if NCNN_IMPL_FP16S +static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) +#else +static void innerproduct_gemm_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) +#endif { -#if NCNN_RUNTIME_CPU && NCNN_F16C && __AVX__ && !__F16C__ +#if NCNN_RUNTIME_CPU && NCNN_IMPL_FP16S && NCNN_F16C && __AVX__ && !__F16C__ if (ncnn::cpu_support_x86_f16c()) { - innerproduct_gemm_fp16s_sse_f16c(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt); + innerproduct_gemm_fp16s_sse_f16c(bottom_blob, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); return; } -#endif +#else // NCNN_RUNTIME_CPU -#if __F16C__ const int num_input = bottom_blob.w; const int elempack = bottom_blob.elempack; const int num_output = top_blob.w; @@ -35,18 +38,24 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c const float* bias_data_ptr = bias_data; int num_output_elempack = 1; +#if __SSE2__ if (opt.use_packing_layout) { #if __AVX512F__ num_output_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; -#else +#elif __AVX__ num_output_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; +#else + num_output_elempack = num_output % 4 == 0 ? 
4 : 1; #endif } +#endif // __SSE2__ #pragma omp parallel for num_threads(opt.num_threads) for (int j = 0; j < h; j++) { +#if __SSE2__ +#if __AVX__ #if __AVX512F__ if (elempack == 16 && num_output_elempack == 16) { @@ -54,7 +63,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m512 _sum0 = _mm512_setzero_ps(); @@ -99,7 +112,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m512 _vale = _mm512_set1_ps(m[14]); __m512 _valf = _mm512_set1_ps(m[15]); +#if NCNN_IMPL_FP16S __m512 _w = _mm512_cvtph_ps(_mm256_lddqu_si256((const __m256i*)kptr)); +#else + __m512 _w = _mm512_loadu_ps(kptr); +#endif _sum0 = _mm512_fmadd_ps(_val0, _w, _sum0); _sum1 = _mm512_fmadd_ps(_val1, _w, _sum1); @@ -139,7 +156,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c _sume = activation_avx512(_sume, activation_type, activation_params); _sumf = activation_avx512(_sumf, activation_type, activation_params); - transpose16_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, _sum8, _sum9, _suma, _sumb, _sumc, _sumd, _sume, _sumf); + transpose16x16_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, _sum8, _sum9, _suma, _sumb, _sumc, _sumd, _sume, _sumf); _mm512_storeu_ps(outptr, _sum0); _mm512_storeu_ps(outptr + 16, _sum1); @@ -167,7 +184,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m512 _sum = _mm512_setzero_ps(); @@ -181,7 +202,12 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m512 _val = _mm512_set1_ps(m[0]); +#if NCNN_IMPL_FP16S __m512 _w = _mm512_cvtph_ps(_mm256_lddqu_si256((const __m256i*)kptr)); +#else + __m512 _w = _mm512_loadu_ps(kptr); +#endif + _sum = _mm512_fmadd_ps(_val, _w, _sum); m += 1; @@ -201,7 +227,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m512 _sum0 = _mm512_setzero_ps(); @@ -222,8 +252,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m512 _val1 = _mm512_set1_ps(m[1]); __m512 _val2 = _mm512_set1_ps(m[2]); __m512 _val3 = _mm512_set1_ps(m[3]); - +#if NCNN_IMPL_FP16S __m512 _w = _mm512_cvtph_ps(_mm256_lddqu_si256((const __m256i*)kptr)); +#else + __m512 _w = _mm512_loadu_ps(kptr); +#endif _sum0 = _mm512_fmadd_ps(_val0, _w, _sum0); _sum1 = _mm512_fmadd_ps(_val1, _w, _sum1); @@ -239,26 +272,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c _sum2 = activation_avx512(_sum2, activation_type, activation_params); _sum3 = activation_avx512(_sum3, activation_type, activation_params); - // transpose 16x4 - __m512 
_tmp0 = _mm512_unpacklo_ps(_sum0, _sum1); - __m512 _tmp1 = _mm512_unpackhi_ps(_sum0, _sum1); - __m512 _tmp2 = _mm512_unpacklo_ps(_sum2, _sum3); - __m512 _tmp3 = _mm512_unpackhi_ps(_sum2, _sum3); - - __m512 _tmp4 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp5 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmp6 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp7 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); - - _sum0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _sum1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _sum2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _sum3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x4_ps(_sum0, _sum1, _sum2, _sum3); _mm512_storeu_ps(outptr, _sum0); _mm512_storeu_ps(outptr + 16, _sum1); @@ -274,7 +288,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m512 _sum0 = _mm512_setzero_ps(); @@ -303,8 +321,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m512 _val5 = _mm512_set1_ps(m[5]); __m512 _val6 = _mm512_set1_ps(m[6]); __m512 _val7 = _mm512_set1_ps(m[7]); - +#if NCNN_IMPL_FP16S __m512 _w = _mm512_cvtph_ps(_mm256_lddqu_si256((const __m256i*)kptr)); +#else + __m512 _w = _mm512_loadu_ps(kptr); +#endif _sum0 = _mm512_fmadd_ps(_val0, _w, _sum0); _sum1 = _mm512_fmadd_ps(_val1, _w, _sum1); @@ -328,42 +349,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c _sum6 = activation_avx512(_sum6, activation_type, activation_params); _sum7 = activation_avx512(_sum7, activation_type, activation_params); - // transpose 16x8 - __m512 _tmp0 = _mm512_unpacklo_ps(_sum0, _sum1); - __m512 _tmp1 = _mm512_unpackhi_ps(_sum0, _sum1); - __m512 _tmp2 = _mm512_unpacklo_ps(_sum2, _sum3); - __m512 _tmp3 = _mm512_unpackhi_ps(_sum2, _sum3); - __m512 _tmp4 = _mm512_unpacklo_ps(_sum4, _sum5); - __m512 _tmp5 = _mm512_unpackhi_ps(_sum4, _sum5); - __m512 _tmp6 = _mm512_unpacklo_ps(_sum6, _sum7); - __m512 _tmp7 = _mm512_unpackhi_ps(_sum6, _sum7); - - __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); - __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); - __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); - - _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp3 
= _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); - _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1)); - _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); - - _sum0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); - _sum1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); - _sum2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); - _sum3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _sum4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); - _sum5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _sum6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); - _sum7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + transpose16x8_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7); _mm512_storeu_ps(outptr, _sum0); _mm512_storeu_ps(outptr + 16, _sum1); @@ -383,10 +369,17 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = (const float*)weight_data_tm + num_input * p; +#endif const float* m = bottom_blob.row(j); __m512 _sum0 = _mm512_setzero_ps(); + __m512 _sum1 = _mm512_setzero_ps(); + __m512 _sum2 = _mm512_setzero_ps(); + __m512 _sum3 = _mm512_setzero_ps(); if (bias_data_ptr) { @@ -400,7 +393,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m512 _val1 = _mm512_loadu_ps(m + 16); __m512 _val2 = _mm512_loadu_ps(m + 32); __m512 _val3 = _mm512_loadu_ps(m + 48); - +#if NCNN_IMPL_FP16S __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); __m256 _ww = _mm256_insertf128_ps(_mm256_castps128_ps256(_w), _w, 1); __m512 _www = _mm512_insertf32x8(_mm512_castps256_ps512(_ww), _ww, 1); @@ -409,11 +402,17 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m512 _w1 = _mm512_permute_ps(_www, _MM_SHUFFLE(1, 1, 1, 1)); __m512 _w2 = _mm512_permute_ps(_www, _MM_SHUFFLE(2, 2, 2, 2)); __m512 _w3 = _mm512_permute_ps(_www, _MM_SHUFFLE(3, 3, 3, 3)); +#else + __m512 _w0 = _mm512_set1_ps(kptr[0]); + __m512 _w1 = _mm512_set1_ps(kptr[1]); + __m512 _w2 = _mm512_set1_ps(kptr[2]); + __m512 _w3 = _mm512_set1_ps(kptr[3]); +#endif _sum0 = _mm512_fmadd_ps(_val0, _w0, _sum0); - _sum0 = _mm512_fmadd_ps(_val1, _w1, _sum0); - _sum0 = _mm512_fmadd_ps(_val2, _w2, _sum0); - _sum0 = _mm512_fmadd_ps(_val3, _w3, _sum0); + _sum1 = _mm512_fmadd_ps(_val1, _w1, _sum1); + _sum2 = _mm512_fmadd_ps(_val2, _w2, _sum2); + _sum3 = _mm512_fmadd_ps(_val3, _w3, _sum3); m += 64; kptr += 4; @@ -421,13 +420,21 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m512 _val = _mm512_loadu_ps(m); +#if NCNN_IMPL_FP16S __m512 _w = _mm512_set1_ps(float16_to_float32(kptr[0])); +#else + __m512 _w = _mm512_set1_ps(kptr[0]); +#endif _sum0 = _mm512_fmadd_ps(_val, _w, _sum0); m += 16; kptr += 1; } + _sum0 = _mm512_add_ps(_sum0, _sum1); + _sum2 = _mm512_add_ps(_sum2, _sum3); + _sum0 = _mm512_add_ps(_sum0, _sum2); + _sum0 = activation_avx512(_sum0, activation_type, activation_params); _mm512_storeu_ps(outptr, _sum0); @@ -441,7 +448,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, 
Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m512 _sum0 = _mm512_setzero_ps(); @@ -461,7 +472,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m512 _val = _mm512_loadu_ps(m); - +#if NCNN_IMPL_FP16S __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); __m256 _ww = _mm256_insertf128_ps(_mm256_castps128_ps256(_w), _w, 1); __m512 _www = _mm512_insertf32x8(_mm512_castps256_ps512(_ww), _ww, 1); @@ -470,6 +481,12 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m512 _w1 = _mm512_permute_ps(_www, _MM_SHUFFLE(1, 1, 1, 1)); __m512 _w2 = _mm512_permute_ps(_www, _MM_SHUFFLE(2, 2, 2, 2)); __m512 _w3 = _mm512_permute_ps(_www, _MM_SHUFFLE(3, 3, 3, 3)); +#else + __m512 _w0 = _mm512_set1_ps(kptr[0]); + __m512 _w1 = _mm512_set1_ps(kptr[1]); + __m512 _w2 = _mm512_set1_ps(kptr[2]); + __m512 _w3 = _mm512_set1_ps(kptr[3]); +#endif _sum0 = _mm512_fmadd_ps(_val, _w0, _sum0); _sum1 = _mm512_fmadd_ps(_val, _w1, _sum1); @@ -499,7 +516,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m512 _sum0 = _mm512_setzero_ps(); @@ -527,7 +548,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m512 _val = _mm512_loadu_ps(m); - +#if NCNN_IMPL_FP16S __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)kptr)); __m512 _ww = _mm512_castps256_ps512(_w); __m512 _www0 = _mm512_shuffle_f32x4(_ww, _ww, _MM_SHUFFLE(0, 0, 0, 0)); @@ -541,6 +562,16 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m512 _w5 = _mm512_permute_ps(_www1, _MM_SHUFFLE(1, 1, 1, 1)); __m512 _w6 = _mm512_permute_ps(_www1, _MM_SHUFFLE(2, 2, 2, 2)); __m512 _w7 = _mm512_permute_ps(_www1, _MM_SHUFFLE(3, 3, 3, 3)); +#else + __m512 _w0 = _mm512_set1_ps(kptr[0]); + __m512 _w1 = _mm512_set1_ps(kptr[1]); + __m512 _w2 = _mm512_set1_ps(kptr[2]); + __m512 _w3 = _mm512_set1_ps(kptr[3]); + __m512 _w4 = _mm512_set1_ps(kptr[4]); + __m512 _w5 = _mm512_set1_ps(kptr[5]); + __m512 _w6 = _mm512_set1_ps(kptr[6]); + __m512 _w7 = _mm512_set1_ps(kptr[7]); +#endif _sum0 = _mm512_fmadd_ps(_val, _w0, _sum0); _sum1 = _mm512_fmadd_ps(_val, _w1, _sum1); @@ -575,6 +606,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c outptr += 128; } } + #endif // __AVX512F__ if (elempack == 8 && num_output_elempack == 8) @@ -583,7 +615,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m256 _sum0 = _mm256_setzero_ps(); @@ -611,8 +647,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m256 _val5 = _mm256_broadcast_ss(m 
+ 5); __m256 _val6 = _mm256_broadcast_ss(m + 6); __m256 _val7 = _mm256_broadcast_ss(m + 7); - +#if NCNN_IMPL_FP16S __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)kptr)); +#else + __m256 _w = _mm256_loadu_ps(kptr); +#endif _sum0 = _mm256_comp_fmadd_ps(_val0, _w, _sum0); _sum1 = _mm256_comp_fmadd_ps(_val1, _w, _sum1); @@ -636,7 +675,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c _sum6 = activation_avx(_sum6, activation_type, activation_params); _sum7 = activation_avx(_sum7, activation_type, activation_params); - transpose8_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7); + transpose8x8_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7); _mm256_storeu_ps(outptr, _sum0); _mm256_storeu_ps(outptr + 8, _sum1); @@ -656,14 +695,21 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); - __m256 _sum = _mm256_setzero_ps(); + __m256 _sum0 = _mm256_setzero_ps(); + __m256 _sum1 = _mm256_setzero_ps(); + __m256 _sum2 = _mm256_setzero_ps(); + __m256 _sum3 = _mm256_setzero_ps(); if (bias_data_ptr) { - _sum = _mm256_loadu_ps(bias_data_ptr + p * 8); + _sum0 = _mm256_loadu_ps(bias_data_ptr + p * 8); } int i = 0; @@ -673,33 +719,47 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m256 _val1 = _mm256_broadcast_ss(m + 1); __m256 _val2 = _mm256_broadcast_ss(m + 2); __m256 _val3 = _mm256_broadcast_ss(m + 3); - __m256 _val4 = _mm256_broadcast_ss(m + 4); - __m256 _val5 = _mm256_broadcast_ss(m + 5); - __m256 _val6 = _mm256_broadcast_ss(m + 6); - __m256 _val7 = _mm256_broadcast_ss(m + 7); - +#if NCNN_IMPL_FP16S __m256i _w01 = _mm256_lddqu_si256((const __m256i*)kptr); __m256i _w23 = _mm256_lddqu_si256((const __m256i*)(kptr + 16)); - __m256i _w45 = _mm256_lddqu_si256((const __m256i*)(kptr + 32)); - __m256i _w67 = _mm256_lddqu_si256((const __m256i*)(kptr + 48)); - __m256 _w0 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 0)); __m256 _w1 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 1)); __m256 _w2 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 0)); __m256 _w3 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 1)); +#else + __m256 _w0 = _mm256_loadu_ps(kptr); + __m256 _w1 = _mm256_loadu_ps(kptr + 8); + __m256 _w2 = _mm256_loadu_ps(kptr + 16); + __m256 _w3 = _mm256_loadu_ps(kptr + 24); +#endif + + _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); + _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_val2, _w2, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); + + __m256 _val4 = _mm256_broadcast_ss(m + 4); + __m256 _val5 = _mm256_broadcast_ss(m + 5); + __m256 _val6 = _mm256_broadcast_ss(m + 6); + __m256 _val7 = _mm256_broadcast_ss(m + 7); +#if NCNN_IMPL_FP16S + __m256i _w45 = _mm256_lddqu_si256((const __m256i*)(kptr + 32)); + __m256i _w67 = _mm256_lddqu_si256((const __m256i*)(kptr + 48)); __m256 _w4 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w45, 0)); __m256 _w5 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w45, 1)); __m256 _w6 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w67, 0)); __m256 _w7 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w67, 1)); +#else + __m256 _w4 = _mm256_loadu_ps(kptr + 32); + __m256 _w5 = _mm256_loadu_ps(kptr + 40); + __m256 _w6 = 
_mm256_loadu_ps(kptr + 48); + __m256 _w7 = _mm256_loadu_ps(kptr + 56); +#endif - _sum = _mm256_comp_fmadd_ps(_val0, _w0, _sum); - _sum = _mm256_comp_fmadd_ps(_val1, _w1, _sum); - _sum = _mm256_comp_fmadd_ps(_val2, _w2, _sum); - _sum = _mm256_comp_fmadd_ps(_val3, _w3, _sum); - _sum = _mm256_comp_fmadd_ps(_val4, _w4, _sum); - _sum = _mm256_comp_fmadd_ps(_val5, _w5, _sum); - _sum = _mm256_comp_fmadd_ps(_val6, _w6, _sum); - _sum = _mm256_comp_fmadd_ps(_val7, _w7, _sum); + _sum0 = _mm256_comp_fmadd_ps(_val4, _w4, _sum0); + _sum1 = _mm256_comp_fmadd_ps(_val5, _w5, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_val6, _w6, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_val7, _w7, _sum3); m += 8; kptr += 64; @@ -710,19 +770,24 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m256 _val1 = _mm256_broadcast_ss(m + 1); __m256 _val2 = _mm256_broadcast_ss(m + 2); __m256 _val3 = _mm256_broadcast_ss(m + 3); - +#if NCNN_IMPL_FP16S __m256i _w01 = _mm256_lddqu_si256((const __m256i*)kptr); __m256i _w23 = _mm256_lddqu_si256((const __m256i*)(kptr + 16)); - __m256 _w0 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 0)); __m256 _w1 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w01, 1)); __m256 _w2 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 0)); __m256 _w3 = _mm256_cvtph_ps(_mm256_extractf128_si256(_w23, 1)); +#else + __m256 _w0 = _mm256_loadu_ps(kptr); + __m256 _w1 = _mm256_loadu_ps(kptr + 8); + __m256 _w2 = _mm256_loadu_ps(kptr + 16); + __m256 _w3 = _mm256_loadu_ps(kptr + 24); +#endif - _sum = _mm256_comp_fmadd_ps(_val0, _w0, _sum); - _sum = _mm256_comp_fmadd_ps(_val1, _w1, _sum); - _sum = _mm256_comp_fmadd_ps(_val2, _w2, _sum); - _sum = _mm256_comp_fmadd_ps(_val3, _w3, _sum); + _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); + _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_val2, _w2, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); m += 4; kptr += 32; @@ -730,16 +795,24 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m256 _val = _mm256_set1_ps(m[0]); +#if NCNN_IMPL_FP16S __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)kptr)); - _sum = _mm256_comp_fmadd_ps(_val, _w, _sum); +#else + __m256 _w = _mm256_loadu_ps(kptr); +#endif + _sum0 = _mm256_comp_fmadd_ps(_val, _w, _sum0); m += 1; kptr += 8; } - _sum = activation_avx(_sum, activation_type, activation_params); + _sum0 = _mm256_add_ps(_sum0, _sum1); + _sum2 = _mm256_add_ps(_sum2, _sum3); + _sum0 = _mm256_add_ps(_sum0, _sum2); + + _sum0 = activation_avx(_sum0, activation_type, activation_params); - _mm256_storeu_ps(outptr, _sum); + _mm256_storeu_ps(outptr, _sum0); outptr += 8; } } @@ -750,7 +823,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m256 _sum0 = _mm256_setzero_ps(); @@ -771,8 +848,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m256 _val1 = _mm256_broadcast_ss(m + 1); __m256 _val2 = _mm256_broadcast_ss(m + 2); __m256 _val3 = _mm256_broadcast_ss(m + 3); - +#if NCNN_IMPL_FP16S __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)kptr)); +#else + __m256 _w = _mm256_loadu_ps(kptr); +#endif _sum0 = _mm256_comp_fmadd_ps(_val0, _w, _sum0); 
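// --- editorial note, not part of the patch ---
// The rewritten accumulation in the hunks above (both the AVX-512 and AVX
// paths) replaces a single _sum with _sum0.._sum3 and folds them only after the
// loop, so each fmadd starts an independent dependency chain and the FMA
// latency of consecutive iterations can overlap. A minimal sketch of the same
// idea with two chains, reusing the _mm256_comp_fmadd_ps helper already used in
// this file (illustrative only, hypothetical function name):
static inline __m256 dot8_two_chains_ref(const float* m, const float* w8, int n)
{
    __m256 s0 = _mm256_setzero_ps();
    __m256 s1 = _mm256_setzero_ps();
    int i = 0;
    for (; i + 1 < n; i += 2)
    {
        // w8 holds 8 packed output weights per input element, as in the code above
        s0 = _mm256_comp_fmadd_ps(_mm256_set1_ps(m[i]), _mm256_loadu_ps(w8 + i * 8), s0);
        s1 = _mm256_comp_fmadd_ps(_mm256_set1_ps(m[i + 1]), _mm256_loadu_ps(w8 + (i + 1) * 8), s1);
    }
    for (; i < n; i++)
    {
        s0 = _mm256_comp_fmadd_ps(_mm256_set1_ps(m[i]), _mm256_loadu_ps(w8 + i * 8), s0);
    }
    return _mm256_add_ps(s0, s1); // fold the independent partial sums once
}
// --- end editorial note ---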
_sum1 = _mm256_comp_fmadd_ps(_val1, _w, _sum1); @@ -788,19 +868,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c _sum2 = activation_avx(_sum2, activation_type, activation_params); _sum3 = activation_avx(_sum3, activation_type, activation_params); - // transpose 8x4 - __m256 _tmp0 = _mm256_unpacklo_ps(_sum0, _sum1); - __m256 _tmp1 = _mm256_unpackhi_ps(_sum0, _sum1); - __m256 _tmp2 = _mm256_unpacklo_ps(_sum2, _sum3); - __m256 _tmp3 = _mm256_unpackhi_ps(_sum2, _sum3); - __m256 _tmp4 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp5 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); - __m256 _tmp6 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); - __m256 _tmp7 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); - _sum0 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); - _sum1 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); - _sum2 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 3, 0, 1)); - _sum3 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); + transpose8x4_ps(_sum0, _sum1, _sum2, _sum3); _mm256_storeu_ps(outptr, _sum0); _mm256_storeu_ps(outptr + 8, _sum1); @@ -816,7 +884,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = (const float*)weight_data_tm + num_input * p; +#endif const float* m = bottom_blob.row(j); __m256 _sum0 = _mm256_setzero_ps(); @@ -836,7 +908,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m256 _val1 = _mm256_loadu_ps(m + 8); __m256 _val2 = _mm256_loadu_ps(m + 16); __m256 _val3 = _mm256_loadu_ps(m + 24); - +#if NCNN_IMPL_FP16S __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); __m256 _ww = _mm256_insertf128_ps(_mm256_castps128_ps256(_w), _w, 1); @@ -844,6 +916,12 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m256 _w1 = _mm256_permute_ps(_ww, _MM_SHUFFLE(1, 1, 1, 1)); __m256 _w2 = _mm256_permute_ps(_ww, _MM_SHUFFLE(2, 2, 2, 2)); __m256 _w3 = _mm256_permute_ps(_ww, _MM_SHUFFLE(3, 3, 3, 3)); +#else + __m256 _w0 = _mm256_set1_ps(kptr[0]); + __m256 _w1 = _mm256_set1_ps(kptr[1]); + __m256 _w2 = _mm256_set1_ps(kptr[2]); + __m256 _w3 = _mm256_set1_ps(kptr[3]); +#endif _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); @@ -856,8 +934,12 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m256 _val = _mm256_loadu_ps(m); - __m256 _k = _mm256_set1_ps(float16_to_float32(kptr[0])); - _sum0 = _mm256_comp_fmadd_ps(_val, _k, _sum0); +#if NCNN_IMPL_FP16S + __m256 _w = _mm256_set1_ps(float16_to_float32(kptr[0])); +#else + __m256 _w = _mm256_set1_ps(kptr[0]); +#endif + _sum0 = _mm256_comp_fmadd_ps(_val, _w, _sum0); m += 8; kptr += 1; @@ -880,7 +962,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m256 _sum0 = _mm256_setzero_ps(); @@ -900,7 +986,7 @@ static void innerproduct_gemm_fp16s_sse(const 
Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m256 _val = _mm256_loadu_ps(m); - +#if NCNN_IMPL_FP16S __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); __m256 _ww = _mm256_insertf128_ps(_mm256_castps128_ps256(_w), _w, 1); @@ -908,6 +994,12 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m256 _w1 = _mm256_permute_ps(_ww, _MM_SHUFFLE(1, 1, 1, 1)); __m256 _w2 = _mm256_permute_ps(_ww, _MM_SHUFFLE(2, 2, 2, 2)); __m256 _w3 = _mm256_permute_ps(_ww, _MM_SHUFFLE(3, 3, 3, 3)); +#else + __m256 _w0 = _mm256_set1_ps(kptr[0]); + __m256 _w1 = _mm256_set1_ps(kptr[1]); + __m256 _w2 = _mm256_set1_ps(kptr[2]); + __m256 _w3 = _mm256_set1_ps(kptr[3]); +#endif _sum0 = _mm256_comp_fmadd_ps(_val, _w0, _sum0); _sum1 = _mm256_comp_fmadd_ps(_val, _w1, _sum1); @@ -930,6 +1022,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c outptr += 32; } } +#endif // __AVX__ if (elempack == 4 && num_output_elempack == 4) { @@ -937,7 +1030,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m128 _sum0 = _mm_setzero_ps(); @@ -958,9 +1055,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m128 _val1 = _mm_set1_ps(m[1]); __m128 _val2 = _mm_set1_ps(m[2]); __m128 _val3 = _mm_set1_ps(m[3]); - +#if NCNN_IMPL_FP16S __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); - +#else + __m128 _w = _mm_loadu_ps(kptr); +#endif _sum0 = _mm_comp_fmadd_ps(_val0, _w, _sum0); _sum1 = _mm_comp_fmadd_ps(_val1, _w, _sum1); _sum2 = _mm_comp_fmadd_ps(_val2, _w, _sum2); @@ -991,7 +1090,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output / num_output_elempack; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = weight_data_tm.row(p); +#endif const float* m = bottom_blob.row(j); __m128 _sum = _mm_setzero_ps(); @@ -1005,7 +1108,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m128 _val = _mm_set1_ps(m[0]); +#if NCNN_IMPL_FP16S __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); +#else + __m128 _w = _mm_loadu_ps(kptr); +#endif _sum = _mm_comp_fmadd_ps(_val, _w, _sum); m += 1; @@ -1025,7 +1132,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = (const float*)weight_data_tm + num_input * p; +#endif const float* m = bottom_blob.row(j); __m128 _sum0 = _mm_setzero_ps(); @@ -1045,13 +1156,19 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c __m128 _val1 = _mm_loadu_ps(m + 4); __m128 _val2 = _mm_loadu_ps(m + 8); __m128 _val3 = _mm_loadu_ps(m + 12); - +#if NCNN_IMPL_FP16S __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); __m128 _w0 = _mm_permute_ps(_w, _MM_SHUFFLE(0, 0, 0, 0)); __m128 _w1 = _mm_permute_ps(_w, _MM_SHUFFLE(1, 1, 1, 1)); __m128 _w2 = 
_mm_permute_ps(_w, _MM_SHUFFLE(2, 2, 2, 2)); __m128 _w3 = _mm_permute_ps(_w, _MM_SHUFFLE(3, 3, 3, 3)); +#else + __m128 _w0 = _mm_set1_ps(kptr[0]); + __m128 _w1 = _mm_set1_ps(kptr[1]); + __m128 _w2 = _mm_set1_ps(kptr[2]); + __m128 _w3 = _mm_set1_ps(kptr[3]); +#endif _sum0 = _mm_comp_fmadd_ps(_val0, _w0, _sum0); _sum1 = _mm_comp_fmadd_ps(_val1, _w1, _sum1); @@ -1064,8 +1181,12 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (; i < num_input; i++) { __m128 _val = _mm_loadu_ps(m); - __m128 _k = _mm_set1_ps(float16_to_float32(kptr[0])); - _sum0 = _mm_comp_fmadd_ps(_val, _k, _sum0); +#if NCNN_IMPL_FP16S + __m128 _w = _mm_set1_ps(float16_to_float32(kptr[0])); +#else + __m128 _w = _mm_set1_ps(kptr[0]); +#endif + _sum0 = _mm_comp_fmadd_ps(_val, _w, _sum0); m += 4; kptr += 1; @@ -1081,6 +1202,7 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c outptr += 4; } } +#endif // __SSE2__ if (elempack == 1 && num_output_elempack == 1) { @@ -1088,7 +1210,11 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c for (int p = 0; p < num_output; p++) { - const unsigned short* kptr = weight_data_fp16.row(p); +#if NCNN_IMPL_FP16S + const unsigned short* kptr = weight_data_tm.row(p); +#else + const float* kptr = (const float*)weight_data_tm + num_input * p; +#endif const float* m = bottom_blob.row(j); float sum = 0.f; @@ -1099,33 +1225,54 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c } int i = 0; +#if __SSE2__ +#if __AVX__ __m256 _sum = _mm256_setzero_ps(); for (; i + 7 < num_input; i += 8) { __m256 _m = _mm256_loadu_ps(m); +#if NCNN_IMPL_FP16S __m256 _w = _mm256_cvtph_ps(_mm_lddqu_si128((const __m128i*)kptr)); +#else + __m256 _w = _mm256_loadu_ps(kptr); +#endif _sum = _mm256_comp_fmadd_ps(_m, _w, _sum); m += 8; kptr += 8; } +#endif // __AVX__ __m128 _suml = _mm_setzero_ps(); for (; i + 3 < num_input; i += 4) { __m128 _val = _mm_loadu_ps(m); +#if NCNN_IMPL_FP16S __m128 _w = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i*)kptr)); +#else + __m128 _w = _mm_loadu_ps(kptr); +#endif _suml = _mm_comp_fmadd_ps(_val, _w, _suml); m += 4; kptr += 4; } +#endif // __SSE2__ for (; i < num_input; i++) { +#if NCNN_IMPL_FP16S sum += *m++ * float16_to_float32(*kptr++); +#else + sum += *m++ * *kptr++; +#endif } - sum += _mm256_reduce_add_ps(_sum); +#if __SSE2__ +#if __AVX__ + _suml = _mm_add_ps(_suml, _mm256_extractf128_ps(_sum, 1)); + _suml = _mm_add_ps(_suml, _mm256_castps256_ps128(_sum)); +#endif // __AVX__ sum += _mm_reduce_add_ps(_suml); +#endif // __SSE2__ sum = activation_ss(sum, activation_type, activation_params); @@ -1134,13 +1281,5 @@ static void innerproduct_gemm_fp16s_sse(const Mat& bottom_blob, Mat& top_blob, c } } } -#else // __F16C__ - (void)bottom_blob; - (void)top_blob; - (void)weight_data_fp16; - (void)bias_data; - (void)activation_type; - (void)activation_params; - (void)opt; -#endif // __F16C__ +#endif // NCNN_RUNTIME_CPU } diff --git a/src/layer/x86/innerproduct_x86.cpp b/src/layer/x86/innerproduct_x86.cpp index 30f046d7678e..c9139bc2a66c 100644 --- a/src/layer/x86/innerproduct_x86.cpp +++ b/src/layer/x86/innerproduct_x86.cpp @@ -30,9 +30,14 @@ namespace ncnn { -#if NCNN_F16C -#include "innerproduct_fp16s.h" -#include "innerproduct_gemm_fp16s.h" +#include "innerproduct_fp.h" +#include "innerproduct_gemm_fp.h" + +#if NCNN_F16C && __AVX__ +#define NCNN_IMPL_FP16S 1 +#include "innerproduct_fp.h" +#include "innerproduct_gemm_fp.h" +#undef NCNN_IMPL_FP16S #endif 
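// --- editorial note, not part of the patch ---
// The hunk above is the "include the kernel header twice" pattern: the renamed
// innerproduct_fp.h / innerproduct_gemm_fp.h compile to the plain fp32 kernels
// (e.g. innerproduct_gemm_sse) when NCNN_IMPL_FP16S is undefined, and to the
// fp16-storage kernels (e.g. innerproduct_gemm_fp16s_sse) when it is defined,
// so a single header maintains both code paths. This relies on the header being
// includable more than once. The same mechanism in isolation, with hypothetical
// names:
//
//   // kernel_impl.h -- deliberately includable twice
//   #if IMPL_FP16S
//   static void dot_fp16s(const unsigned short* w, const float* x, int n, float* out);
//   #else
//   static void dot_fp32(const float* w, const float* x, int n, float* out);
//   #endif
//
//   // kernel.cpp
//   #include "kernel_impl.h"   // emits the fp32 variant
//   #define IMPL_FP16S 1
//   #include "kernel_impl.h"   // emits the fp16 variant
//   #undef IMPL_FP16S
// --- end editorial note ---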
InnerProduct_x86::InnerProduct_x86() @@ -64,7 +69,7 @@ int InnerProduct_x86::create_pipeline(const Option& opt) } #endif -#if NCNN_F16C +#if NCNN_F16C && __AVX__ if (cpu_support_x86_f16c() && opt.use_fp16_storage) { return create_pipeline_fp16s(opt); @@ -73,1306 +78,58 @@ int InnerProduct_x86::create_pipeline(const Option& opt) const int num_input = weight_data_size / num_output; - int out_elempack = 1; - -#if __SSE2__ - if (opt.use_packing_layout) - { -#if __AVX512F__ - out_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; -#elif __AVX__ - out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; -#else - out_elempack = num_output % 4 == 0 ? 4 : 1; -#endif - } -#endif // __SSE2__ - - if (out_elempack != 1) - { - // src = inch-outch - // dst = pb-inch-outch/pb - { - Mat weight_data_r2 = weight_data.reshape(num_input, num_output); - - weight_data_tm.create(num_input, num_output / out_elempack, (size_t)4u * out_elempack, out_elempack); - - for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) - { - float* g0 = weight_data_tm.row(q / out_elempack); - - for (int p = 0; p < num_input; p++) - { - for (int j = 0; j < out_elempack; j++) - { - *g0++ = weight_data_r2.row(q + j)[p]; - } - } - } - } - } - else - { - weight_data_tm = weight_data; - } - - if (opt.lightmode) - { - weight_data.release(); - } - - return 0; -} - -int InnerProduct_x86::destroy_pipeline(const Option& opt) -{ - if (flatten) - { - flatten->destroy_pipeline(opt); - delete flatten; - flatten = 0; - } - - return 0; -} - -int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const -{ -#if NCNN_INT8 - if (opt.use_int8_inference && int8_scale_term) - { - return forward_int8_x86(bottom_blob, top_blob, opt); - } -#endif - -#if NCNN_F16C - if (cpu_support_x86_f16c() && opt.use_fp16_storage) - { - return forward_fp16s(bottom_blob, top_blob, opt); - } -#endif - - const int num_input = weight_data_size / num_output; - - if (bottom_blob.dims == 2 && bottom_blob.w == num_input && bottom_blob.h * bottom_blob.elempack > 1) - { - // gemm - int h = bottom_blob.h; - size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); - if (top_blob.empty()) - return -100; - - int num_output_elempack = 1; -#if __SSE2__ - if (opt.use_packing_layout) - { -#if __AVX512F__ - num_output_elempack = num_output % 16 == 0 ? 16 : num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; -#elif __AVX__ - num_output_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; -#else - num_output_elempack = num_output % 4 == 0 ? 
4 : 1; -#endif - } -#endif // __SSE2__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int j = 0; j < h; j++) - { -#if __SSE2__ -#if __AVX__ -#if __AVX512F__ - if (elempack == 16 && num_output_elempack == 16) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m512 _sum0 = _mm512_set1_ps(0.f); - __m512 _sum1 = _mm512_set1_ps(0.f); - __m512 _sum2 = _mm512_set1_ps(0.f); - __m512 _sum3 = _mm512_set1_ps(0.f); - __m512 _sum4 = _mm512_set1_ps(0.f); - __m512 _sum5 = _mm512_set1_ps(0.f); - __m512 _sum6 = _mm512_set1_ps(0.f); - __m512 _sum7 = _mm512_set1_ps(0.f); - __m512 _sum8 = _mm512_set1_ps(0.f); - __m512 _sum9 = _mm512_set1_ps(0.f); - __m512 _suma = _mm512_set1_ps(0.f); - __m512 _sumb = _mm512_set1_ps(0.f); - __m512 _sumc = _mm512_set1_ps(0.f); - __m512 _sumd = _mm512_set1_ps(0.f); - __m512 _sume = _mm512_set1_ps(0.f); - __m512 _sumf = _mm512_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm512_set1_ps(bias_data[p * 16 + 0]); - _sum1 = _mm512_set1_ps(bias_data[p * 16 + 1]); - _sum2 = _mm512_set1_ps(bias_data[p * 16 + 2]); - _sum3 = _mm512_set1_ps(bias_data[p * 16 + 3]); - _sum4 = _mm512_set1_ps(bias_data[p * 16 + 4]); - _sum5 = _mm512_set1_ps(bias_data[p * 16 + 5]); - _sum6 = _mm512_set1_ps(bias_data[p * 16 + 6]); - _sum7 = _mm512_set1_ps(bias_data[p * 16 + 7]); - _sum8 = _mm512_set1_ps(bias_data[p * 16 + 8]); - _sum9 = _mm512_set1_ps(bias_data[p * 16 + 9]); - _suma = _mm512_set1_ps(bias_data[p * 16 + 10]); - _sumb = _mm512_set1_ps(bias_data[p * 16 + 11]); - _sumc = _mm512_set1_ps(bias_data[p * 16 + 12]); - _sumd = _mm512_set1_ps(bias_data[p * 16 + 13]); - _sume = _mm512_set1_ps(bias_data[p * 16 + 14]); - _sumf = _mm512_set1_ps(bias_data[p * 16 + 15]); - } - - for (int i = 0; i < num_input; i++) - { - __m512 _val = _mm512_loadu_ps(m); - _sum0 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[0]), _sum0); - _sum1 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[1]), _sum1); - _sum2 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[2]), _sum2); - _sum3 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[3]), _sum3); - _sum4 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[4]), _sum4); - _sum5 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[5]), _sum5); - _sum6 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[6]), _sum6); - _sum7 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[7]), _sum7); - _sum8 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[8]), _sum8); - _sum9 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[9]), _sum9); - _suma = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[10]), _suma); - _sumb = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[11]), _sumb); - _sumc = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[12]), _sumc); - _sumd = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[13]), _sumd); - _sume = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[14]), _sume); - _sumf = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[15]), _sumf); - - m += 16; - kptr += 16; - } - - _sum0 = activation_avx512(_sum0, activation_type, activation_params); - _sum1 = activation_avx512(_sum1, activation_type, activation_params); - _sum2 = activation_avx512(_sum2, activation_type, activation_params); - _sum3 = activation_avx512(_sum3, activation_type, activation_params); - _sum4 = activation_avx512(_sum4, activation_type, activation_params); - _sum5 = activation_avx512(_sum5, activation_type, activation_params); - _sum6 = activation_avx512(_sum6, activation_type, activation_params); - _sum7 = activation_avx512(_sum7, 
activation_type, activation_params); - _sum8 = activation_avx512(_sum8, activation_type, activation_params); - _sum9 = activation_avx512(_sum9, activation_type, activation_params); - _suma = activation_avx512(_suma, activation_type, activation_params); - _sumb = activation_avx512(_sumb, activation_type, activation_params); - _sumc = activation_avx512(_sumc, activation_type, activation_params); - _sumd = activation_avx512(_sumd, activation_type, activation_params); - _sume = activation_avx512(_sume, activation_type, activation_params); - _sumf = activation_avx512(_sumf, activation_type, activation_params); - - _mm512_storeu_ps(outptr, _sum0); - _mm512_storeu_ps(outptr + 16, _sum1); - _mm512_storeu_ps(outptr + 16 * 2, _sum2); - _mm512_storeu_ps(outptr + 16 * 3, _sum3); - _mm512_storeu_ps(outptr + 16 * 4, _sum4); - _mm512_storeu_ps(outptr + 16 * 5, _sum5); - _mm512_storeu_ps(outptr + 16 * 6, _sum6); - _mm512_storeu_ps(outptr + 16 * 7, _sum7); - _mm512_storeu_ps(outptr + 16 * 8, _sum8); - _mm512_storeu_ps(outptr + 16 * 9, _sum9); - _mm512_storeu_ps(outptr + 16 * 10, _suma); - _mm512_storeu_ps(outptr + 16 * 11, _sumb); - _mm512_storeu_ps(outptr + 16 * 12, _sumc); - _mm512_storeu_ps(outptr + 16 * 13, _sumd); - _mm512_storeu_ps(outptr + 16 * 14, _sume); - _mm512_storeu_ps(outptr + 16 * 15, _sumf); - outptr += 256; - } - } - - if (elempack == 1 && num_output_elempack == 16) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m512 _sum = _mm512_set1_ps(0.f); - - if (bias_term) - { - _sum = _mm512_loadu_ps((const float*)bias_data + p * 16); - } - - int i = 0; - for (; i < num_input; i++) - { - __m512 _val = _mm512_set1_ps(m[0]); - __m512 _w = _mm512_loadu_ps(kptr); - _sum = _mm512_fmadd_ps(_val, _w, _sum); - - m += 1; - kptr += 16; - } - - _sum = activation_avx512(_sum, activation_type, activation_params); - - _mm512_storeu_ps(outptr, _sum); - outptr += 16; - } - } - - if (elempack == 4 && num_output_elempack == 16) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m128 _sum0 = _mm_set1_ps(0.f); - __m128 _sum1 = _mm_set1_ps(0.f); - __m128 _sum2 = _mm_set1_ps(0.f); - __m128 _sum3 = _mm_set1_ps(0.f); - __m128 _sum4 = _mm_set1_ps(0.f); - __m128 _sum5 = _mm_set1_ps(0.f); - __m128 _sum6 = _mm_set1_ps(0.f); - __m128 _sum7 = _mm_set1_ps(0.f); - __m128 _sum8 = _mm_set1_ps(0.f); - __m128 _sum9 = _mm_set1_ps(0.f); - __m128 _suma = _mm_set1_ps(0.f); - __m128 _sumb = _mm_set1_ps(0.f); - __m128 _sumc = _mm_set1_ps(0.f); - __m128 _sumd = _mm_set1_ps(0.f); - __m128 _sume = _mm_set1_ps(0.f); - __m128 _sumf = _mm_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm_set1_ps(bias_data[p * 16 + 0]); - _sum1 = _mm_set1_ps(bias_data[p * 16 + 1]); - _sum2 = _mm_set1_ps(bias_data[p * 16 + 2]); - _sum3 = _mm_set1_ps(bias_data[p * 16 + 3]); - _sum4 = _mm_set1_ps(bias_data[p * 16 + 4]); - _sum5 = _mm_set1_ps(bias_data[p * 16 + 5]); - _sum6 = _mm_set1_ps(bias_data[p * 16 + 6]); - _sum7 = _mm_set1_ps(bias_data[p * 16 + 7]); - _sum8 = _mm_set1_ps(bias_data[p * 16 + 8]); - _sum9 = _mm_set1_ps(bias_data[p * 16 + 9]); - _suma = _mm_set1_ps(bias_data[p * 16 + 10]); - _sumb = _mm_set1_ps(bias_data[p * 16 + 11]); - _sumc = _mm_set1_ps(bias_data[p * 16 + 12]); - _sumd = _mm_set1_ps(bias_data[p * 16 + 13]); - _sume = _mm_set1_ps(bias_data[p * 16 + 14]); 
- _sumf = _mm_set1_ps(bias_data[p * 16 + 15]); - } - - int i = 0; - for (; i < num_input; i++) - { - __m128 _val = _mm_loadu_ps(m); - _sum0 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[0]), _sum0); - _sum1 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[1]), _sum1); - _sum2 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[2]), _sum2); - _sum3 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[3]), _sum3); - _sum4 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[4]), _sum4); - _sum5 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[5]), _sum5); - _sum6 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[6]), _sum6); - _sum7 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[7]), _sum7); - _sum8 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[8]), _sum8); - _sum9 = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[9]), _sum9); - _suma = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[10]), _suma); - _sumb = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[11]), _sumb); - _sumc = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[12]), _sumc); - _sumd = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[13]), _sumd); - _sume = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[14]), _sume); - _sumf = _mm_fmadd_ps(_val, _mm_set1_ps(kptr[15]), _sumf); - - m += 4; - kptr += 16; - } - - _sum0 = activation_sse(_sum0, activation_type, activation_params); - _sum1 = activation_sse(_sum1, activation_type, activation_params); - _sum2 = activation_sse(_sum2, activation_type, activation_params); - _sum3 = activation_sse(_sum3, activation_type, activation_params); - _sum4 = activation_sse(_sum4, activation_type, activation_params); - _sum5 = activation_sse(_sum5, activation_type, activation_params); - _sum6 = activation_sse(_sum6, activation_type, activation_params); - _sum7 = activation_sse(_sum7, activation_type, activation_params); - _sum8 = activation_sse(_sum8, activation_type, activation_params); - _sum9 = activation_sse(_sum9, activation_type, activation_params); - _suma = activation_sse(_suma, activation_type, activation_params); - _sumb = activation_sse(_sumb, activation_type, activation_params); - _sumc = activation_sse(_sumc, activation_type, activation_params); - _sumd = activation_sse(_sumd, activation_type, activation_params); - _sume = activation_sse(_sume, activation_type, activation_params); - _sumf = activation_sse(_sumf, activation_type, activation_params); - - _mm_storeu_ps(outptr, _sum0); - _mm_storeu_ps(outptr + 4, _sum1); - _mm_storeu_ps(outptr + 4 * 2, _sum2); - _mm_storeu_ps(outptr + 4 * 3, _sum3); - _mm_storeu_ps(outptr + 4 * 4, _sum4); - _mm_storeu_ps(outptr + 4 * 5, _sum5); - _mm_storeu_ps(outptr + 4 * 6, _sum6); - _mm_storeu_ps(outptr + 4 * 7, _sum7); - _mm_storeu_ps(outptr + 4 * 8, _sum8); - _mm_storeu_ps(outptr + 4 * 9, _sum9); - _mm_storeu_ps(outptr + 4 * 10, _suma); - _mm_storeu_ps(outptr + 4 * 11, _sumb); - _mm_storeu_ps(outptr + 4 * 12, _sumc); - _mm_storeu_ps(outptr + 4 * 13, _sumd); - _mm_storeu_ps(outptr + 4 * 14, _sume); - _mm_storeu_ps(outptr + 4 * 15, _sumf); - outptr += 64; - } - } - - if (elempack == 8 && num_output_elempack == 16) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m256 _sum0 = _mm256_set1_ps(0.f); - __m256 _sum1 = _mm256_set1_ps(0.f); - __m256 _sum2 = _mm256_set1_ps(0.f); - __m256 _sum3 = _mm256_set1_ps(0.f); - __m256 _sum4 = _mm256_set1_ps(0.f); - __m256 _sum5 = _mm256_set1_ps(0.f); - __m256 _sum6 = _mm256_set1_ps(0.f); - __m256 _sum7 = _mm256_set1_ps(0.f); - __m256 _sum8 = _mm256_set1_ps(0.f); - __m256 _sum9 = _mm256_set1_ps(0.f); - __m256 _suma = _mm256_set1_ps(0.f); - __m256 _sumb 
= _mm256_set1_ps(0.f); - __m256 _sumc = _mm256_set1_ps(0.f); - __m256 _sumd = _mm256_set1_ps(0.f); - __m256 _sume = _mm256_set1_ps(0.f); - __m256 _sumf = _mm256_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm256_set1_ps(bias_data[p * 16 + 0]); - _sum1 = _mm256_set1_ps(bias_data[p * 16 + 1]); - _sum2 = _mm256_set1_ps(bias_data[p * 16 + 2]); - _sum3 = _mm256_set1_ps(bias_data[p * 16 + 3]); - _sum4 = _mm256_set1_ps(bias_data[p * 16 + 4]); - _sum5 = _mm256_set1_ps(bias_data[p * 16 + 5]); - _sum6 = _mm256_set1_ps(bias_data[p * 16 + 6]); - _sum7 = _mm256_set1_ps(bias_data[p * 16 + 7]); - _sum8 = _mm256_set1_ps(bias_data[p * 16 + 8]); - _sum9 = _mm256_set1_ps(bias_data[p * 16 + 9]); - _suma = _mm256_set1_ps(bias_data[p * 16 + 10]); - _sumb = _mm256_set1_ps(bias_data[p * 16 + 11]); - _sumc = _mm256_set1_ps(bias_data[p * 16 + 12]); - _sumd = _mm256_set1_ps(bias_data[p * 16 + 13]); - _sume = _mm256_set1_ps(bias_data[p * 16 + 14]); - _sumf = _mm256_set1_ps(bias_data[p * 16 + 15]); - } - - int i = 0; - for (; i < num_input; i++) - { - __m256 _val = _mm256_loadu_ps(m); - _sum0 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[0]), _sum0); - _sum1 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[1]), _sum1); - _sum2 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[2]), _sum2); - _sum3 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[3]), _sum3); - _sum4 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[4]), _sum4); - _sum5 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[5]), _sum5); - _sum6 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[6]), _sum6); - _sum7 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[7]), _sum7); - _sum8 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[8]), _sum8); - _sum9 = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[9]), _sum9); - _suma = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[10]), _suma); - _sumb = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[11]), _sumb); - _sumc = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[12]), _sumc); - _sumd = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[13]), _sumd); - _sume = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[14]), _sume); - _sumf = _mm256_fmadd_ps(_val, _mm256_set1_ps(kptr[15]), _sumf); - - m += 8; - kptr += 16; - } - - _sum0 = activation_avx(_sum0, activation_type, activation_params); - _sum1 = activation_avx(_sum1, activation_type, activation_params); - _sum2 = activation_avx(_sum2, activation_type, activation_params); - _sum3 = activation_avx(_sum3, activation_type, activation_params); - _sum4 = activation_avx(_sum4, activation_type, activation_params); - _sum5 = activation_avx(_sum5, activation_type, activation_params); - _sum6 = activation_avx(_sum6, activation_type, activation_params); - _sum7 = activation_avx(_sum7, activation_type, activation_params); - _sum8 = activation_avx(_sum8, activation_type, activation_params); - _sum9 = activation_avx(_sum9, activation_type, activation_params); - _suma = activation_avx(_suma, activation_type, activation_params); - _sumb = activation_avx(_sumb, activation_type, activation_params); - _sumc = activation_avx(_sumc, activation_type, activation_params); - _sumd = activation_avx(_sumd, activation_type, activation_params); - _sume = activation_avx(_sume, activation_type, activation_params); - _sumf = activation_avx(_sumf, activation_type, activation_params); - - _mm256_storeu_ps(outptr, _sum0); - _mm256_storeu_ps(outptr + 8, _sum1); - _mm256_storeu_ps(outptr + 8 * 2, _sum2); - _mm256_storeu_ps(outptr + 8 * 3, _sum3); - _mm256_storeu_ps(outptr + 8 * 4, _sum4); - _mm256_storeu_ps(outptr + 8 * 5, _sum5); - _mm256_storeu_ps(outptr + 8 * 6, _sum6); - 
_mm256_storeu_ps(outptr + 8 * 7, _sum7); - _mm256_storeu_ps(outptr + 8 * 8, _sum8); - _mm256_storeu_ps(outptr + 8 * 9, _sum9); - _mm256_storeu_ps(outptr + 8 * 10, _suma); - _mm256_storeu_ps(outptr + 8 * 11, _sumb); - _mm256_storeu_ps(outptr + 8 * 12, _sumc); - _mm256_storeu_ps(outptr + 8 * 13, _sumd); - _mm256_storeu_ps(outptr + 8 * 14, _sume); - _mm256_storeu_ps(outptr + 8 * 15, _sumf); - outptr += 128; - } - } - - if (elempack == 16 && num_output_elempack == 1) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output; p++) - { - const float* kptr = (const float*)weight_data_tm + num_input * p; - const float* m = bottom_blob.row(j); - - __m512 _sum0 = _mm512_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm512_set1_ps(bias_data[p]); - } - - int i = 0; - for (; i < num_input; i++) - { - __m512 _val = _mm512_loadu_ps(m); - __m512 _k = _mm512_set1_ps(kptr[0]); - _sum0 = _mm512_fmadd_ps(_val, _k, _sum0); - - m += 16; - kptr += 1; - } - - _sum0 = activation_avx512(_sum0, activation_type, activation_params); - - _mm512_storeu_ps(outptr, _sum0); - outptr += 16; - } - } - - if (elempack == 16 && num_output_elempack == 4) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m512 _sum0 = _mm512_set1_ps(0.f); - __m512 _sum1 = _mm512_set1_ps(0.f); - __m512 _sum2 = _mm512_set1_ps(0.f); - __m512 _sum3 = _mm512_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm512_set1_ps(bias_data[p * 4 + 0]); - _sum1 = _mm512_set1_ps(bias_data[p * 4 + 1]); - _sum2 = _mm512_set1_ps(bias_data[p * 4 + 2]); - _sum3 = _mm512_set1_ps(bias_data[p * 4 + 3]); - } - - int i = 0; - for (; i < num_input; i++) - { - __m512 _val = _mm512_loadu_ps(m); - _sum0 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[0]), _sum0); - _sum1 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[1]), _sum1); - _sum2 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[2]), _sum2); - _sum3 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[3]), _sum3); - - m += 16; - kptr += 4; - } - - _sum0 = activation_avx512(_sum0, activation_type, activation_params); - _sum1 = activation_avx512(_sum1, activation_type, activation_params); - _sum2 = activation_avx512(_sum2, activation_type, activation_params); - _sum3 = activation_avx512(_sum3, activation_type, activation_params); - - _mm512_storeu_ps(outptr, _sum0); - _mm512_storeu_ps(outptr + 16, _sum1); - _mm512_storeu_ps(outptr + 32, _sum2); - _mm512_storeu_ps(outptr + 48, _sum3); - outptr += 64; - } - } - - if (elempack == 16 && num_output_elempack == 8) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m512 _sum0 = _mm512_set1_ps(0.f); - __m512 _sum1 = _mm512_set1_ps(0.f); - __m512 _sum2 = _mm512_set1_ps(0.f); - __m512 _sum3 = _mm512_set1_ps(0.f); - __m512 _sum4 = _mm512_set1_ps(0.f); - __m512 _sum5 = _mm512_set1_ps(0.f); - __m512 _sum6 = _mm512_set1_ps(0.f); - __m512 _sum7 = _mm512_set1_ps(0.f); + innerproduct_transform_kernel_sse(weight_data, weight_data_tm, num_input, num_output, opt); - if (bias_term) - { - _sum0 = _mm512_set1_ps(bias_data[p * 8 + 0]); - _sum1 = _mm512_set1_ps(bias_data[p * 8 + 1]); - _sum2 = _mm512_set1_ps(bias_data[p * 8 + 2]); - _sum3 = _mm512_set1_ps(bias_data[p * 8 + 3]); - _sum4 = _mm512_set1_ps(bias_data[p * 8 + 4]); - _sum5 = _mm512_set1_ps(bias_data[p * 8 + 5]); - _sum6 = _mm512_set1_ps(bias_data[p * 8 + 
6]); - _sum7 = _mm512_set1_ps(bias_data[p * 8 + 7]); - } - - int i = 0; - for (; i < num_input; i++) - { - __m512 _val = _mm512_loadu_ps(m); - _sum0 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[0]), _sum0); - _sum1 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[1]), _sum1); - _sum2 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[2]), _sum2); - _sum3 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[3]), _sum3); - _sum4 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[4]), _sum4); - _sum5 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[5]), _sum5); - _sum6 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[6]), _sum6); - _sum7 = _mm512_fmadd_ps(_val, _mm512_set1_ps(kptr[7]), _sum7); - - m += 16; - kptr += 8; - } - - _sum0 = activation_avx512(_sum0, activation_type, activation_params); - _sum1 = activation_avx512(_sum1, activation_type, activation_params); - _sum2 = activation_avx512(_sum2, activation_type, activation_params); - _sum3 = activation_avx512(_sum3, activation_type, activation_params); - _sum4 = activation_avx512(_sum4, activation_type, activation_params); - _sum5 = activation_avx512(_sum5, activation_type, activation_params); - _sum6 = activation_avx512(_sum6, activation_type, activation_params); - _sum7 = activation_avx512(_sum7, activation_type, activation_params); - - _mm512_storeu_ps(outptr, _sum0); - _mm512_storeu_ps(outptr + 16, _sum1); - _mm512_storeu_ps(outptr + 16 * 2, _sum2); - _mm512_storeu_ps(outptr + 16 * 3, _sum3); - _mm512_storeu_ps(outptr + 16 * 4, _sum4); - _mm512_storeu_ps(outptr + 16 * 5, _sum5); - _mm512_storeu_ps(outptr + 16 * 6, _sum6); - _mm512_storeu_ps(outptr + 16 * 7, _sum7); - outptr += 128; - } - } - -#endif // __AVX512F__ - - if (elempack == 8 && num_output_elempack == 8) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m256 _sum0 = _mm256_set1_ps(0.f); - __m256 _sum1 = _mm256_set1_ps(0.f); - __m256 _sum2 = _mm256_set1_ps(0.f); - __m256 _sum3 = _mm256_set1_ps(0.f); - __m256 _sum4 = _mm256_set1_ps(0.f); - __m256 _sum5 = _mm256_set1_ps(0.f); - __m256 _sum6 = _mm256_set1_ps(0.f); - __m256 _sum7 = _mm256_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm256_set1_ps(bias_data[p * 8 + 0]); - _sum1 = _mm256_set1_ps(bias_data[p * 8 + 1]); - _sum2 = _mm256_set1_ps(bias_data[p * 8 + 2]); - _sum3 = _mm256_set1_ps(bias_data[p * 8 + 3]); - _sum4 = _mm256_set1_ps(bias_data[p * 8 + 4]); - _sum5 = _mm256_set1_ps(bias_data[p * 8 + 5]); - _sum6 = _mm256_set1_ps(bias_data[p * 8 + 6]); - _sum7 = _mm256_set1_ps(bias_data[p * 8 + 7]); - } - - for (int i = 0; i < num_input; i++) - { - __m256 _val = _mm256_loadu_ps(m); - __m256 _k0 = _mm256_set1_ps(kptr[0]); - __m256 _k1 = _mm256_set1_ps(kptr[1]); - __m256 _k2 = _mm256_set1_ps(kptr[2]); - __m256 _k3 = _mm256_set1_ps(kptr[3]); - __m256 _k4 = _mm256_set1_ps(kptr[4]); - __m256 _k5 = _mm256_set1_ps(kptr[5]); - __m256 _k6 = _mm256_set1_ps(kptr[6]); - __m256 _k7 = _mm256_set1_ps(kptr[7]); - _sum0 = _mm256_comp_fmadd_ps(_val, _k0, _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val, _k1, _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val, _k2, _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val, _k3, _sum3); - _sum4 = _mm256_comp_fmadd_ps(_val, _k4, _sum4); - _sum5 = _mm256_comp_fmadd_ps(_val, _k5, _sum5); - _sum6 = _mm256_comp_fmadd_ps(_val, _k6, _sum6); - _sum7 = _mm256_comp_fmadd_ps(_val, _k7, _sum7); - - m += 8; - kptr += 8; - } - - _sum0 = activation_avx(_sum0, activation_type, activation_params); - _sum1 = activation_avx(_sum1, 
activation_type, activation_params); - _sum2 = activation_avx(_sum2, activation_type, activation_params); - _sum3 = activation_avx(_sum3, activation_type, activation_params); - _sum4 = activation_avx(_sum4, activation_type, activation_params); - _sum5 = activation_avx(_sum5, activation_type, activation_params); - _sum6 = activation_avx(_sum6, activation_type, activation_params); - _sum7 = activation_avx(_sum7, activation_type, activation_params); - - _mm256_storeu_ps(outptr, _sum0); - _mm256_storeu_ps(outptr + 8, _sum1); - _mm256_storeu_ps(outptr + 16, _sum2); - _mm256_storeu_ps(outptr + 24, _sum3); - _mm256_storeu_ps(outptr + 32, _sum4); - _mm256_storeu_ps(outptr + 40, _sum5); - _mm256_storeu_ps(outptr + 48, _sum6); - _mm256_storeu_ps(outptr + 56, _sum7); - outptr += 64; - } - } - - if (elempack == 1 && num_output_elempack == 8) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m256 _sum = _mm256_set1_ps(0.f); - - if (bias_term) - { - _sum = _mm256_loadu_ps((const float*)bias_data + p * 8); - } - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m256 _val0 = _mm256_broadcast_ss(m); - __m256 _val1 = _mm256_broadcast_ss(m + 1); - __m256 _val2 = _mm256_broadcast_ss(m + 2); - __m256 _val3 = _mm256_broadcast_ss(m + 3); - __m256 _val4 = _mm256_broadcast_ss(m + 4); - __m256 _val5 = _mm256_broadcast_ss(m + 5); - __m256 _val6 = _mm256_broadcast_ss(m + 6); - __m256 _val7 = _mm256_broadcast_ss(m + 7); - - __m256 _w0 = _mm256_loadu_ps(kptr); - _sum = _mm256_comp_fmadd_ps(_val0, _w0, _sum); - __m256 _w1 = _mm256_loadu_ps(kptr + 8); - _sum = _mm256_comp_fmadd_ps(_val1, _w1, _sum); - __m256 _w2 = _mm256_loadu_ps(kptr + 16); - _sum = _mm256_comp_fmadd_ps(_val2, _w2, _sum); - __m256 _w3 = _mm256_loadu_ps(kptr + 24); - _sum = _mm256_comp_fmadd_ps(_val3, _w3, _sum); - __m256 _w4 = _mm256_loadu_ps(kptr + 32); - _sum = _mm256_comp_fmadd_ps(_val4, _w4, _sum); - __m256 _w5 = _mm256_loadu_ps(kptr + 40); - _sum = _mm256_comp_fmadd_ps(_val5, _w5, _sum); - __m256 _w6 = _mm256_loadu_ps(kptr + 48); - _sum = _mm256_comp_fmadd_ps(_val6, _w6, _sum); - __m256 _w7 = _mm256_loadu_ps(kptr + 56); - _sum = _mm256_comp_fmadd_ps(_val7, _w7, _sum); - - m += 8; - kptr += 64; - } - for (; i + 3 < num_input; i += 4) - { - __m256 _val0 = _mm256_broadcast_ss(m); - __m256 _val1 = _mm256_broadcast_ss(m + 1); - __m256 _val2 = _mm256_broadcast_ss(m + 2); - __m256 _val3 = _mm256_broadcast_ss(m + 3); - - __m256 _w0 = _mm256_loadu_ps(kptr); - _sum = _mm256_comp_fmadd_ps(_val0, _w0, _sum); - __m256 _w1 = _mm256_loadu_ps(kptr + 8); - _sum = _mm256_comp_fmadd_ps(_val1, _w1, _sum); - __m256 _w2 = _mm256_loadu_ps(kptr + 16); - _sum = _mm256_comp_fmadd_ps(_val2, _w2, _sum); - __m256 _w3 = _mm256_loadu_ps(kptr + 24); - _sum = _mm256_comp_fmadd_ps(_val3, _w3, _sum); - - m += 4; - kptr += 32; - } - for (; i < num_input; i++) - { - __m256 _val = _mm256_set1_ps(m[0]); - __m256 _w = _mm256_loadu_ps(kptr); - _sum = _mm256_comp_fmadd_ps(_val, _w, _sum); - - m += 1; - kptr += 8; - } - - _sum = activation_avx(_sum, activation_type, activation_params); - - _mm256_storeu_ps(outptr, _sum); - outptr += 8; - } - } - - if (elempack == 4 && num_output_elempack == 8) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m128 _sum0 = _mm_set1_ps(0.f); - __m128 _sum1 = 
_mm_set1_ps(0.f); - __m128 _sum2 = _mm_set1_ps(0.f); - __m128 _sum3 = _mm_set1_ps(0.f); - __m128 _sum4 = _mm_set1_ps(0.f); - __m128 _sum5 = _mm_set1_ps(0.f); - __m128 _sum6 = _mm_set1_ps(0.f); - __m128 _sum7 = _mm_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm_set1_ps(bias_data[p * 8 + 0]); - _sum1 = _mm_set1_ps(bias_data[p * 8 + 1]); - _sum2 = _mm_set1_ps(bias_data[p * 8 + 2]); - _sum3 = _mm_set1_ps(bias_data[p * 8 + 3]); - _sum4 = _mm_set1_ps(bias_data[p * 8 + 4]); - _sum5 = _mm_set1_ps(bias_data[p * 8 + 5]); - _sum6 = _mm_set1_ps(bias_data[p * 8 + 6]); - _sum7 = _mm_set1_ps(bias_data[p * 8 + 7]); - } - - int i = 0; - for (; i < num_input; i++) - { - __m128 _val = _mm_loadu_ps(m); - _sum0 = _mm_comp_fmadd_ps(_val, _mm_set1_ps(kptr[0]), _sum0); - _sum1 = _mm_comp_fmadd_ps(_val, _mm_set1_ps(kptr[1]), _sum1); - _sum2 = _mm_comp_fmadd_ps(_val, _mm_set1_ps(kptr[2]), _sum2); - _sum3 = _mm_comp_fmadd_ps(_val, _mm_set1_ps(kptr[3]), _sum3); - _sum4 = _mm_comp_fmadd_ps(_val, _mm_set1_ps(kptr[4]), _sum4); - _sum5 = _mm_comp_fmadd_ps(_val, _mm_set1_ps(kptr[5]), _sum5); - _sum6 = _mm_comp_fmadd_ps(_val, _mm_set1_ps(kptr[6]), _sum6); - _sum7 = _mm_comp_fmadd_ps(_val, _mm_set1_ps(kptr[7]), _sum7); - - m += 4; - kptr += 8; - } - - _sum0 = activation_sse(_sum0, activation_type, activation_params); - _sum1 = activation_sse(_sum1, activation_type, activation_params); - _sum2 = activation_sse(_sum2, activation_type, activation_params); - _sum3 = activation_sse(_sum3, activation_type, activation_params); - _sum4 = activation_sse(_sum4, activation_type, activation_params); - _sum5 = activation_sse(_sum5, activation_type, activation_params); - _sum6 = activation_sse(_sum6, activation_type, activation_params); - _sum7 = activation_sse(_sum7, activation_type, activation_params); - - _mm_storeu_ps(outptr, _sum0); - _mm_storeu_ps(outptr + 4, _sum1); - _mm_storeu_ps(outptr + 8, _sum2); - _mm_storeu_ps(outptr + 12, _sum3); - _mm_storeu_ps(outptr + 16, _sum4); - _mm_storeu_ps(outptr + 20, _sum5); - _mm_storeu_ps(outptr + 24, _sum6); - _mm_storeu_ps(outptr + 28, _sum7); - outptr += 32; - } - } - - if (elempack == 8 && num_output_elempack == 1) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output; p++) - { - const float* kptr = (const float*)weight_data_tm + num_input * p; - const float* m = bottom_blob.row(j); - - __m256 _sum0 = _mm256_set1_ps(0.f); - __m256 _sum1 = _mm256_set1_ps(0.f); - __m256 _sum2 = _mm256_set1_ps(0.f); - __m256 _sum3 = _mm256_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm256_set1_ps(bias_data[p]); - } - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m256 _val0 = _mm256_loadu_ps(m); - __m256 _val1 = _mm256_loadu_ps(m + 8); - __m256 _val2 = _mm256_loadu_ps(m + 16); - __m256 _val3 = _mm256_loadu_ps(m + 24); - __m256 _val4 = _mm256_loadu_ps(m + 32); - __m256 _val5 = _mm256_loadu_ps(m + 40); - __m256 _val6 = _mm256_loadu_ps(m + 48); - __m256 _val7 = _mm256_loadu_ps(m + 56); - _sum0 = _mm256_comp_fmadd_ps(_val0, _mm256_set1_ps(kptr[0]), _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val1, _mm256_set1_ps(kptr[1]), _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val2, _mm256_set1_ps(kptr[2]), _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val3, _mm256_set1_ps(kptr[3]), _sum3); - _sum0 = _mm256_comp_fmadd_ps(_val4, _mm256_set1_ps(kptr[4]), _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val5, _mm256_set1_ps(kptr[5]), _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val6, _mm256_set1_ps(kptr[6]), _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val7, _mm256_set1_ps(kptr[7]), _sum3); - - m += 64; - kptr += 8; - } - 
for (; i + 3 < num_input; i += 4) - { - __m256 _val0 = _mm256_loadu_ps(m); - __m256 _val1 = _mm256_loadu_ps(m + 8); - __m256 _val2 = _mm256_loadu_ps(m + 16); - __m256 _val3 = _mm256_loadu_ps(m + 24); - _sum0 = _mm256_comp_fmadd_ps(_val0, _mm256_set1_ps(kptr[0]), _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val1, _mm256_set1_ps(kptr[1]), _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val2, _mm256_set1_ps(kptr[2]), _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val3, _mm256_set1_ps(kptr[3]), _sum3); - - m += 32; - kptr += 4; - } - for (; i < num_input; i++) - { - __m256 _val = _mm256_loadu_ps(m); - __m256 _k = _mm256_set1_ps(kptr[0]); - _sum0 = _mm256_comp_fmadd_ps(_val, _k, _sum0); - - m += 8; - kptr += 1; - } - - _sum0 = _mm256_add_ps(_sum0, _sum1); - _sum2 = _mm256_add_ps(_sum2, _sum3); - _sum0 = _mm256_add_ps(_sum0, _sum2); - - _sum0 = activation_avx(_sum0, activation_type, activation_params); - - _mm256_storeu_ps(outptr, _sum0); - outptr += 8; - } - } - - if (elempack == 8 && num_output_elempack == 4) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m256 _sum0 = _mm256_set1_ps(0.f); - __m256 _sum1 = _mm256_set1_ps(0.f); - __m256 _sum2 = _mm256_set1_ps(0.f); - __m256 _sum3 = _mm256_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm256_set1_ps(bias_data[p * 4 + 0]); - _sum1 = _mm256_set1_ps(bias_data[p * 4 + 1]); - _sum2 = _mm256_set1_ps(bias_data[p * 4 + 2]); - _sum3 = _mm256_set1_ps(bias_data[p * 4 + 3]); - } - - int i = 0; - for (; i + 3 < num_input; i += 4) - { - __m256 _val0 = _mm256_loadu_ps(m); - __m256 _val1 = _mm256_loadu_ps(m + 8); - __m256 _val2 = _mm256_loadu_ps(m + 16); - __m256 _val3 = _mm256_loadu_ps(m + 24); - _sum0 = _mm256_comp_fmadd_ps(_val0, _mm256_set1_ps(kptr[0]), _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val0, _mm256_set1_ps(kptr[1]), _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val0, _mm256_set1_ps(kptr[2]), _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val0, _mm256_set1_ps(kptr[3]), _sum3); - _sum0 = _mm256_comp_fmadd_ps(_val1, _mm256_set1_ps(kptr[4]), _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val1, _mm256_set1_ps(kptr[5]), _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val1, _mm256_set1_ps(kptr[6]), _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val1, _mm256_set1_ps(kptr[7]), _sum3); - kptr += 8; - - _sum0 = _mm256_comp_fmadd_ps(_val2, _mm256_set1_ps(kptr[0]), _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val2, _mm256_set1_ps(kptr[1]), _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val2, _mm256_set1_ps(kptr[2]), _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val2, _mm256_set1_ps(kptr[3]), _sum3); - _sum0 = _mm256_comp_fmadd_ps(_val3, _mm256_set1_ps(kptr[4]), _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val3, _mm256_set1_ps(kptr[5]), _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val3, _mm256_set1_ps(kptr[6]), _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val3, _mm256_set1_ps(kptr[7]), _sum3); - - m += 32; - kptr += 8; - } - for (; i < num_input; i++) - { - __m256 _val = _mm256_loadu_ps(m); - _sum0 = _mm256_comp_fmadd_ps(_val, _mm256_set1_ps(kptr[0]), _sum0); - _sum1 = _mm256_comp_fmadd_ps(_val, _mm256_set1_ps(kptr[1]), _sum1); - _sum2 = _mm256_comp_fmadd_ps(_val, _mm256_set1_ps(kptr[2]), _sum2); - _sum3 = _mm256_comp_fmadd_ps(_val, _mm256_set1_ps(kptr[3]), _sum3); - - m += 8; - kptr += 4; - } - - _sum0 = activation_avx(_sum0, activation_type, activation_params); - _sum1 = activation_avx(_sum1, activation_type, activation_params); - _sum2 = activation_avx(_sum2, activation_type, activation_params); - 
_sum3 = activation_avx(_sum3, activation_type, activation_params); - - _mm256_storeu_ps(outptr, _sum0); - _mm256_storeu_ps(outptr + 8, _sum1); - _mm256_storeu_ps(outptr + 16, _sum2); - _mm256_storeu_ps(outptr + 24, _sum3); - outptr += 32; - } - } -#endif // __AVX__ - - if (elempack == 4 && num_output_elempack == 4) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m128 _sum0 = _mm_set1_ps(0.f); - __m128 _sum1 = _mm_set1_ps(0.f); - __m128 _sum2 = _mm_set1_ps(0.f); - __m128 _sum3 = _mm_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm_set1_ps(bias_data[p * 4 + 0]); - _sum1 = _mm_set1_ps(bias_data[p * 4 + 1]); - _sum2 = _mm_set1_ps(bias_data[p * 4 + 2]); - _sum3 = _mm_set1_ps(bias_data[p * 4 + 3]); - } - - int i = 0; - for (; i + 3 < num_input; i += 4) - { - __m128 _val0 = _mm_loadu_ps(m); - __m128 _val1 = _mm_loadu_ps(m + 4); - __m128 _val2 = _mm_loadu_ps(m + 8); - __m128 _val3 = _mm_loadu_ps(m + 12); - _sum0 = _mm_add_ps(_mm_mul_ps(_val0, _mm_set1_ps(kptr[0])), _sum0); - _sum1 = _mm_add_ps(_mm_mul_ps(_val0, _mm_set1_ps(kptr[1])), _sum1); - _sum2 = _mm_add_ps(_mm_mul_ps(_val0, _mm_set1_ps(kptr[2])), _sum2); - _sum3 = _mm_add_ps(_mm_mul_ps(_val0, _mm_set1_ps(kptr[3])), _sum3); - _sum0 = _mm_add_ps(_mm_mul_ps(_val1, _mm_set1_ps(kptr[4])), _sum0); - _sum1 = _mm_add_ps(_mm_mul_ps(_val1, _mm_set1_ps(kptr[5])), _sum1); - _sum2 = _mm_add_ps(_mm_mul_ps(_val1, _mm_set1_ps(kptr[6])), _sum2); - _sum3 = _mm_add_ps(_mm_mul_ps(_val1, _mm_set1_ps(kptr[7])), _sum3); - _sum0 = _mm_add_ps(_mm_mul_ps(_val2, _mm_set1_ps(kptr[8])), _sum0); - _sum1 = _mm_add_ps(_mm_mul_ps(_val2, _mm_set1_ps(kptr[9])), _sum1); - _sum2 = _mm_add_ps(_mm_mul_ps(_val2, _mm_set1_ps(kptr[10])), _sum2); - _sum3 = _mm_add_ps(_mm_mul_ps(_val2, _mm_set1_ps(kptr[11])), _sum3); - _sum0 = _mm_add_ps(_mm_mul_ps(_val3, _mm_set1_ps(kptr[12])), _sum0); - _sum1 = _mm_add_ps(_mm_mul_ps(_val3, _mm_set1_ps(kptr[13])), _sum1); - _sum2 = _mm_add_ps(_mm_mul_ps(_val3, _mm_set1_ps(kptr[14])), _sum2); - _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _mm_set1_ps(kptr[15])), _sum3); - - m += 16; - kptr += 16; - } - for (; i < num_input; i++) - { - __m128 _val = _mm_loadu_ps(m); - _sum0 = _mm_add_ps(_mm_mul_ps(_val, _mm_set1_ps(kptr[0])), _sum0); - _sum1 = _mm_add_ps(_mm_mul_ps(_val, _mm_set1_ps(kptr[1])), _sum1); - _sum2 = _mm_add_ps(_mm_mul_ps(_val, _mm_set1_ps(kptr[2])), _sum2); - _sum3 = _mm_add_ps(_mm_mul_ps(_val, _mm_set1_ps(kptr[3])), _sum3); - - m += 4; - kptr += 4; - } - - _sum0 = activation_sse(_sum0, activation_type, activation_params); - _sum1 = activation_sse(_sum1, activation_type, activation_params); - _sum2 = activation_sse(_sum2, activation_type, activation_params); - _sum3 = activation_sse(_sum3, activation_type, activation_params); - - _mm_storeu_ps(outptr, _sum0); - _mm_storeu_ps(outptr + 4, _sum1); - _mm_storeu_ps(outptr + 8, _sum2); - _mm_storeu_ps(outptr + 12, _sum3); - outptr += 16; - } - } - - if (elempack == 1 && num_output_elempack == 4) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output / num_output_elempack; p++) - { - const float* kptr = weight_data_tm.row(p); - const float* m = bottom_blob.row(j); - - __m128 _sum = _mm_set1_ps(0.f); - - if (bias_term) - { - _sum = _mm_loadu_ps((const float*)bias_data + p * 4); - } - - int i = 0; -#if __AVX__ - for (; i + 7 < num_input; i += 8) - { - __m128 _val0 = _mm_broadcast_ss(m); - __m128 _val1 = _mm_broadcast_ss(m + 1); - __m128 
_val2 = _mm_broadcast_ss(m + 2); - __m128 _val3 = _mm_broadcast_ss(m + 3); - __m128 _val4 = _mm_broadcast_ss(m + 4); - __m128 _val5 = _mm_broadcast_ss(m + 5); - __m128 _val6 = _mm_broadcast_ss(m + 6); - __m128 _val7 = _mm_broadcast_ss(m + 7); - - __m128 _w0 = _mm_loadu_ps(kptr); - _sum = _mm_comp_fmadd_ps(_val0, _w0, _sum); - __m128 _w1 = _mm_loadu_ps(kptr + 4); - _sum = _mm_comp_fmadd_ps(_val1, _w1, _sum); - __m128 _w2 = _mm_loadu_ps(kptr + 8); - _sum = _mm_comp_fmadd_ps(_val2, _w2, _sum); - __m128 _w3 = _mm_loadu_ps(kptr + 12); - _sum = _mm_comp_fmadd_ps(_val3, _w3, _sum); - __m128 _w4 = _mm_loadu_ps(kptr + 16); - _sum = _mm_comp_fmadd_ps(_val4, _w4, _sum); - __m128 _w5 = _mm_loadu_ps(kptr + 20); - _sum = _mm_comp_fmadd_ps(_val5, _w5, _sum); - __m128 _w6 = _mm_loadu_ps(kptr + 24); - _sum = _mm_comp_fmadd_ps(_val6, _w6, _sum); - __m128 _w7 = _mm_loadu_ps(kptr + 28); - _sum = _mm_comp_fmadd_ps(_val7, _w7, _sum); - - m += 8; - kptr += 32; - } -#endif // __AVX__ - for (; i + 3 < num_input; i += 4) - { - __m128 _val0 = _mm_set1_ps(m[0]); - __m128 _val1 = _mm_set1_ps(m[1]); - __m128 _val2 = _mm_set1_ps(m[2]); - __m128 _val3 = _mm_set1_ps(m[3]); - - __m128 _w0 = _mm_loadu_ps(kptr); - _sum = _mm_add_ps(_mm_mul_ps(_val0, _w0), _sum); - __m128 _w1 = _mm_loadu_ps(kptr + 4); - _sum = _mm_add_ps(_mm_mul_ps(_val1, _w1), _sum); - __m128 _w2 = _mm_loadu_ps(kptr + 8); - _sum = _mm_add_ps(_mm_mul_ps(_val2, _w2), _sum); - __m128 _w3 = _mm_loadu_ps(kptr + 12); - _sum = _mm_add_ps(_mm_mul_ps(_val3, _w3), _sum); - - m += 4; - kptr += 16; - } - for (; i < num_input; i++) - { - __m128 _val = _mm_set1_ps(m[0]); - __m128 _k = _mm_loadu_ps(kptr); - _sum = _mm_add_ps(_mm_mul_ps(_val, _k), _sum); - - m += 1; - kptr += 4; - } - - _sum = activation_sse(_sum, activation_type, activation_params); - - _mm_storeu_ps(outptr, _sum); - outptr += 4; - } - } - - if (elempack == 4 && num_output_elempack == 1) - { - float* outptr = top_blob.row(j); - - for (int p = 0; p < num_output; p++) - { - const float* kptr = (const float*)weight_data_tm + num_input * p; - const float* m = bottom_blob.row(j); - - __m128 _sum0 = _mm_set1_ps(0.f); - __m128 _sum1 = _mm_set1_ps(0.f); - __m128 _sum2 = _mm_set1_ps(0.f); - __m128 _sum3 = _mm_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm_set1_ps(bias_data[p]); - } - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m128 _val0 = _mm_loadu_ps(m); - __m128 _val1 = _mm_loadu_ps(m + 4); - __m128 _val2 = _mm_loadu_ps(m + 8); - __m128 _val3 = _mm_loadu_ps(m + 12); - __m128 _val4 = _mm_loadu_ps(m + 16); - __m128 _val5 = _mm_loadu_ps(m + 20); - __m128 _val6 = _mm_loadu_ps(m + 24); - __m128 _val7 = _mm_loadu_ps(m + 28); - _sum0 = _mm_add_ps(_mm_mul_ps(_val0, _mm_set1_ps(kptr[0])), _sum0); - _sum1 = _mm_add_ps(_mm_mul_ps(_val1, _mm_set1_ps(kptr[1])), _sum1); - _sum2 = _mm_add_ps(_mm_mul_ps(_val2, _mm_set1_ps(kptr[2])), _sum2); - _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _mm_set1_ps(kptr[3])), _sum3); - _sum0 = _mm_add_ps(_mm_mul_ps(_val4, _mm_set1_ps(kptr[4])), _sum0); - _sum1 = _mm_add_ps(_mm_mul_ps(_val5, _mm_set1_ps(kptr[5])), _sum1); - _sum2 = _mm_add_ps(_mm_mul_ps(_val6, _mm_set1_ps(kptr[6])), _sum2); - _sum3 = _mm_add_ps(_mm_mul_ps(_val7, _mm_set1_ps(kptr[7])), _sum3); - - m += 32; - kptr += 8; - } - for (; i + 3 < num_input; i += 4) - { - __m128 _val0 = _mm_loadu_ps(m); - __m128 _val1 = _mm_loadu_ps(m + 4); - __m128 _val2 = _mm_loadu_ps(m + 8); - __m128 _val3 = _mm_loadu_ps(m + 12); - _sum0 = _mm_add_ps(_mm_mul_ps(_val0, _mm_set1_ps(kptr[0])), _sum0); - _sum1 = _mm_add_ps(_mm_mul_ps(_val1, 
_mm_set1_ps(kptr[1])), _sum1); - _sum2 = _mm_add_ps(_mm_mul_ps(_val2, _mm_set1_ps(kptr[2])), _sum2); - _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _mm_set1_ps(kptr[3])), _sum3); - - m += 16; - kptr += 4; - } - for (; i < num_input; i++) - { - __m128 _val = _mm_loadu_ps(m); - __m128 _k = _mm_set1_ps(kptr[0]); - _sum0 = _mm_add_ps(_mm_mul_ps(_val, _k), _sum0); - - m += 4; - kptr += 1; - } - - _sum0 = _mm_add_ps(_sum0, _sum1); - _sum2 = _mm_add_ps(_sum2, _sum3); - _sum0 = _mm_add_ps(_sum0, _sum2); - - _sum0 = activation_sse(_sum0, activation_type, activation_params); - - _mm_storeu_ps(outptr, _sum0); - outptr += 4; - } - } -#endif // __SSE2__ - - if (elempack == 1 && num_output_elempack == 1) - { - float* outptr = top_blob.row(j); + if (opt.lightmode) + { + weight_data.release(); + } - for (int p = 0; p < num_output; p++) - { - const float* kptr = (const float*)weight_data_tm + num_input * p; - const float* m = bottom_blob.row(j); + return 0; +} - float sum = 0.f; +int InnerProduct_x86::destroy_pipeline(const Option& opt) +{ + if (flatten) + { + flatten->destroy_pipeline(opt); + delete flatten; + flatten = 0; + } - if (bias_term) - { - sum = bias_data[p]; - } + return 0; +} - int i = 0; -#if __SSE2__ -#if __AVX__ - __m256 _sum = _mm256_set1_ps(0.f); - for (; i + 7 < num_input; i += 8) - { - __m256 _m = _mm256_loadu_ps(m); - __m256 _w = _mm256_loadu_ps(kptr); - _sum = _mm256_comp_fmadd_ps(_m, _w, _sum); +int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if NCNN_INT8 + if (opt.use_int8_inference && int8_scale_term) + { + return forward_int8_x86(bottom_blob, top_blob, opt); + } +#endif - m += 8; - kptr += 8; - } -#endif // __AVX__ - __m128 _suml = _mm_set1_ps(0.f); - for (; i + 3 < num_input; i += 4) - { - __m128 _val = _mm_loadu_ps(m); - __m128 _k = _mm_loadu_ps(kptr); - _suml = _mm_add_ps(_mm_mul_ps(_val, _k), _suml); +#if NCNN_F16C && __AVX__ + if (cpu_support_x86_f16c() && opt.use_fp16_storage) + { + return forward_fp16s(bottom_blob, top_blob, opt); + } +#endif - m += 4; - kptr += 4; - } -#endif // __SSE2__ - for (; i < num_input; i++) - { - sum += *m++ * *kptr++; - } + const int num_input = weight_data_size / num_output; -#if __SSE2__ -#if __AVX__ - sum += _mm256_reduce_add_ps(_sum); -#endif // __AVX__ - sum += _mm_reduce_add_ps(_suml); -#endif // __SSE2__ + if (bottom_blob.dims == 2 && bottom_blob.w == num_input && bottom_blob.h * bottom_blob.elempack > 1) + { + // gemm + int h = bottom_blob.h; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; - sum = activation_ss(sum, activation_type, activation_params); + top_blob.create(num_output, h, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; - outptr[0] = sum; - outptr += 1; - } - } - } + innerproduct_gemm_sse(bottom_blob, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); return 0; } @@ -1413,602 +170,12 @@ int InnerProduct_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Optio if (top_blob.empty()) return -100; -#if __SSE2__ -#if __AVX__ -#if __AVX512F__ - if (out_elempack == 16) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output / out_elempack; p++) - { - __m512 _sum0 = _mm512_set1_ps(0.f); - __m512 _sum1 = _mm512_set1_ps(0.f); - __m512 _sum2 = _mm512_set1_ps(0.f); - __m512 _sum3 = _mm512_set1_ps(0.f); - __m512 _sum4 = _mm512_set1_ps(0.f); - __m512 _sum5 = _mm512_set1_ps(0.f); - __m512 _sum6 = _mm512_set1_ps(0.f); - __m512 _sum7 = _mm512_set1_ps(0.f); - - if 
(bias_term) - { - _sum0 = _mm512_loadu_ps((const float*)bias_data + p * 16); - } - - const float* kptr = weight_data_tm.row(p); - - const float* sptr = bottom_blob_flattened; - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m512 _val0 = _mm512_set1_ps(sptr[0]); - __m512 _val1 = _mm512_set1_ps(sptr[1]); - __m512 _val2 = _mm512_set1_ps(sptr[2]); - __m512 _val3 = _mm512_set1_ps(sptr[3]); - __m512 _val4 = _mm512_set1_ps(sptr[4]); - __m512 _val5 = _mm512_set1_ps(sptr[5]); - __m512 _val6 = _mm512_set1_ps(sptr[6]); - __m512 _val7 = _mm512_set1_ps(sptr[7]); - - __m512 _w0 = _mm512_loadu_ps(kptr + 16 * 0); - __m512 _w1 = _mm512_loadu_ps(kptr + 16 * 1); - __m512 _w2 = _mm512_loadu_ps(kptr + 16 * 2); - __m512 _w3 = _mm512_loadu_ps(kptr + 16 * 3); - __m512 _w4 = _mm512_loadu_ps(kptr + 16 * 4); - __m512 _w5 = _mm512_loadu_ps(kptr + 16 * 5); - __m512 _w6 = _mm512_loadu_ps(kptr + 16 * 6); - __m512 _w7 = _mm512_loadu_ps(kptr + 16 * 7); - - _sum0 = _mm512_fmadd_ps(_val0, _w0, _sum0); - _sum1 = _mm512_fmadd_ps(_val1, _w1, _sum1); - _sum2 = _mm512_fmadd_ps(_val2, _w2, _sum2); - _sum3 = _mm512_fmadd_ps(_val3, _w3, _sum3); - _sum4 = _mm512_fmadd_ps(_val4, _w4, _sum4); - _sum5 = _mm512_fmadd_ps(_val5, _w5, _sum5); - _sum6 = _mm512_fmadd_ps(_val6, _w6, _sum6); - _sum7 = _mm512_fmadd_ps(_val7, _w7, _sum7); - - sptr += 8; - kptr += 128; - } - for (; i + 3 < num_input; i += 4) - { - __m512 _val0 = _mm512_set1_ps(sptr[0]); - __m512 _val1 = _mm512_set1_ps(sptr[1]); - __m512 _val2 = _mm512_set1_ps(sptr[2]); - __m512 _val3 = _mm512_set1_ps(sptr[3]); - - __m512 _w0 = _mm512_loadu_ps(kptr); - __m512 _w1 = _mm512_loadu_ps(kptr + 16); - __m512 _w2 = _mm512_loadu_ps(kptr + 32); - __m512 _w3 = _mm512_loadu_ps(kptr + 48); - _sum0 = _mm512_fmadd_ps(_val0, _w0, _sum0); - _sum1 = _mm512_fmadd_ps(_val1, _w1, _sum1); - _sum2 = _mm512_fmadd_ps(_val2, _w2, _sum2); - _sum3 = _mm512_fmadd_ps(_val3, _w3, _sum3); - - sptr += 4; - kptr += 64; - } - for (; i < num_input; i++) - { - __m512 _val = _mm512_set1_ps(sptr[0]); - __m512 _w = _mm512_loadu_ps(kptr); - _sum0 = _mm512_fmadd_ps(_val, _w, _sum0); - - sptr += 1; - kptr += 16; - } - - _sum0 = _mm512_add_ps(_sum0, _sum1); - _sum2 = _mm512_add_ps(_sum2, _sum3); - _sum4 = _mm512_add_ps(_sum4, _sum5); - _sum6 = _mm512_add_ps(_sum6, _sum7); - _sum0 = _mm512_add_ps(_sum0, _sum2); - _sum4 = _mm512_add_ps(_sum4, _sum6); - _sum0 = _mm512_add_ps(_sum0, _sum4); - - _sum0 = activation_avx512(_sum0, activation_type, activation_params); - - float* outptr = top_blob; - _mm512_storeu_ps(outptr + p * 16, _sum0); - } - } - -#endif // __AVX512F__ - - if (out_elempack == 8) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output / out_elempack; p++) - { - __m256 _sum0 = _mm256_set1_ps(0.f); - __m256 _sum1 = _mm256_set1_ps(0.f); - __m256 _sum2 = _mm256_set1_ps(0.f); - __m256 _sum3 = _mm256_set1_ps(0.f); - __m256 _sum4 = _mm256_set1_ps(0.f); - __m256 _sum5 = _mm256_set1_ps(0.f); - __m256 _sum6 = _mm256_set1_ps(0.f); - __m256 _sum7 = _mm256_set1_ps(0.f); - - if (bias_term) - { - _sum0 = _mm256_loadu_ps((const float*)bias_data + p * 8); - } - - const float* kptr = weight_data_tm.row(p); - - const float* sptr = bottom_blob_flattened; - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m256 _val0 = _mm256_broadcast_ss(sptr); - __m256 _val1 = _mm256_broadcast_ss(sptr + 1); - __m256 _val2 = _mm256_broadcast_ss(sptr + 2); - __m256 _val3 = _mm256_broadcast_ss(sptr + 3); - __m256 _val4 = _mm256_broadcast_ss(sptr + 4); - __m256 _val5 = _mm256_broadcast_ss(sptr + 5); 
- __m256 _val6 = _mm256_broadcast_ss(sptr + 6); - __m256 _val7 = _mm256_broadcast_ss(sptr + 7); - - __m256 _w0 = _mm256_loadu_ps(kptr); - _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); - __m256 _w1 = _mm256_loadu_ps(kptr + 8); - _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); - __m256 _w2 = _mm256_loadu_ps(kptr + 16); - _sum2 = _mm256_comp_fmadd_ps(_val2, _w2, _sum2); - __m256 _w3 = _mm256_loadu_ps(kptr + 24); - _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); - __m256 _w4 = _mm256_loadu_ps(kptr + 32); - _sum4 = _mm256_comp_fmadd_ps(_val4, _w4, _sum4); - __m256 _w5 = _mm256_loadu_ps(kptr + 40); - _sum5 = _mm256_comp_fmadd_ps(_val5, _w5, _sum5); - __m256 _w6 = _mm256_loadu_ps(kptr + 48); - _sum6 = _mm256_comp_fmadd_ps(_val6, _w6, _sum6); - __m256 _w7 = _mm256_loadu_ps(kptr + 56); - _sum7 = _mm256_comp_fmadd_ps(_val7, _w7, _sum7); - - sptr += 8; - kptr += 64; - } - for (; i + 3 < num_input; i += 4) - { - __m256 _val0 = _mm256_broadcast_ss(sptr); - __m256 _val1 = _mm256_broadcast_ss(sptr + 1); - __m256 _val2 = _mm256_broadcast_ss(sptr + 2); - __m256 _val3 = _mm256_broadcast_ss(sptr + 3); - - __m256 _w0 = _mm256_loadu_ps(kptr); - _sum0 = _mm256_comp_fmadd_ps(_val0, _w0, _sum0); - __m256 _w1 = _mm256_loadu_ps(kptr + 8); - _sum1 = _mm256_comp_fmadd_ps(_val1, _w1, _sum1); - __m256 _w2 = _mm256_loadu_ps(kptr + 16); - _sum2 = _mm256_comp_fmadd_ps(_val2, _w2, _sum2); - __m256 _w3 = _mm256_loadu_ps(kptr + 24); - _sum3 = _mm256_comp_fmadd_ps(_val3, _w3, _sum3); - - sptr += 4; - kptr += 32; - } - for (; i < num_input; i++) - { - __m256 _val = _mm256_set1_ps(sptr[0]); - __m256 _w = _mm256_loadu_ps(kptr); - _sum0 = _mm256_comp_fmadd_ps(_val, _w, _sum0); - - sptr += 1; - kptr += 8; - } - - _sum0 = _mm256_add_ps(_sum0, _sum1); - _sum2 = _mm256_add_ps(_sum2, _sum3); - _sum4 = _mm256_add_ps(_sum4, _sum5); - _sum6 = _mm256_add_ps(_sum6, _sum7); - _sum0 = _mm256_add_ps(_sum0, _sum2); - _sum4 = _mm256_add_ps(_sum4, _sum6); - _sum0 = _mm256_add_ps(_sum0, _sum4); - - _sum0 = activation_avx(_sum0, activation_type, activation_params); - - float* outptr = top_blob; - _mm256_storeu_ps(outptr + p * 8, _sum0); - } - } -#endif // __AVX__ - - if (out_elempack == 4) - { - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < num_output / out_elempack; p++) - { - __m128 _sum0 = _mm_set1_ps(0.f); - __m128 _sum1 = _mm_set1_ps(0.f); - __m128 _sum2 = _mm_set1_ps(0.f); - __m128 _sum3 = _mm_set1_ps(0.f); -#if __AVX__ - __m128 _sum4 = _mm_set1_ps(0.f); - __m128 _sum5 = _mm_set1_ps(0.f); - __m128 _sum6 = _mm_set1_ps(0.f); - __m128 _sum7 = _mm_set1_ps(0.f); -#endif - - if (bias_term) - { - _sum0 = _mm_loadu_ps((const float*)bias_data + p * 4); - } - - const float* kptr = weight_data_tm.row(p); - - const float* sptr = bottom_blob_flattened; - - int i = 0; -#if __AVX__ - for (; i + 7 < num_input; i += 8) - { - __m128 _val0 = _mm_broadcast_ss(sptr); - __m128 _val1 = _mm_broadcast_ss(sptr + 1); - __m128 _val2 = _mm_broadcast_ss(sptr + 2); - __m128 _val3 = _mm_broadcast_ss(sptr + 3); - __m128 _val4 = _mm_broadcast_ss(sptr + 4); - __m128 _val5 = _mm_broadcast_ss(sptr + 5); - __m128 _val6 = _mm_broadcast_ss(sptr + 6); - __m128 _val7 = _mm_broadcast_ss(sptr + 7); - - __m128 _w0 = _mm_loadu_ps(kptr); - _sum0 = _mm_comp_fmadd_ps(_val0, _w0, _sum0); - __m128 _w1 = _mm_loadu_ps(kptr + 4); - _sum1 = _mm_comp_fmadd_ps(_val1, _w1, _sum1); - __m128 _w2 = _mm_loadu_ps(kptr + 8); - _sum2 = _mm_comp_fmadd_ps(_val2, _w2, _sum2); - __m128 _w3 = _mm_loadu_ps(kptr + 12); - _sum3 = _mm_comp_fmadd_ps(_val3, _w3, _sum3); - __m128 
_w4 = _mm_loadu_ps(kptr + 16); - _sum4 = _mm_comp_fmadd_ps(_val4, _w4, _sum4); - __m128 _w5 = _mm_loadu_ps(kptr + 20); - _sum5 = _mm_comp_fmadd_ps(_val5, _w5, _sum5); - __m128 _w6 = _mm_loadu_ps(kptr + 24); - _sum6 = _mm_comp_fmadd_ps(_val6, _w6, _sum6); - __m128 _w7 = _mm_loadu_ps(kptr + 28); - _sum7 = _mm_comp_fmadd_ps(_val7, _w7, _sum7); - - sptr += 8; - kptr += 32; - } -#endif - for (; i + 3 < num_input; i += 4) - { - __m128 _val0 = _mm_set1_ps(sptr[0]); - __m128 _val1 = _mm_set1_ps(sptr[1]); - __m128 _val2 = _mm_set1_ps(sptr[2]); - __m128 _val3 = _mm_set1_ps(sptr[3]); - - __m128 _w0 = _mm_loadu_ps(kptr); - _sum0 = _mm_add_ps(_mm_mul_ps(_val0, _w0), _sum0); - __m128 _w1 = _mm_loadu_ps(kptr + 4); - _sum1 = _mm_add_ps(_mm_mul_ps(_val1, _w1), _sum1); - __m128 _w2 = _mm_loadu_ps(kptr + 8); - _sum2 = _mm_add_ps(_mm_mul_ps(_val2, _w2), _sum2); - __m128 _w3 = _mm_loadu_ps(kptr + 12); - _sum3 = _mm_add_ps(_mm_mul_ps(_val3, _w3), _sum3); - - sptr += 4; - kptr += 16; - } - for (; i < num_input; i++) - { - __m128 _val = _mm_set1_ps(sptr[0]); - __m128 _w = _mm_loadu_ps(kptr); - _sum0 = _mm_add_ps(_mm_mul_ps(_val, _w), _sum0); - - sptr += 1; - kptr += 4; - } - - _sum0 = _mm_add_ps(_sum0, _sum1); - _sum2 = _mm_add_ps(_sum2, _sum3); -#if __AVX__ - _sum4 = _mm_add_ps(_sum4, _sum5); - _sum6 = _mm_add_ps(_sum6, _sum7); -#endif - _sum0 = _mm_add_ps(_sum0, _sum2); -#if __AVX__ - _sum4 = _mm_add_ps(_sum4, _sum6); - _sum0 = _mm_add_ps(_sum0, _sum4); -#endif - - _sum0 = activation_sse(_sum0, activation_type, activation_params); - - float* outptr = top_blob; - _mm_storeu_ps(outptr + p * 4, _sum0); - } - } -#endif // __SSE2__ - - if (out_elempack == 1) - { -#if __SSE2__ -#if __AVX__ - int remain_num_output_start = 0; - int nn_num_output = num_output >> 3; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp = 0; pp < nn_num_output; pp++) - { - int p = pp * 8; - - float sums[8] = {0.0f}; - if (bias_term) - { - sums[0] = bias_data[p]; - sums[1] = bias_data[p + 1]; - sums[2] = bias_data[p + 2]; - sums[3] = bias_data[p + 3]; - sums[4] = bias_data[p + 4]; - sums[5] = bias_data[p + 5]; - sums[6] = bias_data[p + 6]; - sums[7] = bias_data[p + 7]; - } - - const float* w0 = (const float*)weight_data_tm + num_input * p; - const float* w1 = (const float*)weight_data_tm + num_input * (p + 1); - const float* w2 = (const float*)weight_data_tm + num_input * (p + 2); - const float* w3 = (const float*)weight_data_tm + num_input * (p + 3); - const float* w4 = (const float*)weight_data_tm + num_input * (p + 4); - const float* w5 = (const float*)weight_data_tm + num_input * (p + 5); - const float* w6 = (const float*)weight_data_tm + num_input * (p + 6); - const float* w7 = (const float*)weight_data_tm + num_input * (p + 7); - - const float* m = bottom_blob_flattened; - - __m256 _sum0 = _mm256_set1_ps(0.f); - __m256 _sum1 = _mm256_set1_ps(0.f); - __m256 _sum2 = _mm256_set1_ps(0.f); - __m256 _sum3 = _mm256_set1_ps(0.f); - __m256 _sum4 = _mm256_set1_ps(0.f); - __m256 _sum5 = _mm256_set1_ps(0.f); - __m256 _sum6 = _mm256_set1_ps(0.f); - __m256 _sum7 = _mm256_set1_ps(0.f); - - int i = 0; - for (; i + 7 < num_input; i += 8) - { - __m256 _m = _mm256_loadu_ps(m); - - __m256 _w0 = _mm256_loadu_ps(w0); - _sum0 = _mm256_comp_fmadd_ps(_m, _w0, _sum0); - __m256 _w1 = _mm256_loadu_ps(w1); - _sum1 = _mm256_comp_fmadd_ps(_m, _w1, _sum1); - __m256 _w2 = _mm256_loadu_ps(w2); - _sum2 = _mm256_comp_fmadd_ps(_m, _w2, _sum2); - __m256 _w3 = _mm256_loadu_ps(w3); - _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); - __m256 _w4 = 
_mm256_loadu_ps(w4); - _sum4 = _mm256_comp_fmadd_ps(_m, _w4, _sum4); - __m256 _w5 = _mm256_loadu_ps(w5); - _sum5 = _mm256_comp_fmadd_ps(_m, _w5, _sum5); - __m256 _w6 = _mm256_loadu_ps(w6); - _sum6 = _mm256_comp_fmadd_ps(_m, _w6, _sum6); - __m256 _w7 = _mm256_loadu_ps(w7); - _sum7 = _mm256_comp_fmadd_ps(_m, _w7, _sum7); - - m += 8; - w0 += 8; - w1 += 8; - w2 += 8; - w3 += 8; - w4 += 8; - w5 += 8; - w6 += 8; - w7 += 8; - } - for (; i < num_input; i++) - { - sums[0] += *m * *w0; - sums[1] += *m * *w1; - sums[2] += *m * *w2; - sums[3] += *m * *w3; - sums[4] += *m * *w4; - sums[5] += *m * *w5; - sums[6] += *m * *w6; - sums[7] += *m * *w7; - - m++; - w0++; - w1++; - w2++; - w3++; - w4++; - w5++; - w6++; - w7++; - } - - __m256 _sums = HorizontalSums(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7); - __m256 _sums_f = _mm256_loadu_ps(sums); - _sums = _mm256_add_ps(_sums_f, _sums); - _sums = activation_avx(_sums, activation_type, activation_params); - - float* outptr = top_blob; - _mm256_storeu_ps(outptr + p, _sums); - } - - remain_num_output_start += (nn_num_output << 3); - nn_num_output = (num_output - remain_num_output_start) >> 2; -#else - int remain_num_output_start = 0; - int nn_num_output = num_output >> 2; -#endif // __AVX__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp = 0; pp < nn_num_output; pp++) - { - int p = remain_num_output_start + (pp * 4); - - float sums[4] = {0.0f}; - if (bias_term) - { - sums[0] = bias_data[p]; - sums[1] = bias_data[p + 1]; - sums[2] = bias_data[p + 2]; - sums[3] = bias_data[p + 3]; - } - - const float* w0 = (const float*)weight_data_tm + num_input * p; - const float* w1 = (const float*)weight_data_tm + num_input * (p + 1); - const float* w2 = (const float*)weight_data_tm + num_input * (p + 2); - const float* w3 = (const float*)weight_data_tm + num_input * (p + 3); - - const float* m = bottom_blob_flattened; - - int i = 0; -#if __AVX__ - __m256 _sum0 = _mm256_set1_ps(0.f); - __m256 _sum1 = _mm256_set1_ps(0.f); - __m256 _sum2 = _mm256_set1_ps(0.f); - __m256 _sum3 = _mm256_set1_ps(0.f); - for (; i + 7 < num_input; i += 8) - { - __m256 _m = _mm256_loadu_ps(m); - - __m256 _w0 = _mm256_loadu_ps(w0); - _sum0 = _mm256_comp_fmadd_ps(_m, _w0, _sum0); - __m256 _w1 = _mm256_loadu_ps(w1); - _sum1 = _mm256_comp_fmadd_ps(_m, _w1, _sum1); - __m256 _w2 = _mm256_loadu_ps(w2); - _sum2 = _mm256_comp_fmadd_ps(_m, _w2, _sum2); - __m256 _w3 = _mm256_loadu_ps(w3); - _sum3 = _mm256_comp_fmadd_ps(_m, _w3, _sum3); - - m += 8; - w0 += 8; - w1 += 8; - w2 += 8; - w3 += 8; - } -#endif // __AVX__ - __m128 _sum0l = _mm_set1_ps(0.f); - __m128 _sum1l = _mm_set1_ps(0.f); - __m128 _sum2l = _mm_set1_ps(0.f); - __m128 _sum3l = _mm_set1_ps(0.f); - for (; i + 3 < num_input; i += 4) - { - __m128 _m = _mm_loadu_ps(m); - - __m128 _w0 = _mm_loadu_ps(w0); - _sum0l = _mm_add_ps(_mm_mul_ps(_m, _w0), _sum0l); - __m128 _w1 = _mm_loadu_ps(w1); - _sum1l = _mm_add_ps(_mm_mul_ps(_m, _w1), _sum1l); - __m128 _w2 = _mm_loadu_ps(w2); - _sum2l = _mm_add_ps(_mm_mul_ps(_m, _w2), _sum2l); - __m128 _w3 = _mm_loadu_ps(w3); - _sum3l = _mm_add_ps(_mm_mul_ps(_m, _w3), _sum3l); - - m += 4; - w0 += 4; - w1 += 4; - w2 += 4; - w3 += 4; - } - for (; i < num_input; i++) - { - sums[0] += *m * *w0; - sums[1] += *m * *w1; - sums[2] += *m * *w2; - sums[3] += *m * *w3; - - m++; - w0++; - w1++; - w2++; - w3++; - } - - __m128 _sums = _mm_loadu_ps(sums); -#if __AVX__ - _sums = _mm_add_ps(HorizontalSums(_sum0, _sum1, _sum2, _sum3), _sums); -#endif - _MM_TRANSPOSE4_PS(_sum0l, _sum1l, _sum2l, _sum3l); - _sums = 
_mm_add_ps(_sum0l, _sums); - _sums = _mm_add_ps(_sum1l, _sums); - _sums = _mm_add_ps(_sum2l, _sums); - _sums = _mm_add_ps(_sum3l, _sums); - _sums = activation_sse(_sums, activation_type, activation_params); - - float* outptr = top_blob; - _mm_storeu_ps(outptr + p, _sums); - } - - remain_num_output_start += (nn_num_output << 2); -#else - int remain_num_output_start = 0; -#endif // __SSE2__ - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = remain_num_output_start; p < num_output; p++) - { - float sum = 0.f; - - if (bias_term) - sum = bias_data[p]; - - const float* w = (const float*)weight_data_tm + num_input * p; - - const float* m = bottom_blob_flattened; - - int i = 0; -#if __SSE2__ -#if __AVX__ - __m256 _sum = _mm256_set1_ps(0.f); - for (; i + 7 < num_input; i += 8) - { - __m256 _m = _mm256_loadu_ps(m); - - __m256 _w = _mm256_loadu_ps(w); - _sum = _mm256_comp_fmadd_ps(_m, _w, _sum); - - m += 8; - w += 8; - } -#endif // __AVX__ - __m128 _suml = _mm_set1_ps(0.f); - for (; i + 3 < num_input; i += 4) - { - __m128 _m = _mm_loadu_ps(m); - - __m128 _w = _mm_loadu_ps(w); - _suml = _mm_add_ps(_mm_mul_ps(_m, _w), _suml); - - m += 4; - w += 4; - } -#endif // __SSE2__ - for (; i < num_input; i++) - { - sum += *m * *w; - m++; - w++; - } - -#if __SSE2__ -#if __AVX__ - sum += _mm256_reduce_add_ps(_sum); -#endif - sum += _mm_reduce_add_ps(_suml); -#endif // __SSE2__ - - sum = activation_ss(sum, activation_type, activation_params); - - float* outptr = top_blob; - outptr[p] = sum; - } - } + innerproduct_sse(bottom_blob_flattened, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); return 0; } -#if NCNN_F16C +#if NCNN_F16C && __AVX__ int InnerProduct_x86::create_pipeline_fp16s(const Option& opt) { const int num_input = weight_data_size / num_output; @@ -2071,31 +238,11 @@ int InnerProduct_x86::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const if (top_blob.empty()) return -100; -#if __AVX512F__ - if (out_elempack == 16) - { - innerproduct_fp16s_pack16_avx512(bottom_blob_flattened, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); - } -#endif // __AVX512F__ - - if (out_elempack == 8) - { - innerproduct_fp16s_pack8_avx(bottom_blob_flattened, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); - } - - if (out_elempack == 4) - { - innerproduct_fp16s_pack4_sse(bottom_blob_flattened, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); - } - - if (out_elempack == 1) - { - innerproduct_fp16s_sse(bottom_blob_flattened, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); - } + innerproduct_fp16s_sse(bottom_blob_flattened, top_blob, weight_data_tm, bias_data, activation_type, activation_params, opt); return 0; } -#endif // NCNN_F16C +#endif // NCNN_F16C && __AVX__ #if NCNN_INT8 int InnerProduct_x86::create_pipeline_int8_x86(const Option& opt) diff --git a/src/layer/x86/innerproduct_x86.h b/src/layer/x86/innerproduct_x86.h index ab4f26f1380d..211131e6132d 100644 --- a/src/layer/x86/innerproduct_x86.h +++ b/src/layer/x86/innerproduct_x86.h @@ -30,7 +30,7 @@ class InnerProduct_x86 : virtual public InnerProduct virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; protected: -#if NCNN_F16C +#if NCNN_F16C && __AVX__ int create_pipeline_fp16s(const Option& opt); int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif diff --git a/src/layer/x86/innerproduct_x86_f16c.cpp 
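The InnerProduct forward path above now delegates to innerproduct_sse(); the code it replaces is the usual SIMD dot-product recipe: an 8-wide FMA main loop, a 4-wide SSE loop, a scalar tail, and a horizontal reduction before the activation. A minimal sketch of that pattern for a single output, assuming plain float arrays rather than ncnn Mat (dot_avx and its arguments are illustrative names, not ncnn helpers):

    #include <immintrin.h>

    // Sketch only: one output's dot product, AVX main loop + scalar tail.
    static float dot_avx(const float* x, const float* w, int num_input, float bias)
    {
        __m256 _sum = _mm256_setzero_ps();
        int i = 0;
        for (; i + 7 < num_input; i += 8)
        {
            __m256 _x = _mm256_loadu_ps(x + i);
            __m256 _w = _mm256_loadu_ps(w + i);
            _sum = _mm256_add_ps(_sum, _mm256_mul_ps(_x, _w)); // fmadd where FMA is available
        }
        // horizontal reduction of the 8 partial sums
        __m128 lo = _mm256_castps256_ps128(_sum);
        __m128 hi = _mm256_extractf128_ps(_sum, 1);
        __m128 s = _mm_add_ps(lo, hi);
        s = _mm_add_ps(s, _mm_movehl_ps(s, s));
        s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 0x55));
        float sum = bias + _mm_cvtss_f32(s);
        for (; i < num_input; i++) // scalar tail
            sum += x[i] * w[i];
        return sum;
    }

With FMA enabled the multiply/add pair collapses into a fused multiply-add, which appears to be what the _mm256_comp_fmadd_ps wrapper in the removed code selects at build time.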
b/src/layer/x86/innerproduct_x86_f16c.cpp index efa2625997f5..e6a942aa3643 100644 --- a/src/layer/x86/innerproduct_x86_f16c.cpp +++ b/src/layer/x86/innerproduct_x86_f16c.cpp @@ -26,18 +26,10 @@ namespace ncnn { -#include "innerproduct_fp16s.h" -#include "innerproduct_gemm_fp16s.h" - -void innerproduct_fp16s_pack8_avx_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) -{ - innerproduct_fp16s_pack8_avx(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt); -} - -void innerproduct_fp16s_pack4_sse_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) -{ - innerproduct_fp16s_pack4_sse(bottom_blob, top_blob, weight_data_fp16, bias_data, activation_type, activation_params, opt); -} +#define NCNN_IMPL_FP16S 1 +#include "innerproduct_fp.h" +#include "innerproduct_gemm_fp.h" +#undef NCNN_IMPL_FP16S void innerproduct_fp16s_sse_f16c(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int activation_type, const Mat& activation_params, const Option& opt) { diff --git a/src/layer/x86/layernorm_x86.cpp b/src/layer/x86/layernorm_x86.cpp index 3f6a66a5ec03..27ded2b4e3ab 100644 --- a/src/layer/x86/layernorm_x86.cpp +++ b/src/layer/x86/layernorm_x86.cpp @@ -119,7 +119,7 @@ static NCNN_FORCEINLINE void fast_mean(float* ptr, float* mean, int elempack, in #if __SSE2__ #if __AVX__ #if __AVX512F__ - sum += _mm512_reduce_add_ps(_sum_512); + sum += _mm512_comp_reduce_add_ps(_sum_512); #endif // __AVX512F__ sum += _mm256_reduce_add_ps(_sum_256); #endif // __AVX__ @@ -230,7 +230,7 @@ static NCNN_FORCEINLINE void fast_var(float* ptr, float* var, const float* mean, #if __SSE2__ #if __AVX__ #if __AVX512F__ - sq_sum += _mm512_reduce_add_ps(_sq_sum_512); + sq_sum += _mm512_comp_reduce_add_ps(_sq_sum_512); #endif // __AVX512F__ sq_sum += _mm256_reduce_add_ps(_sq_sum_256); #endif // __AVX__ diff --git a/src/layer/x86/lstm_x86.cpp b/src/layer/x86/lstm_x86.cpp index 59124f7907ad..53c8bfe902be 100644 --- a/src/layer/x86/lstm_x86.cpp +++ b/src/layer/x86/lstm_x86.cpp @@ -14,6 +14,13 @@ #include "lstm_x86.h" +#if __SSE2__ +#include +#if __AVX__ +#include +#endif +#endif // __SSE2__ + #include "x86_activation.h" #include "x86_usability.h" @@ -30,23 +37,183 @@ LSTM_x86::LSTM_x86() int LSTM_x86::create_pipeline(const Option& opt) { - (void)(opt); + // pack IFOG + int num_directions = direction == 2 ? 
2 : 1; + int size = weight_data_size / num_directions / hidden_size / 4; + +#if __AVX__ + weight_xc_data_packed.create(size, hidden_size / 2 + hidden_size % 2, num_directions, 32u, 8); + bias_c_data_packed.create(hidden_size, 1, num_directions, 16u, 4); + weight_hc_data_packed.create(num_output, hidden_size / 2 + hidden_size % 2, num_directions, 32u, 8); +#else + weight_xc_data_packed.create(size, hidden_size, num_directions, 16u, 4); + bias_c_data_packed.create(hidden_size, 1, num_directions, 16u, 4); + weight_hc_data_packed.create(num_output, hidden_size, num_directions, 16u, 4); +#endif + + #pragma omp parallel for num_threads(opt.num_threads) + for (int dr = 0; dr < num_directions; dr++) + { + const Mat weight_xc = weight_xc_data.channel(dr); + const Mat bias_c = bias_c_data.channel(dr); + const Mat weight_hc = weight_hc_data.channel(dr); + + Mat weight_xc_data_packed_dr = weight_xc_data_packed.channel(dr); + Mat bias_c_data_packed_dr = bias_c_data_packed.channel(dr); + Mat weight_hc_data_packed_dr = weight_hc_data_packed.channel(dr); + + const float* bias_c_I = bias_c.row(0); + const float* bias_c_F = bias_c.row(1); + const float* bias_c_O = bias_c.row(2); + const float* bias_c_G = bias_c.row(3); + + float* bias_c_IFOG = bias_c_data_packed_dr.row(0); + + int q = 0; +#if __AVX__ + for (; q + 1 < hidden_size; q += 2) + { + bias_c_IFOG[0] = bias_c_I[q]; + bias_c_IFOG[1] = bias_c_F[q]; + bias_c_IFOG[2] = bias_c_O[q]; + bias_c_IFOG[3] = bias_c_G[q]; + bias_c_IFOG[4] = bias_c_I[q + 1]; + bias_c_IFOG[5] = bias_c_F[q + 1]; + bias_c_IFOG[6] = bias_c_O[q + 1]; + bias_c_IFOG[7] = bias_c_G[q + 1]; + + bias_c_IFOG += 8; + + const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q); + const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q); + const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q); + const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q); + const float* weight_xc_I_1 = weight_xc.row(hidden_size * 0 + q + 1); + const float* weight_xc_F_1 = weight_xc.row(hidden_size * 1 + q + 1); + const float* weight_xc_O_1 = weight_xc.row(hidden_size * 2 + q + 1); + const float* weight_xc_G_1 = weight_xc.row(hidden_size * 3 + q + 1); + + const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q); + const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q); + const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q); + const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q); + const float* weight_hc_I_1 = weight_hc.row(hidden_size * 0 + q + 1); + const float* weight_hc_F_1 = weight_hc.row(hidden_size * 1 + q + 1); + const float* weight_hc_O_1 = weight_hc.row(hidden_size * 2 + q + 1); + const float* weight_hc_G_1 = weight_hc.row(hidden_size * 3 + q + 1); + + float* weight_xc_IFOG = weight_xc_data_packed_dr.row(q / 2); + float* weight_hc_IFOG = weight_hc_data_packed_dr.row(q / 2); + + for (int i = 0; i < size; i++) + { + weight_xc_IFOG[0] = weight_xc_I[i]; + weight_xc_IFOG[1] = weight_xc_F[i]; + weight_xc_IFOG[2] = weight_xc_O[i]; + weight_xc_IFOG[3] = weight_xc_G[i]; + weight_xc_IFOG[4] = weight_xc_I_1[i]; + weight_xc_IFOG[5] = weight_xc_F_1[i]; + weight_xc_IFOG[6] = weight_xc_O_1[i]; + weight_xc_IFOG[7] = weight_xc_G_1[i]; + + weight_xc_IFOG += 8; + } + + for (int i = 0; i < num_output; i++) + { + weight_hc_IFOG[0] = weight_hc_I[i]; + weight_hc_IFOG[1] = weight_hc_F[i]; + weight_hc_IFOG[2] = weight_hc_O[i]; + weight_hc_IFOG[3] = weight_hc_G[i]; + weight_hc_IFOG[4] = weight_hc_I_1[i]; + weight_hc_IFOG[5] = weight_hc_F_1[i]; + weight_hc_IFOG[6] = 
weight_hc_O_1[i]; + weight_hc_IFOG[7] = weight_hc_G_1[i]; + + weight_hc_IFOG += 8; + } + } +#endif // __AVX__ + for (; q < hidden_size; q++) + { + bias_c_IFOG[0] = bias_c_I[q]; + bias_c_IFOG[1] = bias_c_F[q]; + bias_c_IFOG[2] = bias_c_O[q]; + bias_c_IFOG[3] = bias_c_G[q]; + + bias_c_IFOG += 4; + + const float* weight_xc_I = weight_xc.row(hidden_size * 0 + q); + const float* weight_xc_F = weight_xc.row(hidden_size * 1 + q); + const float* weight_xc_O = weight_xc.row(hidden_size * 2 + q); + const float* weight_xc_G = weight_xc.row(hidden_size * 3 + q); + + const float* weight_hc_I = weight_hc.row(hidden_size * 0 + q); + const float* weight_hc_F = weight_hc.row(hidden_size * 1 + q); + const float* weight_hc_O = weight_hc.row(hidden_size * 2 + q); + const float* weight_hc_G = weight_hc.row(hidden_size * 3 + q); + +#if __AVX__ + float* weight_xc_IFOG = weight_xc_data_packed_dr.row(q / 2 + q % 2); + float* weight_hc_IFOG = weight_hc_data_packed_dr.row(q / 2 + q % 2); +#else + float* weight_xc_IFOG = weight_xc_data_packed_dr.row(q); + float* weight_hc_IFOG = weight_hc_data_packed_dr.row(q); +#endif + + for (int i = 0; i < size; i++) + { + weight_xc_IFOG[0] = weight_xc_I[i]; + weight_xc_IFOG[1] = weight_xc_F[i]; + weight_xc_IFOG[2] = weight_xc_O[i]; + weight_xc_IFOG[3] = weight_xc_G[i]; + + weight_xc_IFOG += 4; + } + + for (int i = 0; i < num_output; i++) + { + weight_hc_IFOG[0] = weight_hc_I[i]; + weight_hc_IFOG[1] = weight_hc_F[i]; + weight_hc_IFOG[2] = weight_hc_O[i]; + weight_hc_IFOG[3] = weight_hc_G[i]; + + weight_hc_IFOG += 4; + } + } + } + + if (opt.lightmode) + { + weight_xc_data.release(); + bias_c_data.release(); + weight_hc_data.release(); + } return 0; } -#ifdef __AVX__ -static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, Mat& cell_state, const Option& opt) + +static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, const Mat& weight_hr, Mat& hidden_state, Mat& cell_state, const Option& opt) { int size = bottom_blob.w; int T = bottom_blob.h; int num_output = top_blob.w; + int hidden_size = cell_state.w; - // 4 x num_output - Mat gates(num_output, 4, 4u, opt.workspace_allocator); + // 4 x hidden_size + Mat gates(4, hidden_size, 4u, opt.workspace_allocator); if (gates.empty()) return -100; + Mat tmp_hidden_state; + if (num_output != hidden_size) + { + tmp_hidden_state.create(hidden_size, 4u, opt.workspace_allocator); + if (tmp_hidden_state.empty()) + return -100; + } + // unroll for (int t = 0; t < T; t++) { @@ -59,267 +226,222 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w int ti = reverse ? 
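The create_pipeline packing above interleaves the four gate weights so that each input element's I/F/O/G coefficients are adjacent in memory (and, under AVX, two output rows are fused into one 8-float group). A scalar sketch of the non-AVX 4-float interleave, with illustrative names and std::vector standing in for ncnn::Mat:

    #include <vector>

    // Interleave four per-gate weight rows into groups of 4 floats per input element.
    static void pack_ifog(const float* w_I, const float* w_F, const float* w_O, const float* w_G,
                          int size, std::vector<float>& packed)
    {
        packed.resize(size * 4);
        float* p = packed.data();
        for (int i = 0; i < size; i++)
        {
            p[0] = w_I[i];
            p[1] = w_F[i];
            p[2] = w_O[i];
            p[3] = w_G[i];
            p += 4;
        }
    }

The point of the layout is that the forward pass can then load one contiguous vector of gate weights per input element instead of gathering from four separate rows.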
T - 1 - t : t; - int nn_num_output = num_output >> 1; - int remain_num_output_start = nn_num_output << 1; +#if __AVX__ + int nn_hidden_size = hidden_size >> 1; + int remain_hidden_size_start = nn_hidden_size << 1; #pragma omp parallel for num_threads(opt.num_threads) - for (int qq = 0; qq < nn_num_output; qq++) + for (int qq = 0; qq < nn_hidden_size; qq++) { int q = qq * 2; - const float* x = bottom_blob.row(ti); - const float* hidden_ptr_r = hidden_state; - const float* bias_c_I = bias_c.row(0); - const float* bias_c_F = bias_c.row(1); - const float* bias_c_O = bias_c.row(2); - const float* bias_c_G = bias_c.row(3); - - float* gates_data_I = gates.row(0); - float* gates_data_F = gates.row(1); - float* gates_data_O = gates.row(2); - float* gates_data_G = gates.row(3); + const float* bias_c_IFOG = (const float*)bias_c + q * 4; + // gate I F O G - const float* weight_xc_I_0 = weight_xc.row(num_output * 0 + q); - const float* weight_xc_F_0 = weight_xc.row(num_output * 1 + q); - const float* weight_xc_O_0 = weight_xc.row(num_output * 2 + q); - const float* weight_xc_G_0 = weight_xc.row(num_output * 3 + q); - const float* weight_xc_I_1 = weight_xc.row(num_output * 0 + (q + 1)); - const float* weight_xc_F_1 = weight_xc.row(num_output * 1 + (q + 1)); - const float* weight_xc_O_1 = weight_xc.row(num_output * 2 + (q + 1)); - const float* weight_xc_G_1 = weight_xc.row(num_output * 3 + (q + 1)); - - const float* weight_hc_I_0 = weight_hc.row(num_output * 0 + q); - const float* weight_hc_F_0 = weight_hc.row(num_output * 1 + q); - const float* weight_hc_O_0 = weight_hc.row(num_output * 2 + q); - const float* weight_hc_G_0 = weight_hc.row(num_output * 3 + q); - const float* weight_hc_I_1 = weight_hc.row(num_output * 0 + (q + 1)); - const float* weight_hc_F_1 = weight_hc.row(num_output * 1 + (q + 1)); - const float* weight_hc_O_1 = weight_hc.row(num_output * 2 + (q + 1)); - const float* weight_hc_G_1 = weight_hc.row(num_output * 3 + (q + 1)); - - // float I = bias_c_I[q]; - // float F = bias_c_F[q]; - // float O = bias_c_O[q]; - // float G = bias_c_G[q]; - __m256 _sumI_0 = _mm256_setzero_ps(); - __m256 _sumF_0 = _mm256_setzero_ps(); - __m256 _sumO_0 = _mm256_setzero_ps(); - __m256 _sumG_0 = _mm256_setzero_ps(); - __m256 _sumI_1 = _mm256_setzero_ps(); - __m256 _sumF_1 = _mm256_setzero_ps(); - __m256 _sumO_1 = _mm256_setzero_ps(); - __m256 _sumG_1 = _mm256_setzero_ps(); - int nn_num_size = size >> 3; - int remain_size = size & 7; - for (; nn_num_size > 0; nn_num_size--) + const float* weight_xc_IFOG = weight_xc.row(q / 2); + const float* weight_hc_IFOG = weight_hc.row(q / 2); + + __m256 _IFOG = _mm256_loadu_ps(bias_c_IFOG); + __m256 _sum1 = _mm256_setzero_ps(); + __m256 _sum2 = _mm256_setzero_ps(); + __m256 _sum3 = _mm256_setzero_ps(); + + const float* x = bottom_blob.row(ti); + + int i = 0; + for (; i + 3 < size; i += 4) { - __m256 xi = _mm256_loadu_ps(x); - _sumI_0 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_I_0), xi, _sumI_0); - _sumF_0 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_F_0), xi, _sumF_0); - _sumO_0 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_O_0), xi, _sumO_0); - _sumG_0 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_G_0), xi, _sumG_0); - _sumI_1 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_I_1), xi, _sumI_1); - _sumF_1 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_F_1), xi, _sumF_1); - _sumO_1 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_O_1), xi, _sumO_1); - _sumG_1 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_G_1), xi, _sumG_1); - x += 8; - weight_xc_I_0 += 
8; - weight_xc_F_0 += 8; - weight_xc_O_0 += 8; - weight_xc_G_0 += 8; - weight_xc_I_1 += 8; - weight_xc_F_1 += 8; - weight_xc_O_1 += 8; - weight_xc_G_1 += 8; + __m256 _xi0 = _mm256_broadcast_ss(x); + __m256 _xi1 = _mm256_broadcast_ss(x + 1); + __m256 _xi2 = _mm256_broadcast_ss(x + 2); + __m256 _xi3 = _mm256_broadcast_ss(x + 3); + __m256 _weight_xc_IFOG0 = _mm256_loadu_ps(weight_xc_IFOG); + __m256 _weight_xc_IFOG1 = _mm256_loadu_ps(weight_xc_IFOG + 8); + __m256 _weight_xc_IFOG2 = _mm256_loadu_ps(weight_xc_IFOG + 16); + __m256 _weight_xc_IFOG3 = _mm256_loadu_ps(weight_xc_IFOG + 24); + _IFOG = _mm256_comp_fmadd_ps(_weight_xc_IFOG0, _xi0, _IFOG); + _sum1 = _mm256_comp_fmadd_ps(_weight_xc_IFOG1, _xi1, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_weight_xc_IFOG2, _xi2, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_weight_xc_IFOG3, _xi3, _sum3); + + x += 4; + weight_xc_IFOG += 32; } - int nn_num_output = num_output >> 3; - int remain_num_output = num_output & 7; - for (; nn_num_output > 0; nn_num_output--) + for (; i < size; i++) { - __m256 h_cont = _mm256_loadu_ps(hidden_ptr_r); - - _sumI_0 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_I_0), h_cont, _sumI_0); - _sumF_0 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_F_0), h_cont, _sumF_0); - _sumO_0 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_O_0), h_cont, _sumO_0); - _sumG_0 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_G_0), h_cont, _sumG_0); - _sumI_1 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_I_1), h_cont, _sumI_1); - _sumF_1 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_F_1), h_cont, _sumF_1); - _sumO_1 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_O_1), h_cont, _sumO_1); - _sumG_1 = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_G_1), h_cont, _sumG_1); - hidden_ptr_r += 8; - weight_hc_I_0 += 8; - weight_hc_F_0 += 8; - weight_hc_O_0 += 8; - weight_hc_G_0 += 8; - weight_hc_I_1 += 8; - weight_hc_F_1 += 8; - weight_hc_O_1 += 8; - weight_hc_G_1 += 8; + __m256 _xi = _mm256_broadcast_ss(x); + __m256 _weight_xc_IFOG = _mm256_loadu_ps(weight_xc_IFOG); + _IFOG = _mm256_comp_fmadd_ps(_weight_xc_IFOG, _xi, _IFOG); + + x += 1; + weight_xc_IFOG += 8; } - float sums[8]; - _mm256_storeu_ps(sums, HorizontalSums(_sumI_0, _sumF_0, _sumO_0, _sumG_0, _sumI_1, _sumF_1, _sumO_1, _sumG_1)); - sums[0] += bias_c_I[q]; - sums[1] += bias_c_F[q]; - sums[2] += bias_c_O[q]; - sums[3] += bias_c_G[q]; - sums[4] += bias_c_I[q + 1]; - sums[5] += bias_c_F[q + 1]; - sums[6] += bias_c_O[q + 1]; - sums[7] += bias_c_G[q + 1]; - - for (; remain_size > 0; remain_size--) + + const float* hidden_ptr = hidden_state; + + i = 0; + for (; i + 3 < num_output; i += 4) { - float xi = *x; - sums[0] += *weight_xc_I_0 * xi; - sums[1] += *weight_xc_F_0 * xi; - sums[2] += *weight_xc_O_0 * xi; - sums[3] += *weight_xc_G_0 * xi; - sums[4] += *weight_xc_I_1 * xi; - sums[5] += *weight_xc_F_1 * xi; - sums[6] += *weight_xc_O_1 * xi; - sums[7] += *weight_xc_G_1 * xi; - x++; - weight_xc_I_0++; - weight_xc_F_0++; - weight_xc_O_0++; - weight_xc_G_0++; - weight_xc_I_1++; - weight_xc_F_1++; - weight_xc_O_1++; - weight_xc_G_1++; + __m256 _h_cont0 = _mm256_broadcast_ss(hidden_ptr); + __m256 _h_cont1 = _mm256_broadcast_ss(hidden_ptr + 1); + __m256 _h_cont2 = _mm256_broadcast_ss(hidden_ptr + 2); + __m256 _h_cont3 = _mm256_broadcast_ss(hidden_ptr + 3); + __m256 _weight_hc_IFOG0 = _mm256_loadu_ps(weight_hc_IFOG); + __m256 _weight_hc_IFOG1 = _mm256_loadu_ps(weight_hc_IFOG + 8); + __m256 _weight_hc_IFOG2 = _mm256_loadu_ps(weight_hc_IFOG + 16); + __m256 _weight_hc_IFOG3 = _mm256_loadu_ps(weight_hc_IFOG + 
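The rewritten gate loop broadcasts one scalar input (or previous hidden value) at a time and multiply-accumulates it against a contiguous block of packed IFOG weights, keeping four independent accumulators so consecutive FMAs do not stall on each other. A reduced single-accumulator, SSE-width sketch of the same idea, matching the pack_ifog layout sketched earlier (names are illustrative):

    #include <emmintrin.h>

    // Accumulate the 4 gate pre-activations (I,F,O,G) for one output unit.
    // packed_w holds 4 interleaved gate weights per input element; ifog starts at the packed bias.
    static void gate_ifog_sse(const float* x, const float* packed_w, int size, float* ifog /* 4 floats */)
    {
        __m128 _acc = _mm_loadu_ps(ifog);
        for (int i = 0; i < size; i++)
        {
            __m128 _xi = _mm_set1_ps(x[i]);               // broadcast one input value
            __m128 _w = _mm_loadu_ps(packed_w + i * 4);   // its 4 gate weights
            _acc = _mm_add_ps(_acc, _mm_mul_ps(_xi, _w)); // fmadd where available
        }
        _mm_storeu_ps(ifog, _acc);
    }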
24); + _IFOG = _mm256_comp_fmadd_ps(_weight_hc_IFOG0, _h_cont0, _IFOG); + _sum1 = _mm256_comp_fmadd_ps(_weight_hc_IFOG1, _h_cont1, _sum1); + _sum2 = _mm256_comp_fmadd_ps(_weight_hc_IFOG2, _h_cont2, _sum2); + _sum3 = _mm256_comp_fmadd_ps(_weight_hc_IFOG3, _h_cont3, _sum3); + + hidden_ptr += 4; + weight_hc_IFOG += 32; } - - for (; remain_num_output > 0; remain_num_output--) + for (; i < num_output; i++) { - float h_cont = *hidden_ptr_r; - sums[0] += *weight_hc_I_0 * h_cont; - sums[1] += *weight_hc_F_0 * h_cont; - sums[2] += *weight_hc_O_0 * h_cont; - sums[3] += *weight_hc_G_0 * h_cont; - sums[4] += *weight_hc_I_1 * h_cont; - sums[5] += *weight_hc_F_1 * h_cont; - sums[6] += *weight_hc_O_1 * h_cont; - sums[7] += *weight_hc_G_1 * h_cont; - hidden_ptr_r++; - weight_hc_I_0++; - weight_hc_F_0++; - weight_hc_O_0++; - weight_hc_G_0++; - weight_hc_I_1++; - weight_hc_F_1++; - weight_hc_O_1++; - weight_hc_G_1++; + __m256 _h_cont = _mm256_broadcast_ss(hidden_ptr); + __m256 _weight_hc_IFOG = _mm256_loadu_ps(weight_hc_IFOG); + _IFOG = _mm256_comp_fmadd_ps(_weight_hc_IFOG, _h_cont, _IFOG); + + hidden_ptr += 1; + weight_hc_IFOG += 8; } - gates_data_I[q] = sums[0]; - gates_data_F[q] = sums[1]; - gates_data_O[q] = sums[2]; - gates_data_G[q] = sums[3]; - gates_data_I[q + 1] = sums[4]; - gates_data_F[q + 1] = sums[5]; - gates_data_O[q + 1] = sums[6]; - gates_data_G[q + 1] = sums[7]; + + float* gates_data = gates.row(q); + + _IFOG = _mm256_add_ps(_IFOG, _sum1); + _sum2 = _mm256_add_ps(_sum2, _sum3); + _IFOG = _mm256_add_ps(_IFOG, _sum2); + + _mm256_storeu_ps(gates_data, _IFOG); } +#else + int nn_hidden_size = 0; + int remain_hidden_size_start = 0; +#endif // __AVX__ + #pragma omp parallel for num_threads(opt.num_threads) - for (int q = remain_num_output_start; q < num_output; q++) + for (int q = remain_hidden_size_start; q < hidden_size; q++) { - const float* x = bottom_blob.row(ti); - const float* hidden_ptr_r = hidden_state; - const float* bias_c_I = bias_c.row(0); - const float* bias_c_F = bias_c.row(1); - const float* bias_c_O = bias_c.row(2); - const float* bias_c_G = bias_c.row(3); - - float* gates_data_I = gates.row(0); - float* gates_data_F = gates.row(1); - float* gates_data_O = gates.row(2); - float* gates_data_G = gates.row(3); + const float* bias_c_IFOG = (const float*)bias_c + q * 4; + // gate I F O G - const float* weight_xc_I = weight_xc.row(num_output * 0 + q); - const float* weight_xc_F = weight_xc.row(num_output * 1 + q); - const float* weight_xc_O = weight_xc.row(num_output * 2 + q); - const float* weight_xc_G = weight_xc.row(num_output * 3 + q); - - const float* weight_hc_I = weight_hc.row(num_output * 0 + q); - const float* weight_hc_F = weight_hc.row(num_output * 1 + q); - const float* weight_hc_O = weight_hc.row(num_output * 2 + q); - const float* weight_hc_G = weight_hc.row(num_output * 3 + q); - - // float I = bias_c_I[q]; - // float F = bias_c_F[q]; - // float O = bias_c_O[q]; - // float G = bias_c_G[q]; - __m256 _sumI = _mm256_setzero_ps(); - __m256 _sumF = _mm256_setzero_ps(); - __m256 _sumO = _mm256_setzero_ps(); - __m256 _sumG = _mm256_setzero_ps(); - int nn_num_size = size >> 3; - int remain_size = size & 7; - for (; nn_num_size > 0; nn_num_size--) +#if __AVX__ + const float* weight_xc_IFOG = weight_xc.row(q / 2 + q % 2); + const float* weight_hc_IFOG = weight_hc.row(q / 2 + q % 2); +#else + const float* weight_xc_IFOG = weight_xc.row(q); + const float* weight_hc_IFOG = weight_hc.row(q); +#endif + +#if __SSE2__ + __m128 _IFOG = _mm_loadu_ps(bias_c_IFOG); + __m128 _sum1 = 
_mm_setzero_ps(); + __m128 _sum2 = _mm_setzero_ps(); + __m128 _sum3 = _mm_setzero_ps(); +#else // __SSE2__ + float I = bias_c_IFOG[0]; + float F = bias_c_IFOG[1]; + float O = bias_c_IFOG[2]; + float G = bias_c_IFOG[3]; +#endif // __SSE2__ + + const float* x = bottom_blob.row(ti); + + int i = 0; +#if __SSE2__ + for (; i + 3 < size; i += 4) { - __m256 xi = _mm256_loadu_ps(x); - _sumI = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_I), xi, _sumI); - _sumF = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_F), xi, _sumF); - _sumO = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_O), xi, _sumO); - _sumG = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_xc_G), xi, _sumG); - x += 8; - weight_xc_I += 8; - weight_xc_F += 8; - weight_xc_O += 8; - weight_xc_G += 8; + __m128 _xi0 = _mm_load1_ps(x); + __m128 _xi1 = _mm_load1_ps(x + 1); + __m128 _xi2 = _mm_load1_ps(x + 2); + __m128 _xi3 = _mm_load1_ps(x + 3); + __m128 _weight_xc_IFOG0 = _mm_loadu_ps(weight_xc_IFOG); + __m128 _weight_xc_IFOG1 = _mm_loadu_ps(weight_xc_IFOG + 4); + __m128 _weight_xc_IFOG2 = _mm_loadu_ps(weight_xc_IFOG + 8); + __m128 _weight_xc_IFOG3 = _mm_loadu_ps(weight_xc_IFOG + 12); + _IFOG = _mm_comp_fmadd_ps(_weight_xc_IFOG0, _xi0, _IFOG); + _sum1 = _mm_comp_fmadd_ps(_weight_xc_IFOG1, _xi1, _sum1); + _sum2 = _mm_comp_fmadd_ps(_weight_xc_IFOG2, _xi2, _sum2); + _sum3 = _mm_comp_fmadd_ps(_weight_xc_IFOG3, _xi3, _sum3); + + x += 4; + weight_xc_IFOG += 16; } - int nn_num_output = num_output >> 3; - int remain_num_output = num_output & 7; - for (; nn_num_output > 0; nn_num_output--) +#endif // __SSE2__ + for (; i < size; i++) { - __m256 h_cont = _mm256_loadu_ps(hidden_ptr_r); - - _sumI = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_I), h_cont, _sumI); - _sumF = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_F), h_cont, _sumF); - _sumO = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_O), h_cont, _sumO); - _sumG = _mm256_comp_fmadd_ps(_mm256_loadu_ps(weight_hc_G), h_cont, _sumG); - hidden_ptr_r += 8; - weight_hc_I += 8; - weight_hc_F += 8; - weight_hc_O += 8; - weight_hc_G += 8; +#if __SSE2__ + __m128 _xi = _mm_load1_ps(x); + __m128 _weight_xc_IFOG = _mm_loadu_ps(weight_xc_IFOG); + _IFOG = _mm_comp_fmadd_ps(_weight_xc_IFOG, _xi, _IFOG); +#else // __SSE2__ + float xi = x[0]; + I += xi * weight_xc_IFOG[0]; + F += xi * weight_xc_IFOG[1]; + O += xi * weight_xc_IFOG[2]; + G += xi * weight_xc_IFOG[3]; +#endif // __SSE2__ + + x += 1; + weight_xc_IFOG += 4; } - float sums[4]; - _mm_storeu_ps(sums, HorizontalSums(_sumI, _sumF, _sumO, _sumG)); - sums[0] += bias_c_I[q]; - sums[1] += bias_c_F[q]; - sums[2] += bias_c_O[q]; - sums[3] += bias_c_G[q]; - - for (; remain_size > 0; remain_size--) + + const float* hidden_ptr = hidden_state; + + i = 0; +#if __SSE2__ + for (; i + 3 < num_output; i += 4) { - float xi = *x; - sums[0] += *weight_xc_I * xi; - sums[1] += *weight_xc_F * xi; - sums[2] += *weight_xc_O * xi; - sums[3] += *weight_xc_G * xi; - x++; - weight_xc_I++; - weight_xc_F++; - weight_xc_O++; - weight_xc_G++; + __m128 _h_cont0 = _mm_load1_ps(hidden_ptr); + __m128 _h_cont1 = _mm_load1_ps(hidden_ptr + 1); + __m128 _h_cont2 = _mm_load1_ps(hidden_ptr + 2); + __m128 _h_cont3 = _mm_load1_ps(hidden_ptr + 3); + __m128 _weight_hc_IFOG0 = _mm_loadu_ps(weight_hc_IFOG); + __m128 _weight_hc_IFOG1 = _mm_loadu_ps(weight_hc_IFOG + 4); + __m128 _weight_hc_IFOG2 = _mm_loadu_ps(weight_hc_IFOG + 8); + __m128 _weight_hc_IFOG3 = _mm_loadu_ps(weight_hc_IFOG + 12); + _IFOG = _mm_comp_fmadd_ps(_weight_hc_IFOG0, _h_cont0, _IFOG); + _sum1 = _mm_comp_fmadd_ps(_weight_hc_IFOG1, 
_h_cont1, _sum1); + _sum2 = _mm_comp_fmadd_ps(_weight_hc_IFOG2, _h_cont2, _sum2); + _sum3 = _mm_comp_fmadd_ps(_weight_hc_IFOG3, _h_cont3, _sum3); + + hidden_ptr += 4; + weight_hc_IFOG += 16; } - - for (; remain_num_output > 0; remain_num_output--) +#endif // __SSE2__ + for (; i < num_output; i++) { - float h_cont = *hidden_ptr_r; - sums[0] += *weight_hc_I * h_cont; - sums[1] += *weight_hc_F * h_cont; - sums[2] += *weight_hc_O * h_cont; - sums[3] += *weight_hc_G * h_cont; - hidden_ptr_r++; - weight_hc_I++; - weight_hc_F++; - weight_hc_O++; - weight_hc_G++; +#if __SSE2__ + __m128 _h_cont = _mm_load1_ps(hidden_ptr); + __m128 _weight_hc_IFOG = _mm_loadu_ps(weight_hc_IFOG); + _IFOG = _mm_comp_fmadd_ps(_weight_hc_IFOG, _h_cont, _IFOG); +#else // __SSE2__ + float h_cont = hidden_ptr[0]; + I += h_cont * weight_hc_IFOG[0]; + F += h_cont * weight_hc_IFOG[1]; + O += h_cont * weight_hc_IFOG[2]; + G += h_cont * weight_hc_IFOG[3]; +#endif // __SSE2__ + + hidden_ptr += 1; + weight_hc_IFOG += 4; } - gates_data_I[q] = sums[0]; - gates_data_F[q] = sums[1]; - gates_data_O[q] = sums[2]; - gates_data_G[q] = sums[3]; + + float* gates_data = gates.row(q); + +#if __SSE2__ + _IFOG = _mm_add_ps(_IFOG, _sum1); + _sum2 = _mm_add_ps(_sum2, _sum3); + _IFOG = _mm_add_ps(_IFOG, _sum2); + + _mm_storeu_ps(gates_data, _IFOG); +#else // __SSE2__ + gates_data[0] = I; + gates_data[1] = F; + gates_data[2] = O; + gates_data[3] = G; +#endif // __SSE2__ } // lstm unit @@ -330,69 +452,117 @@ static int lstm(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& w // c_t := f_t .* c_{t-1} + i_t .* g_t // h_t := o_t .* tanh[c_t] float* output_data = top_blob.row(ti); + float* cell_ptr = cell_state; float* hidden_ptr = hidden_state; - const float* gates_data_I = gates.row(0); - const float* gates_data_F = gates.row(1); - const float* gates_data_O = gates.row(2); - const float* gates_data_G = gates.row(3); - int nn_activation = num_output >> 3; - int remain_activations = num_output & 7; - for (; nn_activation > 0; nn_activation--) + float* tmp_hidden_ptr = tmp_hidden_state; + +#if __SSE2__ + nn_hidden_size = hidden_size >> 2; + remain_hidden_size_start = nn_hidden_size << 2; + #pragma omp parallel for num_threads(opt.num_threads) + for (int qq = 0; qq < nn_hidden_size; qq++) { - __m256 I = sigmoid_avx(_mm256_loadu_ps(gates_data_I)); - __m256 F = sigmoid_avx(_mm256_loadu_ps(gates_data_F)); - __m256 O = sigmoid_avx(_mm256_loadu_ps(gates_data_O)); - __m256 G = tanh_avx(_mm256_loadu_ps(gates_data_G)); - __m256 cell2 = _mm256_add_ps(_mm256_mul_ps(F, _mm256_loadu_ps(cell_ptr)), _mm256_mul_ps(I, G)); - __m256 H = _mm256_mul_ps(O, tanh_avx(cell2)); - _mm256_storeu_ps(cell_ptr, cell2); - _mm256_storeu_ps(hidden_ptr, H); - _mm256_storeu_ps(output_data, H); - cell_ptr += 8; - output_data += 8; - hidden_ptr += 8; - gates_data_I += 8; - gates_data_F += 8; - gates_data_O += 8; - gates_data_G += 8; + int q = qq * 4; + + const float* gates_data = gates.row(q); + + __m128 _IFOG_4x4_0 = _mm_loadu_ps(gates_data); + __m128 _IFOG_4x4_1 = _mm_loadu_ps(gates_data + 4); + __m128 _IFOG_4x4_2 = _mm_loadu_ps(gates_data + 8); + __m128 _IFOG_4x4_3 = _mm_loadu_ps(gates_data + 12); + + _MM_TRANSPOSE4_PS(_IFOG_4x4_0, _IFOG_4x4_1, _IFOG_4x4_2, _IFOG_4x4_3); + + __m128 _I = sigmoid_sse(_IFOG_4x4_0); + __m128 _F = sigmoid_sse(_IFOG_4x4_1); + __m128 _O = sigmoid_sse(_IFOG_4x4_2); + __m128 _G = tanh_sse(_IFOG_4x4_3); + + __m128 _cell2 = _mm_add_ps(_mm_mul_ps(_F, _mm_loadu_ps(cell_ptr + q)), _mm_mul_ps(_I, _G)); + __m128 _H = _mm_mul_ps(_O, tanh_sse(_cell2)); + + 
_mm_storeu_ps(cell_ptr + q, _cell2); + + if (num_output == hidden_size) + { + _mm_storeu_ps(hidden_ptr + q, _H); + _mm_storeu_ps(output_data + q, _H); + } + else + { + _mm_storeu_ps(tmp_hidden_ptr + q, _H); + } } - for (; remain_activations > 0; remain_activations--) +#else // __SSE2__ + remain_hidden_size_start = 0; +#endif // __SSE2__ + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = remain_hidden_size_start; q < hidden_size; q++) { - float I = *gates_data_I; - float F = *gates_data_F; - float O = *gates_data_O; - float G = *gates_data_G; + const float* gates_data = gates.row(q); + + float I = gates_data[0]; + float F = gates_data[1]; + float O = gates_data[2]; + float G = gates_data[3]; I = 1.f / (1.f + exp(-I)); F = 1.f / (1.f + exp(-F)); O = 1.f / (1.f + exp(-O)); G = tanh(G); - float cell2 = F * *cell_ptr + I * G; + + float cell2 = F * cell_ptr[q] + I * G; float H = O * tanh(cell2); - *cell_ptr = cell2; - *hidden_ptr = H; - *output_data = H; - cell_ptr++; - output_data++; - hidden_ptr++; - gates_data_I++; - gates_data_F++; - gates_data_O++; - gates_data_G++; + + cell_ptr[q] = cell2; + if (num_output == hidden_size) + { + hidden_ptr[q] = H; + output_data[q] = H; + } + else + { + tmp_hidden_ptr[q] = H; + } } - // no cell output here + if (num_output != hidden_size) + { + // int nn_num_output = num_output >> 2; + // int remain_num_output_start = nn_num_output << 2; + // #pragma omp parallel for num_threads(opt.num_threads) + // for (int qq = 0; qq < nn_num_output; qq++) + // { + // int q = qq * 4; + // + // } + int remain_num_output_start = 0; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = remain_num_output_start; q < num_output; q++) + { + const float* hr = weight_hr.row(q); + const float* tmp_hidden_ptr = tmp_hidden_state; + + float H = 0; + for (int i = 0; i < hidden_size; i++) + { + H += tmp_hidden_ptr[i] * hr[i]; + } + + output_data[q] = H; + hidden_ptr[q] = H; + } + } } return 0; } -#endif int LSTM_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { -#if __AVX__ int T = bottom_blob.h; + int num_directions = direction == 2 ? 2 : 1; // initial hidden state @@ -400,8 +570,8 @@ int LSTM_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (hidden.empty()) return -100; hidden.fill(0.f); - // internal cell state - Mat cell(num_output, 4u, opt.workspace_allocator); + + Mat cell(hidden_size, 4u, opt.workspace_allocator); if (cell.empty()) return -100; cell.fill(0.f); @@ -413,7 +583,7 @@ int LSTM_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) // Uni directional if (direction == 0 || direction == 1) { - int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, cell, opt); + int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret != 0) return ret; } @@ -428,14 +598,14 @@ int LSTM_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) if (top_blob_reverse.empty()) return -100; - int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, cell, opt); + int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? 
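After the gate accumulation, the per-unit update is the standard LSTM recurrence; when num_output differs from hidden_size the new hidden values are first staged in tmp_hidden_state and then projected through weight_hr, as the code above shows. A plain scalar reference for the recurrence itself, assuming gates holds the raw I/F/O/G pre-activations in groups of four:

    #include <math.h>

    // Scalar reference for one time step; the code above vectorizes this 4 units at a time.
    static void lstm_unit(const float* gates /* 4*hidden_size */, float* cell, float* hidden,
                          float* out, int hidden_size)
    {
        for (int q = 0; q < hidden_size; q++)
        {
            float I = 1.f / (1.f + expf(-gates[q * 4 + 0]));
            float F = 1.f / (1.f + expf(-gates[q * 4 + 1]));
            float O = 1.f / (1.f + expf(-gates[q * 4 + 2]));
            float G = tanhf(gates[q * 4 + 3]);
            float c = F * cell[q] + I * G; // c_t = f .* c_{t-1} + i .* g
            float H = O * tanhf(c);        // h_t = o .* tanh(c_t)
            cell[q] = c;
            hidden[q] = H;
            out[q] = H;
        }
    }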
Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret0 != 0) return ret0; hidden.fill(0.0f); cell.fill(0.0f); - int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden, cell, opt); + int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? Mat() : weight_hr_data.channel(1), hidden, cell, opt); if (ret1 != 0) return ret1; @@ -452,14 +622,10 @@ int LSTM_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) } return 0; -#else - return LSTM::forward(bottom_blob, top_blob, opt); -#endif } int LSTM_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const { -#if __AVX__ const Mat& bottom_blob = bottom_blobs[0]; int T = bottom_blob.h; int num_directions = direction == 2 ? 2 : 1; @@ -479,7 +645,7 @@ int LSTM_x86::forward(const std::vector& bottom_blobs, std::vector& to return -100; hidden.fill(0.f); - cell.create(num_output, num_directions, 4u, hidden_cell_allocator); + cell.create(hidden_size, num_directions, 4u, hidden_cell_allocator); if (cell.empty()) return -100; cell.fill(0.f); @@ -493,7 +659,7 @@ int LSTM_x86::forward(const std::vector& bottom_blobs, std::vector& to // Uni directional if (direction == 0 || direction == 1) { - int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, cell, opt); + int ret = lstm(bottom_blob, top_blob, direction, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden, cell, opt); if (ret != 0) return ret; } @@ -510,15 +676,13 @@ int LSTM_x86::forward(const std::vector& bottom_blobs, std::vector& to Mat hidden0 = hidden.row_range(0, 1); Mat cell0 = cell.row_range(0, 1); - - int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, cell0, opt); + int ret0 = lstm(bottom_blob, top_blob_forward, 0, weight_xc_data_packed.channel(0), bias_c_data_packed.channel(0), weight_hc_data_packed.channel(0), num_output == hidden_size ? Mat() : weight_hr_data.channel(0), hidden0, cell0, opt); if (ret0 != 0) return ret0; Mat hidden1 = hidden.row_range(1, 1); Mat cell1 = cell.row_range(1, 1); - - int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, cell1, opt); + int ret1 = lstm(bottom_blob, top_blob_reverse, 1, weight_xc_data_packed.channel(1), bias_c_data_packed.channel(1), weight_hc_data_packed.channel(1), num_output == hidden_size ? 
Mat() : weight_hr_data.channel(1), hidden1, cell1, opt); if (ret1 != 0) return ret1; @@ -541,9 +705,6 @@ int LSTM_x86::forward(const std::vector& bottom_blobs, std::vector& to } return 0; -#else - return LSTM::forward(bottom_blobs, top_blobs, opt); -#endif } } // namespace ncnn diff --git a/src/layer/x86/lstm_x86.h b/src/layer/x86/lstm_x86.h index 51ffb4139164..cab7d7e32fae 100644 --- a/src/layer/x86/lstm_x86.h +++ b/src/layer/x86/lstm_x86.h @@ -31,6 +31,9 @@ class LSTM_x86 : virtual public LSTM virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; public: + Mat weight_xc_data_packed; + Mat bias_c_data_packed; + Mat weight_hc_data_packed; }; } // namespace ncnn diff --git a/src/layer/x86/packing_x86.cpp b/src/layer/x86/packing_x86.cpp index 38d0a941d134..df120ee0726d 100644 --- a/src/layer/x86/packing_x86.cpp +++ b/src/layer/x86/packing_x86.cpp @@ -235,7 +235,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m256 _row5 = _mm256_loadu_ps(r5); __m256 _row6 = _mm256_loadu_ps(r6); __m256 _row7 = _mm256_loadu_ps(r7); - transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + transpose8x8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); _mm256_storeu_ps(outptr, _row0); _mm256_storeu_ps(outptr + 8, _row1); _mm256_storeu_ps(outptr + 16, _row2); @@ -298,7 +298,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m256 _row5 = _mm256_loadu_ps(r0 + 40); __m256 _row6 = _mm256_loadu_ps(r0 + 48); __m256 _row7 = _mm256_loadu_ps(r0 + 56); - transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + transpose8x8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); _mm256_storeu_ps(outptr0, _row0); _mm256_storeu_ps(outptr1, _row1); _mm256_storeu_ps(outptr2, _row2); @@ -432,7 +432,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m512 _rd = _mm512_loadu_ps(rd); __m512 _re = _mm512_loadu_ps(re); __m512 _rf = _mm512_loadu_ps(rf); - transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); _mm512_storeu_ps(outptr, _r0); _mm512_storeu_ps(outptr + 16, _r1); _mm512_storeu_ps(outptr + 16 * 2, _r2); @@ -535,7 +535,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m512 _rd = _mm512_loadu_ps(r0 + 16 * 13); __m512 _re = _mm512_loadu_ps(r0 + 16 * 14); __m512 _rf = _mm512_loadu_ps(r0 + 16 * 15); - transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); _mm512_storeu_ps(outptr0, _r0); _mm512_storeu_ps(outptr1, _r1); _mm512_storeu_ps(outptr2, _r2); @@ -882,7 +882,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m256 _row5 = _mm256_loadu_ps(r5); __m256 _row6 = _mm256_loadu_ps(r6); __m256 _row7 = _mm256_loadu_ps(r7); - transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + transpose8x8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); _mm256_storeu_ps(outptr, _row0); _mm256_storeu_ps(outptr + 8, _row1); _mm256_storeu_ps(outptr + 16, _row2); @@ -945,7 +945,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m256 _row5 = _mm256_loadu_ps(r0 + 40); __m256 _row6 = _mm256_loadu_ps(r0 + 48); __m256 _row7 = 
_mm256_loadu_ps(r0 + 56); - transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + transpose8x8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); _mm256_storeu_ps(outptr0, _row0); _mm256_storeu_ps(outptr1, _row1); _mm256_storeu_ps(outptr2, _row2); @@ -1079,7 +1079,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m512 _rd = _mm512_loadu_ps(rd); __m512 _re = _mm512_loadu_ps(re); __m512 _rf = _mm512_loadu_ps(rf); - transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); _mm512_storeu_ps(outptr, _r0); _mm512_storeu_ps(outptr + 16, _r1); _mm512_storeu_ps(outptr + 16 * 2, _r2); @@ -1182,7 +1182,7 @@ int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m512 _rd = _mm512_loadu_ps(r0 + 16 * 13); __m512 _re = _mm512_loadu_ps(r0 + 16 * 14); __m512 _rf = _mm512_loadu_ps(r0 + 16 * 15); - transpose16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); + transpose16x16_ps(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _ra, _rb, _rc, _rd, _re, _rf); _mm512_storeu_ps(outptr0, _r0); _mm512_storeu_ps(outptr1, _r1); _mm512_storeu_ps(outptr2, _r2); diff --git a/src/layer/x86/prelu_x86.cpp b/src/layer/x86/prelu_x86.cpp index 240cfd849dc0..52334659e26a 100644 --- a/src/layer/x86/prelu_x86.cpp +++ b/src/layer/x86/prelu_x86.cpp @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. 
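The transpose8_ps/transpose16_ps call sites here are only renamed to transpose8x8_ps/transpose16x16_ps; the operation is an in-register square transpose used when converting between packed layouts. For reference, the 4-lane version of the same idea can be written with the standard _MM_TRANSPOSE4_PS macro (a small sketch, not the ncnn helper):

    #include <xmmintrin.h>

    // Transpose a 4x4 float tile in registers; the 8x8/16x16 helpers renamed in this
    // diff do the same thing at AVX/AVX-512 width.
    static void transpose4x4(const float* in /* 16 floats, row-major */, float* out)
    {
        __m128 r0 = _mm_loadu_ps(in + 0);
        __m128 r1 = _mm_loadu_ps(in + 4);
        __m128 r2 = _mm_loadu_ps(in + 8);
        __m128 r3 = _mm_loadu_ps(in + 12);
        _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
        _mm_storeu_ps(out + 0, r0);
        _mm_storeu_ps(out + 4, r1);
        _mm_storeu_ps(out + 8, r2);
        _mm_storeu_ps(out + 12, r3);
    }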
You may obtain a copy of the License at @@ -14,6 +14,12 @@ #include "prelu_x86.h" +#if __SSE2__ +#include +#if __AVX__ +#include +#endif // __AVX__ +#endif // __SSE2__ #include "x86_activation.h" namespace ncnn { @@ -28,219 +34,210 @@ PReLU_x86::PReLU_x86() int PReLU_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const { int dims = bottom_top_blob.dims; -#if __SSE2__ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; int elempack = bottom_top_blob.elempack; -#if __AVX__ -#if __AVX512F__ - if (elempack == 16) + if (dims == 1) { - Mat tmp; - convert_packing(bottom_top_blob, tmp, 8, opt); - - forward_inplace(tmp, opt); - - convert_packing(tmp, bottom_top_blob, 16, opt); - - return 0; - } -#endif // __AVX512F__ + const int size = w * elempack; - if (elempack == 8) - { - if (dims == 1) + if (num_slope > 1) { - int w = bottom_top_blob.w; + float* ptr = bottom_top_blob; + const float* slope = slope_data; - if (num_slope > 1) + int nn_size = 0; + int remain_size_start = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + nn_size = (size - remain_size_start) / 16; + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) { - const float* slope = slope_data; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float* ptr = (float*)bottom_top_blob + i * 8; - __m256 _p = _mm256_loadu_ps(ptr); - __m256 _slope = _mm256_loadu_ps(slope + i * 8); - _mm256_storeu_ps(ptr, prelu_avx(_p, _slope)); - } + int i = remain_size_start + ii * 16; + __m512 _p512 = _mm512_loadu_ps(ptr + i); + __m512 _slope512 = _mm512_loadu_ps(slope + i); + _mm512_storeu_ps(ptr + i, prelu_avx512(_p512, _slope512)); } - else + remain_size_start += nn_size * 16; +#endif // __AVX512F__ + nn_size = (size - remain_size_start) / 8; + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) { - __m256 _slope = _mm256_set1_ps(slope_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float* ptr = (float*)bottom_top_blob + i * 8; - __m256 _p = _mm256_loadu_ps(ptr); - _mm256_storeu_ps(ptr, prelu_avx(_p, _slope)); - } + int i = remain_size_start + ii * 8; + __m256 _p256 = _mm256_loadu_ps(ptr + i); + __m256 _slope256 = _mm256_loadu_ps(slope + i); + _mm256_storeu_ps(ptr + i, prelu_avx(_p256, _slope256)); } - } - - if (dims == 2) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - + remain_size_start += nn_size * 8; +#endif // __AVX__ + nn_size = (size - remain_size_start) / 4; #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + for (int ii = 0; ii < nn_size; ii++) { - float* ptr = bottom_top_blob.row(i); - __m256 _slope = num_slope > 1 ? 
_mm256_loadu_ps((const float*)slope_data + i * 8) : _mm256_set1_ps(slope_data[0]); - - for (int j = 0; j < w; j++) - { - __m256 _p = _mm256_loadu_ps(ptr); - _mm256_storeu_ps(ptr, prelu_avx(_p, _slope)); - ptr += 8; - } + int i = remain_size_start + ii * 4; + __m128 _p128 = _mm_load_ps(ptr + i); + __m128 _slope128 = _mm_load_ps(slope + i); + _mm_store_ps(ptr + i, prelu_sse(_p128, _slope128)); + } + remain_size_start += nn_size * 4; +#endif // __SSE2__ + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + if (ptr[i] < 0) + ptr[i] *= slope_data[i]; } } - - if (dims == 3) + else { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int channels = bottom_top_blob.c; - int size = w * h; + float* ptr = bottom_top_blob; + const float slope = slope_data[0]; + int nn_size = 0; + int remain_size_start = 0; +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + nn_size = (size - remain_size_start) / 16; #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + for (int ii = 0; ii < nn_size; ii++) { - float* ptr = bottom_top_blob.channel(q); - __m256 _slope = num_slope > 1 ? _mm256_loadu_ps((const float*)slope_data + q * 8) : _mm256_set1_ps(slope_data[0]); - - for (int i = 0; i < size; i++) - { - __m256 _p = _mm256_loadu_ps(ptr); - _mm256_storeu_ps(ptr, prelu_avx(_p, _slope)); - ptr += 8; - } + int i = remain_size_start + ii * 16; + __m512 _p512 = _mm512_loadu_ps(ptr + i); + __m512 _slope512 = _mm512_set1_ps(slope); + _mm512_storeu_ps(ptr + i, prelu_avx512(_p512, _slope512)); + } + remain_size_start += nn_size * 16; +#endif // __AVX512F__ + nn_size = (size - remain_size_start) / 8; + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 8; + __m256 _p256 = _mm256_loadu_ps(ptr + i); + __m256 _slope256 = _mm256_set1_ps(slope); + _mm256_storeu_ps(ptr + i, prelu_avx(_p256, _slope256)); + } + remain_size_start += nn_size * 8; +#endif // __AVX__ + nn_size = (size - remain_size_start) / 4; + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii = 0; ii < nn_size; ii++) + { + int i = remain_size_start + ii * 4; + __m128 _p128 = _mm_load_ps(ptr + i); + __m128 _slope128 = _mm_set1_ps(slope); + _mm_store_ps(ptr + i, prelu_sse(_p128, _slope128)); + } + remain_size_start += nn_size * 4; +#endif // __SSE2__ + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = remain_size_start; i < size; i++) + { + if (ptr[i] < 0) + ptr[i] *= slope; } } - - return 0; } -#endif // __AVX__ - if (elempack == 4) + if (dims == 2) { - if (dims == 1) + const int size = w * elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) { - int w = bottom_top_blob.w; + float* ptr = bottom_top_blob.row(i); + int j = 0; - if (num_slope > 1) - { - const float* slope = slope_data; + float slope = num_slope > 1 ? slope_data[i] : slope_data[0]; +#if __SSE2__ + __m128 _slope128 = num_slope > 1 && (elempack == 4) ? _mm_load_ps((const float*)slope_data + i * 4) : _mm_set1_ps(slope); +#if __AVX__ + __m256 _slope256 = num_slope > 1 && (elempack == 8) ? _mm256_loadu_ps((const float*)slope_data + i * 8) : _mm256_insertf128_ps(_mm256_castps128_ps256(_slope128), _slope128, 1); +#if __AVX512F__ + __m512 _slope512 = num_slope > 1 && (elempack == 16) ? 
_mm512_loadu_ps((const float*)slope_data + i * 16) : _mm512_insertf32x8(_mm512_castps256_ps512(_slope256), _slope256, 1); - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float* ptr = (float*)bottom_top_blob + i * 4; - __m128 _p = _mm_loadu_ps(ptr); - __m128 _slope = _mm_loadu_ps(slope + i * 4); - _mm_storeu_ps(ptr, prelu_sse(_p, _slope)); - } + for (; j + 15 < size; j += 16) + { + __m512 _p512 = _mm512_loadu_ps(ptr); + _mm512_storeu_ps(ptr, prelu_avx512(_p512, _slope512)); + ptr += 16; } - else +#endif // __AVX512F__ + for (; j + 7 < size; j += 8) { - __m128 _slope = _mm_set1_ps(slope_data[0]); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < w; i++) - { - float* ptr = (float*)bottom_top_blob + i * 4; - __m128 _p = _mm_loadu_ps(ptr); - _mm_storeu_ps(ptr, prelu_sse(_p, _slope)); - } + __m256 _p256 = _mm256_loadu_ps(ptr); + _mm256_storeu_ps(ptr, prelu_avx(_p256, _slope256)); + ptr += 8; } - } - - if (dims == 2) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) +#endif // __AVX__ + for (; j + 3 < size; j += 4) { - float* ptr = bottom_top_blob.row(i); - __m128 _slope = num_slope > 1 ? _mm_loadu_ps((const float*)slope_data + i * 4) : _mm_set1_ps(slope_data[0]); - - for (int j = 0; j < w; j++) - { - __m128 _p = _mm_loadu_ps(ptr); - _mm_storeu_ps(ptr, prelu_sse(_p, _slope)); - ptr += 4; - } + __m128 _p128 = _mm_loadu_ps(ptr); + _mm_storeu_ps(ptr, prelu_sse(_p128, _slope128)); + ptr += 4; } - } - - if (dims == 3) - { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int channels = bottom_top_blob.c; - int size = w * h; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#endif // __SSE2__ + for (; j < size; j++) { - float* ptr = bottom_top_blob.channel(q); - __m128 _slope = num_slope > 1 ? _mm_loadu_ps((const float*)slope_data + q * 4) : _mm_set1_ps(slope_data[0]); - - for (int i = 0; i < size; i++) - { - __m128 _p = _mm_loadu_ps(ptr); - _mm_storeu_ps(ptr, prelu_sse(_p, _slope)); - ptr += 4; - } + if (*ptr < 0) + *ptr *= slope; + ptr++; } } - - return 0; } -#endif // __SSE2__ - - if (dims != 3) - return PReLU::forward_inplace(bottom_top_blob, opt); - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int channels = bottom_top_blob.c; - int size = w * h; - - const float* slope_data_ptr = slope_data; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + if (dims == 3) { - float* ptr = bottom_top_blob.channel(q); - float slope = num_slope > 1 ? slope_data_ptr[q] : slope_data_ptr[0]; + const int size = w * h * elempack; -#if __AVX__ - int nn = size >> 3; - int remain = size - (nn << 3); -#else - int remain = size; -#endif // __AVX__ + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + float* ptr = bottom_top_blob.channel(q); + int i = 0; + float slope = num_slope > 1 ? slope_data[q] : slope_data[0]; +#if __SSE2__ + __m128 _slope128 = num_slope > 1 && (elempack == 4) ? _mm_load_ps((const float*)slope_data + q * 4) : _mm_set1_ps(slope); #if __AVX__ - for (; nn > 0; nn--) - { - __m256 _p = _mm256_loadu_ps(ptr); - _mm256_storeu_ps(ptr, prelu_avx(_p, _mm256_set1_ps(slope))); - ptr += 8; - } -#endif // __AVX__ - for (; remain > 0; remain--) - { - if (*ptr < 0) - *ptr *= slope; + __m256 _slope256 = num_slope > 1 && (elempack == 8) ? 
_mm256_loadu_ps((const float*)slope_data + q * 8) : _mm256_insertf128_ps(_mm256_castps128_ps256(_slope128), _slope128, 1); +#if __AVX512F__ + __m512 _slope512 = num_slope > 1 && (elempack == 16) ? _mm512_loadu_ps((const float*)slope_data + q * 16) : _mm512_insertf32x8(_mm512_castps256_ps512(_slope256), _slope256, 1); - ptr++; + for (; i + 15 < size; i += 16) + { + __m512 _p512 = _mm512_loadu_ps(ptr); + _mm512_storeu_ps(ptr, prelu_avx512(_p512, _slope512)); + ptr += 16; + } +#endif // __AVX512F__ + for (; i + 7 < size; i += 8) + { + __m256 _p256 = _mm256_loadu_ps(ptr); + _mm256_storeu_ps(ptr, prelu_avx(_p256, _slope256)); + ptr += 8; + } +#endif // __AVX__ + for (; i + 3 < size; i += 4) + { + __m128 _p128 = _mm_load_ps(ptr); + _mm_store_ps(ptr, prelu_sse(_p128, _slope128)); + ptr += 4; + } +#endif // __SSE2__ + for (; i < size; i++) + { + if (*ptr < 0) + *ptr *= slope; + ptr++; + } } } diff --git a/src/layer/x86/prelu_x86.h b/src/layer/x86/prelu_x86.h index d6d0e4509beb..6bbfeae0f0d0 100644 --- a/src/layer/x86/prelu_x86.h +++ b/src/layer/x86/prelu_x86.h @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at diff --git a/src/layer/x86/reshape_x86.cpp b/src/layer/x86/reshape_x86.cpp index 7d1c89b359c2..ab45ca647ade 100644 --- a/src/layer/x86/reshape_x86.cpp +++ b/src/layer/x86/reshape_x86.cpp @@ -208,7 +208,7 @@ int Reshape_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m512 _rowe = _mm512_loadu_ps(ptre); __m512 _rowf = _mm512_loadu_ps(ptrf); - transpose16_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7, _row8, _row9, _rowa, _rowb, _rowc, _rowd, _rowe, _rowf); + transpose16x16_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7, _row8, _row9, _rowa, _rowb, _rowc, _rowd, _rowe, _rowf); _mm512_storeu_ps(outptr, _row0); _mm512_storeu_ps(outptr + 16, _row1); @@ -297,7 +297,7 @@ int Reshape_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m256 _row6 = _mm256_loadu_ps(ptr6); __m256 _row7 = _mm256_loadu_ps(ptr7); - transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + transpose8x8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); _mm256_storeu_ps(outptr, _row0); _mm256_storeu_ps(outptr + 8, _row1); @@ -526,7 +526,7 @@ int Reshape_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m512 _rowe = _mm512_loadu_ps(ptre); __m512 _rowf = _mm512_loadu_ps(ptrf); - transpose16_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7, _row8, _row9, _rowa, _rowb, _rowc, _rowd, _rowe, _rowf); + transpose16x16_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7, _row8, _row9, _rowa, _rowb, _rowc, _rowd, _rowe, _rowf); _mm512_storeu_ps(outptr, _row0); _mm512_storeu_ps(outptr + 16, _row1); @@ -615,7 +615,7 @@ int Reshape_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op __m256 _row6 = _mm256_loadu_ps(ptr6); __m256 _row7 = _mm256_loadu_ps(ptr7); - transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); + transpose8x8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7); _mm256_storeu_ps(outptr, _row0); _mm256_storeu_ps(outptr + 8, _row1); diff --git a/src/layer/x86/softmax_x86.cpp 
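The reworked PReLU kernel walks w*elempack (or w*h*elempack) floats per row or channel, peeling AVX-512/AVX/SSE-width chunks before a scalar tail, with the slope either broadcast from a single value or loaded per lane when num_slope > 1. The underlying operation, y = x for x >= 0 and y = slope*x otherwise, in a small SSE sketch using unaligned loads (the packed paths above can use aligned ones):

    #include <emmintrin.h>

    // PReLU over a contiguous span with a single slope value.
    static void prelu_span(float* ptr, int size, float slope)
    {
        __m128 _zero = _mm_setzero_ps();
        __m128 _slope = _mm_set1_ps(slope);
        int i = 0;
        for (; i + 3 < size; i += 4)
        {
            __m128 _p = _mm_loadu_ps(ptr + i);
            __m128 _pos = _mm_max_ps(_zero, _p); // keeps the positive part
            __m128 _neg = _mm_min_ps(_zero, _p); // keeps the negative part
            _mm_storeu_ps(ptr + i, _mm_add_ps(_pos, _mm_mul_ps(_slope, _neg)));
        }
        for (; i < size; i++)
            if (ptr[i] < 0)
                ptr[i] *= slope;
    }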
b/src/layer/x86/softmax_x86.cpp index d1df7e446cf7..3a658a9a4bcb 100644 --- a/src/layer/x86/softmax_x86.cpp +++ b/src/layer/x86/softmax_x86.cpp @@ -125,7 +125,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const __m512 _pd = _mm512_load_ps(ptr + 16 * 13); __m512 _pe = _mm512_load_ps(ptr + 16 * 14); __m512 _pf = _mm512_load_ps(ptr + 16 * 15); - transpose16_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, _p8, _p9, _pa, _pb, _pc, _pd, _pe, _pf); + transpose16x16_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, _p8, _p9, _pa, _pb, _pc, _pd, _pe, _pf); __m512 _max01 = _mm512_max_ps(_p0, _p1); __m512 _max23 = _mm512_max_ps(_p2, _p3); __m512 _max45 = _mm512_max_ps(_p4, _p5); @@ -219,7 +219,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const _mm512_store_ps(ptr + 16 * 13, _pd); _mm512_store_ps(ptr + 16 * 14, _pe); _mm512_store_ps(ptr + 16 * 15, _pf); - transpose16_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, _p8, _p9, _pa, _pb, _pc, _pd, _pe, _pf); + transpose16x16_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, _p8, _p9, _pa, _pb, _pc, _pd, _pe, _pf); __m512 _sum01 = _mm512_add_ps(_p0, _p1); __m512 _sum23 = _mm512_add_ps(_p2, _p3); __m512 _sum45 = _mm512_add_ps(_p4, _p5); @@ -341,7 +341,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const __m512 _pd = _mm512_load_ps(ptr + 16 * 13); __m512 _pe = _mm512_load_ps(ptr + 16 * 14); __m512 _pf = _mm512_load_ps(ptr + 16 * 15); - transpose16_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, _p8, _p9, _pa, _pb, _pc, _pd, _pe, _pf); + transpose16x16_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, _p8, _p9, _pa, _pb, _pc, _pd, _pe, _pf); __m512 _max01 = _mm512_max_ps(_p0, _p1); __m512 _max23 = _mm512_max_ps(_p2, _p3); __m512 _max45 = _mm512_max_ps(_p4, _p5); @@ -435,7 +435,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const _mm512_store_ps(ptr + 16 * 13, _pd); _mm512_store_ps(ptr + 16 * 14, _pe); _mm512_store_ps(ptr + 16 * 15, _pf); - transpose16_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, _p8, _p9, _pa, _pb, _pc, _pd, _pe, _pf); + transpose16x16_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, _p8, _p9, _pa, _pb, _pc, _pd, _pe, _pf); __m512 _sum01 = _mm512_add_ps(_p0, _p1); __m512 _sum23 = _mm512_add_ps(_p2, _p3); __m512 _sum45 = _mm512_add_ps(_p4, _p5); @@ -687,7 +687,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const __m256 _p5 = _mm256_load_ps(ptr + 8 * 5); __m256 _p6 = _mm256_load_ps(ptr + 8 * 6); __m256 _p7 = _mm256_load_ps(ptr + 8 * 7); - transpose8_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7); + transpose8x8_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7); __m256 _max01 = _mm256_max_ps(_p0, _p1); __m256 _max23 = _mm256_max_ps(_p2, _p3); __m256 _max45 = _mm256_max_ps(_p4, _p5); @@ -749,7 +749,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const _mm256_store_ps(ptr + 8 * 5, _p5); _mm256_store_ps(ptr + 8 * 6, _p6); _mm256_store_ps(ptr + 8 * 7, _p7); - transpose8_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7); + transpose8x8_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7); __m256 _sum01 = _mm256_add_ps(_p0, _p1); __m256 _sum23 = _mm256_add_ps(_p2, _p3); __m256 _sum45 = _mm256_add_ps(_p4, _p5); @@ -855,7 +855,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const __m256 _p5 = _mm256_load_ps(ptr + 8 * 5); __m256 _p6 = _mm256_load_ps(ptr + 8 * 6); __m256 _p7 = _mm256_load_ps(ptr + 8 * 7); - transpose8_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7); + transpose8x8_ps(_p0, _p1, _p2, _p3, _p4, 
_p5, _p6, _p7); __m256 _max01 = _mm256_max_ps(_p0, _p1); __m256 _max23 = _mm256_max_ps(_p2, _p3); __m256 _max45 = _mm256_max_ps(_p4, _p5); @@ -917,7 +917,7 @@ int Softmax_x86::forward_inplace(Mat& bottom_top_blob, const Option& opt) const _mm256_store_ps(ptr + 8 * 5, _p5); _mm256_store_ps(ptr + 8 * 6, _p6); _mm256_store_ps(ptr + 8 * 7, _p7); - transpose8_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7); + transpose8x8_ps(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7); __m256 _sum01 = _mm256_add_ps(_p0, _p1); __m256 _sum23 = _mm256_add_ps(_p2, _p3); __m256 _sum45 = _mm256_add_ps(_p4, _p5); diff --git a/src/layer/x86/sse_mathfun.h b/src/layer/x86/sse_mathfun.h index 764e33e79768..75527d507b0a 100644 --- a/src/layer/x86/sse_mathfun.h +++ b/src/layer/x86/sse_mathfun.h @@ -286,6 +286,47 @@ static NCNN_FORCEINLINE v4sf exp_ps(v4sf x) return y; } +_PS_CONST(tanh_hi, 9.0f); +_PS_CONST(tanh_lo, -9.0f); + +_PS_CONST(cephes_tanh_p0, -2.76076847742355E-16f); +_PS_CONST(cephes_tanh_p1, 2.00018790482477E-13f); +_PS_CONST(cephes_tanh_p2, -8.60467152213735E-11f); +_PS_CONST(cephes_tanh_p3, 5.12229709037114E-08f); +_PS_CONST(cephes_tanh_p4, 1.48572235717979E-05f); +_PS_CONST(cephes_tanh_p5, 6.37261928875436E-04f); +_PS_CONST(cephes_tanh_p6, 4.89352455891786E-03f); +_PS_CONST(cephes_tanh_p7, 1.19825839466702e-06f); +_PS_CONST(cephes_tanh_p8, 1.18534705686654e-04f); +_PS_CONST(cephes_tanh_p9, 2.26843463243900e-03f); + +// an approximation of tanh +static inline v4sf tanh_ps(const v4sf x) +{ + v4sf value = x; + value = _mm_max_ps(*(v4sf*)_ps_tanh_lo, value); + value = _mm_min_ps(*(v4sf*)_ps_tanh_hi, value); + + v4sf value_squared = _mm_mul_ps(value, value); + + v4sf p; + p = _mm_comp_fmadd_ps(value_squared, *(v4sf*)_ps_cephes_tanh_p0, *(v4sf*)_ps_cephes_tanh_p1); + p = _mm_comp_fmadd_ps(p, value_squared, *(v4sf*)_ps_cephes_tanh_p2); + p = _mm_comp_fmadd_ps(p, value_squared, *(v4sf*)_ps_cephes_tanh_p3); + p = _mm_comp_fmadd_ps(p, value_squared, *(v4sf*)_ps_cephes_tanh_p4); + p = _mm_comp_fmadd_ps(p, value_squared, *(v4sf*)_ps_cephes_tanh_p5); + p = _mm_comp_fmadd_ps(p, value_squared, *(v4sf*)_ps_cephes_tanh_p6); + p = _mm_mul_ps(p, value); + + v4sf q; + q = _mm_comp_fmadd_ps(value_squared, *(v4sf*)_ps_cephes_tanh_p7, *(v4sf*)_ps_cephes_tanh_p8); + q = _mm_comp_fmadd_ps(q, value_squared, *(v4sf*)_ps_cephes_tanh_p9); + q = _mm_comp_fmadd_ps(q, value_squared, *(v4sf*)_ps_cephes_tanh_p6); + + v4sf dst = _mm_div_ps(p, q); + return dst; +} + _PS_CONST(minus_cephes_DP1, -0.78515625f); _PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f); _PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8f); diff --git a/src/layer/x86/x86_activation.h b/src/layer/x86/x86_activation.h index 6c58cd8f4ce1..4299d4d46b10 100644 --- a/src/layer/x86/x86_activation.h +++ b/src/layer/x86/x86_activation.h @@ -292,6 +292,13 @@ static NCNN_FORCEINLINE __m512 elu_avx512(__m512 inputs, __m512 alphas) return _mm512_add_ps(pos, _mm512_mul_ps(alphas, neg)); } +static NCNN_FORCEINLINE __m512 prelu_avx512(__m512 inputs, __m512 alphas) +{ + __m512 pos = _mm512_max_ps(_mm512_setzero_ps(), inputs); + __m512 neg = _mm512_min_ps(_mm512_setzero_ps(), inputs); + return _mm512_add_ps(pos, _mm512_mul_ps(alphas, neg)); +} + static NCNN_FORCEINLINE __m512 activation_avx512(__m512 _v, int activation_type, const ncnn::Mat& activation_params) { // Process fused activations diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index 28ddfd50b952..669cec0a7387 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -155,20 +155,20 
@@ static NCNN_FORCEINLINE __m128i float2int8_sse(const __m128& _v0, const __m128& } #ifndef __FMA__ -static NCNN_FORCEINLINE __m128 _mm_comp_fmadd_ps(__m128 _a, const __m128 _b, const __m128 _c) +static NCNN_FORCEINLINE __m128 _mm_comp_fmadd_ps(const __m128& _a, const __m128& _b, const __m128& _c) { return _mm_add_ps(_mm_mul_ps(_a, _b), _c); } -static NCNN_FORCEINLINE __m128 _mm_comp_fnmadd_ps(__m128 _a, const __m128 _b, const __m128 _c) +static NCNN_FORCEINLINE __m128 _mm_comp_fnmadd_ps(const __m128& _a, const __m128& _b, const __m128& _c) { return _mm_sub_ps(_c, _mm_mul_ps(_a, _b)); } #else -static NCNN_FORCEINLINE __m128 _mm_comp_fmadd_ps(__m128 _a, const __m128 _b, const __m128 _c) +static NCNN_FORCEINLINE __m128 _mm_comp_fmadd_ps(const __m128& _a, const __m128& _b, const __m128& _c) { return _mm_fmadd_ps(_a, _b, _c); } -static NCNN_FORCEINLINE __m128 _mm_comp_fnmadd_ps(__m128 _a, const __m128 _b, const __m128 _c) +static NCNN_FORCEINLINE __m128 _mm_comp_fnmadd_ps(const __m128& _a, const __m128& _b, const __m128& _c) { // return -a * b + c return _mm_fnmadd_ps(_a, _b, _c); @@ -177,65 +177,165 @@ static NCNN_FORCEINLINE __m128 _mm_comp_fnmadd_ps(__m128 _a, const __m128 _b, co #if __AVX__ #ifndef __FMA__ -static NCNN_FORCEINLINE __m256 _mm256_comp_fmadd_ps(__m256 _a, const __m256 _b, const __m256 _c) +static NCNN_FORCEINLINE __m256 _mm256_comp_fmadd_ps(const __m256& _a, const __m256& _b, const __m256& _c) { return _mm256_add_ps(_mm256_mul_ps(_a, _b), _c); } -static NCNN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(__m256 _a, const __m256 _b, const __m256 _c) +static NCNN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(const __m256& _a, const __m256& _b, const __m256& _c) { return _mm256_sub_ps(_c, _mm256_mul_ps(_a, _b)); } #else -static NCNN_FORCEINLINE __m256 _mm256_comp_fmadd_ps(__m256 _a, const __m256 _b, const __m256 _c) +static NCNN_FORCEINLINE __m256 _mm256_comp_fmadd_ps(const __m256& _a, const __m256& _b, const __m256& _c) { return _mm256_fmadd_ps(_a, _b, _c); } -static NCNN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(__m256 _a, const __m256 _b, const __m256 _c) +static NCNN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(const __m256& _a, const __m256& _b, const __m256& _c) { // return -a * b + c return _mm256_fnmadd_ps(_a, _b, _c); } #endif -static NCNN_FORCEINLINE __m256 _mm256_fmadd_1_ps(__m256 a, __m256 b, float c) +static NCNN_FORCEINLINE __m256 _mm256_fmadd_1_ps(const __m256& a, const __m256& b, float c) { return _mm256_comp_fmadd_ps(b, _mm256_set1_ps(c), a); } -static NCNN_FORCEINLINE __m256 _mm256_fmrsub_1_ps(__m256 a, __m256 b, float c) +static NCNN_FORCEINLINE __m256 _mm256_fmrsub_1_ps(const __m256& a, const __m256& b, float c) { // return a - b * c return _mm256_comp_fnmadd_ps(b, _mm256_set1_ps(c), a); } -// From: https://stackoverflow.com/a/25627536 -static NCNN_FORCEINLINE void transpose8_ps(__m256& row0, __m256& row1, __m256& row2, __m256& row3, __m256& row4, __m256& row5, __m256& row6, __m256& row7) -{ - __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7; - __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7; - __t0 = _mm256_unpacklo_ps(row0, row1); - __t1 = _mm256_unpackhi_ps(row0, row1); - __t2 = _mm256_unpacklo_ps(row2, row3); - __t3 = _mm256_unpackhi_ps(row2, row3); - __t4 = _mm256_unpacklo_ps(row4, row5); - __t5 = _mm256_unpackhi_ps(row4, row5); - __t6 = _mm256_unpacklo_ps(row6, row7); - __t7 = _mm256_unpackhi_ps(row6, row7); - __tt0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1, 0, 1, 0)); - __tt1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3, 2, 3, 2)); - __tt2 = 
_mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1, 0, 1, 0)); - __tt3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3, 2, 3, 2)); - __tt4 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1, 0, 1, 0)); - __tt5 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3, 2, 3, 2)); - __tt6 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1, 0, 1, 0)); - __tt7 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3, 2, 3, 2)); - row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); - row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); - row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); - row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); - row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); - row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); - row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); - row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); + +static NCNN_FORCEINLINE void transpose8x12_ps(__m256& _r0, __m256& _r1, __m256& _r2, __m256& _r3, __m256& _r4, __m256& _r5, __m256& _r6, __m256& _r7, + __m256& _r8, __m256& _r9, __m256& _ra, __m256& _rb) +{ + __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); + __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); + __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); + __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); + __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); + __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); + __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); + __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); + __m256 _tmp8 = _mm256_unpacklo_ps(_r8, _r9); + __m256 _tmp9 = _mm256_unpackhi_ps(_r8, _r9); + __m256 _tmpa = _mm256_unpacklo_ps(_ra, _rb); + __m256 _tmpb = _mm256_unpackhi_ps(_ra, _rb); + + __m256 _tmpc = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpd = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpe = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpf = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpg = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmph = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpi = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpj = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpk = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpl = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpm = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpn = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2)); + + _r0 = _mm256_permute2f128_ps(_tmpc, _tmpg, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2f128_ps(_tmpk, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); + _r2 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 2, 0, 0)); + _r3 = _mm256_permute2f128_ps(_tmpe, _tmpi, _MM_SHUFFLE(0, 2, 0, 0)); + _r4 = _mm256_permute2f128_ps(_tmpm, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); + _r5 = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 2, 0, 0)); + _r6 = _mm256_permute2f128_ps(_tmpc, _tmpg, _MM_SHUFFLE(0, 3, 0, 1)); + _r7 = _mm256_permute2f128_ps(_tmpk, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); + _r8 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 3, 0, 1)); + _r9 = _mm256_permute2f128_ps(_tmpe, _tmpi, _MM_SHUFFLE(0, 3, 0, 1)); + _ra = _mm256_permute2f128_ps(_tmpm, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); + _rb = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 3, 0, 1)); +} + +static NCNN_FORCEINLINE void transpose8x8_ps(__m256& _r0, __m256& _r1, __m256& _r2, __m256& _r3, __m256& _r4, __m256& _r5, __m256& _r6, __m256& _r7) +{ + __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); + __m256 
_tmp1 = _mm256_unpackhi_ps(_r0, _r1); + __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); + __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); + __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); + __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); + __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); + __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); + + __m256 _tmp8 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmp9 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpa = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpb = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpc = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpd = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpe = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpf = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); + + _r0 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); + _r2 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 2, 0, 0)); + _r3 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); + _r4 = _mm256_permute2f128_ps(_tmp8, _tmpc, _MM_SHUFFLE(0, 3, 0, 1)); + _r5 = _mm256_permute2f128_ps(_tmp9, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); + _r6 = _mm256_permute2f128_ps(_tmpa, _tmpe, _MM_SHUFFLE(0, 3, 0, 1)); + _r7 = _mm256_permute2f128_ps(_tmpb, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); +} + +static NCNN_FORCEINLINE void transpose8x4_ps(__m256& _r0, __m256& _r1, __m256& _r2, __m256& _r3) +{ + __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); + __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); + __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); + __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); + + __m256 _tmp4 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmp5 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmp6 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmp7 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); + + _r0 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); + _r2 = _mm256_permute2f128_ps(_tmp4, _tmp5, _MM_SHUFFLE(0, 3, 0, 1)); + _r3 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); +} + +static NCNN_FORCEINLINE void transpose8x2_ps(__m256& _r0, __m256& _r1) +{ + __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); + __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); + + _r0 = _mm256_permute2f128_ps(_tmp0, _tmp1, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2f128_ps(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1)); +} + +static NCNN_FORCEINLINE void transpose8x8_epi16(__m128i& _r0, __m128i& _r1, __m128i& _r2, __m128i& _r3, __m128i& _r4, __m128i& _r5, __m128i& _r6, __m128i& _r7) +{ + __m128i _tmp0 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _tmp1 = _mm_unpackhi_epi16(_r0, _r1); + __m128i _tmp2 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _tmp3 = _mm_unpackhi_epi16(_r2, _r3); + __m128i _tmp4 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _tmp5 = _mm_unpackhi_epi16(_r4, _r5); + __m128i _tmp6 = _mm_unpacklo_epi16(_r6, _r7); + __m128i _tmp7 = _mm_unpackhi_epi16(_r6, _r7); + + __m128i _tmp8 = _mm_unpacklo_epi32(_tmp0, _tmp2); + __m128i _tmp9 = _mm_unpackhi_epi32(_tmp0, _tmp2); + __m128i _tmpa = _mm_unpacklo_epi32(_tmp1, _tmp3); + __m128i _tmpb = _mm_unpackhi_epi32(_tmp1, _tmp3); + __m128i _tmpc = _mm_unpacklo_epi32(_tmp4, _tmp6); + __m128i _tmpd = _mm_unpackhi_epi32(_tmp4, 
_tmp6); + __m128i _tmpe = _mm_unpacklo_epi32(_tmp5, _tmp7); + __m128i _tmpf = _mm_unpackhi_epi32(_tmp5, _tmp7); + + _r0 = _mm_unpacklo_epi64(_tmp8, _tmpc); + _r1 = _mm_unpackhi_epi64(_tmp8, _tmpc); + _r2 = _mm_unpacklo_epi64(_tmp9, _tmpd); + _r3 = _mm_unpackhi_epi64(_tmp9, _tmpd); + _r4 = _mm_unpacklo_epi64(_tmpa, _tmpe); + _r5 = _mm_unpackhi_epi64(_tmpa, _tmpe); + _r6 = _mm_unpacklo_epi64(_tmpb, _tmpf); + _r7 = _mm_unpackhi_epi64(_tmpb, _tmpf); } static NCNN_FORCEINLINE __m256 HorizontalSums(__m256& v0, __m256& v1, __m256& v2, __m256& v3, __m256& v4, __m256& v5, __m256& v6, __m256& v7) @@ -401,7 +501,7 @@ static NCNN_FORCEINLINE void _mm256_comp_fmadd_ps8(__m256& _sum, } #if __AVX512F__ -static NCNN_FORCEINLINE void transpose16_ps(__m512& _r0, __m512& _r1, __m512& _r2, __m512& _r3, __m512& _r4, __m512& _r5, __m512& _r6, __m512& _r7, +static NCNN_FORCEINLINE void transpose16x16_ps(__m512& _r0, __m512& _r1, __m512& _r2, __m512& _r3, __m512& _r4, __m512& _r5, __m512& _r6, __m512& _r7, __m512& _r8, __m512& _r9, __m512& _ra, __m512& _rb, __m512& _rc, __m512& _rd, __m512& _re, __m512& _rf) { __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); @@ -473,6 +573,302 @@ static NCNN_FORCEINLINE void transpose16_ps(__m512& _r0, __m512& _r1, __m512& _r _rf = _mm512_shuffle_f32x4(_tmpe, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); } +static NCNN_FORCEINLINE void transpose16x12_ps(__m512& _r0, __m512& _r1, __m512& _r2, __m512& _r3, __m512& _r4, __m512& _r5, __m512& _r6, __m512& _r7, + __m512& _r8, __m512& _r9, __m512& _ra, __m512& _rb) +{ + __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); + __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); + __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); + __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); + __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); + __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5); + __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); + __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); + __m512 _tmp8 = _mm512_unpacklo_ps(_r8, _r9); + __m512 _tmp9 = _mm512_unpackhi_ps(_r8, _r9); + __m512 _tmpa = _mm512_unpacklo_ps(_ra, _rb); + __m512 _tmpb = _mm512_unpackhi_ps(_ra, _rb); + + __m512 _tmpc = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmpd = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmpe = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmpf = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmpg = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmph = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmpi = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmpj = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmpk = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmpl = _mm512_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmpm = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmpn = _mm512_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2)); + + _tmp0 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp1 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp2 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp3 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp4 = _mm512_shuffle_f32x4(_tmpm, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp5 = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp6 = _mm512_shuffle_f32x4(_tmpc, _tmpg, _MM_SHUFFLE(3, 1, 3, 1)); + _tmp7 = _mm512_shuffle_f32x4(_tmpk, _tmpd, _MM_SHUFFLE(3, 
1, 3, 1)); + _tmp8 = _mm512_shuffle_f32x4(_tmph, _tmpl, _MM_SHUFFLE(3, 1, 3, 1)); + _tmp9 = _mm512_shuffle_f32x4(_tmpe, _tmpi, _MM_SHUFFLE(3, 1, 3, 1)); + _tmpa = _mm512_shuffle_f32x4(_tmpm, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); + _tmpb = _mm512_shuffle_f32x4(_tmpj, _tmpn, _MM_SHUFFLE(3, 1, 3, 1)); + + _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); + _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); + _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); + _r4 = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(2, 0, 2, 0)); + _r5 = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(2, 0, 2, 0)); + _r6 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + _r7 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + _r8 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); + _r9 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + _ra = _mm512_shuffle_f32x4(_tmp8, _tmp9, _MM_SHUFFLE(3, 1, 3, 1)); + _rb = _mm512_shuffle_f32x4(_tmpa, _tmpb, _MM_SHUFFLE(3, 1, 3, 1)); +} + +static NCNN_FORCEINLINE void transpose16x8_ps(__m512& _r0, __m512& _r1, __m512& _r2, __m512& _r3, __m512& _r4, __m512& _r5, __m512& _r6, __m512& _r7) +{ + __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); + __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); + __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); + __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); + __m512 _tmp4 = _mm512_unpacklo_ps(_r4, _r5); + __m512 _tmp5 = _mm512_unpackhi_ps(_r4, _r5); + __m512 _tmp6 = _mm512_unpacklo_ps(_r6, _r7); + __m512 _tmp7 = _mm512_unpackhi_ps(_r6, _r7); + + __m512 _tmp8 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmp9 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmpa = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmpb = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmpc = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmpd = _mm512_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmpe = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmpf = _mm512_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); + + _tmp0 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp1 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp2 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp3 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp4 = _mm512_shuffle_f32x4(_tmp8, _tmpc, _MM_SHUFFLE(3, 1, 3, 1)); + _tmp5 = _mm512_shuffle_f32x4(_tmp9, _tmpd, _MM_SHUFFLE(3, 1, 3, 1)); + _tmp6 = _mm512_shuffle_f32x4(_tmpa, _tmpe, _MM_SHUFFLE(3, 1, 3, 1)); + _tmp7 = _mm512_shuffle_f32x4(_tmpb, _tmpf, _MM_SHUFFLE(3, 1, 3, 1)); + + _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); + _r2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); + _r3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); + _r4 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + _r5 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + _r6 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); + _r7 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); +} + +static NCNN_FORCEINLINE void transpose16x4_ps(__m512& _r0, __m512& _r1, __m512& _r2, __m512& _r3) +{ + __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); + __m512 _tmp1 
= _mm512_unpackhi_ps(_r0, _r1); + __m512 _tmp2 = _mm512_unpacklo_ps(_r2, _r3); + __m512 _tmp3 = _mm512_unpackhi_ps(_r2, _r3); + + __m512 _tmp4 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmp5 = _mm512_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 _tmp6 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); + __m512 _tmp7 = _mm512_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); + + _tmp0 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp1 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); + _tmp2 = _mm512_shuffle_f32x4(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); + _tmp3 = _mm512_shuffle_f32x4(_tmp6, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + + _r0 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); + _r2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + _r3 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); +} + +static NCNN_FORCEINLINE void transpose16x2_ps(__m512& _r0, __m512& _r1) +{ + __m512 _tmp0 = _mm512_unpacklo_ps(_r0, _r1); + __m512 _tmp1 = _mm512_unpackhi_ps(_r0, _r1); + + __m512 _tmp2 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + __m512 _tmp3 = _mm512_shuffle_f32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + + _r0 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); + _r1 = _mm512_shuffle_f32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); +} + +static NCNN_FORCEINLINE void transpose8x16_ps(__m256& _r0, __m256& _r1, __m256& _r2, __m256& _r3, __m256& _r4, __m256& _r5, __m256& _r6, __m256& _r7, + __m256& _r8, __m256& _r9, __m256& _ra, __m256& _rb, __m256& _rc, __m256& _rd, __m256& _re, __m256& _rf) +{ + __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); + __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); + __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); + __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); + __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); + __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); + __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); + __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); + __m256 _tmp8 = _mm256_unpacklo_ps(_r8, _r9); + __m256 _tmp9 = _mm256_unpackhi_ps(_r8, _r9); + __m256 _tmpa = _mm256_unpacklo_ps(_ra, _rb); + __m256 _tmpb = _mm256_unpackhi_ps(_ra, _rb); + __m256 _tmpc = _mm256_unpacklo_ps(_rc, _rd); + __m256 _tmpd = _mm256_unpackhi_ps(_rc, _rd); + __m256 _tmpe = _mm256_unpacklo_ps(_re, _rf); + __m256 _tmpf = _mm256_unpackhi_ps(_re, _rf); + + __m256 _tmpg = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmph = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpi = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpj = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpk = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpl = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpm = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpn = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpo = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpp = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpq = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpr = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmps = _mm256_shuffle_ps(_tmpc, _tmpe, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpt = _mm256_shuffle_ps(_tmpc, _tmpe, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpu = 
_mm256_shuffle_ps(_tmpd, _tmpf, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpv = _mm256_shuffle_ps(_tmpd, _tmpf, _MM_SHUFFLE(3, 2, 3, 2)); + + _r0 = _mm256_permute2f128_ps(_tmpg, _tmpk, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2f128_ps(_tmpo, _tmps, _MM_SHUFFLE(0, 2, 0, 0)); + _r2 = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 2, 0, 0)); + _r3 = _mm256_permute2f128_ps(_tmpp, _tmpt, _MM_SHUFFLE(0, 2, 0, 0)); + _r4 = _mm256_permute2f128_ps(_tmpi, _tmpm, _MM_SHUFFLE(0, 2, 0, 0)); + _r5 = _mm256_permute2f128_ps(_tmpq, _tmpu, _MM_SHUFFLE(0, 2, 0, 0)); + _r6 = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 2, 0, 0)); + _r7 = _mm256_permute2f128_ps(_tmpr, _tmpv, _MM_SHUFFLE(0, 2, 0, 0)); + _r8 = _mm256_permute2f128_ps(_tmpg, _tmpk, _MM_SHUFFLE(0, 3, 0, 1)); + _r9 = _mm256_permute2f128_ps(_tmpo, _tmps, _MM_SHUFFLE(0, 3, 0, 1)); + _ra = _mm256_permute2f128_ps(_tmph, _tmpl, _MM_SHUFFLE(0, 3, 0, 1)); + _rb = _mm256_permute2f128_ps(_tmpp, _tmpt, _MM_SHUFFLE(0, 3, 0, 1)); + _rc = _mm256_permute2f128_ps(_tmpi, _tmpm, _MM_SHUFFLE(0, 3, 0, 1)); + _rd = _mm256_permute2f128_ps(_tmpq, _tmpu, _MM_SHUFFLE(0, 3, 0, 1)); + _re = _mm256_permute2f128_ps(_tmpj, _tmpn, _MM_SHUFFLE(0, 3, 0, 1)); + _rf = _mm256_permute2f128_ps(_tmpr, _tmpv, _MM_SHUFFLE(0, 3, 0, 1)); +} + +static NCNN_FORCEINLINE void transpose16x16_epi16(__m256i& _r0, __m256i& _r1, __m256i& _r2, __m256i& _r3, __m256i& _r4, __m256i& _r5, __m256i& _r6, __m256i& _r7, + __m256i& _r8, __m256i& _r9, __m256i& _ra, __m256i& _rb, __m256i& _rc, __m256i& _rd, __m256i& _re, __m256i& _rf) +{ + __m256i _tmp0 = _mm256_unpacklo_epi16(_r0, _r1); + __m256i _tmp1 = _mm256_unpackhi_epi16(_r0, _r1); + __m256i _tmp2 = _mm256_unpacklo_epi16(_r2, _r3); + __m256i _tmp3 = _mm256_unpackhi_epi16(_r2, _r3); + __m256i _tmp4 = _mm256_unpacklo_epi16(_r4, _r5); + __m256i _tmp5 = _mm256_unpackhi_epi16(_r4, _r5); + __m256i _tmp6 = _mm256_unpacklo_epi16(_r6, _r7); + __m256i _tmp7 = _mm256_unpackhi_epi16(_r6, _r7); + __m256i _tmp8 = _mm256_unpacklo_epi16(_r8, _r9); + __m256i _tmp9 = _mm256_unpackhi_epi16(_r8, _r9); + __m256i _tmpa = _mm256_unpacklo_epi16(_ra, _rb); + __m256i _tmpb = _mm256_unpackhi_epi16(_ra, _rb); + __m256i _tmpc = _mm256_unpacklo_epi16(_rc, _rd); + __m256i _tmpd = _mm256_unpackhi_epi16(_rc, _rd); + __m256i _tmpe = _mm256_unpacklo_epi16(_re, _rf); + __m256i _tmpf = _mm256_unpackhi_epi16(_re, _rf); + + __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, _tmp2); + __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2); + __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3); + __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3); + __m256i _tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6); + __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6); + __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7); + __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7); + __m256i _tmpo = _mm256_unpacklo_epi32(_tmp8, _tmpa); + __m256i _tmpp = _mm256_unpackhi_epi32(_tmp8, _tmpa); + __m256i _tmpq = _mm256_unpacklo_epi32(_tmp9, _tmpb); + __m256i _tmpr = _mm256_unpackhi_epi32(_tmp9, _tmpb); + __m256i _tmps = _mm256_unpacklo_epi32(_tmpc, _tmpe); + __m256i _tmpt = _mm256_unpackhi_epi32(_tmpc, _tmpe); + __m256i _tmpu = _mm256_unpacklo_epi32(_tmpd, _tmpf); + __m256i _tmpv = _mm256_unpackhi_epi32(_tmpd, _tmpf); + + _tmp0 = _mm256_unpacklo_epi64(_tmpg, _tmpk); + _tmp1 = _mm256_unpackhi_epi64(_tmpg, _tmpk); + _tmp2 = _mm256_unpacklo_epi64(_tmph, _tmpl); + _tmp3 = _mm256_unpackhi_epi64(_tmph, _tmpl); + _tmp4 = _mm256_unpacklo_epi64(_tmpi, _tmpm); + _tmp5 = _mm256_unpackhi_epi64(_tmpi, _tmpm); + _tmp6 = 
_mm256_unpacklo_epi64(_tmpj, _tmpn); + _tmp7 = _mm256_unpackhi_epi64(_tmpj, _tmpn); + _tmp8 = _mm256_unpacklo_epi64(_tmpo, _tmps); + _tmp9 = _mm256_unpackhi_epi64(_tmpo, _tmps); + _tmpa = _mm256_unpacklo_epi64(_tmpp, _tmpt); + _tmpb = _mm256_unpackhi_epi64(_tmpp, _tmpt); + _tmpc = _mm256_unpacklo_epi64(_tmpq, _tmpu); + _tmpd = _mm256_unpackhi_epi64(_tmpq, _tmpu); + _tmpe = _mm256_unpacklo_epi64(_tmpr, _tmpv); + _tmpf = _mm256_unpackhi_epi64(_tmpr, _tmpv); + + _r0 = _mm256_permute2x128_si256(_tmp0, _tmp8, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2x128_si256(_tmp1, _tmp9, _MM_SHUFFLE(0, 2, 0, 0)); + _r2 = _mm256_permute2x128_si256(_tmp2, _tmpa, _MM_SHUFFLE(0, 2, 0, 0)); + _r3 = _mm256_permute2x128_si256(_tmp3, _tmpb, _MM_SHUFFLE(0, 2, 0, 0)); + _r4 = _mm256_permute2x128_si256(_tmp4, _tmpc, _MM_SHUFFLE(0, 2, 0, 0)); + _r5 = _mm256_permute2x128_si256(_tmp5, _tmpd, _MM_SHUFFLE(0, 2, 0, 0)); + _r6 = _mm256_permute2x128_si256(_tmp6, _tmpe, _MM_SHUFFLE(0, 2, 0, 0)); + _r7 = _mm256_permute2x128_si256(_tmp7, _tmpf, _MM_SHUFFLE(0, 2, 0, 0)); + _r8 = _mm256_permute2x128_si256(_tmp0, _tmp8, _MM_SHUFFLE(0, 3, 0, 1)); + _r9 = _mm256_permute2x128_si256(_tmp1, _tmp9, _MM_SHUFFLE(0, 3, 0, 1)); + _ra = _mm256_permute2x128_si256(_tmp2, _tmpa, _MM_SHUFFLE(0, 3, 0, 1)); + _rb = _mm256_permute2x128_si256(_tmp3, _tmpb, _MM_SHUFFLE(0, 3, 0, 1)); + _rc = _mm256_permute2x128_si256(_tmp4, _tmpc, _MM_SHUFFLE(0, 3, 0, 1)); + _rd = _mm256_permute2x128_si256(_tmp5, _tmpd, _MM_SHUFFLE(0, 3, 0, 1)); + _re = _mm256_permute2x128_si256(_tmp6, _tmpe, _MM_SHUFFLE(0, 3, 0, 1)); + _rf = _mm256_permute2x128_si256(_tmp7, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); +} + +static NCNN_FORCEINLINE void transpose16x8_epi16(__m256i& _r0, __m256i& _r1, __m256i& _r2, __m256i& _r3, __m256i& _r4, __m256i& _r5, __m256i& _r6, __m256i& _r7) +{ + __m256i _tmp0 = _mm256_unpacklo_epi16(_r0, _r1); + __m256i _tmp1 = _mm256_unpackhi_epi16(_r0, _r1); + __m256i _tmp2 = _mm256_unpacklo_epi16(_r2, _r3); + __m256i _tmp3 = _mm256_unpackhi_epi16(_r2, _r3); + __m256i _tmp4 = _mm256_unpacklo_epi16(_r4, _r5); + __m256i _tmp5 = _mm256_unpackhi_epi16(_r4, _r5); + __m256i _tmp6 = _mm256_unpacklo_epi16(_r6, _r7); + __m256i _tmp7 = _mm256_unpackhi_epi16(_r6, _r7); + + __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, _tmp2); + __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2); + __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3); + __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3); + __m256i _tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6); + __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6); + __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7); + __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7); + + _tmp0 = _mm256_unpacklo_epi64(_tmpg, _tmpk); + _tmp1 = _mm256_unpackhi_epi64(_tmpg, _tmpk); + _tmp2 = _mm256_unpacklo_epi64(_tmph, _tmpl); + _tmp3 = _mm256_unpackhi_epi64(_tmph, _tmpl); + _tmp4 = _mm256_unpacklo_epi64(_tmpi, _tmpm); + _tmp5 = _mm256_unpackhi_epi64(_tmpi, _tmpm); + _tmp6 = _mm256_unpacklo_epi64(_tmpj, _tmpn); + _tmp7 = _mm256_unpackhi_epi64(_tmpj, _tmpn); + + _r0 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2x128_si256(_tmp2, _tmp3, _MM_SHUFFLE(0, 2, 0, 0)); + _r2 = _mm256_permute2x128_si256(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); + _r3 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); + _r4 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1)); + _r5 = _mm256_permute2x128_si256(_tmp2, _tmp3, _MM_SHUFFLE(0, 3, 0, 1)); + _r6 = _mm256_permute2x128_si256(_tmp4, _tmp5, _MM_SHUFFLE(0, 
3, 0, 1)); + _r7 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); +} + static NCNN_FORCEINLINE float _mm512_comp_reduce_add_ps(__m512 x) { const __m256 x256 = _mm256_add_ps(_mm512_castps512_ps256(x), _mm512_extractf32x8_ps(x, 1)); diff --git a/src/layer_registry.h.in b/src/layer_registry.h.in index 99c1d8336f41..6947ecce5d18 100644 --- a/src/layer_registry.h.in +++ b/src/layer_registry.h.in @@ -28,6 +28,12 @@ static const layer_registry_entry layer_registry_msa[] = { }; #endif // NCNN_RUNTIME_CPU && NCNN_MSA +#if NCNN_RUNTIME_CPU && NCNN_LSX +static const layer_registry_entry layer_registry_lsx[] = { +@layer_registry_lsx@ +}; +#endif // NCNN_RUNTIME_CPU && NCNN_LSX + #if NCNN_RUNTIME_CPU && NCNN_RVV static const layer_registry_entry layer_registry_rvv[] = { @layer_registry_rvv@ diff --git a/src/layer_shader_registry.h.in b/src/layer_shader_registry.h.in index 9a88eb4604e2..52e3f013cf1c 100644 --- a/src/layer_shader_registry.h.in +++ b/src/layer_shader_registry.h.in @@ -3,4 +3,3 @@ // This file is auto-generated by cmake, don't edit it. @layer_shader_registry@ - diff --git a/src/layer_shader_spv_data.h.in b/src/layer_shader_spv_data.h.in index ab1b7b8aaa2e..a4795bb15265 100644 --- a/src/layer_shader_spv_data.h.in +++ b/src/layer_shader_spv_data.h.in @@ -3,4 +3,3 @@ // This file is auto-generated by cmake, don't edit it. @layer_shader_spv_data@ - diff --git a/src/main.cpp b/src/main.cpp deleted file mode 100644 index e107caea2d63..000000000000 --- a/src/main.cpp +++ /dev/null @@ -1,76 +0,0 @@ -#include - -#include "mat.h" -#include "net.h" - -template -void print_mat_1d(ncnn::Mat &m, int start_w, int end_w) { - const T *p = m; - if (end_w == -1) { - end_w = m.w; - } - for (int w = start_w; w != end_w; ++w) { - std::cout << p[w] << ", "; - } - std::cout << "\n"; -} - -template -void print_mat_2d(ncnn::Mat &m, int start_h, int end_h, int start_w, - int end_w) { - if (end_h == -1) { - end_h = m.h; - } - for (int h = start_h; h != end_h; ++h) { - ncnn::Mat sub = m.row_range(h, 1); - print_mat_1d(sub, start_w, end_w); - } -} - -template -void print_mat_3d(ncnn::Mat &m, int start_c, int end_c, int start_h, int end_h, - int start_w, int end_w) { - if (end_c == -1) { - end_c = m.c; - } - - for (int c = start_c; c != end_c; ++c) { - std::cout << "c " << c << "\n"; - ncnn::Mat sub = m.channel_range(c, 1); - print_mat_2d(sub, start_h, end_h, start_w, end_w); - } -} - -int main() { - int c = 1; - int h = 6; - int w = 8; - int size = c * h * w; - // std::vector data(size); - std::vector data = {1, 3, 5, 4, 2}; - // for (int i = 0; i != size; ++i) { - // data[i] = i; - // } - ncnn::Option opt; - opt.num_threads = 1; - ncnn::Net net; - net.opt = opt; - net.load_param("foo/make_pad_mask.ncnn.param"); - net.load_model("foo/make_pad_mask.ncnn.bin"); - - ncnn::Extractor ex = net.create_extractor(); - - ncnn::Mat m(data.size(), data.data()); - m = m.clone(); - std::cout << "in\n"; - print_mat_1d(m, 0, -1); - std::cout << "\n"; - - ncnn::Mat out; - - ex.input("in0", m); - ex.extract("out0", out); - print_mat_2d(out, 0, -1, 0, -1); - ex.clear(); - net.clear(); -} diff --git a/src/mat.h b/src/mat.h index 6d7deb502a20..c6f59ef42684 100644 --- a/src/mat.h +++ b/src/mat.h @@ -29,6 +29,9 @@ #if __mips_msa #include #endif +#if __loongarch_sx +#include +#endif #if __riscv_vector #include #include "cpu.h" // cpu_riscv_vlenb() @@ -128,6 +131,9 @@ class NCNN_EXPORT Mat #if __mips_msa void fill(v4f32 _v); #endif // __mips_msa +#if __loongarch_sx + void fill(__m128 _v); +#endif //__loongarch_sx #if 
__riscv_vector void fill(vfloat32m1_t _v); void fill(vuint16m1_t _v); @@ -1067,11 +1073,23 @@ NCNN_FORCEINLINE void Mat::fill(v4f32 _v) } #endif // __mips_msa +#if __loongarch_sx +NCNN_FORCEINLINE void Mat::fill(__m128 _v) +{ + int size = (int)total(); + float* ptr = (float*)data; + for (int i = 0; i < size; i++) + { + __lsx_vst(_v, ptr, 0); + ptr += 4; + } +} +#endif // __loongarch_sx #if __riscv_vector NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v) { const int packn = cpu_riscv_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int size = (int)total(); float* ptr = (float*)data; @@ -1085,7 +1103,7 @@ NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v) NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v) { const int packn = cpu_riscv_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int size = (int)total(); unsigned short* ptr = (unsigned short*)data; @@ -1099,7 +1117,7 @@ NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v) NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v) { const int packn = cpu_riscv_vlenb() / 1; - const word_type vl = vsetvl_e8m1(packn); + const size_t vl = vsetvl_e8m1(packn); int size = (int)total(); signed char* ptr = (signed char*)data; @@ -1113,7 +1131,7 @@ NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v) NCNN_FORCEINLINE void Mat::fill(vfloat16m1_t _v) { const int packn = cpu_riscv_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int size = (int)total(); __fp16* ptr = (__fp16*)data; diff --git a/src/net.cpp b/src/net.cpp index e68507ca3b0d..8a09ebdc1ef8 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -108,6 +108,26 @@ NetPrivate::NetPrivate(Option& _opt) #endif // NCNN_VULKAN } +static Option get_masked_option(const Option& opt, int featmask) +{ + // mask option usage as layer specific featmask + Option opt1 = opt; + opt1.use_fp16_arithmetic = opt1.use_fp16_arithmetic && !(featmask & (1 << 0)); + opt1.use_fp16_storage = opt1.use_fp16_storage && !(featmask & (1 << 1)); + opt1.use_fp16_packed = opt1.use_fp16_packed && !(featmask & (1 << 1)); + opt1.use_bf16_storage = opt1.use_bf16_storage && !(featmask & (1 << 2)); + opt1.use_int8_packed = opt1.use_int8_packed && !(featmask & (1 << 3)); + opt1.use_int8_storage = opt1.use_int8_storage && !(featmask & (1 << 3)); + opt1.use_int8_arithmetic = opt1.use_int8_arithmetic && !(featmask & (1 << 3)); + opt1.use_vulkan_compute = opt1.use_vulkan_compute && !(featmask & (1 << 4)); + opt1.use_image_storage = opt1.use_image_storage && !(featmask & (1 << 4)); + opt1.use_tensor_storage = opt1.use_tensor_storage && !(featmask & (1 << 4)); + opt1.use_sgemm_convolution = opt1.use_sgemm_convolution && !(featmask & (1 << 5)); + opt1.use_winograd_convolution = opt1.use_winograd_convolution && !(featmask & (1 << 6)); + + return opt1; +} + #if NCNN_VULKAN int NetPrivate::upload_model() { @@ -132,7 +152,7 @@ int NetPrivate::upload_model() { if (layers[i]->support_vulkan) { - int uret = layers[i]->upload_model(cmd, opt_upload); + int uret = layers[i]->upload_model(cmd, get_masked_option(opt_upload, layers[i]->featmask)); if (uret != 0) { NCNN_LOGE("layer upload_model %d failed", (int)i); @@ -195,7 +215,15 @@ int NetPrivate::forward_layer(int layer_index, std::vector& blob_mats, cons bottom_blob.elemsize = blob_mats[bottom_blob_index].elemsize; } #endif - int ret = do_forward_layer(layer, blob_mats, opt); + int ret = 0; + if (layer->featmask) + { + ret = do_forward_layer(layer, blob_mats, get_masked_option(opt, 
layer->featmask)); + } + else + { + ret = do_forward_layer(layer, blob_mats, opt); + } #if NCNN_BENCHMARK double end = get_current_time(); if (layer->one_blob_only) @@ -352,7 +380,14 @@ int NetPrivate::forward_layer(int layer_index, std::vector& blob_mats, std: #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2); #endif - ret = do_forward_layer(layer, blob_mats_gpu, cmd, opt); + if (layer->featmask) + { + ret = do_forward_layer(layer, blob_mats_gpu, cmd, get_masked_option(opt, layer->featmask)); + } + else + { + ret = do_forward_layer(layer, blob_mats_gpu, cmd, opt); + } #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2 + 1); #endif @@ -368,7 +403,14 @@ int NetPrivate::forward_layer(int layer_index, std::vector& blob_mats, std: bottom_blob = blob_mats[bottom_blob_index].shape(); } #endif - ret = do_forward_layer(layer, blob_mats, opt); + if (layer->featmask) + { + ret = do_forward_layer(layer, blob_mats, get_masked_option(opt, layer->featmask)); + } + else + { + ret = do_forward_layer(layer, blob_mats, opt); + } #if NCNN_BENCHMARK double end = get_current_time(); if (layer->one_blob_only) @@ -677,7 +719,14 @@ int NetPrivate::forward_layer(int layer_index, std::vector& blob_mats, std: #endif if (layer->support_image_storage) { - ret = do_forward_layer(layer, blob_mats_gpu_image, cmd, opt); + if (layer->featmask) + { + ret = do_forward_layer(layer, blob_mats_gpu_image, cmd, get_masked_option(opt, layer->featmask)); + } + else + { + ret = do_forward_layer(layer, blob_mats_gpu_image, cmd, opt); + } if (ret == -100) { image_allocation_failed = true; @@ -686,7 +735,14 @@ int NetPrivate::forward_layer(int layer_index, std::vector& blob_mats, std: } else { - ret = do_forward_layer(layer, blob_mats_gpu, cmd, opt); + if (layer->featmask) + { + ret = do_forward_layer(layer, blob_mats_gpu, cmd, get_masked_option(opt, layer->featmask)); + } + else + { + ret = do_forward_layer(layer, blob_mats_gpu, cmd, opt); + } } #if NCNN_BENCHMARK cmd.record_write_timestamp(layer_index * 2 + 1); @@ -703,7 +759,14 @@ int NetPrivate::forward_layer(int layer_index, std::vector& blob_mats, std: bottom_blob = blob_mats[bottom_blob_index].shape(); } #endif - ret = do_forward_layer(layer, blob_mats, opt); + if (layer->featmask) + { + ret = do_forward_layer(layer, blob_mats, get_masked_option(opt, layer->featmask)); + } + else + { + ret = do_forward_layer(layer, blob_mats, opt); + } #if NCNN_BENCHMARK double end = get_current_time(); if (layer->one_blob_only) @@ -790,6 +853,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio // *INDENT-ON* // clang-format on + int dst_elempack = 1; if (opt.use_packing_layout) { // resolve dst_elempack @@ -801,7 +865,6 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio int elembits = bottom_blob.elembits(); - int dst_elempack = 1; if (layer->support_packing) { if (elembits == 32) @@ -855,13 +918,13 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio #endif } } + } - if (bottom_blob.elempack != dst_elempack) - { - Mat bottom_blob_packed; - convert_packing(bottom_blob, bottom_blob_packed, dst_elempack, opt); - bottom_blob = bottom_blob_packed; - } + if (bottom_blob.elempack != dst_elempack) + { + Mat bottom_blob_packed; + convert_packing(bottom_blob, bottom_blob_packed, dst_elempack, opt); + bottom_blob = bottom_blob_packed; } return 0; @@ -1571,6 +1634,9 @@ int Net::load_param(const DataReader& dr) layer->top_shapes[j] = d->blobs[layer->tops[j]].shape; } + // pull out layer 
specific feature disabled set + layer->featmask = pd.get(31, 0); + int lr = layer->load_param(pd); if (lr != 0) { @@ -1774,6 +1840,9 @@ int Net::load_param_bin(const DataReader& dr) layer->top_shapes[j] = d->blobs[layer->tops[j]].shape; } + // pull out layer specific feature disabled set + layer->featmask = pd.get(31, 0); + int lr = layer->load_param(pd); if (lr != 0) { @@ -1855,12 +1924,17 @@ int Net::load_model(const DataReader& dr) { Layer* layer = d->layers[i]; - Option opt1 = opt; + Option opt1 = get_masked_option(opt, layer->featmask); #if NCNN_VULKAN - if (opt.use_vulkan_compute) + if (opt1.use_vulkan_compute) { if (!layer->support_image_storage) opt1.use_image_storage = false; } + else + { + layer->vkdev = 0; + layer->support_vulkan = false; + } #endif // NCNN_VULKAN int cret = layer->create_pipeline(opt1); @@ -1891,7 +1965,7 @@ int Net::load_model(const DataReader& dr) if (!d->local_workspace_allocator) { d->local_workspace_allocator = new PoolAllocator; - d->local_workspace_allocator->set_size_compare_ratio(0.5f); + d->local_workspace_allocator->set_size_compare_ratio(0.f); } } } @@ -2066,11 +2140,13 @@ void Net::clear() { Layer* layer = d->layers[i]; - Option opt1 = opt; + Option opt1 = get_masked_option(opt, layer->featmask); +#if NCNN_VULKAN if (!layer->support_image_storage) { opt1.use_image_storage = false; } +#endif // NCNN_VULKAN int dret = layer->destroy_pipeline(opt1); if (dret != 0) diff --git a/src/option.cpp b/src/option.cpp index 4aabfdde5ed0..80d4455307ef 100644 --- a/src/option.cpp +++ b/src/option.cpp @@ -21,7 +21,7 @@ namespace ncnn { Option::Option() { lightmode = true; - num_threads = get_big_cpu_count(); + num_threads = get_physical_big_cpu_count(); blob_allocator = 0; workspace_allocator = 0; diff --git a/src/platform.h.in b/src/platform.h.in index 755f8294bc29..219cff4aada9 100644 --- a/src/platform.h.in +++ b/src/platform.h.in @@ -55,6 +55,7 @@ #cmakedefine01 NCNN_ARM86SVEF32MM #endif // __aarch64__ #cmakedefine01 NCNN_MSA +#cmakedefine01 NCNN_LSX #cmakedefine01 NCNN_MMI #cmakedefine01 NCNN_RVV #cmakedefine01 NCNN_INT8 diff --git a/src/simplestl.h b/src/simplestl.h index b8454c40ae55..00ff46801867 100644 --- a/src/simplestl.h +++ b/src/simplestl.h @@ -508,7 +508,7 @@ struct vector { capacity_ = new_size * 2; T* new_data = (T*)new char[capacity_ * sizeof(T)]; - memset(new_data, 0, capacity_ * sizeof(T)); + memset(static_cast<void*>(new_data), 0, capacity_ * sizeof(T)); if (data_) { memmove(new_data, data_, sizeof(T) * size_); diff --git a/src/stb_image.h b/src/stb_image.h index 35e20150b462..6aad778aba19 100644 --- a/src/stb_image.h +++ b/src/stb_image.h @@ -4851,7 +4851,7 @@ static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0); if (p == NULL) return stbi__err("outofmem", "Out of memory"); - // between here and free(out) below, exitting would leak + // between here and free(out) below, exiting would leak temp_out = p; if (pal_img_n == 3) { diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a88c6562db26..967fbd72befa 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -19,7 +19,18 @@ macro(ncnn_add_layer_test class) # enable if WITH_LAYER_xxx option ON if(${WITH_LAYER_${name}}) - ncnn_add_test(${name}) + file(GLOB test_${name}_SRCS "test_${name}.cpp" "test_${name}_*.cpp" LIST_DIRECTORIES FALSE) + + foreach(test_file ${test_${name}_SRCS}) + get_filename_component(test_filename ${test_file} NAME_WE) + add_executable(${test_filename} ${test_file}) + 
target_link_libraries(${test_filename} PRIVATE ncnn) + + add_test(NAME ${test_filename} COMMAND ${CMAKE_COMMAND} -DTEST_EXECUTABLE=$<TARGET_FILE:${test_filename}> -P ${CMAKE_CURRENT_SOURCE_DIR}/../cmake/run_test.cmake) + + # add test to a virtual project group + set_property(TARGET ${test_filename} PROPERTY FOLDER "tests") + endforeach() endif() endmacro() @@ -85,8 +96,11 @@ ncnn_add_layer_test(Eltwise) ncnn_add_layer_test(ELU) ncnn_add_layer_test(ExpandDims) ncnn_add_layer_test(Flatten) +ncnn_add_layer_test(Fold) ncnn_add_layer_test(GELU) +ncnn_add_layer_test(GLU) ncnn_add_layer_test(Gemm) +ncnn_add_layer_test(GridSample) ncnn_add_layer_test(GroupNorm) ncnn_add_layer_test(GRU) ncnn_add_layer_test(HardSigmoid) @@ -134,4 +148,5 @@ ncnn_add_layer_test(Swish) ncnn_add_layer_test(TanH) ncnn_add_layer_test(Tile) ncnn_add_layer_test(UnaryOp) +ncnn_add_layer_test(Unfold) ncnn_add_layer_test(Yolov3DetectionOutput) diff --git a/tests/test_binaryop.cpp b/tests/test_binaryop.cpp index 44e3d1b369ed..f79ec024be14 100644 --- a/tests/test_binaryop.cpp +++ b/tests/test_binaryop.cpp @@ -382,7 +382,7 @@ int main() { SRAND(7767517); - for (op_type = 0; op_type < OP_TYPE_MAX; op_type++) + for (op_type = 0; op_type < 3; op_type++) { int ret = 0 || test_binaryop_1() diff --git a/tests/test_binaryop_1.cpp b/tests/test_binaryop_1.cpp new file mode 100644 index 000000000000..bc0ec9c8927f --- /dev/null +++ b/tests/test_binaryop_1.cpp @@ -0,0 +1,431 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "layer/binaryop.h" +#include "testutil.h" + +#define OP_TYPE_MAX 9 + +static int op_type = 0; + +static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b) +{ + ncnn::Mat a = _a; + ncnn::Mat b = _b; + if (op_type == 6) + { + // value must be positive for pow + Randomize(a, 0.001f, 2.f); + Randomize(b, 0.001f, 2.f); + } + if (op_type == 3 || op_type == 8) + { + // value must be positive for div + Randomize(a, 0.1f, 10.f); + Randomize(b, 0.1f, 10.f); + } + + ncnn::ParamDict pd; + pd.set(0, op_type); + pd.set(1, 0); // with_scalar + pd.set(2, 0.f); // b + + std::vector<ncnn::Mat> weights(0); + + std::vector<ncnn::Mat> ab(2); + ab[0] = a; + ab[1] = b; + + int ret = test_layer("BinaryOp", pd, weights, ab); + if (ret != 0) + { + fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b.dims=%d b=(%d %d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b.dims, b.w, b.h, b.d, b.c, op_type); + } + + return ret; +} + +static int test_binaryop(const ncnn::Mat& _a, float b) +{ + ncnn::Mat a = _a; + if (op_type == 6) + { + // value must be positive for pow + Randomize(a, 0.001f, 2.f); + b = RandomFloat(0.001f, 2.f); + } + + ncnn::ParamDict pd; + pd.set(0, op_type); + pd.set(1, 1); // with_scalar + pd.set(2, b); // b + + std::vector<ncnn::Mat> weights(0); + + int ret = test_layer("BinaryOp", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b=%f op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b, op_type); + } + + return ret; +} + +// https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting + +static int test_binaryop_1() +{ + return 0 + || test_binaryop(RandomMat(1), 1.f); +} + +static int test_binaryop_2() +{ + return 0 + || test_binaryop(RandomMat(1), RandomMat(1)) + || test_binaryop(RandomMat(1), RandomMat(4)) + || test_binaryop(RandomMat(1), RandomMat(16)); +} + +static int test_binaryop_3() +{ + return 0 + || test_binaryop(RandomMat(1), RandomMat(11, 3)) + || test_binaryop(RandomMat(1), RandomMat(11, 4)) + || test_binaryop(RandomMat(1), RandomMat(11, 16)); +} + +static int test_binaryop_4() +{ + return 0 + || test_binaryop(RandomMat(1), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(1), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(1), RandomMat(11, 6, 16)); +} + +static int test_binaryop_5() +{ + return 0 + || test_binaryop(RandomMat(2), 1.f) + || test_binaryop(RandomMat(4), 1.f) + || test_binaryop(RandomMat(16), 1.f); +} + +static int test_binaryop_6() +{ + return 0 + || test_binaryop(RandomMat(2), RandomMat(1)) + || test_binaryop(RandomMat(4), RandomMat(1)) + || test_binaryop(RandomMat(16), RandomMat(1)); +} + +static int test_binaryop_7() +{ + return 0 + || test_binaryop(RandomMat(2), RandomMat(2)) + || test_binaryop(RandomMat(4), RandomMat(4)) + || test_binaryop(RandomMat(16), RandomMat(16)); +} + +static int test_binaryop_8() +{ + return 0 + || test_binaryop(RandomMat(3), RandomMat(11, 3)) + || test_binaryop(RandomMat(4), RandomMat(11, 4)) + || test_binaryop(RandomMat(16), RandomMat(11, 16)); +} + +static int test_binaryop_9() +{ + return 0 + || test_binaryop(RandomMat(2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_10() +{ + return 0 + || test_binaryop(RandomMat(11, 3), 1.f) + || test_binaryop(RandomMat(11, 4), 1.f) + || test_binaryop(RandomMat(11, 16), 1.f); +} + +static int test_binaryop_11() +{ + return 0 + || test_binaryop(RandomMat(11, 3), RandomMat(1)) + || test_binaryop(RandomMat(11, 4), RandomMat(1)) + || 
test_binaryop(RandomMat(11, 16), RandomMat(1)); +} + +static int test_binaryop_12() +{ + return 0 + || test_binaryop(RandomMat(11, 3), RandomMat(3)) + || test_binaryop(RandomMat(11, 4), RandomMat(4)) + || test_binaryop(RandomMat(11, 16), RandomMat(16)); +} + +static int test_binaryop_13() +{ + return 0 + || test_binaryop(RandomMat(11, 3), RandomMat(11, 3)) + || test_binaryop(RandomMat(11, 4), RandomMat(11, 4)) + || test_binaryop(RandomMat(11, 16), RandomMat(11, 16)); +} + +static int test_binaryop_14() +{ + return 0 + || test_binaryop(RandomMat(6, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(6, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(6, 16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_15() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), 1.f) + || test_binaryop(RandomMat(11, 6, 4), 1.f) + || test_binaryop(RandomMat(11, 6, 16), 1.f); +} + +static int test_binaryop_16() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(1)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(1)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(1)); +} + +static int test_binaryop_17() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(16)); +} + +static int test_binaryop_18() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(6, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(6, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(6, 16)); +} + +static int test_binaryop_19() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_20() +{ + return 0 + || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_21() +{ + return 0 + || test_binaryop(RandomMat(2), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(4), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(16), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_22() +{ + return 0 + || test_binaryop(RandomMat(4, 2), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(4, 4), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(4, 16), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_23() +{ + return 0 + || test_binaryop(RandomMat(3, 4, 2), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(3, 4, 4), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(3, 4, 16), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_24() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), 1.f) + || test_binaryop(RandomMat(11, 3, 4, 4), 1.f) + || test_binaryop(RandomMat(11, 3, 4, 16), 1.f); +} + +static int test_binaryop_25() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(1)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(1)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(1)); +} + +static int test_binaryop_26() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(2)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(4)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(16)); +} + +static int test_binaryop_27() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(4, 2)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(4, 4)) + || test_binaryop(RandomMat(11, 3, 4, 16), 
RandomMat(4, 16)); +} + +static int test_binaryop_28() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(3, 4, 2)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(3, 4, 4)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(3, 4, 16)); +} + +static int test_binaryop_29() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_s1() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(1, 1, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(1, 1, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(1, 1, 16)); +} + +static int test_binaryop_s2() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(11, 6, 1)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(11, 6, 1)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(11, 6, 1)); +} + +static int test_binaryop_s3() +{ + return 0 + || test_binaryop(RandomMat(1, 1, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(1, 1, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(1, 1, 16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_s4() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 1), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(11, 6, 1), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(11, 6, 1), RandomMat(11, 6, 16)); +} + +static int test_binaryop_s5() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(1, 6, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(1, 6, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(1, 6, 16)); +} + +static int test_binaryop_s6() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(11, 1, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(11, 1, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(11, 1, 16)); +} + +static int test_binaryop_s7() +{ + return 0 + || test_binaryop(RandomMat(1, 6, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(1, 6, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(1, 6, 16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_s8() +{ + return 0 + || test_binaryop(RandomMat(11, 1, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(11, 1, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(11, 1, 16), RandomMat(11, 6, 16)); +} + +int main() +{ + SRAND(7767517); + + for (op_type = 3; op_type < 6; op_type++) + { + int ret = 0 + || test_binaryop_1() + || test_binaryop_2() + || test_binaryop_3() + || test_binaryop_4() + || test_binaryop_5() + || test_binaryop_6() + || test_binaryop_7() + || test_binaryop_8() + || test_binaryop_9() + || test_binaryop_10() + || test_binaryop_11() + || test_binaryop_12() + || test_binaryop_13() + || test_binaryop_14() + || test_binaryop_15() + || test_binaryop_16() + || test_binaryop_17() + || test_binaryop_18() + || test_binaryop_19() + || test_binaryop_20() + || test_binaryop_21() + || test_binaryop_22() + || test_binaryop_23() + || test_binaryop_24() + || test_binaryop_25() + || test_binaryop_26() + || test_binaryop_27() + || test_binaryop_28() + || test_binaryop_29() + || test_binaryop_s1() + || test_binaryop_s2() + || test_binaryop_s3() + || test_binaryop_s4() + || test_binaryop_s5() + || test_binaryop_s6() + || test_binaryop_s7() + || test_binaryop_s8(); + + if (ret != 0) + return ret; + } + + return 0; +} diff --git a/tests/test_binaryop_2.cpp b/tests/test_binaryop_2.cpp new file mode 100644 index 000000000000..1608a2880b06 --- 
/dev/null +++ b/tests/test_binaryop_2.cpp @@ -0,0 +1,431 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/binaryop.h" +#include "testutil.h" + +#define OP_TYPE_MAX 9 + +static int op_type = 0; + +static int test_binaryop(const ncnn::Mat& _a, const ncnn::Mat& _b) +{ + ncnn::Mat a = _a; + ncnn::Mat b = _b; + if (op_type == 6) + { + // value must be positive for pow + Randomize(a, 0.001f, 2.f); + Randomize(b, 0.001f, 2.f); + } + if (op_type == 3 || op_type == 8) + { + // value must be positive for pow + Randomize(a, 0.1f, 10.f); + Randomize(b, 0.1f, 10.f); + } + + ncnn::ParamDict pd; + pd.set(0, op_type); + pd.set(1, 0); // with_scalar + pd.set(2, 0.f); // b + + std::vector weights(0); + + std::vector ab(2); + ab[0] = a; + ab[1] = b; + + int ret = test_layer("BinaryOp", pd, weights, ab); + if (ret != 0) + { + fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b.dims=%d b=(%d %d %d %d) op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b.dims, b.w, b.h, b.d, b.c, op_type); + } + + return ret; +} + +static int test_binaryop(const ncnn::Mat& _a, float b) +{ + ncnn::Mat a = _a; + if (op_type == 6) + { + // value must be positive for pow + Randomize(a, 0.001f, 2.f); + b = RandomFloat(0.001f, 2.f); + } + + ncnn::ParamDict pd; + pd.set(0, op_type); + pd.set(1, 1); // with_scalar + pd.set(2, b); // b + + std::vector weights(0); + + int ret = test_layer("BinaryOp", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_binaryop failed a.dims=%d a=(%d %d %d %d) b=%f op_type=%d\n", a.dims, a.w, a.h, a.d, a.c, b, op_type); + } + + return ret; +} + +// https://github.com/Tencent/ncnn/wiki/binaryop-broadcasting + +static int test_binaryop_1() +{ + return 0 + || test_binaryop(RandomMat(1), 1.f); +} + +static int test_binaryop_2() +{ + return 0 + || test_binaryop(RandomMat(1), RandomMat(1)) + || test_binaryop(RandomMat(1), RandomMat(4)) + || test_binaryop(RandomMat(1), RandomMat(16)); +} + +static int test_binaryop_3() +{ + return 0 + || test_binaryop(RandomMat(1), RandomMat(11, 3)) + || test_binaryop(RandomMat(1), RandomMat(11, 4)) + || test_binaryop(RandomMat(1), RandomMat(11, 16)); +} + +static int test_binaryop_4() +{ + return 0 + || test_binaryop(RandomMat(1), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(1), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(1), RandomMat(11, 6, 16)); +} + +static int test_binaryop_5() +{ + return 0 + || test_binaryop(RandomMat(2), 1.f) + || test_binaryop(RandomMat(4), 1.f) + || test_binaryop(RandomMat(16), 1.f); +} + +static int test_binaryop_6() +{ + return 0 + || test_binaryop(RandomMat(2), RandomMat(1)) + || test_binaryop(RandomMat(4), RandomMat(1)) + || test_binaryop(RandomMat(16), RandomMat(1)); +} + +static int test_binaryop_7() +{ + return 0 + || test_binaryop(RandomMat(2), RandomMat(2)) + || test_binaryop(RandomMat(4), RandomMat(4)) + || 
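// A minimal sketch of what one of the blob-vs-blob cases above boils down to when the
// BinaryOp layer is driven through the public layer API directly.  test_layer() in
// testutil.h additionally covers packed, fp16 and GPU variants, which this sketch skips.
// The ParamDict keys (0 = op_type, 1 = with_scalar, 2 = b) are the ones used by the
// helper above; op_type 0 is assumed to be addition.
#include "layer.h" // ncnn/layer.h in an installed layout; pulls in Mat, Option and ParamDict
#include <vector>

static int run_binaryop_add(const ncnn::Mat& a, const ncnn::Mat& b, ncnn::Mat& out)
{
    ncnn::Layer* op = ncnn::create_layer("BinaryOp");

    ncnn::ParamDict pd;
    pd.set(0, 0);   // op_type, 0 assumed to be addition
    pd.set(1, 0);   // with_scalar off, the second operand is a blob
    pd.set(2, 0.f); // scalar b, unused when with_scalar is 0
    op->load_param(pd);

    ncnn::Option opt;
    opt.num_threads = 1;
    op->create_pipeline(opt);

    std::vector<ncnn::Mat> bottoms(2);
    bottoms[0] = a; // e.g. a full blob such as RandomMat(11, 6, 16)
    bottoms[1] = b; // e.g. RandomMat(16), broadcast over w and h for each channel
    std::vector<ncnn::Mat> tops(1);
    int ret = op->forward(bottoms, tops, opt);

    out = tops[0];

    op->destroy_pipeline(opt);
    delete op;
    return ret;
}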
test_binaryop(RandomMat(16), RandomMat(16)); +} + +static int test_binaryop_8() +{ + return 0 + || test_binaryop(RandomMat(3), RandomMat(11, 3)) + || test_binaryop(RandomMat(4), RandomMat(11, 4)) + || test_binaryop(RandomMat(16), RandomMat(11, 16)); +} + +static int test_binaryop_9() +{ + return 0 + || test_binaryop(RandomMat(2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_10() +{ + return 0 + || test_binaryop(RandomMat(11, 3), 1.f) + || test_binaryop(RandomMat(11, 4), 1.f) + || test_binaryop(RandomMat(11, 16), 1.f); +} + +static int test_binaryop_11() +{ + return 0 + || test_binaryop(RandomMat(11, 3), RandomMat(1)) + || test_binaryop(RandomMat(11, 4), RandomMat(1)) + || test_binaryop(RandomMat(11, 16), RandomMat(1)); +} + +static int test_binaryop_12() +{ + return 0 + || test_binaryop(RandomMat(11, 3), RandomMat(3)) + || test_binaryop(RandomMat(11, 4), RandomMat(4)) + || test_binaryop(RandomMat(11, 16), RandomMat(16)); +} + +static int test_binaryop_13() +{ + return 0 + || test_binaryop(RandomMat(11, 3), RandomMat(11, 3)) + || test_binaryop(RandomMat(11, 4), RandomMat(11, 4)) + || test_binaryop(RandomMat(11, 16), RandomMat(11, 16)); +} + +static int test_binaryop_14() +{ + return 0 + || test_binaryop(RandomMat(6, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(6, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(6, 16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_15() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), 1.f) + || test_binaryop(RandomMat(11, 6, 4), 1.f) + || test_binaryop(RandomMat(11, 6, 16), 1.f); +} + +static int test_binaryop_16() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(1)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(1)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(1)); +} + +static int test_binaryop_17() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(16)); +} + +static int test_binaryop_18() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(6, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(6, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(6, 16)); +} + +static int test_binaryop_19() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_20() +{ + return 0 + || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(1), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_21() +{ + return 0 + || test_binaryop(RandomMat(2), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(4), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(16), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_22() +{ + return 0 + || test_binaryop(RandomMat(4, 2), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(4, 4), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(4, 16), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_23() +{ + return 0 + || test_binaryop(RandomMat(3, 4, 2), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(3, 4, 4), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(3, 4, 16), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_24() +{ + return 0 + || 
test_binaryop(RandomMat(11, 3, 4, 2), 1.f) + || test_binaryop(RandomMat(11, 3, 4, 4), 1.f) + || test_binaryop(RandomMat(11, 3, 4, 16), 1.f); +} + +static int test_binaryop_25() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(1)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(1)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(1)); +} + +static int test_binaryop_26() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(2)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(4)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(16)); +} + +static int test_binaryop_27() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(4, 2)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(4, 4)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(4, 16)); +} + +static int test_binaryop_28() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(3, 4, 2)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(3, 4, 4)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(3, 4, 16)); +} + +static int test_binaryop_29() +{ + return 0 + || test_binaryop(RandomMat(11, 3, 4, 2), RandomMat(11, 3, 4, 2)) + || test_binaryop(RandomMat(11, 3, 4, 4), RandomMat(11, 3, 4, 4)) + || test_binaryop(RandomMat(11, 3, 4, 16), RandomMat(11, 3, 4, 16)); +} + +static int test_binaryop_s1() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(1, 1, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(1, 1, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(1, 1, 16)); +} + +static int test_binaryop_s2() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(11, 6, 1)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(11, 6, 1)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(11, 6, 1)); +} + +static int test_binaryop_s3() +{ + return 0 + || test_binaryop(RandomMat(1, 1, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(1, 1, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(1, 1, 16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_s4() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 1), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(11, 6, 1), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(11, 6, 1), RandomMat(11, 6, 16)); +} + +static int test_binaryop_s5() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(1, 6, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(1, 6, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(1, 6, 16)); +} + +static int test_binaryop_s6() +{ + return 0 + || test_binaryop(RandomMat(11, 6, 2), RandomMat(11, 1, 2)) + || test_binaryop(RandomMat(11, 6, 4), RandomMat(11, 1, 4)) + || test_binaryop(RandomMat(11, 6, 16), RandomMat(11, 1, 16)); +} + +static int test_binaryop_s7() +{ + return 0 + || test_binaryop(RandomMat(1, 6, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(1, 6, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(1, 6, 16), RandomMat(11, 6, 16)); +} + +static int test_binaryop_s8() +{ + return 0 + || test_binaryop(RandomMat(11, 1, 2), RandomMat(11, 6, 2)) + || test_binaryop(RandomMat(11, 1, 4), RandomMat(11, 6, 4)) + || test_binaryop(RandomMat(11, 1, 16), RandomMat(11, 6, 16)); +} + +int main() +{ + SRAND(7767517); + + for (op_type = 6; op_type < OP_TYPE_MAX; op_type++) + { + int ret = 0 + || test_binaryop_1() + || test_binaryop_2() + || test_binaryop_3() + || test_binaryop_4() + || test_binaryop_5() + || test_binaryop_6() + || test_binaryop_7() + || test_binaryop_8() + || test_binaryop_9() + || test_binaryop_10() + || 
test_binaryop_11() + || test_binaryop_12() + || test_binaryop_13() + || test_binaryop_14() + || test_binaryop_15() + || test_binaryop_16() + || test_binaryop_17() + || test_binaryop_18() + || test_binaryop_19() + || test_binaryop_20() + || test_binaryop_21() + || test_binaryop_22() + || test_binaryop_23() + || test_binaryop_24() + || test_binaryop_25() + || test_binaryop_26() + || test_binaryop_27() + || test_binaryop_28() + || test_binaryop_29() + || test_binaryop_s1() + || test_binaryop_s2() + || test_binaryop_s3() + || test_binaryop_s4() + || test_binaryop_s5() + || test_binaryop_s6() + || test_binaryop_s7() + || test_binaryop_s8(); + + if (ret != 0) + return ret; + } + + return 0; +} diff --git a/tests/test_c_api.cpp b/tests/test_c_api.cpp index 2c6dac4a8a24..7cdfc940f1d3 100644 --- a/tests/test_c_api.cpp +++ b/tests/test_c_api.cpp @@ -243,9 +243,15 @@ static int test_c_api_2() emptydr->read = emptydr_read; } + ncnn_allocator_t blob_allocator = ncnn_allocator_create_pool_allocator(); + ncnn_allocator_t workspace_allocator = ncnn_allocator_create_unlocked_pool_allocator(); + ncnn_option_t opt = ncnn_option_create(); { ncnn_option_set_num_threads(opt, 1); + + ncnn_option_set_blob_allocator(opt, blob_allocator); + ncnn_option_set_workspace_allocator(opt, workspace_allocator); } ncnn_net_t net = ncnn_net_create(); @@ -260,7 +266,7 @@ static int test_c_api_2() ncnn_net_load_model_datareader(net, emptydr); } - ncnn_mat_t a = ncnn_mat_create_1d(24, NULL); + ncnn_mat_t a = ncnn_mat_create_1d(24, blob_allocator); // set a { @@ -274,7 +280,7 @@ static int test_c_api_2() memcpy(a_data, data, 24 * sizeof(float)); } - ncnn_mat_t b = ncnn_mat_reshape_3d(a, 4, 2, 3, NULL); + ncnn_mat_t b = ncnn_mat_reshape_3d(a, 4, 2, 3, blob_allocator); ncnn_mat_t c = 0; { @@ -321,6 +327,9 @@ static int test_c_api_2() ncnn_option_destroy(opt); + ncnn_allocator_destroy(blob_allocator); + ncnn_allocator_destroy(workspace_allocator); + ncnn_datareader_destroy(emptydr); if (!success) diff --git a/tests/test_convolution.cpp b/tests/test_convolution.cpp index 5f7f5d209934..9140750e8c11 100644 --- a/tests/test_convolution.cpp +++ b/tests/test_convolution.cpp @@ -82,7 +82,7 @@ static int test_convolution_0() {7, 2, 1, -233}, }; - for (int i = 0; i < 16; i++) + for (int i = 0; i < 12; i++) { const int k = kdsp[i][0]; const int d = kdsp[i][1]; @@ -125,313 +125,12 @@ static int test_convolution_0() return -1; } - return 0 - || test_convolution(7, 5, 1, 4, 3, 1, 1, 1, 1) - || test_convolution(14, 5, 1, 4, 3, 1, 2, 1, 1) - || test_convolution(11, 5, 2, 12, 2, 2, 2, 1, 1) - || test_convolution(15, 11, 4, 4, 3, 1, 1, 1, 1) - || test_convolution(15, 11, 8, 8, 3, 1, 1, 1, 1) - || test_convolution(11, 11, 8, 16, 3, 1, 1, 1, 1) - || test_convolution(13, 16, 16, 24, 3, 1, 1, 1, 1) - || test_convolution(20, 19, 24, 24, 3, 1, 1, 1, 1) - || test_convolution(8, 8, 16, 24, 3, 1, 1, 1, 0) - || test_convolution(4, 8, 16, 24, 3, 1, 1, 1, 1) - || test_convolution(4, 20, 16, 24, 3, 1, 1, 1, 0) - || test_convolution(6, 7, 64, 64, 3, 1, 2, 0, 1) - || test_convolution(15, 17, 24, 32, 1, 1, 1, 0, 0) - || test_convolution(15, 17, 24, 32, 1, 1, 2, 0, 1) - || test_convolution(15, 17, 24, 32, 3, 1, 2, 0, 1) - || test_convolution(15, 17, 32, 24, 1, 1, 1, 0, 0) - || test_convolution(15, 17, 32, 24, 1, 1, 2, 0, 1) - || test_convolution(15, 17, 32, 24, 3, 1, 2, 0, 1) - || test_convolution(15, 17, 32, 28, 1, 1, 1, 0, 0) - || test_convolution(15, 17, 32, 28, 1, 1, 2, 0, 1) - || test_convolution(15, 17, 32, 28, 3, 1, 2, 0, 1) - || test_convolution(15, 17, 26, 
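// The test_c_api.cpp hunk above swaps the default (NULL) allocator for explicit pool
// allocators.  A minimal sketch of the create/use/destroy ordering, using only functions
// that appear in that hunk plus ncnn_mat_destroy(); the rule that pooled mats should be
// released before their allocator is an assumption drawn from the test's cleanup order.
#include "c_api.h" // installed as ncnn/c_api.h in some layouts

static void allocator_lifetime_sketch()
{
    ncnn_allocator_t blob_allocator = ncnn_allocator_create_pool_allocator();
    ncnn_allocator_t workspace_allocator = ncnn_allocator_create_unlocked_pool_allocator();

    ncnn_option_t opt = ncnn_option_create();
    ncnn_option_set_num_threads(opt, 1);
    ncnn_option_set_blob_allocator(opt, blob_allocator);
    ncnn_option_set_workspace_allocator(opt, workspace_allocator);

    // blobs drawn from the pool allocator
    ncnn_mat_t a = ncnn_mat_create_1d(24, blob_allocator);

    // ... run extraction with opt ...

    // release blobs first, then the option, then the allocators
    ncnn_mat_destroy(a);
    ncnn_option_destroy(opt);
    ncnn_allocator_destroy(blob_allocator);
    ncnn_allocator_destroy(workspace_allocator);
}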
32, 1, 1, 1, 0, 0) - || test_convolution(15, 17, 26, 32, 1, 1, 2, 0, 1) - || test_convolution(15, 17, 26, 32, 3, 1, 2, 0, 1) - || test_convolution(15, 17, 32, 26, 1, 1, 1, 0, 0) - || test_convolution(15, 17, 32, 26, 1, 1, 2, 0, 1) - || test_convolution(15, 17, 32, 26, 3, 1, 2, 0, 1) - || test_convolution(30, 30, 32, 26, 3, 1, 1, 1, 0) - || test_convolution(12, 18, 8, 16, 3, 1, 1, 1, 1) - || test_convolution(42, 18, 32, 160, 3, 1, 1, 1, 1) - || test_convolution(12, 18, 32, 160, 3, 1, 1, 1, 1) - || test_convolution(12, 18, 4, 12, 3, 1, 1, 1, 1) - || test_convolution(42, 18, 28, 140, 3, 1, 1, 1, 1) - || test_convolution(12, 18, 28, 140, 3, 1, 1, 1, 1); -} - -static int test_convolution_vec(int w, int outch, int kernel, int dilation, int stride, int pad, int bias) -{ - ncnn::Mat a = RandomMat(w); - - ncnn::ParamDict pd; - pd.set(0, outch); // num_output - pd.set(1, kernel); // kernel_w - pd.set(2, dilation); // dilation_w - pd.set(3, stride); // stride_w - pd.set(4, pad); // pad_w - pd.set(5, bias); // bias_term - pd.set(6, outch * w * kernel * kernel); - - int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 - ncnn::Mat activation_params(2); - activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha - activation_params[1] = RandomFloat(0, 1); // beta - pd.set(9, activation_type); - pd.set(10, activation_params); - - std::vector weights(bias ? 2 : 1); - weights[0] = RandomMat(outch * w * kernel * kernel); - if (bias) - weights[1] = RandomMat(outch); - - int ret = test_layer("Convolution", pd, weights, a); - if (ret != 0) - { - fprintf(stderr, "test_convolution_vec failed w=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); - } - - return ret; -} - -static int test_convolution_2() -{ - return 0 - || test_convolution_vec(1, 1, 1, 1, 1, 0, 1) - || test_convolution_vec(11, 12, 1, 1, 1, 0, 0) - || test_convolution_vec(20, 15, 1, 1, 1, 0, 1) - || test_convolution_vec(12, 20, 1, 1, 1, 0, 0) - || test_convolution_vec(3, 24, 1, 1, 1, 0, 1) - || test_convolution_vec(24, 5, 1, 1, 1, 0, 0) - || test_convolution_vec(32, 24, 1, 1, 1, 0, 1) - || test_convolution_vec(12, 32, 1, 1, 1, 0, 0) - || test_convolution_vec(64, 20, 1, 1, 1, 0, 1) - || test_convolution_vec(64, 128, 1, 1, 1, 0, 0); -} - -static int test_convolution_dynamic(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) -{ - ncnn::Mat a = RandomMat(w, h, c); - - ncnn::ParamDict pd; - pd.set(0, 0); - pd.set(1, 0); - pd.set(2, dilation); - pd.set(3, stride); - pd.set(4, pad); - pd.set(5, bias); - pd.set(6, 0); - pd.set(19, 1); // dynamic weight - - int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 - ncnn::Mat activation_params(2); - activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha - activation_params[1] = RandomFloat(0, 1); // beta - pd.set(9, activation_type); - pd.set(10, activation_params); - - std::vector as(bias ? 
3 : 2); - as[0] = a; - as[1] = RandomMat(kernel, kernel, c, outch); - if (bias) - as[2] = RandomMat(outch); - - std::vector weights(0); - - int ret = test_layer("Convolution", pd, weights, as); - if (ret != 0) - { - fprintf(stderr, "test_convolution_dynamic failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); - } - - return ret; -} - -static int test_convolution_3() -{ - static const int kdsp[7][4] = { - {1, 1, 1, 0}, - {1, 1, 2, 0}, - {2, 1, 1, 1}, - {2, 1, 2, -233}, - {3, 1, 1, 1}, - {3, 1, 2, 1}, - {3, 2, 1, -234}, - }; - - for (int i = 0; i < 7; i++) - { - const int k = kdsp[i][0]; - const int d = kdsp[i][1]; - const int s = kdsp[i][2]; - const int p = kdsp[i][3]; - - int ret = 0 - || test_convolution_dynamic(11, 10, 1, 1, k, d, s, p, 1) - || test_convolution_dynamic(11, 10, 4, 13, k, d, s, p, 0) - || test_convolution_dynamic(11, 10, 13, 4, k, d, s, p, 1) - || test_convolution_dynamic(11, 10, 12, 12, k, d, s, p, 0) - || test_convolution_dynamic(11, 10, 8, 12, k, d, s, p, 1) - || test_convolution_dynamic(11, 10, 8, 13, k, d, s, p, 0) - || test_convolution_dynamic(11, 10, 13, 8, k, d, s, p, 1) - || test_convolution_dynamic(11, 10, 12, 16, k, d, s, p, 0) - || test_convolution_dynamic(11, 10, 15, 15, k, d, s, p, 0) - || test_convolution_dynamic(11, 10, 16, 16, k, d, s, p, 0); - - if (ret != 0) - return -1; - } - return 0; } -#if NCNN_INT8 -static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, bool requant = false) -{ - ncnn::Mat a = RandomMat(w, h, c); - - ncnn::ParamDict pd; - pd.set(0, outch); - pd.set(1, kernel); - pd.set(2, dilation); - pd.set(3, stride); - pd.set(4, pad); - pd.set(5, bias); - pd.set(6, outch * c * kernel * kernel); - pd.set(8, requant ? 101 : 1); // int8_scale_term - - int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 - ncnn::Mat activation_params(2); - activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha - activation_params[1] = RandomFloat(0, 1); // beta - pd.set(9, activation_type); - pd.set(10, activation_params); - - std::vector weights(bias ? 5 : 4); - weights[0] = RandomMat(outch * c * kernel * kernel); - - ncnn::Mat weight_scales = scales_mat(weights[0], outch, c * kernel * kernel, c * kernel * kernel); - ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep); - ncnn::Mat top_scales = requant ? scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat(); - if (bias) - { - weights[1] = RandomMat(outch); - weights[2] = weight_scales; - weights[3] = input_scales; - weights[4] = top_scales; - } - else - { - weights[1] = weight_scales; - weights[2] = input_scales; - weights[3] = top_scales; - } - - int flag = TEST_LAYER_DISABLE_GPU_TESTING; - int ret = test_layer("Convolution", pd, weights, a, requant ? 
1.0f : 0.001f, 0, flag); - if (ret != 0) - { - fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]); - } - - return ret; -} - -static int test_convolution_1() -{ - static const int kdsp[16][4] = { - {1, 1, 1, 0}, - {1, 1, 2, 0}, - {2, 1, 1, 1}, - {2, 1, 2, -233}, - {3, 1, 1, 1}, - {3, 1, 2, 1}, - {3, 2, 1, 1}, - {4, 1, 1, 2}, - {4, 1, 2, -233}, - {4, 2, 1, -234}, - {5, 1, 1, -234}, - {5, 1, 2, 2}, - {5, 2, 2, 2}, - {7, 1, 1, 3}, - {7, 1, 2, 3}, - {7, 2, 1, -233}, - }; - - for (int i = 0; i < 16; i++) - { - const int k = kdsp[i][0]; - const int d = kdsp[i][1]; - const int s = kdsp[i][2]; - const int p = kdsp[i][3]; - - int ret = 0 - || test_convolution_int8(9, 7, 1, 1, k, d, s, p, 1) - || test_convolution_int8(9, 7, 2, 2, k, d, s, p, 1) - || test_convolution_int8(9, 7, 3, 3, k, d, s, p, 1) - || test_convolution_int8(9, 7, 4, 4, k, d, s, p, 1) - || test_convolution_int8(9, 7, 7, 7, k, d, s, p, 1) - || test_convolution_int8(9, 7, 8, 8, k, d, s, p, 1) - || test_convolution_int8(9, 7, 15, 15, k, d, s, p, 1) - || test_convolution_int8(9, 7, 16, 15, k, d, s, p, 1) - || test_convolution_int8(9, 7, 15, 16, k, d, s, p, 1) - || test_convolution_int8(9, 7, 16, 16, k, d, s, p, 1); - - if (ret != 0) - return -1; - } - for (int i = 0; i < 16; i++) - { - const int k = kdsp[i][0]; - const int d = kdsp[i][1]; - const int s = kdsp[i][2]; - const int p = kdsp[i][3]; - - int ret = 0 - || test_convolution_int8(9, 7, 1, 1, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 1, 1, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 2, 2, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 3, 3, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 4, 4, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 7, 7, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 8, 8, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 15, 15, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 16, 15, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 15, 16, k, d, s, p, 1, true) - || test_convolution_int8(9, 7, 16, 16, k, d, s, p, 1, true); - - if (ret != 0) - return -1; - } - - return 0 - || test_convolution_int8(11, 11, 8, 16, 3, 1, 1, 1, 1) - || test_convolution_int8(13, 16, 16, 24, 3, 1, 1, 1, 1) - || test_convolution_int8(8, 8, 16, 24, 3, 1, 1, 1, 0) - || test_convolution_int8(4, 8, 16, 24, 3, 1, 1, 1, 1) - || test_convolution_int8(4, 20, 16, 24, 3, 1, 1, 1, 0) - || test_convolution_int8(6, 7, 64, 64, 3, 1, 2, 0, 1) - || test_convolution_int8(25, 33, 16, 15, 3, 1, 1, 1, 0) - || test_convolution_int8(7, 7, 15, 12, 3, 1, 1, 1, 0); -} -#endif // NCNN_INT8 - int main() { SRAND(7767517); -#if NCNN_INT8 - return 0 - || test_convolution_0() - || test_convolution_1() - || test_convolution_2() - || test_convolution_3(); -#else - return 0 - || test_convolution_0() - || test_convolution_2() - || test_convolution_3(); -#endif + return test_convolution_0(); } diff --git a/tests/test_convolution_1.cpp b/tests/test_convolution_1.cpp new file mode 100644 index 000000000000..b7641a4ea0f9 --- /dev/null +++ b/tests/test_convolution_1.cpp @@ -0,0 +1,136 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/convolution.h" +#include "testutil.h" + +static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + ncnn::Mat a = RandomMat(w, h, c); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch * c * kernel * kernel); + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 2 : 1); + weights[0] = RandomMat(outch * c * kernel * kernel); + if (bias) + weights[1] = RandomMat(outch); + + float epsilon = 0.001; + // larget epsilon for winograd optimization + if (kernel == 3 && dilation == 1 && stride == 1 && c >= 16 && outch >= 16) + { + Randomize(a, -1, 1); + if (c >= 64 || outch >= 64) + Randomize(weights[0], -0.3, 0.3); + else + Randomize(weights[0], -1, 1); + epsilon = 0.002; + } + + int ret = test_layer("Convolution", pd, weights, a, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_convolution_0() +{ + static const int kdsp[16][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, -233}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, 1}, + {4, 1, 1, 2}, + {4, 1, 2, -233}, + {4, 2, 1, -234}, + {5, 1, 1, -234}, + {5, 1, 2, 2}, + {5, 2, 2, 2}, + {7, 1, 1, 3}, + {7, 1, 2, 3}, + {7, 2, 1, -233}, + }; + + for (int i = 12; i < 16; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_convolution(9, 7, 1, 1, k, d, s, p, 1) + || test_convolution(9, 7, 4, 13, k, d, s, p, 0) + || test_convolution(9, 7, 13, 4, k, d, s, p, 1) + || test_convolution(9, 7, 12, 12, k, d, s, p, 0) + || test_convolution(9, 7, 8, 12, k, d, s, p, 1) + || test_convolution(9, 7, 8, 13, k, d, s, p, 0) + || test_convolution(9, 7, 13, 8, k, d, s, p, 1) + || test_convolution(9, 7, 12, 16, k, d, s, p, 0) + || test_convolution(9, 7, 15, 15, k, d, s, p, 0) + || test_convolution(9, 7, 16, 16, k, d, s, p, 0) + || test_convolution(18, 17, 1, 1, k, d, s, p, 1) + || test_convolution(18, 17, 4, 13, k, d, s, p, 0) + || test_convolution(18, 17, 13, 4, k, d, s, p, 1) + || test_convolution(18, 17, 12, 12, k, d, s, p, 0) + || test_convolution(18, 17, 8, 12, k, d, s, p, 1) + || test_convolution(18, 17, 8, 13, k, d, s, p, 0) + || test_convolution(18, 17, 13, 8, k, d, s, p, 1) + || test_convolution(18, 17, 12, 16, k, d, s, p, 0) + || test_convolution(18, 17, 15, 15, k, d, s, 
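// For reference, a small helper (illustration only) mapping a {kernel, dilation, stride, pad}
// row of the kdsp table above to an output size for the explicit pad >= 0 cases.  The
// -233 / -234 entries appear to select ncnn's automatic "same"-style padding modes and are
// not covered by this formula; that reading is an assumption, not something stated in the patch.
static int conv_output_size(int in, int kernel, int dilation, int stride, int pad)
{
    const int kernel_extent = dilation * (kernel - 1) + 1;
    return (in + 2 * pad - kernel_extent) / stride + 1;
}
// e.g. kdsp row {3, 1, 2, 1} on a 9-wide input: conv_output_size(9, 3, 1, 2, 1) == 5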
p, 0) + || test_convolution(18, 17, 16, 16, k, d, s, p, 0) + || test_convolution(25, 33, 1, 1, k, d, s, p, 1) + || test_convolution(25, 33, 4, 13, k, d, s, p, 0) + || test_convolution(25, 33, 13, 4, k, d, s, p, 1) + || test_convolution(25, 33, 12, 12, k, d, s, p, 0) + || test_convolution(25, 33, 8, 12, k, d, s, p, 1) + || test_convolution(25, 33, 8, 13, k, d, s, p, 0) + || test_convolution(25, 33, 13, 8, k, d, s, p, 1) + || test_convolution(25, 33, 12, 16, k, d, s, p, 0) + || test_convolution(25, 33, 15, 15, k, d, s, p, 0) + || test_convolution(25, 33, 16, 16, k, d, s, p, 0); + + if (ret != 0) + return -1; + } + + return 0; +} + +int main() +{ + SRAND(7767517); + + return test_convolution_0(); +} diff --git a/tests/test_convolution_2.cpp b/tests/test_convolution_2.cpp new file mode 100644 index 000000000000..2dbaf59b3ba1 --- /dev/null +++ b/tests/test_convolution_2.cpp @@ -0,0 +1,108 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/convolution.h" +#include "testutil.h" + +static int test_convolution(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + ncnn::Mat a = RandomMat(w, h, c); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch * c * kernel * kernel); + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 
2 : 1); + weights[0] = RandomMat(outch * c * kernel * kernel); + if (bias) + weights[1] = RandomMat(outch); + + float epsilon = 0.001; + // larget epsilon for winograd optimization + if (kernel == 3 && dilation == 1 && stride == 1 && c >= 16 && outch >= 16) + { + Randomize(a, -1, 1); + if (c >= 64 || outch >= 64) + Randomize(weights[0], -0.3, 0.3); + else + Randomize(weights[0], -1, 1); + epsilon = 0.002; + } + + int ret = test_layer("Convolution", pd, weights, a, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_convolution failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_convolution_0() +{ + return 0 + || test_convolution(7, 5, 1, 4, 3, 1, 1, 1, 1) + || test_convolution(14, 5, 1, 4, 3, 1, 2, 1, 1) + || test_convolution(11, 5, 2, 12, 2, 2, 2, 1, 1) + || test_convolution(15, 11, 4, 4, 3, 1, 1, 1, 1) + || test_convolution(15, 11, 8, 8, 3, 1, 1, 1, 1) + || test_convolution(11, 11, 8, 16, 3, 1, 1, 1, 1) + || test_convolution(13, 16, 16, 24, 3, 1, 1, 1, 1) + || test_convolution(20, 19, 24, 24, 3, 1, 1, 1, 1) + || test_convolution(8, 8, 16, 24, 3, 1, 1, 1, 0) + || test_convolution(4, 8, 16, 24, 3, 1, 1, 1, 1) + || test_convolution(4, 20, 16, 24, 3, 1, 1, 1, 0) + || test_convolution(6, 7, 64, 64, 3, 1, 2, 0, 1) + || test_convolution(15, 17, 24, 32, 1, 1, 1, 0, 0) + || test_convolution(15, 17, 24, 32, 1, 1, 2, 0, 1) + || test_convolution(15, 17, 24, 32, 3, 1, 2, 0, 1) + || test_convolution(15, 17, 32, 24, 1, 1, 1, 0, 0) + || test_convolution(15, 17, 32, 24, 1, 1, 2, 0, 1) + || test_convolution(15, 17, 32, 24, 3, 1, 2, 0, 1) + || test_convolution(15, 17, 32, 28, 1, 1, 1, 0, 0) + || test_convolution(15, 17, 32, 28, 1, 1, 2, 0, 1) + || test_convolution(15, 17, 32, 28, 3, 1, 2, 0, 1) + || test_convolution(15, 17, 26, 32, 1, 1, 1, 0, 0) + || test_convolution(15, 17, 26, 32, 1, 1, 2, 0, 1) + || test_convolution(15, 17, 26, 32, 3, 1, 2, 0, 1) + || test_convolution(15, 17, 32, 26, 1, 1, 1, 0, 0) + || test_convolution(15, 17, 32, 26, 1, 1, 2, 0, 1) + || test_convolution(15, 17, 32, 26, 3, 1, 2, 0, 1) + || test_convolution(30, 30, 32, 26, 3, 1, 1, 1, 0) + || test_convolution(12, 18, 8, 16, 3, 1, 1, 1, 1) + || test_convolution(42, 18, 32, 160, 3, 1, 1, 1, 1) + || test_convolution(12, 18, 32, 160, 3, 1, 1, 1, 1) + || test_convolution(12, 18, 4, 12, 3, 1, 1, 1, 1) + || test_convolution(42, 18, 28, 140, 3, 1, 1, 1, 1) + || test_convolution(12, 18, 28, 140, 3, 1, 1, 1, 1); +} + +int main() +{ + SRAND(7767517); + + return test_convolution_0(); +} diff --git a/tests/test_convolution_3.cpp b/tests/test_convolution_3.cpp new file mode 100644 index 000000000000..3d6f91d096a9 --- /dev/null +++ b/tests/test_convolution_3.cpp @@ -0,0 +1,288 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/convolution.h" +#include "testutil.h" + +static int test_convolution_vec(int w, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + ncnn::Mat a = RandomMat(w); + + ncnn::ParamDict pd; + pd.set(0, outch); // num_output + pd.set(1, kernel); // kernel_w + pd.set(2, dilation); // dilation_w + pd.set(3, stride); // stride_w + pd.set(4, pad); // pad_w + pd.set(5, bias); // bias_term + pd.set(6, outch * w * kernel * kernel); + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 2 : 1); + weights[0] = RandomMat(outch * w * kernel * kernel); + if (bias) + weights[1] = RandomMat(outch); + + int ret = test_layer("Convolution", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_convolution_vec failed w=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_convolution_2() +{ + return 0 + || test_convolution_vec(1, 1, 1, 1, 1, 0, 1) + || test_convolution_vec(11, 12, 1, 1, 1, 0, 0) + || test_convolution_vec(20, 15, 1, 1, 1, 0, 1) + || test_convolution_vec(12, 20, 1, 1, 1, 0, 0) + || test_convolution_vec(3, 24, 1, 1, 1, 0, 1) + || test_convolution_vec(24, 5, 1, 1, 1, 0, 0) + || test_convolution_vec(32, 24, 1, 1, 1, 0, 1) + || test_convolution_vec(12, 32, 1, 1, 1, 0, 0) + || test_convolution_vec(64, 20, 1, 1, 1, 0, 1) + || test_convolution_vec(64, 128, 1, 1, 1, 0, 0); +} + +static int test_convolution_dynamic(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + ncnn::Mat a = RandomMat(w, h, c); + + ncnn::ParamDict pd; + pd.set(0, 0); + pd.set(1, 0); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, 0); + pd.set(19, 1); // dynamic weight + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector as(bias ? 
3 : 2); + as[0] = a; + as[1] = RandomMat(kernel, kernel, c, outch); + if (bias) + as[2] = RandomMat(outch); + + std::vector weights(0); + + int ret = test_layer("Convolution", pd, weights, as); + if (ret != 0) + { + fprintf(stderr, "test_convolution_dynamic failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_convolution_3() +{ + static const int kdsp[7][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, -233}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, -234}, + }; + + for (int i = 0; i < 7; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_convolution_dynamic(11, 10, 1, 1, k, d, s, p, 1) + || test_convolution_dynamic(11, 10, 4, 13, k, d, s, p, 0) + || test_convolution_dynamic(11, 10, 13, 4, k, d, s, p, 1) + || test_convolution_dynamic(11, 10, 12, 12, k, d, s, p, 0) + || test_convolution_dynamic(11, 10, 8, 12, k, d, s, p, 1) + || test_convolution_dynamic(11, 10, 8, 13, k, d, s, p, 0) + || test_convolution_dynamic(11, 10, 13, 8, k, d, s, p, 1) + || test_convolution_dynamic(11, 10, 12, 16, k, d, s, p, 0) + || test_convolution_dynamic(11, 10, 15, 15, k, d, s, p, 0) + || test_convolution_dynamic(11, 10, 16, 16, k, d, s, p, 0); + + if (ret != 0) + return -1; + } + + return 0; +} + +#if NCNN_INT8 +static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, bool requant = false) +{ + ncnn::Mat a = RandomMat(w, h, c); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch * c * kernel * kernel); + pd.set(8, requant ? 101 : 1); // int8_scale_term + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 5 : 4); + weights[0] = RandomMat(outch * c * kernel * kernel); + + ncnn::Mat weight_scales = scales_mat(weights[0], outch, c * kernel * kernel, c * kernel * kernel); + ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep); + ncnn::Mat top_scales = requant ? scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat(); + if (bias) + { + weights[1] = RandomMat(outch); + weights[2] = weight_scales; + weights[3] = input_scales; + weights[4] = top_scales; + } + else + { + weights[1] = weight_scales; + weights[2] = input_scales; + weights[3] = top_scales; + } + + int flag = TEST_LAYER_DISABLE_GPU_TESTING; + int ret = test_layer("Convolution", pd, weights, a, requant ? 
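// The int8 tests above pass per-output-channel weight scales and a per-blob input scale as
// extra weight Mats.  The helper below is a hypothetical illustration of the common symmetric
// quantization convention (scale = 127 / absmax per row); it is not the scales_mat() from
// testutil.h, whose exact behaviour is not shown in this patch.
#include "mat.h" // ncnn/mat.h in an installed layout
#include <algorithm>
#include <cmath>

static ncnn::Mat absmax_scales(const ncnn::Mat& weights, int rows, int cols)
{
    ncnn::Mat scales(rows);
    const float* w = weights;
    for (int i = 0; i < rows; i++)
    {
        float absmax = 0.f;
        for (int j = 0; j < cols; j++)
            absmax = std::max(absmax, (float)std::fabs(w[i * cols + j]));

        // guard against an all-zero row
        scales[i] = absmax == 0.f ? 1.f : 127.f / absmax;
    }
    return scales;
}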
1.0f : 0.001f, 0, flag); + if (ret != 0) + { + fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_convolution_1() +{ + static const int kdsp[16][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, -233}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, 1}, + {4, 1, 1, 2}, + {4, 1, 2, -233}, + {4, 2, 1, -234}, + {5, 1, 1, -234}, + {5, 1, 2, 2}, + {5, 2, 2, 2}, + {7, 1, 1, 3}, + {7, 1, 2, 3}, + {7, 2, 1, -233}, + }; + + for (int i = 0; i < 16; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_convolution_int8(9, 7, 1, 1, k, d, s, p, 1) + || test_convolution_int8(9, 7, 2, 2, k, d, s, p, 1) + || test_convolution_int8(9, 7, 3, 3, k, d, s, p, 1) + || test_convolution_int8(9, 7, 4, 4, k, d, s, p, 1) + || test_convolution_int8(9, 7, 7, 7, k, d, s, p, 1) + || test_convolution_int8(9, 7, 8, 8, k, d, s, p, 1) + || test_convolution_int8(9, 7, 15, 15, k, d, s, p, 1) + || test_convolution_int8(9, 7, 16, 15, k, d, s, p, 1) + || test_convolution_int8(9, 7, 15, 16, k, d, s, p, 1) + || test_convolution_int8(9, 7, 16, 16, k, d, s, p, 1); + + if (ret != 0) + return -1; + } + for (int i = 0; i < 16; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_convolution_int8(9, 7, 1, 1, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 1, 1, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 2, 2, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 3, 3, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 4, 4, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 7, 7, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 8, 8, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 15, 15, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 16, 15, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 15, 16, k, d, s, p, 1, true) + || test_convolution_int8(9, 7, 16, 16, k, d, s, p, 1, true); + + if (ret != 0) + return -1; + } + + return 0 + || test_convolution_int8(11, 11, 8, 16, 3, 1, 1, 1, 1) + || test_convolution_int8(13, 16, 16, 24, 3, 1, 1, 1, 1) + || test_convolution_int8(8, 8, 16, 24, 3, 1, 1, 1, 0) + || test_convolution_int8(4, 8, 16, 24, 3, 1, 1, 1, 1) + || test_convolution_int8(4, 20, 16, 24, 3, 1, 1, 1, 0) + || test_convolution_int8(6, 7, 64, 64, 3, 1, 2, 0, 1) + || test_convolution_int8(25, 33, 16, 15, 3, 1, 1, 1, 0) + || test_convolution_int8(7, 7, 15, 12, 3, 1, 1, 1, 0); +} +#endif // NCNN_INT8 + +int main() +{ + SRAND(7767517); + +#if NCNN_INT8 + return 0 + || test_convolution_1() + || test_convolution_2() + || test_convolution_3(); +#else + return 0 + || test_convolution_2() + || test_convolution_3(); +#endif +} diff --git a/tests/test_convolutiondepthwise.cpp b/tests/test_convolutiondepthwise.cpp index 03317b68c1eb..715fc73662c4 100644 --- a/tests/test_convolutiondepthwise.cpp +++ b/tests/test_convolutiondepthwise.cpp @@ -125,222 +125,9 @@ static int test_convolutiondepthwise_0() return 0; } -static int test_convolutiondepthwise_dynamic(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group) -{ - ncnn::Mat a = RandomMat(w, h, c); - - ncnn::ParamDict pd; - pd.set(0, 0); - pd.set(1, 
0); - pd.set(2, dilation); - pd.set(3, stride); - pd.set(4, pad); - pd.set(5, bias); - pd.set(6, 0); - pd.set(7, group); - pd.set(19, 1); // dynamic weight - - int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 - ncnn::Mat activation_params(2); - activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha - activation_params[1] = RandomFloat(0, 1); // beta - pd.set(9, activation_type); - pd.set(10, activation_params); - - std::vector as(bias ? 3 : 2); - as[0] = a; - as[1] = RandomMat(kernel, kernel, c / group, outch); - if (bias) - as[2] = RandomMat(outch); - - std::vector weights(0); - - int ret = test_layer("ConvolutionDepthWise", pd, weights, as); - if (ret != 0) - { - fprintf(stderr, "test_convolutiondepthwise_dynamic failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, activation_type, activation_params[0], activation_params[1]); - } - - return ret; -} - -static int test_convolutiondepthwise_2() -{ - static const int kdsp[7][4] = { - {1, 1, 1, 0}, - {1, 1, 2, 0}, - {2, 1, 1, 1}, - {2, 1, 2, -233}, - {3, 1, 1, 1}, - {3, 1, 2, 1}, - {3, 2, 1, -234}, - }; - - for (int i = 0; i < 7; i++) - { - const int k = kdsp[i][0]; - const int d = kdsp[i][1]; - const int s = kdsp[i][2]; - const int p = kdsp[i][3]; - - int ret = 0 - || test_convolutiondepthwise_dynamic(11, 10, 1, 1, k, d, s, p, 1, 1) - || test_convolutiondepthwise_dynamic(11, 10, 2, 2, k, d, s, p, 0, 1) - || test_convolutiondepthwise_dynamic(11, 10, 2, 2, k, d, s, p, 1, 2) - || test_convolutiondepthwise_dynamic(11, 10, 3, 3, k, d, s, p, 0, 3) - || test_convolutiondepthwise_dynamic(11, 10, 4, 2, k, d, s, p, 1, 2) - || test_convolutiondepthwise_dynamic(11, 10, 4, 4, k, d, s, p, 0, 4) - || test_convolutiondepthwise_dynamic(11, 10, 7, 7, k, d, s, p, 1, 7) - || test_convolutiondepthwise_dynamic(11, 10, 8, 8, k, d, s, p, 0, 2) - || test_convolutiondepthwise_dynamic(11, 10, 8, 8, k, d, s, p, 1, 8) - || test_convolutiondepthwise_dynamic(11, 10, 12, 12, k, d, s, p, 0, 4) - || test_convolutiondepthwise_dynamic(11, 10, 15, 15, k, d, s, p, 1, 15) - || test_convolutiondepthwise_dynamic(11, 10, 16, 8, k, d, s, p, 0, 2) - || test_convolutiondepthwise_dynamic(11, 10, 16, 16, k, d, s, p, 1, 16); - - if (ret != 0) - return -1; - } - - return 0; -} - -#if NCNN_INT8 -static int test_convolutiondepthwise_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group, bool requant = false) -{ - ncnn::Mat a = RandomMat(w, h, c); - - ncnn::ParamDict pd; - pd.set(0, outch); - pd.set(1, kernel); - pd.set(2, dilation); - pd.set(3, stride); - pd.set(4, pad); - pd.set(5, bias); - pd.set(6, outch / group * c / group * kernel * kernel * group); - pd.set(7, group); - pd.set(8, requant ? 101 : 1); // int8_scale_term - - int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 - ncnn::Mat activation_params(2); - activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha - activation_params[1] = RandomFloat(0, 1); // beta - pd.set(9, activation_type); - pd.set(10, activation_params); - - std::vector weights(bias ? 5 : 4); - weights[0] = RandomMat(outch / group * c / group * kernel * kernel * group); - ncnn::Mat weight_scales = scales_mat(weights[0], group, c * kernel * kernel / group, c * kernel * kernel / group); - ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep); - ncnn::Mat top_scales = requant ? 
scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat(); - if (bias) - { - weights[1] = RandomMat(outch); - weights[2] = weight_scales; - weights[3] = input_scales; - weights[4] = top_scales; - } - else - { - weights[1] = weight_scales; - weights[2] = input_scales; - weights[3] = top_scales; - } - - int flag = TEST_LAYER_DISABLE_GPU_TESTING; - int ret = test_layer("ConvolutionDepthWise", pd, weights, a, requant ? 1.0f : 0.001f, 0, flag); - if (ret != 0) - { - fprintf(stderr, "test_convolutiondepthwise_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, requant, activation_type, activation_params[0], activation_params[1]); - } - - return ret; -} - -static int test_convolutiondepthwise_1() -{ - static const int kdsp[16][4] = { - {1, 1, 1, 0}, - {1, 1, 2, 0}, - {2, 1, 1, 1}, - {2, 1, 2, -233}, - {3, 1, 1, 1}, - {3, 1, 2, 1}, - {3, 2, 1, 1}, - {4, 1, 1, 2}, - {4, 1, 2, -233}, - {4, 2, 1, -234}, - {5, 1, 1, -234}, - {5, 1, 2, 2}, - {5, 2, 2, 2}, - {7, 1, 1, 3}, - {7, 1, 2, 3}, - {7, 2, 1, -233}, - }; - - for (int i = 0; i < 16; i++) - { - const int k = kdsp[i][0]; - const int d = kdsp[i][1]; - const int s = kdsp[i][2]; - const int p = kdsp[i][3]; - - int ret = 0 - || test_convolutiondepthwise_int8(15, 7, 1, 1, k, d, s, p, 1, 1) - || test_convolutiondepthwise_int8(15, 7, 2, 2, k, d, s, p, 0, 1) - || test_convolutiondepthwise_int8(15, 7, 2, 2, k, d, s, p, 1, 2) - || test_convolutiondepthwise_int8(15, 7, 3, 3, k, d, s, p, 0, 3) - || test_convolutiondepthwise_int8(15, 7, 4, 2, k, d, s, p, 1, 2) - || test_convolutiondepthwise_int8(15, 7, 4, 4, k, d, s, p, 0, 4) - || test_convolutiondepthwise_int8(15, 7, 7, 7, k, d, s, p, 1, 7) - || test_convolutiondepthwise_int8(15, 7, 8, 8, k, d, s, p, 0, 2) - || test_convolutiondepthwise_int8(15, 7, 8, 8, k, d, s, p, 1, 8) - || test_convolutiondepthwise_int8(15, 7, 12, 12, k, d, s, p, 0, 4) - || test_convolutiondepthwise_int8(15, 7, 15, 15, k, d, s, p, 1, 15) - || test_convolutiondepthwise_int8(15, 7, 16, 8, k, d, s, p, 0, 2) - || test_convolutiondepthwise_int8(15, 7, 16, 16, k, d, s, p, 1, 16); - - if (ret != 0) - return -1; - } - - for (int i = 0; i < 16; i++) - { - const int k = kdsp[i][0]; - const int d = kdsp[i][1]; - const int s = kdsp[i][2]; - const int p = kdsp[i][3]; - - int ret = 0 - || test_convolutiondepthwise_int8(9, 7, 1, 1, k, d, s, p, 1, 1, true) - || test_convolutiondepthwise_int8(9, 7, 2, 2, k, d, s, p, 0, 1, true) - || test_convolutiondepthwise_int8(9, 7, 2, 2, k, d, s, p, 1, 2, true) - || test_convolutiondepthwise_int8(9, 7, 3, 3, k, d, s, p, 0, 3, true) - || test_convolutiondepthwise_int8(9, 7, 4, 2, k, d, s, p, 1, 2, true) - || test_convolutiondepthwise_int8(9, 7, 4, 4, k, d, s, p, 0, 4, true) - || test_convolutiondepthwise_int8(9, 7, 7, 7, k, d, s, p, 1, 7, true) - || test_convolutiondepthwise_int8(9, 7, 8, 8, k, d, s, p, 0, 2, true) - || test_convolutiondepthwise_int8(9, 7, 8, 8, k, d, s, p, 1, 8, true) - || test_convolutiondepthwise_int8(9, 7, 12, 12, k, d, s, p, 0, 4, true) - || test_convolutiondepthwise_int8(9, 7, 15, 15, k, d, s, p, 1, 15, true) - || test_convolutiondepthwise_int8(9, 7, 16, 8, k, d, s, p, 0, 2, true) - || test_convolutiondepthwise_int8(9, 7, 16, 16, k, d, s, p, 1, 16, true); - - if (ret != 0) - return -1; - } - - return 0; -} -#endif // NCNN_INT8 - int main() { SRAND(7767517); -#if NCNN_INT8 - return test_convolutiondepthwise_0() || test_convolutiondepthwise_1() || 
test_convolutiondepthwise_2(); -#else - return test_convolutiondepthwise_0() || test_convolutiondepthwise_2(); -#endif + return test_convolutiondepthwise_0(); } diff --git a/tests/test_convolutiondepthwise_1.cpp b/tests/test_convolutiondepthwise_1.cpp new file mode 100644 index 000000000000..3d10a7a8e855 --- /dev/null +++ b/tests/test_convolutiondepthwise_1.cpp @@ -0,0 +1,236 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/convolutiondepthwise.h" +#include "testutil.h" + +static int test_convolutiondepthwise_dynamic(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group) +{ + ncnn::Mat a = RandomMat(w, h, c); + + ncnn::ParamDict pd; + pd.set(0, 0); + pd.set(1, 0); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, 0); + pd.set(7, group); + pd.set(19, 1); // dynamic weight + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector as(bias ? 
3 : 2); + as[0] = a; + as[1] = RandomMat(kernel, kernel, c / group, outch); + if (bias) + as[2] = RandomMat(outch); + + std::vector weights(0); + + int ret = test_layer("ConvolutionDepthWise", pd, weights, as); + if (ret != 0) + { + fprintf(stderr, "test_convolutiondepthwise_dynamic failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_convolutiondepthwise_2() +{ + static const int kdsp[7][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, -233}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, -234}, + }; + + for (int i = 0; i < 7; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_convolutiondepthwise_dynamic(11, 10, 1, 1, k, d, s, p, 1, 1) + || test_convolutiondepthwise_dynamic(11, 10, 2, 2, k, d, s, p, 0, 1) + || test_convolutiondepthwise_dynamic(11, 10, 2, 2, k, d, s, p, 1, 2) + || test_convolutiondepthwise_dynamic(11, 10, 3, 3, k, d, s, p, 0, 3) + || test_convolutiondepthwise_dynamic(11, 10, 4, 2, k, d, s, p, 1, 2) + || test_convolutiondepthwise_dynamic(11, 10, 4, 4, k, d, s, p, 0, 4) + || test_convolutiondepthwise_dynamic(11, 10, 7, 7, k, d, s, p, 1, 7) + || test_convolutiondepthwise_dynamic(11, 10, 8, 8, k, d, s, p, 0, 2) + || test_convolutiondepthwise_dynamic(11, 10, 8, 8, k, d, s, p, 1, 8) + || test_convolutiondepthwise_dynamic(11, 10, 12, 12, k, d, s, p, 0, 4) + || test_convolutiondepthwise_dynamic(11, 10, 15, 15, k, d, s, p, 1, 15) + || test_convolutiondepthwise_dynamic(11, 10, 16, 8, k, d, s, p, 0, 2) + || test_convolutiondepthwise_dynamic(11, 10, 16, 16, k, d, s, p, 1, 16); + + if (ret != 0) + return -1; + } + + return 0; +} + +#if NCNN_INT8 +static int test_convolutiondepthwise_int8(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias, int group, bool requant = false) +{ + ncnn::Mat a = RandomMat(w, h, c); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch / group * c / group * kernel * kernel * group); + pd.set(7, group); + pd.set(8, requant ? 101 : 1); // int8_scale_term + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 5 : 4); + weights[0] = RandomMat(outch / group * c / group * kernel * kernel * group); + ncnn::Mat weight_scales = scales_mat(weights[0], group, c * kernel * kernel / group, c * kernel * kernel / group); + ncnn::Mat input_scales = scales_mat(a, 1, w * h * c, a.cstep); + ncnn::Mat top_scales = requant ? scales_mat(a, 1, w * h * c, a.cstep) : ncnn::Mat(); + if (bias) + { + weights[1] = RandomMat(outch); + weights[2] = weight_scales; + weights[3] = input_scales; + weights[4] = top_scales; + } + else + { + weights[1] = weight_scales; + weights[2] = input_scales; + weights[3] = top_scales; + } + + int flag = TEST_LAYER_DISABLE_GPU_TESTING; + int ret = test_layer("ConvolutionDepthWise", pd, weights, a, requant ? 
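// Illustration of the weight_data_size value (ParamDict key 6) used by the grouped cases
// above: each group maps c/group input channels to outch/group output channels, so the total
// weight count is (outch/group) * (c/group) * kernel * kernel * group, and group == c == outch
// degenerates to the depthwise case of one kernel*kernel filter per channel.
static int dw_weight_data_size(int c, int outch, int kernel, int group)
{
    return outch / group * c / group * kernel * kernel * group;
}
// e.g. depthwise 16-channel 3x3: dw_weight_data_size(16, 16, 3, 16) == 144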
1.0f : 0.001f, 0, flag); + if (ret != 0) + { + fprintf(stderr, "test_convolutiondepthwise_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d group=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, group, requant, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_convolutiondepthwise_1() +{ + static const int kdsp[16][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, -233}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, 1}, + {4, 1, 1, 2}, + {4, 1, 2, -233}, + {4, 2, 1, -234}, + {5, 1, 1, -234}, + {5, 1, 2, 2}, + {5, 2, 2, 2}, + {7, 1, 1, 3}, + {7, 1, 2, 3}, + {7, 2, 1, -233}, + }; + + for (int i = 0; i < 16; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_convolutiondepthwise_int8(15, 7, 1, 1, k, d, s, p, 1, 1) + || test_convolutiondepthwise_int8(15, 7, 2, 2, k, d, s, p, 0, 1) + || test_convolutiondepthwise_int8(15, 7, 2, 2, k, d, s, p, 1, 2) + || test_convolutiondepthwise_int8(15, 7, 3, 3, k, d, s, p, 0, 3) + || test_convolutiondepthwise_int8(15, 7, 4, 2, k, d, s, p, 1, 2) + || test_convolutiondepthwise_int8(15, 7, 4, 4, k, d, s, p, 0, 4) + || test_convolutiondepthwise_int8(15, 7, 7, 7, k, d, s, p, 1, 7) + || test_convolutiondepthwise_int8(15, 7, 8, 8, k, d, s, p, 0, 2) + || test_convolutiondepthwise_int8(15, 7, 8, 8, k, d, s, p, 1, 8) + || test_convolutiondepthwise_int8(15, 7, 12, 12, k, d, s, p, 0, 4) + || test_convolutiondepthwise_int8(15, 7, 15, 15, k, d, s, p, 1, 15) + || test_convolutiondepthwise_int8(15, 7, 16, 8, k, d, s, p, 0, 2) + || test_convolutiondepthwise_int8(15, 7, 16, 16, k, d, s, p, 1, 16); + + if (ret != 0) + return -1; + } + + for (int i = 0; i < 16; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_convolutiondepthwise_int8(9, 7, 1, 1, k, d, s, p, 1, 1, true) + || test_convolutiondepthwise_int8(9, 7, 2, 2, k, d, s, p, 0, 1, true) + || test_convolutiondepthwise_int8(9, 7, 2, 2, k, d, s, p, 1, 2, true) + || test_convolutiondepthwise_int8(9, 7, 3, 3, k, d, s, p, 0, 3, true) + || test_convolutiondepthwise_int8(9, 7, 4, 2, k, d, s, p, 1, 2, true) + || test_convolutiondepthwise_int8(9, 7, 4, 4, k, d, s, p, 0, 4, true) + || test_convolutiondepthwise_int8(9, 7, 7, 7, k, d, s, p, 1, 7, true) + || test_convolutiondepthwise_int8(9, 7, 8, 8, k, d, s, p, 0, 2, true) + || test_convolutiondepthwise_int8(9, 7, 8, 8, k, d, s, p, 1, 8, true) + || test_convolutiondepthwise_int8(9, 7, 12, 12, k, d, s, p, 0, 4, true) + || test_convolutiondepthwise_int8(9, 7, 15, 15, k, d, s, p, 1, 15, true) + || test_convolutiondepthwise_int8(9, 7, 16, 8, k, d, s, p, 0, 2, true) + || test_convolutiondepthwise_int8(9, 7, 16, 16, k, d, s, p, 1, 16, true); + + if (ret != 0) + return -1; + } + + return 0; +} +#endif // NCNN_INT8 + +int main() +{ + SRAND(7767517); + +#if NCNN_INT8 + return test_convolutiondepthwise_1() || test_convolutiondepthwise_2(); +#else + return test_convolutiondepthwise_2(); +#endif +} diff --git a/tests/test_crop.cpp b/tests/test_crop.cpp index caa2876c4996..b2a29778fec4 100644 --- a/tests/test_crop.cpp +++ b/tests/test_crop.cpp @@ -42,112 +42,6 @@ static int test_crop(const ncnn::Mat& a, int woffset, int hoffset, int doffset, return ret; } -static ncnn::Mat IntArrayMat(int a0) -{ - ncnn::Mat m(1); - int* p = m; - p[0] = a0; - return m; -} - -static 
ncnn::Mat IntArrayMat(int a0, int a1) -{ - ncnn::Mat m(2); - int* p = m; - p[0] = a0; - p[1] = a1; - return m; -} - -static ncnn::Mat IntArrayMat(int a0, int a1, int a2) -{ - ncnn::Mat m(3); - int* p = m; - p[0] = a0; - p[1] = a1; - p[2] = a2; - return m; -} - -static ncnn::Mat IntArrayMat(int a0, int a1, int a2, int a3) -{ - ncnn::Mat m(4); - int* p = m; - p[0] = a0; - p[1] = a1; - p[2] = a2; - p[3] = a3; - return m; -} - -static void print_int_array(const ncnn::Mat& a) -{ - const int* pa = a; - - fprintf(stderr, "["); - for (int i = 0; i < a.w; i++) - { - fprintf(stderr, " %d", pa[i]); - } - fprintf(stderr, " ]"); -} - -static int test_crop(const ncnn::Mat& a, const ncnn::Mat& starts, const ncnn::Mat& ends, const ncnn::Mat& axes) -{ - ncnn::ParamDict pd; - pd.set(9, starts); // starts - pd.set(10, ends); // ends - pd.set(11, axes); // axes - - std::vector weights(0); - - int ret = test_layer("Crop", pd, weights, a); - if (ret != 0) - { - fprintf(stderr, "test_crop failed a.dims=%d a=(%d %d %d %d)", a.dims, a.w, a.h, a.d, a.c); - fprintf(stderr, " starts="); - print_int_array(starts); - fprintf(stderr, " ends="); - print_int_array(ends); - fprintf(stderr, " axes="); - print_int_array(axes); - fprintf(stderr, "\n"); - } - - return ret; -} - -static int test_crop(const ncnn::Mat& a, int woffset, int hoffset, int doffset, int coffset, const ncnn::Mat& ref) -{ - ncnn::ParamDict pd; - pd.set(0, woffset); - pd.set(1, hoffset); - pd.set(13, doffset); - pd.set(2, coffset); - pd.set(3, 0); // outw - pd.set(4, 0); // outh - pd.set(14, 0); // outd - pd.set(5, 0); // outc - pd.set(6, 0); // woffset2 - pd.set(7, 0); // hoffset2 - pd.set(15, 0); // doffset2 - pd.set(8, 0); // coffset2 - - std::vector weights(0); - - std::vector ab(2); - ab[0] = a; - ab[1] = ref; - - int ret = test_layer("Crop", pd, weights, ab); - if (ret != 0) - { - fprintf(stderr, "test_crop failed a.dims=%d a=(%d %d %d %d) woffset=%d hoffset=%d doffset=%d coffset=%d ref.dims=%d ref=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c, woffset, hoffset, doffset, coffset, ref.dims, ref.w, ref.h, ref.d, ref.c); - } - - return ret; -} - static int test_crop_0(const ncnn::Mat& a) { return 0 @@ -161,30 +55,6 @@ static int test_crop_0(const ncnn::Mat& a) || test_crop(a, 16, 0, 0, 0, -233, 0, 0, 0, 7, 0, 0, 0); } -static int test_crop_1(const ncnn::Mat& a) -{ - return 0 - || test_crop(a, IntArrayMat(12), IntArrayMat(-233), IntArrayMat(0)) - || test_crop(a, IntArrayMat(16), IntArrayMat(-233), IntArrayMat(0)) - || test_crop(a, IntArrayMat(11), IntArrayMat(11 + 16), IntArrayMat(0)) - || test_crop(a, IntArrayMat(12), IntArrayMat(12 + 7), IntArrayMat(-1)) - || test_crop(a, IntArrayMat(16), IntArrayMat(16 + 12), ncnn::Mat()) - || test_crop(a, IntArrayMat(11), IntArrayMat(-7 + 1), IntArrayMat(0)) - || test_crop(a, IntArrayMat(12), IntArrayMat(-12 + 1), IntArrayMat(-1)) - || test_crop(a, IntArrayMat(16), IntArrayMat(-16 + 1), ncnn::Mat()); -} - -static int test_crop_2(const ncnn::Mat& a) -{ - return 0 - || test_crop(a, 0, 0, 0, 0, a) - || test_crop(a, 0, 0, 0, 0, ncnn::Mat(27)) - - || test_crop(a, 11, 0, 0, 0, ncnn::Mat(7)) - || test_crop(a, 12, 0, 0, 0, ncnn::Mat(12)) - || test_crop(a, 16, 0, 0, 0, ncnn::Mat(16)); -} - static int test_crop_3(const ncnn::Mat& a) { return 0 @@ -220,52 +90,6 @@ static int test_crop_3(const ncnn::Mat& a) || test_crop(a, 4, 8, 0, 0, -233, -233, 0, 0, 6, 12, 0, 0); } -static int test_crop_4(const ncnn::Mat& a) -{ - return 0 - || test_crop(a, IntArrayMat(12), IntArrayMat(-233), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), 
IntArrayMat(-233), IntArrayMat(0)) - || test_crop(a, IntArrayMat(4), IntArrayMat(-233), IntArrayMat(1)) - || test_crop(a, IntArrayMat(5, 11), IntArrayMat(-233, -233), IntArrayMat(0, 1)) - - || test_crop(a, IntArrayMat(11), IntArrayMat(11 + 16), IntArrayMat(0)) - || test_crop(a, IntArrayMat(12), IntArrayMat(12 + 7), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), IntArrayMat(8 + 12), IntArrayMat(-2)) - - || test_crop(a, IntArrayMat(5), IntArrayMat(8), IntArrayMat(1)) - || test_crop(a, IntArrayMat(6), IntArrayMat(9), IntArrayMat(1)) - || test_crop(a, IntArrayMat(4), IntArrayMat(12), IntArrayMat(-1)) - - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(11 + 7, 11), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(12 + 12, 12), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(8 + 16, 10), IntArrayMat(0, -1)) - - || test_crop(a, IntArrayMat(11), IntArrayMat(-16 + 1), IntArrayMat(0)) - || test_crop(a, IntArrayMat(12), IntArrayMat(-7 + 1), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), IntArrayMat(-12 + 1), IntArrayMat(-2)) - - || test_crop(a, IntArrayMat(5), IntArrayMat(-5 + 1), IntArrayMat(1)) - || test_crop(a, IntArrayMat(6), IntArrayMat(-6 + 1), IntArrayMat(1)) - || test_crop(a, IntArrayMat(4), IntArrayMat(-4 + 1), IntArrayMat(-1)) - - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-12 + 1, -6 + 1), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-16 + 1, -5 + 1), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-7 + 1, -4 + 1), IntArrayMat(-2, -1)); -} - -static int test_crop_5(const ncnn::Mat& a) -{ - return 0 - || test_crop(a, 0, 0, 0, 0, a) - - || test_crop(a, 0, 12, 0, 0, ncnn::Mat(8, 7)) - || test_crop(a, 5, 0, 0, 0, ncnn::Mat(7, 27)) - - || test_crop(a, 5, 11, 0, 0, ncnn::Mat(5, 12)) - || test_crop(a, 6, 12, 0, 0, ncnn::Mat(4, 16)) - || test_crop(a, 4, 8, 0, 0, ncnn::Mat(6, 7)); -} - static int test_crop_6(const ncnn::Mat& a) { return 0 @@ -338,94 +162,6 @@ static int test_crop_6(const ncnn::Mat& a) || test_crop(a, 4, 4, 0, 8, -233, -233, 0, -233, 6, 2, 0, 12); } -static int test_crop_7(const ncnn::Mat& a) -{ - return 0 - || test_crop(a, IntArrayMat(11), IntArrayMat(-233), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), IntArrayMat(-233), IntArrayMat(0)) - || test_crop(a, IntArrayMat(5), IntArrayMat(-233), IntArrayMat(1)) - || test_crop(a, IntArrayMat(6), IntArrayMat(-233), IntArrayMat(2)) - || test_crop(a, IntArrayMat(4), IntArrayMat(-233), IntArrayMat(-1)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-233, -233), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-233, -233), IntArrayMat(0, -1)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-233, -233), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(6, 6), IntArrayMat(-233, -233), IntArrayMat(1, -1)) - || test_crop(a, IntArrayMat(11, 5, 5), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(8, 4, 4), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, -1)) - - || test_crop(a, IntArrayMat(11), IntArrayMat(11 + 7), IntArrayMat(0)) - || test_crop(a, IntArrayMat(12), IntArrayMat(12 + 12), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), IntArrayMat(8 + 16), IntArrayMat(0)) - - || test_crop(a, IntArrayMat(5), IntArrayMat(13), IntArrayMat(1)) - || test_crop(a, IntArrayMat(6), IntArrayMat(12), IntArrayMat(1)) - || test_crop(a, IntArrayMat(4), IntArrayMat(11), IntArrayMat(-2)) - - || test_crop(a, IntArrayMat(5), IntArrayMat(12), IntArrayMat(2)) - || test_crop(a, IntArrayMat(6), IntArrayMat(11), 
IntArrayMat(2)) - || test_crop(a, IntArrayMat(4), IntArrayMat(13), IntArrayMat(-1)) - - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(11 + 7, 11), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(12 + 16, 12), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(8 + 12, 13), IntArrayMat(0, -2)) - - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(11 + 16, 13), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(12 + 12, 11), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(8 + 7, 12), IntArrayMat(0, -1)) - - || test_crop(a, IntArrayMat(5, 4), IntArrayMat(12, 12), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(6, 3), IntArrayMat(13, 13), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(4, 2), IntArrayMat(11, 11), IntArrayMat(-2, -1)) - - || test_crop(a, IntArrayMat(11, 5, 2), IntArrayMat(11 + 7, 11, 11), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(12, 6, 4), IntArrayMat(12 + 16, 12, 12), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(8, 4, 3), IntArrayMat(8 + 12, 13, 13), IntArrayMat(-3, -2, -1)) - - || test_crop(a, IntArrayMat(11), IntArrayMat(-7 + 1), IntArrayMat(0)) - || test_crop(a, IntArrayMat(12), IntArrayMat(-12 + 1), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), IntArrayMat(-16 + 1), IntArrayMat(-3)) - - || test_crop(a, IntArrayMat(5), IntArrayMat(-6 + 1), IntArrayMat(1)) - || test_crop(a, IntArrayMat(6), IntArrayMat(-5 + 1), IntArrayMat(1)) - || test_crop(a, IntArrayMat(4), IntArrayMat(-4 + 1), IntArrayMat(-2)) - - || test_crop(a, IntArrayMat(5), IntArrayMat(-5 + 1), IntArrayMat(2)) - || test_crop(a, IntArrayMat(6), IntArrayMat(-4 + 1), IntArrayMat(2)) - || test_crop(a, IntArrayMat(4), IntArrayMat(-6 + 1), IntArrayMat(-1)) - - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-7 + 1, -4 + 1), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-12 + 1, -6 + 1), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-16 + 1, -5 + 1), IntArrayMat(-3, -2)) - - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-12 + 1, -6 + 1), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-16 + 1, -5 + 1), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-7 + 1, -4 + 1), IntArrayMat(-3, -1)) - - || test_crop(a, IntArrayMat(5, 2), IntArrayMat(-5 + 1, -5 + 1), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(6, 4), IntArrayMat(-4 + 1, -4 + 1), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(4, 3), IntArrayMat(-6 + 1, -6 + 1), IntArrayMat(-2, -1)) - - || test_crop(a, IntArrayMat(11, 5, 4), IntArrayMat(-7 + 1, -5 + 1, -5 + 1), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(12, 6, 3), IntArrayMat(-12 + 1, -6 + 1, -6 + 1), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(8, 4, 2), IntArrayMat(-16 + 1, -4 + 1, -4 + 1), IntArrayMat(-3, -2, -1)); -} - -static int test_crop_8(const ncnn::Mat& a) -{ - return 0 - || test_crop(a, 0, 0, 0, 0, a) - - || test_crop(a, 0, 5, 0, 0, ncnn::Mat(6, 6)) - || test_crop(a, 6, 0, 0, 0, ncnn::Mat(8, 8)) - || test_crop(a, 5, 2, 0, 0, ncnn::Mat(6, 3)) - || test_crop(a, 6, 3, 0, 0, ncnn::Mat(8, 4)) - || test_crop(a, 4, 4, 0, 0, ncnn::Mat(7, 5)) - - || test_crop(a, 5, 3, 0, 11, ncnn::Mat(7, 3, 7)) - || test_crop(a, 6, 4, 0, 12, ncnn::Mat(6, 4, 12)) - || test_crop(a, 4, 2, 0, 8, ncnn::Mat(5, 5, 16)); -} - static int test_crop_9(const ncnn::Mat& a) { return 0 @@ -524,171 +260,6 @@ static int test_crop_9(const ncnn::Mat& a) || test_crop(a, 3, 3, 3, 8, -233, -233, -233, -233, 3, 3, 3, 12); } -static int test_crop_10(const 
ncnn::Mat& a) -{ - return 0 - || test_crop(a, IntArrayMat(11), IntArrayMat(-233), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), IntArrayMat(-233), IntArrayMat(0)) - || test_crop(a, IntArrayMat(6), IntArrayMat(-233), IntArrayMat(1)) - || test_crop(a, IntArrayMat(5), IntArrayMat(-233), IntArrayMat(2)) - || test_crop(a, IntArrayMat(4), IntArrayMat(-233), IntArrayMat(-2)) - || test_crop(a, IntArrayMat(6), IntArrayMat(-233), IntArrayMat(3)) - || test_crop(a, IntArrayMat(5), IntArrayMat(-233), IntArrayMat(-1)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-233, -233), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-233, -233), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-233, -233), IntArrayMat(-4, -2)) - || test_crop(a, IntArrayMat(4, 4), IntArrayMat(-233, -233), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-233, -233), IntArrayMat(0, 3)) - || test_crop(a, IntArrayMat(5, 5), IntArrayMat(-233, -233), IntArrayMat(1, 3)) - || test_crop(a, IntArrayMat(4, 4), IntArrayMat(-233, -233), IntArrayMat(2, 3)) - || test_crop(a, IntArrayMat(12, 6, 6), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(11, 5, 5), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(8, 4, 4), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, 3)) - || test_crop(a, IntArrayMat(12, 6, 6), IntArrayMat(-233, -233, -233), IntArrayMat(0, 2, 3)) - || test_crop(a, IntArrayMat(11, 5, 5), IntArrayMat(-233, -233, -233), IntArrayMat(0, 2, 3)) - || test_crop(a, IntArrayMat(4, 4, 4), IntArrayMat(-233, -233, -233), IntArrayMat(1, 2, 3)) - || test_crop(a, IntArrayMat(6, 6, 6), IntArrayMat(-233, -233, -233), IntArrayMat(1, 2, 3)) - || test_crop(a, IntArrayMat(11, 5, 5, 5), IntArrayMat(-233, -233, -233, -233), IntArrayMat(0, 1, 2, 3)) - || test_crop(a, IntArrayMat(8, 4, 4, 4), IntArrayMat(-233, -233, -233, -233), IntArrayMat(0, 1, 2, 3)) - || test_crop(a, IntArrayMat(12, 6, 6, 6), IntArrayMat(-233, -233, -233, -233), IntArrayMat(-4, -3, -2, -1)) - - || test_crop(a, IntArrayMat(11), IntArrayMat(11 + 16), IntArrayMat(0)) - || test_crop(a, IntArrayMat(12), IntArrayMat(12 + 7), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), IntArrayMat(8 + 12), IntArrayMat(-4)) - - || test_crop(a, IntArrayMat(5), IntArrayMat(11), IntArrayMat(1)) - || test_crop(a, IntArrayMat(6), IntArrayMat(13), IntArrayMat(1)) - || test_crop(a, IntArrayMat(4), IntArrayMat(12), IntArrayMat(-3)) - - || test_crop(a, IntArrayMat(3), IntArrayMat(12), IntArrayMat(2)) - || test_crop(a, IntArrayMat(4), IntArrayMat(13), IntArrayMat(2)) - || test_crop(a, IntArrayMat(5), IntArrayMat(11), IntArrayMat(-2)) - - || test_crop(a, IntArrayMat(1), IntArrayMat(8), IntArrayMat(3)) - || test_crop(a, IntArrayMat(2), IntArrayMat(7), IntArrayMat(3)) - || test_crop(a, IntArrayMat(3), IntArrayMat(6), IntArrayMat(-1)) - - || test_crop(a, IntArrayMat(11, 5), IntArrayMat(11 + 7, 11), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(12, 6), IntArrayMat(12 + 12, 12), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(8, 4), IntArrayMat(8 + 16, 13), IntArrayMat(-4, -3)) - - || test_crop(a, IntArrayMat(11, 4), IntArrayMat(11 + 12, 13), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(12, 3), IntArrayMat(12 + 16, 11), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(8, 2), IntArrayMat(8 + 7, 12), IntArrayMat(-4, -2)) - - || test_crop(a, IntArrayMat(11, 1), IntArrayMat(11 + 16, 5), IntArrayMat(0, 3)) - || test_crop(a, IntArrayMat(12, 2), IntArrayMat(12 + 7, 6), IntArrayMat(0, 3)) 
- || test_crop(a, IntArrayMat(8, 3), IntArrayMat(8 + 12, 7), IntArrayMat(-4, -1)) - - || test_crop(a, IntArrayMat(3, 3), IntArrayMat(13, 4), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(4, 2), IntArrayMat(12, 3), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(5, 1), IntArrayMat(11, 2), IntArrayMat(-3, -2)) - - || test_crop(a, IntArrayMat(5, 5), IntArrayMat(11, 8), IntArrayMat(1, 3)) - || test_crop(a, IntArrayMat(4, 6), IntArrayMat(12, 9), IntArrayMat(1, 3)) - || test_crop(a, IntArrayMat(3, 4), IntArrayMat(13, 7), IntArrayMat(-3, -1)) - - || test_crop(a, IntArrayMat(2, 3), IntArrayMat(12, 9), IntArrayMat(2, 3)) - || test_crop(a, IntArrayMat(3, 2), IntArrayMat(11, 7), IntArrayMat(2, 3)) - || test_crop(a, IntArrayMat(4, 1), IntArrayMat(10, 8), IntArrayMat(-2, -1)) - - || test_crop(a, IntArrayMat(11, 2, 2), IntArrayMat(11 + 6, 9, 9), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(12, 3, 3), IntArrayMat(12 + 1, 10, 10), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(8, 4, 4), IntArrayMat(8 + 3, 11, 11), IntArrayMat(-4, -3, -2)) - - || test_crop(a, IntArrayMat(11, 4, 4), IntArrayMat(11 + 12, 12, 12), IntArrayMat(0, 1, 3)) - || test_crop(a, IntArrayMat(12, 5, 5), IntArrayMat(12 + 8, 11, 11), IntArrayMat(0, 1, 3)) - || test_crop(a, IntArrayMat(8, 6, 6), IntArrayMat(8 + 4, 13, 13), IntArrayMat(-4, -3, -1)) - - || test_crop(a, IntArrayMat(11, 1, 4), IntArrayMat(11 + 5, 12, 12), IntArrayMat(0, 2, 3)) - || test_crop(a, IntArrayMat(12, 3, 3), IntArrayMat(12 + 3, 11, 11), IntArrayMat(0, 2, 3)) - || test_crop(a, IntArrayMat(8, 2, 5), IntArrayMat(8 + 2, 10, 10), IntArrayMat(-4, -2, -1)) - - || test_crop(a, IntArrayMat(1, 1, 1), IntArrayMat(7, 7, 7), IntArrayMat(1, 2, 3)) - || test_crop(a, IntArrayMat(2, 2, 2), IntArrayMat(8, 9, 10), IntArrayMat(1, 2, 3)) - || test_crop(a, IntArrayMat(3, 3, 3), IntArrayMat(11, 12, 13), IntArrayMat(-3, -2, -1)) - - || test_crop(a, IntArrayMat(11, 2, 3, 6), IntArrayMat(11 + 11, 10, 12, 11), IntArrayMat(0, 1, 2, 3)) - || test_crop(a, IntArrayMat(12, 3, 4, 5), IntArrayMat(12 + 12, 9, 11, 13), IntArrayMat(0, 1, 2, 3)) - || test_crop(a, IntArrayMat(8, 4, 5, 4), IntArrayMat(8 + 8, 8, 10, 12), IntArrayMat(-4, -3, -2, -1)) - - || test_crop(a, IntArrayMat(11), IntArrayMat(-7 + 1), IntArrayMat(0)) - || test_crop(a, IntArrayMat(12), IntArrayMat(-12 + 1), IntArrayMat(0)) - || test_crop(a, IntArrayMat(8), IntArrayMat(-16 + 1), IntArrayMat(-4)) - - || test_crop(a, IntArrayMat(5), IntArrayMat(-6 + 1), IntArrayMat(1)) - || test_crop(a, IntArrayMat(6), IntArrayMat(-5 + 1), IntArrayMat(1)) - || test_crop(a, IntArrayMat(4), IntArrayMat(-4 + 1), IntArrayMat(-3)) - - || test_crop(a, IntArrayMat(4), IntArrayMat(-4 + 1), IntArrayMat(2)) - || test_crop(a, IntArrayMat(5), IntArrayMat(-5 + 1), IntArrayMat(2)) - || test_crop(a, IntArrayMat(6), IntArrayMat(-6 + 1), IntArrayMat(-2)) - - || test_crop(a, IntArrayMat(1), IntArrayMat(-5 + 1), IntArrayMat(3)) - || test_crop(a, IntArrayMat(2), IntArrayMat(-4 + 1), IntArrayMat(3)) - || test_crop(a, IntArrayMat(3), IntArrayMat(-3 + 1), IntArrayMat(-1)) - - || test_crop(a, IntArrayMat(11, 3), IntArrayMat(-7 + 1, -3 + 1), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(12, 4), IntArrayMat(-12 + 1, -4 + 1), IntArrayMat(0, 1)) - || test_crop(a, IntArrayMat(8, 5), IntArrayMat(-16 + 1, -5 + 1), IntArrayMat(-4, -3)) - - || test_crop(a, IntArrayMat(11, 1), IntArrayMat(-12 + 1, -5 + 1), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(12, 2), IntArrayMat(-16 + 1, -4 + 1), IntArrayMat(0, 2)) - || test_crop(a, IntArrayMat(8, 3), IntArrayMat(-7 + 1, -6 + 
1), IntArrayMat(-4, -2)) - - || test_crop(a, IntArrayMat(11, 3), IntArrayMat(-12 + 1, -2 + 1), IntArrayMat(0, 3)) - || test_crop(a, IntArrayMat(12, 4), IntArrayMat(-16 + 1, -3 + 1), IntArrayMat(0, 3)) - || test_crop(a, IntArrayMat(8, 5), IntArrayMat(-7 + 1, -4 + 1), IntArrayMat(-4, -1)) - - || test_crop(a, IntArrayMat(2, 3), IntArrayMat(-4 + 1, -2 + 1), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(3, 4), IntArrayMat(-2 + 1, -3 + 1), IntArrayMat(1, 2)) - || test_crop(a, IntArrayMat(4, 5), IntArrayMat(-3 + 1, -4 + 1), IntArrayMat(-3, -2)) - - || test_crop(a, IntArrayMat(3, 2), IntArrayMat(-2 + 1, -4 + 1), IntArrayMat(1, 3)) - || test_crop(a, IntArrayMat(4, 3), IntArrayMat(-3 + 1, -2 + 1), IntArrayMat(1, 3)) - || test_crop(a, IntArrayMat(5, 4), IntArrayMat(-4 + 1, -3 + 1), IntArrayMat(-3, -1)) - - || test_crop(a, IntArrayMat(2, 3), IntArrayMat(-4 + 1, -6 + 1), IntArrayMat(2, 3)) - || test_crop(a, IntArrayMat(1, 2), IntArrayMat(-5 + 1, -5 + 1), IntArrayMat(2, 3)) - || test_crop(a, IntArrayMat(3, 1), IntArrayMat(-6 + 1, -4 + 1), IntArrayMat(-2, -1)) - - || test_crop(a, IntArrayMat(11, 3, 3), IntArrayMat(-7 + 1, -3 + 1, -4 + 1), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(12, 4, 4), IntArrayMat(-12 + 1, -4 + 1, -3 + 1), IntArrayMat(0, 1, 2)) - || test_crop(a, IntArrayMat(8, 5, 5), IntArrayMat(-16 + 1, -5 + 1, -5 + 1), IntArrayMat(-4, -3, -2)) - - || test_crop(a, IntArrayMat(11, 2, 2), IntArrayMat(-7 + 1, -5 + 1, -4 + 1), IntArrayMat(0, 1, 3)) - || test_crop(a, IntArrayMat(12, 1, 1), IntArrayMat(-12 + 1, -6 + 1, -5 + 1), IntArrayMat(0, 1, 3)) - || test_crop(a, IntArrayMat(8, 3, 3), IntArrayMat(-16 + 1, -4 + 1, -6 + 1), IntArrayMat(-4, -3, -1)) - - || test_crop(a, IntArrayMat(11, 2, 5), IntArrayMat(-7 + 1, -2 + 1, -5 + 1), IntArrayMat(0, 2, 3)) - || test_crop(a, IntArrayMat(12, 3, 3), IntArrayMat(-12 + 1, -3 + 1, -4 + 1), IntArrayMat(0, 2, 3)) - || test_crop(a, IntArrayMat(8, 4, 4), IntArrayMat(-16 + 1, -4 + 1, -3 + 1), IntArrayMat(-4, -2, -1)) - - || test_crop(a, IntArrayMat(1, 3, 3), IntArrayMat(-3 + 1, -6 + 1, -4 + 1), IntArrayMat(1, 2, 3)) - || test_crop(a, IntArrayMat(2, 2, 2), IntArrayMat(-4 + 1, -4 + 1, -5 + 1), IntArrayMat(1, 2, 3)) - || test_crop(a, IntArrayMat(3, 1, 1), IntArrayMat(-5 + 1, -5 + 1, -6 + 1), IntArrayMat(-3, -2, -1)) - - || test_crop(a, IntArrayMat(11, 3, 4, 4), IntArrayMat(-7 + 1, -3 + 1, -2 + 1, -4 + 1), IntArrayMat(0, 1, 2, 3)) - || test_crop(a, IntArrayMat(12, 4, 5, 3), IntArrayMat(-12 + 1, -4 + 1, -3 + 1, -5 + 1), IntArrayMat(0, 1, 2, 3)) - || test_crop(a, IntArrayMat(8, 5, 6, 2), IntArrayMat(-16 + 1, -5 + 1, -4 + 1, -3 + 1), IntArrayMat(-4, -3, -2, -1)); -} - -static int test_crop_11(const ncnn::Mat& a) -{ - return 0 - || test_crop(a, 0, 0, 0, 0, a) - - || test_crop(a, 0, 5, 0, 0, ncnn::Mat(6, 6, 6)) - || test_crop(a, 6, 0, 0, 0, ncnn::Mat(8, 8, 8)) - || test_crop(a, 5, 5, 5, 0, ncnn::Mat(6, 6, 6)) - || test_crop(a, 6, 6, 6, 0, ncnn::Mat(8, 8, 8)) - || test_crop(a, 4, 4, 4, 0, ncnn::Mat(5, 5, 5)) - - || test_crop(a, 3, 3, 3, 11, ncnn::Mat(3, 3, 3, 7)) - || test_crop(a, 4, 4, 4, 12, ncnn::Mat(6, 6, 6, 12)) - || test_crop(a, 5, 5, 5, 8, ncnn::Mat(8, 8, 8, 16)); -} - int main() { SRAND(776757); @@ -697,37 +268,13 @@ int main() || test_crop_0(RandomMat(112)) || test_crop_0(RandomMat(126)) || test_crop_0(RandomMat(127)) - || test_crop_1(RandomMat(112)) - || test_crop_1(RandomMat(126)) - || test_crop_1(RandomMat(127)) - || test_crop_2(RandomMat(112)) - || test_crop_2(RandomMat(126)) - || test_crop_2(RandomMat(127)) || test_crop_3(RandomMat(20, 48)) || 
test_crop_3(RandomMat(15, 36)) || test_crop_3(RandomMat(16, 33)) - || test_crop_4(RandomMat(20, 48)) - || test_crop_4(RandomMat(15, 36)) - || test_crop_4(RandomMat(16, 33)) - || test_crop_5(RandomMat(20, 48)) - || test_crop_5(RandomMat(15, 36)) - || test_crop_5(RandomMat(16, 33)) || test_crop_6(RandomMat(20, 20, 48)) || test_crop_6(RandomMat(15, 15, 36)) || test_crop_6(RandomMat(16, 16, 33)) - || test_crop_7(RandomMat(20, 20, 48)) - || test_crop_7(RandomMat(15, 15, 36)) - || test_crop_7(RandomMat(16, 16, 33)) - || test_crop_8(RandomMat(20, 20, 48)) - || test_crop_8(RandomMat(15, 15, 36)) - || test_crop_8(RandomMat(16, 16, 33)) || test_crop_9(RandomMat(20, 20, 20, 48)) || test_crop_9(RandomMat(15, 15, 15, 36)) - || test_crop_9(RandomMat(16, 16, 16, 33)) - || test_crop_10(RandomMat(20, 20, 20, 48)) - || test_crop_10(RandomMat(15, 15, 15, 36)) - || test_crop_10(RandomMat(16, 16, 16, 33)) - || test_crop_11(RandomMat(20, 20, 20, 48)) - || test_crop_11(RandomMat(15, 15, 15, 36)) - || test_crop_11(RandomMat(16, 16, 16, 33)); + || test_crop_9(RandomMat(16, 16, 16, 33)); } diff --git a/tests/test_crop_1.cpp b/tests/test_crop_1.cpp new file mode 100644 index 000000000000..c875a51c7fa8 --- /dev/null +++ b/tests/test_crop_1.cpp @@ -0,0 +1,377 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "layer/crop.h" +#include "testutil.h" + +static ncnn::Mat IntArrayMat(int a0) +{ + ncnn::Mat m(1); + int* p = m; + p[0] = a0; + return m; +} + +static ncnn::Mat IntArrayMat(int a0, int a1) +{ + ncnn::Mat m(2); + int* p = m; + p[0] = a0; + p[1] = a1; + return m; +} + +static ncnn::Mat IntArrayMat(int a0, int a1, int a2) +{ + ncnn::Mat m(3); + int* p = m; + p[0] = a0; + p[1] = a1; + p[2] = a2; + return m; +} + +static ncnn::Mat IntArrayMat(int a0, int a1, int a2, int a3) +{ + ncnn::Mat m(4); + int* p = m; + p[0] = a0; + p[1] = a1; + p[2] = a2; + p[3] = a3; + return m; +} + +static void print_int_array(const ncnn::Mat& a) +{ + const int* pa = a; + + fprintf(stderr, "["); + for (int i = 0; i < a.w; i++) + { + fprintf(stderr, " %d", pa[i]); + } + fprintf(stderr, " ]"); +} + +static int test_crop(const ncnn::Mat& a, const ncnn::Mat& starts, const ncnn::Mat& ends, const ncnn::Mat& axes) +{ + ncnn::ParamDict pd; + pd.set(9, starts); // starts + pd.set(10, ends); // ends + pd.set(11, axes); // axes + + std::vector weights(0); + + int ret = test_layer("Crop", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_crop failed a.dims=%d a=(%d %d %d %d)", a.dims, a.w, a.h, a.d, a.c); + fprintf(stderr, " starts="); + print_int_array(starts); + fprintf(stderr, " ends="); + print_int_array(ends); + fprintf(stderr, " axes="); + print_int_array(axes); + fprintf(stderr, "\n"); + } + + return ret; +} + +static int test_crop_1(const ncnn::Mat& a) +{ + return 0 + || test_crop(a, IntArrayMat(12), IntArrayMat(-233), IntArrayMat(0)) + || test_crop(a, IntArrayMat(16), IntArrayMat(-233), IntArrayMat(0)) + || test_crop(a, IntArrayMat(11), IntArrayMat(11 + 16), IntArrayMat(0)) + || test_crop(a, IntArrayMat(12), IntArrayMat(12 + 7), IntArrayMat(-1)) + || test_crop(a, IntArrayMat(16), IntArrayMat(16 + 12), ncnn::Mat()) + || test_crop(a, IntArrayMat(11), IntArrayMat(-7 + 1), IntArrayMat(0)) + || test_crop(a, IntArrayMat(12), IntArrayMat(-12 + 1), IntArrayMat(-1)) + || test_crop(a, IntArrayMat(16), IntArrayMat(-16 + 1), ncnn::Mat()); +} + +static int test_crop_4(const ncnn::Mat& a) +{ + return 0 + || test_crop(a, IntArrayMat(12), IntArrayMat(-233), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(-233), IntArrayMat(0)) + || test_crop(a, IntArrayMat(4), IntArrayMat(-233), IntArrayMat(1)) + || test_crop(a, IntArrayMat(5, 11), IntArrayMat(-233, -233), IntArrayMat(0, 1)) + + || test_crop(a, IntArrayMat(11), IntArrayMat(11 + 16), IntArrayMat(0)) + || test_crop(a, IntArrayMat(12), IntArrayMat(12 + 7), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(8 + 12), IntArrayMat(-2)) + + || test_crop(a, IntArrayMat(5), IntArrayMat(8), IntArrayMat(1)) + || test_crop(a, IntArrayMat(6), IntArrayMat(9), IntArrayMat(1)) + || test_crop(a, IntArrayMat(4), IntArrayMat(12), IntArrayMat(-1)) + + || test_crop(a, IntArrayMat(11, 5), IntArrayMat(11 + 7, 11), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(12 + 12, 12), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(8 + 16, 10), IntArrayMat(0, -1)) + + || test_crop(a, IntArrayMat(11), IntArrayMat(-16 + 1), IntArrayMat(0)) + || test_crop(a, IntArrayMat(12), IntArrayMat(-7 + 1), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(-12 + 1), IntArrayMat(-2)) + + || test_crop(a, IntArrayMat(5), IntArrayMat(-5 + 1), IntArrayMat(1)) + || test_crop(a, IntArrayMat(6), IntArrayMat(-6 + 1), IntArrayMat(1)) + || test_crop(a, IntArrayMat(4), IntArrayMat(-4 + 1), IntArrayMat(-1)) + + || test_crop(a, IntArrayMat(11, 5), 
IntArrayMat(-12 + 1, -6 + 1), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-16 + 1, -5 + 1), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-7 + 1, -4 + 1), IntArrayMat(-2, -1)); +} + +static int test_crop_7(const ncnn::Mat& a) +{ + return 0 + || test_crop(a, IntArrayMat(11), IntArrayMat(-233), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(-233), IntArrayMat(0)) + || test_crop(a, IntArrayMat(5), IntArrayMat(-233), IntArrayMat(1)) + || test_crop(a, IntArrayMat(6), IntArrayMat(-233), IntArrayMat(2)) + || test_crop(a, IntArrayMat(4), IntArrayMat(-233), IntArrayMat(-1)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-233, -233), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-233, -233), IntArrayMat(0, -1)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-233, -233), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(6, 6), IntArrayMat(-233, -233), IntArrayMat(1, -1)) + || test_crop(a, IntArrayMat(11, 5, 5), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(8, 4, 4), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, -1)) + + || test_crop(a, IntArrayMat(11), IntArrayMat(11 + 7), IntArrayMat(0)) + || test_crop(a, IntArrayMat(12), IntArrayMat(12 + 12), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(8 + 16), IntArrayMat(0)) + + || test_crop(a, IntArrayMat(5), IntArrayMat(13), IntArrayMat(1)) + || test_crop(a, IntArrayMat(6), IntArrayMat(12), IntArrayMat(1)) + || test_crop(a, IntArrayMat(4), IntArrayMat(11), IntArrayMat(-2)) + + || test_crop(a, IntArrayMat(5), IntArrayMat(12), IntArrayMat(2)) + || test_crop(a, IntArrayMat(6), IntArrayMat(11), IntArrayMat(2)) + || test_crop(a, IntArrayMat(4), IntArrayMat(13), IntArrayMat(-1)) + + || test_crop(a, IntArrayMat(11, 5), IntArrayMat(11 + 7, 11), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(12 + 16, 12), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(8 + 12, 13), IntArrayMat(0, -2)) + + || test_crop(a, IntArrayMat(11, 5), IntArrayMat(11 + 16, 13), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(12 + 12, 11), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(8 + 7, 12), IntArrayMat(0, -1)) + + || test_crop(a, IntArrayMat(5, 4), IntArrayMat(12, 12), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(6, 3), IntArrayMat(13, 13), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(4, 2), IntArrayMat(11, 11), IntArrayMat(-2, -1)) + + || test_crop(a, IntArrayMat(11, 5, 2), IntArrayMat(11 + 7, 11, 11), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(12, 6, 4), IntArrayMat(12 + 16, 12, 12), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(8, 4, 3), IntArrayMat(8 + 12, 13, 13), IntArrayMat(-3, -2, -1)) + + || test_crop(a, IntArrayMat(11), IntArrayMat(-7 + 1), IntArrayMat(0)) + || test_crop(a, IntArrayMat(12), IntArrayMat(-12 + 1), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(-16 + 1), IntArrayMat(-3)) + + || test_crop(a, IntArrayMat(5), IntArrayMat(-6 + 1), IntArrayMat(1)) + || test_crop(a, IntArrayMat(6), IntArrayMat(-5 + 1), IntArrayMat(1)) + || test_crop(a, IntArrayMat(4), IntArrayMat(-4 + 1), IntArrayMat(-2)) + + || test_crop(a, IntArrayMat(5), IntArrayMat(-5 + 1), IntArrayMat(2)) + || test_crop(a, IntArrayMat(6), IntArrayMat(-4 + 1), IntArrayMat(2)) + || test_crop(a, IntArrayMat(4), IntArrayMat(-6 + 1), IntArrayMat(-1)) + + || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-7 + 1, -4 + 1), IntArrayMat(0, 1)) + || test_crop(a, 
IntArrayMat(12, 6), IntArrayMat(-12 + 1, -6 + 1), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-16 + 1, -5 + 1), IntArrayMat(-3, -2)) + + || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-12 + 1, -6 + 1), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-16 + 1, -5 + 1), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-7 + 1, -4 + 1), IntArrayMat(-3, -1)) + + || test_crop(a, IntArrayMat(5, 2), IntArrayMat(-5 + 1, -5 + 1), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(6, 4), IntArrayMat(-4 + 1, -4 + 1), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(4, 3), IntArrayMat(-6 + 1, -6 + 1), IntArrayMat(-2, -1)) + + || test_crop(a, IntArrayMat(11, 5, 4), IntArrayMat(-7 + 1, -5 + 1, -5 + 1), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(12, 6, 3), IntArrayMat(-12 + 1, -6 + 1, -6 + 1), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(8, 4, 2), IntArrayMat(-16 + 1, -4 + 1, -4 + 1), IntArrayMat(-3, -2, -1)); +} + +static int test_crop_10(const ncnn::Mat& a) +{ + return 0 + || test_crop(a, IntArrayMat(11), IntArrayMat(-233), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(-233), IntArrayMat(0)) + || test_crop(a, IntArrayMat(6), IntArrayMat(-233), IntArrayMat(1)) + || test_crop(a, IntArrayMat(5), IntArrayMat(-233), IntArrayMat(2)) + || test_crop(a, IntArrayMat(4), IntArrayMat(-233), IntArrayMat(-2)) + || test_crop(a, IntArrayMat(6), IntArrayMat(-233), IntArrayMat(3)) + || test_crop(a, IntArrayMat(5), IntArrayMat(-233), IntArrayMat(-1)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(-233, -233), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-233, -233), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(11, 5), IntArrayMat(-233, -233), IntArrayMat(-4, -2)) + || test_crop(a, IntArrayMat(4, 4), IntArrayMat(-233, -233), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(-233, -233), IntArrayMat(0, 3)) + || test_crop(a, IntArrayMat(5, 5), IntArrayMat(-233, -233), IntArrayMat(1, 3)) + || test_crop(a, IntArrayMat(4, 4), IntArrayMat(-233, -233), IntArrayMat(2, 3)) + || test_crop(a, IntArrayMat(12, 6, 6), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(11, 5, 5), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(8, 4, 4), IntArrayMat(-233, -233, -233), IntArrayMat(0, 1, 3)) + || test_crop(a, IntArrayMat(12, 6, 6), IntArrayMat(-233, -233, -233), IntArrayMat(0, 2, 3)) + || test_crop(a, IntArrayMat(11, 5, 5), IntArrayMat(-233, -233, -233), IntArrayMat(0, 2, 3)) + || test_crop(a, IntArrayMat(4, 4, 4), IntArrayMat(-233, -233, -233), IntArrayMat(1, 2, 3)) + || test_crop(a, IntArrayMat(6, 6, 6), IntArrayMat(-233, -233, -233), IntArrayMat(1, 2, 3)) + || test_crop(a, IntArrayMat(11, 5, 5, 5), IntArrayMat(-233, -233, -233, -233), IntArrayMat(0, 1, 2, 3)) + || test_crop(a, IntArrayMat(8, 4, 4, 4), IntArrayMat(-233, -233, -233, -233), IntArrayMat(0, 1, 2, 3)) + || test_crop(a, IntArrayMat(12, 6, 6, 6), IntArrayMat(-233, -233, -233, -233), IntArrayMat(-4, -3, -2, -1)) + + || test_crop(a, IntArrayMat(11), IntArrayMat(11 + 16), IntArrayMat(0)) + || test_crop(a, IntArrayMat(12), IntArrayMat(12 + 7), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(8 + 12), IntArrayMat(-4)) + + || test_crop(a, IntArrayMat(5), IntArrayMat(11), IntArrayMat(1)) + || test_crop(a, IntArrayMat(6), IntArrayMat(13), IntArrayMat(1)) + || test_crop(a, IntArrayMat(4), IntArrayMat(12), IntArrayMat(-3)) + + || test_crop(a, IntArrayMat(3), 
IntArrayMat(12), IntArrayMat(2)) + || test_crop(a, IntArrayMat(4), IntArrayMat(13), IntArrayMat(2)) + || test_crop(a, IntArrayMat(5), IntArrayMat(11), IntArrayMat(-2)) + + || test_crop(a, IntArrayMat(1), IntArrayMat(8), IntArrayMat(3)) + || test_crop(a, IntArrayMat(2), IntArrayMat(7), IntArrayMat(3)) + || test_crop(a, IntArrayMat(3), IntArrayMat(6), IntArrayMat(-1)) + + || test_crop(a, IntArrayMat(11, 5), IntArrayMat(11 + 7, 11), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(12, 6), IntArrayMat(12 + 12, 12), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(8, 4), IntArrayMat(8 + 16, 13), IntArrayMat(-4, -3)) + + || test_crop(a, IntArrayMat(11, 4), IntArrayMat(11 + 12, 13), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(12, 3), IntArrayMat(12 + 16, 11), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(8, 2), IntArrayMat(8 + 7, 12), IntArrayMat(-4, -2)) + + || test_crop(a, IntArrayMat(11, 1), IntArrayMat(11 + 16, 5), IntArrayMat(0, 3)) + || test_crop(a, IntArrayMat(12, 2), IntArrayMat(12 + 7, 6), IntArrayMat(0, 3)) + || test_crop(a, IntArrayMat(8, 3), IntArrayMat(8 + 12, 7), IntArrayMat(-4, -1)) + + || test_crop(a, IntArrayMat(3, 3), IntArrayMat(13, 4), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(4, 2), IntArrayMat(12, 3), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(5, 1), IntArrayMat(11, 2), IntArrayMat(-3, -2)) + + || test_crop(a, IntArrayMat(5, 5), IntArrayMat(11, 8), IntArrayMat(1, 3)) + || test_crop(a, IntArrayMat(4, 6), IntArrayMat(12, 9), IntArrayMat(1, 3)) + || test_crop(a, IntArrayMat(3, 4), IntArrayMat(13, 7), IntArrayMat(-3, -1)) + + || test_crop(a, IntArrayMat(2, 3), IntArrayMat(12, 9), IntArrayMat(2, 3)) + || test_crop(a, IntArrayMat(3, 2), IntArrayMat(11, 7), IntArrayMat(2, 3)) + || test_crop(a, IntArrayMat(4, 1), IntArrayMat(10, 8), IntArrayMat(-2, -1)) + + || test_crop(a, IntArrayMat(11, 2, 2), IntArrayMat(11 + 6, 9, 9), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(12, 3, 3), IntArrayMat(12 + 1, 10, 10), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(8, 4, 4), IntArrayMat(8 + 3, 11, 11), IntArrayMat(-4, -3, -2)) + + || test_crop(a, IntArrayMat(11, 4, 4), IntArrayMat(11 + 12, 12, 12), IntArrayMat(0, 1, 3)) + || test_crop(a, IntArrayMat(12, 5, 5), IntArrayMat(12 + 8, 11, 11), IntArrayMat(0, 1, 3)) + || test_crop(a, IntArrayMat(8, 6, 6), IntArrayMat(8 + 4, 13, 13), IntArrayMat(-4, -3, -1)) + + || test_crop(a, IntArrayMat(11, 1, 4), IntArrayMat(11 + 5, 12, 12), IntArrayMat(0, 2, 3)) + || test_crop(a, IntArrayMat(12, 3, 3), IntArrayMat(12 + 3, 11, 11), IntArrayMat(0, 2, 3)) + || test_crop(a, IntArrayMat(8, 2, 5), IntArrayMat(8 + 2, 10, 10), IntArrayMat(-4, -2, -1)) + + || test_crop(a, IntArrayMat(1, 1, 1), IntArrayMat(7, 7, 7), IntArrayMat(1, 2, 3)) + || test_crop(a, IntArrayMat(2, 2, 2), IntArrayMat(8, 9, 10), IntArrayMat(1, 2, 3)) + || test_crop(a, IntArrayMat(3, 3, 3), IntArrayMat(11, 12, 13), IntArrayMat(-3, -2, -1)) + + || test_crop(a, IntArrayMat(11, 2, 3, 6), IntArrayMat(11 + 11, 10, 12, 11), IntArrayMat(0, 1, 2, 3)) + || test_crop(a, IntArrayMat(12, 3, 4, 5), IntArrayMat(12 + 12, 9, 11, 13), IntArrayMat(0, 1, 2, 3)) + || test_crop(a, IntArrayMat(8, 4, 5, 4), IntArrayMat(8 + 8, 8, 10, 12), IntArrayMat(-4, -3, -2, -1)) + + || test_crop(a, IntArrayMat(11), IntArrayMat(-7 + 1), IntArrayMat(0)) + || test_crop(a, IntArrayMat(12), IntArrayMat(-12 + 1), IntArrayMat(0)) + || test_crop(a, IntArrayMat(8), IntArrayMat(-16 + 1), IntArrayMat(-4)) + + || test_crop(a, IntArrayMat(5), IntArrayMat(-6 + 1), IntArrayMat(1)) + || test_crop(a, IntArrayMat(6), 
IntArrayMat(-5 + 1), IntArrayMat(1)) + || test_crop(a, IntArrayMat(4), IntArrayMat(-4 + 1), IntArrayMat(-3)) + + || test_crop(a, IntArrayMat(4), IntArrayMat(-4 + 1), IntArrayMat(2)) + || test_crop(a, IntArrayMat(5), IntArrayMat(-5 + 1), IntArrayMat(2)) + || test_crop(a, IntArrayMat(6), IntArrayMat(-6 + 1), IntArrayMat(-2)) + + || test_crop(a, IntArrayMat(1), IntArrayMat(-5 + 1), IntArrayMat(3)) + || test_crop(a, IntArrayMat(2), IntArrayMat(-4 + 1), IntArrayMat(3)) + || test_crop(a, IntArrayMat(3), IntArrayMat(-3 + 1), IntArrayMat(-1)) + + || test_crop(a, IntArrayMat(11, 3), IntArrayMat(-7 + 1, -3 + 1), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(12, 4), IntArrayMat(-12 + 1, -4 + 1), IntArrayMat(0, 1)) + || test_crop(a, IntArrayMat(8, 5), IntArrayMat(-16 + 1, -5 + 1), IntArrayMat(-4, -3)) + + || test_crop(a, IntArrayMat(11, 1), IntArrayMat(-12 + 1, -5 + 1), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(12, 2), IntArrayMat(-16 + 1, -4 + 1), IntArrayMat(0, 2)) + || test_crop(a, IntArrayMat(8, 3), IntArrayMat(-7 + 1, -6 + 1), IntArrayMat(-4, -2)) + + || test_crop(a, IntArrayMat(11, 3), IntArrayMat(-12 + 1, -2 + 1), IntArrayMat(0, 3)) + || test_crop(a, IntArrayMat(12, 4), IntArrayMat(-16 + 1, -3 + 1), IntArrayMat(0, 3)) + || test_crop(a, IntArrayMat(8, 5), IntArrayMat(-7 + 1, -4 + 1), IntArrayMat(-4, -1)) + + || test_crop(a, IntArrayMat(2, 3), IntArrayMat(-4 + 1, -2 + 1), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(3, 4), IntArrayMat(-2 + 1, -3 + 1), IntArrayMat(1, 2)) + || test_crop(a, IntArrayMat(4, 5), IntArrayMat(-3 + 1, -4 + 1), IntArrayMat(-3, -2)) + + || test_crop(a, IntArrayMat(3, 2), IntArrayMat(-2 + 1, -4 + 1), IntArrayMat(1, 3)) + || test_crop(a, IntArrayMat(4, 3), IntArrayMat(-3 + 1, -2 + 1), IntArrayMat(1, 3)) + || test_crop(a, IntArrayMat(5, 4), IntArrayMat(-4 + 1, -3 + 1), IntArrayMat(-3, -1)) + + || test_crop(a, IntArrayMat(2, 3), IntArrayMat(-4 + 1, -6 + 1), IntArrayMat(2, 3)) + || test_crop(a, IntArrayMat(1, 2), IntArrayMat(-5 + 1, -5 + 1), IntArrayMat(2, 3)) + || test_crop(a, IntArrayMat(3, 1), IntArrayMat(-6 + 1, -4 + 1), IntArrayMat(-2, -1)) + + || test_crop(a, IntArrayMat(11, 3, 3), IntArrayMat(-7 + 1, -3 + 1, -4 + 1), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(12, 4, 4), IntArrayMat(-12 + 1, -4 + 1, -3 + 1), IntArrayMat(0, 1, 2)) + || test_crop(a, IntArrayMat(8, 5, 5), IntArrayMat(-16 + 1, -5 + 1, -5 + 1), IntArrayMat(-4, -3, -2)) + + || test_crop(a, IntArrayMat(11, 2, 2), IntArrayMat(-7 + 1, -5 + 1, -4 + 1), IntArrayMat(0, 1, 3)) + || test_crop(a, IntArrayMat(12, 1, 1), IntArrayMat(-12 + 1, -6 + 1, -5 + 1), IntArrayMat(0, 1, 3)) + || test_crop(a, IntArrayMat(8, 3, 3), IntArrayMat(-16 + 1, -4 + 1, -6 + 1), IntArrayMat(-4, -3, -1)) + + || test_crop(a, IntArrayMat(11, 2, 5), IntArrayMat(-7 + 1, -2 + 1, -5 + 1), IntArrayMat(0, 2, 3)) + || test_crop(a, IntArrayMat(12, 3, 3), IntArrayMat(-12 + 1, -3 + 1, -4 + 1), IntArrayMat(0, 2, 3)) + || test_crop(a, IntArrayMat(8, 4, 4), IntArrayMat(-16 + 1, -4 + 1, -3 + 1), IntArrayMat(-4, -2, -1)) + + || test_crop(a, IntArrayMat(1, 3, 3), IntArrayMat(-3 + 1, -6 + 1, -4 + 1), IntArrayMat(1, 2, 3)) + || test_crop(a, IntArrayMat(2, 2, 2), IntArrayMat(-4 + 1, -4 + 1, -5 + 1), IntArrayMat(1, 2, 3)) + || test_crop(a, IntArrayMat(3, 1, 1), IntArrayMat(-5 + 1, -5 + 1, -6 + 1), IntArrayMat(-3, -2, -1)) + + || test_crop(a, IntArrayMat(11, 3, 4, 4), IntArrayMat(-7 + 1, -3 + 1, -2 + 1, -4 + 1), IntArrayMat(0, 1, 2, 3)) + || test_crop(a, IntArrayMat(12, 4, 5, 3), IntArrayMat(-12 + 1, -4 + 1, -3 + 1, -5 + 1), IntArrayMat(0, 1, 2, 
3)) + || test_crop(a, IntArrayMat(8, 5, 6, 2), IntArrayMat(-16 + 1, -5 + 1, -4 + 1, -3 + 1), IntArrayMat(-4, -3, -2, -1)); +} + +int main() +{ + SRAND(776757); + + return 0 + || test_crop_1(RandomMat(112)) + || test_crop_1(RandomMat(126)) + || test_crop_1(RandomMat(127)) + || test_crop_4(RandomMat(20, 48)) + || test_crop_4(RandomMat(15, 36)) + || test_crop_4(RandomMat(16, 33)) + || test_crop_7(RandomMat(20, 20, 48)) + || test_crop_7(RandomMat(15, 15, 36)) + || test_crop_7(RandomMat(16, 16, 33)) + || test_crop_10(RandomMat(20, 20, 20, 48)) + || test_crop_10(RandomMat(15, 15, 15, 36)) + || test_crop_10(RandomMat(16, 16, 16, 33)); +} diff --git a/tests/test_crop_2.cpp b/tests/test_crop_2.cpp new file mode 100644 index 000000000000..287634b973eb --- /dev/null +++ b/tests/test_crop_2.cpp @@ -0,0 +1,122 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/crop.h" +#include "testutil.h" + +static int test_crop(const ncnn::Mat& a, int woffset, int hoffset, int doffset, int coffset, const ncnn::Mat& ref) +{ + ncnn::ParamDict pd; + pd.set(0, woffset); + pd.set(1, hoffset); + pd.set(13, doffset); + pd.set(2, coffset); + pd.set(3, 0); // outw + pd.set(4, 0); // outh + pd.set(14, 0); // outd + pd.set(5, 0); // outc + pd.set(6, 0); // woffset2 + pd.set(7, 0); // hoffset2 + pd.set(15, 0); // doffset2 + pd.set(8, 0); // coffset2 + + std::vector weights(0); + + std::vector ab(2); + ab[0] = a; + ab[1] = ref; + + int ret = test_layer("Crop", pd, weights, ab); + if (ret != 0) + { + fprintf(stderr, "test_crop failed a.dims=%d a=(%d %d %d %d) woffset=%d hoffset=%d doffset=%d coffset=%d ref.dims=%d ref=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c, woffset, hoffset, doffset, coffset, ref.dims, ref.w, ref.h, ref.d, ref.c); + } + + return ret; +} + +static int test_crop_2(const ncnn::Mat& a) +{ + return 0 + || test_crop(a, 0, 0, 0, 0, a) + || test_crop(a, 0, 0, 0, 0, ncnn::Mat(27)) + + || test_crop(a, 11, 0, 0, 0, ncnn::Mat(7)) + || test_crop(a, 12, 0, 0, 0, ncnn::Mat(12)) + || test_crop(a, 16, 0, 0, 0, ncnn::Mat(16)); +} + +static int test_crop_5(const ncnn::Mat& a) +{ + return 0 + || test_crop(a, 0, 0, 0, 0, a) + + || test_crop(a, 0, 12, 0, 0, ncnn::Mat(8, 7)) + || test_crop(a, 5, 0, 0, 0, ncnn::Mat(7, 27)) + + || test_crop(a, 5, 11, 0, 0, ncnn::Mat(5, 12)) + || test_crop(a, 6, 12, 0, 0, ncnn::Mat(4, 16)) + || test_crop(a, 4, 8, 0, 0, ncnn::Mat(6, 7)); +} + +static int test_crop_8(const ncnn::Mat& a) +{ + return 0 + || test_crop(a, 0, 0, 0, 0, a) + + || test_crop(a, 0, 5, 0, 0, ncnn::Mat(6, 6)) + || test_crop(a, 6, 0, 0, 0, ncnn::Mat(8, 8)) + || test_crop(a, 5, 2, 0, 0, ncnn::Mat(6, 3)) + || test_crop(a, 6, 3, 0, 0, ncnn::Mat(8, 4)) + || test_crop(a, 4, 4, 0, 0, ncnn::Mat(7, 5)) + + || test_crop(a, 5, 3, 0, 11, ncnn::Mat(7, 3, 7)) + || test_crop(a, 6, 4, 0, 12, ncnn::Mat(6, 4, 12)) + || test_crop(a, 4, 2, 0, 8, ncnn::Mat(5, 5, 16)); 
+} + +static int test_crop_11(const ncnn::Mat& a) +{ + return 0 + || test_crop(a, 0, 0, 0, 0, a) + + || test_crop(a, 0, 5, 0, 0, ncnn::Mat(6, 6, 6)) + || test_crop(a, 6, 0, 0, 0, ncnn::Mat(8, 8, 8)) + || test_crop(a, 5, 5, 5, 0, ncnn::Mat(6, 6, 6)) + || test_crop(a, 6, 6, 6, 0, ncnn::Mat(8, 8, 8)) + || test_crop(a, 4, 4, 4, 0, ncnn::Mat(5, 5, 5)) + + || test_crop(a, 3, 3, 3, 11, ncnn::Mat(3, 3, 3, 7)) + || test_crop(a, 4, 4, 4, 12, ncnn::Mat(6, 6, 6, 12)) + || test_crop(a, 5, 5, 5, 8, ncnn::Mat(8, 8, 8, 16)); +} + +int main() +{ + SRAND(776757); + + return 0 + || test_crop_2(RandomMat(112)) + || test_crop_2(RandomMat(126)) + || test_crop_2(RandomMat(127)) + || test_crop_5(RandomMat(20, 48)) + || test_crop_5(RandomMat(15, 36)) + || test_crop_5(RandomMat(16, 33)) + || test_crop_8(RandomMat(20, 20, 48)) + || test_crop_8(RandomMat(15, 15, 36)) + || test_crop_8(RandomMat(16, 16, 33)) + || test_crop_11(RandomMat(20, 20, 20, 48)) + || test_crop_11(RandomMat(15, 15, 15, 36)) + || test_crop_11(RandomMat(16, 16, 16, 33)); +} diff --git a/tests/test_deformableconv2d.cpp b/tests/test_deformableconv2d.cpp index 01511e544960..6cc2d6141382 100644 --- a/tests/test_deformableconv2d.cpp +++ b/tests/test_deformableconv2d.cpp @@ -59,7 +59,7 @@ static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int static int test_deformableconv2d_0() { - static const int kdsp[16][4] = { + static const int kdsp[10][4] = { {1, 1, 1, 0}, {1, 1, 2, 0}, {2, 1, 1, 1}, @@ -67,18 +67,12 @@ static int test_deformableconv2d_0() {3, 1, 1, 1}, {3, 1, 2, 1}, {3, 2, 1, 1}, - {4, 1, 1, 0}, {4, 1, 2, 1}, - {4, 2, 1, 1}, - {5, 1, 1, 2}, {5, 1, 2, 2}, {5, 2, 2, 2}, - {7, 1, 1, 3}, - {7, 1, 2, 3}, - {7, 2, 1, 3}, }; - for (int i = 0; i < 16; i++) + for (int i = 0; i < 4; i++) { const int k = kdsp[i][0]; const int d = kdsp[i][1]; @@ -93,7 +87,23 @@ static int test_deformableconv2d_0() || test_deformableconv2d(9, 7, 8, 4, k, d, s, p, 1) || test_deformableconv2d(9, 7, 8, 13, k, d, s, p, 0) || test_deformableconv2d(9, 7, 13, 8, k, d, s, p, 1) - || test_deformableconv2d(9, 7, 16, 16, k, d, s, p, 0); + || test_deformableconv2d(9, 7, 16, 16, k, d, s, p, 0) + || test_deformableconv2d(16, 16, 1 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 16 * 3, k, d, s, p, 1); if (ret != 0) return -1; diff --git a/tests/test_deformableconv2d_1.cpp b/tests/test_deformableconv2d_1.cpp new file mode 100644 index 000000000000..2f2febf469df --- /dev/null +++ b/tests/test_deformableconv2d_1.cpp @@ -0,0 +1,120 @@ +// Tencent is pleased to support the open source community by making ncnn available. 
+// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/deformableconv2d.h" +#include "testutil.h" + +static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + const int kernel_extent_w = dilation * (kernel - 1) + 1; + const int kernel_extent_h = dilation * (kernel - 1) + 1; + const int out_w = (w + pad + pad - kernel_extent_w) / stride + 1; + const int out_h = (h + pad + pad - kernel_extent_h) / stride + 1; + std::vector a(3); + a[0] = RandomMat(w, h, c); + a[1] = RandomMat(out_w, out_h, kernel * kernel * 2); + a[2] = RandomMat(out_w, out_h, kernel * kernel); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch * c * kernel * kernel); + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 2 : 1); + weights[0] = RandomMat(outch * c * kernel * kernel); + if (bias) + weights[1] = RandomMat(outch); + + float epsilon = 0.001; + int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_deformableconv2d_0() +{ + static const int kdsp[10][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, 0}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, 1}, + {4, 1, 2, 1}, + {5, 1, 2, 2}, + {5, 2, 2, 2}, + }; + + for (int i = 4; i < 6; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_deformableconv2d(9, 7, 1, 1, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 4, 13, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 13, 4, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 4, 8, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 8, 4, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 8, 13, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 13, 8, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 16, 16, k, d, s, p, 0) + || test_deformableconv2d(16, 16, 1 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 8 * 3, k, 
d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 16 * 3, k, d, s, p, 1); + + if (ret != 0) + return -1; + } + + return 0; +} + +int main() +{ + SRAND(7767517); + + return test_deformableconv2d_0(); +} diff --git a/tests/test_deformableconv2d_2.cpp b/tests/test_deformableconv2d_2.cpp new file mode 100644 index 000000000000..130761d87d59 --- /dev/null +++ b/tests/test_deformableconv2d_2.cpp @@ -0,0 +1,120 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/deformableconv2d.h" +#include "testutil.h" + +static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + const int kernel_extent_w = dilation * (kernel - 1) + 1; + const int kernel_extent_h = dilation * (kernel - 1) + 1; + const int out_w = (w + pad + pad - kernel_extent_w) / stride + 1; + const int out_h = (h + pad + pad - kernel_extent_h) / stride + 1; + std::vector a(3); + a[0] = RandomMat(w, h, c); + a[1] = RandomMat(out_w, out_h, kernel * kernel * 2); + a[2] = RandomMat(out_w, out_h, kernel * kernel); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch * c * kernel * kernel); + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 
2 : 1); + weights[0] = RandomMat(outch * c * kernel * kernel); + if (bias) + weights[1] = RandomMat(outch); + + float epsilon = 0.001; + int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_deformableconv2d_0() +{ + static const int kdsp[10][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, 0}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, 1}, + {4, 1, 2, 1}, + {5, 1, 2, 2}, + {5, 2, 2, 2}, + }; + + for (int i = 6; i < 8; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_deformableconv2d(9, 7, 1, 1, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 4, 13, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 13, 4, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 4, 8, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 8, 4, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 8, 13, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 13, 8, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 16, 16, k, d, s, p, 0) + || test_deformableconv2d(16, 16, 1 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 16 * 3, k, d, s, p, 1); + + if (ret != 0) + return -1; + } + + return 0; +} + +int main() +{ + SRAND(7767517); + + return test_deformableconv2d_0(); +} diff --git a/tests/test_deformableconv2d_3.cpp b/tests/test_deformableconv2d_3.cpp new file mode 100644 index 000000000000..1a78a004db0c --- /dev/null +++ b/tests/test_deformableconv2d_3.cpp @@ -0,0 +1,120 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "layer/deformableconv2d.h" +#include "testutil.h" + +static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + const int kernel_extent_w = dilation * (kernel - 1) + 1; + const int kernel_extent_h = dilation * (kernel - 1) + 1; + const int out_w = (w + pad + pad - kernel_extent_w) / stride + 1; + const int out_h = (h + pad + pad - kernel_extent_h) / stride + 1; + std::vector a(3); + a[0] = RandomMat(w, h, c); + a[1] = RandomMat(out_w, out_h, kernel * kernel * 2); + a[2] = RandomMat(out_w, out_h, kernel * kernel); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch * c * kernel * kernel); + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 2 : 1); + weights[0] = RandomMat(outch * c * kernel * kernel); + if (bias) + weights[1] = RandomMat(outch); + + float epsilon = 0.001; + int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_deformableconv2d_0() +{ + static const int kdsp[10][4] = { + {1, 1, 1, 0}, + {1, 1, 2, 0}, + {2, 1, 1, 1}, + {2, 1, 2, 0}, + {3, 1, 1, 1}, + {3, 1, 2, 1}, + {3, 2, 1, 1}, + {4, 1, 2, 1}, + {5, 1, 2, 2}, + {5, 2, 2, 2}, + }; + + for (int i = 8; i < 10; i++) + { + const int k = kdsp[i][0]; + const int d = kdsp[i][1]; + const int s = kdsp[i][2]; + const int p = kdsp[i][3]; + + int ret = 0 + || test_deformableconv2d(9, 7, 1, 1, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 4, 13, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 13, 4, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 4, 8, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 8, 4, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 8, 13, k, d, s, p, 0) + || test_deformableconv2d(9, 7, 13, 8, k, d, s, p, 1) + || test_deformableconv2d(9, 7, 16, 16, k, d, s, p, 0) + || test_deformableconv2d(16, 16, 1 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 1 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 4 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 8 * 3, 16 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 1 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 4 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 8 * 3, k, d, s, p, 1) + || test_deformableconv2d(16, 16, 16 * 3, 16 * 3, k, d, s, p, 1); + + if (ret != 0) + return -1; + } + + 
return 0; +} + +int main() +{ + SRAND(7767517); + + return test_deformableconv2d_0(); +} diff --git a/tests/test_deformableconv2d_4.cpp b/tests/test_deformableconv2d_4.cpp new file mode 100644 index 000000000000..eca9f289dec0 --- /dev/null +++ b/tests/test_deformableconv2d_4.cpp @@ -0,0 +1,76 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/deformableconv2d.h" +#include "testutil.h" + +static int test_deformableconv2d(int w, int h, int c, int outch, int kernel, int dilation, int stride, int pad, int bias) +{ + const int kernel_extent_w = dilation * (kernel - 1) + 1; + const int kernel_extent_h = dilation * (kernel - 1) + 1; + const int out_w = (w + pad + pad - kernel_extent_w) / stride + 1; + const int out_h = (h + pad + pad - kernel_extent_h) / stride + 1; + std::vector a(3); + a[0] = RandomMat(w, h, c); + a[1] = RandomMat(out_w, out_h, kernel * kernel * 2); + a[2] = RandomMat(out_w, out_h, kernel * kernel); + + ncnn::ParamDict pd; + pd.set(0, outch); + pd.set(1, kernel); + pd.set(2, dilation); + pd.set(3, stride); + pd.set(4, pad); + pd.set(5, bias); + pd.set(6, outch * c * kernel * kernel); + + int activation_type = RAND() % 7; // 0 1 2 3 4 5 6 + ncnn::Mat activation_params(2); + activation_params[0] = (activation_type == 6) ? RandomFloat(0, 1) : RandomFloat(-1, 0); // alpha + activation_params[1] = RandomFloat(0, 1); // beta + pd.set(9, activation_type); + pd.set(10, activation_params); + + std::vector weights(bias ? 
2 : 1); + weights[0] = RandomMat(outch * c * kernel * kernel); + if (bias) + weights[1] = RandomMat(outch); + + float epsilon = 0.001; + int ret = test_layer("DeformableConv2D", pd, weights, a, 1, epsilon); + if (ret != 0) + { + fprintf(stderr, "test_deformableconv2d failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, activation_type, activation_params[0], activation_params[1]); + } + + return ret; +} + +static int test_deformableconv2d_0() +{ + return 0 + || test_deformableconv2d(7, 5, 24, 32, 4, 2, 2, 2, 1) + || test_deformableconv2d(7, 5, 32, 24, 4, 2, 2, 2, 1) + || test_deformableconv2d(7, 5, 28, 32, 4, 2, 2, 2, 1) + || test_deformableconv2d(7, 5, 32, 28, 4, 2, 2, 2, 1) + || test_deformableconv2d(7, 5, 26, 32, 4, 2, 2, 2, 1) + || test_deformableconv2d(7, 5, 32, 26, 4, 2, 2, 2, 1); +} + +int main() +{ + SRAND(7767517); + + return test_deformableconv2d_0(); +} diff --git a/tests/test_expanddims.cpp b/tests/test_expanddims.cpp index b18dfdcc065a..d05d84a9d3b8 100644 --- a/tests/test_expanddims.cpp +++ b/tests/test_expanddims.cpp @@ -15,11 +15,12 @@ #include "layer/expanddims.h" #include "testutil.h" -static int test_expanddims(const ncnn::Mat& a, int expand_w, int expand_h, int expand_c) +static int test_expanddims(const ncnn::Mat& a, int expand_w, int expand_h, int expand_d, int expand_c) { ncnn::ParamDict pd; pd.set(0, expand_w); pd.set(1, expand_h); + pd.set(11, expand_d); pd.set(2, expand_c); std::vector weights(0); @@ -27,7 +28,7 @@ static int test_expanddims(const ncnn::Mat& a, int expand_w, int expand_h, int e int ret = test_layer("ExpandDims", pd, weights, a); if (ret != 0) { - fprintf(stderr, "test_expanddims failed a.dims=%d a=(%d %d %d) expand_w=%d expand_h=%d expand_c=%d\n", a.dims, a.w, a.h, a.c, expand_w, expand_h, expand_c); + fprintf(stderr, "test_expanddims failed a.dims=%d a=(%d %d %d %d) expand_w=%d expand_h=%d expand_d=%d expand_c=%d\n", a.dims, a.w, a.h, a.d, a.c, expand_w, expand_h, expand_d, expand_c); } return ret; @@ -60,6 +61,17 @@ static ncnn::Mat IntArrayMat(int a0, int a1, int a2) return m; } +static ncnn::Mat IntArrayMat(int a0, int a1, int a2, int a3) +{ + ncnn::Mat m(4); + int* p = m; + p[0] = a0; + p[1] = a1; + p[2] = a2; + p[3] = a3; + return m; +} + static void print_int_array(const ncnn::Mat& a) { const int* pa = a; @@ -82,7 +94,7 @@ static int test_expanddims_axes(const ncnn::Mat& a, const ncnn::Mat& axes) int ret = test_layer("ExpandDims", pd, weights, a); if (ret != 0) { - fprintf(stderr, "test_expanddims_axes failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c); + fprintf(stderr, "test_expanddims_axes failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c); fprintf(stderr, " axes="); print_int_array(axes); fprintf(stderr, "\n"); @@ -91,48 +103,73 @@ static int test_expanddims_axes(const ncnn::Mat& a, const ncnn::Mat& axes) return ret; } -static int test_expand_0() +static int test_expanddims_all_params(const ncnn::Mat& a) { - ncnn::Mat as[7]; - as[0] = RandomMat(1, 1, 1); - as[1] = RandomMat(14, 16); - as[2] = RandomMat(1, 14); - as[3] = RandomMat(11, 1); - as[4] = RandomMat(1, 1); - as[5] = RandomMat(120); - as[6] = RandomMat(1); - - for (int i = 0; i < 7; i++) - { - const ncnn::Mat& a = as[i]; - int ret = 0 - || test_expanddims(a, 0, 0, 0) - || test_expanddims(a, 0, 0, 1) - || test_expanddims(a, 0, 1, 0) - || test_expanddims(a, 0, 1, 1) - || test_expanddims(a, 1, 0, 0) - || test_expanddims(a, 1, 0, 1) - || test_expanddims(a, 1, 
1, 0) - || test_expanddims(a, 1, 1, 1) - - || test_expanddims_axes(a, IntArrayMat(0)) - || test_expanddims_axes(a, IntArrayMat(1)) - || test_expanddims_axes(a, IntArrayMat(2)) - || test_expanddims_axes(a, IntArrayMat(0, 1)) - || test_expanddims_axes(a, IntArrayMat(0, 2)) - || test_expanddims_axes(a, IntArrayMat(1, 2)) - || test_expanddims_axes(a, IntArrayMat(0, 1, 2)); - - if (ret != 0) - return ret; - } + return 0 + || test_expanddims(a, 0, 0, 0, 0) + || test_expanddims(a, 0, 0, 0, 1) + || test_expanddims(a, 0, 0, 1, 0) + || test_expanddims(a, 0, 0, 1, 1) + || test_expanddims(a, 0, 1, 0, 0) + || test_expanddims(a, 0, 1, 0, 1) + || test_expanddims(a, 0, 1, 1, 0) + || test_expanddims(a, 0, 1, 1, 1) + || test_expanddims(a, 1, 0, 0, 0) + || test_expanddims(a, 1, 0, 0, 1) + || test_expanddims(a, 1, 0, 1, 0) + || test_expanddims(a, 1, 0, 1, 1) + || test_expanddims(a, 1, 1, 0, 0) + || test_expanddims(a, 1, 1, 0, 1) + || test_expanddims(a, 1, 1, 1, 0) + || test_expanddims(a, 1, 1, 1, 1) + + || test_expanddims_axes(a, IntArrayMat(0)) + || test_expanddims_axes(a, IntArrayMat(1)) + || test_expanddims_axes(a, IntArrayMat(2)) + || test_expanddims_axes(a, IntArrayMat(3)) + || test_expanddims_axes(a, IntArrayMat(0, 1)) + || test_expanddims_axes(a, IntArrayMat(0, 2)) + || test_expanddims_axes(a, IntArrayMat(0, 3)) + || test_expanddims_axes(a, IntArrayMat(1, 2)) + || test_expanddims_axes(a, IntArrayMat(1, 3)) + || test_expanddims_axes(a, IntArrayMat(2, 3)) + || test_expanddims_axes(a, IntArrayMat(0, 1, 2)) + || test_expanddims_axes(a, IntArrayMat(0, 1, 3)) + || test_expanddims_axes(a, IntArrayMat(0, 2, 3)) + || test_expanddims_axes(a, IntArrayMat(1, 2, 3)) + || test_expanddims_axes(a, IntArrayMat(0, 1, 2, 3)); +} + +static int test_expanddims_0() +{ + return 0 + || test_expanddims_all_params(RandomMat(3, 12, 16)) + || test_expanddims_all_params(RandomMat(3, 1, 16)) + || test_expanddims_all_params(RandomMat(1, 33, 15)) + || test_expanddims_all_params(RandomMat(1, 14, 1)) + || test_expanddims_all_params(RandomMat(12, 13, 1)) + || test_expanddims_all_params(RandomMat(1, 1, 1)); +} - return 0; +static int test_expanddims_1() +{ + return 0 + || test_expanddims_all_params(RandomMat(14, 16)) + || test_expanddims_all_params(RandomMat(1, 14)) + || test_expanddims_all_params(RandomMat(11, 1)) + || test_expanddims_all_params(RandomMat(1, 1)); +} + +static int test_expanddims_2() +{ + return 0 + || test_expanddims_all_params(RandomMat(120)) + || test_expanddims_all_params(RandomMat(1)); } int main() { SRAND(7767517); - return test_expand_0(); + return test_expanddims_0() || test_expanddims_1() || test_expanddims_2(); } diff --git a/tests/test_fold.cpp b/tests/test_fold.cpp new file mode 100644 index 000000000000..11a38428fdd7 --- /dev/null +++ b/tests/test_fold.cpp @@ -0,0 +1,58 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "layer/fold.h" +#include "testutil.h" + +static int test_fold(int w, int h, int outw, int outh, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_w, int pad_h) +{ + ncnn::Mat a = RandomMat(w, h); + + ncnn::ParamDict pd; + pd.set(1, kernel_w); + pd.set(11, kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, pad_w); + pd.set(14, pad_h); + pd.set(20, outw); + pd.set(21, outh); + + std::vector weights(0); + + int ret = test_layer("Fold", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_fold failed w=%d h=%d outw=%d outh=%d kernel=%d,%d dilation=%d,%d stride=%d,%d pad=%d,%d\n", w, h, outw, outh, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_w, pad_h); + } + + return ret; +} + +static int test_fold_0() +{ + return 0 + || test_fold(400, 108, 22, 22, 3, 3, 1, 1, 1, 1, 0, 0) + || test_fold(190, 96, 18, 17, 4, 2, 1, 1, 1, 2, 2, 2) + || test_fold(120, 36, 11, 5, 3, 2, 2, 1, 1, 1, 4, 2); +} + +int main() +{ + SRAND(7767517); + + return test_fold_0(); +} diff --git a/tests/test_gelu.cpp b/tests/test_gelu.cpp index 974079edea8f..f4ac70cf8e25 100644 --- a/tests/test_gelu.cpp +++ b/tests/test_gelu.cpp @@ -34,6 +34,8 @@ static int test_gelu(const ncnn::Mat& a, bool fast_gelu) static int test_gelu_0() { return 0 + || test_gelu(RandomMat(9, 7, 32), false) + || test_gelu(RandomMat(9, 7, 32), true) || test_gelu(RandomMat(5, 7, 24), false) || test_gelu(RandomMat(5, 7, 24), true) || test_gelu(RandomMat(7, 9, 12), false) @@ -45,6 +47,8 @@ static int test_gelu_0() static int test_gelu_1() { return 0 + || test_gelu(RandomMat(13, 32), false) + || test_gelu(RandomMat(13, 32), true) || test_gelu(RandomMat(15, 24), false) || test_gelu(RandomMat(15, 24), true) || test_gelu(RandomMat(17, 12), false) @@ -61,7 +65,9 @@ static int test_gelu_2() || test_gelu(RandomMat(124), false) || test_gelu(RandomMat(124), true) || test_gelu(RandomMat(127), false) - || test_gelu(RandomMat(127), true); + || test_gelu(RandomMat(127), true) + || test_gelu(RandomMat(120), false) + || test_gelu(RandomMat(120), true); } int main() diff --git a/tests/test_glu.cpp b/tests/test_glu.cpp new file mode 100644 index 000000000000..58555aa53570 --- /dev/null +++ b/tests/test_glu.cpp @@ -0,0 +1,69 @@ +// Copyright (c) 2022 Xiaomi Corp. (author: Fangjun Kuang) +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "layer/glu.h" +#include "testutil.h" + +static int test_glu(const ncnn::Mat& a, int axis) +{ + ncnn::ParamDict pd; + pd.set(0, axis); + + std::vector weights(0); + + int ret = test_layer("GLU", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_glu failed a.dims=%d a=(%d %d %d) axis=%d\n", a.dims, a.w, a.h, a.c, axis); + } + + return ret; +} + +static int test_glu_0() +{ + return 0 + || test_glu(RandomMat(6, 7, 24), 0) + || test_glu(RandomMat(6, 8, 24), 1) + || test_glu(RandomMat(6, 8, 24), 2) + || test_glu(RandomMat(36, 7, 22), 0) + || test_glu(RandomMat(5, 256, 23), -2) + || test_glu(RandomMat(129, 9, 60), 2) + || test_glu(RandomMat(129, 9, 30), -1); +} + +static int test_glu_1() +{ + return 0 + || test_glu(RandomMat(10, 24), 0) + || test_glu(RandomMat(7, 24), 1) + || test_glu(RandomMat(128, 22), 0) + || test_glu(RandomMat(128, 256), 1); +} + +static int test_glu_2() +{ + return 0 + || test_glu(RandomMat(10), 0) + || test_glu(RandomMat(20), 0) + || test_glu(RandomMat(128), 0); +} + +int main() +{ + SRAND(7767517); + + return 0 + || test_glu_0() + || test_glu_1() + || test_glu_2(); +} diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp new file mode 100644 index 000000000000..70c96b304805 --- /dev/null +++ b/tests/test_gridsample.cpp @@ -0,0 +1,131 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "layer/gridsample.h" +#include "testutil.h" + +static int test_gridsample(const ncnn::Mat& a, const ncnn::Mat& grid, int sample_type, int padding_mode, int align_corner) +{ + ncnn::ParamDict pd; + pd.set(0, sample_type); + pd.set(1, padding_mode); + pd.set(2, align_corner); + + std::vector weights(0); + + std::vector as(2); + as[0] = a; + as[1] = grid; + + int ret = test_layer("GridSample", pd, weights, as); + if (ret != 0) + { + fprintf(stderr, "test_gridsample failed a.dims=%d a=(%d %d %d %d) grid.dims=%d grid=(%d %d %d %d) sample_type=%d padding_mode=%d align_corner=%d", + a.dims, a.w, a.h, a.d, a.c, grid.dims, grid.w, grid.h, grid.d, grid.c, + sample_type, padding_mode, align_corner); + } + + return ret; +} + +static int test_gridsample_0() +{ + return 0 + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 2, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 2, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 3, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 3, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 2, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 2, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 3, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 3, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 1, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 1, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 2, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 2, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 3, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 3, 1); +} + +static int test_gridsample_1() +{ + return 0 + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 2, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 2, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 3, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 3, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 2, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 2, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 3, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 3, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 1, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 1, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 2, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 2, 1) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 3, 0) + || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 3, 1); +} + +static int test_gridsample_2() +{ + return 0 + || 
test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 2, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 2, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 3, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 3, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 2, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 2, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 3, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 3, 1); +} + +static int test_gridsample_3() +{ + return 0 + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 2, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 2, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 3, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 3, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 2, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 2, 1) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 3, 0) + || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 3, 1); +} + +int main() +{ + SRAND(7767517); + + return 0 + || test_gridsample_0() + || test_gridsample_1() + || test_gridsample_2() + || test_gridsample_3(); +} diff --git a/tests/test_groupnorm.cpp b/tests/test_groupnorm.cpp index e1e831c066c6..5f3c5c569161 100644 --- a/tests/test_groupnorm.cpp +++ b/tests/test_groupnorm.cpp @@ -38,6 +38,17 @@ static int test_groupnorm(const ncnn::Mat& a, int group, float eps) } static int test_groupnorm_0() +{ + return 0 + || test_groupnorm(RandomMat(3, 6, 4, 2), 1, 0.01f) + || test_groupnorm(RandomMat(2, 3, 3, 8), 2, 0.002f) + || test_groupnorm(RandomMat(3, 4, 5, 6), 3, 0.01f) + || test_groupnorm(RandomMat(4, 5, 6, 12), 4, 0.02f) + || test_groupnorm(RandomMat(5, 6, 7, 24), 2, 0.001f) + || test_groupnorm(RandomMat(2, 8, 9, 24), 3, 0.0001f); +} + +static int test_groupnorm_1() { return 0 || test_groupnorm(RandomMat(6, 4, 2), 1, 0.01f) @@ -48,10 +59,35 @@ static int test_groupnorm_0() || test_groupnorm(RandomMat(8, 9, 24), 3, 0.0001f); } +static int test_groupnorm_2() +{ + return 0 + || test_groupnorm(RandomMat(24, 2), 1, 0.01f) + || test_groupnorm(RandomMat(23, 8), 2, 0.002f) + || test_groupnorm(RandomMat(25, 6), 3, 0.01f) + || test_groupnorm(RandomMat(26, 12), 4, 0.02f) + || test_groupnorm(RandomMat(27, 24), 2, 0.001f) + || test_groupnorm(RandomMat(29, 24), 3, 0.0001f); +} + +static int test_groupnorm_3() +{ + return 0 + || test_groupnorm(RandomMat(12), 1, 0.01f) + || test_groupnorm(RandomMat(18), 2, 0.002f) + || test_groupnorm(RandomMat(36), 3, 0.01f) + || 
test_groupnorm(RandomMat(212), 4, 0.02f) + || test_groupnorm(RandomMat(124), 2, 0.001f) + || test_groupnorm(RandomMat(324), 3, 0.0001f); +} + int main() { SRAND(7767517); return 0 - || test_groupnorm_0(); + || test_groupnorm_0() + || test_groupnorm_1() + || test_groupnorm_2() + || test_groupnorm_3(); } diff --git a/tests/test_lstm.cpp b/tests/test_lstm.cpp index f002a1aeccf0..fb76ad0fbd7f 100644 --- a/tests/test_lstm.cpp +++ b/tests/test_lstm.cpp @@ -15,50 +15,64 @@ #include "layer/lstm.h" #include "testutil.h" -static int test_lstm(const ncnn::Mat& a, int outch, int direction) +static int test_lstm(const ncnn::Mat& a, int outch, int direction, int hidden_size = 0) { int input_size = a.w; int num_directions = direction == 2 ? 2 : 1; + if (hidden_size == 0) + hidden_size = outch; ncnn::ParamDict pd; pd.set(0, outch); - pd.set(1, outch * input_size * 4 * num_directions); + pd.set(1, hidden_size * input_size * 4 * num_directions); pd.set(2, direction); + pd.set(3, hidden_size); - std::vector weights(3); - weights[0] = RandomMat(outch * input_size * 4 * num_directions); - weights[1] = RandomMat(outch * 4 * num_directions); - weights[2] = RandomMat(outch * outch * 4 * num_directions); + std::vector weights(hidden_size == 0 ? 3 : 4); + weights[0] = RandomMat(hidden_size * input_size * 4 * num_directions); + weights[1] = RandomMat(hidden_size * 4 * num_directions); + weights[2] = RandomMat(outch * hidden_size * 4 * num_directions); + if (hidden_size) + { + weights[3] = RandomMat(hidden_size * outch * num_directions); + } int ret = test_layer("LSTM", pd, weights, a); if (ret != 0) { - fprintf(stderr, "test_lstm failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); + fprintf(stderr, "test_lstm failed a.dims=%d a=(%d %d %d) outch=%d direction=%d hidden_size=%d\n", a.dims, a.w, a.h, a.c, outch, direction, hidden_size); } return ret; } -int test_lstm_layer_with_hidden(const ncnn::Mat& a, int outch, int direction) +int test_lstm_layer_with_hidden(const ncnn::Mat& a, int outch, int direction, int hidden_size = 0) { int input_size = a.w; int num_directions = direction == 2 ? 2 : 1; + if (hidden_size == 0) + hidden_size = outch; ncnn::ParamDict pd; pd.set(0, outch); - pd.set(1, outch * input_size * 4 * num_directions); + pd.set(1, hidden_size * input_size * 4 * num_directions); pd.set(2, direction); + pd.set(3, hidden_size); - std::vector weights(3); - weights[0] = RandomMat(outch * input_size * 4 * num_directions); - weights[1] = RandomMat(outch * 4 * num_directions); - weights[2] = RandomMat(outch * outch * 4 * num_directions); + std::vector weights(hidden_size == 0 ? 
3 : 4); + weights[0] = RandomMat(hidden_size * input_size * 4 * num_directions); + weights[1] = RandomMat(hidden_size * 4 * num_directions); + weights[2] = RandomMat(outch * hidden_size * 4 * num_directions); + if (hidden_size) + { + weights[3] = RandomMat(hidden_size * outch * num_directions); + } // initial hidden state ncnn::Mat hidden = RandomMat(outch, num_directions); // initial cell state - ncnn::Mat cell = RandomMat(outch, num_directions); + ncnn::Mat cell = RandomMat(hidden_size, num_directions); std::vector as(3); as[0] = a; @@ -68,32 +82,39 @@ int test_lstm_layer_with_hidden(const ncnn::Mat& a, int outch, int direction) int ret = test_layer("LSTM", pd, weights, as, 3); if (ret != 0) { - fprintf(stderr, "test_lstm_layer_with_hidden failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); + fprintf(stderr, "test_lstm_layer_with_hidden failed a.dims=%d a=(%d %d %d) outch=%d direction=%d hidden_size=%d\n", a.dims, a.w, a.h, a.c, outch, direction, hidden_size); } return ret; } -int test_lstm_layer_with_hidden_input(const ncnn::Mat& a, int outch, int direction) +int test_lstm_layer_with_hidden_input(const ncnn::Mat& a, int outch, int direction, int hidden_size = 0) { int input_size = a.w; int num_directions = direction == 2 ? 2 : 1; + if (hidden_size == 0) + hidden_size = outch; ncnn::ParamDict pd; pd.set(0, outch); - pd.set(1, outch * input_size * 4 * num_directions); + pd.set(1, hidden_size * input_size * 4 * num_directions); pd.set(2, direction); + pd.set(3, hidden_size); - std::vector weights(3); - weights[0] = RandomMat(outch * input_size * 4 * num_directions); - weights[1] = RandomMat(outch * 4 * num_directions); - weights[2] = RandomMat(outch * outch * 4 * num_directions); + std::vector weights(hidden_size == 0 ? 3 : 4); + weights[0] = RandomMat(hidden_size * input_size * 4 * num_directions); + weights[1] = RandomMat(hidden_size * 4 * num_directions); + weights[2] = RandomMat(outch * hidden_size * 4 * num_directions); + if (hidden_size) + { + weights[3] = RandomMat(hidden_size * outch * num_directions); + } // initial hidden state ncnn::Mat hidden = RandomMat(outch, num_directions); // initial cell state - ncnn::Mat cell = RandomMat(outch, num_directions); + ncnn::Mat cell = RandomMat(hidden_size, num_directions); std::vector as(3); as[0] = a; @@ -103,26 +124,33 @@ int test_lstm_layer_with_hidden_input(const ncnn::Mat& a, int outch, int directi int ret = test_layer("LSTM", pd, weights, as, 1); if (ret != 0) { - fprintf(stderr, "test_lstm_layer_with_hidden_input failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); + fprintf(stderr, "test_lstm_layer_with_hidden_input failed a.dims=%d a=(%d %d %d) outch=%d direction=%d hidden_size=%d\n", a.dims, a.w, a.h, a.c, outch, direction, hidden_size); } return ret; } -int test_lstm_layer_with_hidden_output(const ncnn::Mat& a, int outch, int direction) +int test_lstm_layer_with_hidden_output(const ncnn::Mat& a, int outch, int direction, int hidden_size = 0) { int input_size = a.w; int num_directions = direction == 2 ? 
2 : 1; + if (hidden_size == 0) + hidden_size = outch; ncnn::ParamDict pd; pd.set(0, outch); - pd.set(1, outch * input_size * 4 * num_directions); + pd.set(1, hidden_size * input_size * 4 * num_directions); pd.set(2, direction); + pd.set(3, hidden_size); - std::vector weights(3); - weights[0] = RandomMat(outch * input_size * 4 * num_directions); - weights[1] = RandomMat(outch * 4 * num_directions); - weights[2] = RandomMat(outch * outch * 4 * num_directions); + std::vector weights(hidden_size == 0 ? 3 : 4); + weights[0] = RandomMat(hidden_size * input_size * 4 * num_directions); + weights[1] = RandomMat(hidden_size * 4 * num_directions); + weights[2] = RandomMat(outch * hidden_size * 4 * num_directions); + if (hidden_size) + { + weights[3] = RandomMat(hidden_size * outch * num_directions); + } std::vector as(1); as[0] = a; @@ -130,7 +158,7 @@ int test_lstm_layer_with_hidden_output(const ncnn::Mat& a, int outch, int direct int ret = test_layer("LSTM", pd, weights, as, 3); if (ret != 0) { - fprintf(stderr, "test_lstm_layer_with_hidden_output failed a.dims=%d a=(%d %d %d) outch=%d, direction = %d \n", a.dims, a.w, a.h, a.c, outch, direction); + fprintf(stderr, "test_lstm_layer_with_hidden_output failed a.dims=%d a=(%d %d %d) outch=%d direction=%d hidden_size=%d\n", a.dims, a.w, a.h, a.c, outch, direction, hidden_size); } return ret; @@ -147,7 +175,7 @@ static int test_lstm_0() || test_lstm(RandomMat(5, 16), 16, 2) || test_lstm(RandomMat(3, 16), 8, 2) || test_lstm(RandomMat(8, 16), 16, 2) - || test_lstm(RandomMat(2, 5), 17, 2); + || test_lstm(RandomMat(2, 5), 17, 2, 15); } static int test_lstm_1() @@ -160,7 +188,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden(RandomMat(19, 15), 8, 2) || test_lstm_layer_with_hidden(RandomMat(5, 16), 16, 2) || test_lstm_layer_with_hidden(RandomMat(3, 16), 8, 2) - || test_lstm_layer_with_hidden(RandomMat(2, 5), 99, 2) + || test_lstm_layer_with_hidden(RandomMat(2, 5), 99, 2, 33) || test_lstm_layer_with_hidden(RandomMat(4, 4), 1, 1) || test_lstm_layer_with_hidden(RandomMat(8, 2), 2, 1) || test_lstm_layer_with_hidden(RandomMat(16, 8), 7, 1) @@ -168,7 +196,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden(RandomMat(19, 15), 8, 1) || test_lstm_layer_with_hidden(RandomMat(5, 16), 16, 1) || test_lstm_layer_with_hidden(RandomMat(3, 16), 8, 1) - || test_lstm_layer_with_hidden(RandomMat(2, 5), 99, 1) + || test_lstm_layer_with_hidden(RandomMat(2, 5), 99, 1, 33) || test_lstm_layer_with_hidden(RandomMat(4, 2), 1, 0) || test_lstm_layer_with_hidden(RandomMat(8, 2), 2, 0) || test_lstm_layer_with_hidden(RandomMat(16, 8), 7, 0) @@ -176,7 +204,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden(RandomMat(19, 15), 8, 0) || test_lstm_layer_with_hidden(RandomMat(5, 16), 16, 0) || test_lstm_layer_with_hidden(RandomMat(3, 16), 8, 0) - || test_lstm_layer_with_hidden(RandomMat(2, 5), 17, 0) + || test_lstm_layer_with_hidden(RandomMat(2, 5), 17, 0, 15) || test_lstm_layer_with_hidden_input(RandomMat(4, 4), 1, 2) || test_lstm_layer_with_hidden_input(RandomMat(8, 2), 2, 2) @@ -185,7 +213,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden_input(RandomMat(19, 15), 8, 2) || test_lstm_layer_with_hidden_input(RandomMat(5, 16), 16, 2) || test_lstm_layer_with_hidden_input(RandomMat(3, 16), 8, 2) - || test_lstm_layer_with_hidden_input(RandomMat(2, 5), 99, 2) + || test_lstm_layer_with_hidden_input(RandomMat(2, 5), 99, 2, 33) || test_lstm_layer_with_hidden_input(RandomMat(4, 4), 1, 1) || test_lstm_layer_with_hidden_input(RandomMat(8, 2), 2, 1) || 
test_lstm_layer_with_hidden_input(RandomMat(16, 8), 7, 1) @@ -193,7 +221,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden_input(RandomMat(19, 15), 8, 1) || test_lstm_layer_with_hidden_input(RandomMat(5, 16), 16, 1) || test_lstm_layer_with_hidden_input(RandomMat(3, 16), 8, 1) - || test_lstm_layer_with_hidden_input(RandomMat(2, 5), 99, 1) + || test_lstm_layer_with_hidden_input(RandomMat(2, 5), 99, 1, 33) || test_lstm_layer_with_hidden_input(RandomMat(4, 2), 1, 0) || test_lstm_layer_with_hidden_input(RandomMat(8, 2), 2, 0) || test_lstm_layer_with_hidden_input(RandomMat(16, 8), 7, 0) @@ -201,7 +229,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden_input(RandomMat(19, 15), 8, 0) || test_lstm_layer_with_hidden_input(RandomMat(5, 16), 16, 0) || test_lstm_layer_with_hidden_input(RandomMat(3, 16), 8, 0) - || test_lstm_layer_with_hidden_input(RandomMat(2, 5), 17, 0) + || test_lstm_layer_with_hidden_input(RandomMat(2, 5), 17, 0, 15) || test_lstm_layer_with_hidden_output(RandomMat(4, 4), 1, 2) || test_lstm_layer_with_hidden_output(RandomMat(8, 2), 2, 2) @@ -210,7 +238,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden_output(RandomMat(19, 15), 8, 2) || test_lstm_layer_with_hidden_output(RandomMat(5, 16), 16, 2) || test_lstm_layer_with_hidden_output(RandomMat(3, 16), 8, 2) - || test_lstm_layer_with_hidden_output(RandomMat(2, 5), 99, 2) + || test_lstm_layer_with_hidden_output(RandomMat(2, 5), 99, 2, 33) || test_lstm_layer_with_hidden_output(RandomMat(4, 4), 1, 1) || test_lstm_layer_with_hidden_output(RandomMat(8, 2), 2, 1) || test_lstm_layer_with_hidden_output(RandomMat(16, 8), 7, 1) @@ -218,7 +246,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden_output(RandomMat(19, 15), 8, 1) || test_lstm_layer_with_hidden_output(RandomMat(5, 16), 16, 1) || test_lstm_layer_with_hidden_output(RandomMat(3, 16), 8, 1) - || test_lstm_layer_with_hidden_output(RandomMat(2, 5), 99, 1) + || test_lstm_layer_with_hidden_output(RandomMat(2, 5), 99, 1, 33) || test_lstm_layer_with_hidden_output(RandomMat(4, 2), 1, 0) || test_lstm_layer_with_hidden_output(RandomMat(8, 2), 2, 0) || test_lstm_layer_with_hidden_output(RandomMat(16, 8), 7, 0) @@ -226,7 +254,7 @@ static int test_lstm_1() || test_lstm_layer_with_hidden_output(RandomMat(19, 15), 8, 0) || test_lstm_layer_with_hidden_output(RandomMat(5, 16), 16, 0) || test_lstm_layer_with_hidden_output(RandomMat(3, 16), 8, 0) - || test_lstm_layer_with_hidden_output(RandomMat(2, 5), 17, 0); + || test_lstm_layer_with_hidden_output(RandomMat(2, 5), 17, 0, 15); } static int test_lstm_2() @@ -240,7 +268,7 @@ static int test_lstm_2() || test_lstm(RandomMat(5, 16), 16, 0) || test_lstm(RandomMat(3, 16), 8, 0) || test_lstm(RandomMat(8, 16), 16, 0) - || test_lstm(RandomMat(2, 5), 17, 0); + || test_lstm(RandomMat(2, 5), 17, 0, 15); } static int test_lstm_3() { @@ -253,7 +281,7 @@ static int test_lstm_3() || test_lstm(RandomMat(5, 16), 16, 1) || test_lstm(RandomMat(3, 16), 8, 1) || test_lstm(RandomMat(8, 16), 16, 1) - || test_lstm(RandomMat(2, 5), 17, 1); + || test_lstm(RandomMat(2, 5), 17, 1, 15); } int main() diff --git a/tests/test_multiheadattention.cpp b/tests/test_multiheadattention.cpp index f4e0b1b44f58..e7440fd55bda 100644 --- a/tests/test_multiheadattention.cpp +++ b/tests/test_multiheadattention.cpp @@ -15,34 +15,70 @@ #include "layer/multiheadattention.h" #include "testutil.h" -static int test_multiheadattention(const ncnn::Mat& a, int num_heads) +static int test_multiheadattention(const ncnn::Mat& q, const ncnn::Mat& k, const ncnn::Mat& v, int 
num_heads, int kdim, int vdim) { - int embed_dim = a.w; + int embed_dim = q.w; ncnn::ParamDict pd; pd.set(0, embed_dim); pd.set(1, num_heads); pd.set(2, embed_dim * embed_dim); + pd.set(3, kdim); + pd.set(4, vdim); std::vector weights(8); weights[0] = RandomMat(embed_dim * embed_dim); weights[1] = RandomMat(embed_dim); - weights[2] = RandomMat(embed_dim * embed_dim); + weights[2] = RandomMat(embed_dim * kdim); weights[3] = RandomMat(embed_dim); - weights[4] = RandomMat(embed_dim * embed_dim); + weights[4] = RandomMat(embed_dim * vdim); weights[5] = RandomMat(embed_dim); weights[6] = RandomMat(embed_dim * embed_dim); weights[7] = RandomMat(embed_dim); std::vector as(3); - as[0] = a; - as[1] = a; - as[2] = a; + as[0] = q; + as[1] = k; + as[2] = v; int ret = test_layer("MultiHeadAttention", pd, weights, as); if (ret != 0) { - fprintf(stderr, "test_multiheadattention failed a=(%d %d)\n", a.w, a.h); + fprintf(stderr, "test_multiheadattention failed q=(%d %d) k=(%d %d) v=(%d %d)\n", q.w, q.h, k.w, k.h, v.w, v.h); + } + + return ret; +} + +static int test_multiheadattention_samekv(const ncnn::Mat& q, const ncnn::Mat& kv, int num_heads, int kvdim) +{ + int embed_dim = q.w; + + ncnn::ParamDict pd; + pd.set(0, embed_dim); + pd.set(1, num_heads); + pd.set(2, embed_dim * embed_dim); + pd.set(3, kvdim); + pd.set(4, kvdim); + + std::vector weights(8); + weights[0] = RandomMat(embed_dim * embed_dim); + weights[1] = RandomMat(embed_dim); + weights[2] = RandomMat(embed_dim * kvdim); + weights[3] = RandomMat(embed_dim); + weights[4] = RandomMat(embed_dim * kvdim); + weights[5] = RandomMat(embed_dim); + weights[6] = RandomMat(embed_dim * embed_dim); + weights[7] = RandomMat(embed_dim); + + std::vector as(2); + as[0] = q; + as[1] = kv; + + int ret = test_layer("MultiHeadAttention", pd, weights, as); + if (ret != 0) + { + fprintf(stderr, "test_multiheadattention failed q=(%d %d) kv=(%d %d)\n", q.w, q.h, kv.w, kv.h); } return ret; @@ -82,11 +118,26 @@ static int test_multiheadattention_sameqkv(const ncnn::Mat& a, int num_heads) static int test_multiheadattention_0() { return 0 - || test_multiheadattention(RandomMat(64, 128), 4) - || test_multiheadattention(RandomMat(64, 127), 16); + || test_multiheadattention(RandomMat(64, 128), RandomMat(64, 128), RandomMat(64, 128), 4, 64, 64) + || test_multiheadattention(RandomMat(64, 127), RandomMat(64, 127), RandomMat(64, 127), 16, 64, 64) + || test_multiheadattention(RandomMat(16, 128), RandomMat(44, 128), RandomMat(55, 128), 2, 44, 55) + || test_multiheadattention(RandomMat(16, 128), RandomMat(44, 127), RandomMat(55, 127), 4, 44, 55) + || test_multiheadattention(RandomMat(12, 17), RandomMat(28, 127), RandomMat(32, 127), 3, 28, 32) + || test_multiheadattention(RandomMat(12, 17), RandomMat(28, 32), RandomMat(11, 32), 3, 28, 11); } static int test_multiheadattention_1() +{ + return 0 + || test_multiheadattention_samekv(RandomMat(64, 128), RandomMat(64, 128), 4, 64) + || test_multiheadattention_samekv(RandomMat(64, 127), RandomMat(64, 127), 16, 64) + || test_multiheadattention_samekv(RandomMat(16, 128), RandomMat(44, 128), 2, 44) + || test_multiheadattention_samekv(RandomMat(16, 128), RandomMat(22, 127), 4, 22) + || test_multiheadattention_samekv(RandomMat(12, 17), RandomMat(28, 127), 3, 28) + || test_multiheadattention_samekv(RandomMat(12, 17), RandomMat(11, 32), 3, 11); +} + +static int test_multiheadattention_2() { return 0 || test_multiheadattention_sameqkv(RandomMat(64, 128), 8) @@ -99,5 +150,6 @@ int main() return 0 || test_multiheadattention_0() - || 
test_multiheadattention_1(); + || test_multiheadattention_1() + || test_multiheadattention_2(); } diff --git a/tests/test_prelu.cpp b/tests/test_prelu.cpp index 7305dc899d52..4184a288ada5 100644 --- a/tests/test_prelu.cpp +++ b/tests/test_prelu.cpp @@ -37,6 +37,8 @@ static int test_prelu_0() return 0 || test_prelu(RandomMat(5, 7, 24), 24) || test_prelu(RandomMat(5, 7, 24), 1) + || test_prelu(RandomMat(5, 7, 32), 32) + || test_prelu(RandomMat(5, 7, 32), 1) || test_prelu(RandomMat(7, 9, 12), 12) || test_prelu(RandomMat(7, 9, 12), 1) || test_prelu(RandomMat(3, 5, 13), 13) @@ -48,6 +50,8 @@ static int test_prelu_1() return 0 || test_prelu(RandomMat(15, 24), 24) || test_prelu(RandomMat(15, 24), 1) + || test_prelu(RandomMat(15, 32), 32) + || test_prelu(RandomMat(15, 32), 1) || test_prelu(RandomMat(17, 12), 12) || test_prelu(RandomMat(17, 12), 1) || test_prelu(RandomMat(19, 15), 15) @@ -61,6 +65,8 @@ static int test_prelu_2() || test_prelu(RandomMat(128), 1) || test_prelu(RandomMat(124), 124) || test_prelu(RandomMat(124), 1) + || test_prelu(RandomMat(120), 120) + || test_prelu(RandomMat(120), 1) || test_prelu(RandomMat(127), 127) || test_prelu(RandomMat(127), 1); } diff --git a/tests/test_squeeze.cpp b/tests/test_squeeze.cpp index 6834349e0fb5..403f95bdf9b1 100644 --- a/tests/test_squeeze.cpp +++ b/tests/test_squeeze.cpp @@ -15,11 +15,12 @@ #include "layer/squeeze.h" #include "testutil.h" -static int test_squeeze(const ncnn::Mat& a, int squeeze_w, int squeeze_h, int squeeze_c) +static int test_squeeze(const ncnn::Mat& a, int squeeze_w, int squeeze_h, int squeeze_d, int squeeze_c) { ncnn::ParamDict pd; pd.set(0, squeeze_w); pd.set(1, squeeze_h); + pd.set(11, squeeze_d); pd.set(2, squeeze_c); std::vector weights(0); @@ -27,7 +28,7 @@ static int test_squeeze(const ncnn::Mat& a, int squeeze_w, int squeeze_h, int sq int ret = test_layer("Squeeze", pd, weights, a); if (ret != 0) { - fprintf(stderr, "test_squeeze failed a.dims=%d a=(%d %d %d) squeeze_w=%d squeeze_h=%d squeeze_c=%d\n", a.dims, a.w, a.h, a.c, squeeze_w, squeeze_h, squeeze_c); + fprintf(stderr, "test_squeeze failed a.dims=%d a=(%d %d %d %d) squeeze_w=%d squeeze_h=%d squeeze_d=%d squeeze_c=%d\n", a.dims, a.w, a.h, a.d, a.c, squeeze_w, squeeze_h, squeeze_d, squeeze_c); } return ret; @@ -60,6 +61,17 @@ static ncnn::Mat IntArrayMat(int a0, int a1, int a2) return m; } +static ncnn::Mat IntArrayMat(int a0, int a1, int a2, int a3) +{ + ncnn::Mat m(4); + int* p = m; + p[0] = a0; + p[1] = a1; + p[2] = a2; + p[3] = a3; + return m; +} + static void print_int_array(const ncnn::Mat& a) { const int* pa = a; @@ -82,7 +94,7 @@ static int test_squeeze_axes(const ncnn::Mat& a, const ncnn::Mat& axes) int ret = test_layer("Squeeze", pd, weights, a); if (ret != 0) { - fprintf(stderr, "test_squeeze_axes failed a.dims=%d a=(%d %d %d)\n", a.dims, a.w, a.h, a.c); + fprintf(stderr, "test_squeeze_axes failed a.dims=%d a=(%d %d %d %d)\n", a.dims, a.w, a.h, a.d, a.c); fprintf(stderr, " axes="); print_int_array(axes); fprintf(stderr, "\n"); @@ -91,53 +103,93 @@ static int test_squeeze_axes(const ncnn::Mat& a, const ncnn::Mat& axes) return ret; } +static int test_squeeze_all_params(const ncnn::Mat& a) +{ + return 0 + || test_squeeze(a, 0, 0, 0, 0) + || test_squeeze(a, 0, 0, 0, 1) + || test_squeeze(a, 0, 0, 1, 0) + || test_squeeze(a, 0, 0, 1, 1) + || test_squeeze(a, 0, 1, 0, 0) + || test_squeeze(a, 0, 1, 0, 1) + || test_squeeze(a, 0, 1, 1, 0) + || test_squeeze(a, 0, 1, 1, 1) + || test_squeeze(a, 1, 0, 0, 0) + || test_squeeze(a, 1, 0, 0, 1) + || test_squeeze(a, 1, 0, 1, 
0) + || test_squeeze(a, 1, 0, 1, 1) + || test_squeeze(a, 1, 1, 0, 0) + || test_squeeze(a, 1, 1, 0, 1) + || test_squeeze(a, 1, 1, 1, 0) + || test_squeeze(a, 1, 1, 1, 1) + + || test_squeeze_axes(a, IntArrayMat(0)) + || test_squeeze_axes(a, IntArrayMat(1)) + || test_squeeze_axes(a, IntArrayMat(2)) + || test_squeeze_axes(a, IntArrayMat(3)) + || test_squeeze_axes(a, IntArrayMat(0, 1)) + || test_squeeze_axes(a, IntArrayMat(0, 2)) + || test_squeeze_axes(a, IntArrayMat(0, 3)) + || test_squeeze_axes(a, IntArrayMat(1, 2)) + || test_squeeze_axes(a, IntArrayMat(1, 3)) + || test_squeeze_axes(a, IntArrayMat(2, 3)) + || test_squeeze_axes(a, IntArrayMat(0, 1, 2)) + || test_squeeze_axes(a, IntArrayMat(0, 1, 3)) + || test_squeeze_axes(a, IntArrayMat(0, 2, 3)) + || test_squeeze_axes(a, IntArrayMat(1, 2, 3)) + || test_squeeze_axes(a, IntArrayMat(0, 1, 2, 3)); +} + static int test_squeeze_0() { - ncnn::Mat as[12]; - as[0] = RandomMat(3, 12, 16); - as[1] = RandomMat(3, 1, 16); - as[2] = RandomMat(1, 33, 15); - as[3] = RandomMat(1, 14, 1); - as[4] = RandomMat(12, 13, 1); - as[5] = RandomMat(1, 1, 1); - as[6] = RandomMat(14, 16); - as[7] = RandomMat(1, 14); - as[8] = RandomMat(11, 1); - as[9] = RandomMat(1, 1); - as[10] = RandomMat(120); - as[11] = RandomMat(1); - - for (int i = 0; i < 12; i++) - { - const ncnn::Mat& a = as[i]; - int ret = 0 - || test_squeeze(a, 0, 0, 0) - || test_squeeze(a, 0, 0, 1) - || test_squeeze(a, 0, 1, 0) - || test_squeeze(a, 0, 1, 1) - || test_squeeze(a, 1, 0, 0) - || test_squeeze(a, 1, 0, 1) - || test_squeeze(a, 1, 1, 0) - || test_squeeze(a, 1, 1, 1) - - || test_squeeze_axes(a, IntArrayMat(0)) - || test_squeeze_axes(a, IntArrayMat(1)) - || test_squeeze_axes(a, IntArrayMat(2)) - || test_squeeze_axes(a, IntArrayMat(0, 1)) - || test_squeeze_axes(a, IntArrayMat(0, 2)) - || test_squeeze_axes(a, IntArrayMat(1, 2)) - || test_squeeze_axes(a, IntArrayMat(0, 1, 2)); - - if (ret != 0) - return ret; - } + return 0 + || test_squeeze_all_params(RandomMat(4, 5, 7, 16)) + || test_squeeze_all_params(RandomMat(4, 5, 1, 15)) + || test_squeeze_all_params(RandomMat(4, 1, 7, 12)) + || test_squeeze_all_params(RandomMat(1, 5, 7, 16)) + || test_squeeze_all_params(RandomMat(1, 5, 1, 15)) + || test_squeeze_all_params(RandomMat(1, 1, 7, 12)) + || test_squeeze_all_params(RandomMat(6, 1, 1, 16)) + || test_squeeze_all_params(RandomMat(1, 1, 1, 15)) + || test_squeeze_all_params(RandomMat(4, 5, 7, 1)) + || test_squeeze_all_params(RandomMat(4, 5, 1, 1)) + || test_squeeze_all_params(RandomMat(4, 1, 7, 1)) + || test_squeeze_all_params(RandomMat(1, 5, 7, 1)) + || test_squeeze_all_params(RandomMat(1, 5, 1, 1)) + || test_squeeze_all_params(RandomMat(1, 1, 7, 1)) + || test_squeeze_all_params(RandomMat(1, 1, 1, 1)); +} + +static int test_squeeze_1() +{ + return 0 + || test_squeeze_all_params(RandomMat(3, 12, 16)) + || test_squeeze_all_params(RandomMat(3, 1, 16)) + || test_squeeze_all_params(RandomMat(1, 33, 15)) + || test_squeeze_all_params(RandomMat(1, 14, 1)) + || test_squeeze_all_params(RandomMat(12, 13, 1)) + || test_squeeze_all_params(RandomMat(1, 1, 1)); +} - return 0; +static int test_squeeze_2() +{ + return 0 + || test_squeeze_all_params(RandomMat(14, 16)) + || test_squeeze_all_params(RandomMat(1, 14)) + || test_squeeze_all_params(RandomMat(11, 1)) + || test_squeeze_all_params(RandomMat(1, 1)); +} + +static int test_squeeze_3() +{ + return 0 + || test_squeeze_all_params(RandomMat(120)) + || test_squeeze_all_params(RandomMat(1)); } int main() { SRAND(7767517); - return test_squeeze_0(); + return test_squeeze_0() || 
test_squeeze_1() || test_squeeze_2() || test_squeeze_3(); } diff --git a/tests/test_squeezenet.cpp b/tests/test_squeezenet.cpp index 81789d26a728..07788c8edd10 100644 --- a/tests/test_squeezenet.cpp +++ b/tests/test_squeezenet.cpp @@ -177,6 +177,16 @@ static int test_squeezenet(const ncnn::Option& opt, int load_model_type, float e { // load from plain model file squeezenet.load_param(MODEL_DIR "/squeezenet_v1.1.param"); + + // test random feature disabled bits + { + std::vector& layers = squeezenet.mutable_layers(); + for (size_t i = 0; i < layers.size(); i++) + { + layers[i]->featmask = i * 11 % 128; + } + } + squeezenet.load_model(MODEL_DIR "/squeezenet_v1.1.bin"); } if (load_model_type == 1) diff --git a/tests/test_unfold.cpp b/tests/test_unfold.cpp new file mode 100644 index 000000000000..4eea1d020eab --- /dev/null +++ b/tests/test_unfold.cpp @@ -0,0 +1,65 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "layer/unfold.h" +#include "testutil.h" + +static int test_unfold(int w, int h, int c, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int pad_w, int pad_h, float pad_value) +{ + ncnn::Mat a = RandomMat(w, h, c); + + ncnn::ParamDict pd; + pd.set(1, kernel_w); + pd.set(11, kernel_h); + pd.set(2, dilation_w); + pd.set(12, dilation_h); + pd.set(3, stride_w); + pd.set(13, stride_h); + pd.set(4, pad_w); + pd.set(14, pad_h); + pd.set(18, pad_value); + + std::vector weights(0); + + int ret = test_layer("Unfold", pd, weights, a); + if (ret != 0) + { + fprintf(stderr, "test_unfold failed w=%d h=%d c=%d kernel=%d,%d dilation=%d,%d stride=%d,%d pad=%d,%d pad_value=%f\n", w, h, c, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, pad_w, pad_h, pad_value); + } + + return ret; +} + +static int test_unfold_0() +{ + return 0 + || test_unfold(32, 32, 11, 3, 3, 1, 1, 1, 1, 0, 0, 0.f) + || test_unfold(32, 32, 12, 4, 2, 1, 1, 1, 2, 2, 2, -0.5f) + || test_unfold(32, 32, 16, 3, 2, 2, 1, 1, 1, 4, 2, 2.f); +} + +static int test_unfold_1() +{ + return 0 + || test_unfold(32, 32, 11, 3, 3, 1, 1, 1, 1, -233, -233, -0.5f) + || test_unfold(32, 32, 12, 4, 2, 1, 1, 1, 2, -234, -234, 0.f) + || test_unfold(32, 32, 16, 3, 2, 2, 1, 1, 1, -233, -233, 1.f); +} + +int main() +{ + SRAND(7767517); + + return test_unfold_0() || test_unfold_1(); +} diff --git a/toolchains/loongarch64-linux-gnu.toolchain.cmake b/toolchains/loongarch64-linux-gnu.toolchain.cmake index 4390155f2b42..7cdfd9dbef8d 100644 --- a/toolchains/loongarch64-linux-gnu.toolchain.cmake +++ b/toolchains/loongarch64-linux-gnu.toolchain.cmake @@ -1,8 +1,18 @@ set(CMAKE_SYSTEM_NAME Linux) set(CMAKE_SYSTEM_PROCESSOR loongarch64) -set(CMAKE_C_COMPILER "loongarch64-linux-gnu-gcc") -set(CMAKE_CXX_COMPILER "loongarch64-linux-gnu-g++") +if(DEFINED ENV{LOONGARCH64_ROOT_PATH}) + file(TO_CMAKE_PATH $ENV{LOONGARCH64_ROOT_PATH} 
LOONGARCH64_ROOT_PATH) +else() + message(FATAL_ERROR "LOONGARCH64_ROOT_PATH env must be defined") +endif() + +set(LOONGARCH64_ROOT_PATH ${LOONGARCH64_ROOT_PATH} CACHE STRING "root path to loongarch64 toolchain") + +set(CMAKE_C_COMPILER "${LOONGARCH64_ROOT_PATH}/bin/loongarch64-linux-gnu-gcc") +set(CMAKE_CXX_COMPILER "${LOONGARCH64_ROOT_PATH}/bin/loongarch64-linux-gnu-g++") + +set(CMAKE_FIND_ROOT_PATH "${LOONGARCH64_ROOT_PATH}/loongarch64-linux-gnu") set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) diff --git a/toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake b/toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake new file mode 100644 index 000000000000..953f21aaf959 --- /dev/null +++ b/toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake @@ -0,0 +1,29 @@ +set(CMAKE_SYSTEM_NAME Generic) +set(CMAKE_SYSTEM_PROCESSOR riscv64) + +if(DEFINED ENV{RISCV_ROOT_PATH}) + file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH) +else() + message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined") +endif() + +set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv gnu toolchain") + +set(CMAKE_C_COMPILER "clang") +set(CMAKE_CXX_COMPILER "clang++") +set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot") + +set(CMAKE_C_COMPILER_TARGET "riscv64-unknown-linux-gnu") +set(CMAKE_CXX_COMPILER_TARGET "riscv64-unknown-linux-gnu") + +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# add --ld-path=${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-ld or append $RISCV_ROOT_PATH/bin to PATH. +set(CMAKE_C_FLAGS "--gcc-toolchain=${RISCV_ROOT_PATH} -march=rv64gc") +set(CMAKE_CXX_FLAGS "--gcc-toolchain=${RISCV_ROOT_PATH} -march=rv64gc") + +# cache flags +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags") diff --git a/toolchains/th1520-v240.toolchain.cmake b/toolchains/th1520-v240.toolchain.cmake new file mode 100644 index 000000000000..fb9787a82632 --- /dev/null +++ b/toolchains/th1520-v240.toolchain.cmake @@ -0,0 +1,31 @@ +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR riscv64) +set(C906 True) + +if(DEFINED ENV{RISCV_ROOT_PATH}) + file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH) +else() + message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined") +endif() + +set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain") + +set(CMAKE_C_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc") +set(CMAKE_CXX_COMPILER "${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++") + +set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu") + +set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot") + +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) + +set(CMAKE_C_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -static") +set(CMAKE_CXX_FLAGS "-march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -static") + +# cache flags +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags") + diff --git a/tools/modelwriter.h b/tools/modelwriter.h index e9ff979176a7..0a53c099da48 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ -1569,7 +1569,7 @@ int ModelWriter::save(const char* parampath, const char* binpath) 
fprintf_param_value(" 1=%d", expand_h) fprintf_param_value(" 2=%d", expand_c) { - if (!op->axes.empty()) fprintf_param_int_array(0, op->axes, pp); + if (!op->axes.empty()) fprintf_param_int_array(3, op->axes, pp); } } else if (layer->type == "GELU") diff --git a/tools/onnx/onnx2ncnn.cpp b/tools/onnx/onnx2ncnn.cpp index 069e6660c1d8..161cf94b2721 100644 --- a/tools/onnx/onnx2ncnn.cpp +++ b/tools/onnx/onnx2ncnn.cpp @@ -2930,6 +2930,30 @@ static void fuse_binaryop_with_scalar(onnx::GraphProto* mutable_graph, std::map< } } +// truncate layer/blob names when they exceed 255, which is the upper length limit when parsing param in src/net.cpp +static std::string trunc_name(std::string name) +{ + static int trunc_idx = 0; + static std::map name_trunc_map; + + const int max_len = 255; + if (name.size() <= max_len) + { + return name; + } + if (name_trunc_map.count(name)) + { + return name_trunc_map[name]; + } + + std::string concat_name = name + "_t" + std::to_string(trunc_idx); + std::string trunc_name = concat_name.substr(concat_name.size() - max_len); + trunc_idx += 1; + name_trunc_map[name] = trunc_name; + + return trunc_name; +} + int main(int argc, char** argv) { if (!(argc == 2 || argc == 4)) @@ -3433,7 +3457,7 @@ int main(int argc, char** argv) if (weights.find(input_name) != weights.end()) continue; - fprintf(pp, "%-16s %-24s 0 1 %s\n", "Input", input_name.c_str(), input_name.c_str()); + fprintf(pp, "%-16s %-24s 0 1 %s\n", "Input", trunc_name(input_name).c_str(), trunc_name(input_name).c_str()); int refcount = node_reference[input_name]; if (refcount <= 1) @@ -3444,11 +3468,12 @@ int main(int argc, char** argv) char splitname[256]; sprintf(splitname, "splitncnn_input%d", j); fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount); - fprintf(pp, " %s", input_name.c_str()); + fprintf(pp, " %s", trunc_name(input_name).c_str()); for (int k = 0; k < refcount; k++) { - fprintf(pp, " %s_splitncnn_%d", input_name.c_str(), k); + std::string split_name = input_name + "_splitncnn_" + std::to_string(k); + fprintf(pp, " %s", trunc_name(split_name).c_str()); } fprintf(pp, "\n"); } @@ -3464,7 +3489,7 @@ int main(int argc, char** argv) continue; } - fprintf(pp, "%-16s %-24s 0 1 %s", "MemoryData", input_name.c_str(), input_name.c_str()); + fprintf(pp, "%-16s %-24s 0 1 %s", "MemoryData", trunc_name(input_name).c_str(), trunc_name(input_name).c_str()); const onnx::TensorProto& M = weights[input_name]; @@ -3513,11 +3538,12 @@ int main(int argc, char** argv) sprintf(splitname, "splitncnn_%d", internal_split); fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount); - fprintf(pp, " %s", input_name.c_str()); + fprintf(pp, " %s", trunc_name(input_name).c_str()); for (int k = 0; k < refcount; k++) { - fprintf(pp, " %s_splitncnn_%d", input_name.c_str(), k); + std::string split_name = input_name + "_splitncnn_" + std::to_string(k); + fprintf(pp, " %s", trunc_name(split_name).c_str()); } fprintf(pp, "\n"); @@ -3939,7 +3965,7 @@ int main(int argc, char** argv) fprintf(pp, "%-16s", op.c_str()); } - fprintf(pp, " %-24s %d %d", name.c_str(), input_size, output_size); + fprintf(pp, " %-24s %d %d", trunc_name(name).c_str(), input_size, output_size); for (int j = 0; j < (int)node.input_size(); j++) { @@ -3966,14 +3992,14 @@ int main(int argc, char** argv) input_name = input_name + splitsuffix; } - fprintf(pp, " %s", input_name.c_str()); + fprintf(pp, " %s", trunc_name(input_name).c_str()); } for (int j = 0; j < output_size; j++) { const std::string& output_name = node.output(j); - fprintf(pp, " %s", 
output_name.c_str()); + fprintf(pp, " %s", trunc_name(output_name).c_str()); } if (op == "Abs") @@ -6064,11 +6090,12 @@ int main(int argc, char** argv) sprintf(splitname, "splitncnn_%d", internal_split); fprintf(pp, "%-16s %-24s %d %d", "Split", splitname, 1, refcount); - fprintf(pp, " %s", output_name.c_str()); + fprintf(pp, " %s", trunc_name(output_name).c_str()); for (int k = 0; k < refcount; k++) { - fprintf(pp, " %s_splitncnn_%d", output_name.c_str(), k); + std::string split_name = output_name + "_splitncnn_" + std::to_string(k); + fprintf(pp, " %s", trunc_name(split_name).c_str()); } fprintf(pp, "\n"); diff --git a/tools/pnnx/README.md b/tools/pnnx/README.md index f9997e193627..882280d1f438 100644 --- a/tools/pnnx/README.md +++ b/tools/pnnx/README.md @@ -62,7 +62,7 @@ mod.save("resnet18.pt") pnnx resnet18.pt inputshape=[1,3,224,224] ``` -Normally, you will get six files +Normally, you will get seven files ```resnet18.pnnx.param``` PNNX graph definition @@ -70,6 +70,8 @@ Normally, you will get six files ```resnet18_pnnx.py``` PyTorch script for inference, the python code for model construction and weight initialization +```resnet18.pnnx.onnx``` PNNX model in onnx format + ```resnet18.ncnn.param``` ncnn graph definition ```resnet18.ncnn.bin``` ncnn model weight @@ -87,9 +89,11 @@ Usage: pnnx [model.pt] [(key=value)...] pnnxparam=model.pnnx.param pnnxbin=model.pnnx.bin pnnxpy=model_pnnx.py + pnnxonnx=model.pnnx.onnx ncnnparam=model.ncnn.param ncnnbin=model.ncnn.bin ncnnpy=model_ncnn.py + fp16=1 optlevel=2 device=cpu/gpu inputshape=[1,3,224,224],... @@ -108,12 +112,16 @@ Parameters: `pnnxpy` (default="*_pnnx.py"): PyTorch script for inference, including model construction and weight initialization code +`pnnxonnx` (default="*.pnnx.onnx"): PNNX model in onnx format + `ncnnparam` (default="*.ncnn.param"): ncnn graph definition `ncnnbin` (default="*.ncnn.bin"): ncnn model weight `ncnnpy` (default="*_ncnn.py"): pyncnn script for inference +`fp16` (default=1): save ncnn weight and onnx in fp16 data type + `optlevel` (default=2): graph optimization level | Option | Optimization level | @@ -484,10 +492,11 @@ TORCH_LIBRARY(upfirdn2d_op, m) { |nn.Embedding | :heavy_check_mark: | :heavy_check_mark: | |nn.EmbeddingBag | | |nn.Flatten | :heavy_check_mark: | -|nn.Fold | | +|nn.Fold | :heavy_check_mark: | :heavy_check_mark: | |nn.FractionalMaxPool2d | | |nn.FractionalMaxPool3d | | |nn.GELU | :heavy_check_mark: | :heavy_check_mark: | +|nn.GLU | :heavy_check_mark: | :heavy_check_mark: | |nn.GroupNorm | :heavy_check_mark: | :heavy_check_mark: | |nn.GRU | :heavy_check_mark: | :heavy_check_mark: | |nn.GRUCell | | @@ -546,7 +555,7 @@ TORCH_LIBRARY(upfirdn2d_op, m) { |nn.Sigmoid | :heavy_check_mark: | :heavy_check_mark: | |nn.SiLU | :heavy_check_mark: | :heavy_check_mark: | |nn.Softmax | :heavy_check_mark: | :heavy_check_mark: | -|nn.Softmax2d | | +|nn.Softmax2d | :heavy_check_mark: | :heavy_check_mark: | |nn.Softmin | :heavy_check_mark: | |nn.Softplus | :heavy_check_mark: | |nn.Softshrink | :heavy_check_mark: | @@ -561,7 +570,7 @@ TORCH_LIBRARY(upfirdn2d_op, m) { |nn.TransformerEncoder | | |nn.TransformerEncoderLayer | | |nn.Unflatten | | -|nn.Unfold | | +|nn.Unfold | :heavy_check_mark: | :heavy_check_mark: | |nn.Upsample | :heavy_check_mark: | :heavy_check_mark: | |nn.UpsamplingBilinear2d | :heavy_check_mark: | :heavy_check_mark: | |nn.UpsamplingNearest2d | :heavy_check_mark: | :heavy_check_mark: | @@ -599,12 +608,12 @@ TORCH_LIBRARY(upfirdn2d_op, m) { |F.embedding | :heavy_check_mark: | :heavy_check_mark: 
| |F.embedding_bag | | |F.feature_alpha_dropout | :heavy_check_mark: | :heavy_check_mark: | -|F.fold | | +|F.fold | :heavy_check_mark: | :heavy_check_mark: | |F.fractional_max_pool2d | | |F.fractional_max_pool3d | | |F.gelu | :heavy_check_mark: | :heavy_check_mark: | -|F.glu | | -|F.grid_sample | :heavy_check_mark: | +|F.glu | :heavy_check_mark: | :heavy_check_mark: | +|F.grid_sample | :heavy_check_mark: | :heavy_check_mark: | |F.group_norm | :heavy_check_mark: | :heavy_check_mark: | |F.gumbel_softmax | | |F.hardshrink | :heavy_check_mark: | @@ -655,7 +664,7 @@ TORCH_LIBRARY(upfirdn2d_op, m) { |F.tanhshrink | :heavy_check_mark: | |F.threshold | :heavy_check_mark: | |F.threshold_ | :heavy_check_mark: | -|F.unfold | | +|F.unfold | :heavy_check_mark: | :heavy_check_mark: | |F.upsample | :heavy_check_mark: | :heavy_check_mark: | |F.upsample_bilinear | :heavy_check_mark: | :heavy_check_mark: | |F.upsample_nearest | :heavy_check_mark: | :heavy_check_mark: | diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index f9e29f7fe2c0..9005df1ecc99 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -4,6 +4,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) set(pnnx_pass_level0_SRCS pass_level0/constant_unpooling.cpp pass_level0/inline_block.cpp + pass_level0/reset_device.cpp pass_level0/shape_inference.cpp ) @@ -37,7 +38,9 @@ set(pnnx_pass_level1_SRCS pass_level1/nn_Dropout3d.cpp pass_level1/nn_ELU.cpp pass_level1/nn_Embedding.cpp + pass_level1/nn_Fold.cpp pass_level1/nn_GELU.cpp + pass_level1/nn_GLU.cpp pass_level1/nn_GroupNorm.cpp pass_level1/nn_GRU.cpp pass_level1/nn_Hardshrink.cpp @@ -78,6 +81,7 @@ set(pnnx_pass_level1_SRCS pass_level1/nn_Sigmoid.cpp pass_level1/nn_SiLU.cpp pass_level1/nn_Softmax.cpp + pass_level1/nn_Softmax2d.cpp pass_level1/nn_Softmin.cpp pass_level1/nn_Softplus.cpp pass_level1/nn_Softshrink.cpp @@ -85,6 +89,7 @@ set(pnnx_pass_level1_SRCS pass_level1/nn_Tanh.cpp pass_level1/nn_Tanhshrink.cpp pass_level1/nn_Threshold.cpp + pass_level1/nn_Unfold.cpp pass_level1/nn_Upsample.cpp pass_level1/nn_UpsamplingBilinear2d.cpp pass_level1/nn_UpsamplingNearest2d.cpp @@ -126,6 +131,7 @@ set(pnnx_pass_level2_SRCS pass_level2/F_elu.cpp pass_level2/F_embedding.cpp pass_level2/F_feature_alpha_dropout.cpp + pass_level2/F_fold.cpp pass_level2/F_gelu.cpp pass_level2/F_glu.cpp pass_level2/F_grid_sample.cpp @@ -167,10 +173,12 @@ set(pnnx_pass_level2_SRCS pass_level2/F_tanh.cpp pass_level2/F_tanhshrink.cpp pass_level2/F_threshold.cpp + pass_level2/F_unfold.cpp pass_level2/F_upsample_bilinear.cpp pass_level2/F_upsample_nearest.cpp pass_level2/F_upsample.cpp pass_level2/Tensor_contiguous.cpp + pass_level2/Tensor_copy.cpp pass_level2/Tensor_expand.cpp pass_level2/Tensor_expand_as.cpp pass_level2/Tensor_index.cpp @@ -194,6 +202,8 @@ set(pnnx_pass_level2_SRCS pass_level2/torch_bitwise_and.cpp pass_level2/torch_bitwise_or.cpp pass_level2/torch_bitwise_xor.cpp + pass_level2/torch_bitwise_left_shift.cpp + pass_level2/torch_bitwise_right_shift.cpp pass_level2/torch_cat.cpp pass_level2/torch_chunk.cpp pass_level2/torch_clamp.cpp @@ -300,10 +310,11 @@ set(pnnx_pass_level5_SRCS pass_level5/eliminate_noop_expression.cpp pass_level5/eliminate_noop_pad.cpp pass_level5/eliminate_noop_upsample.cpp - pass_level5/eliminate_slice.cpp - pass_level5/eliminate_view_reshape.cpp + pass_level5/eliminate_noop_slice.cpp + pass_level5/eliminate_noop_view_reshape.cpp pass_level5/eval_expression.cpp pass_level5/fold_constants.cpp + pass_level5/fuse_adjacent_reshape.cpp 
pass_level5/fuse_channel_shuffle.cpp pass_level5/fuse_constant_expression.cpp pass_level5/fuse_conv1d_batchnorm1d.cpp @@ -312,10 +323,19 @@ set(pnnx_pass_level5_SRCS pass_level5/fuse_convtranspose2d_batchnorm2d.cpp pass_level5/fuse_contiguous_view.cpp pass_level5/fuse_linear_batchnorm1d.cpp + pass_level5/fuse_pad_conv1d.cpp + pass_level5/fuse_pad_conv2d.cpp pass_level5/fuse_select_to_unbind.cpp + pass_level5/fuse_slice_copy.cpp pass_level5/fuse_slice_indices.cpp pass_level5/fuse_slice_to_tensor_split.cpp + pass_level5/fuse_static_batchnorm.cpp pass_level5/fuse_static_conv.cpp + pass_level5/fuse_static_convtranspose.cpp + pass_level5/fuse_static_groupnorm.cpp + pass_level5/fuse_static_instancenorm.cpp + pass_level5/fuse_static_layernorm.cpp + pass_level5/fuse_static_linear.cpp pass_level5/normalize_einsum_equation.cpp pass_level5/unroll_rnn_op.cpp ) @@ -338,7 +358,6 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/insert_split.cpp pass_ncnn/chain_multi_output.cpp pass_ncnn/solve_batch_index.cpp - pass_ncnn/convert_to_fp16_model.cpp pass_ncnn/eliminate_noop.cpp pass_ncnn/eliminate_tail_reshape_permute.cpp @@ -373,8 +392,10 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/F_conv3d.cpp pass_ncnn/F_elu.cpp pass_ncnn/F_embedding.cpp + pass_ncnn/F_fold.cpp pass_ncnn/F_gelu.cpp pass_ncnn/F_glu.cpp + pass_ncnn/F_grid_sample.cpp pass_ncnn/F_group_norm.cpp pass_ncnn/F_hardsigmoid.cpp pass_ncnn/F_hardswish.cpp @@ -401,6 +422,7 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/F_silu.cpp pass_ncnn/F_softmax.cpp pass_ncnn/F_tanh.cpp + pass_ncnn/F_unfold.cpp pass_ncnn/F_upsample_bilinear.cpp pass_ncnn/F_upsample_nearest.cpp pass_ncnn/F_upsample.cpp @@ -428,7 +450,9 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/nn_ConvTranspose3d.cpp pass_ncnn/nn_ELU.cpp pass_ncnn/nn_Embedding.cpp + pass_ncnn/nn_Fold.cpp pass_ncnn/nn_GELU.cpp + pass_ncnn/nn_GLU.cpp pass_ncnn/nn_GroupNorm.cpp pass_ncnn/nn_GRU.cpp pass_ncnn/nn_Hardsigmoid.cpp @@ -459,7 +483,9 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/nn_Sigmoid.cpp pass_ncnn/nn_SiLU.cpp pass_ncnn/nn_Softmax.cpp + pass_ncnn/nn_Softmax2d.cpp pass_ncnn/nn_Tanh.cpp + pass_ncnn/nn_Unfold.cpp pass_ncnn/nn_Upsample.cpp pass_ncnn/nn_UpsamplingBilinear2d.cpp pass_ncnn/nn_UpsamplingNearest2d.cpp @@ -495,6 +521,27 @@ set(pnnx_pass_ncnn_SRCS pass_ncnn/icefall_RelShift.cpp ) +find_package(Protobuf) +if(PROTOBUF_FOUND) + protobuf_generate_cpp(ONNX_PROTO_SRCS ONNX_PROTO_HDRS onnx.proto) + + add_library(pnnx2onnx STATIC + save_onnx.cpp + save_onnx_cxxabi_bridge.cpp + ${ONNX_PROTO_SRCS} ${ONNX_PROTO_HDRS} + ) + + target_include_directories(pnnx2onnx PRIVATE ${PROTOBUF_INCLUDE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) + target_link_libraries(pnnx2onnx PRIVATE ${PROTOBUF_LIBRARIES}) + + # libtorch is usually compiled with old cxx11 abi + set_source_files_properties(save_onnx_cxxabi_bridge.cpp PROPERTIES COMPILE_FLAGS "${TORCH_CXX_FLAGS}") + + message(STATUS "Building with onnx-zero") +else() + message(STATUS "Building without onnx-zero") +endif() + set(pnnx_SRCS main.cpp ir.cpp @@ -508,8 +555,6 @@ set(pnnx_SRCS pass_level4.cpp pass_level5.cpp - pass_ncnn.cpp - ${pnnx_pass_level0_SRCS} ${pnnx_pass_level1_SRCS} ${pnnx_pass_level2_SRCS} @@ -517,6 +562,8 @@ set(pnnx_SRCS ${pnnx_pass_level4_SRCS} ${pnnx_pass_level5_SRCS} + pass_ncnn.cpp + save_ncnn.cpp ${pnnx_pass_ncnn_SRCS} ) @@ -526,6 +573,8 @@ endif() add_executable(pnnx ${pnnx_SRCS}) +target_compile_definitions(pnnx PRIVATE BUILD_PNNX) + if(PNNX_COVERAGE) target_compile_options(pnnx PUBLIC -coverage -fprofile-arcs -ftest-coverage) target_link_libraries(pnnx PUBLIC -coverage -lgcov) @@ -535,6 +584,11 
@@ if(WIN32) target_compile_definitions(pnnx PUBLIC NOMINMAX) endif() +if(PROTOBUF_FOUND) + target_compile_definitions(pnnx PRIVATE BUILD_PNNX2ONNX) + target_link_libraries(pnnx PRIVATE pnnx2onnx) +endif() + if(TorchVision_FOUND) target_link_libraries(pnnx PRIVATE TorchVision::TorchVision) endif() diff --git a/tools/pnnx/src/ir.cpp b/tools/pnnx/src/ir.cpp index 86cdd75f720c..062092fe9c47 100644 --- a/tools/pnnx/src/ir.cpp +++ b/tools/pnnx/src/ir.cpp @@ -14,14 +14,18 @@ #include "ir.h" +#include #include +#include #include #include #include #include #include +#if BUILD_PNNX #include +#endif #include "storezip.h" @@ -129,6 +133,7 @@ static int string_to_type(const char* s) return 0; // null } +#if BUILD_PNNX int get_at_tensor_type(const at::ScalarType& st) { if (st == c10::ScalarType::Float) return 1; @@ -177,7 +182,10 @@ Parameter::Parameter(const torch::jit::Node* value_node) case c10::TypeKind::IntType: { type = 2; - i = (int)value_node->i(torch::jit::attr::value); + int64_t i64 = value_node->i(torch::jit::attr::value); + if (i64 == LONG_MAX) i64 = INT_MAX; + if (i64 == LONG_MIN) i64 = INT_MIN; + i = (int)i64; break; } case c10::TypeKind::FloatType: @@ -201,7 +209,10 @@ Parameter::Parameter(const torch::jit::Node* value_node) if (t.scalar_type() == c10::ScalarType::Long) { type = 2; - i = (int)t.item(); + int64_t i64 = t.item(); + if (i64 == LONG_MAX) i64 = INT_MAX; + if (i64 == LONG_MIN) i64 = INT_MIN; + i = (int)i64; } else if (t.scalar_type() == c10::ScalarType::Int) { @@ -288,6 +299,7 @@ Parameter::Parameter(const torch::jit::Value* value) : Parameter(value->node()) { } +#endif // BUILD_PNNX bool operator==(const Parameter& lhs, const Parameter& rhs) { @@ -321,6 +333,7 @@ bool operator==(const Parameter& lhs, const Parameter& rhs) return false; } +#if BUILD_PNNX Attribute::Attribute(const at::Tensor& t) { type = get_at_tensor_type(t.scalar_type()); @@ -377,6 +390,7 @@ Attribute::Attribute(const at::Tensor& t) memcpy((void*)data.data(), (const void*)t.cpu().contiguous().data_ptr(), data.size()); } } +#endif // BUILD_PNNX Attribute::Attribute(const std::initializer_list& _shape, const std::vector& t) { @@ -1046,14 +1060,58 @@ static std::string expand_expression(const Operator* op) std::string r = a + ".size(" + b + ")"; exprstack.push(r); } - else if (t == "int" || t == "sqrt" || t == "rsqrt" || t == "neg" || t == "floor") + else if (t == "int" + || t == "abs" + || t == "acos" + || t == "acosh" + || t == "asin" + || t == "asinh" + || t == "atan" + || t == "atanh" + || t == "ceil" + || t == "cos" + || t == "cosh" + || t == "exp" + || t == "floor" + || t == "log" + || t == "neg" + || t == "reciprocal" + || t == "rsqrt" + || t == "sign" + || t == "sin" + || t == "sinh" + || t == "sqrt" + || t == "square" + || t == "tan" + || t == "tanh" + || t == "trunc") { std::string unaryop; if (t == "int") unaryop = "int"; - if (t == "sqrt") unaryop = "torch.sqrt"; - if (t == "rsqrt") unaryop = "torch.rsqrt"; - if (t == "neg") unaryop = "torch.neg"; + if (t == "abs") unaryop = "torch.abs"; + if (t == "acos") unaryop = "torch.acos"; + if (t == "acosh") unaryop = "torch.acosh"; + if (t == "asin") unaryop = "torch.asin"; + if (t == "asinh") unaryop = "torch.asinh"; + if (t == "atan") unaryop = "torch.atan"; + if (t == "atanh") unaryop = "torch.atanh"; + if (t == "ceil") unaryop = "torch.ceil"; + if (t == "cos") unaryop = "torch.cos"; + if (t == "cosh") unaryop = "torch.cosh"; + if (t == "exp") unaryop = "torch.exp"; if (t == "floor") unaryop = "torch.floor"; + if (t == "log") unaryop = "torch.log"; + if 
(t == "neg") unaryop = "torch.neg"; + if (t == "reciprocal") unaryop = "torch.reciprocal"; + if (t == "rsqrt") unaryop = "torch.rsqrt"; + if (t == "sign") unaryop = "torch.sign"; + if (t == "sin") unaryop = "torch.sin"; + if (t == "sinh") unaryop = "torch.sinh"; + if (t == "sqrt") unaryop = "torch.sqrt"; + if (t == "square") unaryop = "torch.square"; + if (t == "tan") unaryop = "torch.tan"; + if (t == "tanh") unaryop = "torch.tanh"; + if (t == "trunc") unaryop = "torch.trunc"; std::string a = exprstack.top(); exprstack.pop(); @@ -1061,17 +1119,22 @@ static std::string expand_expression(const Operator* op) std::string r = unaryop + "(" + a + ")"; exprstack.push(r); } - else if (t == "pow") + else if (t == "atan2" + || t == "pow") { + std::string binaryop; + if (t == "atan2") binaryop = "torch.atan2"; + if (t == "pow") binaryop = "torch.pow"; + std::string a = exprstack.top(); exprstack.pop(); std::string b = exprstack.top(); exprstack.pop(); - std::string r = a + ".pow(" + b + ")"; + std::string r = binaryop + "(" + a + ", " + b + ")"; exprstack.push(r); } - else if (t == "add" || t == "sub" || t == "mul" || t == "div" || t == "floor_divide" || t == "and" || t == "or" || t == "xor") + else if (t == "add" || t == "sub" || t == "mul" || t == "div" || t == "floor_divide" || t == "and" || t == "or" || t == "xor" || t == "lshift" || t == "rshift") { std::string binaryop; if (t == "add") binaryop = "+"; @@ -1082,6 +1145,8 @@ static std::string expand_expression(const Operator* op) if (t == "and") binaryop = "&"; if (t == "or") binaryop = "|"; if (t == "xor") binaryop = "^"; + if (t == "lshift") binaryop = "<<"; + if (t == "rshift") binaryop = ">>"; std::string a = exprstack.top(); exprstack.pop(); @@ -1196,7 +1261,7 @@ static std::string make_slice_expression(const Operator* op) { std::vector ends = op->params.at("ends").ai; int end = ends[i]; - if (end != -1) + if (end != INT_MAX) r += std::to_string(end); } else @@ -1283,9 +1348,9 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath) fprintf(pyfp, "import torch.nn as nn\n"); fprintf(pyfp, "import torch.nn.functional as F\n"); fprintf(pyfp, "try:\n"); - fprintf(pyfp, "\timport torchvision\n"); + fprintf(pyfp, " import torchvision\n"); fprintf(pyfp, "except:\n"); - fprintf(pyfp, "\tpass\n"); + fprintf(pyfp, " pass\n"); fprintf(pyfp, "\n"); @@ -1595,6 +1660,13 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath) std::string slice_expr = make_slice_expression(op); fprintf(pyfp, "v_%s = v_%s[%s]\n", sanitize_identifier(op->outputs[0]->name).c_str(), sanitize_identifier(op->inputs[0]->name).c_str(), slice_expr.c_str()); } + else if (op->type == "Tensor.slice_copy") + { + // slice copy expr + std::string slice_expr = make_slice_expression(op); + fprintf(pyfp, "v_%s = v_%s\n", sanitize_identifier(op->outputs[0]->name).c_str(), sanitize_identifier(op->inputs[0]->name).c_str()); + fprintf(pyfp, " v_%s[%s] = v_%s\n", sanitize_identifier(op->outputs[0]->name).c_str(), slice_expr.c_str(), sanitize_identifier(op->inputs[1]->name).c_str()); + } else if (op->type == "Tensor.index") { // index expr @@ -1762,8 +1834,14 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath) fprintf(pyfp, " = self.%s(", sanitize_identifier(op->name).c_str()); if (op->inputs.size() == 1) { - const char* in0 = sanitize_identifier(op->inputs[0]->name).c_str(); - fprintf(pyfp, "v_%s, v_%s, v_%s", in0, in0, in0); + std::string in0 = sanitize_identifier(op->inputs[0]->name); + fprintf(pyfp, "v_%s, v_%s, v_%s", 
in0.c_str(), in0.c_str(), in0.c_str()); + } + else if (op->inputs.size() == 2) + { + std::string in0 = sanitize_identifier(op->inputs[0]->name); + std::string in1 = sanitize_identifier(op->inputs[1]->name); + fprintf(pyfp, "v_%s, v_%s, v_%s", in0.c_str(), in1.c_str(), in1.c_str()); } else { @@ -2233,314 +2311,6 @@ int Graph::python(const std::string& pypath, const std::string& pnnxbinpath) return 0; } -static bool string_is_positive_integer(const std::string& t) -{ - for (size_t i = 0; i < t.size(); i++) - { - if (t[i] < '0' || t[i] > '9') - return false; - } - - return true; -} - -int Graph::ncnn(const std::string& parampath, const std::string& binpath, const std::string& pypath) -{ - FILE* paramfp = fopen(parampath.c_str(), "wb"); - if (!paramfp) - { - fprintf(stderr, "fopen %s failed\n", parampath.c_str()); - return -1; - } - - FILE* binfp = fopen(binpath.c_str(), "wb"); - if (!binfp) - { - fprintf(stderr, "fopen %s failed\n", binpath.c_str()); - fclose(paramfp); - return -1; - } - - // magic - fprintf(paramfp, "7767517\n"); - - // op count and oprand count - fprintf(paramfp, "%d %d\n", (int)ops.size(), (int)operands.size()); - - for (const Operator* op : ops) - { - fprintf(paramfp, "%-24s %-24s %d %d", op->type.c_str(), op->name.c_str(), (int)op->inputs.size(), (int)op->outputs.size()); - - for (const Operand* oprand : op->inputs) - { - fprintf(paramfp, " %s", oprand->name.c_str()); - } - - for (const Operand* oprand : op->outputs) - { - fprintf(paramfp, " %s", oprand->name.c_str()); - } - - for (const auto& it : op->params) - { - const Parameter& param = it.second; - - if (!string_is_positive_integer(it.first)) - { - fprintf(stderr, "ignore %s %s param %s=", op->type.c_str(), op->name.c_str(), it.first.c_str()); - - if (param.type == 0) - { - fprintf(stderr, "None"); - } - if (param.type == 1) - { - if (param.b) - fprintf(stderr, "True"); - else - fprintf(stderr, "False"); - } - if (param.type == 2) - { - fprintf(stderr, "%d", param.i); - } - if (param.type == 3) - { - fprintf(stderr, "%e", param.f); - } - if (param.type == 4) - { - fprintf(stderr, "%s", param.s.c_str()); - } - if (param.type == 5) - { - fprintf(stderr, "("); - for (size_t i = 0; i < param.ai.size(); i++) - { - fprintf(stderr, "%d", param.ai[i]); - if (i + 1 != param.ai.size()) - fprintf(stderr, ","); - } - fprintf(stderr, ")"); - } - if (param.type == 6) - { - fprintf(stderr, "("); - for (size_t i = 0; i < param.af.size(); i++) - { - fprintf(stderr, "%e", param.af[i]); - if (i + 1 != param.af.size()) - fprintf(stderr, ","); - } - fprintf(stderr, ")"); - } - if (param.type == 7) - { - fprintf(stderr, "("); - for (size_t i = 0; i < param.as.size(); i++) - { - fprintf(stderr, "%s", param.as[i].c_str()); - if (i + 1 != param.as.size()) - fprintf(stderr, ","); - } - fprintf(stderr, ")"); - } - fprintf(stderr, "\n"); - - continue; - } - - const int idkey = std::stoi(it.first); - if (param.type == 2) - { - fprintf(paramfp, " %d=%d", idkey, param.i); - } - if (param.type == 3) - { - fprintf(paramfp, " %d=%e", idkey, param.f); - } - if (param.type == 5) - { - const int array_size = (int)param.ai.size(); - fprintf(paramfp, " %d=%d", -23300 - idkey, array_size); - for (size_t i = 0; i < param.ai.size(); i++) - { - fprintf(paramfp, ",%d", param.ai[i]); - } - } - if (param.type == 6) - { - const int array_size = (int)param.af.size(); - fprintf(paramfp, " %d=%d", -23300 - idkey, array_size); - for (size_t i = 0; i < param.af.size(); i++) - { - fprintf(paramfp, ",%e", param.af[i]); - } - } - } - - for (const auto& it : op->attrs) - 
{ - // fprintf(paramfp, " @%s=", it.first.c_str()); - - const Attribute& attr = it.second; - - fwrite(attr.data.data(), attr.data.size(), 1, binfp); - } - - // if (op->inputnames.size() == op->inputs.size()) - // { - // for (size_t i = 0; i < op->inputs.size(); i++) - // { - // const Operand* oprand = op->inputs[i]; - // fprintf(paramfp, " $%s=%s", op->inputnames[i].c_str(), oprand->name.c_str()); - // } - // } - - // for (const Operand* oprand : op->outputs) - // { - // if (oprand->params.find("__batch_index") == oprand->params.end()) - // continue; - // - // const int batch_index = oprand->params.at("__batch_index").i; - // - // fprintf(paramfp, " #%s=%d", oprand->name.c_str(), batch_index); - // } - - // for (const Operand* oprand : op->outputs) - // { - // if (oprand->shape.empty()) - // continue; - // - // fprintf(paramfp, " #%s=", oprand->name.c_str()); - // - // fprintf(paramfp, "("); - // for (int64_t i = 0; i < oprand->shape.size() - 1; i++) - // { - // fprintf(paramfp, "%d,", oprand->shape[i]); - // } - // if (oprand->shape.size() > 0) - // fprintf(paramfp, "%d", oprand->shape[oprand->shape.size() - 1]); - // fprintf(paramfp, ")"); - // - // fprintf(paramfp, type_to_string(oprand->type)); - // } - - fprintf(paramfp, "\n"); - } - - fclose(paramfp); - fclose(binfp); - - FILE* pyfp = fopen(pypath.c_str(), "wb"); - if (!pyfp) - { - fprintf(stderr, "fopen %s failed\n", pypath.c_str()); - return -1; - } - - fprintf(pyfp, "import numpy as np\n"); - fprintf(pyfp, "import ncnn\n"); - fprintf(pyfp, "import torch\n"); - - fprintf(pyfp, "\n"); - - // test inference - { - fprintf(pyfp, "def test_inference():\n"); - fprintf(pyfp, " torch.manual_seed(0)\n"); - - for (int input_index = 0;; input_index++) - { - std::string input_name = std::string("in") + std::to_string(input_index); - const Operand* r = get_operand(input_name); - if (!r) - break; - - if (type_is_integer(r->type)) - { - fprintf(pyfp, " %s = torch.randint(10, (", input_name.c_str()); - for (size_t i = 0; i < r->shape.size(); i++) - { - fprintf(pyfp, "%d", r->shape[i]); - if (i + 1 != r->shape.size() || r->shape.size() == 1) - fprintf(pyfp, ", "); - } - fprintf(pyfp, "), dtype=%s)\n", type_to_dtype_string(r->type)); - } - else - { - fprintf(pyfp, " %s = torch.rand(", input_name.c_str()); - for (size_t i = 0; i < r->shape.size(); i++) - { - fprintf(pyfp, "%d, ", r->shape[i]); - } - fprintf(pyfp, "dtype=%s)\n", type_to_dtype_string(r->type)); - } - } - - fprintf(pyfp, " out = []\n"); - fprintf(pyfp, "\n"); - - fprintf(pyfp, " with ncnn.Net() as net:\n"); - fprintf(pyfp, " net.load_param(\"%s\")\n", parampath.c_str()); - fprintf(pyfp, " net.load_model(\"%s\")\n", binpath.c_str()); - fprintf(pyfp, "\n"); - fprintf(pyfp, " with net.create_extractor() as ex:\n"); - - for (int input_index = 0;; input_index++) - { - std::string input_name = std::string("in") + std::to_string(input_index); - const Operand* r = get_operand(input_name); - if (!r) - break; - - const int batch_index = r->params.at("__batch_index").i; - if (batch_index != 233) - { - fprintf(pyfp, " ex.input(\"%s\", ncnn.Mat(%s.squeeze(%d).numpy()).clone())\n", input_name.c_str(), input_name.c_str(), batch_index); - } - else - { - fprintf(pyfp, " ex.input(\"%s\", ncnn.Mat(%s.numpy()).clone())\n", input_name.c_str(), input_name.c_str()); - } - } - - fprintf(pyfp, "\n"); - - for (int output_index = 0;; output_index++) - { - std::string output_name = std::string("out") + std::to_string(output_index); - const Operand* r = get_operand(output_name); - if (!r) - break; - - fprintf(pyfp, 
" _, %s = ex.extract(\"%s\")\n", output_name.c_str(), output_name.c_str()); - - const int batch_index = r->params.at("__batch_index").i; - if (batch_index != 233) - { - fprintf(pyfp, " out.append(torch.from_numpy(np.array(%s)).unsqueeze(%d))\n", output_name.c_str(), batch_index); - } - else - { - fprintf(pyfp, " out.append(torch.from_numpy(np.array(%s)))\n", output_name.c_str()); - } - } - - fprintf(pyfp, "\n"); - - fprintf(pyfp, " if len(out) == 1:\n"); - fprintf(pyfp, " return out[0]\n"); - fprintf(pyfp, " else:\n"); - fprintf(pyfp, " return tuple(out)\n"); - } - - fclose(pyfp); - - return 0; -} - int Graph::parse(const std::string& param) { std::istringstream is(param); @@ -2675,6 +2445,7 @@ Operator* Graph::new_operator_after(const std::string& type, const std::string& return op; } +#if BUILD_PNNX Operand* Graph::new_operand(const torch::jit::Value* v) { Operand* r = new Operand; @@ -2701,6 +2472,7 @@ Operand* Graph::new_operand(const torch::jit::Value* v) operands.push_back(r); return r; } +#endif // BUILD_PNNX Operand* Graph::new_operand(const std::string& name) { @@ -2721,4 +2493,15 @@ Operand* Graph::get_operand(const std::string& name) return 0; } +const Operand* Graph::get_operand(const std::string& name) const +{ + for (const Operand* r : operands) + { + if (r->name == name) + return r; + } + + return 0; +} + } // namespace pnnx diff --git a/tools/pnnx/src/ir.h b/tools/pnnx/src/ir.h index 06fe09c14bd2..740e40192cc9 100644 --- a/tools/pnnx/src/ir.h +++ b/tools/pnnx/src/ir.h @@ -17,9 +17,11 @@ #include #include +#include #include #include +#if BUILD_PNNX namespace torch { namespace jit { struct Value; @@ -29,6 +31,7 @@ struct Node; namespace at { class Tensor; } +#endif // BUILD_PNNX namespace pnnx { @@ -114,8 +117,10 @@ class Parameter { } +#if BUILD_PNNX Parameter(const torch::jit::Node* value_node); Parameter(const torch::jit::Value* value); +#endif // BUILD_PNNX static Parameter parse_from_string(const std::string& value); @@ -126,9 +131,11 @@ class Parameter bool b; int i; float f; - std::string s; std::vector ai; std::vector af; + + // keep std::string typed member the last for cross cxxabi compatibility + std::string s; std::vector as; }; @@ -142,7 +149,9 @@ class Attribute { } +#if BUILD_PNNX Attribute(const at::Tensor& t); +#endif // BUILD_PNNX Attribute(const std::initializer_list& shape, const std::vector& t); @@ -164,8 +173,6 @@ class Operand public: void remove_consumer(const Operator* c); - std::string name; - Operator* producer; std::vector consumers; @@ -173,6 +180,9 @@ class Operand int type; std::vector shape; + // keep std::string typed member the last for cross cxxabi compatibility + std::string name; + std::map params; private: @@ -185,12 +195,13 @@ class Operand class Operator { public: - std::string type; - std::string name; - std::vector inputs; std::vector outputs; + // keep std::string typed member the last for cross cxxabi compatibility + std::string type; + std::string name; + std::vector inputnames; std::map params; std::map attrs; @@ -213,8 +224,6 @@ class Graph int python(const std::string& pypath, const std::string& binpath); - int ncnn(const std::string& parampath, const std::string& binpath, const std::string& pypath); - int parse(const std::string& param); Operator* new_operator(const std::string& type, const std::string& name); @@ -223,11 +232,14 @@ class Graph Operator* new_operator_after(const std::string& type, const std::string& name, const Operator* cur); +#if BUILD_PNNX Operand* new_operand(const torch::jit::Value* v); +#endif Operand* 
new_operand(const std::string& name); Operand* get_operand(const std::string& name); + const Operand* get_operand(const std::string& name) const; std::vector ops; std::vector operands; diff --git a/tools/pnnx/src/main.cpp b/tools/pnnx/src/main.cpp index 87ecfecd6482..98066c4c547c 100644 --- a/tools/pnnx/src/main.cpp +++ b/tools/pnnx/src/main.cpp @@ -39,6 +39,11 @@ #include "pass_level5.h" #include "pass_ncnn.h" +#include "save_ncnn.h" + +#if BUILD_PNNX2ONNX +#include "save_onnx.h" +#endif static std::string get_basename(const std::string& path) { @@ -159,9 +164,11 @@ static void show_usage() fprintf(stderr, " pnnxparam=model.pnnx.param\n"); fprintf(stderr, " pnnxbin=model.pnnx.bin\n"); fprintf(stderr, " pnnxpy=model_pnnx.py\n"); + fprintf(stderr, " pnnxonnx=model.pnnx.onnx\n"); fprintf(stderr, " ncnnparam=model.ncnn.param\n"); fprintf(stderr, " ncnnbin=model.ncnn.bin\n"); fprintf(stderr, " ncnnpy=model_ncnn.py\n"); + fprintf(stderr, " fp16=1\n"); fprintf(stderr, " optlevel=2\n"); fprintf(stderr, " device=cpu/gpu\n"); fprintf(stderr, " inputshape=[1,3,224,224],...\n"); @@ -200,9 +207,11 @@ int main(int argc, char** argv) std::string pnnxparampath = ptbase + ".pnnx.param"; std::string pnnxbinpath = ptbase + ".pnnx.bin"; std::string pnnxpypath = ptbase + "_pnnx.py"; + std::string pnnxonnxpath = ptbase + ".pnnx.onnx"; std::string ncnnparampath = ptbase + ".ncnn.param"; std::string ncnnbinpath = ptbase + ".ncnn.bin"; std::string ncnnpypath = ptbase + "_ncnn.py"; + int fp16 = 1; int optlevel = 2; std::string device = "cpu"; std::vector > input_shapes; @@ -235,12 +244,16 @@ int main(int argc, char** argv) pnnxbinpath = std::string(value); if (strcmp(key, "pnnxpy") == 0) pnnxpypath = std::string(value); + if (strcmp(key, "pnnxonnx") == 0) + pnnxonnxpath = std::string(value); if (strcmp(key, "ncnnparam") == 0) ncnnparampath = std::string(value); if (strcmp(key, "ncnnbin") == 0) ncnnbinpath = std::string(value); if (strcmp(key, "ncnnpy") == 0) ncnnpypath = std::string(value); + if (strcmp(key, "fp16") == 0) + fp16 = atoi(value); if (strcmp(key, "optlevel") == 0) optlevel = atoi(value); if (strcmp(key, "device") == 0) @@ -260,9 +273,11 @@ int main(int argc, char** argv) fprintf(stderr, "pnnxparam = %s\n", pnnxparampath.c_str()); fprintf(stderr, "pnnxbin = %s\n", pnnxbinpath.c_str()); fprintf(stderr, "pnnxpy = %s\n", pnnxpypath.c_str()); + fprintf(stderr, "pnnxonnx = %s\n", pnnxonnxpath.c_str()); fprintf(stderr, "ncnnparam = %s\n", ncnnparampath.c_str()); fprintf(stderr, "ncnnbin = %s\n", ncnnbinpath.c_str()); fprintf(stderr, "ncnnpy = %s\n", ncnnpypath.c_str()); + fprintf(stderr, "fp16 = %d\n", fp16); fprintf(stderr, "optlevel = %d\n", optlevel); fprintf(stderr, "device = %s\n", device.c_str()); fprintf(stderr, "inputshape = "); @@ -327,7 +342,7 @@ int main(int argc, char** argv) try { - mod = torch::jit::load(ptpath); + mod = torch::jit::load(ptpath, (device == "gpu") ? 
c10::kCUDA : c10::kCPU); } catch (const c10::Error& e) { @@ -358,8 +373,9 @@ int main(int argc, char** argv) fprintf(stderr, "############# pass_level0\n"); - std::map foldable_constants; - pnnx::pass_level0(mod, g, input_tensors, input_tensors2, module_operators, ptpath, foldable_constants); + std::set foldable_constants; + std::string foldable_constants_zippath = ptbase + ".foldable_constants.zip"; + pnnx::pass_level0(mod, g, input_tensors, input_tensors2, module_operators, ptpath, device, foldable_constants, foldable_constants_zippath); // g->dump(); @@ -393,20 +409,29 @@ int main(int argc, char** argv) { fprintf(stderr, "############# pass_level5\n"); - pnnx::pass_level5(pnnx_graph, foldable_constants); + pnnx::pass_level5(pnnx_graph, foldable_constants, foldable_constants_zippath); } + // delete foldable_constants_zippath + remove(foldable_constants_zippath.c_str()); + pnnx_graph.save(pnnxparampath, pnnxbinpath); pnnx_graph.python(pnnxpypath, pnnxbinpath); +#if BUILD_PNNX2ONNX + pnnx::save_onnx(pnnx_graph, pnnxonnxpath.c_str(), fp16); +#else + fprintf(stderr, "pnnx build without onnx-zero support, skip saving onnx\n"); +#endif + // if (optlevel >= 2) { fprintf(stderr, "############# pass_ncnn\n"); pnnx::pass_ncnn(pnnx_graph); - pnnx_graph.ncnn(ncnnparampath, ncnnbinpath, ncnnpypath); + pnnx::save_ncnn(pnnx_graph, ncnnparampath, ncnnbinpath, ncnnpypath, fp16); } // pnnx::Graph pnnx_graph2; diff --git a/tools/pnnx/src/onnx.proto b/tools/pnnx/src/onnx.proto new file mode 100644 index 000000000000..461bd0b78cd6 --- /dev/null +++ b/tools/pnnx/src/onnx.proto @@ -0,0 +1,505 @@ +// +// WARNING: This file is automatically generated! Please edit onnx.in.proto. +// + + +// Copyright (c) ONNX Project Contributors. +// Licensed under the MIT license. + +syntax = "proto2"; + +package onnx; + +// Overview +// +// ONNX is an open specification that is comprised of the following components: +// +// 1) A definition of an extensible computation graph model. +// 2) Definitions of standard data types. +// 3) Definitions of built-in operators. +// +// This document describes the syntax of models and their computation graphs, +// as well as the standard data types. Together, they are referred to as the ONNX +// Intermediate Representation, or 'IR' for short. +// +// The normative semantic specification of the ONNX IR is found in docs/IR.md. +// Definitions of the built-in neural network operators may be found in docs/Operators.md. + +// Notes +// +// Release +// +// We are still in the very early stage of defining ONNX. The current +// version of ONNX is a starting point. While we are actively working +// towards a complete spec, we would like to get the community involved +// by sharing our working version of ONNX. +// +// Protobuf compatibility +// +// To simplify framework compatibility, ONNX is defined using the subset of protobuf +// that is compatible with both protobuf v2 and v3. This means that we do not use any +// protobuf features that are only available in one of the two versions. +// +// Here are the most notable contortions we have to carry out to work around +// these limitations: +// +// - No 'map' (added protobuf 3.0). We instead represent mappings as lists +// of key-value pairs, where order does not matter and duplicates +// are not allowed. 
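To illustrate the map-free convention described above, a minimal C++ sketch follows (not part of this patch): it folds ModelProto.metadata_props, the repeated StringStringEntryProto field defined further down in this file, back into an ordinary std::map via the accessors protoc generates from this schema. The header name onnx.pb.h and the helper collect_metadata are assumed names introduced only for the example.

```
// minimal sketch: fold the repeated key-value entries of metadata_props into a
// std::map, since the schema deliberately avoids the proto3 'map' feature
#include <map>
#include <string>

#include "onnx.pb.h" // assumed name of the protoc output for this onnx.proto

static std::map<std::string, std::string> collect_metadata(const onnx::ModelProto& model)
{
    std::map<std::string, std::string> props;
    for (int i = 0; i < model.metadata_props_size(); i++)
    {
        const onnx::StringStringEntryProto& entry = model.metadata_props(i);
        props[entry.key()] = entry.value(); // keys are documented as distinct
    }
    return props;
}
```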
+ + +// Versioning +// +// ONNX versioning is specified in docs/IR.md and elaborated on in docs/Versioning.md +// +// To be compatible with both proto2 and proto3, we will use a version number +// that is not defined by the default value but an explicit enum number. +enum Version { + // proto3 requires the first enum value to be zero. + // We add this just to appease the compiler. + _START_VERSION = 0; + // The version field is always serialized and we will use it to store the + // version that the graph is generated from. This helps us set up version + // control. + // For the IR, we are using simple numbers starting with with 0x00000001, + // which was the version we published on Oct 10, 2017. + IR_VERSION_2017_10_10 = 0x0000000000000001; + + // IR_VERSION 2 published on Oct 30, 2017 + // - Added type discriminator to AttributeProto to support proto3 users + IR_VERSION_2017_10_30 = 0x0000000000000002; + + // IR VERSION 3 published on Nov 3, 2017 + // - For operator versioning: + // - Added new message OperatorSetIdProto + // - Added opset_import in ModelProto + // - For vendor extensions, added domain in NodeProto + IR_VERSION_2017_11_3 = 0x0000000000000003; + + // IR VERSION 4 published on Jan 22, 2019 + // - Relax constraint that initializers should be a subset of graph inputs + // - Add type BFLOAT16 + IR_VERSION_2019_1_22 = 0x0000000000000004; + + // IR VERSION 5 published on March 18, 2019 + // - Add message TensorAnnotation. + // - Add quantization annotation in GraphProto to map tensor with its scale and zero point quantization parameters. + IR_VERSION = 0x0000000000000005; +} + +// Attributes +// +// A named attribute containing either singular float, integer, string, graph, +// and tensor values, or repeated float, integer, string, graph, and tensor values. +// An AttributeProto MUST contain the name field, and *only one* of the +// following content fields, effectively enforcing a C/C++ union equivalent. +message AttributeProto { + + // Note: this enum is structurally identical to the OpSchema::AttrType + // enum defined in schema.h. If you rev one, you likely need to rev the other. + enum AttributeType { + UNDEFINED = 0; + FLOAT = 1; + INT = 2; + STRING = 3; + TENSOR = 4; + GRAPH = 5; + + FLOATS = 6; + INTS = 7; + STRINGS = 8; + TENSORS = 9; + GRAPHS = 10; + } + + // The name field MUST be present for this version of the IR. + optional string name = 1; // namespace Attribute + + // if ref_attr_name is not empty, ref_attr_name is the attribute name in parent function. + // In this case, this AttributeProto does not contain data, and it's a reference of attribute + // in parent scope. + // NOTE: This should ONLY be used in function (sub-graph). It's invalid to be used in main graph. + optional string ref_attr_name = 21; + + // A human-readable documentation for this attribute. Markdown is allowed. + optional string doc_string = 13; + + // The type field MUST be present for this version of the IR. + // For 0.0.1 versions of the IR, this field was not defined, and + // implementations needed to use has_field hueristics to determine + // which value field was in use. For IR_VERSION 0.0.2 or later, this + // field MUST be set and match the f|i|s|t|... field in use. This + // change was made to accommodate proto3 implementations. 
+ optional AttributeType type = 20; // discriminator that indicates which field below is in use + + // Exactly ONE of the following fields must be present for this version of the IR + optional float f = 2; // float + optional int64 i = 3; // int + optional bytes s = 4; // UTF-8 string + optional TensorProto t = 5; // tensor value + optional GraphProto g = 6; // graph + // Do not use field below, it's deprecated. + // optional ValueProto v = 12; // value - subsumes everything but graph + + repeated float floats = 7; // list of floats + repeated int64 ints = 8; // list of ints + repeated bytes strings = 9; // list of UTF-8 strings + repeated TensorProto tensors = 10; // list of tensors + repeated GraphProto graphs = 11; // list of graph +} + +// Defines information on value, including the name, the type, and +// the shape of the value. +message ValueInfoProto { + // This field MUST be present in this version of the IR. + optional string name = 1; // namespace Value + // This field MUST be present in this version of the IR. + optional TypeProto type = 2; + // A human-readable documentation for this value. Markdown is allowed. + optional string doc_string = 3; +} + +// Nodes +// +// Computation graphs are made up of a DAG of nodes, which represent what is +// commonly called a "layer" or "pipeline stage" in machine learning frameworks. +// +// For example, it can be a node of type "Conv" that takes in an image, a filter +// tensor and a bias tensor, and produces the convolved output. +message NodeProto { + repeated string input = 1; // namespace Value + repeated string output = 2; // namespace Value + + // An optional identifier for this node in a graph. + // This field MAY be absent in ths version of the IR. + optional string name = 3; // namespace Node + + // The symbolic identifier of the Operator to execute. + optional string op_type = 4; // namespace Operator + // The domain of the OperatorSet that specifies the operator named by op_type. + optional string domain = 7; // namespace Domain + + // Additional named attributes. + repeated AttributeProto attribute = 5; + + // A human-readable documentation for this node. Markdown is allowed. + optional string doc_string = 6; +} + +// Models +// +// ModelProto is a top-level file/container format for bundling a ML model and +// associating its computation graph with metadata. +// +// The semantics of the model are described by the associated GraphProto. +message ModelProto { + // The version of the IR this model targets. See Version enum above. + // This field MUST be present. + optional int64 ir_version = 1; + + // The OperatorSets this model relies on. + // All ModelProtos MUST have at least one entry that + // specifies which version of the ONNX OperatorSet is + // being imported. + // + // All nodes in the ModelProto's graph will bind against the operator + // with the same-domain/same-op_type operator with the HIGHEST version + // in the referenced operator sets. + repeated OperatorSetIdProto opset_import = 8; + + // The name of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_name = 2; + + // The version of the framework or tool used to generate this model. + // This field SHOULD be present to indicate which implementation/tool/framework + // emitted the model. + optional string producer_version = 3; + + // Domain name of the model. + // We use reverse domain names as name space indicators. 
For example: + // `com.facebook.fair` or `com.microsoft.cognitiveservices` + // + // Together with `model_version` and GraphProto.name, this forms the unique identity of + // the graph. + optional string domain = 4; + + // The version of the graph encoded. See Version enum below. + optional int64 model_version = 5; + + // A human-readable documentation for this model. Markdown is allowed. + optional string doc_string = 6; + + // The parameterized graph that is evaluated to execute the model. + optional GraphProto graph = 7; + + // Named metadata values; keys should be distinct. + repeated StringStringEntryProto metadata_props = 14; +}; + +// StringStringEntryProto follows the pattern for cross-proto-version maps. +// See https://developers.google.com/protocol-buffers/docs/proto3#maps +message StringStringEntryProto { + optional string key = 1; + optional string value= 2; +}; + +message TensorAnnotation { + optional string tensor_name = 1; + // pairs to annotate tensor specified by above. + // The keys used in the mapping below must be pre-defined in ONNX spec. + // For example, for 8-bit linear quantization case, 'SCALE_TENSOR', 'ZERO_POINT_TENSOR' will be pre-defined as + // quantization parameter keys. + repeated StringStringEntryProto quant_parameter_tensor_names = 2; +} + + + +// Graphs +// +// A graph defines the computational logic of a model and is comprised of a parameterized +// list of nodes that form a directed acyclic graph based on their inputs and outputs. +// This is the equivalent of the "network" or "graph" in many deep learning +// frameworks. +message GraphProto { + // The nodes in the graph, sorted topologically. + repeated NodeProto node = 1; + + // The name of the graph. + optional string name = 2; // namespace Graph + + // A list of named tensor values, used to specify constant inputs of the graph. + // Each TensorProto entry must have a distinct name (within the list) that + // MAY also appear in the input list. + repeated TensorProto initializer = 5; + + // A human-readable documentation for this graph. Markdown is allowed. + optional string doc_string = 10; + + // The inputs and outputs of the graph. + repeated ValueInfoProto input = 11; + repeated ValueInfoProto output = 12; + + // Information for the values in the graph. The ValueInfoProto.name's + // must be distinct. It is optional for a value to appear in value_info list. + repeated ValueInfoProto value_info = 13; + + // This field carries information to indicate the mapping among a tensor and its + // quantization parameter tensors. For example: + // For tensor 'a', it may have {'SCALE_TENSOR', 'a_scale'} and {'ZERO_POINT_TENSOR', 'a_zero_point'} annotated, + // which means, tensor 'a_scale' and tensor 'a_zero_point' are scale and zero point of tensor 'a' in the model. + repeated TensorAnnotation quantization_annotation = 14; + + // DO NOT USE the following fields, they were deprecated from earlier versions. + // repeated string input = 3; + // repeated string output = 4; + // optional int64 ir_version = 6; + // optional int64 producer_version = 7; + // optional string producer_tag = 8; + // optional string domain = 9; +} + +// Tensors +// +// A serialized tensor value. +message TensorProto { + enum DataType { + UNDEFINED = 0; + // Basic types. 
+ FLOAT = 1; // float + UINT8 = 2; // uint8_t + INT8 = 3; // int8_t + UINT16 = 4; // uint16_t + INT16 = 5; // int16_t + INT32 = 6; // int32_t + INT64 = 7; // int64_t + STRING = 8; // string + BOOL = 9; // bool + + // IEEE754 half-precision floating-point format (16 bits wide). + // This format has 1 sign bit, 5 exponent bits, and 10 mantissa bits. + FLOAT16 = 10; + + DOUBLE = 11; + UINT32 = 12; + UINT64 = 13; + COMPLEX64 = 14; // complex with float32 real and imaginary components + COMPLEX128 = 15; // complex with float64 real and imaginary components + + // Non-IEEE floating-point format based on IEEE754 single-precision + // floating-point number truncated to 16 bits. + // This format has 1 sign bit, 8 exponent bits, and 7 mantissa bits. + BFLOAT16 = 16; + + // Future extensions go here. + } + + // The shape of the tensor. + repeated int64 dims = 1; + + // The data type of the tensor. + // This field MUST have a valid TensorProto.DataType value + optional int32 data_type = 2; + + // For very large tensors, we may want to store them in chunks, in which + // case the following fields will specify the segment that is stored in + // the current TensorProto. + message Segment { + optional int64 begin = 1; + optional int64 end = 2; + } + optional Segment segment = 3; + + // Tensor content must be organized in row-major order. + // + // Depending on the data_type field, exactly one of the fields below with + // name ending in _data is used to store the elements of the tensor. + + // For float and complex64 values + // Complex64 tensors are encoded as a single array of floats, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary component apparing in the + // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0 ,3.0 ,4.0] + // When this field is present, the data_type field MUST be FLOAT or COMPLEX64. + repeated float float_data = 4 [packed = true]; + + // For int32, uint8, int8, uint16, int16, bool, and float16 values + // float16 values must be bit-wise converted to an uint16_t prior + // to writing to the buffer. + // When this field is present, the data_type field MUST be + // INT32, INT16, INT8, UINT16, UINT8, BOOL, or FLOAT16 + repeated int32 int32_data = 5 [packed = true]; + + // For strings. + // Each element of string_data is a UTF-8 encoded Unicode + // string. No trailing null, no leading BOM. The protobuf "string" + // scalar type is not used to match ML community conventions. + // When this field is present, the data_type field MUST be STRING + repeated bytes string_data = 6; + + // For int64. + // When this field is present, the data_type field MUST be INT64 + repeated int64 int64_data = 7 [packed = true]; + + // Optionally, a name for the tensor. + optional string name = 8; // namespace Value + + // A human-readable documentation for this tensor. Markdown is allowed. + optional string doc_string = 12; + + // Serializations can either use one of the fields above, or use this + // raw bytes field. The only exception is the string case, where one is + // required to store the content in the repeated bytes string_data field. + // + // When this raw_data field is used to store tensor value, elements MUST + // be stored in as fixed-width, little-endian order. + // Floating-point data types MUST be stored in IEEE 754 format. + // Complex64 elements must be written as two consecutive FLOAT values, real component first. 
+ // Complex128 elements must be written as two consecutive DOUBLE values, real component first. + // Boolean type MUST be written one byte per tensor element (00000001 for true, 00000000 for false). + // + // Note: the advantage of specific field rather than the raw_data field is + // that in some cases (e.g. int data), protobuf does a better packing via + // variable length storage, and may lead to smaller binary footprint. + // When this field is present, the data_type field MUST NOT be STRING or UNDEFINED + optional bytes raw_data = 9; + + // Data can be stored inside the protobuf file using type-specific fields or raw_data. + // Alternatively, raw bytes data can be stored in an external file, using the external_data field. + // external_data stores key-value pairs describing data location. Recognized keys are: + // - "location" (required) - POSIX filesystem path relative to the directory where the ONNX + // protobuf model was stored + // - "offset" (optional) - position of byte at which stored data begins. Integer stored as string. + // Offset values SHOULD be multiples 4096 (page size) to enable mmap support. + // - "length" (optional) - number of bytes containing data. Integer stored as string. + // - "checksum" (optional) - SHA1 digest of file specified in under 'location' key. + repeated StringStringEntryProto external_data = 13; + + // Location of the data for this tensor. MUST be one of: + // - DEFAULT - data stored inside the protobuf message. Data is stored in raw_data (if set) otherwise in type-specified field. + // - EXTERNAL - data stored in an external location as described by external_data field. + enum DataLocation { + DEFAULT = 0; + EXTERNAL = 1; + } + + // If value not set, data is stored in raw_data (if set) otherwise in type-specified field. + optional DataLocation data_location = 14; + + // For double + // Complex128 tensors are encoded as a single array of doubles, + // with the real components appearing in odd numbered positions, + // and the corresponding imaginary component apparing in the + // subsequent even numbered position. (e.g., [1.0 + 2.0i, 3.0 + 4.0i] + // is encoded as [1.0, 2.0 ,3.0 ,4.0] + // When this field is present, the data_type field MUST be DOUBLE or COMPLEX128 + repeated double double_data = 10 [packed = true]; + + // For uint64 and uint32 values + // When this field is present, the data_type field MUST be + // UINT32 or UINT64 + repeated uint64 uint64_data = 11 [packed = true]; +} + +// Defines a tensor shape. A dimension can be either an integer value +// or a symbolic variable. A symbolic variable represents an unknown +// dimension. +message TensorShapeProto { + message Dimension { + oneof value { + int64 dim_value = 1; + string dim_param = 2; // namespace Shape + }; + // Standard denotation can optionally be used to denote tensor + // dimensions with standard semantic descriptions to ensure + // that operations are applied to the correct axis of a tensor. + // Refer to https://github.com/onnx/onnx/blob/master/docs/DimensionDenotation.md#denotation-definition + // for pre-defined dimension denotations. + optional string denotation = 3; + }; + repeated Dimension dim = 1; +} + +// Types +// +// The standard ONNX data types. +message TypeProto { + + message Tensor { + // This field MUST NOT have the value of UNDEFINED + // This field MUST have a valid TensorProto.DataType value + // This field MUST be present for this version of the IR. 
+ optional int32 elem_type = 1; + optional TensorShapeProto shape = 2; + } + + + oneof value { + // The type of a tensor. + Tensor tensor_type = 1; + + } + + // An optional denotation can be used to denote the whole + // type with a standard semantic description as to what is + // stored inside. Refer to https://github.com/onnx/onnx/blob/master/docs/TypeDenotation.md#type-denotation-definition + // for pre-defined type denotations. + optional string denotation = 6; +} + +// Operator Sets +// +// OperatorSets are uniquely identified by a (domain, opset_version) pair. +message OperatorSetIdProto { + // The domain of the operator set being identified. + // The empty string ("") or absence of this field implies the operator + // set that is defined as part of the ONNX specification. + // This field MUST be present in this version of the IR when referring to any other operator set. + optional string domain = 1; + + // The version of the operator set being identified. + // This field MUST be present in this version of the IR. + optional int64 version = 2; +} diff --git a/tools/pnnx/src/pass_level0.cpp b/tools/pnnx/src/pass_level0.cpp index d50f71bbe296..ba7b7d5109fc 100644 --- a/tools/pnnx/src/pass_level0.cpp +++ b/tools/pnnx/src/pass_level0.cpp @@ -16,19 +16,22 @@ #include "pass_level0/constant_unpooling.h" #include "pass_level0/inline_block.h" +#include "pass_level0/reset_device.h" #include "pass_level0/shape_inference.h" namespace pnnx { -void pass_level0(const torch::jit::Module& mod, std::shared_ptr& g, const std::vector& input_tensors, const std::vector& input_tensors2, const std::vector& module_operators, const std::string& ptpath, std::map& foldable_constants) +void pass_level0(const torch::jit::Module& mod, std::shared_ptr& g, const std::vector& input_tensors, const std::vector& input_tensors2, const std::vector& module_operators, const std::string& ptpath, const std::string& device, std::set& foldable_constants, const std::string& foldable_constants_zippath) { inline_block(g, module_operators); + reset_device(g, device); + constant_unpooling(g); if (!input_tensors.empty()) { - shape_inference(mod, g, input_tensors, input_tensors2, module_operators, ptpath, foldable_constants); + shape_inference(mod, g, input_tensors, input_tensors2, module_operators, ptpath, device, foldable_constants, foldable_constants_zippath); } } diff --git a/tools/pnnx/src/pass_level0.h b/tools/pnnx/src/pass_level0.h index 11543ddc8ffa..783a8522d4bf 100644 --- a/tools/pnnx/src/pass_level0.h +++ b/tools/pnnx/src/pass_level0.h @@ -20,7 +20,7 @@ namespace pnnx { -void pass_level0(const torch::jit::Module& mod, std::shared_ptr& g, const std::vector& input_tensors, const std::vector& input_tensors2, const std::vector& module_operators, const std::string& ptpath, std::map& foldable_constants); +void pass_level0(const torch::jit::Module& mod, std::shared_ptr& g, const std::vector& input_tensors, const std::vector& input_tensors2, const std::vector& module_operators, const std::string& ptpath, const std::string& device, std::set& foldable_constants, const std::string& foldable_constants_zippath); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level0/reset_device.cpp b/tools/pnnx/src/pass_level0/reset_device.cpp new file mode 100644 index 000000000000..b817e41a1f4e --- /dev/null +++ b/tools/pnnx/src/pass_level0/reset_device.cpp @@ -0,0 +1,36 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "reset_device.h" +#include "../pass_level1.h" + +namespace pnnx { + +void reset_device(std::shared_ptr& graph, const std::string& device) +{ + for (torch::jit::Node* n : graph->nodes()) + { + if (n->kind().toDisplayString() == std::string("aten::to")) + { + if (n->hasNamedInput("device")) + { + torch::jit::Node* device_node = n->namedInput("device")->node(); + + device_node->s_(torch::jit::attr::value, (device == "gpu") ? "cuda" : "cpu"); + } + } + } +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level0/reset_device.h b/tools/pnnx/src/pass_level0/reset_device.h new file mode 100644 index 000000000000..17d8f93995e8 --- /dev/null +++ b/tools/pnnx/src/pass_level0/reset_device.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
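For context, the reset_device pass above rewrites the device argument of every aten::to node it finds, so a module traced with hard-coded .to('cuda') calls can still be exported on a CPU-only machine (and vice versa). A rough, hypothetical driver in the spirit of pnnx's main.cpp is sketched below; the model path and the "cpu" choice are placeholders, while torch::jit::load, get_method and graph are standard libtorch APIs.

```
// hypothetical standalone driver, assuming libtorch and the reset_device pass above;
// pnnx itself runs this rewrite from pass_level0 before shape inference
#include <torch/script.h>

#include "pass_level0/reset_device.h"

int main()
{
    // load on CPU even if the module was saved from a GPU session,
    // mirroring torch::jit::load(ptpath, c10::kCPU) in pnnx main.cpp
    torch::jit::Module mod = torch::jit::load("model.pt", c10::kCPU); // placeholder path

    std::shared_ptr<torch::jit::Graph> graph = mod.get_method("forward").graph();

    // rewrite every aten::to(..., device=...) constant to "cpu",
    // i.e. what the pnnx device=cpu command line option requests
    pnnx::reset_device(graph, "cpu");

    return 0;
}
```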
+ +#include + +namespace pnnx { + +void reset_device(std::shared_ptr& graph, const std::string& device); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level0/shape_inference.cpp b/tools/pnnx/src/pass_level0/shape_inference.cpp index a7274f370134..b3fcf4a944a6 100644 --- a/tools/pnnx/src/pass_level0/shape_inference.cpp +++ b/tools/pnnx/src/pass_level0/shape_inference.cpp @@ -15,8 +15,10 @@ #include "shape_inference.h" #include +#include "storezip.h" #include "pass_level0/constant_unpooling.h" #include "pass_level0/inline_block.h" +#include "pass_level0/reset_device.h" #include "pass_level0/shape_inference.h" namespace pnnx { @@ -27,7 +29,15 @@ static bool value_link_input(const torch::jit::Value* v, const std::vectornode()->kind().toDisplayString(); - if (optype == "aten::size" || optype == "aten::new_empty" || optype == "aten::new_ones" || optype == "aten::new_zeros") + if (optype == "aten::size" + || optype == "aten::new_empty" + || optype == "aten::new_full" + || optype == "aten::new_ones" + || optype == "aten::new_zeros" + || optype == "aten::empty_like" + || optype == "aten::full_like" + || optype == "aten::ones_like" + || optype == "aten::zeros_like") return false; } @@ -69,7 +79,7 @@ static bool value_link_output(const torch::jit::Value* v, const std::vector& graph, const std::vector& input_tensors, const std::vector& input_tensors2, const std::vector& module_operators, const std::string& ptpath, std::map& foldable_constants) +void shape_inference(const torch::jit::Module& mod, std::shared_ptr& graph, const std::vector& input_tensors, const std::vector& input_tensors2, const std::vector& module_operators, const std::string& ptpath, const std::string& device, std::set& foldable_constants, const std::string& foldable_constants_zippath) { // collect all intermediate output tensors std::vector > more_value_names; @@ -133,7 +143,8 @@ void shape_inference(const torch::jit::Module& mod, std::shared_ptr output_tensors; + StoreZipWriter zip; + zip.open(foldable_constants_zippath); for (size_t p = 0; p < more_value_names.size(); p++) { @@ -142,13 +153,15 @@ void shape_inference(const torch::jit::Module& mod, std::shared_ptr values2; @@ -163,7 +176,7 @@ void shape_inference(const torch::jit::Module& mod, std::shared_ptrdebugName()) != value_names.end()) { values2.push_back(v); - fprintf(stderr, "%s ", v->debugName().c_str()); + // fprintf(stderr, "%s ", v->debugName().c_str()); } } } @@ -193,7 +206,16 @@ void shape_inference(const torch::jit::Module& mod, std::shared_ptr 0) + { + // fprintf(stderr, "foldable_constant %s\n", v->debugName().c_str()); + foldable_constants.insert(v->debugName()); + + at::Tensor t2 = t.cpu().contiguous(); + zip.write_file(v->debugName(), (const char*)t2.data_ptr(), t2.nbytes()); + } } } } @@ -231,12 +253,23 @@ void shape_inference(const torch::jit::Module& mod, std::shared_ptr 0) + { + // fprintf(stderr, "foldable_constant %s\n", v->debugName().c_str()); + foldable_constants.insert(v->debugName()); + + at::Tensor t2 = t.cpu().contiguous(); + zip.write_file(v->debugName(), (const char*)t2.data_ptr(), t2.nbytes()); + } } } } } + zip.close(); + if (input_tensors2.empty()) { for (size_t i = 0; i < input_tensors.size(); i++) @@ -269,33 +302,6 @@ void shape_inference(const torch::jit::Module& mod, std::shared_ptrinputs()[1 + i]->setType(finaltype); } } - - for (auto xx : output_tensors) - { - auto v = xx.first; - auto tensor = xx.second; - - bool link_to_output = false; - for (size_t i = 0; i < v->uses().size(); i++) - { - auto node = v->uses()[i].user; - for 
(auto x : node->outputs()) - { - if (output_tensors.find(x) == output_tensors.end()) - { - link_to_output = true; - break; - } - } - } - - const int ndim = (int)tensor.dim(); - if (link_to_output && ndim > 0) - { - fprintf(stderr, "foldable_constant %s\n", v->debugName().c_str()); - foldable_constants[v->debugName()] = Attribute(tensor); - } - } } } // namespace pnnx diff --git a/tools/pnnx/src/pass_level0/shape_inference.h b/tools/pnnx/src/pass_level0/shape_inference.h index cf80ade7abef..feabfffe64b1 100644 --- a/tools/pnnx/src/pass_level0/shape_inference.h +++ b/tools/pnnx/src/pass_level0/shape_inference.h @@ -18,6 +18,6 @@ namespace pnnx { -void shape_inference(const torch::jit::Module& mod, std::shared_ptr& graph, const std::vector& input_tensors, const std::vector& input_tensors2, const std::vector& module_operators, const std::string& ptpath, std::map& foldable_constants); +void shape_inference(const torch::jit::Module& mod, std::shared_ptr& graph, const std::vector& input_tensors, const std::vector& input_tensors2, const std::vector& module_operators, const std::string& ptpath, const std::string& device, std::set& foldable_constants, const std::string& foldable_constants_zippath); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level1.cpp b/tools/pnnx/src/pass_level1.cpp index 9b5e4460905c..0aaf4d897a88 100644 --- a/tools/pnnx/src/pass_level1.cpp +++ b/tools/pnnx/src/pass_level1.cpp @@ -376,10 +376,6 @@ void pass_level1(const torch::jit::Module& mod, const std::shared_ptrkind().toDisplayString(), name); - // always treat inplace op type as non-inplace version - if (op->type.size() > 2 && op->type[op->type.size() - 2] != '_' && op->type[op->type.size() - 1] == '_') - op->type = op->type.substr(0, op->type.size() - 1); - for (int i = 0; i < (int)n->inputs().size(); i++) { const auto& in = n->input(i); diff --git a/tools/pnnx/src/pass_level1/nn_Fold.cpp b/tools/pnnx/src/pass_level1/nn_Fold.cpp new file mode 100644 index 000000000000..045c1f6f1baf --- /dev/null +++ b/tools/pnnx/src/pass_level1/nn_Fold.cpp @@ -0,0 +1,48 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
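The widened optype list in value_link_input() above encodes a simple rule: ops such as aten::zeros_like or aten::new_full consume their tensor argument only through its shape and dtype, so a value reached from a graph input solely through them can still be captured as a foldable constant. Restated as a stand-alone helper for clarity (hypothetical name, not part of the patch):

#include <string>

// returns true when the op's output values do not depend on the values of its
// tensor input, only on its metadata, so the "linked to graph input" property
// stops propagating at this node
static bool op_reads_only_metadata(const std::string& optype)
{
    return optype == "aten::size"
           || optype == "aten::new_empty"
           || optype == "aten::new_full"
           || optype == "aten::new_ones"
           || optype == "aten::new_zeros"
           || optype == "aten::empty_like"
           || optype == "aten::full_like"
           || optype == "aten::ones_like"
           || optype == "aten::zeros_like";
}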
+ +#include "pass_level1.h" + +#include "../utils.h" + +namespace pnnx { + +class Fold : public FuseModulePass +{ +public: + const char* match_type_str() const + { + return "__torch__.torch.nn.modules.fold.Fold"; + } + + const char* type_str() const + { + return "nn.Fold"; + } + + void write(Operator* op, const std::shared_ptr& graph) const + { + const torch::jit::Node* col2im = find_node_by_kind(graph, "aten::col2im"); + + op->params["output_size"] = col2im->namedInput("output_size"); + op->params["kernel_size"] = col2im->namedInput("kernel_size"); + op->params["stride"] = col2im->namedInput("stride"); + op->params["padding"] = col2im->namedInput("padding"); + op->params["dilation"] = col2im->namedInput("dilation"); + } +}; + +REGISTER_GLOBAL_PNNX_FUSE_MODULE_PASS(Fold) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level1/nn_GLU.cpp b/tools/pnnx/src/pass_level1/nn_GLU.cpp new file mode 100644 index 000000000000..72af2f3f0a62 --- /dev/null +++ b/tools/pnnx/src/pass_level1/nn_GLU.cpp @@ -0,0 +1,45 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// 2022 Xiaomi Corp. (author: Fangjun Kuang) +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "pass_level1.h" + +#include "../utils.h" + +namespace pnnx { + +class GLU : public FuseModulePass +{ +public: + const char* match_type_str() const + { + return "__torch__.torch.nn.modules.activation.GLU"; + } + + const char* type_str() const + { + return "nn.GLU"; + } + + void write(Operator* op, const std::shared_ptr& graph) const + { + const torch::jit::Node* glu = find_node_by_kind(graph, "aten::glu"); + + op->params["dim"] = glu->namedInput("dim"); + } +}; + +REGISTER_GLOBAL_PNNX_FUSE_MODULE_PASS(GLU) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level1/nn_LSTM.cpp b/tools/pnnx/src/pass_level1/nn_LSTM.cpp index cf345dce2ae6..a2354dfadfa7 100644 --- a/tools/pnnx/src/pass_level1/nn_LSTM.cpp +++ b/tools/pnnx/src/pass_level1/nn_LSTM.cpp @@ -33,9 +33,9 @@ class LSTM : public FuseModulePass void write(Operator* op, const std::shared_ptr& graph, const torch::jit::Module& mod) const { - // mod.dump(true, true, true); - - // graph->dump(); + // mod.dump(true, true, true); + // + // graph->dump(); const torch::jit::Node* lstm = find_node_by_kind(graph, "aten::lstm"); @@ -49,12 +49,13 @@ class LSTM : public FuseModulePass op->params["pnnx_rnn_output_swapped"] = 1; } - // for (auto aa : lstm->schema().arguments()) - // { - // fprintf(stderr, "arg %s\n", aa.name().c_str()); - // } + // for (auto aa : lstm->schema().arguments()) + // { + // fprintf(stderr, "arg %s\n", aa.name().c_str()); + // } const auto& weight_ih_l0 = mod.attr("weight_ih_l0").toTensor(); + const auto& weight_hh_l0 = mod.attr("weight_hh_l0").toTensor(); op->params["input_size"] = weight_ih_l0.size(1); op->params["hidden_size"] = weight_ih_l0.size(0) / 4; @@ -62,17 +63,12 @@ class LSTM : public FuseModulePass op->params["bias"] = lstm->namedInput("has_biases"); op->params["batch_first"] = lstm->namedInput("batch_first"); op->params["bidirectional"] = lstm->namedInput("bidirectional"); - - int32_t proj_size = 0; - if (mod.hasattr("weight_hr_l0")) { - torch::Tensor w_hr = mod.attr("weight_hr_l0").toTensor(); - proj_size = w_hr.size(0); - } - op->params["proj_size"] = proj_size; + op->params["proj_size"] = weight_ih_l0.size(0) / 4 == weight_hh_l0.size(1) ? 
0 : weight_hh_l0.size(1); const int num_layers = op->params["num_layers"].i; const bool bias = op->params["bias"].b; const bool bidirectional = op->params["bidirectional"].b; + const int proj_size = op->params["proj_size"].i; for (int k = 0; k < num_layers; k++) { @@ -82,11 +78,6 @@ class LSTM : public FuseModulePass op->attrs[weight_ih_lk_key] = mod.attr(weight_ih_lk_key).toTensor(); op->attrs[weight_hh_lk_key] = mod.attr(weight_hh_lk_key).toTensor(); - if (proj_size) { - std::string weight_hr_lk_key = std::string("weight_hr_l") + std::to_string(k); - op->attrs[weight_hr_lk_key] = mod.attr(weight_hr_lk_key).toTensor(); - } - if (bias) { std::string bias_ih_lk_key = std::string("bias_ih_l") + std::to_string(k); @@ -96,6 +87,13 @@ class LSTM : public FuseModulePass op->attrs[bias_hh_lk_key] = mod.attr(bias_hh_lk_key).toTensor(); } + if (proj_size > 0) + { + std::string weight_hr_lk_key = std::string("weight_hr_l") + std::to_string(k); + + op->attrs[weight_hr_lk_key] = mod.attr(weight_hr_lk_key).toTensor(); + } + if (bidirectional) { std::string weight_ih_lk_reverse_key = std::string("weight_ih_l") + std::to_string(k) + "_reverse"; @@ -112,6 +110,13 @@ class LSTM : public FuseModulePass op->attrs[bias_ih_lk_reverse_key] = mod.attr(bias_ih_lk_reverse_key).toTensor(); op->attrs[bias_hh_lk_reverse_key] = mod.attr(bias_hh_lk_reverse_key).toTensor(); } + + if (proj_size > 0) + { + std::string weight_hr_lk_reverse_key = std::string("weight_hr_l") + std::to_string(k) + "_reverse"; + + op->attrs[weight_hr_lk_reverse_key] = mod.attr(weight_hr_lk_reverse_key).toTensor(); + } } } } diff --git a/tools/pnnx/src/pass_level1/nn_MultiheadAttention.cpp b/tools/pnnx/src/pass_level1/nn_MultiheadAttention.cpp index 5a54ac442db5..608bca6fdbcf 100644 --- a/tools/pnnx/src/pass_level1/nn_MultiheadAttention.cpp +++ b/tools/pnnx/src/pass_level1/nn_MultiheadAttention.cpp @@ -39,45 +39,75 @@ class MultiheadAttention : public FuseModulePass // graph->dump(); - const torch::jit::Node* div_num_heads = find_node_by_kind(graph, "aten::div"); - const torch::jit::Node* div_num_heads_18 = find_node_by_kind(graph, "aten::floor_divide"); - if (div_num_heads_18) + const torch::jit::Node* multi_head_attention = find_node_by_kind(graph, "aten::_native_multi_head_attention"); + if (multi_head_attention) { - div_num_heads = div_num_heads_18; + op->params["num_heads"] = multi_head_attention->namedInput("num_head"); + op->params["batch_first"] = true; + op->params["add_zero_attn"] = false; } + else + { + const torch::jit::Node* div_num_heads = find_node_by_kind(graph, "aten::div"); + const torch::jit::Node* div_num_heads_18 = find_node_by_kind(graph, "aten::floor_divide"); + if (div_num_heads_18) + { + div_num_heads = div_num_heads_18; + } - op->params["num_heads"] = (int)div_num_heads->input(1)->node()->t(torch::jit::attr::value).item(); + op->params["num_heads"] = (int)div_num_heads->input(1)->node()->t(torch::jit::attr::value).item(); - const torch::jit::Node* transpose_batch_seq = find_node_by_kind(graph, "aten::transpose"); + const torch::jit::Node* transpose_batch_seq = find_node_by_kind(graph, "aten::transpose"); - int transpose_dim0 = transpose_batch_seq->input(1)->node()->i(torch::jit::attr::value); - int transpose_dim1 = transpose_batch_seq->input(2)->node()->i(torch::jit::attr::value); - if (transpose_dim0 == 1 && transpose_dim1 == 0) - { - op->params["batch_first"] = true; - } + int transpose_dim0 = transpose_batch_seq->input(1)->node()->i(torch::jit::attr::value); + int transpose_dim1 = 
transpose_batch_seq->input(2)->node()->i(torch::jit::attr::value); + if (transpose_dim0 == 1 && transpose_dim1 == 0) + { + op->params["batch_first"] = true; + } #if TORCH_VERSION_MAJOR == 1 && TORCH_VERSION_MINOR >= 9 - else - { - op->params["batch_first"] = false; - } + else + { + op->params["batch_first"] = false; + } #endif - const torch::jit::Node* add_zero_attn = find_node_by_kind(graph, "aten::zeros"); - if (add_zero_attn) + const torch::jit::Node* add_zero_attn = find_node_by_kind(graph, "aten::zeros"); + if (add_zero_attn) + { + op->params["add_zero_attn"] = true; + } + else + { + op->params["add_zero_attn"] = false; + } + } + + if (mod.hasattr("in_proj_weight")) { - op->params["add_zero_attn"] = true; + const auto& in_proj_weight = mod.attr("in_proj_weight").toTensor(); + + op->params["embed_dim"] = in_proj_weight.size(1); + op->params["kdim"] = in_proj_weight.size(1); + op->params["vdim"] = in_proj_weight.size(1); + op->attrs["in_proj_weight"] = in_proj_weight; } else { - op->params["add_zero_attn"] = false; + const auto& q_proj_weight = mod.attr("q_proj_weight").toTensor(); + const auto& k_proj_weight = mod.attr("k_proj_weight").toTensor(); + const auto& v_proj_weight = mod.attr("v_proj_weight").toTensor(); + + op->params["embed_dim"] = q_proj_weight.size(1); + op->params["kdim"] = k_proj_weight.size(1); + op->params["vdim"] = v_proj_weight.size(1); + op->attrs["q_proj_weight"] = q_proj_weight; + op->attrs["k_proj_weight"] = k_proj_weight; + op->attrs["v_proj_weight"] = v_proj_weight; } - const auto& in_proj_weight = mod.attr("in_proj_weight").toTensor(); const auto& out_proj_weight = mod.attr("out_proj").toModule().attr("weight").toTensor(); - op->params["embed_dim"] = in_proj_weight.size(1); - op->attrs["in_proj_weight"] = in_proj_weight; op->attrs["out_proj.weight"] = out_proj_weight; if (mod.hasattr("in_proj_bias") && mod.attr("out_proj").toModule().hasattr("bias")) diff --git a/tools/pnnx/src/pass_level1/nn_Softmax2d.cpp b/tools/pnnx/src/pass_level1/nn_Softmax2d.cpp new file mode 100644 index 000000000000..c80404066233 --- /dev/null +++ b/tools/pnnx/src/pass_level1/nn_Softmax2d.cpp @@ -0,0 +1,37 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
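The nn.MultiheadAttention rework above handles two cases the previous code missed: graphs where newer PyTorch collapses the module into a single aten::_native_multi_head_attention node (num_heads is then read straight from its num_head input and batch_first is set to true), and modules built with kdim or vdim different from embed_dim, which carry separate q_proj_weight / k_proj_weight / v_proj_weight tensors instead of one packed in_proj_weight. Shape summary with illustrative sizes (embed_dim=256, kdim=64, vdim=32):

// packed case:    in_proj_weight : (3*256, 256)  ->  embed_dim = kdim = vdim = 256
// separate case:  q_proj_weight  : (256, 256)    ->  embed_dim = 256
//                 k_proj_weight  : (256, 64)     ->  kdim = 64
//                 v_proj_weight  : (256, 32)     ->  vdim = 32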
+ +#include "pass_level1.h" + +#include "../utils.h" + +namespace pnnx { + +class Softmax2d : public FuseModulePass +{ +public: + const char* match_type_str() const + { + return "__torch__.torch.nn.modules.activation.Softmax2d"; + } + + const char* type_str() const + { + return "nn.Softmax2d"; + } +}; + +REGISTER_GLOBAL_PNNX_FUSE_MODULE_PASS(Softmax2d) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level1/nn_Unfold.cpp b/tools/pnnx/src/pass_level1/nn_Unfold.cpp new file mode 100644 index 000000000000..1abf6201a832 --- /dev/null +++ b/tools/pnnx/src/pass_level1/nn_Unfold.cpp @@ -0,0 +1,47 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_level1.h" + +#include "../utils.h" + +namespace pnnx { + +class Unfold : public FuseModulePass +{ +public: + const char* match_type_str() const + { + return "__torch__.torch.nn.modules.fold.Unfold"; + } + + const char* type_str() const + { + return "nn.Unfold"; + } + + void write(Operator* op, const std::shared_ptr& graph) const + { + const torch::jit::Node* im2col = find_node_by_kind(graph, "aten::im2col"); + + op->params["kernel_size"] = im2col->namedInput("kernel_size"); + op->params["stride"] = im2col->namedInput("stride"); + op->params["padding"] = im2col->namedInput("padding"); + op->params["dilation"] = im2col->namedInput("dilation"); + } +}; + +REGISTER_GLOBAL_PNNX_FUSE_MODULE_PASS(Unfold) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2.cpp b/tools/pnnx/src/pass_level2.cpp index b461d1e5c0f0..e9a98d4b267f 100644 --- a/tools/pnnx/src/pass_level2.cpp +++ b/tools/pnnx/src/pass_level2.cpp @@ -39,6 +39,11 @@ bool GraphRewriterPass::match(const std::map& captured_p return match(captured_params); } +bool GraphRewriterPass::match(const std::map& /*matched_operators*/) const +{ + return true; +} + void GraphRewriterPass::write(Operator* op, const std::map& captured_params) const { for (auto x : captured_params) @@ -215,7 +220,7 @@ static bool match_operator(const Operator* a, const Operator* b, std::map& matched_operators, std::unordered_map& matched_inputs, std::map& captured_params, std::map& captured_attrs) +static bool match(const Operator* anchor, const Operator* pattern, std::map& matched_operators, std::map& matched_inputs, std::map& captured_params, std::map& captured_attrs) { if (!match_operator(anchor, pattern, captured_params, captured_attrs)) return false; @@ -290,9 +295,9 @@ void pnnx_graph_rewrite(Graph& graph, const GraphRewriterPass* pass, int& opinde bool matched = true; // lets match from output - std::unordered_map matched_operators; - std::unordered_map matched_inputs; - std::unordered_map matched_outputs; + std::map matched_operators; + std::map matched_inputs; + std::map matched_outputs; std::map captured_params; std::map captured_attrs; @@ -311,8 +316,8 @@ void pnnx_graph_rewrite(Graph& graph, const GraphRewriterPass* pass, int& 
opinde { const Operator* anchor = graph.ops[j]; - std::unordered_map matched_operators2; - std::unordered_map matched_inputs2; + std::map matched_operators2; + std::map matched_inputs2; std::map captured_params2; std::map captured_attrs2; if (!match(anchor, pattern2, matched_operators2, matched_inputs2, captured_params2, captured_attrs2)) @@ -372,7 +377,7 @@ void pnnx_graph_rewrite(Graph& graph, const GraphRewriterPass* pass, int& opinde break; } - if (matched && !pass->match(captured_params, captured_attrs)) + if (matched && (!pass->match(captured_params, captured_attrs) || !pass->match(matched_operators))) { matched_operators.clear(); matched_inputs.clear(); @@ -393,7 +398,7 @@ void pnnx_graph_rewrite(Graph& graph, const GraphRewriterPass* pass, int& opinde // lets replace // remove all operands inside matched graph - std::unordered_map operands_to_remove; + std::map operands_to_remove; for (auto& _x : matched_operators) { Operator* x = (Operator*)_x.second; @@ -502,8 +507,112 @@ void pnnx_graph_rewrite(Graph& graph, const GraphRewriterPass* pass, int& opinde } } +static void fix_inplace_copy_output(Graph& graph) +{ + while (1) + { + bool matched = false; + for (size_t i = 0; i < graph.ops.size(); i++) + { + Operator* op = graph.ops[i]; + + bool is_inplace_op = op->type.size() > 2 && op->type[op->type.size() - 2] != '_' && op->type[op->type.size() - 1] == '_'; + if (!is_inplace_op) + continue; + + // replace inplace op with non-inplace version + op->type = op->type.substr(0, op->type.size() - 1); + + if (op->type == "aten::copy") + continue; + + if (op->outputs[0]->consumers.size() != 0) + continue; + + matched = true; + + // find in0 from slice / select chain + Operand* in0 = op->inputs[0]; + while (in0->producer->type == "aten::slice" || in0->producer->type == "aten::select") + { + in0 = in0->producer->inputs[0]; + } + + // append copy for inplace op + Operator* op_copy = graph.new_operator_after("aten::copy", op->name + "_copy", op); + Operand* copy_out = graph.new_operand(op->name + "_copy_out"); + + copy_out->shape = in0->shape; + + op_copy->inputs.push_back(op->inputs[0]); + op_copy->inputs.push_back(op->outputs[0]); + op->inputs[0]->consumers.push_back(op_copy); + op->outputs[0]->consumers.push_back(op_copy); + + op_copy->outputs.push_back(copy_out); + copy_out->producer = op_copy; + + break; + } + + if (!matched) + break; + } + + for (size_t i = 0; i < graph.ops.size(); i++) + { + Operator* op = graph.ops[i]; + + if (op->type != "aten::copy") + continue; + + if (op->outputs[0]->consumers.size() != 0) + continue; + + // aten::slice 5 1 in0 .... a + // aten::slice 5 1 a .... b + // aten::copy 2 1 b in1 out + + // aten::select 3 1 in0 .... 
a + // aten::copy 2 1 a in1 out + + // find in0 from slice / select chain + Operand* in0 = op->inputs[0]; + while (in0->producer->type == "aten::slice" || in0->producer->type == "aten::select") + { + in0 = in0->producer->inputs[0]; + } + + // replace all the following uses of in0 with out + Operand* out0 = op->outputs[0]; + out0->shape = in0->shape; + for (size_t j = i; j < graph.ops.size(); j++) + { + Operator* op2 = graph.ops[j]; + + bool use_in0 = false; + for (size_t k = 0; k < op2->inputs.size(); k++) + { + if (op2->inputs[k] == in0) + { + op2->inputs[k] = out0; + use_in0 = true; + } + } + + if (use_in0) + { + in0->remove_consumer(op2); + out0->consumers.push_back(op2); + } + } + } +} + void pass_level2(Graph& g) { + fix_inplace_copy_output(g); + int opindex = 0; for (auto x : g_global_pnnx_graph_rewriter_passes) { diff --git a/tools/pnnx/src/pass_level2.h b/tools/pnnx/src/pass_level2.h index 1a0562be939d..af0fb8346df3 100644 --- a/tools/pnnx/src/pass_level2.h +++ b/tools/pnnx/src/pass_level2.h @@ -34,6 +34,8 @@ class GraphRewriterPass virtual bool match(const std::map& captured_params, const std::map& captured_attrs) const; + virtual bool match(const std::map& matched_operators) const; + virtual void write(Operator* op, const std::map& captured_params) const; virtual void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const; diff --git a/tools/pnnx/src/pass_level2/F_fold.cpp b/tools/pnnx/src/pass_level2/F_fold.cpp new file mode 100644 index 000000000000..39e3787fbc5d --- /dev/null +++ b/tools/pnnx/src/pass_level2/F_fold.cpp @@ -0,0 +1,45 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
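fix_inplace_copy_output() above is what replaces the blanket inplace-to-non-inplace renaming removed from pass_level1: every trailing-underscore op is still renamed, but an inplace op whose result is unused additionally gets an explicit aten::copy appended, and for each dangling aten::copy the tensor at the root of its slice/select chain has all later uses redirected to the copy output. A rough illustration (not taken from a real trace) for sliced in-place assignment such as x[:, 0:1] = y:

// before:  %s = aten::slice(%x, dim=1, start=0, end=1, step=1)
//          aten::copy_(%s, %y)        // result unused; relies on aliasing into %x
// after:   %c = aten::copy(%s, %y)    // renamed to the non-inplace form
//          every operator after the copy that consumed %x now consumes %c, which
//          the Tensor.copy rewriters below and fuse_slice_copy at level 5 turn
//          into an explicit slice-assignment on a full tensor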
+ +#include "pass_level2.h" + +namespace pnnx { + +class F_fold : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +8 7 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 output_size +pnnx.Input input_2 0 1 kernel_size +pnnx.Input input_3 0 1 dilation +pnnx.Input input_4 0 1 padding +pnnx.Input input_5 0 1 stride +aten::col2im op_0 6 1 input output_size kernel_size dilation padding stride out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.fold"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_fold, 10) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_glu.cpp b/tools/pnnx/src/pass_level2/F_glu.cpp index 5ca26b96ad21..f6ed24408fdc 100644 --- a/tools/pnnx/src/pass_level2/F_glu.cpp +++ b/tools/pnnx/src/pass_level2/F_glu.cpp @@ -16,21 +16,26 @@ namespace pnnx { -class F_glu : public GraphRewriterPass { - public: - const char *match_pattern_graph() const { - return R"PNNXIR(7767517 +class F_glu : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 4 3 pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 dim aten::glu op_1 2 1 input dim out pnnx.Output output 1 0 out )PNNXIR"; - } + } - const char *type_str() const { return "F.glu"; } + const char* type_str() const + { + return "F.glu"; + } }; REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_glu, 10) -} // namespace pnnx +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_softmax.cpp b/tools/pnnx/src/pass_level2/F_softmax.cpp index af34a58957c8..8a9352beba20 100644 --- a/tools/pnnx/src/pass_level2/F_softmax.cpp +++ b/tools/pnnx/src/pass_level2/F_softmax.cpp @@ -25,7 +25,7 @@ class F_softmax : public GraphRewriterPass 5 4 pnnx.Input input_0 0 1 input pnnx.Input input_1 0 1 dim -prim::Constant op_0 0 1 dtype value=None +prim::Constant op_0 0 1 dtype value=* aten::softmax op_1 3 1 input dim dtype out pnnx.Output output 1 0 out )PNNXIR"; diff --git a/tools/pnnx/src/pass_level2/F_unfold.cpp b/tools/pnnx/src/pass_level2/F_unfold.cpp new file mode 100644 index 000000000000..a6a236080d41 --- /dev/null +++ b/tools/pnnx/src/pass_level2/F_unfold.cpp @@ -0,0 +1,44 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "pass_level2.h" + +namespace pnnx { + +class F_unfold : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +7 6 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 kernel_size +pnnx.Input input_2 0 1 dilation +pnnx.Input input_3 0 1 padding +pnnx.Input input_4 0 1 stride +aten::im2col op_0 5 1 input kernel_size dilation padding stride out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.unfold"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_unfold, 10) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/F_upsample_nearest.cpp b/tools/pnnx/src/pass_level2/F_upsample_nearest.cpp index c544e8065bbd..72b78b414410 100644 --- a/tools/pnnx/src/pass_level2/F_upsample_nearest.cpp +++ b/tools/pnnx/src/pass_level2/F_upsample_nearest.cpp @@ -63,6 +63,29 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_upsample_nearest_1, 10) +class F_upsample_nearest_1_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 size +prim::Constant op_0 0 1 scale_factor value=None +aten::upsample_nearest2d op_1 3 1 input size scale_factor out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.upsample_nearest"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_upsample_nearest_1_1, 10) + class F_upsample_nearest_2 : public GraphRewriterPass { public: diff --git a/tools/pnnx/src/pass_level2/Tensor_copy.cpp b/tools/pnnx/src/pass_level2/Tensor_copy.cpp new file mode 100644 index 000000000000..d5369b29e8ac --- /dev/null +++ b/tools/pnnx/src/pass_level2/Tensor_copy.cpp @@ -0,0 +1,64 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "pass_level2.h" + +namespace pnnx { + +class Tensor_copy : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_0 0 1 self +pnnx.Input input_1 0 1 src +prim::Constant op_0 0 1 non_blocking value=* +aten::copy op_1 3 1 self src non_blocking out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Tensor.copy"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_copy, 20) + +class Tensor_copy_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input_0 0 1 self +pnnx.Input input_1 0 1 src +aten::copy op_1 2 1 self src out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Tensor.copy"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(Tensor_copy_1, 20) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/torch_bitwise_left_shift.cpp b/tools/pnnx/src/pass_level2/torch_bitwise_left_shift.cpp new file mode 100644 index 000000000000..4fadaad74af5 --- /dev/null +++ b/tools/pnnx/src/pass_level2/torch_bitwise_left_shift.cpp @@ -0,0 +1,41 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_level2.h" + +namespace pnnx { + +class torch_bitwise_left_shift : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 other +aten::bitwise_left_shift op_0 2 1 input other out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "torch.bitwise_left_shift"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_bitwise_left_shift, 20) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/torch_bitwise_right_shift.cpp b/tools/pnnx/src/pass_level2/torch_bitwise_right_shift.cpp new file mode 100644 index 000000000000..4db2560da3fe --- /dev/null +++ b/tools/pnnx/src/pass_level2/torch_bitwise_right_shift.cpp @@ -0,0 +1,41 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "pass_level2.h" + +namespace pnnx { + +class torch_bitwise_right_shift : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input_0 0 1 input +pnnx.Input input_1 0 1 other +aten::bitwise_right_shift op_0 2 1 input other out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "torch.bitwise_right_shift"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_bitwise_right_shift, 20) + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/torch_einsum.cpp b/tools/pnnx/src/pass_level2/torch_einsum.cpp index 771df403c9e7..f6b24757e50c 100644 --- a/tools/pnnx/src/pass_level2/torch_einsum.cpp +++ b/tools/pnnx/src/pass_level2/torch_einsum.cpp @@ -38,4 +38,27 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_einsum, 20) +class torch_einsum_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_0 0 1 equation +pnnx.Input input_1 0 1 operands +prim::Constant op_0 0 1 path value=None +aten::einsum op_1 3 1 equation operands path out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "torch.einsum"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_einsum_1, 20) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level3.cpp b/tools/pnnx/src/pass_level3.cpp index dc272845319b..182e5a343fe4 100644 --- a/tools/pnnx/src/pass_level3.cpp +++ b/tools/pnnx/src/pass_level3.cpp @@ -37,7 +37,7 @@ namespace pnnx { -void pass_level3(Graph& g, const std::map& foldable_constants) +void pass_level3(Graph& g, const std::set& foldable_constants) { assign_unique_name(g); diff --git a/tools/pnnx/src/pass_level3.h b/tools/pnnx/src/pass_level3.h index ac6a0b265fe5..53482824fff0 100644 --- a/tools/pnnx/src/pass_level3.h +++ b/tools/pnnx/src/pass_level3.h @@ -19,7 +19,7 @@ namespace pnnx { -void pass_level3(Graph& g, const std::map& foldable_constants); +void pass_level3(Graph& g, const std::set& foldable_constants); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level3/fuse_expression.cpp b/tools/pnnx/src/pass_level3/fuse_expression.cpp index 645a0a5d7f03..0866e1301c6c 100644 --- a/tools/pnnx/src/pass_level3/fuse_expression.cpp +++ b/tools/pnnx/src/pass_level3/fuse_expression.cpp @@ -65,30 +65,57 @@ static bool operand_maybe_tensor(const Operand* operand) return false; } - if (op->type == "aten::floor_divide" || op->type == "aten::mul" || op->type == "aten::div" || op->type == "aten::pow") + if (op->type == "aten::abs" + || op->type == "aten::acos" + || op->type == "aten::acosh" + || op->type == "aten::asin" + || op->type == "aten::asinh" + || op->type == "aten::atan" + || op->type == "aten::atanh" + || op->type == "aten::ceil" + || op->type == "aten::cos" + || op->type == "aten::cosh" + || op->type == "aten::exp" + || op->type == "aten::floor" + || op->type == "aten::log" + || op->type == "aten::neg" + || op->type == "aten::reciprocal" + || op->type == "aten::rsqrt" + || op->type == "aten::sign" + || op->type == "aten::sin" + || op->type == "aten::sinh" + || op->type == "aten::sqrt" + || op->type == "aten::square" + || op->type == "aten::tan" + || op->type == "aten::tanh" + || op->type == "aten::trunc") { - return operand_maybe_tensor(op->inputs[0]) || operand_maybe_tensor(op->inputs[1]); + return operand_maybe_tensor(op->inputs[0]); } - if (op->type == "aten::__and__" || op->type == "aten::__or__" || op->type == "aten::__xor__") + 
if (op->type == "aten::atan2" + || op->type == "aten::div" + || op->type == "aten::floor_divide" + || op->type == "aten::mul" + || op->type == "aten::pow") { return operand_maybe_tensor(op->inputs[0]) || operand_maybe_tensor(op->inputs[1]); } - if (op->type == "aten::add" || op->type == "aten::sub" || op->type == "aten::rsub") + if (op->type == "aten::__and__" || op->type == "aten::__or__" || op->type == "aten::__xor__" || op->type == "aten::__lshift__" || op->type == "aten::__rshift__") { - return operand_maybe_tensor(op->inputs[0]) || operand_maybe_tensor(op->inputs[1]) || operand_maybe_tensor(op->inputs[2]); + return operand_maybe_tensor(op->inputs[0]) || operand_maybe_tensor(op->inputs[1]); } - if (op->type == "aten::sqrt" || op->type == "aten::rsqrt" || op->type == "aten::neg" || op->type == "aten::floor" || op->type == "aten::exp") + if (op->type == "aten::add" || op->type == "aten::sub" || op->type == "aten::rsub") { - return operand_maybe_tensor(op->inputs[0]); + return operand_maybe_tensor(op->inputs[0]) || operand_maybe_tensor(op->inputs[1]) || operand_maybe_tensor(op->inputs[2]); } return true; } -static bool operand_is_foldable(const Operand* operand, const std::map& foldable_constants) +static bool operand_is_foldable(const Operand* operand, const std::set& foldable_constants) { if (foldable_constants.find(operand->name) != foldable_constants.end()) return true; @@ -107,7 +134,7 @@ static bool operand_is_foldable(const Operand* operand, const std::map& inputs, const std::map& foldable_constants, bool checksubgraph = true) +static void fuse_expression(Graph& graph, Operand* operand, std::string& expr, std::vector& inputs, const std::set& foldable_constants, bool checksubgraph = true) { // fprintf(stderr, "fuse_expression %s\n", operand->name.c_str()); @@ -246,7 +273,44 @@ static void fuse_expression(Graph& graph, Operand* operand, std::string& expr, s { fuse_expression(graph, op->inputs[0], expr, inputs, foldable_constants); } - else if (op->type == "aten::floor_divide" || op->type == "aten::mul" || op->type == "aten::div" || op->type == "aten::pow" || op->type == "aten::remainder") + else if (op->type == "aten::abs" + || op->type == "aten::acos" + || op->type == "aten::acosh" + || op->type == "aten::asin" + || op->type == "aten::asinh" + || op->type == "aten::atan" + || op->type == "aten::atanh" + || op->type == "aten::ceil" + || op->type == "aten::cos" + || op->type == "aten::cosh" + || op->type == "aten::exp" + || op->type == "aten::floor" + || op->type == "aten::log" + || op->type == "aten::neg" + || op->type == "aten::reciprocal" + || op->type == "aten::rsqrt" + || op->type == "aten::sign" + || op->type == "aten::sin" + || op->type == "aten::sinh" + || op->type == "aten::sqrt" + || op->type == "aten::square" + || op->type == "aten::tan" + || op->type == "aten::tanh" + || op->type == "aten::trunc") + { + std::string mathop = op->type.substr(6); + + expr += mathop; + expr += "("; + fuse_expression(graph, op->inputs[0], expr, inputs, foldable_constants); + expr += ")"; + } + else if (op->type == "aten::atan2" + || op->type == "aten::div" + || op->type == "aten::floor_divide" + || op->type == "aten::mul" + || op->type == "aten::pow" + || op->type == "aten::remainder") { std::string mathop = op->type.substr(6); @@ -257,11 +321,9 @@ static void fuse_expression(Graph& graph, Operand* operand, std::string& expr, s fuse_expression(graph, op->inputs[1], expr, inputs, foldable_constants); expr += ")"; } - else if (op->type == "aten::__and__" || op->type == "aten::__or__" || op->type 
== "aten::__xor__") + else if (op->type == "aten::__and__" || op->type == "aten::__or__" || op->type == "aten::__xor__" || op->type == "aten::__lshift__" || op->type == "aten::__rshift__") { - std::string mathop = op->type.substr(8, 3); - if (mathop == "or_") - mathop = "or"; + std::string mathop = op->type.substr(8, op->type.size() - 10); expr += mathop; expr += "("; @@ -326,36 +388,6 @@ static void fuse_expression(Graph& graph, Operand* operand, std::string& expr, s fuse_expression(graph, op->inputs[0], expr, inputs, foldable_constants); expr += ")"; } - else if (op->type == "aten::sqrt") - { - expr += "sqrt("; - fuse_expression(graph, op->inputs[0], expr, inputs, foldable_constants); - expr += ")"; - } - else if (op->type == "aten::rsqrt") - { - expr += "rsqrt("; - fuse_expression(graph, op->inputs[0], expr, inputs, foldable_constants); - expr += ")"; - } - else if (op->type == "aten::neg") - { - expr += "neg("; - fuse_expression(graph, op->inputs[0], expr, inputs, foldable_constants); - expr += ")"; - } - else if (op->type == "aten::floor") - { - expr += "floor("; - fuse_expression(graph, op->inputs[0], expr, inputs, foldable_constants); - expr += ")"; - } - else if (op->type == "aten::exp") - { - expr += "exp("; - fuse_expression(graph, op->inputs[0], expr, inputs, foldable_constants); - expr += ")"; - } else { auto it = std::find(inputs.begin(), inputs.end(), operand); @@ -378,7 +410,7 @@ static void fuse_expression(Graph& graph, Operand* operand, std::string& expr, s } } -void fuse_expression(Graph& graph, const std::map& foldable_constants) +void fuse_expression(Graph& graph, const std::set& foldable_constants) { int pnnx_expr_index = 0; @@ -415,11 +447,43 @@ void fuse_expression(Graph& graph, const std::map& folda { need_fuse = true; } - if (op->type == "aten::floor_divide" || op->type == "aten::add" || op->type == "aten::sub" || op->type == "aten::mul" || op->type == "aten::div" || op->type == "aten::sqrt" || op->type == "aten::rsub" || op->type == "aten::rsqrt" || op->type == "aten::neg" || op->type == "aten::pow" || op->type == "aten::remainder" || op->type == "aten::floor" || op->type == "aten::exp") + if (op->type == "aten::abs" + || op->type == "aten::acos" + || op->type == "aten::acosh" + || op->type == "aten::add" + || op->type == "aten::asin" + || op->type == "aten::asinh" + || op->type == "aten::atan" + || op->type == "aten::atanh" + || op->type == "aten::atan2" + || op->type == "aten::ceil" + || op->type == "aten::cos" + || op->type == "aten::cosh" + || op->type == "aten::div" + || op->type == "aten::exp" + || op->type == "aten::floor" + || op->type == "aten::floor_divide" + || op->type == "aten::log" + || op->type == "aten::mul" + || op->type == "aten::neg" + || op->type == "aten::pow" + || op->type == "aten::reciprocal" + || op->type == "aten::remainder" + || op->type == "aten::rsqrt" + || op->type == "aten::rsub" + || op->type == "aten::sign" + || op->type == "aten::sin" + || op->type == "aten::sinh" + || op->type == "aten::sqrt" + || op->type == "aten::square" + || op->type == "aten::sub" + || op->type == "aten::tan" + || op->type == "aten::tanh" + || op->type == "aten::trunc") { need_fuse = true; } - if (op->type == "aten::__and__" || op->type == "aten::__or__" || op->type == "aten::__xor__") + if (op->type == "aten::__and__" || op->type == "aten::__or__" || op->type == "aten::__xor__" || op->type == "aten::__lshift__" || op->type == "aten::__rshift__") { need_fuse = true; } diff --git a/tools/pnnx/src/pass_level3/fuse_expression.h 
b/tools/pnnx/src/pass_level3/fuse_expression.h index 77ae711c8afc..5c8297a2158b 100644 --- a/tools/pnnx/src/pass_level3/fuse_expression.h +++ b/tools/pnnx/src/pass_level3/fuse_expression.h @@ -16,6 +16,6 @@ namespace pnnx { -void fuse_expression(Graph& graph, const std::map& foldable_constants); +void fuse_expression(Graph& graph, const std::set& foldable_constants); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level3/fuse_input_unpack.cpp b/tools/pnnx/src/pass_level3/fuse_input_unpack.cpp index 7eff87273d5b..85fcbce1486e 100644 --- a/tools/pnnx/src/pass_level3/fuse_input_unpack.cpp +++ b/tools/pnnx/src/pass_level3/fuse_input_unpack.cpp @@ -26,7 +26,7 @@ void fuse_input_unpack(Graph& graph) { Operator* op = graph.ops[i]; - if (op->type != "prim::TupleUnpack") + if (op->type != "prim::TupleUnpack" && op->type != "prim::ListUnpack") continue; if (op->inputs.size() != 1) diff --git a/tools/pnnx/src/pass_level5.cpp b/tools/pnnx/src/pass_level5.cpp index 1de1d0079479..ae365f369df8 100644 --- a/tools/pnnx/src/pass_level5.cpp +++ b/tools/pnnx/src/pass_level5.cpp @@ -22,9 +22,10 @@ #include "pass_level5/eliminate_noop_expression.h" #include "pass_level5/eliminate_noop_pad.h" #include "pass_level5/eliminate_noop_upsample.h" -#include "pass_level5/eliminate_slice.h" -#include "pass_level5/eliminate_view_reshape.h" +#include "pass_level5/eliminate_noop_slice.h" +#include "pass_level5/eliminate_noop_view_reshape.h" #include "pass_level5/eval_expression.h" +#include "pass_level5/fuse_adjacent_reshape.h" #include "pass_level5/fuse_channel_shuffle.h" #include "pass_level5/fuse_constant_expression.h" #include "pass_level5/fuse_conv1d_batchnorm1d.h" @@ -33,10 +34,19 @@ #include "pass_level5/fuse_convtranspose2d_batchnorm2d.h" #include "pass_level5/fuse_contiguous_view.h" #include "pass_level5/fuse_linear_batchnorm1d.h" +#include "pass_level5/fuse_pad_conv1d.h" +#include "pass_level5/fuse_pad_conv2d.h" #include "pass_level5/fuse_select_to_unbind.h" +#include "pass_level5/fuse_slice_copy.h" #include "pass_level5/fuse_slice_indices.h" #include "pass_level5/fuse_slice_to_tensor_split.h" +#include "pass_level5/fuse_static_batchnorm.h" #include "pass_level5/fuse_static_conv.h" +#include "pass_level5/fuse_static_convtranspose.h" +#include "pass_level5/fuse_static_groupnorm.h" +#include "pass_level5/fuse_static_instancenorm.h" +#include "pass_level5/fuse_static_layernorm.h" +#include "pass_level5/fuse_static_linear.h" #include "pass_level5/normalize_einsum_equation.h" #include "pass_level4/dead_code_elimination.h" #include "pass_level4/canonicalize.h" @@ -44,15 +54,17 @@ namespace pnnx { -void pass_level5(Graph& g, const std::map& foldable_constants) +void pass_level5(Graph& g, const std::set& foldable_constants, const std::string& foldable_constants_zippath) { eval_expression(g); fuse_constant_expression(g); + fold_constants(g, foldable_constants, foldable_constants_zippath); + eliminate_noop_expression(g); - eliminate_slice(g); + eliminate_noop_slice(g); fuse_slice_indices(g); @@ -66,18 +78,26 @@ void pass_level5(Graph& g, const std::map& foldable_cons fuse_slice_to_tensor_split(g); + fuse_slice_copy(g); + + fuse_static_batchnorm(g); + fuse_static_groupnorm(g); + fuse_static_instancenorm(g); + fuse_static_layernorm(g); + fuse_static_conv(g); + fuse_static_convtranspose(g); + fuse_static_linear(g); fuse_conv1d_batchnorm1d(g); - fuse_conv2d_batchnorm2d(g); - fuse_convtranspose1d_batchnorm1d(g); - fuse_convtranspose2d_batchnorm2d(g); - fuse_linear_batchnorm1d(g); + fuse_pad_conv1d(g); + fuse_pad_conv2d(g); 
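Two related changes meet in this hunk: fuse_expression (above) now folds the full elementwise math vocabulary, unary functions such as abs, ceil, sin, tanh and trunc as well as atan2 and the shift operators, into a single pnnx.Expression (producing strings roughly of the form sqrt(div(@0,64)) instead of a chain of aten ops), and pass_level5 is resequenced so that fold_constants runs right after the expression passes, which makes the folded weights visible to the new fuse_static_* passes and to the fuse_pad_conv1d / fuse_pad_conv2d calls just above.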
+ eliminate_noop_pad(g); eliminate_noop_cat(g); @@ -88,11 +108,11 @@ void pass_level5(Graph& g, const std::map& foldable_cons fuse_contiguous_view(g); - eliminate_view_reshape(g); + fuse_adjacent_reshape(g); - fuse_channel_shuffle(g); + eliminate_noop_view_reshape(g); - fold_constants(g, foldable_constants); + fuse_channel_shuffle(g); fuse_index_expression(g); diff --git a/tools/pnnx/src/pass_level5.h b/tools/pnnx/src/pass_level5.h index fbf4ff486890..a040c7bf1457 100644 --- a/tools/pnnx/src/pass_level5.h +++ b/tools/pnnx/src/pass_level5.h @@ -19,7 +19,7 @@ namespace pnnx { -void pass_level5(Graph& g, const std::map& foldable_constants); +void pass_level5(Graph& g, const std::set& foldable_constants, const std::string& foldable_constants_zippath); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/eliminate_slice.cpp b/tools/pnnx/src/pass_level5/eliminate_noop_slice.cpp similarity index 94% rename from tools/pnnx/src/pass_level5/eliminate_slice.cpp rename to tools/pnnx/src/pass_level5/eliminate_noop_slice.cpp index 62939be2c422..5e31b7728972 100644 --- a/tools/pnnx/src/pass_level5/eliminate_slice.cpp +++ b/tools/pnnx/src/pass_level5/eliminate_noop_slice.cpp @@ -12,14 +12,15 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "eliminate_slice.h" +#include "eliminate_noop_slice.h" +#include #include #include "pass_level2.h" namespace pnnx { -void eliminate_slice(Graph& graph) +void eliminate_noop_slice(Graph& graph) { while (1) { @@ -44,7 +45,7 @@ void eliminate_slice(Graph& graph) int end = op->params.at("end").i; int step = op->params.at("step").i; - if (start == 0 && end == -1 && step == 1) + if (start == 0 && end == INT_MAX && step == 1) { // delete noop-like slice matched = true; diff --git a/tools/pnnx/src/pass_level5/eliminate_slice.h b/tools/pnnx/src/pass_level5/eliminate_noop_slice.h similarity index 94% rename from tools/pnnx/src/pass_level5/eliminate_slice.h rename to tools/pnnx/src/pass_level5/eliminate_noop_slice.h index a90ed96f4e98..162109d2a664 100644 --- a/tools/pnnx/src/pass_level5/eliminate_slice.h +++ b/tools/pnnx/src/pass_level5/eliminate_noop_slice.h @@ -16,6 +16,6 @@ namespace pnnx { -void eliminate_slice(Graph& graph); +void eliminate_noop_slice(Graph& graph); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/eliminate_view_reshape.cpp b/tools/pnnx/src/pass_level5/eliminate_noop_view_reshape.cpp similarity index 96% rename from tools/pnnx/src/pass_level5/eliminate_view_reshape.cpp rename to tools/pnnx/src/pass_level5/eliminate_noop_view_reshape.cpp index c3097bdb4439..e6b00e87b2a4 100644 --- a/tools/pnnx/src/pass_level5/eliminate_view_reshape.cpp +++ b/tools/pnnx/src/pass_level5/eliminate_noop_view_reshape.cpp @@ -12,14 +12,14 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. 
-#include "eliminate_view_reshape.h" +#include "eliminate_noop_view_reshape.h" #include #include "pass_level2.h" namespace pnnx { -void eliminate_view_reshape(Graph& graph) +void eliminate_noop_view_reshape(Graph& graph) { while (1) { diff --git a/tools/pnnx/src/pass_level5/eliminate_view_reshape.h b/tools/pnnx/src/pass_level5/eliminate_noop_view_reshape.h similarity index 94% rename from tools/pnnx/src/pass_level5/eliminate_view_reshape.h rename to tools/pnnx/src/pass_level5/eliminate_noop_view_reshape.h index e3996354484e..1d724d99c41f 100644 --- a/tools/pnnx/src/pass_level5/eliminate_view_reshape.h +++ b/tools/pnnx/src/pass_level5/eliminate_noop_view_reshape.h @@ -16,6 +16,6 @@ namespace pnnx { -void eliminate_view_reshape(Graph& graph); +void eliminate_noop_view_reshape(Graph& graph); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/eval_expression.cpp b/tools/pnnx/src/pass_level5/eval_expression.cpp index 4261e5e5f255..7326f1b32c95 100644 --- a/tools/pnnx/src/pass_level5/eval_expression.cpp +++ b/tools/pnnx/src/pass_level5/eval_expression.cpp @@ -45,20 +45,14 @@ static bool token_is_literal(const std::string& t) float f; iss >> std::noskipws >> f; return iss.eof() && !iss.fail(); +} - // for (size_t i = 0; i < t.size(); i++) - // { - // if (i == 0 && t[i] == '-') - // continue; - // - // if (t[i] < '0' || t[i] > '9') - // { - // if (t[i] != '.' && t[i] != 'e') - // return false; - // } - // } - // - // return true; +static bool token_is_interger_literal(const std::string& t) +{ + std::istringstream iss(t); + int f; + iss >> std::noskipws >> f; + return iss.eof() && !iss.fail(); } static std::string eval_expression(const Operator* op) @@ -126,7 +120,17 @@ static std::string eval_expression(const Operator* op) { int bi = std::stoi(b); int r = op->inputs[input_index]->shape[bi]; - exprstack.push(std::to_string(r)); + if (r == -1) + { + // do not evaluate dynamic size info as -1 + // just keep the size expression + std::string r = std::string("size(") + a + "," + b + ")"; + exprstack.push(r); + } + else + { + exprstack.push(std::to_string(r)); + } } } else @@ -135,7 +139,31 @@ static std::string eval_expression(const Operator* op) exprstack.push(r); } } - else if (t == "int" || t == "sqrt" || t == "rsqrt" || t == "neg") + else if (t == "int" + || t == "abs" + || t == "acos" + || t == "acosh" + || t == "asin" + || t == "asinh" + || t == "atan" + || t == "atanh" + || t == "ceil" + || t == "cos" + || t == "cosh" + || t == "exp" + || t == "floor" + || t == "log" + || t == "neg" + || t == "reciprocal" + || t == "rsqrt" + || t == "sign" + || t == "sin" + || t == "sinh" + || t == "sqrt" + || t == "square" + || t == "tan" + || t == "tanh" + || t == "trunc") { std::string a = exprstack.top(); exprstack.pop(); @@ -149,14 +177,69 @@ static std::string eval_expression(const Operator* op) int r = int(af); exprstack.push(std::to_string(r)); } - if (t == "sqrt") + if (t == "abs") { - float r = sqrt(af); + float r = abs(af); exprstack.push(std::to_string(r)); } - if (t == "rsqrt") + if (t == "acos") { - float r = 1.f / sqrt(af); + float r = acos(af); + exprstack.push(std::to_string(r)); + } + if (t == "acosh") + { + float r = acosh(af); + exprstack.push(std::to_string(r)); + } + if (t == "asin") + { + float r = asin(af); + exprstack.push(std::to_string(r)); + } + if (t == "asinh") + { + float r = asinh(af); + exprstack.push(std::to_string(r)); + } + if (t == "atan") + { + float r = atan(af); + exprstack.push(std::to_string(r)); + } + if (t == "atanh") + { + float r = atanh(af); + 
exprstack.push(std::to_string(r)); + } + if (t == "ceil") + { + float r = ceil(af); + exprstack.push(std::to_string(r)); + } + if (t == "cos") + { + float r = cos(af); + exprstack.push(std::to_string(r)); + } + if (t == "cosh") + { + float r = cosh(af); + exprstack.push(std::to_string(r)); + } + if (t == "exp") + { + float r = exp(af); + exprstack.push(std::to_string(r)); + } + if (t == "floor") + { + float r = floor(af); + exprstack.push(std::to_string(r)); + } + if (t == "log") + { + float r = log(af); exprstack.push(std::to_string(r)); } if (t == "neg") @@ -164,6 +247,56 @@ static std::string eval_expression(const Operator* op) float r = -af; exprstack.push(std::to_string(r)); } + if (t == "reciprocal") + { + float r = 1.f / af; + exprstack.push(std::to_string(r)); + } + if (t == "rsqrt") + { + float r = 1.f / sqrt(af); + exprstack.push(std::to_string(r)); + } + if (t == "sign") + { + float r = af > 0.f ? 1.f : (af == 0.f ? 0.f : -1.f); + exprstack.push(std::to_string(r)); + } + if (t == "sin") + { + float r = sin(af); + exprstack.push(std::to_string(r)); + } + if (t == "sinh") + { + float r = sinh(af); + exprstack.push(std::to_string(r)); + } + if (t == "sqrt") + { + float r = sqrt(af); + exprstack.push(std::to_string(r)); + } + if (t == "square") + { + float r = af * af; + exprstack.push(std::to_string(r)); + } + if (t == "tan") + { + float r = tan(af); + exprstack.push(std::to_string(r)); + } + if (t == "tanh") + { + float r = tanh(af); + exprstack.push(std::to_string(r)); + } + if (t == "trunc") + { + float r = trunc(af); + exprstack.push(std::to_string(r)); + } } else { @@ -171,7 +304,14 @@ static std::string eval_expression(const Operator* op) exprstack.push(r); } } - else if (t == "add" || t == "sub" || t == "mul" || t == "div" || t == "floor_divide" || t == "pow" || t == "remainder" || t == "and" || t == "or" || t == "xor") + else if (t == "atan2" + || t == "add" + || t == "sub" + || t == "mul" + || t == "div" + || t == "floor_divide" + || t == "pow" + || t == "remainder") { std::string a = exprstack.top(); exprstack.pop(); @@ -183,6 +323,11 @@ static std::string eval_expression(const Operator* op) float af = std::stof(a); float bf = std::stof(b); + if (t == "atan2") + { + float r = atan2(af, bf); + exprstack.push(std::to_string(r)); + } if (t == "add") { float r = af + bf; @@ -227,6 +372,50 @@ static std::string eval_expression(const Operator* op) exprstack.push(r); } } + else if (t == "and" || t == "or" || t == "xor" || t == "lshift" || t == "rshift") + { + std::string a = exprstack.top(); + exprstack.pop(); + std::string b = exprstack.top(); + exprstack.pop(); + + if (token_is_interger_literal(a) && token_is_interger_literal(b)) + { + int ai = std::stoi(a); + int bi = std::stoi(b); + + if (t == "and") + { + int r = ai & bi; + exprstack.push(std::to_string(r)); + } + if (t == "or") + { + int r = ai | bi; + exprstack.push(std::to_string(r)); + } + if (t == "xor") + { + int r = ai ^ bi; + exprstack.push(std::to_string(r)); + } + if (t == "lshift") + { + int r = ai << bi; + exprstack.push(std::to_string(r)); + } + if (t == "rshift") + { + int r = ai >> bi; + exprstack.push(std::to_string(r)); + } + } + else + { + std::string r = t + "(" + a + "," + b + ")"; + exprstack.push(r); + } + } else if (t == "[") // list { std::vector elements; diff --git a/tools/pnnx/src/pass_level5/fold_constants.cpp b/tools/pnnx/src/pass_level5/fold_constants.cpp index 51c8e71539f9..e5bccd49827f 100644 --- a/tools/pnnx/src/pass_level5/fold_constants.cpp +++ 
b/tools/pnnx/src/pass_level5/fold_constants.cpp @@ -15,12 +15,19 @@ #include "fold_constants.h" #include +#include "storezip.h" #include "pass_level4/dead_code_elimination.h" namespace pnnx { -void fold_constants(Graph& graph, const std::map& foldable_constants) +void fold_constants(Graph& graph, const std::set& foldable_constants, const std::string& foldable_constants_zippath) { + if (foldable_constants.empty()) + return; + + StoreZipReader zip; + zip.open(foldable_constants_zippath); + for (size_t i = 0; i < graph.operands.size(); i++) { Operand* operand = graph.operands[i]; @@ -36,13 +43,23 @@ void fold_constants(Graph& graph, const std::map& foldab // replace producer with attribute Operator* op_new = graph.new_operator_before("pnnx.Attribute", std::string("pnnx_fold_") + name, op); - op_new->attrs[std::string("pnnx_fold_") + name] = foldable_constants.at(name); + op_new->attrs[std::string("pnnx_fold_") + name] = Attribute(); + + Attribute& t2 = op_new->attrs[std::string("pnnx_fold_") + name]; + t2.type = operand->type; + t2.shape = operand->shape; + size_t size = zip.get_file_size(name); + t2.data.resize(size); + zip.read_file(name, t2.data.data()); + op_new->outputs.push_back(operand); operand->producer = op_new; op->outputs.clear(); } + zip.close(); + // dce dead_code_elimination(graph); } diff --git a/tools/pnnx/src/pass_level5/fold_constants.h b/tools/pnnx/src/pass_level5/fold_constants.h index 6ebffbda0646..0d96f9fbd0c7 100644 --- a/tools/pnnx/src/pass_level5/fold_constants.h +++ b/tools/pnnx/src/pass_level5/fold_constants.h @@ -16,6 +16,6 @@ namespace pnnx { -void fold_constants(Graph& graph, const std::map& foldable_constants); +void fold_constants(Graph& graph, const std::set& foldable_constants, const std::string& foldable_constants_zippath); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_adjacent_reshape.cpp b/tools/pnnx/src/pass_level5/fuse_adjacent_reshape.cpp new file mode 100644 index 000000000000..f85050721295 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_adjacent_reshape.cpp @@ -0,0 +1,105 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
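fold_constants now takes only the set of foldable operand names plus the path of a zip archive, and pulls each tensor payload through StoreZipReader into a freshly created pnnx.Attribute, so a single constant at a time needs to be resident in memory. A simplified, self-contained stand-in for the size-then-read step (plain std::ifstream instead of the zip reader, purely for illustration):

// read a raw payload by first sizing the destination buffer, then filling it;
// StoreZipReader's get_file_size()/read_file() pair in the diff above follows
// the same shape against a zip member instead of a loose file.
#include <fstream>
#include <string>
#include <vector>

static std::vector<char> load_payload(const std::string& path)
{
    std::ifstream f(path, std::ios::binary | std::ios::ate);
    std::vector<char> data;
    if (!f)
        return data;                                    // missing constant -> empty payload
    data.resize((size_t)f.tellg());                     // get_file_size() equivalent
    f.seekg(0);
    f.read(data.data(), (std::streamsize)data.size());  // read_file() equivalent
    return data;
}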
+ +#include "fuse_adjacent_reshape.h" + +#include +#include "pass_level2.h" + +namespace pnnx { + +void fuse_adjacent_reshape(Graph& graph) +{ + while (1) + { + bool matched = false; + + for (int i = (int)graph.ops.size() - 1; i > 0; i--) + { + Operator* op = graph.ops[i]; + + // look for Tensor.view / Tensor.reshape / torch.squeeze / torch.unsqueeze chain + if (op->type != "Tensor.view" && op->type != "Tensor.reshape" && op->type != "torch.squeeze" && op->type != "torch.unsqueeze") + continue; + + if ((op->type == "torch.squeeze" || op->type == "torch.unsqueeze") && op->outputs[0]->shape.empty()) + continue; + + std::vector reshapes_to_delete; + const Operand* in0 = op->inputs[0]; + while (in0->consumers.size() == 1 && (in0->producer->type == "Tensor.view" || in0->producer->type == "Tensor.reshape" || in0->producer->type == "torch.squeeze" || in0->producer->type == "torch.unsqueeze")) + { + reshapes_to_delete.push_back(in0->producer); + in0 = in0->producer->inputs[0]; + } + + if (reshapes_to_delete.empty()) + continue; + + // keep the last reshape only + matched = true; + + op->type = "Tensor.reshape"; + + if (!op->outputs[0]->shape.empty()) + { + op->params.clear(); + op->params["shape"] = op->outputs[0]->shape; + } + + for (auto& op0 : reshapes_to_delete) + { + for (auto& x : op0->inputs) + { + x->remove_consumer(op0); + } + + Operand* op0_in = op0->inputs[0]; + Operand* op0_out = op0->outputs[0]; + + for (auto& x : op0_out->consumers) + { + for (size_t j = 0; j < x->inputs.size(); j++) + { + if (x->inputs[j] == op0_out) + x->inputs[j] = op0_in; + } + + op0_in->consumers.push_back(x); + } + + op0_in->name = op0_out->name; + + op0_out->producer = 0; + op0_out->consumers.clear(); + + graph.operands.erase(std::find(graph.operands.begin(), graph.operands.end(), op0_out)); + delete op0_out; + + op0->inputs.clear(); + op0->outputs.clear(); + + graph.ops.erase(std::find(graph.ops.begin(), graph.ops.end(), op0)); + delete op0; + } + + break; + } + + if (!matched) + break; + } +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/convert_to_fp16_model.h b/tools/pnnx/src/pass_level5/fuse_adjacent_reshape.h similarity index 87% rename from tools/pnnx/src/pass_ncnn/convert_to_fp16_model.h rename to tools/pnnx/src/pass_level5/fuse_adjacent_reshape.h index 3f609d30c4e2..7f3fb51cdf30 100644 --- a/tools/pnnx/src/pass_ncnn/convert_to_fp16_model.h +++ b/tools/pnnx/src/pass_level5/fuse_adjacent_reshape.h @@ -12,14 +12,10 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#include "pass_ncnn.h" +#include "ir.h" namespace pnnx { -namespace ncnn { - -void convert_to_fp16_model(Graph& graph); - -} // namespace ncnn +void fuse_adjacent_reshape(Graph& graph); } // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_pad_conv1d.cpp b/tools/pnnx/src/pass_level5/fuse_pad_conv1d.cpp new file mode 100644 index 000000000000..2f1260061b5d --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_pad_conv1d.cpp @@ -0,0 +1,401 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "fuse_pad_conv1d.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_pad_conv1d_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +F.pad op_pad 1 1 input a mode=constant pad=%pad value=%value +nn.Conv1d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv1d"; + } + + const char* name_str() const + { + return "conv1d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // constant-0 + zeros + float pad_value = 0.f; + if (captured_params.at("value").type == 2) + pad_value = captured_params.at("value").i; + if (captured_params.at("value").type == 3) + pad_value = captured_params.at("value").f; + + if (pad_value != 0.f) + return false; + + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 2) + return false; + + if (pad.size() == 2 && pad[0] != pad[1]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + const std::vector& pad = captured_params.at("pad").ai; + std::vector padding = captured_params.at("padding").ai; + padding[0] += pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "zeros"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv1d_pass_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +F.pad op_pad 1 1 input a mode=%mode pad=%pad +nn.Conv1d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=* padding=(0,0) dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv1d"; + } + + const char* name_str() const + { + return "conv1d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // reflect/replicate + nopad + if (captured_params.at("mode").s != "reflect" && captured_params.at("mode").s != "replicate") + return false; + + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return 
false; + } + + if (pad.size() != 2) + return false; + + if (pad.size() == 2 && pad[0] != pad[1]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + const std::vector& pad = captured_params.at("pad").ai; + std::vector padding(1); + padding[0] = pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = captured_params.at("mode"); + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv1d_pass_2 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +nn.ConstantPad1d op_pad 1 1 input a padding=%pad value=%value +nn.Conv1d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv1d"; + } + + const char* name_str() const + { + return "conv1d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // constant-0 + zeros + float pad_value = 0.f; + if (captured_params.at("value").type == 2) + pad_value = captured_params.at("value").i; + if (captured_params.at("value").type == 3) + pad_value = captured_params.at("value").f; + + if (pad_value != 0.f) + return false; + + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 2) + return false; + + if (pad[0] != pad[1]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + std::vector padding = captured_params.at("padding").ai; + const std::vector& pad = captured_params.at("pad").ai; + padding[0] += pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "zeros"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv1d_pass_3 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +nn.ReplicationPad1d op_pad 1 1 input a padding=%pad +nn.Conv1d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=* padding=(0,0) dilation=%dilation groups=%groups bias=%bias @weight @bias 
+pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv1d"; + } + + const char* name_str() const + { + return "conv1d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // replicate + nopad + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 2) + return false; + + if (pad[0] != pad[1]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + std::vector padding(1); + const std::vector& pad = captured_params.at("pad").ai; + padding[0] = pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "replicate"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv1d_pass_4 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +nn.ReflectionPad1d op_pad 1 1 input a padding=%pad +nn.Conv1d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=* padding=(0,0) dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv1d"; + } + + const char* name_str() const + { + return "conv1d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // reflect + nopad + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 2) + return false; + + if (pad[0] != pad[1]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + std::vector padding(1); + const std::vector& pad = captured_params.at("pad").ai; + padding[0] = pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "reflect"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +void fuse_pad_conv1d(Graph& graph) +{ + fuse_pad_conv1d_pass a; + fuse_pad_conv1d_pass_1 b; + fuse_pad_conv1d_pass_2 c; + fuse_pad_conv1d_pass_3 d; + fuse_pad_conv1d_pass_4 e; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &b, opindex); + pnnx_graph_rewrite(graph, &c, opindex); + pnnx_graph_rewrite(graph, &d, opindex); + pnnx_graph_rewrite(graph, 
&e, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_pad_conv1d.h b/tools/pnnx/src/pass_level5/fuse_pad_conv1d.h new file mode 100644 index 000000000000..f121b340cb08 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_pad_conv1d.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +void fuse_pad_conv1d(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_pad_conv2d.cpp b/tools/pnnx/src/pass_level5/fuse_pad_conv2d.cpp new file mode 100644 index 000000000000..3723ed9c0e9a --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_pad_conv2d.cpp @@ -0,0 +1,500 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "fuse_pad_conv2d.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_pad_conv2d_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +F.pad op_pad 1 1 input a mode=constant pad=%pad value=%value +nn.Conv2d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv2d"; + } + + const char* name_str() const + { + return "conv2d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // constant-0 + zeros + float pad_value = 0.f; + if (captured_params.at("value").type == 2) + pad_value = captured_params.at("value").i; + if (captured_params.at("value").type == 3) + pad_value = captured_params.at("value").f; + + if (pad_value != 0.f) + return false; + + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 2 && pad.size() != 4) + return false; + + if (pad.size() == 2 && pad[0] != pad[1]) + return false; + + if (pad.size() == 4 && (pad[0] != pad[1] || pad[2] != pad[3])) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + const std::vector& pad = captured_params.at("pad").ai; + std::vector padding = captured_params.at("padding").ai; + + if (pad.size() == 2) + { + padding[1] += pad[0]; + } + else if (pad.size() == 4) + { + padding[0] += pad[2]; + padding[1] += pad[0]; + } + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "zeros"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv2d_pass_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +F.pad op_pad 1 1 input a mode=%mode pad=%pad +nn.Conv2d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=* padding=(0,0) dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv2d"; + } + + const char* name_str() const + { + return "conv2d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // reflect/replicate + nopad + if (captured_params.at("mode").s != "reflect" && captured_params.at("mode").s != "replicate") + return false; + + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 2 && pad.size() != 4) + return false; + + if (pad.size() == 2 && pad[0] != pad[1]) + return false; + + if (pad.size() == 4 && (pad[0] != pad[1] || pad[2] != pad[3])) + 
return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + const std::vector& pad = captured_params.at("pad").ai; + std::vector padding(2); + + if (pad.size() == 2) + { + padding[0] = 0; + padding[1] = pad[0]; + } + else if (pad.size() == 4) + { + padding[0] = pad[2]; + padding[1] = pad[0]; + } + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = captured_params.at("mode"); + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv2d_pass_2 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +nn.ConstantPad2d op_pad 1 1 input a padding=%pad value=%value +nn.Conv2d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=zeros padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv2d"; + } + + const char* name_str() const + { + return "conv2d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // constant-0 + zeros + float pad_value = 0.f; + if (captured_params.at("value").type == 2) + pad_value = captured_params.at("value").i; + if (captured_params.at("value").type == 3) + pad_value = captured_params.at("value").f; + + if (pad_value != 0.f) + return false; + + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 4) + return false; + + if (pad[0] != pad[1] || pad[2] != pad[3]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + std::vector padding = captured_params.at("padding").ai; + const std::vector& pad = captured_params.at("pad").ai; + padding[0] += pad[2]; + padding[1] += pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "zeros"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv2d_pass_3 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +nn.ZeroPad2d op_pad 1 1 input a padding=%pad +nn.Conv2d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride 
padding_mode=zeros padding=%padding dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv2d"; + } + + const char* name_str() const + { + return "conv2d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // constant-0 + zeros + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 4) + return false; + + if (pad[0] != pad[1] || pad[2] != pad[3]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + std::vector padding = captured_params.at("padding").ai; + const std::vector& pad = captured_params.at("pad").ai; + padding[0] += pad[2]; + padding[1] += pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "zeros"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv2d_pass_4 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +nn.ReplicationPad2d op_pad 1 1 input a padding=%pad +nn.Conv2d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=* padding=(0,0) dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv2d"; + } + + const char* name_str() const + { + return "conv2d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // replicate + nopad + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 4) + return false; + + if (pad[0] != pad[1] || pad[2] != pad[3]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + std::vector padding(2); + const std::vector& pad = captured_params.at("pad").ai; + padding[0] = pad[2]; + padding[1] = pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "replicate"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +class fuse_pad_conv2d_pass_5 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input 
+nn.ReflectionPad2d op_pad 1 1 input a padding=%pad +nn.Conv2d op_0 1 1 a out in_channels=%in_channels out_channels=%out_channels kernel_size=%kernel_size stride=%stride padding_mode=* padding=(0,0) dilation=%dilation groups=%groups bias=%bias @weight @bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv2d"; + } + + const char* name_str() const + { + return "conv2d"; + } + + bool match_captured_params_attrs(const std::map& captured_params) const + { + // reflect + nopad + const std::vector& pad = captured_params.at("pad").ai; + for (int x : pad) + { + if (x < 0) + return false; + } + + if (pad.size() != 4) + return false; + + if (pad[0] != pad[1] || pad[2] != pad[3]) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + std::vector padding(2); + const std::vector& pad = captured_params.at("pad").ai; + padding[0] = pad[2]; + padding[1] = pad[0]; + + op->params["in_channels"] = captured_params.at("in_channels"); + op->params["out_channels"] = captured_params.at("out_channels"); + op->params["kernel_size"] = captured_params.at("kernel_size"); + op->params["padding_mode"] = "reflect"; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = padding; + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = captured_params.at("bias"); + + op->attrs["weight"] = captured_attrs.at("op_0.weight"); + + if (captured_params.at("bias").b) + { + op->attrs["bias"] = captured_attrs.at("op_0.bias"); + } + } +}; + +void fuse_pad_conv2d(Graph& graph) +{ + fuse_pad_conv2d_pass a; + fuse_pad_conv2d_pass_1 b; + fuse_pad_conv2d_pass_2 c; + fuse_pad_conv2d_pass_3 d; + fuse_pad_conv2d_pass_4 e; + fuse_pad_conv2d_pass_5 f; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &b, opindex); + pnnx_graph_rewrite(graph, &c, opindex); + pnnx_graph_rewrite(graph, &d, opindex); + pnnx_graph_rewrite(graph, &e, opindex); + pnnx_graph_rewrite(graph, &f, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_pad_conv2d.h b/tools/pnnx/src/pass_level5/fuse_pad_conv2d.h new file mode 100644 index 000000000000..fb47be50ec71 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_pad_conv2d.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
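The 2d passes additionally translate index order: F.pad lists padding from the last dimension outward, so for an NCHW tensor pad is {left, right, top, bottom}, while nn.Conv2d's padding is ordered {height, width}; that is why padding[0] takes pad[2] and padding[1] takes pad[0] in the write() methods above. In isolation:

// map F.pad's {left, right, top, bottom} onto Conv2d's {height, width} padding
#include <array>
#include <stdio.h>

int main()
{
    std::array<int, 4> pad = {1, 1, 2, 2};               // symmetric {left, right, top, bottom}
    std::array<int, 2> conv_padding = {pad[2], pad[0]};   // {top/bottom, left/right}
    printf("padding = (%d, %d)\n", conv_padding[0], conv_padding[1]); // (2, 1)
    return 0;
}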
+ +#include "ir.h" + +namespace pnnx { + +void fuse_pad_conv2d(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_slice_copy.cpp b/tools/pnnx/src/pass_level5/fuse_slice_copy.cpp new file mode 100644 index 000000000000..0a5fabab7afc --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_slice_copy.cpp @@ -0,0 +1,279 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "fuse_slice_copy.h" + +#include +#include +#include +#include "pass_level2.h" + +namespace pnnx { + +void fuse_slice_copy(Graph& graph) +{ + while (1) + { + bool matched = false; + + for (size_t i = 0; i < graph.ops.size(); i++) + { + Operator* op = graph.ops[i]; + + if (op->type != "Tensor.copy") + continue; + + // collect slice / select op chain + std::stack slice_select_ops; + int descent_dim_current = INT_MAX; + const Operand* in0 = op->inputs[0]; + while (in0->producer->type == "Tensor.slice" || in0->producer->type == "Tensor.select") + { + const Operator* sop = in0->producer; + if (sop->type == "Tensor.slice") + { + if (sop->params.find("dims") == sop->params.end() + || sop->params.find("starts") == sop->params.end() + || sop->params.find("ends") == sop->params.end() + || sop->params.find("steps") == sop->params.end()) + { + fprintf(stderr, "dynamic index in slice copy chain is not supported\n"); + break; + } + + int dims0 = sop->params.at("dims").ai[0]; + if (descent_dim_current < dims0) + { + break; + } + + descent_dim_current = dims0; + } + + if (sop->type == "Tensor.select") + { + if (sop->params.find("dim") == sop->params.end() + || sop->params.find("index") == sop->params.end()) + { + fprintf(stderr, "dynamic index in select copy chain is not supported\n"); + break; + } + + int dim = sop->params.at("dim").i; + if (descent_dim_current < dim) + { + break; + } + + descent_dim_current = dim; + } + + slice_select_ops.push(sop); + in0 = sop->inputs[0]; + } + + matched = true; + + if (slice_select_ops.empty()) + { + // eliminate noop copy + Operand* out = op->outputs[0]; + + for (auto& x : out->consumers) + { + for (size_t j = 0; j < x->inputs.size(); j++) + { + if (x->inputs[j] == out) + x->inputs[j] = op->inputs[1]; + } + + op->inputs[1]->consumers.push_back(x); + } + + op->inputs[0]->remove_consumer(op); + op->inputs[1]->remove_consumer(op); + + op->inputs[1]->name = out->name; + + out->producer = 0; + out->consumers.clear(); + + graph.operands.erase(std::find(graph.operands.begin(), graph.operands.end(), out)); + delete out; + + op->inputs.clear(); + op->outputs.clear(); + + graph.ops.erase(graph.ops.begin() + i); + delete op; + + break; + } + + const Operator* top_sop = slice_select_ops.top(); + + // construct one-step slice + std::vector new_dims; + std::vector new_starts; + std::vector new_ends; + std::vector new_steps; + + int select_dims_offset = 0; + while (!slice_select_ops.empty()) + { + const Operator* sop = 
slice_select_ops.top(); + slice_select_ops.pop(); + + if (sop->type == "Tensor.slice") + { + std::vector dims = sop->params.at("dims").ai; + std::vector starts = sop->params.at("starts").ai; + std::vector ends = sop->params.at("ends").ai; + std::vector steps = sop->params.at("steps").ai; + + for (size_t j = 0; j < dims.size(); j++) + { + dims[j] += select_dims_offset; + } + + new_dims.insert(new_dims.end(), dims.begin(), dims.end()); + new_starts.insert(new_starts.end(), starts.begin(), starts.end()); + new_ends.insert(new_ends.end(), ends.begin(), ends.end()); + new_steps.insert(new_steps.end(), steps.begin(), steps.end()); + } + else if (sop->type == "Tensor.select") + { + int dim = sop->params.at("dim").i; + int index = sop->params.at("index").i; + + dim += select_dims_offset; + int end = index + 1; + if (index == -1) + end = INT_MAX; + + new_dims.push_back(dim); + new_starts.push_back(index); + new_ends.push_back(end); + new_steps.push_back(1); + + select_dims_offset += 1; + } + } + + op->type = "Tensor.slice_copy"; + + // insert clone before any slices + Operator* op_clone = graph.new_operator_before("Tensor.clone", op->name + "_ncnnclone", top_sop); + Operand* clone_out = graph.new_operand(op->name + "_ncnnclone_out"); + + clone_out->shape = top_sop->inputs[0]->shape; + + op_clone->inputs.push_back(top_sop->inputs[0]); + top_sop->inputs[0]->consumers.push_back(op_clone); + + op_clone->outputs.push_back(clone_out); + clone_out->producer = op_clone; + + op->inputs[0]->remove_consumer(op); + op->inputs[0] = clone_out; + clone_out->consumers.push_back(op); + + op->params["dims"] = new_dims; + op->params["starts"] = new_starts; + op->params["ends"] = new_ends; + op->params["steps"] = new_steps; + + int input_rank = (int)op->inputs[0]->shape.size(); + if (input_rank == 0) + { + // insert view_as(sliced) for different or unknown rank + Operator* op_slice = graph.new_operator_before("Tensor.slice", op->name + "_ncnnslice", op); + Operator* op_view_as = graph.new_operator_before("Tensor.view_as", op->name + "_ncnnview_as", op); + + Operand* slice_out = graph.new_operand(op->name + "_ncnnslice_out"); + Operand* view_as_out = graph.new_operand(op->name + "_ncnnview_as_out"); + + op_slice->params["dims"] = new_dims; + op_slice->params["starts"] = new_starts; + op_slice->params["ends"] = new_ends; + op_slice->params["steps"] = new_steps; + + op_slice->inputs.push_back(op->inputs[0]); + op->inputs[0]->consumers.push_back(op_slice); + + op_slice->outputs.push_back(slice_out); + slice_out->producer = op_slice; + + op_view_as->inputs.push_back(op->inputs[1]); + op->inputs[1]->consumers.push_back(op_view_as); + op->inputs[1]->remove_consumer(op); + op_view_as->inputs.push_back(slice_out); + slice_out->consumers.push_back(op_view_as); + + op_view_as->outputs.push_back(view_as_out); + view_as_out->producer = op_view_as; + + op->inputs[1] = view_as_out; + view_as_out->consumers.push_back(op); + } + else if (input_rank != (int)op->inputs[1]->shape.size()) + { + // solve the target shape + std::vector target_shape = op->inputs[0]->shape; + for (size_t j = 0; j < new_dims.size(); j++) + { + int dim = new_dims[j]; + int start = new_starts[j]; + int end = new_ends[j]; + int step = new_steps[j]; + + if (dim < 0) + dim = input_rank + dim; + if (start < 0) + start = target_shape[dim] + start; + if (end < 0) + end = target_shape[dim] + end; + if (end == INT_MAX) + end = target_shape[dim]; + + target_shape[dim] = (end - start + (step - 1)) / step; + } + + Operator* op_view = 
graph.new_operator_before("Tensor.view", op->name + "_ncnnview", op); + Operand* view_out = graph.new_operand(op->name + "_ncnnview_out"); + + op_view->params["shape"] = target_shape; + + view_out->shape = target_shape; + + op_view->inputs.push_back(op->inputs[1]); + op->inputs[1]->consumers.push_back(op_view); + op->inputs[1]->remove_consumer(op); + + op_view->outputs.push_back(view_out); + view_out->producer = op_view; + + op->inputs[1] = view_out; + view_out->consumers.push_back(op); + } + + break; + } + + if (!matched) + break; + } +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_slice_copy.h b/tools/pnnx/src/pass_level5/fuse_slice_copy.h new file mode 100644 index 000000000000..db3aef773594 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_slice_copy.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +void fuse_slice_copy(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_slice_to_tensor_split.cpp b/tools/pnnx/src/pass_level5/fuse_slice_to_tensor_split.cpp index 2162908b4282..6ccacd5628f7 100644 --- a/tools/pnnx/src/pass_level5/fuse_slice_to_tensor_split.cpp +++ b/tools/pnnx/src/pass_level5/fuse_slice_to_tensor_split.cpp @@ -14,6 +14,7 @@ #include "fuse_slice_to_tensor_split.h" +#include #include #include "pass_level2.h" @@ -102,7 +103,7 @@ void fuse_slice_to_tensor_split(Graph& graph) cur = op2; int end2 = op2->params.at("ends").ai[0]; - if (end2 == -1) + if (end2 == INT_MAX) { slice_n_ops.push_back(op2); full_dimsize_slice = true; diff --git a/tools/pnnx/src/pass_level5/fuse_static_batchnorm.cpp b/tools/pnnx/src/pass_level5/fuse_static_batchnorm.cpp new file mode 100644 index 000000000000..0a3b9fbe405b --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_batchnorm.cpp @@ -0,0 +1,384 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
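fuse_slice_copy collapses a Tensor.slice / Tensor.select chain feeding Tensor.copy into one Tensor.slice_copy, inserting a Tensor.clone ahead of the chain and, when ranks differ, a view whose shape is solved from the slice parameters; the related fuse_slice_to_tensor_split change switches the "slice to the end" sentinel from -1 to INT_MAX. The per-dimension extent computation used when solving that target shape, shown standalone:

// normalise negative / end-sentinel indices, then ceil-divide by the step,
// exactly as in the rank-mismatch branch above
#include <limits.h>
#include <stdio.h>

static int sliced_extent(int dim_size, int start, int end, int step)
{
    if (start < 0) start = dim_size + start;
    if (end < 0) end = dim_size + end;
    if (end == INT_MAX) end = dim_size;        // "slice to the end"
    return (end - start + (step - 1)) / step;  // ceil((end - start) / step)
}

int main()
{
    printf("%d\n", sliced_extent(10, 2, INT_MAX, 3)); // elements 2, 5, 8 -> 3
    return 0;
}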
+ +#include "fuse_static_batchnorm.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_static_Fbatchnorm_pass_1d : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_mean 0 1 running_mean @qwq +pnnx.Attribute op_var 0 1 running_var @qwq +F.batchnorm op_0 3 1 input running_mean running_var out weight=None bias=None eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.BatchNorm1d"; + } + + const char* name_str() const + { + return "batchnorm"; + } + + bool match(const std::map& matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 2 || input_rank == 3; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute running_mean; + Attribute running_var; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 8) == "op_mean.") + running_mean = x.second; + if (x.first.substr(0, 7) == "op_var.") + running_var = x.second; + } + + op->params["num_features"] = running_mean.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = false; + + op->attrs["running_mean"] = running_mean; + op->attrs["running_var"] = running_var; + } +}; + +class fuse_static_Fbatchnorm_pass_1d_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +7 6 +pnnx.Input input 0 1 input +pnnx.Attribute op_mean 0 1 running_mean @qwq +pnnx.Attribute op_var 0 1 running_var @qwq +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.batch_norm op_0 5 1 input running_mean running_var weight bias out eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.BatchNorm1d"; + } + + const char* name_str() const + { + return "batchnorm"; + } + + bool match(const std::map& matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 2 || input_rank == 3; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute running_mean; + Attribute running_var; + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 8) == "op_mean.") + running_mean = x.second; + if (x.first.substr(0, 7) == "op_var.") + running_var = x.second; + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["num_features"] = running_mean.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = true; + + op->attrs["running_mean"] = running_mean; + op->attrs["running_var"] = running_var; + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +class fuse_static_Fbatchnorm_pass_2d : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_mean 0 1 running_mean @qwq +pnnx.Attribute op_var 0 1 running_var @qwq +F.batchnorm op_0 3 1 input running_mean running_var out weight=None bias=None eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.BatchNorm2d"; + } + + const char* name_str() const + { + return "batchnorm"; + } + + bool match(const std::map& 
matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 4; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute running_mean; + Attribute running_var; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 8) == "op_mean.") + running_mean = x.second; + if (x.first.substr(0, 7) == "op_var.") + running_var = x.second; + } + + op->params["num_features"] = running_mean.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = false; + + op->attrs["running_mean"] = running_mean; + op->attrs["running_var"] = running_var; + } +}; + +class fuse_static_Fbatchnorm_pass_2d_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +7 6 +pnnx.Input input 0 1 input +pnnx.Attribute op_mean 0 1 running_mean @qwq +pnnx.Attribute op_var 0 1 running_var @qwq +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.batch_norm op_0 5 1 input running_mean running_var weight bias out eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.BatchNorm2d"; + } + + const char* name_str() const + { + return "batchnorm"; + } + + bool match(const std::map& matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 4; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute running_mean; + Attribute running_var; + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 8) == "op_mean.") + running_mean = x.second; + if (x.first.substr(0, 7) == "op_var.") + running_var = x.second; + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["num_features"] = running_mean.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = true; + + op->attrs["running_mean"] = running_mean; + op->attrs["running_var"] = running_var; + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +class fuse_static_Fbatchnorm_pass_3d : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_mean 0 1 running_mean @qwq +pnnx.Attribute op_var 0 1 running_var @qwq +F.batchnorm op_0 3 1 input running_mean running_var out weight=None bias=None eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.BatchNorm3d"; + } + + const char* name_str() const + { + return "batchnorm"; + } + + bool match(const std::map& matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 5; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute running_mean; + Attribute running_var; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 8) == "op_mean.") + running_mean = x.second; + if (x.first.substr(0, 7) == "op_var.") + running_var = x.second; + } + + op->params["num_features"] = running_mean.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = false; + + op->attrs["running_mean"] = running_mean; + op->attrs["running_var"] = running_var; + } +}; + +class 
fuse_static_Fbatchnorm_pass_3d_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +7 6 +pnnx.Input input 0 1 input +pnnx.Attribute op_mean 0 1 running_mean @qwq +pnnx.Attribute op_var 0 1 running_var @qwq +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.batch_norm op_0 5 1 input running_mean running_var weight bias out eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.BatchNorm3d"; + } + + const char* name_str() const + { + return "batchnorm"; + } + + bool match(const std::map& matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 5; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute running_mean; + Attribute running_var; + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 8) == "op_mean.") + running_mean = x.second; + if (x.first.substr(0, 7) == "op_var.") + running_var = x.second; + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["num_features"] = running_mean.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = true; + + op->attrs["running_mean"] = running_mean; + op->attrs["running_var"] = running_var; + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +void fuse_static_batchnorm(Graph& graph) +{ + fuse_static_Fbatchnorm_pass_1d a; + fuse_static_Fbatchnorm_pass_2d b; + fuse_static_Fbatchnorm_pass_3d c; + fuse_static_Fbatchnorm_pass_1d_1 a1; + fuse_static_Fbatchnorm_pass_2d_1 b1; + fuse_static_Fbatchnorm_pass_3d_1 c1; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &b, opindex); + pnnx_graph_rewrite(graph, &c, opindex); + pnnx_graph_rewrite(graph, &a1, opindex); + pnnx_graph_rewrite(graph, &b1, opindex); + pnnx_graph_rewrite(graph, &c1, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_batchnorm.h b/tools/pnnx/src/pass_level5/fuse_static_batchnorm.h new file mode 100644 index 000000000000..7ffc7ca2ce88 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_batchnorm.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
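fuse_static_batchnorm matches F.batch_norm whose running statistics (and optionally weight/bias) come from pnnx.Attribute operands and repackages it as nn.BatchNorm1d/2d/3d, choosing the module from the input rank (2-3, 4, or 5) and setting affine accordingly. For reference, the functional form being repackaged (a sketch, not pnnx code):

// per-element batch norm; with affine=false the weight/bias terms drop out
#include <math.h>
#include <stdio.h>

static float batch_norm_1(float x, float mean, float var, float eps,
                          float weight = 1.f, float bias = 0.f)
{
    return (x - mean) / sqrtf(var + eps) * weight + bias;
}

int main()
{
    printf("%f\n", batch_norm_1(3.f, 1.f, 4.f, 1e-5f)); // ~ (3 - 1) / 2 = 1.0
    return 0;
}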
+ +#include "ir.h" + +namespace pnnx { + +void fuse_static_batchnorm(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_conv.cpp b/tools/pnnx/src/pass_level5/fuse_static_conv.cpp index 7d5e256d9acb..6e29bcaaccce 100644 --- a/tools/pnnx/src/pass_level5/fuse_static_conv.cpp +++ b/tools/pnnx/src/pass_level5/fuse_static_conv.cpp @@ -120,6 +120,82 @@ pnnx.Output output 1 0 out } }; +class fuse_static_Fconv1d_pass_3 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.conv1d op_0 2 1 input weight a bias=None stride=%stride padding=%padding dilation=%dilation groups=%groups +pnnx.Expression op_1 2 1 a bias out expr=%expr +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv1d"; + } + + const char* name_str() const + { + return "conv1d"; + } + + bool match(const std::map& captured_params, const std::map& captured_attrs) const + { + const std::string& expr = captured_params.at("expr").s; + if (expr != "add(@0,@1)") + return false; + + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + int out_channels = weight.shape[0]; + if (bias.shape != std::vector{1, out_channels, 1}) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["in_channels"] = weight.shape[1] * captured_params.at("groups").i; + op->params["out_channels"] = weight.shape[0]; + op->params["kernel_size"] = std::vector{weight.shape[2]}; + op->params["padding_mode"] = std::string("zeros"); + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + class fuse_static_Fconv2d_pass : public GraphRewriterPass { public: @@ -219,6 +295,82 @@ pnnx.Output output 1 0 out } }; +class fuse_static_Fconv2d_pass_3 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.conv2d op_0 2 1 input weight a bias=None stride=%stride padding=%padding dilation=%dilation groups=%groups +pnnx.Expression op_1 2 1 a bias out expr=%expr +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv2d"; + } + + const char* name_str() const + { + return "conv2d"; + } + + bool match(const std::map& captured_params, const std::map& captured_attrs) const + { + const std::string& expr = captured_params.at("expr").s; + if (expr != "add(@0,@1)") + return false; + + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + int out_channels = 
weight.shape[0]; + if (bias.shape != std::vector{1, out_channels, 1, 1}) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["in_channels"] = weight.shape[1] * captured_params.at("groups").i; + op->params["out_channels"] = weight.shape[0]; + op->params["kernel_size"] = std::vector{weight.shape[2], weight.shape[3]}; + op->params["padding_mode"] = std::string("zeros"); + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + class fuse_static_Fconv3d_pass : public GraphRewriterPass { public: @@ -318,8 +470,88 @@ pnnx.Output output 1 0 out } }; +class fuse_static_Fconv3d_pass_3 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.conv3d op_0 2 1 input weight a bias=None stride=%stride padding=%padding dilation=%dilation groups=%groups +pnnx.Expression op_1 2 1 a bias out expr=%expr +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Conv3d"; + } + + const char* name_str() const + { + return "conv3d"; + } + + bool match(const std::map& captured_params, const std::map& captured_attrs) const + { + const std::string& expr = captured_params.at("expr").s; + if (expr != "add(@0,@1)") + return false; + + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + int out_channels = weight.shape[0]; + if (bias.shape != std::vector{1, out_channels, 1, 1, 1}) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["in_channels"] = weight.shape[1] * captured_params.at("groups").i; + op->params["out_channels"] = weight.shape[0]; + op->params["kernel_size"] = std::vector{weight.shape[2], weight.shape[3], weight.shape[4]}; + op->params["padding_mode"] = std::string("zeros"); + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["groups"] = captured_params.at("groups"); + op->params["bias"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + void fuse_static_conv(Graph& graph) { + fuse_static_Fconv1d_pass_3 a3; + fuse_static_Fconv2d_pass_3 a4; + fuse_static_Fconv3d_pass_3 a5; + fuse_static_Fconv1d_pass a; fuse_static_Fconv1d_pass_2 b; fuse_static_Fconv2d_pass c; @@ -328,6 +560,10 @@ void fuse_static_conv(Graph& graph) fuse_static_Fconv3d_pass_2 f; int opindex = 0; + pnnx_graph_rewrite(graph, &a3, opindex); + 
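The new F.convNd + pnnx.Expression patterns above only fire when the expression is exactly add(@0,@1) and the added attribute has shape {1, out_channels, 1, ...}, i.e. it broadcasts as a pure per-channel bias that can be absorbed into a biased nn.ConvNd. The shape test in isolation (2d case, stand-in function name):

// a {1, C, 1, 1} addend against an NCHW conv output is a per-channel bias
#include <stdio.h>
#include <vector>

static bool is_per_channel_bias_2d(const std::vector<int>& shape, int out_channels)
{
    return shape == std::vector<int>{1, out_channels, 1, 1};
}

int main()
{
    printf("%d\n", (int)is_per_channel_bias_2d({1, 64, 1, 1}, 64)); // 1 -> foldable
    printf("%d\n", (int)is_per_channel_bias_2d({64}, 64));          // 0 -> left alone
    return 0;
}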
pnnx_graph_rewrite(graph, &a4, opindex); + pnnx_graph_rewrite(graph, &a5, opindex); + pnnx_graph_rewrite(graph, &a, opindex); pnnx_graph_rewrite(graph, &b, opindex); pnnx_graph_rewrite(graph, &c, opindex); diff --git a/tools/pnnx/src/pass_level5/fuse_static_convtranspose.cpp b/tools/pnnx/src/pass_level5/fuse_static_convtranspose.cpp new file mode 100644 index 000000000000..6f6e164952ad --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_convtranspose.cpp @@ -0,0 +1,351 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "fuse_static_convtranspose.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_static_Fconvtranspose1d_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +F.conv_transpose1d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.ConvTranspose1d"; + } + + const char* name_str() const + { + return "conv_transpose1d"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + } + + const int groups = captured_params.at("groups").i; + + op->params["groups"] = groups; + op->params["in_channels"] = weight.shape[0]; + op->params["out_channels"] = weight.shape[1] * groups; + op->params["kernel_size"] = Parameter{weight.shape[2]}; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["output_padding"] = captured_params.at("output_padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["bias"] = false; + + op->attrs["weight"] = weight; + } +}; + +class fuse_static_Fconvtranspose1d_pass_2 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.conv_transpose1d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.ConvTranspose1d"; + } + + const char* name_str() const + { + return "conv_transpose1d"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if 
(x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + const int groups = captured_params.at("groups").i; + + op->params["groups"] = groups; + op->params["in_channels"] = weight.shape[0]; + op->params["out_channels"] = weight.shape[1] * groups; + op->params["kernel_size"] = Parameter{weight.shape[2]}; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["output_padding"] = captured_params.at("output_padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["bias"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +class fuse_static_Fconvtranspose2d_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +F.conv_transpose2d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.ConvTranspose2d"; + } + + const char* name_str() const + { + return "conv_transpose2d"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + } + + const int groups = captured_params.at("groups").i; + + op->params["groups"] = groups; + op->params["in_channels"] = weight.shape[0]; + op->params["out_channels"] = weight.shape[1] * groups; + op->params["kernel_size"] = Parameter{weight.shape[2], weight.shape[3]}; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["output_padding"] = captured_params.at("output_padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["bias"] = false; + + op->attrs["weight"] = weight; + } +}; + +class fuse_static_Fconvtranspose2d_pass_2 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.conv_transpose2d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.ConvTranspose2d"; + } + + const char* name_str() const + { + return "conv_transpose2d"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + const int groups = captured_params.at("groups").i; + + op->params["groups"] = groups; + op->params["in_channels"] = weight.shape[0]; + op->params["out_channels"] = weight.shape[1] * groups; + op->params["kernel_size"] = Parameter{weight.shape[2], weight.shape[3]}; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["output_padding"] = captured_params.at("output_padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["bias"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = 
bias; + } +}; + +class fuse_static_Fconvtranspose3d_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +F.conv_transpose3d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.ConvTranspose3d"; + } + + const char* name_str() const + { + return "conv_transpose3d"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + } + + const int groups = captured_params.at("groups").i; + + op->params["groups"] = groups; + op->params["in_channels"] = weight.shape[0]; + op->params["out_channels"] = weight.shape[1] * groups; + op->params["kernel_size"] = Parameter{weight.shape[2], weight.shape[3], weight.shape[4]}; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["output_padding"] = captured_params.at("output_padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["bias"] = false; + + op->attrs["weight"] = weight; + } +}; + +class fuse_static_Fconvtranspose3d_pass_2 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.conv_transpose3d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.ConvTranspose3d"; + } + + const char* name_str() const + { + return "conv_transpose3d"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + const int groups = captured_params.at("groups").i; + + op->params["groups"] = groups; + op->params["in_channels"] = weight.shape[0]; + op->params["out_channels"] = weight.shape[1] * groups; + op->params["kernel_size"] = Parameter{weight.shape[2], weight.shape[3], weight.shape[4]}; + op->params["stride"] = captured_params.at("stride"); + op->params["padding"] = captured_params.at("padding"); + op->params["output_padding"] = captured_params.at("output_padding"); + op->params["dilation"] = captured_params.at("dilation"); + op->params["bias"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +void fuse_static_convtranspose(Graph& graph) +{ + fuse_static_Fconvtranspose1d_pass a; + fuse_static_Fconvtranspose1d_pass_2 b; + fuse_static_Fconvtranspose2d_pass c; + fuse_static_Fconvtranspose2d_pass_2 d; + fuse_static_Fconvtranspose3d_pass e; + fuse_static_Fconvtranspose3d_pass_2 f; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &b, opindex); + pnnx_graph_rewrite(graph, &c, opindex); + pnnx_graph_rewrite(graph, &d, opindex); + pnnx_graph_rewrite(graph, &e, opindex); + pnnx_graph_rewrite(graph, &f, opindex); +} + +} // namespace pnnx 
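Note on the transposed-convolution fusions above: PyTorch stores an F.conv_transpose weight as (in_channels, out_channels / groups, kernel...), which is why these passes read in_channels from weight.shape[0] and out_channels from weight.shape[1] * groups, the reverse of the plain F.conv passes earlier in this patch. Below is a minimal standalone sketch of that derivation, assuming a plain shape vector; the struct and helper names are illustrative only and not part of pnnx.

// Illustrative sketch, not part of the patch.
// PyTorch weight layout for ConvTranspose1d: (in_channels, out_channels / groups, kW).
#include <cstdio>
#include <vector>

struct ConvTranspose1dHyperParams
{
    int in_channels;
    int out_channels;
    int kernel_w;
};

static ConvTranspose1dHyperParams deduce_from_weight(const std::vector<int>& weight_shape, int groups)
{
    ConvTranspose1dHyperParams p;
    p.in_channels = weight_shape[0];            // dim 0 holds in_channels
    p.out_channels = weight_shape[1] * groups;  // dim 1 holds out_channels / groups
    p.kernel_w = weight_shape[2];
    return p;
}

int main()
{
    // e.g. nn.ConvTranspose1d(16, 32, kernel_size=3, groups=4) has weight shape (16, 8, 3)
    ConvTranspose1dHyperParams p = deduce_from_weight({16, 8, 3}, 4);
    printf("in_channels=%d out_channels=%d kernel=%d\n", p.in_channels, p.out_channels, p.kernel_w);
    return 0;
}

The fused nn.ConvTranspose operators keep the weight attribute in this PyTorch layout; any reordering into ncnn's own output-channel-major layout is left to the later ncnn conversion passes rather than to these level-5 fusions.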
diff --git a/tools/pnnx/src/pass_level5/fuse_static_convtranspose.h b/tools/pnnx/src/pass_level5/fuse_static_convtranspose.h new file mode 100644 index 000000000000..2474074a1505 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_convtranspose.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +void fuse_static_convtranspose(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_groupnorm.cpp b/tools/pnnx/src/pass_level5/fuse_static_groupnorm.cpp new file mode 100644 index 000000000000..203168e2596d --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_groupnorm.cpp @@ -0,0 +1,79 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "fuse_static_groupnorm.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_static_Fgroupnorm_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.group_norm op_0 3 1 input weight bias out num_groups=%num_groups eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.GroupNorm"; + } + + const char* name_str() const + { + return "group_norm"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["num_channels"] = weight.shape[0]; + op->params["num_groups"] = captured_params.at("num_groups"); + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +void fuse_static_groupnorm(Graph& graph) +{ + fuse_static_Fgroupnorm_pass a; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_groupnorm.h b/tools/pnnx/src/pass_level5/fuse_static_groupnorm.h new file mode 100644 index 000000000000..2de65fa307b7 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_groupnorm.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +void fuse_static_groupnorm(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_instancenorm.cpp b/tools/pnnx/src/pass_level5/fuse_static_instancenorm.cpp new file mode 100644 index 000000000000..5bf08017f6db --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_instancenorm.cpp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "fuse_static_instancenorm.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_static_Finstancenorm_pass_1d : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.instance_norm op_0 3 1 input weight bias out running_mean=None running_var=None eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.InstanceNorm1d"; + } + + const char* name_str() const + { + return "instance_norm"; + } + + bool match(const std::map& matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 2 || input_rank == 3; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["num_features"] = weight.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = true; + op->params["track_running_stats"] = false; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +class fuse_static_Finstancenorm_pass_2d : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.instance_norm op_0 3 1 input weight bias out running_mean=None running_var=None eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.InstanceNorm1d"; + } + + const char* name_str() const + { + return "instance_norm"; + } + + bool match(const std::map& matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 4; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["num_features"] = weight.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = true; + op->params["track_running_stats"] = false; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +class fuse_static_Finstancenorm_pass_3d : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.instance_norm op_0 3 1 input weight bias out running_mean=None running_var=None eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.InstanceNorm1d"; + } + + const char* name_str() const + { + return "instance_norm"; + } + + bool match(const std::map& matched_operators) const + { + int input_rank = matched_operators.at("op_0")->inputs[0]->shape.size(); + return input_rank == 5; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if 
(x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["num_features"] = weight.shape[0]; + op->params["eps"] = captured_params.at("eps"); + op->params["affine"] = true; + op->params["track_running_stats"] = false; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +void fuse_static_instancenorm(Graph& graph) +{ + fuse_static_Finstancenorm_pass_1d a; + fuse_static_Finstancenorm_pass_2d b; + fuse_static_Finstancenorm_pass_3d c; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &b, opindex); + pnnx_graph_rewrite(graph, &c, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_instancenorm.h b/tools/pnnx/src/pass_level5/fuse_static_instancenorm.h new file mode 100644 index 000000000000..df71b0e52a75 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_instancenorm.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +void fuse_static_instancenorm(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_layernorm.cpp b/tools/pnnx/src/pass_level5/fuse_static_layernorm.cpp new file mode 100644 index 000000000000..d6c494f089d8 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_layernorm.cpp @@ -0,0 +1,78 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "fuse_static_layernorm.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_static_Flayernorm_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.layer_norm op_0 3 1 input weight bias out normalized_shape=%normalized_shape eps=%eps +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.LayerNorm"; + } + + const char* name_str() const + { + return "layer_norm"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["normalized_shape"] = captured_params.at("normalized_shape"); + op->params["eps"] = captured_params.at("eps"); + op->params["elementwise_affine"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +void fuse_static_layernorm(Graph& graph) +{ + fuse_static_Flayernorm_pass a; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_layernorm.h b/tools/pnnx/src/pass_level5/fuse_static_layernorm.h new file mode 100644 index 000000000000..e61f254d2b58 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_layernorm.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +void fuse_static_layernorm(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_linear.cpp b/tools/pnnx/src/pass_level5/fuse_static_linear.cpp new file mode 100644 index 000000000000..a34177e20ee3 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_linear.cpp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "fuse_static_linear.h" + +#include "pass_level2.h" + +#include +#include + +namespace pnnx { + +class fuse_static_Flinear_pass : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +F.linear op_0 2 1 input weight out bias=None +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Linear"; + } + + const char* name_str() const + { + return "linear"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + } + + op->params["in_features"] = weight.shape[1]; + op->params["out_features"] = weight.shape[0]; + op->params["bias"] = false; + + op->attrs["weight"] = weight; + } +}; + +class fuse_static_Flinear_pass_2 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.linear op_0 3 1 input weight bias out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Linear"; + } + + const char* name_str() const + { + return "linear"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["in_features"] = weight.shape[1]; + op->params["out_features"] = weight.shape[0]; + op->params["bias"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; + +class fuse_static_Flinear_pass_3 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +6 5 +pnnx.Input input 0 1 input +pnnx.Attribute op_weight 0 1 weight @qwq +pnnx.Attribute op_bias 0 1 bias @qwq +F.linear op_0 2 1 input weight a bias=None stride=%stride padding=%padding dilation=%dilation groups=%groups +pnnx.Expression op_1 2 1 a bias out expr=%expr +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "nn.Linear"; + } + + const char* name_str() const + { + return "linear"; + } + + bool match(const std::map& captured_params, const std::map& captured_attrs) const + { + const std::string& expr = captured_params.at("expr").s; + if (expr != "add(@0,@1)") + return false; + + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + int out_channels = weight.shape[0]; + if (bias.shape != std::vector{1, out_channels, 1}) + return false; + + return true; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + Attribute weight; + Attribute bias; + for (const auto& x : captured_attrs) + { + if (x.first.substr(0, 10) == "op_weight.") + weight = x.second; + if (x.first.substr(0, 8) == "op_bias.") + bias = x.second; + } + + op->params["in_features"] = weight.shape[1]; + op->params["out_features"] = weight.shape[0]; + op->params["bias"] = true; + + op->attrs["weight"] = weight; + op->attrs["bias"] = bias; + } +}; 
+ +void fuse_static_linear(Graph& graph) +{ + fuse_static_Flinear_pass_3 a3; + + fuse_static_Flinear_pass a; + fuse_static_Flinear_pass_2 b; + int opindex = 0; + + pnnx_graph_rewrite(graph, &a3, opindex); + + pnnx_graph_rewrite(graph, &a, opindex); + pnnx_graph_rewrite(graph, &b, opindex); +} + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/fuse_static_linear.h b/tools/pnnx/src/pass_level5/fuse_static_linear.h new file mode 100644 index 000000000000..8c26f924c166 --- /dev/null +++ b/tools/pnnx/src/pass_level5/fuse_static_linear.h @@ -0,0 +1,21 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +void fuse_static_linear(Graph& graph); + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_level5/unroll_rnn_op.cpp b/tools/pnnx/src/pass_level5/unroll_rnn_op.cpp index c832353be229..2fda02423099 100644 --- a/tools/pnnx/src/pass_level5/unroll_rnn_op.cpp +++ b/tools/pnnx/src/pass_level5/unroll_rnn_op.cpp @@ -42,6 +42,7 @@ void unroll_rnn_op(Graph& graph) bool has_output_hidden = op->outputs.size() >= 2; bool has_output_cell = op->outputs.size() == 3; const int hidden_size = op->params["hidden_size"].i; + const int proj_size = (op->type == "nn.LSTM") ? op->params["proj_size"].i : 0; bool has_bias = op->params["bias"].b; bool is_bidirectional = op->params["bidirectional"].b; @@ -116,7 +117,14 @@ void unroll_rnn_op(Graph& graph) } else { - op1->params["input_size"] = is_bidirectional ? hidden_size * 2 : hidden_size; + if (proj_size) + { + op1->params["input_size"] = is_bidirectional ? proj_size * 2 : proj_size; + } + else + { + op1->params["input_size"] = is_bidirectional ? 
hidden_size * 2 : hidden_size; + } op1->inputs.push_back(unrolled_ops[j - 1]->outputs[0]); op1->inputs[0]->consumers.push_back(op1); @@ -171,6 +179,11 @@ void unroll_rnn_op(Graph& graph) op1->attrs["bias_ih_l0"] = op->attrs["bias_ih_l" + std::to_string(j)]; } + if (proj_size) + { + op1->attrs["weight_hr_l0"] = op->attrs["weight_hr_l" + std::to_string(j)]; + } + if (is_bidirectional) { op1->attrs["weight_hh_l0_reverse"] = op->attrs["weight_hh_l" + std::to_string(j) + "_reverse"]; @@ -181,6 +194,11 @@ void unroll_rnn_op(Graph& graph) op1->attrs["bias_hh_l0_reverse"] = op->attrs["bias_hh_l" + std::to_string(j) + "_reverse"]; op1->attrs["bias_ih_l0_reverse"] = op->attrs["bias_ih_l" + std::to_string(j) + "_reverse"]; } + + if (proj_size) + { + op1->attrs["weight_hr_l0_reverse"] = op->attrs["weight_hr_l" + std::to_string(j) + "_reverse"]; + } } unrolled_ops[j] = op1; diff --git a/tools/pnnx/src/pass_ncnn.cpp b/tools/pnnx/src/pass_ncnn.cpp index 603b6f20705f..309692f29429 100644 --- a/tools/pnnx/src/pass_ncnn.cpp +++ b/tools/pnnx/src/pass_ncnn.cpp @@ -31,7 +31,6 @@ #include "pass_ncnn/insert_split.h" #include "pass_ncnn/chain_multi_output.h" #include "pass_ncnn/solve_batch_index.h" -#include "pass_ncnn/convert_to_fp16_model.h" #include "pass_ncnn/eliminate_noop.h" #include "pass_ncnn/eliminate_tail_reshape_permute.h" @@ -137,8 +136,6 @@ void pass_ncnn(Graph& g) ncnn::convert_input(g); ncnn::eliminate_output(g); - - ncnn::convert_to_fp16_model(g); } } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_conv1d.cpp b/tools/pnnx/src/pass_ncnn/F_conv1d.cpp index 0d969caca48f..c861842b95fb 100644 --- a/tools/pnnx/src/pass_ncnn/F_conv1d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_conv1d.cpp @@ -18,254 +18,6 @@ namespace pnnx { namespace ncnn { -class F_conv1d : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv1d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Convolution1D"; - } - - const char* name_str() const - { - return "conv1d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv1d, 20) - -class F_conv1d_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv1d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation groups=1 
-pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Convolution1D"; - } - - const char* name_str() const - { - return "conv1d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv1d_1, 20) - -class F_conv1d_2 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv1d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "ConvolutionDepthWise1D"; - } - - const char* name_str() const - { - return "convdw1d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = captured_params.at("groups"); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv1d_2, 21) - -class F_conv1d_3 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv1d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "ConvolutionDepthWise1D"; - } - - const char* name_str() const - { - return "convdw1d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if 
(x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = captured_params.at("groups"); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv1d_3, 21) - class F_conv1d_4 : public GraphRewriterPass { public: diff --git a/tools/pnnx/src/pass_ncnn/F_conv2d.cpp b/tools/pnnx/src/pass_ncnn/F_conv2d.cpp index 0814a470957d..8480b80aa28e 100644 --- a/tools/pnnx/src/pass_ncnn/F_conv2d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_conv2d.cpp @@ -18,270 +18,6 @@ namespace pnnx { namespace ncnn { -class F_conv2d : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv2d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Convolution"; - } - - const char* name_str() const - { - return "conv2d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[3]; - op->params["11"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[1]; - op->params["12"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[1]; - op->params["13"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[1]; - op->params["14"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv2d, 20) - -class F_conv2d_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv2d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Convolution"; - } - - const char* name_str() const - { - return "conv2d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if 
(x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[3]; - op->params["11"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[1]; - op->params["12"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[1]; - op->params["13"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[1]; - op->params["14"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv2d_1, 20) - -class F_conv2d_2 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv2d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "ConvolutionDepthWise"; - } - - const char* name_str() const - { - return "convdw2d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[3]; - op->params["11"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[1]; - op->params["12"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[1]; - op->params["13"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[1]; - op->params["14"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = captured_params.at("groups"); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv2d_2, 21) - -class F_conv2d_3 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv2d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "ConvolutionDepthWise"; - } - - const char* name_str() const - { - return "convdw2d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 
10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[3]; - op->params["11"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[1]; - op->params["12"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[1]; - op->params["13"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[1]; - op->params["14"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = captured_params.at("groups"); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv2d_3, 21) - class F_conv2d_4 : public GraphRewriterPass { public: diff --git a/tools/pnnx/src/pass_ncnn/F_conv3d.cpp b/tools/pnnx/src/pass_ncnn/F_conv3d.cpp index 317e220a0b22..890f36cc92a0 100644 --- a/tools/pnnx/src/pass_ncnn/F_conv3d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_conv3d.cpp @@ -18,286 +18,6 @@ namespace pnnx { namespace ncnn { -class F_conv3d : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv3d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Convolution3D"; - } - - const char* name_str() const - { - return "conv3d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[4]; - op->params["11"] = weight.shape[3]; - op->params["21"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[2]; - op->params["12"] = captured_params.at("dilation").ai[1]; - op->params["22"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[2]; - op->params["13"] = captured_params.at("stride").ai[1]; - op->params["23"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[2]; - op->params["14"] = captured_params.at("padding").ai[1]; - op->params["24"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv3d, 20) - -class F_conv3d_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias 
@qwq -F.conv3d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Convolution3D"; - } - - const char* name_str() const - { - return "conv3d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[4]; - op->params["11"] = weight.shape[3]; - op->params["21"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[2]; - op->params["12"] = captured_params.at("dilation").ai[1]; - op->params["22"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[2]; - op->params["13"] = captured_params.at("stride").ai[1]; - op->params["23"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[2]; - op->params["14"] = captured_params.at("padding").ai[1]; - op->params["24"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv3d_1, 20) - -class F_conv3d_2 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv3d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "ConvolutionDepthWise3D"; - } - - const char* name_str() const - { - return "convdw3d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[4]; - op->params["11"] = weight.shape[3]; - op->params["21"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[2]; - op->params["12"] = captured_params.at("dilation").ai[1]; - op->params["22"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[2]; - op->params["13"] = captured_params.at("stride").ai[1]; - op->params["23"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[2]; - op->params["14"] = captured_params.at("padding").ai[1]; - op->params["24"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = captured_params.at("groups"); - - op->attrs["0"] 
= Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv3d_2, 21) - -class F_conv3d_3 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv3d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "ConvolutionDepthWise3D"; - } - - const char* name_str() const - { - return "convdw3d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = weight.shape[4]; - op->params["11"] = weight.shape[3]; - op->params["21"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[2]; - op->params["12"] = captured_params.at("dilation").ai[1]; - op->params["22"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[2]; - op->params["13"] = captured_params.at("stride").ai[1]; - op->params["23"] = captured_params.at("stride").ai[0]; - if (captured_params.at("padding").type == 4) - { - if (captured_params.at("padding").s == "same") - op->params["4"] = -233; - else if (captured_params.at("padding").s == "valid") - op->params["4"] = 0; - } - else - { - op->params["4"] = captured_params.at("padding").ai[2]; - op->params["14"] = captured_params.at("padding").ai[1]; - op->params["24"] = captured_params.at("padding").ai[0]; - } - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = captured_params.at("groups"); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv3d_3, 21) - } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_conv_transpose1d.cpp b/tools/pnnx/src/pass_ncnn/F_conv_transpose1d.cpp index fd121d3c2293..5901522afca7 100644 --- a/tools/pnnx/src/pass_ncnn/F_conv_transpose1d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_conv_transpose1d.cpp @@ -18,332 +18,6 @@ namespace pnnx { namespace ncnn { -class F_conv_transpose1d : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv_transpose1d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Deconvolution1D"; - } - - const char* name_str() const - { - return "conv_transpose1d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[1]; - op->params["1"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[0]; - 
op->params["4"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - // transpose inch-outch-kw to outch-inch-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1]; - const int kw = weight.shape[2]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch * inch * kw); - float* w2 = (float*)new_weight.data(); - - // reorder weight from inch-outch to outch-inch - for (int i = 0; i < outch; i++) - { - for (int j = 0; j < inch; j++) - { - for (int k = 0; k < kw; k++) - { - w2[(i * inch + j) * kw + k] = w[(j * outch + i) * kw + k]; - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch, inch, kw}, new_weight); - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose1d, 20) - -class F_conv_transpose1d_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv_transpose1d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Deconvolution1D"; - } - - const char* name_str() const - { - return "conv_transpose1d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[1]; - op->params["1"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - // transpose inch-outch-kw to outch-inch-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1]; - const int kw = weight.shape[2]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch * inch * kw); - float* w2 = (float*)new_weight.data(); - - // reorder weight from inch-outch to outch-inch - for (int i = 0; i < outch; i++) - { - for (int j = 0; j < inch; j++) - { - for (int k = 0; k < kw; k++) - { - w2[(i * inch + j) * kw + k] = w[(j * outch + i) * kw + k]; - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch, inch, kw}, new_weight); - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose1d_1, 20) - -class F_conv_transpose1d_2 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv_transpose1d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return 
"DeconvolutionDepthWise1D"; - } - - const char* name_str() const - { - return "deconvdw1d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - const int groups = captured_params.at("groups").i; - - op->params["0"] = weight.shape[1] * groups; - op->params["1"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = groups; - - // transpose group-inch/group-outch/group-kw to group-outch/group-inch/group-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1] * groups; - const int kw = weight.shape[2]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch / groups * inch * kw); - float* w2 = (float*)new_weight.data(); - const int outch_g = outch / groups; - const int inch_g = inch / groups; - - for (int g = 0; g < groups; g++) - { - // reorder weight from inch-outch to outch-inch - float* wg2 = w2 + g * outch_g * inch_g * kw; - const float* wg = w + g * inch_g * outch_g * kw; - for (int i = 0; i < outch_g; i++) - { - for (int j = 0; j < inch_g; j++) - { - for (int k = 0; k < kw; k++) - { - wg2[(i * inch_g + j) * kw + k] = wg[(j * outch_g + i) * kw + k]; - } - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch / groups, inch, kw}, new_weight); - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose1d_2, 21) - -class F_conv_transpose1d_3 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv_transpose1d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "DeconvolutionDepthWise1D"; - } - - const char* name_str() const - { - return "deconvdw1d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - const int groups = captured_params.at("groups").i; - - op->params["0"] = weight.shape[1] * groups; - op->params["1"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = groups; - - // transpose group-inch/group-outch/group-kw to group-outch/group-inch/group-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1] * groups; - const int kw = weight.shape[2]; - std::vector new_weight; - { - const float* w = (const 
float*)weight.data.data(); - - new_weight.resize(outch / groups * inch * kw); - float* w2 = (float*)new_weight.data(); - const int outch_g = outch / groups; - const int inch_g = inch / groups; - - for (int g = 0; g < groups; g++) - { - // reorder weight from inch-outch to outch-inch - float* wg2 = w2 + g * outch_g * inch_g * kw; - const float* wg = w + g * inch_g * outch_g * kw; - for (int i = 0; i < outch_g; i++) - { - for (int j = 0; j < inch_g; j++) - { - for (int k = 0; k < kw; k++) - { - wg2[(i * inch_g + j) * kw + k] = wg[(j * outch_g + i) * kw + k]; - } - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch / groups, inch, kw}, new_weight); - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose1d_3, 21) - } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_conv_transpose2d.cpp b/tools/pnnx/src/pass_ncnn/F_conv_transpose2d.cpp index fc9f9e75fac4..890f36cc92a0 100644 --- a/tools/pnnx/src/pass_ncnn/F_conv_transpose2d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_conv_transpose2d.cpp @@ -18,360 +18,6 @@ namespace pnnx { namespace ncnn { -class F_conv_transpose2d : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv_transpose2d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Deconvolution"; - } - - const char* name_str() const - { - return "conv_transpose2d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[1]; - op->params["1"] = weight.shape[3]; - op->params["11"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[1]; - op->params["12"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[1]; - op->params["13"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[1]; - op->params["14"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[1]; - op->params["19"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - // transpose inch-outch-kh-kw to outch-inch-kh-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1]; - const int kh = weight.shape[2]; - const int kw = weight.shape[3]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch * inch * kh * kw); - float* w2 = (float*)new_weight.data(); - const int maxk = kh * kw; - - // reorder weight from inch-outch to outch-inch - for (int i = 0; i < outch; i++) - { - for (int j = 0; j < inch; j++) - { - for (int k = 0; k < maxk; k++) - { - w2[(i * inch + j) * maxk + k] = w[(j * outch + i) * maxk + k]; - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch, inch, kh, kw}, new_weight); - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose2d, 20) - -class F_conv_transpose2d_1 : public GraphRewriterPass 
-{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv_transpose2d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Deconvolution"; - } - - const char* name_str() const - { - return "conv_transpose2d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[1]; - op->params["1"] = weight.shape[3]; - op->params["11"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[1]; - op->params["12"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[1]; - op->params["13"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[1]; - op->params["14"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[1]; - op->params["19"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - // transpose inch-outch-kh-kw to outch-inch-kh-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1]; - const int kh = weight.shape[2]; - const int kw = weight.shape[3]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch * inch * kh * kw); - float* w2 = (float*)new_weight.data(); - const int maxk = kh * kw; - - // reorder weight from inch-outch to outch-inch - for (int i = 0; i < outch; i++) - { - for (int j = 0; j < inch; j++) - { - for (int k = 0; k < maxk; k++) - { - w2[(i * inch + j) * maxk + k] = w[(j * outch + i) * maxk + k]; - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch, inch, kh, kw}, new_weight); - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose2d_1, 20) - -class F_conv_transpose2d_2 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv_transpose2d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "DeconvolutionDepthWise"; - } - - const char* name_str() const - { - return "deconvdw2d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - const int groups = captured_params.at("groups").i; - - op->params["0"] = weight.shape[1] * groups; - op->params["1"] = weight.shape[3]; - op->params["11"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[1]; - op->params["12"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[1]; - op->params["13"] = 
captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[1]; - op->params["14"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[1]; - op->params["19"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = groups; - - // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1] * groups; - const int kh = weight.shape[2]; - const int kw = weight.shape[3]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch / groups * inch * kh * kw); - float* w2 = (float*)new_weight.data(); - const int outch_g = outch / groups; - const int inch_g = inch / groups; - const int maxk = kh * kw; - - for (int g = 0; g < groups; g++) - { - // reorder weight from inch-outch to outch-inch - float* wg2 = w2 + g * outch_g * inch_g * maxk; - const float* wg = w + g * inch_g * outch_g * maxk; - for (int i = 0; i < outch_g; i++) - { - for (int j = 0; j < inch_g; j++) - { - for (int k = 0; k < maxk; k++) - { - wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k]; - } - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch / groups, inch, kh, kw}, new_weight); - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose2d_2, 21) - -class F_conv_transpose2d_3 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv_transpose2d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "DeconvolutionDepthWise"; - } - - const char* name_str() const - { - return "deconvdw2d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - const int groups = captured_params.at("groups").i; - - op->params["0"] = weight.shape[1] * groups; - op->params["1"] = weight.shape[3]; - op->params["11"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[1]; - op->params["12"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[1]; - op->params["13"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[1]; - op->params["14"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[1]; - op->params["19"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = groups; - - // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1] * groups; - const int kh = weight.shape[2]; - const int kw = weight.shape[3]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - 
new_weight.resize(outch / groups * inch * kh * kw); - float* w2 = (float*)new_weight.data(); - const int outch_g = outch / groups; - const int inch_g = inch / groups; - const int maxk = kh * kw; - - for (int g = 0; g < groups; g++) - { - // reorder weight from inch-outch to outch-inch - float* wg2 = w2 + g * outch_g * inch_g * maxk; - const float* wg = w + g * inch_g * outch_g * maxk; - for (int i = 0; i < outch_g; i++) - { - for (int j = 0; j < inch_g; j++) - { - for (int k = 0; k < maxk; k++) - { - wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k]; - } - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch / groups, inch, kh, kw}, new_weight); - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose2d_3, 21) - } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_conv_transpose3d.cpp b/tools/pnnx/src/pass_ncnn/F_conv_transpose3d.cpp index 80017555231f..890f36cc92a0 100644 --- a/tools/pnnx/src/pass_ncnn/F_conv_transpose3d.cpp +++ b/tools/pnnx/src/pass_ncnn/F_conv_transpose3d.cpp @@ -18,384 +18,6 @@ namespace pnnx { namespace ncnn { -class F_conv_transpose3d : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv_transpose3d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Deconvolution3D"; - } - - const char* name_str() const - { - return "conv_transpose3d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[1]; - op->params["1"] = weight.shape[4]; - op->params["11"] = weight.shape[3]; - op->params["21"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[2]; - op->params["12"] = captured_params.at("dilation").ai[1]; - op->params["22"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[2]; - op->params["13"] = captured_params.at("stride").ai[1]; - op->params["23"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[2]; - op->params["14"] = captured_params.at("padding").ai[1]; - op->params["24"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[2]; - op->params["19"] = captured_params.at("output_padding").ai[1]; - op->params["20"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - // transpose inch-outch-kd-kh-kw to outch-inch-kd-kh-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1]; - const int kd = weight.shape[2]; - const int kh = weight.shape[3]; - const int kw = weight.shape[4]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch * inch * kd * kh * kw); - float* w2 = (float*)new_weight.data(); - const int maxk = kd * kh * kw; - - // reorder weight from inch-outch to outch-inch - for (int i = 0; i < outch; i++) - { - for (int j = 0; j < inch; j++) - { - for (int k = 0; k < maxk; k++) - { - w2[(i * 
inch + j) * maxk + k] = w[(j * outch + i) * maxk + k]; - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch, inch, kd, kh, kw}, new_weight); - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose3d, 20) - -class F_conv_transpose3d_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv_transpose3d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=1 -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "Deconvolution3D"; - } - - const char* name_str() const - { - return "conv_transpose3d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[1]; - op->params["1"] = weight.shape[4]; - op->params["11"] = weight.shape[3]; - op->params["21"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[2]; - op->params["12"] = captured_params.at("dilation").ai[1]; - op->params["22"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[2]; - op->params["13"] = captured_params.at("stride").ai[1]; - op->params["23"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[2]; - op->params["14"] = captured_params.at("padding").ai[1]; - op->params["24"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[2]; - op->params["19"] = captured_params.at("output_padding").ai[1]; - op->params["20"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - - // transpose inch-outch-kd-kh-kw to outch-inch-kd-kh-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1]; - const int kd = weight.shape[2]; - const int kh = weight.shape[3]; - const int kw = weight.shape[4]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch * inch * kd * kh * kw); - float* w2 = (float*)new_weight.data(); - const int maxk = kd * kh * kw; - - // reorder weight from inch-outch to outch-inch - for (int i = 0; i < outch; i++) - { - for (int j = 0; j < inch; j++) - { - for (int k = 0; k < maxk; k++) - { - w2[(i * inch + j) * maxk + k] = w[(j * outch + i) * maxk + k]; - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch, inch, kd, kh, kw}, new_weight); - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose3d_1, 20) - -class F_conv_transpose3d_2 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.conv_transpose3d op_0 2 1 input weight out bias=None stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return 
"DeconvolutionDepthWise3D"; - } - - const char* name_str() const - { - return "deconvdw3d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - const int groups = captured_params.at("groups").i; - - op->params["0"] = weight.shape[1] * groups; - op->params["1"] = weight.shape[4]; - op->params["11"] = weight.shape[3]; - op->params["21"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[2]; - op->params["12"] = captured_params.at("dilation").ai[1]; - op->params["22"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[2]; - op->params["13"] = captured_params.at("stride").ai[1]; - op->params["23"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[2]; - op->params["14"] = captured_params.at("padding").ai[1]; - op->params["24"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[2]; - op->params["19"] = captured_params.at("output_padding").ai[1]; - op->params["20"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 0; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = groups; - - // transpose group-inch/group-outch/group-kd-kh-kw to group-outch/group-inch/group-kd-kh-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1] * groups; - const int kd = weight.shape[2]; - const int kh = weight.shape[3]; - const int kw = weight.shape[4]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch / groups * inch * kd * kh * kw); - float* w2 = (float*)new_weight.data(); - const int outch_g = outch / groups; - const int inch_g = inch / groups; - const int maxk = kd * kh * kw; - - for (int g = 0; g < groups; g++) - { - // reorder weight from inch-outch to outch-inch - float* wg2 = w2 + g * outch_g * inch_g * maxk; - const float* wg = w + g * inch_g * outch_g * maxk; - for (int i = 0; i < outch_g; i++) - { - for (int j = 0; j < inch_g; j++) - { - for (int k = 0; k < maxk; k++) - { - wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k]; - } - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch / groups, inch, kd, kh, kw}, new_weight); - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose3d_2, 21) - -class F_conv_transpose3d_3 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.conv_transpose3d op_0 3 1 input weight bias out stride=%stride padding=%padding dilation=%dilation output_padding=%output_padding groups=%groups -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "DeconvolutionDepthWise3D"; - } - - const char* name_str() const - { - return "deconvdw3d"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - const int groups = captured_params.at("groups").i; - - 
op->params["0"] = weight.shape[1] * groups; - op->params["1"] = weight.shape[4]; - op->params["11"] = weight.shape[3]; - op->params["21"] = weight.shape[2]; - op->params["2"] = captured_params.at("dilation").ai[2]; - op->params["12"] = captured_params.at("dilation").ai[1]; - op->params["22"] = captured_params.at("dilation").ai[0]; - op->params["3"] = captured_params.at("stride").ai[2]; - op->params["13"] = captured_params.at("stride").ai[1]; - op->params["23"] = captured_params.at("stride").ai[0]; - op->params["4"] = captured_params.at("padding").ai[2]; - op->params["14"] = captured_params.at("padding").ai[1]; - op->params["24"] = captured_params.at("padding").ai[0]; - op->params["18"] = captured_params.at("output_padding").ai[2]; - op->params["19"] = captured_params.at("output_padding").ai[1]; - op->params["20"] = captured_params.at("output_padding").ai[0]; - op->params["5"] = 1; - op->params["6"] = (int)(weight.data.size() / sizeof(float)); - op->params["7"] = groups; - - // transpose group-inch/group-outch/group-kd-kh-kw to group-outch/group-inch/group-kd-kh-kw - const int inch = weight.shape[0]; - const int outch = weight.shape[1] * groups; - const int kd = weight.shape[2]; - const int kh = weight.shape[3]; - const int kw = weight.shape[4]; - std::vector new_weight; - { - const float* w = (const float*)weight.data.data(); - - new_weight.resize(outch / groups * inch * kd * kh * kw); - float* w2 = (float*)new_weight.data(); - const int outch_g = outch / groups; - const int inch_g = inch / groups; - const int maxk = kd * kh * kw; - - for (int g = 0; g < groups; g++) - { - // reorder weight from inch-outch to outch-inch - float* wg2 = w2 + g * outch_g * inch_g * maxk; - const float* wg = w + g * inch_g * outch_g * maxk; - for (int i = 0; i < outch_g; i++) - { - for (int j = 0; j < inch_g; j++) - { - for (int k = 0; k < maxk; k++) - { - wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k]; - } - } - } - } - } - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = Attribute({outch / groups, inch, kd, kh, kw}, new_weight); - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_conv_transpose3d_3, 21) - } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_fold.cpp b/tools/pnnx/src/pass_ncnn/F_fold.cpp new file mode 100644 index 000000000000..1d35a72eb119 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/F_fold.cpp @@ -0,0 +1,63 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class F_fold : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.fold op_0 1 1 input out output_size=%output_size kernel_size=%kernel_size dilation=%dilation stride=%stride padding=%padding +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Fold"; + } + + const char* name_str() const + { + return "fold"; + } + + void write(Operator* op, const std::map& captured_params) const + { + op->params["1"] = captured_params.at("kernel_size").ai[1]; + op->params["11"] = captured_params.at("kernel_size").ai[0]; + op->params["2"] = captured_params.at("dilation").ai[1]; + op->params["12"] = captured_params.at("dilation").ai[0]; + op->params["3"] = captured_params.at("stride").ai[1]; + op->params["13"] = captured_params.at("stride").ai[0]; + op->params["4"] = captured_params.at("padding").ai[1]; + op->params["14"] = captured_params.at("padding").ai[0]; + op->params["20"] = captured_params.at("output_size").ai[1]; + op->params["21"] = captured_params.at("output_size").ai[0]; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_fold, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_glu.cpp b/tools/pnnx/src/pass_ncnn/F_glu.cpp index cb9397dc15e7..3baf63ce52a0 100644 --- a/tools/pnnx/src/pass_ncnn/F_glu.cpp +++ b/tools/pnnx/src/pass_ncnn/F_glu.cpp @@ -1,16 +1,17 @@ -// Copyright (c) 2022 Xiaomi Corp. (author: Fangjun Kuang) +// Tencent is pleased to support the open source community by making ncnn available. // -// Licensed under the BSD 3-Clause License (the "License"); you may not use this -// file except in compliance with the License. You may obtain a copy of the -// License at +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// 2022 Xiaomi Corp. (author: Fangjun Kuang) +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at // // https://opensource.org/licenses/BSD-3-Clause // -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -// License for the specific language governing permissions and limitations under -// the License. +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
#include "pass_ncnn.h" @@ -18,30 +19,55 @@ namespace pnnx { namespace ncnn { -class F_glu : public GraphRewriterPass { - public: - const char *match_pattern_graph() const { - return R"PNNXIR(7767517 +class F_glu : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input F.glu op_0 1 1 input out dim=%dim pnnx.Output output 1 0 out )PNNXIR"; - } + } + + const char* type_str() const + { + return "GLU"; + } + + const char* name_str() const + { + return "glu"; + } + + void write(Operator* op, const std::map& captured_params) const + { + const int batch_index = op->inputs[0]->params["__batch_index"].i; + + int axis = captured_params.at("dim").i; + if (axis == batch_index) + { + fprintf(stderr, "glu along batch axis %d is not supported\n", batch_index); + return; + } - const char *type_str() const { return "GLU"; } + if (axis < 0) + { + int input_rank = op->inputs[0]->shape.size(); + axis = input_rank + axis; + } - const char *name_str() const { return "glu"; } - void write(Operator *op, - const std::map &captured_params) const { - int axis = captured_params.at("dim").i; + if (axis > batch_index) + axis -= 1; - op->params["0"] = axis; - } + op->params["0"] = axis; + } }; REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_glu, 20) -} // namespace ncnn +} // namespace ncnn -} // namespace pnnx +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp new file mode 100644 index 000000000000..41dfc65ee39c --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp @@ -0,0 +1,71 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class F_grid_sample : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input_0 0 1 input0 +pnnx.Input input_1 0 1 input1 +F.grid_sample op_0 2 1 input0 input1 out mode=%mode padding_mode=%padding_mode align_corners=%align_corners +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "GridSample"; + } + + const char* name_str() const + { + return "gridsample"; + } + + void write(Operator* op, const std::map& captured_params) const + { + const std::string& mode = captured_params.at("mode").s; + if (mode == "bilinear") + op->params["0"] = 1; + if (mode == "nearest") + op->params["0"] = 2; + if (mode == "bicubic") + op->params["0"] = 3; + + const std::string& padding_mode = captured_params.at("padding_mode").s; + if (padding_mode == "zeros") + op->params["1"] = 1; + if (padding_mode == "border") + op->params["1"] = 2; + if (padding_mode == "reflection") + op->params["1"] = 3; + + op->params["2"] = captured_params.at("align_corners").b ? 
1 : 0; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_grid_sample, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_group_norm.cpp b/tools/pnnx/src/pass_ncnn/F_group_norm.cpp index 7aecbf238558..0af5d32c5561 100644 --- a/tools/pnnx/src/pass_ncnn/F_group_norm.cpp +++ b/tools/pnnx/src/pass_ncnn/F_group_norm.cpp @@ -60,55 +60,6 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_group_norm, 20) -class F_group_norm_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.group_norm op_0 3 1 input weight bias out num_groups=%num_groups eps=%eps -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "GroupNorm"; - } - - const char* name_str() const - { - return "gn"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = captured_params.at("num_groups"); - op->params["1"] = weight.shape[0]; - op->params["2"] = captured_params.at("eps"); - op->params["3"] = 1; - - op->attrs["0"] = weight; - op->attrs["1"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_group_norm_1, 20) - } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_layer_norm.cpp b/tools/pnnx/src/pass_ncnn/F_layer_norm.cpp index 4ae1c5061c9c..74ec974fb3cf 100644 --- a/tools/pnnx/src/pass_ncnn/F_layer_norm.cpp +++ b/tools/pnnx/src/pass_ncnn/F_layer_norm.cpp @@ -58,61 +58,6 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_layer_norm, 20) -class F_layer_norm_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.layer_norm op_0 3 1 input weight bias out normalized_shape=%normalized_shape eps=%eps -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "LayerNorm"; - } - - const char* name_str() const - { - return "ln"; - } - - void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - const std::vector& normalized_shape = captured_params.at("normalized_shape").ai; - int affine_size = normalized_shape[0]; - for (size_t i = 1; i < normalized_shape.size(); i++) - { - affine_size *= normalized_shape[i]; - } - - op->params["0"] = affine_size; - op->params["1"] = captured_params.at("eps"); - op->params["2"] = 1; - - op->attrs["0"] = weight; - op->attrs["1"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_layer_norm_1, 20) - } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_linear.cpp b/tools/pnnx/src/pass_ncnn/F_linear.cpp index b76c444e4b63..890f36cc92a0 100644 --- a/tools/pnnx/src/pass_ncnn/F_linear.cpp +++ b/tools/pnnx/src/pass_ncnn/F_linear.cpp @@ -18,101 +18,6 @@ namespace pnnx { namespace ncnn { -class 
F_linear : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -4 3 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -F.linear op_0 2 1 input weight out bias=None -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "InnerProduct"; - } - - const char* name_str() const - { - return "linear"; - } - - void write(Operator* op, const std::map& /*captured_params*/, const std::map& captured_attrs) const - { - Attribute weight; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = 0; - op->params["2"] = (int)(weight.data.size() / sizeof(float)); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_linear, 20) - -class F_linear_1 : public GraphRewriterPass -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Attribute op_weight 0 1 weight @qwq -pnnx.Attribute op_bias 0 1 bias @qwq -F.linear op_0 3 1 input weight bias out -pnnx.Output output 1 0 out -)PNNXIR"; - } - - const char* type_str() const - { - return "InnerProduct"; - } - - const char* name_str() const - { - return "linear"; - } - - void write(Operator* op, const std::map& /*captured_params*/, const std::map& captured_attrs) const - { - Attribute weight; - Attribute bias; - for (const auto& x : captured_attrs) - { - if (x.first.substr(0, 10) == "op_weight.") - weight = x.second; - if (x.first.substr(0, 8) == "op_bias.") - bias = x.second; - } - - op->params["0"] = weight.shape[0]; - op->params["1"] = 1; - op->params["2"] = (int)(weight.data.size() / sizeof(float)); - - op->attrs["0"] = Attribute(); - op->attrs["0"].data = {0, 0, 0, 0}; - op->attrs["1"] = weight; - op->attrs["2"] = bias; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_linear_1, 20) - } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/F_normalize.cpp b/tools/pnnx/src/pass_ncnn/F_normalize.cpp index db5e54ca47c2..2030ba5675e6 100644 --- a/tools/pnnx/src/pass_ncnn/F_normalize.cpp +++ b/tools/pnnx/src/pass_ncnn/F_normalize.cpp @@ -45,11 +45,6 @@ pnnx.Output output 1 0 out { const int batch_index = op->inputs[0]->params["__batch_index"].i; - int input_rank = op->inputs[0]->shape.size(); - - if (batch_index >= 0 && batch_index < input_rank) - input_rank -= 1; - int axis = captured_params.at("dim").i; if (axis == batch_index) { @@ -58,7 +53,10 @@ pnnx.Output output 1 0 out } if (axis < 0) + { + int input_rank = op->inputs[0]->shape.size(); axis = input_rank + axis; + } if (axis > batch_index) axis -= 1; @@ -75,6 +73,11 @@ pnnx.Output output 1 0 out return; } + int input_rank = op->inputs[0]->shape.size(); + + if (batch_index >= 0 && batch_index < input_rank) + input_rank -= 1; + if (input_rank == 2 || axis != 0) { fprintf(stderr, "unsupported normalize for %d-rank tensor with axis %d\n", input_rank, axis); diff --git a/tools/pnnx/src/pass_ncnn/F_softmax.cpp b/tools/pnnx/src/pass_ncnn/F_softmax.cpp index 1ec110523ce6..a3a23587a86e 100644 --- a/tools/pnnx/src/pass_ncnn/F_softmax.cpp +++ b/tools/pnnx/src/pass_ncnn/F_softmax.cpp @@ -45,11 +45,6 @@ pnnx.Output output 1 0 out { const int batch_index = op->inputs[0]->params["__batch_index"].i; - int input_rank = op->inputs[0]->shape.size(); - - if (batch_index >= 0 && batch_index < input_rank) - 
input_rank -= 1; - int axis = captured_params.at("dim").i; if (axis == batch_index) { @@ -58,7 +53,10 @@ pnnx.Output output 1 0 out } if (axis < 0) + { + int input_rank = op->inputs[0]->shape.size(); axis = input_rank + axis; + } if (axis > batch_index) axis -= 1; diff --git a/tools/pnnx/src/pass_ncnn/F_unfold.cpp b/tools/pnnx/src/pass_ncnn/F_unfold.cpp new file mode 100644 index 000000000000..14f82b08f998 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/F_unfold.cpp @@ -0,0 +1,61 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class F_unfold : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +F.unfold op_0 1 1 input out kernel_size=%kernel_size dilation=%dilation stride=%stride padding=%padding +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Unfold"; + } + + const char* name_str() const + { + return "unfold"; + } + + void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const + { + op->params["1"] = captured_params.at("kernel_size").ai[1]; + op->params["11"] = captured_params.at("kernel_size").ai[0]; + op->params["2"] = captured_params.at("dilation").ai[1]; + op->params["12"] = captured_params.at("dilation").ai[0]; + op->params["3"] = captured_params.at("stride").ai[1]; + op->params["13"] = captured_params.at("stride").ai[0]; + op->params["4"] = captured_params.at("padding").ai[1]; + op->params["14"] = captured_params.at("padding").ai[0]; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_unfold, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/Tensor_slice.cpp b/tools/pnnx/src/pass_ncnn/Tensor_slice.cpp index 0ca99525238c..ecc36506e860 100644 --- a/tools/pnnx/src/pass_ncnn/Tensor_slice.cpp +++ b/tools/pnnx/src/pass_ncnn/Tensor_slice.cpp @@ -14,6 +14,8 @@ #include "pass_ncnn.h" +#include <limits.h> + namespace pnnx { namespace ncnn { @@ -60,32 +62,37 @@ pnnx.Output output 1 0 out const int batch_index = op->inputs[0]->params["__batch_index"].i; - int input_rank = op->inputs[0]->shape.size(); + { + int input_rank = op->inputs[0]->shape.size(); - if (batch_index >= 0 && batch_index < input_rank) - input_rank -= 1; + if (batch_index >= 0 && batch_index < input_rank) + input_rank -= 1; - if (input_rank > 4) - { - fprintf(stderr, "slice %d-rank tensor with %d-rank axes is not possible!\n", input_rank, axes_rank); - return; + if (input_rank > 4) + { + fprintf(stderr, "slice %d-rank tensor with %d-rank axes is not possible!\n", input_rank, axes_rank); + return; + } } for (int i = 0; i < axes_rank; i++) { - if (axes[i] == batch_index && (starts[i] != 0 || ends[i] != -1)) + if (axes[i] == batch_index && (starts[i] != 0 || ends[i] != INT_MAX)) { fprintf(stderr, "slice along batch axis is not 
supported\n"); return; } if (axes[i] < 0) + { + int input_rank = op->inputs[0]->shape.size(); axes[i] = input_rank + axes[i]; + } if (axes[i] > batch_index) axes[i] -= 1; - if (ends[i] == -1) + if (ends[i] == INT_MAX) ends[i] = -233; } diff --git a/tools/pnnx/src/pass_ncnn/convert_to_fp16_model.cpp b/tools/pnnx/src/pass_ncnn/convert_to_fp16_model.cpp deleted file mode 100644 index 0d800bf8e617..000000000000 --- a/tools/pnnx/src/pass_ncnn/convert_to_fp16_model.cpp +++ /dev/null @@ -1,121 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#include "convert_to_fp16_model.h" - -namespace pnnx { - -namespace ncnn { - -static unsigned short float32_to_float16(float value) -{ - // 1 : 8 : 23 - union - { - unsigned int u; - float f; - } tmp; - - tmp.f = value; - - // 1 : 8 : 23 - unsigned short sign = (tmp.u & 0x80000000) >> 31; - unsigned short exponent = (tmp.u & 0x7F800000) >> 23; - unsigned int significand = tmp.u & 0x7FFFFF; - - // NCNN_LOGE("%d %d %d", sign, exponent, significand); - - // 1 : 5 : 10 - unsigned short fp16; - if (exponent == 0) - { - // zero or denormal, always underflow - fp16 = (sign << 15) | (0x00 << 10) | 0x00; - } - else if (exponent == 0xFF) - { - // infinity or NaN - fp16 = (sign << 15) | (0x1F << 10) | (significand ? 
0x200 : 0x00); - } - else - { - // normalized - short newexp = exponent + (-127 + 15); - if (newexp >= 31) - { - // overflow, return infinity - fp16 = (sign << 15) | (0x1F << 10) | 0x00; - } - else if (newexp <= 0) - { - // Some normal fp32 cannot be expressed as normal fp16 - fp16 = (sign << 15) | (0x00 << 10) | 0x00; - } - else - { - // normal fp16 - fp16 = (sign << 15) | (newexp << 10) | (significand >> 13); - } - } - - return fp16; -} - -void convert_to_fp16_model(Graph& graph) -{ - for (Operator* op : graph.ops) - { - bool is_type_flag_fp32 = false; - for (auto& it : op->attrs) - { - Attribute& attr = it.second; - - if (is_type_flag_fp32) - { - // fp32 -> fp16 - const float* p = (const float*)attr.data.data(); - int len = attr.data.size() / 4; - std::vector data_fp16(len * 2); - unsigned short* p_fp16 = (unsigned short*)data_fp16.data(); - for (int i = 0; i < len; i++) - { - p_fp16[i] = float32_to_float16(p[i]); - } - - attr.type = 3; - attr.data = data_fp16; - - is_type_flag_fp32 = false; - continue; - } - - if (attr.type == 0 && attr.data == std::vector {0, 0, 0, 0}) - { - // write fp16 flag - // unsigned int fp16_flag = 0x01306B47; - attr.data[0] = 0x47; - attr.data[1] = 0x6B; - attr.data[2] = 0x30; - attr.data[3] = 0x01; - - is_type_flag_fp32 = true; - continue; - } - } - } -} - -} // namespace ncnn - -} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/expand_expression.cpp b/tools/pnnx/src/pass_ncnn/expand_expression.cpp index e24764349b87..baec8795c5dd 100644 --- a/tools/pnnx/src/pass_ncnn/expand_expression.cpp +++ b/tools/pnnx/src/pass_ncnn/expand_expression.cpp @@ -119,7 +119,23 @@ static std::string expand_expression(Graph& graph, const Operator* op, int& pnnx // not supported return std::string(); } - else if (t == "sqrt" || t == "rsqrt" || t == "neg" || t == "floor" || t == "exp") + else if (t == "abs" + || t == "acos" + || t == "asin" + || t == "atan" + || t == "ceil" + || t == "cos" + || t == "exp" + || t == "floor" + || t == "log" + || t == "neg" + || t == "reciprocal" + || t == "rsqrt" + || t == "sin" + || t == "sqrt" + || t == "square" + || t == "tan" + || t == "tanh") { std::string a = exprstack.top(); exprstack.pop(); @@ -129,11 +145,23 @@ static std::string expand_expression(Graph& graph, const Operator* op, int& pnnx Operator* op_unary = graph.new_operator_before("UnaryOp", t + "_" + std::to_string(pnnx_expr_index++), op); - if (t == "sqrt") op_unary->params["0"] = 5; - if (t == "rsqrt") op_unary->params["0"] = 6; - if (t == "neg") op_unary->params["0"] = 1; - if (t == "floor") op_unary->params["0"] = 2; + if (t == "abs") op_unary->params["0"] = 0; + if (t == "acos") op_unary->params["0"] = 13; + if (t == "asin") op_unary->params["0"] = 12; + if (t == "atan") op_unary->params["0"] = 14; + if (t == "ceil") op_unary->params["0"] = 3; + if (t == "cos") op_unary->params["0"] = 10; if (t == "exp") op_unary->params["0"] = 7; + if (t == "floor") op_unary->params["0"] = 2; + if (t == "log") op_unary->params["0"] = 8; + if (t == "neg") op_unary->params["0"] = 1; + if (t == "reciprocal") op_unary->params["0"] = 15; + if (t == "rsqrt") op_unary->params["0"] = 6; + if (t == "sin") op_unary->params["0"] = 9; + if (t == "sqrt") op_unary->params["0"] = 5; + if (t == "square") op_unary->params["0"] = 4; + if (t == "tan") op_unary->params["0"] = 11; + if (t == "tanh") op_unary->params["0"] = 16; Operand* op_unary_in = token_is_argument(a) ? 
op->inputs[std::stoi(a.substr(1))] : graph.get_operand(op->name + "_" + a); op_unary_in->consumers.push_back(op_unary); diff --git a/tools/pnnx/src/pass_ncnn/nn_Fold.cpp b/tools/pnnx/src/pass_ncnn/nn_Fold.cpp new file mode 100644 index 000000000000..d94bc68b0303 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/nn_Fold.cpp @@ -0,0 +1,63 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class nn_Fold : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.Fold op_0 1 1 input out output_size=%output_size kernel_size=%kernel_size stride=%stride padding=%padding dilation=%dilation +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Fold"; + } + + const char* name_str() const + { + return "fold"; + } + + void write(Operator* op, const std::map<std::string, Parameter>& captured_params) const + { + op->params["1"] = captured_params.at("kernel_size").ai[1]; + op->params["11"] = captured_params.at("kernel_size").ai[0]; + op->params["2"] = captured_params.at("dilation").ai[1]; + op->params["12"] = captured_params.at("dilation").ai[0]; + op->params["3"] = captured_params.at("stride").ai[1]; + op->params["13"] = captured_params.at("stride").ai[0]; + op->params["4"] = captured_params.at("padding").ai[1]; + op->params["14"] = captured_params.at("padding").ai[0]; + op->params["20"] = captured_params.at("output_size").ai[1]; + op->params["21"] = captured_params.at("output_size").ai[0]; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Fold, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/nn_GLU.cpp b/tools/pnnx/src/pass_ncnn/nn_GLU.cpp new file mode 100644 index 000000000000..82e3f84942c6 --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/nn_GLU.cpp @@ -0,0 +1,73 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// 2022 Xiaomi Corp. (author: Fangjun Kuang) +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
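// nn_GLU below applies the same dim handling as the F_glu pass earlier in this
// diff: glu along the batch axis is rejected, a negative dim is resolved against
// the input rank, and the result is shifted down by one when it lies past the
// batch index. For a hypothetical NCHW input with __batch_index 0, dim=1 thus
// ends up as ncnn GLU param 0 = 0.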
+ +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class nn_GLU : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.GLU op_0 1 1 input out dim=%dim +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "GLU"; + } + + const char* name_str() const + { + return "glu"; + } + + void write(Operator* op, const std::map& captured_params) const + { + const int batch_index = op->inputs[0]->params["__batch_index"].i; + + int axis = captured_params.at("dim").i; + if (axis == batch_index) + { + fprintf(stderr, "glu along batch axis %d is not supported\n", batch_index); + return; + } + + if (axis < 0) + { + int input_rank = op->inputs[0]->shape.size(); + axis = input_rank + axis; + } + + if (axis > batch_index) + axis -= 1; + + op->params["0"] = axis; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_GLU, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/nn_LSTM.cpp b/tools/pnnx/src/pass_ncnn/nn_LSTM.cpp index c8cfbe4e33ba..1a1511680934 100644 --- a/tools/pnnx/src/pass_ncnn/nn_LSTM.cpp +++ b/tools/pnnx/src/pass_ncnn/nn_LSTM.cpp @@ -27,7 +27,7 @@ class nn_LSTM : public GraphRewriterPass return R"PNNXIR(7767517 3 4 pnnx.Input input 0 1 input -nn.LSTM op_0 1 3 input out out_hidden out_cell input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional @weight_ih_l0 @weight_hh_l0 @bias_ih_l0 @bias_hh_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse +nn.LSTM op_0 1 3 input out out_hidden out_cell input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional proj_size=%proj_size @weight_ih_l0 @weight_hh_l0 @bias_ih_l0 @bias_hh_l0 @weight_hr_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse @weight_hr_l0_reverse pnnx.Output output 3 0 out out_hidden out_cell )PNNXIR"; } @@ -46,23 +46,19 @@ pnnx.Output output 3 0 out out_hidden out_cell { const bool bidirectional = captured_params.at("bidirectional").b; const int num_directions = bidirectional ? 2 : 1; - const int num_output = captured_params.at("hidden_size").i; + const int hidden_size = captured_params.at("hidden_size").i; const int input_size = captured_params.at("input_size").i; - int proj_size = captured_params.at("proj_size").i; - if (captured_params.count("proj_size")) { - proj_size = captured_params.at("proj_size").i; - } - const int real_output_size = proj_size ? proj_size : num_output; + int proj_size = captured_params.at("proj_size").i; + if (proj_size == 0) + proj_size = hidden_size; - int weight_data_size = num_directions * num_output * input_size * 4; + int weight_data_size = num_directions * hidden_size * input_size * 4; - op->params["0"] = num_output; + op->params["0"] = proj_size; op->params["1"] = weight_data_size; op->params["2"] = bidirectional ? 
2 : 0; - if (proj_size) { - op->params["3"] = proj_size; - } + op->params["3"] = hidden_size; op->attrs["0"] = Attribute(); op->attrs["0"].data = {0, 0, 0, 0}; @@ -71,7 +67,7 @@ pnnx.Output output 3 0 out out_hidden out_cell { std::vector new_weight_ih; { - const int weight_data_size_g = num_output * input_size; + const int weight_data_size_g = hidden_size * input_size; const float* weight_ih = (const float*)captured_attrs.at("op_0.weight_ih_l0").data.data(); const float* iptr = weight_ih; @@ -79,7 +75,7 @@ pnnx.Output output 3 0 out out_hidden out_cell const float* gptr = weight_ih + weight_data_size_g * 2; const float* optr = weight_ih + weight_data_size_g * 3; - new_weight_ih.resize(4 * num_output * input_size); + new_weight_ih.resize(4 * hidden_size * input_size); float* weight = (float*)new_weight_ih.data(); float* w_iptr = weight; float* w_fptr = weight + weight_data_size_g; @@ -95,7 +91,7 @@ pnnx.Output output 3 0 out out_hidden out_cell { std::vector new_weight_ih_reverse; { - const int weight_data_size_g = num_output * input_size; + const int weight_data_size_g = hidden_size * input_size; const float* weight_ih = (const float*)captured_attrs.at("op_0.weight_ih_l0_reverse").data.data(); const float* iptr = weight_ih; @@ -103,7 +99,7 @@ pnnx.Output output 3 0 out out_hidden out_cell const float* gptr = weight_ih + weight_data_size_g * 2; const float* optr = weight_ih + weight_data_size_g * 3; - new_weight_ih_reverse.resize(4 * num_output * input_size); + new_weight_ih_reverse.resize(4 * hidden_size * input_size); float* weight = (float*)new_weight_ih_reverse.data(); float* w_iptr = weight; float* w_fptr = weight + weight_data_size_g; @@ -114,11 +110,11 @@ pnnx.Output output 3 0 out out_hidden out_cell memcpy(w_optr, optr, weight_data_size_g * sizeof(float)); memcpy(w_gptr, gptr, weight_data_size_g * sizeof(float)); } - op->attrs["1"] = Attribute({4, num_output, input_size}, new_weight_ih) + Attribute({4, num_output, input_size}, new_weight_ih_reverse); + op->attrs["1"] = Attribute({4, hidden_size, input_size}, new_weight_ih) + Attribute({4, hidden_size, input_size}, new_weight_ih_reverse); } else { - op->attrs["1"] = Attribute({4, num_output, input_size}, new_weight_ih); + op->attrs["1"] = Attribute({4, hidden_size, input_size}, new_weight_ih); } } @@ -133,33 +129,33 @@ pnnx.Output output 3 0 out out_hidden out_cell const float* bias_ih = (const float*)captured_attrs.at("op_0.bias_ih_l0").data.data(); const float* bias_hh = (const float*)captured_attrs.at("op_0.bias_hh_l0").data.data(); const float* bias_ih_iptr = bias_ih; - const float* bias_ih_fptr = bias_ih + num_output; - const float* bias_ih_gptr = bias_ih + num_output * 2; - const float* bias_ih_optr = bias_ih + num_output * 3; + const float* bias_ih_fptr = bias_ih + hidden_size; + const float* bias_ih_gptr = bias_ih + hidden_size * 2; + const float* bias_ih_optr = bias_ih + hidden_size * 3; const float* bias_hh_iptr = bias_hh; - const float* bias_hh_fptr = bias_hh + num_output; - const float* bias_hh_gptr = bias_hh + num_output * 2; - const float* bias_hh_optr = bias_hh + num_output * 3; + const float* bias_hh_fptr = bias_hh + hidden_size; + const float* bias_hh_gptr = bias_hh + hidden_size * 2; + const float* bias_hh_optr = bias_hh + hidden_size * 3; - new_bias.resize(4 * num_output); + new_bias.resize(4 * hidden_size); float* bias = (float*)new_bias.data(); float* b_iptr = bias; - float* b_fptr = bias + num_output; - float* b_optr = bias + num_output * 2; - float* b_gptr = bias + num_output * 3; - for (int i = 0; i < 
num_output; i++) + float* b_fptr = bias + hidden_size; + float* b_optr = bias + hidden_size * 2; + float* b_gptr = bias + hidden_size * 3; + for (int i = 0; i < hidden_size; i++) { b_iptr[i] = bias_ih_iptr[i] + bias_hh_iptr[i]; } - for (int i = 0; i < num_output; i++) + for (int i = 0; i < hidden_size; i++) { b_fptr[i] = bias_ih_fptr[i] + bias_hh_fptr[i]; } - for (int i = 0; i < num_output; i++) + for (int i = 0; i < hidden_size; i++) { b_optr[i] = bias_ih_optr[i] + bias_hh_optr[i]; } - for (int i = 0; i < num_output; i++) + for (int i = 0; i < hidden_size; i++) { b_gptr[i] = bias_ih_gptr[i] + bias_hh_gptr[i]; } @@ -172,63 +168,63 @@ pnnx.Output output 3 0 out out_hidden out_cell const float* bias_ih = (const float*)captured_attrs.at("op_0.bias_ih_l0_reverse").data.data(); const float* bias_hh = (const float*)captured_attrs.at("op_0.bias_hh_l0_reverse").data.data(); const float* bias_ih_iptr = bias_ih; - const float* bias_ih_fptr = bias_ih + num_output; - const float* bias_ih_gptr = bias_ih + num_output * 2; - const float* bias_ih_optr = bias_ih + num_output * 3; + const float* bias_ih_fptr = bias_ih + hidden_size; + const float* bias_ih_gptr = bias_ih + hidden_size * 2; + const float* bias_ih_optr = bias_ih + hidden_size * 3; const float* bias_hh_iptr = bias_hh; - const float* bias_hh_fptr = bias_hh + num_output; - const float* bias_hh_gptr = bias_hh + num_output * 2; - const float* bias_hh_optr = bias_hh + num_output * 3; + const float* bias_hh_fptr = bias_hh + hidden_size; + const float* bias_hh_gptr = bias_hh + hidden_size * 2; + const float* bias_hh_optr = bias_hh + hidden_size * 3; - new_bias_reverse.resize(4 * num_output); + new_bias_reverse.resize(4 * hidden_size); float* bias = (float*)new_bias_reverse.data(); float* b_iptr = bias; - float* b_fptr = bias + num_output; - float* b_optr = bias + num_output * 2; - float* b_gptr = bias + num_output * 3; - for (int i = 0; i < num_output; i++) + float* b_fptr = bias + hidden_size; + float* b_optr = bias + hidden_size * 2; + float* b_gptr = bias + hidden_size * 3; + for (int i = 0; i < hidden_size; i++) { b_iptr[i] = bias_ih_iptr[i] + bias_hh_iptr[i]; } - for (int i = 0; i < num_output; i++) + for (int i = 0; i < hidden_size; i++) { b_fptr[i] = bias_ih_fptr[i] + bias_hh_fptr[i]; } - for (int i = 0; i < num_output; i++) + for (int i = 0; i < hidden_size; i++) { b_optr[i] = bias_ih_optr[i] + bias_hh_optr[i]; } - for (int i = 0; i < num_output; i++) + for (int i = 0; i < hidden_size; i++) { b_gptr[i] = bias_ih_gptr[i] + bias_hh_gptr[i]; } } - op->attrs["3"] = Attribute({4, num_output}, new_bias) + Attribute({4, num_output}, new_bias_reverse); + op->attrs["3"] = Attribute({4, hidden_size}, new_bias) + Attribute({4, hidden_size}, new_bias_reverse); } else { - op->attrs["3"] = Attribute({4, num_output}, new_bias); + op->attrs["3"] = Attribute({4, hidden_size}, new_bias); } } else { - std::vector bias(4 * num_output, 0.f); + std::vector bias(4 * hidden_size, 0.f); if (bidirectional) - op->attrs["3"] = Attribute({4, num_output}, bias) + Attribute({4, num_output}, bias); + op->attrs["3"] = Attribute({4, hidden_size}, bias) + Attribute({4, hidden_size}, bias); else - op->attrs["3"] = Attribute({4, num_output}, bias); + op->attrs["3"] = Attribute({4, hidden_size}, bias); } op->attrs["4"] = Attribute(); op->attrs["4"].data = {0, 0, 0, 0}; - // reorder IFGO-hidden-hidden to IFOG-hidden-hidden + // reorder IFGO-hidden-proj to IFOG-hidden-proj { std::vector new_weight_hh; { - const int weight_data_size_g = num_output * real_output_size; + const int 
weight_data_size_g = hidden_size * proj_size; const float* weight_hh = (const float*)captured_attrs.at("op_0.weight_hh_l0").data.data(); const float* iptr = weight_hh; @@ -236,7 +232,7 @@ pnnx.Output output 3 0 out out_hidden out_cell const float* gptr = weight_hh + weight_data_size_g * 2; const float* optr = weight_hh + weight_data_size_g * 3; - new_weight_hh.resize(4 * weight_data_size_g); + new_weight_hh.resize(4 * hidden_size * proj_size); float* weight = (float*)new_weight_hh.data(); float* w_iptr = weight; float* w_fptr = weight + weight_data_size_g; @@ -252,7 +248,7 @@ pnnx.Output output 3 0 out out_hidden out_cell { std::vector new_weight_hh_reverse; { - const int weight_data_size_g = num_output * real_output_size; + const int weight_data_size_g = hidden_size * proj_size; const float* weight_hh = (const float*)captured_attrs.at("op_0.weight_hh_l0_reverse").data.data(); const float* iptr = weight_hh; @@ -260,7 +256,7 @@ pnnx.Output output 3 0 out out_hidden out_cell const float* gptr = weight_hh + weight_data_size_g * 2; const float* optr = weight_hh + weight_data_size_g * 3; - new_weight_hh_reverse.resize(4 * weight_data_size_g); + new_weight_hh_reverse.resize(4 * hidden_size * proj_size); float* weight = (float*)new_weight_hh_reverse.data(); float* w_iptr = weight; float* w_fptr = weight + weight_data_size_g; @@ -271,30 +267,28 @@ pnnx.Output output 3 0 out out_hidden out_cell memcpy(w_optr, optr, weight_data_size_g * sizeof(float)); memcpy(w_gptr, gptr, weight_data_size_g * sizeof(float)); } - op->attrs["5"] = Attribute({4, num_output, real_output_size}, new_weight_hh) + Attribute({4, num_output, real_output_size}, new_weight_hh_reverse); + op->attrs["5"] = Attribute({4, hidden_size, proj_size}, new_weight_hh) + Attribute({4, hidden_size, proj_size}, new_weight_hh_reverse); } else { - op->attrs["5"] = Attribute({4, num_output, real_output_size}, new_weight_hh); + op->attrs["5"] = Attribute({4, hidden_size, proj_size}, new_weight_hh); } } - if (proj_size) { - op->attrs["6"] = Attribute(); - op->attrs["6"].data = {0, 0, 0, 0}; - const float* weight_hr = (const float*)captured_attrs.at("op_0.weight_hr_l0").data.data(); - - const int weight_data_size_g = proj_size * num_output; - std::vector new_weight_hr(weight_hr, weight_hr + weight_data_size_g); - op->attrs["7"] = Attribute({proj_size, num_output}, new_weight_hr); - - if (bidirectional) { - fprintf(stderr, "Not implemented yet for bi-LSTM with proj_size > 0!\n"); - exit(-1); - } + if (proj_size != hidden_size) + { + op->attrs["6"] = Attribute(); + op->attrs["6"].data = {0, 0, 0, 0}; + if (bidirectional) + { + op->attrs["7"] = captured_attrs.at("op_0.weight_hr_l0") + captured_attrs.at("op_0.weight_hr_l0_reverse"); + } + else + { + op->attrs["7"] = captured_attrs.at("op_0.weight_hr_l0"); + } } - } }; @@ -310,7 +304,7 @@ class nn_LSTM_1 : public nn_LSTM pnnx.Input input 0 1 input pnnx.Input in_hidden 0 1 in_hidden pnnx.Input in_hidden 0 1 in_cell -nn.LSTM op_0 3 3 input in_hidden in_cell out out_hidden out_cell input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional @weight_ih_l0 @weight_hh_l0 @bias_ih_l0 @bias_hh_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse +nn.LSTM op_0 3 3 input in_hidden in_cell out out_hidden out_cell input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional proj_size=%proj_size @weight_ih_l0 @weight_hh_l0 @bias_ih_l0 @bias_hh_l0 
@weight_hr_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse @weight_hr_l0_reverse pnnx.Output output 3 0 out out_hidden out_cell )PNNXIR"; } @@ -326,7 +320,7 @@ class nn_LSTM_2 : public nn_LSTM return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -nn.LSTM op_0 1 1 input out input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional @weight_ih_l0 @weight_hh_l0 @bias_ih_l0 @bias_hh_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse +nn.LSTM op_0 1 1 input out input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional proj_size=%proj_size @weight_ih_l0 @weight_hh_l0 @bias_ih_l0 @bias_hh_l0 @weight_hr_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse @weight_hr_l0_reverse pnnx.Output output 1 0 out )PNNXIR"; } @@ -344,7 +338,7 @@ class nn_LSTM_3 : public nn_LSTM pnnx.Input input 0 1 input pnnx.Input in_hidden 0 1 in_hidden pnnx.Input in_hidden 0 1 in_cell -nn.LSTM op_0 3 1 input in_hidden in_cell out input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional @weight_ih_l0 @weight_hh_l0 @bias_ih_l0 @bias_hh_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse +nn.LSTM op_0 3 1 input in_hidden in_cell out input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional proj_size=%proj_size @weight_ih_l0 @weight_hh_l0 @bias_ih_l0 @bias_hh_l0 @weight_hr_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse @weight_hr_l0_reverse pnnx.Output output 1 0 out )PNNXIR"; } @@ -352,34 +346,6 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_LSTM_3, 20) -class nn_LSTM_4 : public nn_LSTM -{ -public: - const char* match_pattern_graph() const - { - return R"PNNXIR(7767517 -5 4 -pnnx.Input input 0 1 input -pnnx.Input in_hidden 0 1 in_hidden -pnnx.Input in_hidden 0 1 in_cell -nn.LSTM op_0 3 3 input in_hidden in_cell out out_hidden out_cell input_size=%input_size hidden_size=%hidden_size num_layers=1 bias=%bias batch_first=%batch_first bidirectional=%bidirectional proj_size=%proj_size @weight_ih_l0 @weight_hh_l0 @weight_hr_l0 @bias_ih_l0 @bias_hh_l0 @weight_ih_l0_reverse @weight_hh_l0_reverse @bias_ih_l0_reverse @bias_hh_l0_reverse -pnnx.Output output 3 0 out out_hidden out_cell -)PNNXIR"; - } - - const char* type_str() const - { - return "LSTM2"; - } - - const char* name_str() const - { - return "lstm2"; - } -}; - -REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_LSTM_4, 19) - } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/nn_MultiheadAttention.cpp b/tools/pnnx/src/pass_ncnn/nn_MultiheadAttention.cpp index bcc9407f6053..610b304db6cb 100644 --- a/tools/pnnx/src/pass_ncnn/nn_MultiheadAttention.cpp +++ b/tools/pnnx/src/pass_ncnn/nn_MultiheadAttention.cpp @@ -29,7 +29,7 @@ class nn_MultiheadAttention : public GraphRewriterPass return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -nn.MultiheadAttention op_0 1 1 input out num_heads=%num_heads batch_first=%batch_first add_zero_attn=%add_zero_attn embed_dim=%embed_dim bias=%bias add_bias_kv=%add_bias_kv @in_proj_weight @in_proj_bias @bias_k @bias_v @out_proj.weight @out_proj.bias +nn.MultiheadAttention op_0 1 1 input out num_heads=%num_heads batch_first=%batch_first 
add_zero_attn=%add_zero_attn embed_dim=%embed_dim kdim=%kdim vdim=%vdim bias=%bias add_bias_kv=%add_bias_kv @in_proj_weight @in_proj_bias @bias_k @bias_v @out_proj.weight @out_proj.bias pnnx.Output output 1 0 out )PNNXIR"; } @@ -55,6 +55,8 @@ pnnx.Output output 1 0 out } const int embed_dim = captured_params.at("embed_dim").i; + const int kdim = captured_params.at("kdim").i; + const int vdim = captured_params.at("vdim").i; // split in_proj_weight and in_proj_bias into q k v std::vector q_weight(embed_dim * embed_dim); @@ -90,6 +92,8 @@ pnnx.Output output 1 0 out } op->params["2"] = embed_dim * embed_dim; + op->params["3"] = kdim; + op->params["4"] = vdim; op->attrs["0"] = Attribute(); op->attrs["0"].data = {0, 0, 0, 0}; @@ -120,7 +124,7 @@ class nn_MultiheadAttention_1 : public nn_MultiheadAttention return R"PNNXIR(7767517 3 2 pnnx.Input input 0 1 input -nn.MultiheadAttention op_0 1 1 input out num_heads=%num_heads add_zero_attn=%add_zero_attn embed_dim=%embed_dim bias=%bias add_bias_kv=%add_bias_kv @in_proj_weight @in_proj_bias @bias_k @bias_v @out_proj.weight @out_proj.bias +nn.MultiheadAttention op_0 1 1 input out num_heads=%num_heads add_zero_attn=%add_zero_attn embed_dim=%embed_dim kdim=%kdim vdim=%vdim bias=%bias add_bias_kv=%add_bias_kv @in_proj_weight @in_proj_bias @bias_k @bias_v @out_proj.weight @out_proj.bias pnnx.Output output 1 0 out )PNNXIR"; } @@ -128,6 +132,187 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_MultiheadAttention_1, 20) +class nn_MultiheadAttention_2 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +pnnx.Input input_2 0 1 value +nn.MultiheadAttention op_0 3 1 query key value out num_heads=%num_heads batch_first=%batch_first add_zero_attn=%add_zero_attn embed_dim=%embed_dim kdim=%kdim vdim=%vdim bias=%bias add_bias_kv=%add_bias_kv @in_proj_weight @q_proj_weight @k_proj_weight @v_proj_weight @in_proj_bias @bias_k @bias_v @out_proj.weight @out_proj.bias +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "MultiHeadAttention"; + } + + const char* name_str() const + { + return "attention"; + } + + void write(Operator* op, const std::map& captured_params, const std::map& captured_attrs) const + { + op->params["0"] = captured_params.at("embed_dim"); + op->params["1"] = captured_params.at("num_heads"); + + if (captured_params.at("add_bias_kv").b) + { + fprintf(stderr, "MultiheadAttention add_bias_kv=True not supported\n"); + } + + const int embed_dim = captured_params.at("embed_dim").i; + const int kdim = captured_params.at("kdim").i; + const int vdim = captured_params.at("vdim").i; + + // split in_proj_bias into q k v + std::vector q_bias(embed_dim); + std::vector k_bias(embed_dim); + std::vector v_bias(embed_dim); + { + // qkv - embed_dim + const float* bptr = (const float*)captured_attrs.at("op_0.in_proj_bias").data.data(); + + { + memcpy(q_bias.data(), bptr, embed_dim * sizeof(float)); + bptr += embed_dim; + } + + { + memcpy(k_bias.data(), bptr, embed_dim * sizeof(float)); + bptr += embed_dim; + } + + { + memcpy(v_bias.data(), bptr, embed_dim * sizeof(float)); + } + } + + op->params["2"] = embed_dim * embed_dim; + op->params["3"] = kdim; + op->params["4"] = vdim; + + if (captured_attrs.find("op_0.in_proj_weight") != captured_attrs.end()) + { + // split in_proj_weight and in_proj_bias into q k v + std::vector q_weight(embed_dim * embed_dim); + std::vector k_weight(embed_dim * 
kdim); + std::vector v_weight(embed_dim * vdim); + { + // qkv - embed_dim - embed_dim + const float* wptr = (const float*)captured_attrs.at("op_0.in_proj_weight").data.data(); + + { + memcpy(q_weight.data(), wptr, embed_dim * embed_dim * sizeof(float)); + wptr += embed_dim * embed_dim; + } + + { + memcpy(k_weight.data(), wptr, embed_dim * kdim * sizeof(float)); + wptr += embed_dim * kdim; + } + + { + memcpy(v_weight.data(), wptr, embed_dim * vdim * sizeof(float)); + } + } + + op->attrs["0"] = Attribute(); + op->attrs["0"].data = {0, 0, 0, 0}; + op->attrs["1"] = Attribute({embed_dim, embed_dim}, q_weight); + op->attrs["2"] = Attribute({embed_dim}, q_bias); + op->attrs["3"] = Attribute(); + op->attrs["3"].data = {0, 0, 0, 0}; + op->attrs["4"] = Attribute({embed_dim, kdim}, k_weight); + op->attrs["5"] = Attribute({embed_dim}, k_bias); + op->attrs["6"] = Attribute(); + op->attrs["6"].data = {0, 0, 0, 0}; + op->attrs["7"] = Attribute({embed_dim, vdim}, v_weight); + op->attrs["8"] = Attribute({embed_dim}, v_bias); + } + else + { + op->attrs["0"] = Attribute(); + op->attrs["0"].data = {0, 0, 0, 0}; + op->attrs["1"] = captured_attrs.at("op_0.q_proj_weight"); + op->attrs["2"] = Attribute({embed_dim}, q_bias); + op->attrs["3"] = Attribute(); + op->attrs["3"].data = {0, 0, 0, 0}; + op->attrs["4"] = captured_attrs.at("op_0.k_proj_weight"); + op->attrs["5"] = Attribute({embed_dim}, k_bias); + op->attrs["6"] = Attribute(); + op->attrs["6"].data = {0, 0, 0, 0}; + op->attrs["7"] = captured_attrs.at("op_0.v_proj_weight"); + op->attrs["8"] = Attribute({embed_dim}, v_bias); + } + + op->attrs["9"] = Attribute(); + op->attrs["9"].data = {0, 0, 0, 0}; + op->attrs["a"] = captured_attrs.at("op_0.out_proj.weight"); + op->attrs["b"] = captured_attrs.at("op_0.out_proj.bias"); + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_MultiheadAttention_2, 20) + +class nn_MultiheadAttention_3 : public nn_MultiheadAttention_2 +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +pnnx.Input input_2 0 1 value +nn.MultiheadAttention op_0 3 1 query key value out num_heads=%num_heads add_zero_attn=%add_zero_attn embed_dim=%embed_dim kdim=%kdim vdim=%vdim bias=%bias add_bias_kv=%add_bias_kv @in_proj_weight @q_proj_weight @k_proj_weight @v_proj_weight @in_proj_bias @bias_k @bias_v @out_proj.weight @out_proj.bias +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_MultiheadAttention_3, 20) + +class nn_MultiheadAttention_4 : public nn_MultiheadAttention_2 +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +nn.MultiheadAttention op_0 2 1 query key out num_heads=%num_heads batch_first=%batch_first add_zero_attn=%add_zero_attn embed_dim=%embed_dim kdim=%kdim vdim=%vdim bias=%bias add_bias_kv=%add_bias_kv @in_proj_weight @q_proj_weight @k_proj_weight @v_proj_weight @in_proj_bias @bias_k @bias_v @out_proj.weight @out_proj.bias +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_MultiheadAttention_4, 20) + +class nn_MultiheadAttention_5 : public nn_MultiheadAttention_2 +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +4 3 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +nn.MultiheadAttention op_0 2 1 query key out num_heads=%num_heads add_zero_attn=%add_zero_attn embed_dim=%embed_dim kdim=%kdim vdim=%vdim 
bias=%bias add_bias_kv=%add_bias_kv @in_proj_weight @q_proj_weight @k_proj_weight @v_proj_weight @in_proj_bias @bias_k @bias_v @out_proj.weight @out_proj.bias +pnnx.Output output 1 0 out +)PNNXIR"; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_MultiheadAttention_5, 20) + } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/nn_Softmax.cpp b/tools/pnnx/src/pass_ncnn/nn_Softmax.cpp index f263e357207d..d9629e6eff8e 100644 --- a/tools/pnnx/src/pass_ncnn/nn_Softmax.cpp +++ b/tools/pnnx/src/pass_ncnn/nn_Softmax.cpp @@ -45,11 +45,6 @@ pnnx.Output output 1 0 out { const int batch_index = op->inputs[0]->params["__batch_index"].i; - int input_rank = op->inputs[0]->shape.size(); - - if (batch_index >= 0 && batch_index < input_rank) - input_rank -= 1; - int axis = captured_params.at("dim").i; if (axis == batch_index) { @@ -58,7 +53,10 @@ pnnx.Output output 1 0 out } if (axis < 0) + { + int input_rank = op->inputs[0]->shape.size(); axis = input_rank + axis; + } if (axis > batch_index) axis -= 1; diff --git a/tools/pnnx/src/pass_ncnn/nn_Softmax2d.cpp b/tools/pnnx/src/pass_ncnn/nn_Softmax2d.cpp new file mode 100644 index 000000000000..152eb3a6a37f --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/nn_Softmax2d.cpp @@ -0,0 +1,55 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class nn_Softmax2d : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.Softmax2d op_0 1 1 input out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Softmax"; + } + + const char* name_str() const + { + return "softmax2d"; + } + + void write(Operator* op, const std::map& /*captured_params*/) const + { + op->params["0"] = 0; + op->params["1"] = 1; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Softmax2d, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/nn_Unfold.cpp b/tools/pnnx/src/pass_ncnn/nn_Unfold.cpp new file mode 100644 index 000000000000..526e5d24c38c --- /dev/null +++ b/tools/pnnx/src/pass_ncnn/nn_Unfold.cpp @@ -0,0 +1,61 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_ncnn.h" + +namespace pnnx { + +namespace ncnn { + +class nn_Unfold : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +nn.Unfold op_0 1 1 input out kernel_size=%kernel_size stride=%stride padding=%padding dilation=%dilation +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "Unfold"; + } + + const char* name_str() const + { + return "unfold"; + } + + void write(Operator* op, const std::map& captured_params) const + { + op->params["1"] = captured_params.at("kernel_size").ai[1]; + op->params["11"] = captured_params.at("kernel_size").ai[0]; + op->params["2"] = captured_params.at("dilation").ai[1]; + op->params["12"] = captured_params.at("dilation").ai[0]; + op->params["3"] = captured_params.at("stride").ai[1]; + op->params["13"] = captured_params.at("stride").ai[0]; + op->params["4"] = captured_params.at("padding").ai[1]; + op->params["14"] = captured_params.at("padding").ai[0]; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(nn_Unfold, 20) + +} // namespace ncnn + +} // namespace pnnx diff --git a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp index 13049e5f05b0..73e8e08eb39d 100644 --- a/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp +++ b/tools/pnnx/src/pass_ncnn/solve_batch_index.cpp @@ -40,6 +40,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "F.conv1d", "F.conv2d", "F.conv3d", + "F.fold", "F.grid_sample", "F.group_norm", "F.instance_norm", @@ -54,6 +55,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "F.pixel_shuffle", "F.pixel_unshuffle", "F.prelu", + "F.unfold", "F.upsample_bilinear", "F.upsample_nearest", "F.upsample", @@ -80,6 +82,7 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "nn.ConvTranspose1d", "nn.ConvTranspose2d", "nn.ConvTranspose3d", + "nn.Fold", "nn.GroupNorm", "nn.InstanceNorm1d", "nn.InstanceNorm2d", @@ -98,6 +101,8 @@ static bool is_known_operator_with_batch_index_0(const Operator* op) "nn.ReplicationPad1d", "nn.ReplicationPad2d", "nn.ReplicationPad3d", + "nn.Softmax2d", + "nn.Unfold", "nn.Upsample", "nn.UpsamplingBilinear2d", "nn.UpsamplingNearest2d", @@ -283,6 +288,11 @@ void solve_batch_index(Graph& graph) { if (is_known_operator_with_batch_index_0(op)) { + if (op->type == std::string("F.grid_sample")) + { + op->inputs[1]->params["__batch_index"] = 0; + } + op->inputs[0]->params["__batch_index"] = 0; op->outputs[0]->params["__batch_index"] = 0; } diff --git a/tools/pnnx/src/pass_ncnn/torch_squeeze.cpp b/tools/pnnx/src/pass_ncnn/torch_squeeze.cpp index 1b475bbd7555..a1a52d272a70 100644 --- a/tools/pnnx/src/pass_ncnn/torch_squeeze.cpp +++ b/tools/pnnx/src/pass_ncnn/torch_squeeze.cpp @@ -54,7 +54,7 @@ pnnx.Output output 1 0 out int input_rank = op->inputs[0]->shape.size(); - if (input_rank > 4) + if (input_rank > 5) { fprintf(stderr, "squeeze %d-rank tensor is not supported yet!\n", input_rank); return; @@ -97,6 +97,7 @@ pnnx.Output output 1 0 out { op->params["0"] = 1; op->params["1"] = 1; + op->params["11"] = 1; op->params["2"] = 1; } }; diff --git a/tools/pnnx/src/pass_ncnn/torch_unsqueeze.cpp b/tools/pnnx/src/pass_ncnn/torch_unsqueeze.cpp index 3dc2084d8f15..3c8dc24d18d7 100644 --- a/tools/pnnx/src/pass_ncnn/torch_unsqueeze.cpp +++ b/tools/pnnx/src/pass_ncnn/torch_unsqueeze.cpp @@ -54,7 
+54,7 @@ pnnx.Output output 1 0 out int input_rank = op->inputs[0]->shape.size(); - if (input_rank > 3) + if (input_rank > 4) { fprintf(stderr, "unsqueeze %d-rank tensor is not supported yet!\n", input_rank); return; diff --git a/tools/pnnx/src/save_ncnn.cpp b/tools/pnnx/src/save_ncnn.cpp new file mode 100644 index 000000000000..6a4407879df0 --- /dev/null +++ b/tools/pnnx/src/save_ncnn.cpp @@ -0,0 +1,456 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "save_ncnn.h" + +namespace pnnx { + +static bool type_is_integer(int type) +{ + if (type == 1) return false; + if (type == 2) return false; + if (type == 3) return false; + if (type == 4) return true; + if (type == 5) return true; + if (type == 6) return true; + if (type == 7) return true; + if (type == 8) return true; + if (type == 9) return true; + if (type == 10) return false; + if (type == 11) return false; + if (type == 12) return false; + return false; +} + +static const char* type_to_dtype_string(int type) +{ + if (type == 1) return "torch.float"; + if (type == 2) return "torch.double"; + if (type == 3) return "torch.half"; + if (type == 4) return "torch.int"; + if (type == 5) return "torch.long"; + if (type == 6) return "torch.short"; + if (type == 7) return "torch.int8"; + if (type == 8) return "torch.uint8"; + if (type == 9) return "torch.bool"; + if (type == 10) return "torch.complex64"; + if (type == 11) return "torch.complex128"; + if (type == 12) return "torch.complex32"; + return "null"; +} + +static bool string_is_positive_integer(const std::string& t) +{ + for (size_t i = 0; i < t.size(); i++) + { + if (t[i] < '0' || t[i] > '9') + return false; + } + + return true; +} + +static unsigned short float32_to_float16(float value) +{ + // 1 : 8 : 23 + union + { + unsigned int u; + float f; + } tmp; + + tmp.f = value; + + // 1 : 8 : 23 + unsigned short sign = (tmp.u & 0x80000000) >> 31; + unsigned short exponent = (tmp.u & 0x7F800000) >> 23; + unsigned int significand = tmp.u & 0x7FFFFF; + + // NCNN_LOGE("%d %d %d", sign, exponent, significand); + + // 1 : 5 : 10 + unsigned short fp16; + if (exponent == 0) + { + // zero or denormal, always underflow + fp16 = (sign << 15) | (0x00 << 10) | 0x00; + } + else if (exponent == 0xFF) + { + // infinity or NaN + fp16 = (sign << 15) | (0x1F << 10) | (significand ? 
0x200 : 0x00); + } + else + { + // normalized + short newexp = exponent + (-127 + 15); + if (newexp >= 31) + { + // overflow, return infinity + fp16 = (sign << 15) | (0x1F << 10) | 0x00; + } + else if (newexp <= 0) + { + // Some normal fp32 cannot be expressed as normal fp16 + fp16 = (sign << 15) | (0x00 << 10) | 0x00; + } + else + { + // normal fp16 + fp16 = (sign << 15) | (newexp << 10) | (significand >> 13); + } + } + + return fp16; +} + +static size_t alignSize(size_t sz, int n) +{ + return (sz + n - 1) & -n; +} + +int save_ncnn(const Graph& g, const std::string& parampath, const std::string& binpath, const std::string& pypath, int fp16) +{ + FILE* paramfp = fopen(parampath.c_str(), "wb"); + if (!paramfp) + { + fprintf(stderr, "fopen %s failed\n", parampath.c_str()); + return -1; + } + + FILE* binfp = fopen(binpath.c_str(), "wb"); + if (!binfp) + { + fprintf(stderr, "fopen %s failed\n", binpath.c_str()); + fclose(paramfp); + return -1; + } + + // magic + fprintf(paramfp, "7767517\n"); + + // op count and oprand count + fprintf(paramfp, "%d %d\n", (int)g.ops.size(), (int)g.operands.size()); + + for (const Operator* op : g.ops) + { + fprintf(paramfp, "%-24s %-24s %d %d", op->type.c_str(), op->name.c_str(), (int)op->inputs.size(), (int)op->outputs.size()); + + for (const Operand* oprand : op->inputs) + { + fprintf(paramfp, " %s", oprand->name.c_str()); + } + + for (const Operand* oprand : op->outputs) + { + fprintf(paramfp, " %s", oprand->name.c_str()); + } + + for (const auto& it : op->params) + { + const Parameter& param = it.second; + + if (!string_is_positive_integer(it.first)) + { + fprintf(stderr, "ignore %s %s param %s=", op->type.c_str(), op->name.c_str(), it.first.c_str()); + + if (param.type == 0) + { + fprintf(stderr, "None"); + } + if (param.type == 1) + { + if (param.b) + fprintf(stderr, "True"); + else + fprintf(stderr, "False"); + } + if (param.type == 2) + { + fprintf(stderr, "%d", param.i); + } + if (param.type == 3) + { + fprintf(stderr, "%e", param.f); + } + if (param.type == 4) + { + fprintf(stderr, "%s", param.s.c_str()); + } + if (param.type == 5) + { + fprintf(stderr, "("); + for (size_t i = 0; i < param.ai.size(); i++) + { + fprintf(stderr, "%d", param.ai[i]); + if (i + 1 != param.ai.size()) + fprintf(stderr, ","); + } + fprintf(stderr, ")"); + } + if (param.type == 6) + { + fprintf(stderr, "("); + for (size_t i = 0; i < param.af.size(); i++) + { + fprintf(stderr, "%e", param.af[i]); + if (i + 1 != param.af.size()) + fprintf(stderr, ","); + } + fprintf(stderr, ")"); + } + if (param.type == 7) + { + fprintf(stderr, "("); + for (size_t i = 0; i < param.as.size(); i++) + { + fprintf(stderr, "%s", param.as[i].c_str()); + if (i + 1 != param.as.size()) + fprintf(stderr, ","); + } + fprintf(stderr, ")"); + } + fprintf(stderr, "\n"); + + continue; + } + + const int idkey = std::stoi(it.first); + if (param.type == 2) + { + fprintf(paramfp, " %d=%d", idkey, param.i); + } + if (param.type == 3) + { + fprintf(paramfp, " %d=%e", idkey, param.f); + } + if (param.type == 5) + { + const int array_size = (int)param.ai.size(); + fprintf(paramfp, " %d=%d", -23300 - idkey, array_size); + for (size_t i = 0; i < param.ai.size(); i++) + { + fprintf(paramfp, ",%d", param.ai[i]); + } + } + if (param.type == 6) + { + const int array_size = (int)param.af.size(); + fprintf(paramfp, " %d=%d", -23300 - idkey, array_size); + for (size_t i = 0; i < param.af.size(); i++) + { + fprintf(paramfp, ",%e", param.af[i]); + } + } + } + + bool is_type_flag_fp32 = false; + for (const auto& it : op->attrs) + 
{ + // fprintf(paramfp, " @%s=", it.first.c_str()); + + const Attribute& attr = it.second; + + if (fp16 && is_type_flag_fp32) + { + // fp32 -> fp16 + const float* p = (const float*)attr.data.data(); + int len = attr.data.size() / 4; + std::vector data_fp16(alignSize(len * 2, 4)); + unsigned short* p_fp16 = (unsigned short*)data_fp16.data(); + for (int i = 0; i < len; i++) + { + p_fp16[i] = float32_to_float16(p[i]); + } + + // pad size to 4bytes + if (len % 2 == 1) + { + // pad with fixed value for model hash consistency + p_fp16[len] = 0x2283; + } + + fwrite(data_fp16.data(), data_fp16.size(), 1, binfp); + + is_type_flag_fp32 = false; + continue; + } + + if (fp16 && attr.type == 0 && attr.data == std::vector {0, 0, 0, 0}) + { + // write fp16 flag + unsigned int fp16_flag = 0x01306B47; + fwrite((const char*)&fp16_flag, sizeof(fp16_flag), 1, binfp); + + is_type_flag_fp32 = true; + continue; + } + + fwrite(attr.data.data(), attr.data.size(), 1, binfp); + } + + // if (op->inputnames.size() == op->inputs.size()) + // { + // for (size_t i = 0; i < op->inputs.size(); i++) + // { + // const Operand* oprand = op->inputs[i]; + // fprintf(paramfp, " $%s=%s", op->inputnames[i].c_str(), oprand->name.c_str()); + // } + // } + + // for (const Operand* oprand : op->outputs) + // { + // if (oprand->params.find("__batch_index") == oprand->params.end()) + // continue; + // + // const int batch_index = oprand->params.at("__batch_index").i; + // + // fprintf(paramfp, " #%s=%d", oprand->name.c_str(), batch_index); + // } + + // for (const Operand* oprand : op->outputs) + // { + // if (oprand->shape.empty()) + // continue; + // + // fprintf(paramfp, " #%s=", oprand->name.c_str()); + // + // fprintf(paramfp, "("); + // for (int64_t i = 0; i < oprand->shape.size() - 1; i++) + // { + // fprintf(paramfp, "%d,", oprand->shape[i]); + // } + // if (oprand->shape.size() > 0) + // fprintf(paramfp, "%d", oprand->shape[oprand->shape.size() - 1]); + // fprintf(paramfp, ")"); + // + // fprintf(paramfp, type_to_string(oprand->type)); + // } + + fprintf(paramfp, "\n"); + } + + fclose(paramfp); + fclose(binfp); + + FILE* pyfp = fopen(pypath.c_str(), "wb"); + if (!pyfp) + { + fprintf(stderr, "fopen %s failed\n", pypath.c_str()); + return -1; + } + + fprintf(pyfp, "import numpy as np\n"); + fprintf(pyfp, "import ncnn\n"); + fprintf(pyfp, "import torch\n"); + + fprintf(pyfp, "\n"); + + // test inference + { + fprintf(pyfp, "def test_inference():\n"); + fprintf(pyfp, " torch.manual_seed(0)\n"); + + for (int input_index = 0;; input_index++) + { + std::string input_name = std::string("in") + std::to_string(input_index); + const Operand* r = g.get_operand(input_name); + if (!r) + break; + + if (type_is_integer(r->type)) + { + fprintf(pyfp, " %s = torch.randint(10, (", input_name.c_str()); + for (size_t i = 0; i < r->shape.size(); i++) + { + fprintf(pyfp, "%d", r->shape[i]); + if (i + 1 != r->shape.size() || r->shape.size() == 1) + fprintf(pyfp, ", "); + } + fprintf(pyfp, "), dtype=%s)\n", type_to_dtype_string(r->type)); + } + else + { + fprintf(pyfp, " %s = torch.rand(", input_name.c_str()); + for (size_t i = 0; i < r->shape.size(); i++) + { + fprintf(pyfp, "%d, ", r->shape[i]); + } + fprintf(pyfp, "dtype=%s)\n", type_to_dtype_string(r->type)); + } + } + + fprintf(pyfp, " out = []\n"); + fprintf(pyfp, "\n"); + + fprintf(pyfp, " with ncnn.Net() as net:\n"); + fprintf(pyfp, " net.load_param(\"%s\")\n", parampath.c_str()); + fprintf(pyfp, " net.load_model(\"%s\")\n", binpath.c_str()); + fprintf(pyfp, "\n"); + fprintf(pyfp, " with 
net.create_extractor() as ex:\n"); + + for (int input_index = 0;; input_index++) + { + std::string input_name = std::string("in") + std::to_string(input_index); + const Operand* r = g.get_operand(input_name); + if (!r) + break; + + const int batch_index = r->params.at("__batch_index").i; + if (batch_index != 233) + { + fprintf(pyfp, " ex.input(\"%s\", ncnn.Mat(%s.squeeze(%d).numpy()).clone())\n", input_name.c_str(), input_name.c_str(), batch_index); + } + else + { + fprintf(pyfp, " ex.input(\"%s\", ncnn.Mat(%s.numpy()).clone())\n", input_name.c_str(), input_name.c_str()); + } + } + + fprintf(pyfp, "\n"); + + for (int output_index = 0;; output_index++) + { + std::string output_name = std::string("out") + std::to_string(output_index); + const Operand* r = g.get_operand(output_name); + if (!r) + break; + + fprintf(pyfp, " _, %s = ex.extract(\"%s\")\n", output_name.c_str(), output_name.c_str()); + + const int batch_index = r->params.at("__batch_index").i; + if (batch_index != 233) + { + fprintf(pyfp, " out.append(torch.from_numpy(np.array(%s)).unsqueeze(%d))\n", output_name.c_str(), batch_index); + } + else + { + fprintf(pyfp, " out.append(torch.from_numpy(np.array(%s)))\n", output_name.c_str()); + } + } + + fprintf(pyfp, "\n"); + + fprintf(pyfp, " if len(out) == 1:\n"); + fprintf(pyfp, " return out[0]\n"); + fprintf(pyfp, " else:\n"); + fprintf(pyfp, " return tuple(out)\n"); + } + + fclose(pyfp); + + return 0; +} + +} // namespace pnnx diff --git a/tools/pnnx/src/save_ncnn.h b/tools/pnnx/src/save_ncnn.h new file mode 100644 index 000000000000..458c14700209 --- /dev/null +++ b/tools/pnnx/src/save_ncnn.h @@ -0,0 +1,26 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef PNNX_SAVE_NCNN_H +#define PNNX_SAVE_NCNN_H + +#include "ir.h" + +namespace pnnx { + +int save_ncnn(const Graph& g, const std::string& parampath, const std::string& binpath, const std::string& pypath, int fp16); + +} // namespace pnnx + +#endif // PNNX_SAVE_NCNN_H diff --git a/tools/pnnx/src/save_onnx.cpp b/tools/pnnx/src/save_onnx.cpp new file mode 100644 index 000000000000..55bb10cf7222 --- /dev/null +++ b/tools/pnnx/src/save_onnx.cpp @@ -0,0 +1,333 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#include "save_onnx.h" + +#include "onnx.pb.h" + +#include +#include +#include + +namespace pnnx { + +// from cxxabi bridge +extern const char* get_operand_name(const Operand* x); +extern const char* get_operator_type(const Operator* op); +extern const char* get_operator_name(const Operator* op); +extern std::vector get_operator_params_keys(const Operator* op); +extern std::vector get_operator_attrs_keys(const Operator* op); +extern const Parameter& get_operator_param(const Operator* op, const char* key); +extern const Attribute& get_operator_attr(const Operator* op, const char* key); +extern const char* get_param_s(const Parameter& p); +extern std::vector get_param_as(const Parameter& p); + +static unsigned short float32_to_float16(float value) +{ + // 1 : 8 : 23 + union + { + unsigned int u; + float f; + } tmp; + + tmp.f = value; + + // 1 : 8 : 23 + unsigned short sign = (tmp.u & 0x80000000) >> 31; + unsigned short exponent = (tmp.u & 0x7F800000) >> 23; + unsigned int significand = tmp.u & 0x7FFFFF; + + // NCNN_LOGE("%d %d %d", sign, exponent, significand); + + // 1 : 5 : 10 + unsigned short fp16; + if (exponent == 0) + { + // zero or denormal, always underflow + fp16 = (sign << 15) | (0x00 << 10) | 0x00; + } + else if (exponent == 0xFF) + { + // infinity or NaN + fp16 = (sign << 15) | (0x1F << 10) | (significand ? 0x200 : 0x00); + } + else + { + // normalized + short newexp = exponent + (-127 + 15); + if (newexp >= 31) + { + // overflow, return infinity + fp16 = (sign << 15) | (0x1F << 10) | 0x00; + } + else if (newexp <= 0) + { + // Some normal fp32 cannot be expressed as normal fp16 + fp16 = (sign << 15) | (0x00 << 10) | 0x00; + } + else + { + // normal fp16 + fp16 = (sign << 15) | (newexp << 10) | (significand >> 13); + } + } + + return fp16; +} + +int save_onnx(const Graph& g, const char* onnxpath, int fp16) +{ + onnx::ModelProto model; + + onnx::GraphProto* gp = model.mutable_graph(); + + for (const Operand* x : g.operands) + { + onnx::ValueInfoProto* vip = gp->add_value_info(); + + vip->set_name(get_operand_name(x)); + + onnx::TypeProto* tp = vip->mutable_type(); + + onnx::TypeProto_Tensor* tpt = tp->mutable_tensor_type(); + + switch (x->type) + { + case 1: // f32 + tpt->set_elem_type(fp16 ? 10 : 1); + break; + case 2: // f64 + tpt->set_elem_type(fp16 ? 
10 : 11); + break; + case 3: // f16 + tpt->set_elem_type(10); + break; + case 4: // i32 + tpt->set_elem_type(6); + break; + case 5: // i64 + tpt->set_elem_type(7); + break; + case 6: // i16 + tpt->set_elem_type(5); + break; + case 7: // i8 + tpt->set_elem_type(3); + break; + case 8: // u8 + tpt->set_elem_type(2); + break; + case 9: // bool + tpt->set_elem_type(9); + break; + case 10: // cp64 + tpt->set_elem_type(14); + break; + case 11: // cp128 + tpt->set_elem_type(15); + break; + case 12: // cp32 + tpt->set_elem_type(0); + break; + default: // null + tpt->set_elem_type(0); + break; + } + + onnx::TensorShapeProto* tsp = tpt->mutable_shape(); + + for (auto s : x->shape) + { + onnx::TensorShapeProto_Dimension* tspd = tsp->add_dim(); + + tspd->set_dim_value(s); + } + } + + for (const Operator* op : g.ops) + { + onnx::NodeProto* np = gp->add_node(); + + np->set_op_type(get_operator_type(op)); + np->set_name(get_operator_name(op)); + + for (const Operand* oprand : op->inputs) + { + np->add_input(get_operand_name(oprand)); + } + + for (const Operand* oprand : op->outputs) + { + np->add_output(get_operand_name(oprand)); + } + + std::vector params_keys = get_operator_params_keys(op); + for (const char* param_name : params_keys) + { + const Parameter& param = get_operator_param(op, param_name); + + onnx::AttributeProto* ap = np->add_attribute(); + + ap->set_name(param_name); + + if (param.type == 0) + { + ap->set_s("None"); + } + if (param.type == 1) + { + if (param.b) + ap->set_i(1); + else + ap->set_i(0); + } + if (param.type == 2) + { + ap->set_i(param.i); + } + if (param.type == 3) + { + ap->set_f(param.f); + } + if (param.type == 4) + { + ap->set_s(get_param_s(param)); + } + if (param.type == 5) + { + for (auto i : param.ai) + { + ap->add_ints(i); + } + } + if (param.type == 6) + { + for (auto f : param.af) + { + ap->add_floats(f); + } + } + if (param.type == 7) + { + std::vector as = get_param_as(param); + for (auto s : as) + { + ap->add_strings(s); + } + } + } + + std::vector attrs_keys = get_operator_attrs_keys(op); + for (const char* attr_name : attrs_keys) + { + onnx::TensorProto* tp = gp->add_initializer(); + + tp->set_name(std::string(get_operator_name(op)) + "." + attr_name); + + np->add_input(std::string(get_operator_name(op)) + "." + attr_name); + + const Attribute& attr = get_operator_attr(op, attr_name); + for (auto s : attr.shape) + { + tp->add_dims(s); + } + + switch (attr.type) + { + case 1: // f32 + tp->set_data_type(fp16 ? 10 : 1); + break; + case 2: // f64 + tp->set_data_type(fp16 ? 
10 : 11); + break; + case 3: // f16 + tp->set_data_type(10); + break; + case 4: // i32 + tp->set_data_type(6); + break; + case 5: // i64 + tp->set_data_type(7); + break; + case 6: // i16 + tp->set_data_type(5); + break; + case 7: // i8 + tp->set_data_type(3); + break; + case 8: // u8 + tp->set_data_type(2); + break; + case 9: // bool + tp->set_data_type(9); + break; + case 10: // cp64 + tp->set_data_type(14); + break; + case 11: // cp128 + tp->set_data_type(15); + break; + case 12: // cp32 + tp->set_data_type(0); + break; + default: // null + tp->set_data_type(0); + break; + } + + std::string* d = tp->mutable_raw_data(); + if (fp16 && attr.type == 1) + { + // fp32 to fp16 + const float* p = (const float*)attr.data.data(); + int len = attr.data.size() / 4; + d->resize(len * 2); + unsigned short* p_fp16 = (unsigned short*)d->data(); + for (int i = 0; i < len; i++) + { + p_fp16[i] = float32_to_float16(p[i]); + } + } + else if (fp16 && attr.type == 2) + { + // fp64 to fp16 + const double* p = (const double*)attr.data.data(); + int len = attr.data.size() / 4; + d->resize(len); + unsigned short* p_fp16 = (unsigned short*)d->data(); + for (int i = 0; i < len; i++) + { + p_fp16[i] = float32_to_float16((float)p[i]); + } + } + else + { + d->resize(attr.data.size()); + memcpy((void*)d->data(), attr.data.data(), attr.data.size()); + } + } + } + + std::fstream output(onnxpath, std::ios::out | std::ios::trunc | std::ios::binary); + if (!model.SerializeToOstream(&output)) + { + fprintf(stderr, "write onnx failed\n"); + return -1; + } + + return 0; +} + +} // namespace pnnx diff --git a/tools/pnnx/src/save_onnx.h b/tools/pnnx/src/save_onnx.h new file mode 100644 index 000000000000..9a4099872a6b --- /dev/null +++ b/tools/pnnx/src/save_onnx.h @@ -0,0 +1,26 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef PNNX_SAVE_ONNX_H +#define PNNX_SAVE_ONNX_H + +#include "ir.h" + +namespace pnnx { + +int save_onnx(const Graph& g, const char* onnxpath, int fp16); + +} // namespace pnnx + +#endif // PNNX_SAVE_ONNX_H diff --git a/tools/pnnx/src/save_onnx_cxxabi_bridge.cpp b/tools/pnnx/src/save_onnx_cxxabi_bridge.cpp new file mode 100644 index 000000000000..b74f2ab7a724 --- /dev/null +++ b/tools/pnnx/src/save_onnx_cxxabi_bridge.cpp @@ -0,0 +1,81 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "ir.h" + +namespace pnnx { + +const char* get_operand_name(const Operand* x) +{ + return x->name.c_str(); +} + +const char* get_operator_type(const Operator* op) +{ + return op->type.c_str(); +} + +const char* get_operator_name(const Operator* op) +{ + return op->name.c_str(); +} + +std::vector get_operator_params_keys(const Operator* op) +{ + std::vector keys; + for (const auto& it : op->params) + { + const std::string& key = it.first; + keys.push_back(key.c_str()); + } + return keys; +} + +std::vector get_operator_attrs_keys(const Operator* op) +{ + std::vector keys; + for (const auto& it : op->attrs) + { + const std::string& key = it.first; + keys.push_back(key.c_str()); + } + return keys; +} + +const Parameter& get_operator_param(const Operator* op, const char* key) +{ + return op->params.at(key); +} + +const Attribute& get_operator_attr(const Operator* op, const char* key) +{ + return op->attrs.at(key); +} + +const char* get_param_s(const Parameter& p) +{ + return p.s.c_str(); +} + +std::vector get_param_as(const Parameter& p) +{ + std::vector as; + for (const auto& s : p.as) + { + as.push_back(s.c_str()); + } + return as; +} + +} // namespace pnnx diff --git a/tools/pnnx/tests/CMakeLists.txt b/tools/pnnx/tests/CMakeLists.txt index 50a2d327eeb1..8a69446d3609 100644 --- a/tools/pnnx/tests/CMakeLists.txt +++ b/tools/pnnx/tests/CMakeLists.txt @@ -30,6 +30,7 @@ pnnx_add_test(F_dropout3d) pnnx_add_test(F_elu) pnnx_add_test(F_embedding) pnnx_add_test(F_feature_alpha_dropout) +pnnx_add_test(F_fold) pnnx_add_test(F_gelu) pnnx_add_test(F_glu) pnnx_add_test(F_grid_sample) @@ -70,6 +71,7 @@ pnnx_add_test(F_softsign) pnnx_add_test(F_tanh) pnnx_add_test(F_tanhshrink) pnnx_add_test(F_threshold) +pnnx_add_test(F_unfold) pnnx_add_test(F_upsample_bilinear) pnnx_add_test(F_upsample_nearest) pnnx_add_test(F_upsample) @@ -103,7 +105,9 @@ pnnx_add_test(nn_Dropout2d) pnnx_add_test(nn_Dropout3d) pnnx_add_test(nn_ELU) pnnx_add_test(nn_Embedding) +pnnx_add_test(nn_Fold) pnnx_add_test(nn_GELU) +pnnx_add_test(nn_GLU) pnnx_add_test(nn_GroupNorm) pnnx_add_test(nn_GRU) pnnx_add_test(nn_Hardshrink) @@ -142,6 +146,7 @@ pnnx_add_test(nn_SELU) pnnx_add_test(nn_Sigmoid) pnnx_add_test(nn_SiLU) pnnx_add_test(nn_Softmax) +pnnx_add_test(nn_Softmax2d) pnnx_add_test(nn_Softmin) pnnx_add_test(nn_Softplus) pnnx_add_test(nn_Softshrink) @@ -149,6 +154,7 @@ pnnx_add_test(nn_Softsign) pnnx_add_test(nn_Tanh) pnnx_add_test(nn_Tanhshrink) pnnx_add_test(nn_Threshold) +pnnx_add_test(nn_Unfold) pnnx_add_test(nn_Upsample) pnnx_add_test(nn_UpsamplingBilinear2d) pnnx_add_test(nn_UpsamplingNearest2d) @@ -158,10 +164,14 @@ pnnx_add_test(Tensor_contiguous) pnnx_add_test(Tensor_index) pnnx_add_test(Tensor_masked_fill) pnnx_add_test(Tensor_new_empty) +pnnx_add_test(Tensor_new_full) +pnnx_add_test(Tensor_new_ones) +pnnx_add_test(Tensor_new_zeros) pnnx_add_test(Tensor_repeat) pnnx_add_test(Tensor_reshape) pnnx_add_test(Tensor_select) pnnx_add_test(Tensor_slice) +pnnx_add_test(Tensor_slice_copy) pnnx_add_test(Tensor_view) pnnx_add_test(torch_addmm) @@ -176,7 +186,6 @@ pnnx_add_test(torch_bitwise_xor) pnnx_add_test(torch_bmm) 
pnnx_add_test(torch_cat) pnnx_add_test(torch_chunk) -pnnx_add_test(torch_clamp) pnnx_add_test(torch_clone) pnnx_add_test(torch_complex) pnnx_add_test(torch_einsum) @@ -187,7 +196,6 @@ pnnx_add_test(torch_full_like) pnnx_add_test(torch_gather) pnnx_add_test(torch_ge) pnnx_add_test(torch_gt) -pnnx_add_test(torch_imag) pnnx_add_test(torch_index_select) pnnx_add_test(torch_le) pnnx_add_test(torch_logsumexp) @@ -202,7 +210,6 @@ pnnx_add_test(torch_ones) pnnx_add_test(torch_ones_like) pnnx_add_test(torch_permute) pnnx_add_test(torch_prod) -pnnx_add_test(torch_real) pnnx_add_test(torch_scatter_add) pnnx_add_test(torch_sum) pnnx_add_test(torch_split) @@ -232,6 +239,36 @@ pnnx_add_test(torch_fft_fft) pnnx_add_test(torch_fft_fft2) pnnx_add_test(torch_fft_fftn) +pnnx_add_test(torch_abs) +pnnx_add_test(torch_acos) +pnnx_add_test(torch_acosh) +pnnx_add_test(torch_asin) +pnnx_add_test(torch_asinh) +pnnx_add_test(torch_atan) +pnnx_add_test(torch_atanh) +pnnx_add_test(torch_atan2) +pnnx_add_test(torch_ceil) +pnnx_add_test(torch_clamp) +pnnx_add_test(torch_cos) +pnnx_add_test(torch_cosh) +pnnx_add_test(torch_exp) +pnnx_add_test(torch_floor) +pnnx_add_test(torch_imag) +pnnx_add_test(torch_log) +pnnx_add_test(torch_neg) +pnnx_add_test(torch_pow) +pnnx_add_test(torch_real) +pnnx_add_test(torch_reciprocal) +pnnx_add_test(torch_rsqrt) +pnnx_add_test(torch_sign) +pnnx_add_test(torch_sin) +pnnx_add_test(torch_sinh) +pnnx_add_test(torch_sqrt) +pnnx_add_test(torch_square) +pnnx_add_test(torch_tan) +pnnx_add_test(torch_tanh) +pnnx_add_test(torch_trunc) + pnnx_add_test(convnext_tiny) pnnx_add_test(mobilenet_v2) pnnx_add_test(mobilenet_v3_small) @@ -255,12 +292,20 @@ pnnx_add_test(pnnx_fuse_convtranspose2d_batchnorm2d) pnnx_add_test(pnnx_fuse_linear_batchnorm1d) pnnx_add_test(pnnx_fuse_select_to_unbind) pnnx_add_test(pnnx_fuse_slice_to_tensor_split) +pnnx_add_test(pnnx_fuse_adjacent_reshape) +pnnx_add_test(pnnx_fuse_pad_conv1d) +pnnx_add_test(pnnx_fuse_pad_conv2d) if(Torch_VERSION VERSION_GREATER_EQUAL "1.9") pnnx_add_test(F_mish) pnnx_add_test(nn_Mish) endif() +if(Torch_VERSION VERSION_GREATER_EQUAL "1.10") + pnnx_add_test(torch_bitwise_left_shift) + pnnx_add_test(torch_bitwise_right_shift) +endif() + if(Torch_VERSION VERSION_GREATER_EQUAL "1.11") pnnx_add_test(torch_fft_ihfft2) pnnx_add_test(torch_fft_ihfftn) diff --git a/tools/pnnx/tests/ncnn/CMakeLists.txt b/tools/pnnx/tests/ncnn/CMakeLists.txt index 95e1320d8e29..272551a8a5a7 100644 --- a/tools/pnnx/tests/ncnn/CMakeLists.txt +++ b/tools/pnnx/tests/ncnn/CMakeLists.txt @@ -28,8 +28,10 @@ pnnx_ncnn_add_test(F_dropout3d) pnnx_ncnn_add_test(F_elu) pnnx_ncnn_add_test(F_embedding) pnnx_ncnn_add_test(F_feature_alpha_dropout) +pnnx_ncnn_add_test(F_fold) pnnx_ncnn_add_test(F_gelu) pnnx_ncnn_add_test(F_glu) +pnnx_ncnn_add_test(F_grid_sample) pnnx_ncnn_add_test(F_group_norm) pnnx_ncnn_add_test(F_hardsigmoid) pnnx_ncnn_add_test(F_hardswish) @@ -52,6 +54,7 @@ pnnx_ncnn_add_test(F_sigmoid) pnnx_ncnn_add_test(F_silu) pnnx_ncnn_add_test(F_softmax) pnnx_ncnn_add_test(F_tanh) +pnnx_ncnn_add_test(F_unfold) pnnx_ncnn_add_test(F_upsample_bilinear) pnnx_ncnn_add_test(F_upsample_nearest) pnnx_ncnn_add_test(F_upsample) @@ -84,7 +87,9 @@ pnnx_ncnn_add_test(nn_Dropout2d) pnnx_ncnn_add_test(nn_Dropout3d) pnnx_ncnn_add_test(nn_ELU) pnnx_ncnn_add_test(nn_Embedding) +pnnx_ncnn_add_test(nn_Fold) pnnx_ncnn_add_test(nn_GELU) +pnnx_ncnn_add_test(nn_GLU) pnnx_ncnn_add_test(nn_GroupNorm) pnnx_ncnn_add_test(nn_GRU) pnnx_ncnn_add_test(nn_Hardsigmoid) @@ -114,7 +119,9 @@ pnnx_ncnn_add_test(nn_SELU) 
pnnx_ncnn_add_test(nn_Sigmoid) pnnx_ncnn_add_test(nn_SiLU) pnnx_ncnn_add_test(nn_Softmax) +pnnx_ncnn_add_test(nn_Softmax2d) pnnx_ncnn_add_test(nn_Tanh) +pnnx_ncnn_add_test(nn_Unfold) pnnx_ncnn_add_test(nn_Upsample) pnnx_ncnn_add_test(nn_UpsamplingBilinear2d) pnnx_ncnn_add_test(nn_UpsamplingNearest2d) @@ -132,7 +139,6 @@ pnnx_ncnn_add_test(torch_amin) pnnx_ncnn_add_test(torch_bmm) pnnx_ncnn_add_test(torch_cat) pnnx_ncnn_add_test(torch_chunk) -pnnx_ncnn_add_test(torch_clamp) pnnx_ncnn_add_test(torch_clone) pnnx_ncnn_add_test(torch_einsum) pnnx_ncnn_add_test(torch_logsumexp) @@ -150,6 +156,26 @@ pnnx_ncnn_add_test(torch_transpose) pnnx_ncnn_add_test(torch_unbind) pnnx_ncnn_add_test(torch_unsqueeze) +pnnx_ncnn_add_test(torch_abs) +pnnx_ncnn_add_test(torch_acos) +pnnx_ncnn_add_test(torch_asin) +pnnx_ncnn_add_test(torch_atan) +pnnx_ncnn_add_test(torch_ceil) +pnnx_ncnn_add_test(torch_clamp) +pnnx_ncnn_add_test(torch_cos) +pnnx_ncnn_add_test(torch_exp) +pnnx_ncnn_add_test(torch_floor) +pnnx_ncnn_add_test(torch_log) +pnnx_ncnn_add_test(torch_neg) +pnnx_ncnn_add_test(torch_pow) +pnnx_ncnn_add_test(torch_reciprocal) +pnnx_ncnn_add_test(torch_rsqrt) +pnnx_ncnn_add_test(torch_sin) +pnnx_ncnn_add_test(torch_sqrt) +pnnx_ncnn_add_test(torch_square) +pnnx_ncnn_add_test(torch_tan) +pnnx_ncnn_add_test(torch_tanh) + pnnx_ncnn_add_test(convnext_tiny) pnnx_ncnn_add_test(mobilenet_v2) pnnx_ncnn_add_test(mobilenet_v3_small) diff --git a/tools/pnnx/tests/ncnn/test_F_fold.py b/tools/pnnx/tests/ncnn/test_F_fold.py new file mode 100644 index 000000000000..54103fdf2611 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_F_fold.py @@ -0,0 +1,63 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = F.fold(x, output_size=22, kernel_size=3) + y = F.fold(y, output_size=(17,18), kernel_size=(2,4), stride=(2,1), padding=2, dilation=1) + z = F.fold(z, output_size=(5,11), kernel_size=(2,3), stride=1, padding=(2,4), dilation=(1,2)) + + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 108, 400) + y = torch.rand(1, 96, 190) + z = torch.rand(1, 36, 120) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_F_fold.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_F_fold.pt inputshape=[1,108,400],[1,96,190],[1,36,120]") + + # ncnn inference + import test_F_fold_ncnn + b = test_F_fold_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_F_glu.py b/tools/pnnx/tests/ncnn/test_F_glu.py index fbc939ad158b..937253c12791 100644 --- a/tools/pnnx/tests/ncnn/test_F_glu.py +++ b/tools/pnnx/tests/ncnn/test_F_glu.py @@ -27,7 +27,8 @@ def forward(self, x, y, z): z0 = F.glu(z, dim=0) z1 = F.glu(z, dim=1) z2 = F.glu(z, dim=2) - return x0, y0, y1, z0, z1, z2 + z3 = F.glu(z, dim=-1) + return x0, y0, y1, z0, z1, z2, z3 def test(): net = Model() @@ -46,7 +47,7 @@ def test(): # torchscript to pnnx import os - # os.system("../../src/pnnx test_F_glu.pt") + os.system("../../src/pnnx test_F_glu.pt inputshape=[18],[12,16],[24,28,34]") # ncnn inference import test_F_glu_ncnn diff --git a/tools/pnnx/tests/ncnn/test_F_grid_sample.py b/tools/pnnx/tests/ncnn/test_F_grid_sample.py new file mode 100644 index 000000000000..c84d38232b1e --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_F_grid_sample.py @@ -0,0 +1,98 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
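The dim=-1 case added to test_F_glu.py just above exercises the negative-axis handling that this patch puts into the nn_GLU (and nn.Softmax) ncnn rewriter passes: the dim is rejected if it names the batch axis, normalized against the full input rank if negative, and then shifted down by one when it lies past the batch axis, since ncnn blobs carry no batch dimension. A minimal Python sketch of that mapping — not code from the patch, the helper name is hypothetical:

def torch_dim_to_ncnn_axis(dim, input_rank, batch_index):
    # the rewriter refuses to operate along the batch axis
    if dim == batch_index:
        raise ValueError("glu/softmax along the batch axis is not supported")
    # a negative PyTorch dim is normalized against the full tensor rank
    if dim < 0:
        dim = input_rank + dim
    # ncnn blobs carry no batch dimension, so axes past it shift down by one
    if dim > batch_index:
        dim -= 1
    return dim

# example: dim=-1 on an NCHW tensor (rank 4, batch axis 0) resolves to ncnn axis 2
assert torch_dim_to_ncnn_axis(-1, 4, 0) == 2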
+ + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, xg1, xg2, y, yg1, yg2): + # norm to -1 ~ 1 + xg1 = xg1 * 2 - 1 + xg2 = xg2 * 2 - 1 + yg1 = yg1 * 2 - 1 + yg2 = yg2 * 2 - 1 + + x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='zeros', align_corners=False) + x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='border', align_corners=False) + x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='reflection', align_corners=False) + x = F.grid_sample(x, xg2, mode='nearest', padding_mode='zeros', align_corners=False) + x = F.grid_sample(x, xg1, mode='nearest', padding_mode='border', align_corners=False) + x = F.grid_sample(x, xg2, mode='nearest', padding_mode='reflection', align_corners=False) + x = F.grid_sample(x, xg1, mode='bicubic', padding_mode='zeros', align_corners=False) + x = F.grid_sample(x, xg2, mode='bicubic', padding_mode='border', align_corners=False) + x = F.grid_sample(x, xg1, mode='bicubic', padding_mode='reflection', align_corners=False) + x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='zeros', align_corners=True) + x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='border', align_corners=True) + x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='reflection', align_corners=True) + x = F.grid_sample(x, xg1, mode='nearest', padding_mode='zeros', align_corners=True) + x = F.grid_sample(x, xg2, mode='nearest', padding_mode='border', align_corners=True) + x = F.grid_sample(x, xg1, mode='nearest', padding_mode='reflection', align_corners=True) + x = F.grid_sample(x, xg2, mode='bicubic', padding_mode='zeros', align_corners=True) + x = F.grid_sample(x, xg1, mode='bicubic', padding_mode='border', align_corners=True) + x = F.grid_sample(x, xg2, mode='bicubic', padding_mode='reflection', align_corners=True) + + y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='zeros', align_corners=False) + y = F.grid_sample(y, yg2, mode='bilinear', padding_mode='border', align_corners=False) + y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='reflection', align_corners=False) + y = F.grid_sample(y, yg2, mode='nearest', padding_mode='zeros', align_corners=False) + y = F.grid_sample(y, yg1, mode='nearest', padding_mode='border', align_corners=False) + y = F.grid_sample(y, yg2, mode='nearest', padding_mode='reflection', align_corners=False) + y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='zeros', align_corners=True) + y = F.grid_sample(y, yg2, mode='bilinear', padding_mode='border', align_corners=True) + y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='reflection', align_corners=True) + y = F.grid_sample(y, yg2, mode='nearest', padding_mode='zeros', align_corners=True) + y = F.grid_sample(y, yg1, mode='nearest', padding_mode='border', align_corners=True) + y = F.grid_sample(y, yg2, mode='nearest', padding_mode='reflection', align_corners=True) + + return x, y + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 12, 16) + xg1 = torch.rand(1, 21, 27, 2) + xg2 = torch.rand(1, 12, 16, 2) + y = torch.rand(1, 5, 10, 12, 16) + yg1 = torch.rand(1, 10, 21, 27, 3) + yg2 = torch.rand(1, 10, 12, 16, 3) + + a0, a1 = net(x, xg1, xg2, y, yg1, yg2) + + # export torchscript + mod = torch.jit.trace(net, (x, xg1, xg2, y, yg1, yg2)) + mod.save("test_F_grid_sample.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_F_grid_sample.pt 
inputshape=[1,3,12,16],[1,21,27,2],[1,12,16,2],[1,5,10,12,16],[1,10,21,27,3],[1,10,12,16,3]") + + # ncnn inference + import test_F_grid_sample_ncnn + b0, b1 = test_F_grid_sample_ncnn.test_inference() + + return torch.allclose(a0, b0, 1e-4, 1e-4) and torch.allclose(a1, b1, 1e-4, 1e-4) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_F_group_norm.py b/tools/pnnx/tests/ncnn/test_F_group_norm.py index 0e4710fbbd6d..6b0347950450 100644 --- a/tools/pnnx/tests/ncnn/test_F_group_norm.py +++ b/tools/pnnx/tests/ncnn/test_F_group_norm.py @@ -20,29 +20,37 @@ class Model(nn.Module): def __init__(self): super(Model, self).__init__() + self.w3 = nn.Parameter(torch.rand(16)) + self.b3 = nn.Parameter(torch.rand(16)) + self.w4 = nn.Parameter(torch.rand(12)) + self.b4 = nn.Parameter(torch.rand(12)) self.w5 = nn.Parameter(torch.rand(32)) self.b5 = nn.Parameter(torch.rand(32)) - def forward(self, z): + def forward(self, x, y, z): + x = F.group_norm(x, 4, self.w3, self.b3) + y = F.group_norm(y, 6, self.w4, self.b4) z = F.group_norm(z, 8, self.w5, self.b5, eps=1e-2) - return z + return x, y, z def test(): net = Model() net.eval() torch.manual_seed(0) + x = torch.rand(1, 16) + y = torch.rand(1, 12, 16) z = torch.rand(1, 32, 12, 16) - a = net(z) + a = net(x, y, z) # export torchscript - mod = torch.jit.trace(net, z) + mod = torch.jit.trace(net, (x, y, z)) mod.save("test_F_group_norm.pt") # torchscript to pnnx import os - os.system("../../src/pnnx test_F_group_norm.pt inputshape=[1,32,12,16]") + os.system("../../src/pnnx test_F_group_norm.pt inputshape=[1,16],[1,12,16],[1,32,12,16]") # ncnn inference import test_F_group_norm_ncnn diff --git a/tools/pnnx/tests/ncnn/test_F_softmax.py b/tools/pnnx/tests/ncnn/test_F_softmax.py index f7d612eda643..83a5324f49da 100644 --- a/tools/pnnx/tests/ncnn/test_F_softmax.py +++ b/tools/pnnx/tests/ncnn/test_F_softmax.py @@ -24,7 +24,8 @@ def forward(self, x, y, z): x = F.softmax(x, 0) y = F.softmax(y, 1) z = F.softmax(z, 2) - return x, y, z + z2 = F.softmax(z, -1) + return x, y, z, z2 def test(): net = Model() diff --git a/tools/pnnx/tests/ncnn/test_F_unfold.py b/tools/pnnx/tests/ncnn/test_F_unfold.py new file mode 100644 index 000000000000..e8e1a603cc3a --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_F_unfold.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x): + x0 = F.unfold(x, kernel_size=3) + x1 = F.unfold(x, kernel_size=(2,4), stride=(2,1), padding=2, dilation=1) + x2 = F.unfold(x, kernel_size=(1,3), stride=1, padding=(2,4), dilation=(1,2)) + + return x0, x1, x2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 64, 64) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_F_unfold.pt") + + # torchscript to ncnn + import os + os.system("../../src/pnnx test_F_unfold.pt inputshape=[1,12,64,64]") + + # ncnn inference + import test_F_unfold_ncnn + b = test_F_unfold_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_nn_Fold.py b/tools/pnnx/tests/ncnn/test_nn_Fold.py new file mode 100644 index 000000000000..8b07b2b5d385 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_nn_Fold.py @@ -0,0 +1,67 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.fold_0 = nn.Fold(output_size=22, kernel_size=3) + self.fold_1 = nn.Fold(output_size=(17,18), kernel_size=(2,4), stride=(2,1), padding=2, dilation=1) + self.fold_2 = nn.Fold(output_size=(5,11), kernel_size=(2,3), stride=1, padding=(2,4), dilation=(1,2)) + + def forward(self, x, y, z): + x = self.fold_0(x) + y = self.fold_1(y) + z = self.fold_2(z) + + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 108, 400) + y = torch.rand(1, 96, 190) + z = torch.rand(1, 36, 120) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_nn_Fold.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_nn_Fold.pt inputshape=[1,108,400],[1,96,190],[1,36,120]") + + # ncnn inference + import test_nn_Fold_ncnn + b = test_nn_Fold_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_nn_GLU.py b/tools/pnnx/tests/ncnn/test_nn_GLU.py new file mode 100644 index 000000000000..49a018ee2c4a --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_nn_GLU.py @@ -0,0 +1,67 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. 
+# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.GLU(dim=0) + self.act_1 = nn.GLU(dim=1) + self.act_2 = nn.GLU(dim=2) + self.act_3 = nn.GLU(dim=-1) + + def forward(self, x, y, z): + x = self.act_0(x) + y = self.act_1(y) + z = self.act_2(z) + z2 = self.act_3(z) + return x, y, z, z2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(12) + y = torch.rand(12, 64) + z = torch.rand(12, 24, 64) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_nn_GLU.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_nn_GLU.pt inputshape=[12],[12,64],[12,24,64]") + + # ncnn inference + import test_nn_GLU_ncnn + b = test_nn_GLU_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_nn_GroupNorm.py b/tools/pnnx/tests/ncnn/test_nn_GroupNorm.py index 71f7d684fc5d..c016e7ae250e 100644 --- a/tools/pnnx/tests/ncnn/test_nn_GroupNorm.py +++ b/tools/pnnx/tests/ncnn/test_nn_GroupNorm.py @@ -24,34 +24,47 @@ def __init__(self): self.gn_1 = nn.GroupNorm(num_groups=12, num_channels=12, eps=1e-2, affine=True) self.gn_2 = nn.GroupNorm(num_groups=1, num_channels=12, eps=1e-4, affine=True) - def forward(self, x): + def forward(self, x, y, z): x = self.gn_0(x) x = self.gn_1(x) x = self.gn_2(x) - return x + + y = self.gn_0(y) + y = self.gn_1(y) + y = self.gn_2(y) + + z = self.gn_0(z) + z = self.gn_1(z) + z = self.gn_2(z) + return x, y, z def test(): net = Model() net.eval() torch.manual_seed(0) - x = torch.rand(1, 12, 24, 64) + x = torch.rand(1, 12, 64) + y = torch.rand(1, 12, 24, 64) + z = torch.rand(1, 12, 24, 32, 64) - a0 = net(x) + a = net(x, y, z) # export torchscript - mod = torch.jit.trace(net, x) + mod = torch.jit.trace(net, (x, y, z)) mod.save("test_nn_GroupNorm.pt") # torchscript to pnnx import os - os.system("../../src/pnnx test_nn_GroupNorm.pt inputshape=[1,12,24,64]") + os.system("../../src/pnnx test_nn_GroupNorm.pt inputshape=[1,12,64],[1,12,24,64],[1,12,24,32,64]") # ncnn inference import test_nn_GroupNorm_ncnn - b0 = test_nn_GroupNorm_ncnn.test_inference() + b = test_nn_GroupNorm_ncnn.test_inference() - return torch.allclose(a0, b0, 1e-4, 1e-4) + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True if __name__ == "__main__": if test(): diff --git a/tools/pnnx/tests/ncnn/test_nn_LSTM.py b/tools/pnnx/tests/ncnn/test_nn_LSTM.py index 575d44aacd14..a51f5e940547 100644 --- a/tools/pnnx/tests/ncnn/test_nn_LSTM.py +++ b/tools/pnnx/tests/ncnn/test_nn_LSTM.py @@ -22,15 +22,15 @@ def __init__(self): self.lstm_0_0 = nn.LSTM(input_size=32, hidden_size=16) self.lstm_0_1 = nn.LSTM(input_size=16, hidden_size=16, num_layers=3, bias=False) - 
self.lstm_0_2 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, bidirectional=True) - self.lstm_0_3 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, bidirectional=True) - self.lstm_0_4 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, bidirectional=True) + self.lstm_0_2 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, bidirectional=True, proj_size=10) + self.lstm_0_3 = nn.LSTM(input_size=20, hidden_size=16, num_layers=4, bias=True, bidirectional=True, proj_size=10) + self.lstm_0_4 = nn.LSTM(input_size=20, hidden_size=16, num_layers=4, bias=True, bidirectional=True, proj_size=10) self.lstm_1_0 = nn.LSTM(input_size=25, hidden_size=16, batch_first=True) self.lstm_1_1 = nn.LSTM(input_size=16, hidden_size=16, num_layers=3, bias=False, batch_first=True) - self.lstm_1_2 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True) - self.lstm_1_3 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True) - self.lstm_1_4 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True) + self.lstm_1_2 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True, proj_size=10) + self.lstm_1_3 = nn.LSTM(input_size=20, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True, proj_size=10) + self.lstm_1_4 = nn.LSTM(input_size=20, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True, proj_size=10) def forward(self, x, y): x = x.permute(1, 0, 2) @@ -38,14 +38,14 @@ def forward(self, x, y): x0, _ = self.lstm_0_0(x) x1, _ = self.lstm_0_1(x0) x2, (h0, c0) = self.lstm_0_2(x1) - x3, (h1, c1) = self.lstm_0_3(x1, (h0, c0)) - x4, _ = self.lstm_0_4(x1, (h1, c1)) + x3, (h1, c1) = self.lstm_0_3(x2, (h0, c0)) + x4, _ = self.lstm_0_4(x3, (h1, c1)) y0, _ = self.lstm_1_0(y) y1, _ = self.lstm_1_1(y0) y2, (h2, c2) = self.lstm_1_2(y1) - y3, (h3, c3) = self.lstm_1_3(y1, (h2, c2)) - y4, _ = self.lstm_1_4(y1, (h3, c3)) + y3, (h3, c3) = self.lstm_1_3(y2, (h2, c2)) + y4, _ = self.lstm_1_4(y3, (h3, c3)) x2 = x2.permute(1, 0, 2) x3 = x3.permute(1, 0, 2) diff --git a/tools/pnnx/tests/ncnn/test_nn_MultiheadAttention.py b/tools/pnnx/tests/ncnn/test_nn_MultiheadAttention.py index 05a4f2c0b048..a35d05b0f282 100644 --- a/tools/pnnx/tests/ncnn/test_nn_MultiheadAttention.py +++ b/tools/pnnx/tests/ncnn/test_nn_MultiheadAttention.py @@ -22,40 +22,61 @@ def __init__(self): super(Model, self).__init__() self.attention_0_0 = nn.MultiheadAttention(embed_dim=64, num_heads=4) + self.attention_0_1 = nn.MultiheadAttention(embed_dim=40, num_heads=4, kdim=30, vdim=20) if version.parse(torch.__version__) >= version.parse('1.9'): - self.attention_1_0 = nn.MultiheadAttention(embed_dim=40, num_heads=4, batch_first=True) + self.attention_1_0 = nn.MultiheadAttention(embed_dim=64, num_heads=4, batch_first=True) + self.attention_1_1 = nn.MultiheadAttention(embed_dim=40, num_heads=4, kdim=30, vdim=20, batch_first=True) - def forward(self, x, y): - x0, _ = self.attention_0_0(x, x, x) + def forward(self, xq, xk, xv, yq, yk, yv): + x0, _ = self.attention_0_0(xq, xq, xq) + x1, _ = self.attention_0_0(xq, xk, xv) + x2, _ = self.attention_0_0(xq, xk, xk) + x3, _ = self.attention_0_1(yq, yk, yv) if version.parse(torch.__version__) < version.parse('1.9'): - return x0 + return x0, x1, x2, x3 - y0, _ = self.attention_1_0(y, y, y) + xq = xq.transpose(0, 1) + xk = xk.transpose(0, 1) + xv = 
xv.transpose(0, 1) + yq = yq.transpose(0, 1) + yk = yk.transpose(0, 1) + yv = yv.transpose(0, 1) - return x0, y0 + y0, _ = self.attention_1_0(xq, xq, xq) + y1, _ = self.attention_1_0(xq, xk, xv) + y2, _ = self.attention_1_0(xq, xk, xk) + y3, _ = self.attention_1_1(yq, yk, yv) + + return x0, x1, x2, x3, y0, y1, y2, y3 def test(): + torch.set_grad_enabled(False) + net = Model().half().float() net.eval() torch.manual_seed(0) - x = torch.rand(1, 1, 64) - y = torch.rand(1, 15, 40) + xq = torch.rand(20, 1, 64) + xk = torch.rand(20, 1, 64) + xv = torch.rand(20, 1, 64) + yq = torch.rand(15, 1, 40) + yk = torch.rand(24, 1, 30) + yv = torch.rand(24, 1, 20) - a = net(x, y) + a = net(xq, xk, xv, yq, yk, yv) # export torchscript if version.parse(torch.__version__) >= version.parse('1.12.0'): - mod = torch.jit.trace(net, (x, y), check_trace=False) + mod = torch.jit.trace(net, (xq, xk, xv, yq, yk, yv), check_trace=False) else: - mod = torch.jit.trace(net, (x, y)) + mod = torch.jit.trace(net, (xq, xk, xv, yq, yk, yv)) mod.save("test_nn_MultiheadAttention.pt") # torchscript to pnnx import os - os.system("../../src/pnnx test_nn_MultiheadAttention.pt inputshape=[1,1,64],[1,15,40]") + os.system("../../src/pnnx test_nn_MultiheadAttention.pt inputshape=[20,1,64],[20,1,64],[20,1,64],[15,1,40],[24,1,30],[24,1,20]") # ncnn inference import test_nn_MultiheadAttention_ncnn diff --git a/tools/pnnx/tests/ncnn/test_nn_Softmax.py b/tools/pnnx/tests/ncnn/test_nn_Softmax.py index aa9e3b737a8a..d4ca3df0ff26 100644 --- a/tools/pnnx/tests/ncnn/test_nn_Softmax.py +++ b/tools/pnnx/tests/ncnn/test_nn_Softmax.py @@ -23,12 +23,14 @@ def __init__(self): self.act_0 = nn.Softmax(dim=0) self.act_1 = nn.Softmax(dim=1) self.act_2 = nn.Softmax(dim=2) + self.act_3 = nn.Softmax(dim=-1) def forward(self, x, y, z): x = self.act_0(x) y = self.act_1(y) z = self.act_2(z) - return x, y, z + z2 = self.act_3(z) + return x, y, z, z2 def test(): net = Model() diff --git a/tools/pnnx/tests/ncnn/test_nn_Softmax2d.py b/tools/pnnx/tests/ncnn/test_nn_Softmax2d.py new file mode 100644 index 000000000000..c92537e90348 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_nn_Softmax2d.py @@ -0,0 +1,56 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Softmax2d() + + def forward(self, x): + x = self.act_0(x) + return x + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 24, 64) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_nn_Softmax2d.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_nn_Softmax2d.pt inputshape=[1,12,24,64]") + + # ncnn inference + import test_nn_Softmax2d_ncnn + b = test_nn_Softmax2d_ncnn.test_inference() + + return torch.allclose(a, b, 1e-4, 1e-4) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_nn_Unfold.py b/tools/pnnx/tests/ncnn/test_nn_Unfold.py new file mode 100644 index 000000000000..8d618f761507 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_nn_Unfold.py @@ -0,0 +1,65 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.unfold_0 = nn.Unfold(kernel_size=3) + self.unfold_1 = nn.Unfold(kernel_size=(2,4), stride=(2,1), padding=2, dilation=1) + self.unfold_2 = nn.Unfold(kernel_size=(1,3), stride=1, padding=(2,4), dilation=(1,2)) + + def forward(self, x): + x0 = self.unfold_0(x) + x1 = self.unfold_1(x) + x2 = self.unfold_2(x) + + return x0, x1, x2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 64, 64) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_nn_Unfold.pt") + + # torchscript to ncnn + import os + os.system("../../src/pnnx test_nn_Unfold.pt inputshape=[1,12,64,64]") + + # ncnn inference + import test_nn_Unfold_ncnn + b = test_nn_Unfold_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_abs.py b/tools/pnnx/tests/ncnn/test_torch_abs.py new file mode 100644 index 000000000000..8d824a11b1ea --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_abs.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.abs(x - 0.5) + y = torch.abs(y - 0.5) + z = torch.abs(z - 0.5) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_abs.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_abs.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_abs_ncnn + b = test_torch_abs_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_acos.py b/tools/pnnx/tests/ncnn/test_torch_acos.py new file mode 100644 index 000000000000..4fc0e43a37a8 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_acos.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.acos(x) + y = torch.acos(y) + z = torch.acos(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_acos.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_acos.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_acos_ncnn + b = test_torch_acos_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_asin.py b/tools/pnnx/tests/ncnn/test_torch_asin.py new file mode 100644 index 000000000000..24099e592145 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_asin.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. 
+# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.asin(x) + y = torch.asin(y) + z = torch.asin(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_asin.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_asin.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_asin_ncnn + b = test_torch_asin_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_atan.py b/tools/pnnx/tests/ncnn/test_torch_atan.py new file mode 100644 index 000000000000..bef6aae58913 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_atan.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.atan(x) + y = torch.atan(y) + z = torch.atan(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_atan.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_atan.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_atan_ncnn + b = test_torch_atan_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_ceil.py b/tools/pnnx/tests/ncnn/test_torch_ceil.py new file mode 100644 index 000000000000..4ee628adb964 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_ceil.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.ceil(x * 10) + y = torch.ceil(y * 10) + z = torch.ceil(z * 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_ceil.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_ceil.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_ceil_ncnn + b = test_torch_ceil_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_cos.py b/tools/pnnx/tests/ncnn/test_torch_cos.py new file mode 100644 index 000000000000..f32b1ff4e63a --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_cos.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.cos(x) + y = torch.cos(y) + z = torch.cos(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_cos.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_cos.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_cos_ncnn + b = test_torch_cos_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_exp.py b/tools/pnnx/tests/ncnn/test_torch_exp.py new file mode 100644 index 000000000000..5e608687c40c --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_exp.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.exp(x) + y = torch.exp(y) + z = torch.exp(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_exp.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_exp.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_exp_ncnn + b = test_torch_exp_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_floor.py b/tools/pnnx/tests/ncnn/test_torch_floor.py new file mode 100644 index 000000000000..e100d96de192 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_floor.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. 
+# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.floor(x * 10) + y = torch.floor(y * 10) + z = torch.floor(z * 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_floor.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_floor.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_floor_ncnn + b = test_torch_floor_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_log.py b/tools/pnnx/tests/ncnn/test_torch_log.py new file mode 100644 index 000000000000..a3583f472737 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_log.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.log(x) + y = torch.log(y) + z = torch.log(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_log.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_log.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_log_ncnn + b = test_torch_log_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_neg.py b/tools/pnnx/tests/ncnn/test_torch_neg.py new file mode 100644 index 000000000000..75197fa16fd6 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_neg.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.neg(x - 0.5) + y = torch.neg(y - 0.5) + z = torch.neg(z - 0.5) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_neg.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_neg.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_neg_ncnn + b = test_torch_neg_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_pow.py b/tools/pnnx/tests/ncnn/test_torch_pow.py new file mode 100644 index 000000000000..26c850cbaad8 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_pow.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.pow(x, y) + out1 = torch.pow(y, y) + out2 = torch.pow(z, torch.ones_like(z) + 0.5) + return out0, out1, out2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_pow.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_pow.pt inputshape=[3,16],[3,16],[5,9,3]") + + # ncnn inference + import test_torch_pow_ncnn + b = test_torch_pow_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_reciprocal.py b/tools/pnnx/tests/ncnn/test_torch_reciprocal.py new file mode 100644 index 000000000000..83f2c89c96bb --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_reciprocal.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.reciprocal(x) + y = torch.reciprocal(y) + z = torch.reciprocal(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_reciprocal.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_reciprocal.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_reciprocal_ncnn + b = test_torch_reciprocal_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_rsqrt.py b/tools/pnnx/tests/ncnn/test_torch_rsqrt.py new file mode 100644 index 000000000000..b3a34c70d519 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_rsqrt.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.rsqrt(x) + y = torch.rsqrt(y) + z = torch.rsqrt(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_rsqrt.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_rsqrt.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_rsqrt_ncnn + b = test_torch_rsqrt_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-3, 1e-3): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_sin.py b/tools/pnnx/tests/ncnn/test_torch_sin.py new file mode 100644 index 000000000000..d984a269e1c5 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_sin.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.sin(x) + y = torch.sin(y) + z = torch.sin(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_sin.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_sin.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_sin_ncnn + b = test_torch_sin_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_sqrt.py b/tools/pnnx/tests/ncnn/test_torch_sqrt.py new file mode 100644 index 000000000000..bfd5ddb3d2a7 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_sqrt.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.sqrt(x) + y = torch.sqrt(y) + z = torch.sqrt(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_sqrt.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_sqrt.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_sqrt_ncnn + b = test_torch_sqrt_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_square.py b/tools/pnnx/tests/ncnn/test_torch_square.py new file mode 100644 index 000000000000..d5b5c8af9108 --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_square.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. 
+# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.square(x) + y = torch.square(y) + z = torch.square(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_square.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_square.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_square_ncnn + b = test_torch_square_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_squeeze.py b/tools/pnnx/tests/ncnn/test_torch_squeeze.py index 339cebe8f88e..807997677e43 100644 --- a/tools/pnnx/tests/ncnn/test_torch_squeeze.py +++ b/tools/pnnx/tests/ncnn/test_torch_squeeze.py @@ -35,7 +35,7 @@ def test(): x = torch.rand(1, 16) y = torch.rand(3, 1) z = torch.rand(5, 1, 11) - w = torch.rand(5, 9, 1) + w = torch.rand(5, 9, 1, 33) a = net(x, y, z, w) @@ -45,7 +45,7 @@ def test(): # torchscript to pnnx import os - os.system("../../src/pnnx test_torch_squeeze.pt inputshape=[1,16],[3,1],[5,1,11],[5,9,1]") + os.system("../../src/pnnx test_torch_squeeze.pt inputshape=[1,16],[3,1],[5,1,11],[5,9,1,33]") # ncnn inference import test_torch_squeeze_ncnn diff --git a/tools/pnnx/tests/ncnn/test_torch_tan.py b/tools/pnnx/tests/ncnn/test_torch_tan.py new file mode 100644 index 000000000000..b97bad2fd9ad --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_tan.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.tan(x) + y = torch.tan(y) + z = torch.tan(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_tan.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_tan.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_tan_ncnn + b = test_torch_tan_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_tanh.py b/tools/pnnx/tests/ncnn/test_torch_tanh.py new file mode 100644 index 000000000000..10b1b72ffcdc --- /dev/null +++ b/tools/pnnx/tests/ncnn/test_torch_tanh.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.tanh(x) + y = torch.tanh(y) + z = torch.tanh(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(5, 9, 11) + z = torch.rand(8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_tanh.pt") + + # torchscript to pnnx + import os + os.system("../../src/pnnx test_torch_tanh.pt inputshape=[3,16],[5,9,11],[8,5,9,10]") + + # ncnn inference + import test_torch_tanh_ncnn + b = test_torch_tanh_ncnn.test_inference() + + for a0, b0 in zip(a, b): + if not torch.allclose(a0, b0, 1e-4, 1e-4): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/ncnn/test_torch_unsqueeze.py b/tools/pnnx/tests/ncnn/test_torch_unsqueeze.py index baf121710751..b15a43419e40 100644 --- a/tools/pnnx/tests/ncnn/test_torch_unsqueeze.py +++ b/tools/pnnx/tests/ncnn/test_torch_unsqueeze.py @@ -20,12 +20,14 @@ class Model(nn.Module): def __init__(self): super(Model, self).__init__() - def forward(self, x, y): + def forward(self, x, y, z): x0 = torch.unsqueeze(x, 0) x1 = torch.unsqueeze(x, 1) y0 = torch.unsqueeze(y, 1) y1 = torch.unsqueeze(y, -1) - return x0, x1, y0, y1 + z0 = torch.unsqueeze(z, 0) + z1 = torch.unsqueeze(z, -2) + return x0, x1, y0, y1, z0, z1 def test(): net = Model() @@ -34,16 +36,17 @@ def test(): torch.manual_seed(0) x = torch.rand(16) y = torch.rand(9, 11) + z = torch.rand(4, 6, 7) - a = net(x, y) + a = net(x, y, z) # export torchscript - mod = torch.jit.trace(net, (x, y)) + mod = torch.jit.trace(net, (x, y, z)) mod.save("test_torch_unsqueeze.pt") # torchscript to pnnx import os - os.system("../../src/pnnx test_torch_unsqueeze.pt inputshape=[16],[9,11]") + os.system("../../src/pnnx test_torch_unsqueeze.pt inputshape=[16],[9,11],[4,6,7]") # ncnn inference import test_torch_unsqueeze_ncnn diff --git a/tools/pnnx/tests/test_F_fold.py b/tools/pnnx/tests/test_F_fold.py new file mode 100644 index 000000000000..68c5b566d567 --- /dev/null +++ b/tools/pnnx/tests/test_F_fold.py @@ -0,0 +1,60 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = F.fold(x, output_size=22, kernel_size=3) + y = F.fold(y, output_size=(17,18), kernel_size=(2,4), stride=(2,1), padding=2, dilation=1) + z = F.fold(z, output_size=(5,11), kernel_size=(2,3), stride=1, padding=(2,4), dilation=(1,2)) + + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 108, 400) + y = torch.rand(1, 96, 190) + z = torch.rand(1, 36, 120) + + a0, a1, a2 = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_F_fold.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_F_fold.pt inputshape=[1,108,400],[1,96,190],[1,36,120]") + + # pnnx inference + import test_F_fold_pnnx + b0, b1, b2 = test_F_fold_pnnx.test_inference() + + return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_F_glu.py b/tools/pnnx/tests/test_F_glu.py index ba8432875660..0c1868683447 100644 --- a/tools/pnnx/tests/test_F_glu.py +++ b/tools/pnnx/tests/test_F_glu.py @@ -38,7 +38,7 @@ def test(): y = torch.rand(12, 16) z = torch.rand(24, 28, 34) - x0, y0, y1, z0, z1, z2= net(x, y, z) + x0, y0, y1, z0, z1, z2 = net(x, y, z) # export torchscript mod = torch.jit.trace(net, (x, y, z)) @@ -46,13 +46,11 @@ def test(): # torchscript to pnnx import os - os.system("../src/pnnx test_F_glu.pt") + os.system("../src/pnnx test_F_glu.pt inputshape=[18],[12,16],[24,28,34]") # pnnx inference import test_F_glu_pnnx - m = test_F_glu_pnnx.Model() - m.eval() - x0p, y0p, y1p, z0p, z1p, z2p = m(x, y, z) + x0p, y0p, y1p, z0p, z1p, z2p = test_F_glu_pnnx.test_inference() return torch.equal(x0, x0p) and torch.equal(y0, y0p) and torch.equal(y1, y1p) \ and torch.equal(z0, z0p) and torch.equal(z1, z1p) and torch.equal(z2, z2p) diff --git a/tools/pnnx/tests/test_F_grid_sample.py b/tools/pnnx/tests/test_F_grid_sample.py index ae4ed354cdf3..8cb6d214568c 100644 --- a/tools/pnnx/tests/test_F_grid_sample.py +++ b/tools/pnnx/tests/test_F_grid_sample.py @@ -21,6 +21,12 @@ def __init__(self): super(Model, self).__init__() def forward(self, x, xg1, xg2, y, yg1, yg2): + # norm to -1 ~ 1 + xg1 = xg1 * 2 - 1 + xg2 = xg2 * 2 - 1 + yg1 = yg1 * 2 - 1 + yg2 = yg2 * 2 - 1 + x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='zeros', align_corners=False) x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='border', align_corners=False) x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='reflection', align_corners=False) diff --git a/tools/pnnx/tests/test_F_unfold.py b/tools/pnnx/tests/test_F_unfold.py new file mode 100644 index 000000000000..51f19a4f48a4 --- /dev/null +++ b/tools/pnnx/tests/test_F_unfold.py @@ -0,0 +1,58 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x): + x0 = F.unfold(x, kernel_size=3) + x1 = F.unfold(x, kernel_size=(2,4), stride=(2,1), padding=2, dilation=1) + x2 = F.unfold(x, kernel_size=(1,3), stride=1, padding=(2,4), dilation=(1,2)) + + return x0, x1, x2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 64, 64) + + a0, a1, a2 = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_F_unfold.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_F_unfold.pt inputshape=[1,12,64,64]") + + # pnnx inference + import test_F_unfold_pnnx + b0, b1, b2 = test_F_unfold_pnnx.test_inference() + + return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_Tensor_new_full.py b/tools/pnnx/tests/test_Tensor_new_full.py new file mode 100644 index 000000000000..f6855f201a75 --- /dev/null +++ b/tools/pnnx/tests/test_Tensor_new_full.py @@ -0,0 +1,62 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
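The next three tests cover Tensor.new_full, new_ones and new_zeros. These factory methods take a target shape (and, for new_full, a fill value) and return a tensor of that shape filled accordingly, inheriting dtype and device from the tensor they are called on; the source tensor's own shape plays no role. A quick illustration, not part of the test files:

import torch

x = torch.rand(1, 16)              # only dtype/device are inherited from x
a = x.new_full((2, 2), 1.5)        # 2x2 float32 tensor filled with 1.5
b = x.new_ones(3)                  # [1., 1., 1.]
c = x.new_zeros((1, 2, 1))         # zeros with shape (1, 2, 1)
assert a.dtype == x.dtype and a.shape == (2, 2)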
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x): + out0 = x.new_full((2,2), 1.5) + out1 = x.new_full((3,), 3) + out2 = x.new_full((4,5,6,7,8), -0.5) + out3 = x.new_full((1,2,1), 0) + out4 = x.new_full((3,3,3,3), 1) + return out0, out1, out2, out3, out4 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_Tensor_new_full.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_Tensor_new_full.pt inputshape=[1,16]") + + # pnnx inference + import test_Tensor_new_full_pnnx + b = test_Tensor_new_full_pnnx.test_inference() + + # test shape only for uninitialized data + for a0, b0 in zip(a, b): + if not a0.shape == b0.shape: + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_Tensor_new_ones.py b/tools/pnnx/tests/test_Tensor_new_ones.py new file mode 100644 index 000000000000..b1ee76b13c55 --- /dev/null +++ b/tools/pnnx/tests/test_Tensor_new_ones.py @@ -0,0 +1,62 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x): + out0 = x.new_ones((2,2)) + out1 = x.new_ones(3) + out2 = x.new_ones((4,5,6,7,8)) + out3 = x.new_ones((1,2,1)) + out4 = x.new_ones((3,3,3,3)) + return out0, out1, out2, out3, out4 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_Tensor_new_ones.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_Tensor_new_ones.pt inputshape=[1,16]") + + # pnnx inference + import test_Tensor_new_ones_pnnx + b = test_Tensor_new_ones_pnnx.test_inference() + + # test shape only for uninitialized data + for a0, b0 in zip(a, b): + if not a0.shape == b0.shape: + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_Tensor_new_zeros.py b/tools/pnnx/tests/test_Tensor_new_zeros.py new file mode 100644 index 000000000000..abe87abbc4a2 --- /dev/null +++ b/tools/pnnx/tests/test_Tensor_new_zeros.py @@ -0,0 +1,62 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x): + out0 = x.new_zeros((2,2)) + out1 = x.new_zeros(3) + out2 = x.new_zeros((4,5,6,7,8)) + out3 = x.new_zeros((1,2,1)) + out4 = x.new_zeros((3,3,3,3)) + return out0, out1, out2, out3, out4 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 16) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_Tensor_new_zeros.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_Tensor_new_zeros.pt inputshape=[1,16]") + + # pnnx inference + import test_Tensor_new_zeros_pnnx + b = test_Tensor_new_zeros_pnnx.test_inference() + + # test shape only for uninitialized data + for a0, b0 in zip(a, b): + if not a0.shape == b0.shape: + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_Tensor_slice_copy.py b/tools/pnnx/tests/test_Tensor_slice_copy.py new file mode 100644 index 000000000000..e3c76a2b867f --- /dev/null +++ b/tools/pnnx/tests/test_Tensor_slice_copy.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x): + x = x.clone() + x[2:10,...] 
+= 1 + x[...,1] = x[...,-1] * 3 + y = x.clone() + x[:,:,3,::2].clamp_(0, 0.5) + x[:,:,3,::2] = x[:,:,3,::2].exp_() + x[:,:,::2,:] = y[:,:,::2,:].pow(2) + x[:,:,:,:] = x[:,:,:,:] / 2 + return x + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(18, 15, 19, 20) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_Tensor_slice_copy.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_Tensor_slice_copy.pt inputshape=[18,15,19,20]") + + # pnnx inference + import test_Tensor_slice_copy_pnnx + b = test_Tensor_slice_copy_pnnx.test_inference() + + return torch.equal(a, b) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_nn_Fold.py b/tools/pnnx/tests/test_nn_Fold.py new file mode 100644 index 000000000000..8f53639db2ab --- /dev/null +++ b/tools/pnnx/tests/test_nn_Fold.py @@ -0,0 +1,64 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.fold_0 = nn.Fold(output_size=22, kernel_size=3) + self.fold_1 = nn.Fold(output_size=(17,18), kernel_size=(2,4), stride=(2,1), padding=2, dilation=1) + self.fold_2 = nn.Fold(output_size=(5,11), kernel_size=(2,3), stride=1, padding=(2,4), dilation=(1,2)) + + def forward(self, x, y, z): + x = self.fold_0(x) + y = self.fold_1(y) + z = self.fold_2(z) + + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 108, 400) + y = torch.rand(1, 96, 190) + z = torch.rand(1, 36, 120) + + a0, a1, a2 = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_nn_Fold.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_nn_Fold.pt inputshape=[1,108,400],[1,96,190],[1,36,120]") + + # pnnx inference + import test_nn_Fold_pnnx + b0, b1, b2 = test_nn_Fold_pnnx.test_inference() + + return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_nn_GLU.py b/tools/pnnx/tests/test_nn_GLU.py new file mode 100644 index 000000000000..0643f3643555 --- /dev/null +++ b/tools/pnnx/tests/test_nn_GLU.py @@ -0,0 +1,79 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.glu0 = nn.GLU(dim=0) + self.glu1 = nn.GLU(dim=1) + self.glu2 = nn.GLU(dim=2) + + def forward(self, x, y, z): + x0 = self.glu0(x) + + y0 = self.glu0(y) + y1 = self.glu1(y) + + z0 = self.glu0(z) + z1 = self.glu1(z) + z2 = self.glu2(z) + return x0, y0, y1, z0, z1, z2 + + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(18) + y = torch.rand(12, 16) + z = torch.rand(24, 28, 34) + + x0, y0, y1, z0, z1, z2 = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_nn_GLU.pt") + + # torchscript to pnnx + import os + + os.system("../src/pnnx test_nn_GLU.pt inputshape=[18],[12,16],[24,28,34]") + + # pnnx inference + import test_nn_GLU_pnnx + + x0p, y0p, y1p, z0p, z1p, z2p = test_nn_GLU_pnnx.test_inference() + + return ( + torch.equal(x0, x0p) + and torch.equal(y0, y0p) + and torch.equal(y1, y1p) + and torch.equal(z0, z0p) + and torch.equal(z1, z1p) + and torch.equal(z2, z2p) + ) + + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_nn_LSTM.py b/tools/pnnx/tests/test_nn_LSTM.py index 36274c8fc387..33c54219050c 100644 --- a/tools/pnnx/tests/test_nn_LSTM.py +++ b/tools/pnnx/tests/test_nn_LSTM.py @@ -22,24 +22,24 @@ def __init__(self): self.lstm_0_0 = nn.LSTM(input_size=32, hidden_size=16) self.lstm_0_1 = nn.LSTM(input_size=16, hidden_size=16, num_layers=3, bias=False) - self.lstm_0_2 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, bidirectional=True) - self.lstm_0_3 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, bidirectional=True) + self.lstm_0_2 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, bidirectional=True, proj_size=10) + self.lstm_0_3 = nn.LSTM(input_size=20, hidden_size=16, num_layers=4, bias=True, bidirectional=True, proj_size=10) self.lstm_1_0 = nn.LSTM(input_size=25, hidden_size=16, batch_first=True) self.lstm_1_1 = nn.LSTM(input_size=16, hidden_size=16, num_layers=3, bias=False, batch_first=True) - self.lstm_1_2 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True) - self.lstm_1_3 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True) + self.lstm_1_2 = nn.LSTM(input_size=16, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True, proj_size=10) + self.lstm_1_3 = nn.LSTM(input_size=20, hidden_size=16, num_layers=4, bias=True, batch_first=True, bidirectional=True, proj_size=10) def forward(self, x, y): x0, (h0, c0) = self.lstm_0_0(x) x1, (h1, c1) = self.lstm_0_1(x0) x2, (h2, c2) = self.lstm_0_2(x1) - x3, (h3, c3) = self.lstm_0_3(x1, (h2, c2)) + x3, (h3, c3) = self.lstm_0_3(x2, (h2, c2)) y0, (h4, c4) = self.lstm_1_0(y) y1, (h5, c5) = self.lstm_1_1(y0) y2, (h6, c6) = self.lstm_1_2(y1) - y3, (h7, c7) = self.lstm_1_3(y1, (h6, c6)) + y3, (h7, c7) = self.lstm_1_3(y2, (h6, c6)) return x2, x3, h0, h1, h2, h3, c0, c1, c2, c3, y2, y3, h4, 
h5, h6, h7, c4, c5, c6, c7 def test(): diff --git a/tools/pnnx/tests/test_nn_MultiheadAttention.py b/tools/pnnx/tests/test_nn_MultiheadAttention.py index 67dabab9532f..cc222621c20f 100644 --- a/tools/pnnx/tests/test_nn_MultiheadAttention.py +++ b/tools/pnnx/tests/test_nn_MultiheadAttention.py @@ -24,31 +24,61 @@ def __init__(self): self.attention_0_0 = nn.MultiheadAttention(embed_dim=64, num_heads=4) self.attention_0_1 = nn.MultiheadAttention(embed_dim=64, num_heads=8, bias=False, add_bias_kv=False, add_zero_attn=False) self.attention_0_2 = nn.MultiheadAttention(embed_dim=64, num_heads=16, bias=True, add_bias_kv=True, add_zero_attn=True) + self.attention_0_3 = nn.MultiheadAttention(embed_dim=32, num_heads=8, bias=True) + self.attention_0_4 = nn.MultiheadAttention(embed_dim=40, num_heads=4, kdim=30, vdim=20) + self.attention_0_5 = nn.MultiheadAttention(embed_dim=40, num_heads=8, kdim=30, vdim=20, bias=False, add_bias_kv=False, add_zero_attn=False) + self.attention_0_6 = nn.MultiheadAttention(embed_dim=40, num_heads=10, kdim=30, vdim=20, bias=True, add_bias_kv=True, add_zero_attn=True) + if version.parse(torch.__version__) >= version.parse('1.9'): - self.attention_1_0 = nn.MultiheadAttention(embed_dim=40, num_heads=4, batch_first=True) - self.attention_1_1 = nn.MultiheadAttention(embed_dim=40, num_heads=8, bias=False, add_bias_kv=False, add_zero_attn=False, batch_first=True) - self.attention_1_2 = nn.MultiheadAttention(embed_dim=40, num_heads=10, bias=True, add_bias_kv=True, add_zero_attn=True, batch_first=True) + self.attention_1_0 = nn.MultiheadAttention(embed_dim=64, num_heads=4, batch_first=True) + self.attention_1_1 = nn.MultiheadAttention(embed_dim=64, num_heads=8, bias=False, add_bias_kv=False, add_zero_attn=False, batch_first=True) + self.attention_1_2 = nn.MultiheadAttention(embed_dim=64, num_heads=16, bias=True, add_bias_kv=True, add_zero_attn=True, batch_first=True) + self.attention_1_3 = nn.MultiheadAttention(embed_dim=32, num_heads=8, bias=True, batch_first=True) - def forward(self, xq, xk, xv, z, yq, yk, yv, w): + self.attention_1_4 = nn.MultiheadAttention(embed_dim=40, num_heads=4, kdim=30, vdim=20, batch_first=True) + self.attention_1_5 = nn.MultiheadAttention(embed_dim=40, num_heads=8, kdim=30, vdim=20, bias=False, add_bias_kv=False, add_zero_attn=False, batch_first=True) + self.attention_1_6 = nn.MultiheadAttention(embed_dim=40, num_heads=10, kdim=30, vdim=20, bias=True, add_bias_kv=True, add_zero_attn=True, batch_first=True) + + def forward(self, xq, xk, xv, z, yq, yk, yv): x0, x0w = self.attention_0_0(xq, xk, xv) x1, x1w = self.attention_0_1(xq, xk, xv) - x2, x2w = self.attention_0_2(xq, xk, xv) + x2, x2w = self.attention_0_2(xq, xk, xk) + x3, _ = self.attention_0_3(z, z, z) + x4, x4w = self.attention_0_4(yq, yk, yv) + x5, x5w = self.attention_0_5(yq, yk, yv) + x6, x6w = self.attention_0_6(yq, yk, yv) + if version.parse(torch.__version__) < version.parse('1.9'): - return x0, x0w, x1, x1w, x2, x2w, x3 + return x0, x0w, x1, x1w, x2, x2w, x3, x4, x4w, x5, x5w, x6, x6w + + xq = xq.transpose(0, 1) + xk = xk.transpose(0, 1) + xv = xv.transpose(0, 1) + z = z.transpose(0, 1) + yq = yq.transpose(0, 1) + yk = yk.transpose(0, 1) + yv = yv.transpose(0, 1) + + y0, y0w = self.attention_1_0(xq, xk, xv) + y1, y1w = self.attention_1_1(xq, xk, xv) + y2, y2w = self.attention_1_2(xq, xk, xk) - y0, y0w = self.attention_1_0(yq, yk, yv) - y1, y1w = self.attention_1_1(yq, yk, yv) - y2, y2w = self.attention_1_2(yq, yk, yv) - y3, _ = self.attention_1_3(w, w, w) + y3, _ = self.attention_1_3(z, z, 
z) - return x0, x0w, x1, x1w, x2, x2w, x3, y0, y0w, y1, y1w, y2, y2w, y3 + y4, y4w = self.attention_1_4(yq, yk, yv) + y5, y5w = self.attention_1_5(yq, yk, yv) + y6, y6w = self.attention_1_6(yq, yk, yv) + + return x0, x0w, x1, x1w, x2, x2w, x3, x4, x4w, x5, x5w, x6, x6w, y0, y0w, y1, y1w, y2, y2w, y3, y4, y4w, y5, y5w, y6, y6w def test(): + torch.set_grad_enabled(False) + net = Model() net.eval() @@ -57,24 +87,23 @@ def test(): xk = torch.rand(20, 1, 64) xv = torch.rand(20, 1, 64) z = torch.rand(30, 1, 32) - yq = torch.rand(1, 15, 40) - yk = torch.rand(1, 24, 40) - yv = torch.rand(1, 24, 40) - w = torch.rand(1, 20, 32) + yq = torch.rand(15, 1, 40) + yk = torch.rand(24, 1, 30) + yv = torch.rand(24, 1, 20) - a = net(xq, xk, xv, z, yq, yk, yv, w) + a = net(xq, xk, xv, z, yq, yk, yv) # export torchscript print(torch.__version__) if version.parse(torch.__version__) >= version.parse('1.12.0'): - mod = torch.jit.trace(net, (xq, xk, xv, z, yq, yk, yv, w), check_trace=False) + mod = torch.jit.trace(net, (xq, xk, xv, z, yq, yk, yv), check_trace=False) else: - mod = torch.jit.trace(net, (xq, xk, xv, z, yq, yk, yv, w)) + mod = torch.jit.trace(net, (xq, xk, xv, z, yq, yk, yv)) mod.save("test_nn_MultiheadAttention.pt") # torchscript to pnnx import os - os.system("../src/pnnx test_nn_MultiheadAttention.pt inputshape=[20,1,64],[20,1,64],[20,1,64],[30,1,32],[1,15,40],[1,24,40],[1,24,40],[1,20,32]") + os.system("../src/pnnx test_nn_MultiheadAttention.pt inputshape=[20,1,64],[20,1,64],[20,1,64],[30,1,32],[15,1,40],[24,1,30],[24,1,20]") # pnnx inference import test_nn_MultiheadAttention_pnnx diff --git a/tools/pnnx/tests/test_nn_Softmax2d.py b/tools/pnnx/tests/test_nn_Softmax2d.py new file mode 100644 index 000000000000..e75ce61d252f --- /dev/null +++ b/tools/pnnx/tests/test_nn_Softmax2d.py @@ -0,0 +1,56 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
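The Softmax2d test below exercises nn.Softmax2d, which applies softmax over the channel dimension independently at every spatial location, so for the (N, C, H, W) input used here it is numerically the same as nn.Softmax(dim=1). A short equivalence sketch, illustrative only:

import torch
import torch.nn as nn

x = torch.rand(1, 12, 24, 64)
a = nn.Softmax2d()(x)        # softmax across the 12 channels at each (h, w)
b = nn.Softmax(dim=1)(x)
assert torch.allclose(a, b)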
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.act_0 = nn.Softmax2d() + + def forward(self, x): + x = self.act_0(x) + return x + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 24, 64) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_nn_Softmax2d.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_nn_Softmax2d.pt inputshape=[1,12,24,64]") + + # pnnx inference + import test_nn_Softmax2d_pnnx + b = test_nn_Softmax2d_pnnx.test_inference() + + return torch.equal(a, b) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_nn_Unfold.py b/tools/pnnx/tests/test_nn_Unfold.py new file mode 100644 index 000000000000..aece085668c6 --- /dev/null +++ b/tools/pnnx/tests/test_nn_Unfold.py @@ -0,0 +1,62 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F +from packaging import version + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.unfold_0 = nn.Unfold(kernel_size=3) + self.unfold_1 = nn.Unfold(kernel_size=(2,4), stride=(2,1), padding=2, dilation=1) + self.unfold_2 = nn.Unfold(kernel_size=(1,3), stride=1, padding=(2,4), dilation=(1,2)) + + def forward(self, x): + x0 = self.unfold_0(x) + x1 = self.unfold_1(x) + x2 = self.unfold_2(x) + + return x0, x1, x2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 64, 64) + + a0, a1, a2 = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_nn_Unfold.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_nn_Unfold.pt inputshape=[1,12,64,64]") + + # pnnx inference + import test_nn_Unfold_pnnx + b0, b1, b2 = test_nn_Unfold_pnnx.test_inference() + + return torch.equal(a0, b0) and torch.equal(a1, b1) and torch.equal(a2, b2) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_pnnx_fuse_adjacent_reshape.py b/tools/pnnx/tests/test_pnnx_fuse_adjacent_reshape.py new file mode 100644 index 000000000000..8f44987fb5d9 --- /dev/null +++ b/tools/pnnx/tests/test_pnnx_fuse_adjacent_reshape.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = x.view(1, 1, 8).reshape(2, -1) + y = y.reshape(-1, x.size(0)).unsqueeze(1) + z = z.unsqueeze(0).unsqueeze(2).view(-1) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(8) + y = torch.rand(9, 10) + z = torch.rand(8, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_pnnx_fuse_adjacent_reshape.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_pnnx_fuse_adjacent_reshape.pt inputshape=[8],[9,10],[8,9,10]") + + # pnnx inference + import test_pnnx_fuse_adjacent_reshape_pnnx + b = test_pnnx_fuse_adjacent_reshape_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_pnnx_fuse_pad_conv1d.py b/tools/pnnx/tests/test_pnnx_fuse_pad_conv1d.py new file mode 100644 index 000000000000..5e1e456f0013 --- /dev/null +++ b/tools/pnnx/tests/test_pnnx_fuse_pad_conv1d.py @@ -0,0 +1,84 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
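One identity behind the pad/conv fusion tests that follow is that a standalone zero padding placed in front of a convolution with no padding is numerically the same as that convolution with its padding attribute enlarged, which is the kind of rewrite the fuse_pad_conv passes are expected to preserve. A minimal sketch of that equivalence for the 1d case, with layer sizes chosen for illustration rather than taken from the test:

import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)
conv = nn.Conv1d(in_channels=4, out_channels=6, kernel_size=3)
x = torch.rand(1, 4, 13)

a = conv(F.pad(x, pad=(2, 2)))                        # explicit zero pad, conv padding=0
b = F.conv1d(x, conv.weight, conv.bias, padding=2)    # same conv with the pad folded in
assert torch.allclose(a, b)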
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.pad_0 = nn.ConstantPad1d(2, 0.0) + self.pad_1 = nn.ReflectionPad1d(4) + self.pad_2 = nn.ReplicationPad1d(3) + + self.conv_0 = nn.Conv1d(in_channels=12, out_channels=14, kernel_size=3) + self.conv_1 = nn.Conv1d(in_channels=14, out_channels=14, kernel_size=1) + self.conv_2 = nn.Conv1d(in_channels=14, out_channels=14, kernel_size=2) + self.conv_3 = nn.Conv1d(in_channels=14, out_channels=12, kernel_size=3, padding=(1,)) + + def forward(self, x): + x = self.pad_0(x) + x = F.pad(x, pad=(1,1)) + x = self.conv_0(x) + + x = self.pad_1(x) + x = self.conv_1(x) + + x = F.pad(x, pad=(3,3), mode='reflect') + x = self.conv_1(x) + + x = self.pad_2(x) + x = self.conv_2(x) + + x = F.pad(x, pad=(1,1), mode='replicate') + x = self.conv_2(x) + + x = F.pad(x, pad=(2,2)) + x = self.conv_3(x) + + return x + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 13) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_pnnx_pnnx_fuse_pad_conv1d.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_pnnx_pnnx_fuse_pad_conv1d.pt inputshape=[1,12,13]") + + # pnnx inference + import test_pnnx_pnnx_fuse_pad_conv1d_pnnx + b = test_pnnx_pnnx_fuse_pad_conv1d_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_pnnx_fuse_pad_conv2d.py b/tools/pnnx/tests/test_pnnx_fuse_pad_conv2d.py new file mode 100644 index 000000000000..23d24100cff8 --- /dev/null +++ b/tools/pnnx/tests/test_pnnx_fuse_pad_conv2d.py @@ -0,0 +1,86 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + self.pad_0 = nn.ConstantPad2d(2, 0.0) + self.pad_1 = nn.ReflectionPad2d(4) + self.pad_2 = nn.ReplicationPad2d(3) + self.pad_3 = nn.ZeroPad2d((1,1,0,0)) + + self.conv_0 = nn.Conv2d(in_channels=12, out_channels=14, kernel_size=3) + self.conv_1 = nn.Conv2d(in_channels=14, out_channels=14, kernel_size=1) + self.conv_2 = nn.Conv2d(in_channels=14, out_channels=14, kernel_size=2) + self.conv_3 = nn.Conv2d(in_channels=14, out_channels=12, kernel_size=3, padding=(1,1)) + + def forward(self, x): + x = self.pad_0(x) + x = F.pad(x, pad=(1,1)) + x = self.conv_0(x) + + x = self.pad_1(x) + x = self.conv_1(x) + + x = F.pad(x, pad=(3,3,2,2), mode='reflect') + x = self.conv_1(x) + + x = self.pad_2(x) + x = self.conv_2(x) + + x = F.pad(x, pad=(1,1,1,1), mode='replicate') + x = self.conv_2(x) + + x = self.pad_3(x) + x = F.pad(x, pad=(2,2,0,0)) + x = self.conv_3(x) + + return x + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 12, 13, 13) + + a = net(x) + + # export torchscript + mod = torch.jit.trace(net, x) + mod.save("test_pnnx_pnnx_fuse_pad_conv2d.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_pnnx_pnnx_fuse_pad_conv2d.pt inputshape=[1,12,13,13]") + + # pnnx inference + import test_pnnx_pnnx_fuse_pad_conv2d_pnnx + b = test_pnnx_pnnx_fuse_pad_conv2d_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_abs.py b/tools/pnnx/tests/test_torch_abs.py new file mode 100644 index 000000000000..9d24e6d3057c --- /dev/null +++ b/tools/pnnx/tests/test_torch_abs.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
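From here on the section adds one small test per elementwise torch math op (abs, acos, acosh, asin, asinh, atan, atanh, ceil, cos, cosh, exp, floor, log, plus a few binary ones such as atan2 and the bitwise shifts). They all follow the same pattern: apply the op to three tensors of different ranks, trace to TorchScript, convert with pnnx using an explicit inputshape, import the generated *_pnnx module and compare with torch.equal; inputs are shifted into the op's valid domain where needed (x - 0.5 for abs/atanh, x + 1 for acosh). A condensed sketch of that shared skeleton, with an illustrative helper name that does not exist in the suite:

import os
import torch

def run_elementwise_test(op, name):
    # build -> trace -> convert with pnnx -> run the generated module -> compare
    class M(torch.nn.Module):
        def forward(self, x, y, z):
            return op(x), op(y), op(z)

    torch.manual_seed(0)
    x = torch.rand(1, 3, 16)
    y = torch.rand(1, 5, 9, 11)
    z = torch.rand(14, 8, 5, 9, 10)
    expected = M()(x, y, z)

    mod = torch.jit.trace(M(), (x, y, z))
    mod.save(f"test_torch_{name}.pt")
    os.system(f"../src/pnnx test_torch_{name}.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]")

    generated = __import__(f"test_torch_{name}_pnnx")
    actual = generated.test_inference()
    return all(torch.equal(a, b) for a, b in zip(expected, actual))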
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.abs(x - 0.5) + y = torch.abs(y - 0.5) + z = torch.abs(z - 0.5) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_abs.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_abs.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_abs_pnnx + b = test_torch_abs_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_acos.py b/tools/pnnx/tests/test_torch_acos.py new file mode 100644 index 000000000000..7380f753d877 --- /dev/null +++ b/tools/pnnx/tests/test_torch_acos.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.acos(x) + y = torch.acos(y) + z = torch.acos(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_acos.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_acos.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_acos_pnnx + b = test_torch_acos_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_acosh.py b/tools/pnnx/tests/test_torch_acosh.py new file mode 100644 index 000000000000..5d572a4ab87a --- /dev/null +++ b/tools/pnnx/tests/test_torch_acosh.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.acosh(x + 1) + y = torch.acosh(y + 1) + z = torch.acosh(z + 1) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_acosh.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_acosh.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_acosh_pnnx + b = test_torch_acosh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_asin.py b/tools/pnnx/tests/test_torch_asin.py new file mode 100644 index 000000000000..2b4f3cb05422 --- /dev/null +++ b/tools/pnnx/tests/test_torch_asin.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.asin(x) + y = torch.asin(y) + z = torch.asin(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_asin.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_asin.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_asin_pnnx + b = test_torch_asin_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_asinh.py b/tools/pnnx/tests/test_torch_asinh.py new file mode 100644 index 000000000000..9f80fbabcf0e --- /dev/null +++ b/tools/pnnx/tests/test_torch_asinh.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. 
+# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.asinh(x) + y = torch.asinh(y) + z = torch.asinh(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_asinh.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_asinh.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_asinh_pnnx + b = test_torch_asinh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_atan.py b/tools/pnnx/tests/test_torch_atan.py new file mode 100644 index 000000000000..8fd797b918c7 --- /dev/null +++ b/tools/pnnx/tests/test_torch_atan.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.atan(x) + y = torch.atan(y) + z = torch.atan(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_atan.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_atan.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_atan_pnnx + b = test_torch_atan_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_atan2.py b/tools/pnnx/tests/test_torch_atan2.py new file mode 100644 index 000000000000..27bf5c5deb33 --- /dev/null +++ b/tools/pnnx/tests/test_torch_atan2.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.atan2(x, y) + out1 = torch.atan2(y, y) + out2 = torch.atan2(z, torch.ones_like(z) + 0.5) + return out0, out1, out2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_atan2.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_atan2.pt inputshape=[3,16],[3,16],[5,9,3]") + + # pnnx inference + import test_torch_atan2_pnnx + b = test_torch_atan2_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_atanh.py b/tools/pnnx/tests/test_torch_atanh.py new file mode 100644 index 000000000000..2cc2ce60d812 --- /dev/null +++ b/tools/pnnx/tests/test_torch_atanh.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.atanh(x - 0.5) + y = torch.atanh(y - 0.5) + z = torch.atanh(z - 0.5) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_atanh.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_atanh.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_atanh_pnnx + b = test_torch_atanh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_bitwise_left_shift.py b/tools/pnnx/tests/test_torch_bitwise_left_shift.py new file mode 100644 index 000000000000..cc60f144b11e --- /dev/null +++ b/tools/pnnx/tests/test_torch_bitwise_left_shift.py @@ -0,0 +1,55 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y): + out = torch.bitwise_left_shift(x, y) + return out + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.randint(10, (3, 16), dtype=torch.int) + y = torch.randint(10, (3, 16), dtype=torch.int) + + a = net(x, y) + + # export torchscript + mod = torch.jit.trace(net, (x, y)) + mod.save("test_torch_bitwise_left_shift.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_bitwise_left_shift.pt inputshape=[3,16]i32,[3,16]i32") + + # pnnx inference + import test_torch_bitwise_left_shift_pnnx + b = test_torch_bitwise_left_shift_pnnx.test_inference() + + return torch.equal(a, b) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_bitwise_right_shift.py b/tools/pnnx/tests/test_torch_bitwise_right_shift.py new file mode 100644 index 000000000000..59d6c9651db0 --- /dev/null +++ b/tools/pnnx/tests/test_torch_bitwise_right_shift.py @@ -0,0 +1,55 @@ +# Tencent is pleased to support the open source community by making ncnn available. 
+# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y): + out = torch.bitwise_right_shift(x, y) + return out + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.randint(10, (3, 16), dtype=torch.int) + y = torch.randint(10, (3, 16), dtype=torch.int) + + a = net(x, y) + + # export torchscript + mod = torch.jit.trace(net, (x, y)) + mod.save("test_torch_bitwise_right_shift.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_bitwise_right_shift.pt inputshape=[3,16]i32,[3,16]i32") + + # pnnx inference + import test_torch_bitwise_right_shift_pnnx + b = test_torch_bitwise_right_shift_pnnx.test_inference() + + return torch.equal(a, b) + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_ceil.py b/tools/pnnx/tests/test_torch_ceil.py new file mode 100644 index 000000000000..bd6379a7b3e9 --- /dev/null +++ b/tools/pnnx/tests/test_torch_ceil.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.ceil(x * 10) + y = torch.ceil(y * 10) + z = torch.ceil(z * 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_ceil.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_ceil.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_ceil_pnnx + b = test_torch_ceil_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_cos.py b/tools/pnnx/tests/test_torch_cos.py new file mode 100644 index 000000000000..9d60eb6613e7 --- /dev/null +++ b/tools/pnnx/tests/test_torch_cos.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.cos(x) + y = torch.cos(y) + z = torch.cos(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_cos.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_cos.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_cos_pnnx + b = test_torch_cos_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_cosh.py b/tools/pnnx/tests/test_torch_cosh.py new file mode 100644 index 000000000000..7190e7f9e46d --- /dev/null +++ b/tools/pnnx/tests/test_torch_cosh.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.cosh(x) + y = torch.cosh(y) + z = torch.cosh(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_cosh.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_cosh.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_cosh_pnnx + b = test_torch_cosh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_exp.py b/tools/pnnx/tests/test_torch_exp.py new file mode 100644 index 000000000000..507d96a22095 --- /dev/null +++ b/tools/pnnx/tests/test_torch_exp.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.exp(x) + y = torch.exp(y) + z = torch.exp(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_exp.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_exp.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_exp_pnnx + b = test_torch_exp_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_floor.py b/tools/pnnx/tests/test_torch_floor.py new file mode 100644 index 000000000000..04b4cb96c220 --- /dev/null +++ b/tools/pnnx/tests/test_torch_floor.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. 
All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.floor(x * 10) + y = torch.floor(y * 10) + z = torch.floor(z * 10) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_floor.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_floor.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_floor_pnnx + b = test_torch_floor_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_log.py b/tools/pnnx/tests/test_torch_log.py new file mode 100644 index 000000000000..f98928d0a0db --- /dev/null +++ b/tools/pnnx/tests/test_torch_log.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.log(x) + y = torch.log(y) + z = torch.log(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_log.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_log.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_log_pnnx + b = test_torch_log_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_neg.py b/tools/pnnx/tests/test_torch_neg.py new file mode 100644 index 000000000000..e3424f2176c5 --- /dev/null +++ b/tools/pnnx/tests/test_torch_neg.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.neg(x - 0.5) + y = torch.neg(y - 0.5) + z = torch.neg(z - 0.5) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_neg.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_neg.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_neg_pnnx + b = test_torch_neg_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_pow.py b/tools/pnnx/tests/test_torch_pow.py new file mode 100644 index 000000000000..85bebce3629e --- /dev/null +++ b/tools/pnnx/tests/test_torch_pow.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + out0 = torch.pow(x, y) + out1 = torch.pow(y, y) + out2 = torch.pow(z, torch.ones_like(z) + 0.5) + return out0, out1, out2 + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(3, 16) + y = torch.rand(3, 16) + z = torch.rand(5, 9, 3) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_pow.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_pow.pt inputshape=[3,16],[3,16],[5,9,3]") + + # pnnx inference + import test_torch_pow_pnnx + b = test_torch_pow_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_reciprocal.py b/tools/pnnx/tests/test_torch_reciprocal.py new file mode 100644 index 000000000000..e508929c2241 --- /dev/null +++ b/tools/pnnx/tests/test_torch_reciprocal.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.reciprocal(x) + y = torch.reciprocal(y) + z = torch.reciprocal(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_reciprocal.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_reciprocal.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_reciprocal_pnnx + b = test_torch_reciprocal_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_rsqrt.py b/tools/pnnx/tests/test_torch_rsqrt.py new file mode 100644 index 000000000000..ec39dae71b5b --- /dev/null +++ b/tools/pnnx/tests/test_torch_rsqrt.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.rsqrt(x) + y = torch.rsqrt(y) + z = torch.rsqrt(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_rsqrt.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_rsqrt.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_rsqrt_pnnx + b = test_torch_rsqrt_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_sign.py b/tools/pnnx/tests/test_torch_sign.py new file mode 100644 index 000000000000..b834b7ea15e1 --- /dev/null +++ b/tools/pnnx/tests/test_torch_sign.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.sign(x - 0.5) + y = torch.sign(y - 0.5) + z = torch.sign(z - 0.5) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_sign.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_sign.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_sign_pnnx + b = test_torch_sign_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_sin.py b/tools/pnnx/tests/test_torch_sin.py new file mode 100644 index 000000000000..b0aa628a3717 --- /dev/null +++ b/tools/pnnx/tests/test_torch_sin.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.sin(x) + y = torch.sin(y) + z = torch.sin(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_sin.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_sin.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_sin_pnnx + b = test_torch_sin_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_sinh.py b/tools/pnnx/tests/test_torch_sinh.py new file mode 100644 index 000000000000..8f49e78652a4 --- /dev/null +++ b/tools/pnnx/tests/test_torch_sinh.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. 
All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.sinh(x) + y = torch.sinh(y) + z = torch.sinh(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_sinh.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_sinh.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_sinh_pnnx + b = test_torch_sinh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_sqrt.py b/tools/pnnx/tests/test_torch_sqrt.py new file mode 100644 index 000000000000..6cb88569ec49 --- /dev/null +++ b/tools/pnnx/tests/test_torch_sqrt.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.sqrt(x) + y = torch.sqrt(y) + z = torch.sqrt(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_sqrt.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_sqrt.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_sqrt_pnnx + b = test_torch_sqrt_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_square.py b/tools/pnnx/tests/test_torch_square.py new file mode 100644 index 000000000000..65069b0e71d9 --- /dev/null +++ b/tools/pnnx/tests/test_torch_square.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.square(x) + y = torch.square(y) + z = torch.square(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_square.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_square.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_square_pnnx + b = test_torch_square_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_tan.py b/tools/pnnx/tests/test_torch_tan.py new file mode 100644 index 000000000000..bd3ca4cf863f --- /dev/null +++ b/tools/pnnx/tests/test_torch_tan.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. 
You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.tan(x) + y = torch.tan(y) + z = torch.tan(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_tan.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_tan.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_tan_pnnx + b = test_torch_tan_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_tanh.py b/tools/pnnx/tests/test_torch_tanh.py new file mode 100644 index 000000000000..9157f2daef01 --- /dev/null +++ b/tools/pnnx/tests/test_torch_tanh.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.tanh(x) + y = torch.tanh(y) + z = torch.tanh(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_tanh.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_tanh.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_tanh_pnnx + b = test_torch_tanh_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) diff --git a/tools/pnnx/tests/test_torch_trunc.py b/tools/pnnx/tests/test_torch_trunc.py new file mode 100644 index 000000000000..95e82dbb8a1f --- /dev/null +++ b/tools/pnnx/tests/test_torch_trunc.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. 
All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.trunc(x * 10 - 5) + y = torch.trunc(y * 10 - 5) + z = torch.trunc(z * 10 - 5) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16) + y = torch.rand(1, 5, 9, 11) + z = torch.rand(14, 8, 5, 9, 10) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_trunc.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_trunc.pt inputshape=[1,3,16],[1,5,9,11],[14,8,5,9,10]") + + # pnnx inference + import test_torch_trunc_pnnx + b = test_torch_trunc_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1)