add vector optimization for loongarch64 #4242

Merged · 22 commits · Nov 11, 2022
15 changes: 13 additions & 2 deletions CMakeLists.txt
@@ -290,6 +290,19 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips)")
else()
message(WARNING "The compiler does not support loongson mmi extension. NCNN_MMI will be OFF.")
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64|loongarch32)")
set(NCNN_TARGET_ARCH loongarch)

include(CheckCXXCompilerFlag)

check_cxx_compiler_flag("-mlsx" NCNN_COMPILER_SUPPORT_LOONGARCH_LSX)

if(NCNN_COMPILER_SUPPORT_LOONGARCH_LSX)
option(NCNN_LSX "optimize loongarch platform with lsx extension" ON)
else()
message(WARNING "The compiler does not support lsx extension. NCNN_LSX will be OFF.")
endif()

elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
set(NCNN_TARGET_ARCH riscv)

@@ -332,8 +345,6 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
set(NCNN_TARGET_ARCH powerpc)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch)")
set(NCNN_TARGET_ARCH mips)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(xtensa)")
set(NCNN_TARGET_ARCH xtensa)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x)")
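Note: the new check_cxx_compiler_flag("-mlsx" ...) probe gates NCNN_LSX on toolchain support. As a minimal sketch (not part of the PR) of what the flag enables: the translation unit below only compiles when -mlsx is in effect, since the compiler then predefines __loongarch_sx and provides <lsxintrin.h> — the same macro and header the layer sources in this PR key off.

// sketch: probe translation unit, compiles only with -mlsx in effect
#if !defined(__loongarch_sx)
#error "-mlsx not in effect"
#endif
#include <lsxintrin.h>

int main()
{
    float buf[4] = {-1.f, 2.f, -3.f, 4.f};
    __m128i v = __lsx_vld(buf, 0);  // 128-bit LSX vector load
    v = __lsx_vbitclri_w(v, 31);    // clear bit 31 of each 32-bit lane
    __lsx_vst(v, buf, 0);           // store back
    return 0;
}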
6 changes: 6 additions & 0 deletions cmake/ncnn_add_layer.cmake
@@ -270,6 +270,12 @@ macro(ncnn_add_layer class)
endif()
endif()

if(NCNN_RUNTIME_CPU AND NCNN_TARGET_ARCH STREQUAL "loongarch")
if(NCNN_LSX)
ncnn_add_arch_opt_layer(${class} lsx "-mlsx")
endif()
endif()

if(NCNN_RUNTIME_CPU AND NCNN_RVV AND NCNN_TARGET_ARCH STREQUAL "riscv")
if(NCNN_COMPILER_SUPPORT_RVV_ZFH)
ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh")
14 changes: 14 additions & 0 deletions cmake/ncnn_generate_lsx_source.cmake
@@ -0,0 +1,14 @@

# must define SRC DST CLASS

file(READ ${SRC} source_data)

# replace
string(TOUPPER ${CLASS} CLASS_UPPER)
string(TOLOWER ${CLASS} CLASS_LOWER)

string(REGEX REPLACE "LAYER_${CLASS_UPPER}_LOONGARCH_H" "LAYER_${CLASS_UPPER}_LOONGARCH_LSX_H" source_data "${source_data}")
string(REGEX REPLACE "${CLASS}_loongarch" "${CLASS}_loongarch_lsx" source_data "${source_data}")
string(REGEX REPLACE "#include \"${CLASS_LOWER}_loongarch.h\"" "#include \"${CLASS_LOWER}_loongarch_lsx.h\"" source_data "${source_data}")

file(WRITE ${DST} "${source_data}")
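Note: ncnn_add_arch_opt_layer runs this script over each base loongarch source to produce the per-layer _lsx variant. Applying the three replacements above to the absval_loongarch.h header added in this PR would yield roughly the following — a sketch of the generator's output, not a file checked into the PR:

#ifndef LAYER_ABSVAL_LOONGARCH_LSX_H
#define LAYER_ABSVAL_LOONGARCH_LSX_H

#include "absval.h"

namespace ncnn {

// identical body to AbsVal_loongarch, recompiled with -mlsx so the
// __loongarch_sx branches are taken
class AbsVal_loongarch_lsx : virtual public AbsVal
{
public:
    AbsVal_loongarch_lsx();

    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_ABSVAL_LOONGARCH_LSX_H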
6 changes: 6 additions & 0 deletions src/CMakeLists.txt
@@ -457,6 +457,12 @@ if(NCNN_TARGET_ARCH STREQUAL "mips")
endif()
endif()

if(NCNN_TARGET_ARCH STREQUAL "loongarch")
if(NOT NCNN_RUNTIME_CPU AND NCNN_LSX)
target_compile_options(ncnn PRIVATE -mlsx)
endif()
endif()

if(NCNN_TARGET_ARCH STREQUAL "riscv" AND NOT C906)
if(NOT NCNN_RUNTIME_CPU AND NCNN_RVV)
if(NCNN_COMPILER_SUPPORT_RVV_ZFH)
34 changes: 33 additions & 1 deletion src/cpu.cpp
@@ -159,7 +159,7 @@ static unsigned int get_elf_hwcap_from_proc_self_auxv(unsigned int type)
return 0;
}

#if __aarch64__ || __mips64 || __riscv_xlen == 64
#if __aarch64__ || __mips64 || __riscv_xlen == 64 || __loongarch64
struct
{
uint64_t tag;
@@ -236,6 +236,12 @@ static unsigned int g_hwcaps2 = get_elf_hwcap(AT_HWCAP2);
#define HWCAP_LOONGSON_MMI (1 << 11)
#endif

#if __loongarch64
// from arch/loongarch/include/uapi/asm/hwcap.h
#define HWCAP_LOONGARCH_LSX (1 << 4)
#define HWCAP_LOONGARCH_LASX (1 << 5)
#endif

#if __riscv
// from arch/riscv/include/uapi/asm/hwcap.h
#define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A'))
@@ -1001,6 +1007,32 @@ int cpu_support_mips_msa()
#endif
}

int cpu_support_loongarch_lsx()
{
#if defined __ANDROID__ || defined __linux__
#if __loongarch64
return g_hwcaps & HWCAP_LOONGARCH_LSX;
#else
return 0;
#endif
#else
return 0;
#endif
}

int cpu_support_loongarch_lasx()
{
#if defined __ANDROID__ || defined __linux__
#if __loongarch64
return g_hwcaps & HWCAP_LOONGARCH_LASX;
#else
return 0;
#endif
#else
return 0;
#endif
}

int cpu_support_loongson_mmi()
{
#if defined __ANDROID__ || defined __linux__
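Note: the two new probes test HWCAP bits that the kernel exports through the ELF auxiliary vector. A standalone sketch of the same detection, using glibc's getauxval(3) directly instead of ncnn's /proc/self/auxv fallback; bit values as defined above from arch/loongarch/include/uapi/asm/hwcap.h:

#include <cstdio>
#include <sys/auxv.h>

#define HWCAP_LOONGARCH_LSX  (1 << 4)
#define HWCAP_LOONGARCH_LASX (1 << 5)

int main()
{
    unsigned long hwcap = getauxval(AT_HWCAP); // kernel-reported CPU features
    std::printf("LSX:  %s\n", (hwcap & HWCAP_LOONGARCH_LSX) ? "yes" : "no");
    std::printf("LASX: %s\n", (hwcap & HWCAP_LOONGARCH_LASX) ? "yes" : "no");
    return 0;
}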
5 changes: 5 additions & 0 deletions src/cpu.h
@@ -93,6 +93,11 @@ NCNN_EXPORT int cpu_support_x86_avx512_bf16();
// avx512_fp16 = x86 avx512 fp16
NCNN_EXPORT int cpu_support_x86_avx512_fp16();

// lsx = loongarch lsx
NCNN_EXPORT int cpu_support_loongarch_lsx();
// lasx = loongarch lasx
NCNN_EXPORT int cpu_support_loongarch_lasx();

// msa = mips msa
NCNN_EXPORT int cpu_support_mips_msa();
// mmi = loongson mmi
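Note: both exports follow the existing capability-probe convention (nonzero when supported), so callers can use them directly. A minimal usage sketch, assuming the in-tree header name shown above:

#include "cpu.h"
#include <cstdio>

int main()
{
    std::printf("lsx=%d lasx=%d\n",
                ncnn::cpu_support_loongarch_lsx(),
                ncnn::cpu_support_loongarch_lasx());
    return 0;
}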
7 changes: 7 additions & 0 deletions src/layer.cpp
@@ -253,6 +253,13 @@ Layer* create_layer(int index)
}
else
#endif // NCNN_RUNTIME_CPU && NCNN_AVX
#if NCNN_RUNTIME_CPU && NCNN_LSX
if (ncnn::cpu_support_loongarch_lsx())
{
layer_creator = layer_registry_lsx[index].creator;
}
else
#endif // NCNN_RUNTIME_CPU && NCNN_LSX
#if NCNN_RUNTIME_CPU && NCNN_MSA
if (ncnn::cpu_support_mips_msa())
{
67 changes: 67 additions & 0 deletions src/layer/loongarch/absval_loongarch.cpp
@@ -0,0 +1,67 @@
// yala is pleased to support the open source community by making ncnn available.
//
//
// Copyright (C) 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>. All rights reserved.
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "absval_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

namespace ncnn {

AbsVal_loongarch::AbsVal_loongarch()
{
#if __loongarch_sx
support_packing = true;
#endif
}

int AbsVal_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int d = bottom_top_blob.d;
int channels = bottom_top_blob.c;
int elempack = bottom_top_blob.elempack;
int size = w * h * d * elempack;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

int i = 0;
#if __loongarch_sx
for (; i + 3 < size; i += 4)
{
__builtin_prefetch(ptr + 16);
__m128i _p = __lsx_vld(ptr, 0);
__m128i _outp = __lsx_vbitclri_w(_p, 31);
__lsx_vst(_outp, ptr, 0);

ptr += 4;
}
#endif // __loongarch_sx
for (; i < size; i++)
{
*ptr = *ptr > 0 ? *ptr : -*ptr;

ptr++;
}
}

return 0;
}

} // namespace ncnn
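Note on the vector body: __lsx_vbitclri_w(_p, 31) clears bit 31 of every 32-bit lane, and for IEEE-754 singles bit 31 is the sign bit, so the loop computes |x| four lanes at a time with no branch. A scalar sketch of the same trick (illustration only, not PR code):

#include <cstdint>
#include <cstring>

static inline float abs_by_bitclear(float x)
{
    std::uint32_t u;
    std::memcpy(&u, &x, sizeof u); // safe type-pun float -> bits
    u &= 0x7fffffffu;              // clear the sign bit (bit 31)
    std::memcpy(&x, &u, sizeof x);
    return x;
}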
32 changes: 32 additions & 0 deletions src/layer/loongarch/absval_loongarch.h
@@ -0,0 +1,32 @@
// yala is pleased to support the open source community by making ncnn available.
//
//
// Copyright (C) 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>. All rights reserved.
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_ABSVAL_LOONGARCH_H
#define LAYER_ABSVAL_LOONGARCH_H

#include "absval.h"

namespace ncnn {

class AbsVal_loongarch : virtual public AbsVal
{
public:
AbsVal_loongarch();

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_ABSVAL_LOONGARCH_H
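Note: the base is inherited as `virtual public AbsVal`, presumably following ncnn's usual pattern so that several specializations of one layer can later be combined while sharing a single base subobject. A sketch of the diamond this permits (generic names, hypothetical, not part of the PR):

class Base { };
class SpecA : virtual public Base { };
class SpecB : virtual public Base { };
class Combined : public SpecA, public SpecB { }; // one shared Base subobject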
145 changes: 145 additions & 0 deletions src/layer/loongarch/batchnorm_loongarch.cpp
@@ -0,0 +1,145 @@
// yala is pleased to support the open source community by making ncnn available.
//
//
// Copyright (C) 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>. All rights reserved.
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "batchnorm_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

BatchNorm_loongarch::BatchNorm_loongarch()
{
#if __loongarch_sx
support_packing = true;
#endif // __loongarch_sx
}

int BatchNorm_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int dims = bottom_top_blob.dims;
int elempack = bottom_top_blob.elempack;

if (dims == 1)
{
int w = bottom_top_blob.w * elempack;

#if __loongarch_sx
int nn_w = w / 4;
int remain_w_start = nn_w * 4;
#else
int remain_w_start = 0;
#endif // __loongarch_sx

float* ptr = bottom_top_blob;

#if __loongarch_sx
#pragma omp parallel for num_threads(opt.num_threads)
for (int i = 0; i < nn_w; i++)
{
float* ptr0 = ptr + i * 4;

__m128 _p = (__m128)__lsx_vld(ptr0, 0);
__m128 _a = (__m128)__lsx_vld((const float*)a_data + i * 4, 0);
__m128 _b = (__m128)__lsx_vld((const float*)b_data + i * 4, 0);
_p = __lsx_vfmadd_s(_b, _p, _a);
__lsx_vst(_p, ptr0, 0);
}
#endif // __loongarch_sx

#pragma omp parallel for num_threads(opt.num_threads)
for (int i = remain_w_start; i < w; i++)
{
ptr[i] = b_data[i] * ptr[i] + a_data[i];
}
}

if (dims == 2)
{
int w = bottom_top_blob.w * elempack;
int h = bottom_top_blob.h;

#pragma omp parallel for num_threads(opt.num_threads)
for (int i = 0; i < h; i++)
{
float* ptr = bottom_top_blob.row(i);
float a = a_data[i];
float b = b_data[i];

int j = 0;
#if __loongarch_sx
__m128 _a = elempack == 4 ? (__m128)__lsx_vld((const float*)a_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(a);
__m128 _b = elempack == 4 ? (__m128)__lsx_vld((const float*)b_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(b);
for (; j + 3 < w; j += 4)
{
__builtin_prefetch(ptr + 16);
__m128 _p = (__m128)__lsx_vld(ptr, 0);
_p = __lsx_vfmadd_s(_b, _p, _a);
__lsx_vst(_p, ptr, 0);

ptr += 4;
}
#endif // __loongarch_sx
for (; j < w; j++)
{
*ptr = b * *ptr + a;
ptr++;
}
}
}

if (dims == 3 || dims == 4)
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int d = bottom_top_blob.d;
int c = bottom_top_blob.c;
int size = w * h * d * elempack;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < c; q++)
{
float* ptr = bottom_top_blob.channel(q);
float a = a_data[q];
float b = b_data[q];

int i = 0;
#if __loongarch_sx
__m128 _a = elempack == 4 ? (__m128)__lsx_vld((const float*)a_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(a);
__m128 _b = elempack == 4 ? (__m128)__lsx_vld((const float*)b_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(b);
for (; i + 3 < size; i += 4)
{
__builtin_prefetch(ptr + 16);
__m128 _p = (__m128)__lsx_vld(ptr, 0);
_p = __lsx_vfmadd_s(_b, _p, _a);
__lsx_vst(_p, ptr, 0);

ptr += 4;
}
#endif // __loongarch_sx
for (; i < size; i++)
{
*ptr = b * *ptr + a;
ptr++;
}
}
}

return 0;
}

} // namespace ncnn
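Note: once a_data/b_data are precomputed, batch normalization collapses to an affine transform, and __lsx_vfmadd_s(_b, _p, _a) evaluates b * p + a per lane in one fused multiply-add, matching the scalar tail `*ptr = b * *ptr + a`. A scalar reference sketch (illustration only):

#include <cmath>

// y[i] = b * x[i] + a, the transform the LSX loop applies four lanes at a time
static inline void batchnorm_affine(float* ptr, int size, float a, float b)
{
    for (int i = 0; i < size; i++)
        ptr[i] = std::fma(b, ptr[i], a); // fused multiply-add, like vfmadd.s
}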