From 4cb1db0e3bd5a48433a9193b19994d539250ebdc Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Tue, 9 Jun 2020 06:25:45 +0000 Subject: [PATCH 001/349] Test flang build --- .github/workflows/dynamic_arch.yml | 32 +++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index b6a4090bd2..ca53e8857e 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -9,6 +9,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, macos-latest] + fortran: [gfortran, flang] build: [cmake, make] steps: - name: Checkout repository @@ -24,7 +25,7 @@ jobs: # Restore any ccache cache entry, if none for # ${{ runner.os }}-ccache-${{ github.sha }} exists restore-keys: | - ${{ runner.os }}-ccache + ${{ runner.os }}-ccache- - name: Print system information run: | @@ -49,8 +50,8 @@ jobs: fi ccache -M 300M # Limit the ccache size; Github's overall cache limit is 5GB - - name: Build - if: matrix.build == 'make' + - name: gfortran build + if: matrix.build == 'make' && matrix.fortran == 'gfortran' run: | if [ "$RUNNER_OS" == "Linux" ]; then export PATH="/usr/lib/ccache:${PATH}" @@ -63,8 +64,29 @@ jobs: make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 - - name: CMake build - if: matrix.build == 'cmake' + - name: flang build + if: matrix.build == 'make' && matrix.fortran == 'flang' + run: | + if [ "$RUNNER_OS" == "Linux" ]; then + export PATH="/usr/lib/ccache:${PATH}" + elif [ "$RUNNER_OS" == "macOS" ]; then + exit 0 + else + echo "$RUNNER_OS not supported" + exit 1 + fi + + cd /usr/ + sudo wget -nv https://github.com/flang-compiler/flang/releases/download/flang_20190329/flang-20190329-x86-70.tgz + sudo tar xf flang-20190329-x86-70.tgz + sudo rm flang-20190329-x86-70.tgz + cd - + + make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC=flang + + + - name: CMake gfortran build + if: matrix.build == 'cmake' && matrix.fortran == 'gfortran' run: | if [ "$RUNNER_OS" == 
"Linux" ]; then export PATH="/usr/lib/ccache:${PATH}" From b98923f33a58c6d78d49e0a22bb6203df5c3f713 Mon Sep 17 00:00:00 2001 From: Leonard Lausen Date: Tue, 9 Jun 2020 06:54:42 +0000 Subject: [PATCH 002/349] Test enforce -O1 for flang --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 252c816a9e..a343a98292 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1276,7 +1276,7 @@ endif override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) -ifeq ($(FLANG_VENDOR),AOCC) +ifeq ($(F_COMPILER), FLANG) override FFLAGS += $(filter-out -O2 -O3,$(COMMON_OPT)) -O1 $(FCOMMON_OPT) else override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) From 9fe930f205c3ad56fe92d9c4c65e48836db33a27 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 11 Jun 2020 15:47:20 -0500 Subject: [PATCH 003/349] powerpc: Add support for future processor This is the initial patch to support build infrastructure for POWER10 architecture. 
--- Makefile.power | 10 ++ Makefile.system | 7 ++ TargetList.txt | 1 + cmake/arch.cmake | 2 +- cmake/prebuild.cmake | 2 +- common.h | 7 +- common_power.h | 8 +- cpuid_power.c | 15 ++- driver/others/dynamic_power.c | 16 ++- getarch.c | 13 +++ kernel/CMakeLists.txt | 2 +- kernel/Makefile.L3 | 4 + kernel/power/KERNEL.POWER10 | 214 ++++++++++++++++++++++++++++++++++ kernel/power/casum.c | 2 +- kernel/power/ccopy.c | 2 +- kernel/power/crot.c | 2 +- kernel/power/cswap.c | 2 +- kernel/power/dasum.c | 2 +- kernel/power/daxpy.c | 2 +- kernel/power/dcopy.c | 2 +- kernel/power/ddot.c | 2 +- kernel/power/dgemv_n.c | 2 +- kernel/power/drot.c | 2 +- kernel/power/dscal.c | 2 +- kernel/power/dswap.c | 2 +- kernel/power/sasum.c | 2 +- kernel/power/scopy.c | 2 +- kernel/power/sdot.c | 2 +- kernel/power/srot.c | 2 +- kernel/power/sscal.c | 2 +- kernel/power/sswap.c | 2 +- kernel/power/zasum.c | 2 +- kernel/power/zaxpy.c | 2 +- kernel/power/zcopy.c | 2 +- kernel/power/zdot.c | 2 +- kernel/power/zscal.c | 2 +- kernel/power/zswap.c | 2 +- param.h | 2 +- 38 files changed, 309 insertions(+), 42 deletions(-) create mode 100644 kernel/power/KERNEL.POWER10 diff --git a/Makefile.power b/Makefile.power index 24d8aa8a7e..5c431860f6 100644 --- a/Makefile.power +++ b/Makefile.power @@ -9,6 +9,16 @@ else USE_OPENMP = 1 endif +ifeq ($(CORE), POWER10) +ifeq ($(USE_OPENMP), 1) +COMMON_OPT += -Ofast -mcpu=future -mtune=future -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=future -mtune=future -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +else +COMMON_OPT += -Ofast -mcpu=future -mtune=future -mvsx -malign-power -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=future -mtune=future -malign-power -fno-fast-math +endif +endif + ifeq ($(CORE), POWER9) ifeq ($(USE_OPENMP), 1) COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp diff --git a/Makefile.system b/Makefile.system index 
56e94f2a62..3decc1457e 100644 --- a/Makefile.system +++ b/Makefile.system @@ -595,6 +595,7 @@ DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 +DYNAMIC_CORE += POWER10 endif ifeq ($(C_COMPILER), GCC) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) @@ -603,6 +604,12 @@ DYNAMIC_CORE += POWER9 else $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif +GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) +ifeq ($(GCCVERSIONGTEQ11), 1) +DYNAMIC_CORE += POWER10 +else +$(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) +endif endif endif diff --git a/TargetList.txt b/TargetList.txt index e2d2f40263..4e54e30773 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -49,6 +49,7 @@ POWER6 POWER7 POWER8 POWER9 +POWER10 PPCG4 PPC970 PPC970MP diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 9d51f777cd..d56ba99cb6 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -49,7 +49,7 @@ if (DYNAMIC_ARCH) endif () if (POWER) - set(DYNAMIC_CORE POWER6 POWER8 POWER9) + set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10) endif () if (X86) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 067b97b4bc..30256870ca 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -420,7 +420,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 8) set(ZGEMM_UNROLL_N 2) set(SYMV_P 8) - elseif ("${TCORE}" STREQUAL "POWER9") + elseif ("${TCORE}" STREQUAL "POWER9" OR "${TCORE}" STREQUAL "POWER10") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE 32768\n" "#define L1_DATA_LINESIZE 128\n" diff --git a/common.h b/common.h index e2c8cdee53..00b34a3f76 100644 --- a/common.h +++ b/common.h @@ -360,13 +360,8 @@ typedef int blasint; #endif #endif -#ifdef POWER8 -#ifndef YIELDING -#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); -#endif -#endif -#ifdef POWER9 +#if 
defined(POWER8) || defined(POWER9) || defined(POWER10) #ifndef YIELDING #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); #endif diff --git a/common_power.h b/common_power.h index e29d0f382a..aa19794b50 100644 --- a/common_power.h +++ b/common_power.h @@ -68,7 +68,7 @@ #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #define MB __asm__ __volatile__ ("eieio":::"memory") #define WMB __asm__ __volatile__ ("eieio":::"memory") #define RMB __asm__ __volatile__ ("eieio":::"memory") @@ -272,7 +272,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(POWER10) || defined(PPC970) #define DCBT_ARG 0 #else #define DCBT_ARG 8 @@ -294,7 +294,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define L1_PREFETCH dcbtst #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #define L1_DUALFETCH #define L1_PREFETCHSIZE (16 + 128 * 100) #define L1_PREFETCH dcbtst @@ -843,7 +843,7 @@ Lmcount$lazy_ptr: #define BUFFER_SIZE ( 2 << 20) #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) -#elif defined(POWER8) || defined(POWER9) +#elif defined(POWER8) || defined(POWER9) || defined(POWER10) #define BUFFER_SIZE ( 64 << 20) #else #define BUFFER_SIZE ( 16 << 20) diff --git a/cpuid_power.c b/cpuid_power.c index d5ba6fb2ce..b36aa4945c 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -57,6 +57,7 @@ #define CPUTYPE_PPCG4 7 #define CPUTYPE_POWER8 8 #define CPUTYPE_POWER9 9 +#define CPUTYPE_POWER10 10 char *cpuname[] = { "UNKNOWN", @@ -68,7 +69,8 @@ char *cpuname[] = { "CELL", "PPCG4", "POWER8", - "POWER9" + "POWER9", + "POWER10" }; char *lowercpuname[] = { 
@@ -81,7 +83,8 @@ char *lowercpuname[] = { "cell", "ppcg4", "power8", - "power9" + "power9", + "power10" }; char *corename[] = { @@ -94,7 +97,8 @@ char *corename[] = { "CELL", "PPCG4", "POWER8", - "POWER9" + "POWER9", + "POWER10" }; int detect(void){ @@ -125,6 +129,7 @@ int detect(void){ if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; + if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; @@ -157,6 +162,7 @@ int detect(void){ if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; + if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10; if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; return CPUTYPE_POWER5; @@ -179,6 +185,9 @@ int detect(void){ int id; __asm __volatile("mfpvr %0" : "=r"(id)); switch ( id >> 16 ) { + case 0x80: // POWER10 + return CPUTYPE_POWER10; + break; case 0x4e: // POWER9 return CPUTYPE_POWER9; break; diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 8c831b9982..811a5fae33 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -6,6 +6,9 @@ extern gotoblas_t gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) extern gotoblas_t gotoblas_POWER9; #endif +#if (!defined __GNUC__) || ( __GNUC__ >= 11) +extern gotoblas_t gotoblas_POWER10; +#endif extern void openblas_warning(int verbose, const char *msg); @@ -13,7 +16,8 @@ static char *corename[] = { "unknown", "POWER6", "POWER8", - "POWER9" + "POWER9", + "POWER10" }; #define NUM_CORETYPES 4 @@ -23,6 +27,9 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_POWER8) return corename[2]; #if (!defined __GNUC__) || ( __GNUC__ 
>= 6) if (gotoblas == &gotoblas_POWER9) return corename[3]; +#endif +#if (!defined __GNUC__) || ( __GNUC__ >= 11) + if (gotoblas == &gotoblas_POWER10) return corename[4]; #endif return corename[0]; } @@ -36,6 +43,10 @@ static gotoblas_t *get_coretype(void) { #if (!defined __GNUC__) || ( __GNUC__ >= 6) if (__builtin_cpu_is("power9")) return &gotoblas_POWER9; +#endif +#if (!defined __GNUC__) || ( __GNUC__ >= 11) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) /* ISA 3.1 is a feature name for __builtin_cpu_supports(), not a CPU name for __builtin_cpu_is() */ + return &gotoblas_POWER10; #endif return NULL; } @@ -61,6 +72,9 @@ static gotoblas_t *force_coretype(char * coretype) { case 2: return (&gotoblas_POWER8); #if (!defined __GNUC__) || ( __GNUC__ >= 6) case 3: return (&gotoblas_POWER9); +#endif +#if (!defined __GNUC__) || ( __GNUC__ >= 11) + case 4: return (&gotoblas_POWER10); #endif default: return NULL; } diff --git a/getarch.c b/getarch.c index c173d58b85..164947f3e1 100644 --- a/getarch.c +++ b/getarch.c @@ -650,6 +650,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "POWER9" #endif +#if defined(FORCE_POWER10) +#define FORCE +#define ARCHITECTURE "POWER" +#define SUBARCHITECTURE "POWER10" +#define SUBDIRNAME "power" +#define ARCHCONFIG "-DPOWER10 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ + "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " +#define LIBNAME "power10" +#define CORENAME "POWER10" +#endif + #ifdef FORCE_PPCG4 #define FORCE #define ARCHITECTURE "POWER" diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index b114c6a337..d1349c5f83 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -130,7 +130,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) ) set(USE_TRMM true) endif () - if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9)) + if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) set(USE_TRMM true) endif () diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index da6c5fd57c..0cb02ef855 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -51,6 +51,10 @@ ifeq ($(CORE), POWER9) USE_TRMM = 1 endif +ifeq ($(CORE), POWER10) +USE_TRMM = 1 +endif + ifeq ($(ARCH), zarch) USE_TRMM = 1 endif diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 new file mode 100644 index 0000000000..ab8fbfcd93 --- /dev/null +++ b/kernel/power/KERNEL.POWER10 @@ -0,0 +1,214 @@ +ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +include $(KERNELDIR)/KERNEL.POWER8 +else + +#SGEMM_BETA = ../generic/gemm_beta.c +#DGEMM_BETA = ../generic/gemm_beta.c +#CGEMM_BETA = ../generic/zgemm_beta.c +#ZGEMM_BETA = ../generic/zgemm_beta.c + +STRMMKERNEL = sgemm_kernel_power9.S +DTRMMKERNEL = dgemm_kernel_power9.S +CTRMMKERNEL = cgemm_kernel_power9.S 
+ZTRMMKERNEL = zgemm_kernel_power9.S + +SGEMMKERNEL = sgemm_kernel_power9.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = sgemm_tcopy_16_power8.S +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = sgemm_tcopy_8_power8.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_power9.S +DGEMMINCOPY = ../generic/gemm_ncopy_16.c +DGEMMITCOPY = dgemm_tcopy_16_power8.S +DGEMMONCOPY = dgemm_ncopy_4_power8.S +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_power9.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_power9.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c +ZGEMMITCOPY = zgemm_tcopy_8_power8.S +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c 
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. +#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S +#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S + +#Pure C for other kernels +#SAMAXKERNEL = ../arm/amax.c +#DAMAXKERNEL = ../arm/amax.c +#CAMAXKERNEL = ../arm/zamax.c +#ZAMAXKERNEL = ../arm/zamax.c +# +#SAMINKERNEL = ../arm/amin.c +#DAMINKERNEL = ../arm/amin.c +#CAMINKERNEL = ../arm/zamin.c +#ZAMINKERNEL = ../arm/zamin.c +# +#SMAXKERNEL = ../arm/max.c +#DMAXKERNEL = ../arm/max.c +# +#SMINKERNEL = ../arm/min.c +#DMINKERNEL = ../arm/min.c +# +ifneq ($(GCCVERSIONGTEQ9),1) +ISAMAXKERNEL = isamax_power9.S +else +ISAMAXKERNEL = isamax.c +endif +IDAMAXKERNEL = idamax.c +ifneq ($(GCCVERSIONGTEQ9),1) +ICAMAXKERNEL = icamax_power9.S +else +ICAMAXKERNEL = icamax.c +endif +IZAMAXKERNEL = izamax.c +# +ifneq ($(GCCVERSIONGTEQ9),1) +ISAMINKERNEL = isamin_power9.S +else +ISAMINKERNEL = isamin.c +endif +IDAMINKERNEL = idamin.c +ifneq ($(GCCVERSIONGTEQ9),1) +ICAMINKERNEL = icamin_power9.S +else +ICAMINKERNEL = icamin.c +endif +IZAMINKERNEL = izamin.c +# +#ISMAXKERNEL = ../arm/imax.c +#IDMAXKERNEL = ../arm/imax.c +# +#ISMINKERNEL = ../arm/imin.c +#IDMINKERNEL = ../arm/imin.c +# +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c +# +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +ifneq ($(GCCVERSIONGTEQ9),1) +CAXPYKERNEL = caxpy_power9.S +else +CAXPYKERNEL = caxpy.c +endif +ZAXPYKERNEL = zaxpy.c +# +SCOPYKERNEL = scopy.c +DCOPYKERNEL = dcopy.c +CCOPYKERNEL = ccopy.c +ZCOPYKERNEL = zcopy.c +# +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +DSDOTKERNEL = 
sdot.c +ifneq ($(GCCVERSIONGTEQ9),1) +CDOTKERNEL = cdot_power9.S +else +CDOTKERNEL = cdot.c +endif +ZDOTKERNEL = zdot.c +# +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c +# +SROTKERNEL = srot.c +DROTKERNEL = drot.c +CROTKERNEL = crot.c +ZROTKERNEL = zrot.c +# +SSCALKERNEL = sscal.c +DSCALKERNEL = dscal.c +CSCALKERNEL = zscal.c +ZSCALKERNEL = zscal.c +# +SSWAPKERNEL = sswap.c +DSWAPKERNEL = dswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c +# + +SGEMVNKERNEL = sgemv_n.c +DGEMVNKERNEL = dgemv_n.c +CGEMVNKERNEL = cgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c +# +SGEMVTKERNEL = sgemv_t.c +DGEMVTKERNEL = dgemv_t.c +CGEMVTKERNEL = cgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c + + +#SSYMV_U_KERNEL = ../generic/symv_k.c +#SSYMV_L_KERNEL = ../generic/symv_k.c +#DSYMV_U_KERNEL = ../generic/symv_k.c +#DSYMV_L_KERNEL = ../generic/symv_k.c +#QSYMV_U_KERNEL = ../generic/symv_k.c +#QSYMV_L_KERNEL = ../generic/symv_k.c +#CSYMV_U_KERNEL = ../generic/zsymv_k.c +#CSYMV_L_KERNEL = ../generic/zsymv_k.c +#ZSYMV_U_KERNEL = ../generic/zsymv_k.c +#ZSYMV_L_KERNEL = ../generic/zsymv_k.c +#XSYMV_U_KERNEL = ../generic/zsymv_k.c +#XSYMV_L_KERNEL = ../generic/zsymv_k.c + +#ZHEMV_U_KERNEL = ../generic/zhemv_k.c +#ZHEMV_L_KERNEL = ../generic/zhemv_k.c + +LSAME_KERNEL = ../generic/lsame.c +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +#Dump kernel +CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c +ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c + +endif diff --git a/kernel/power/casum.c b/kernel/power/casum.c index a9ece07685..3478a39ef3 100644 --- a/kernel/power/casum.c +++ b/kernel/power/casum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "casum_microk_power8.c" #endif diff --git a/kernel/power/ccopy.c b/kernel/power/ccopy.c index 50df84cc50..cbe5b48d2f 100644 --- a/kernel/power/ccopy.c +++ b/kernel/power/ccopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "ccopy_microk_power8.c" #endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 2a5835546a..5c1d44620d 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) static void crot_kernel_8 (long n, float *x, float *y, float c, float s) { diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index 31e02fe5a4..88cb1d6387 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "cswap_microk_power8.c" #endif diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index d0e060977c..09e06d9091 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "dasum_microk_power8.c" #endif diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c index f09611ff09..018beafd17 100644 --- a/kernel/power/daxpy.c +++ b/kernel/power/daxpy.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "daxpy_microk_power8.c" #endif diff --git a/kernel/power/dcopy.c b/kernel/power/dcopy.c index 27b39144ba..cf203e71e3 100644 --- a/kernel/power/dcopy.c +++ b/kernel/power/dcopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "dcopy_microk_power8.c" #endif diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c index f985df1c5a..bd9e1fb97d 100644 --- a/kernel/power/ddot.c +++ b/kernel/power/ddot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "ddot_microk_power8.c" #endif diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c index 1a3d7669c6..b4dfda5509 100644 --- a/kernel/power/dgemv_n.c +++ b/kernel/power/dgemv_n.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "dgemv_n_microk_power8.c" #endif diff --git a/kernel/power/drot.c b/kernel/power/drot.c index baeb542051..b808ab5665 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "drot_microk_power8.c" #endif diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 779a08e9ce..7e0fe48c0f 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "dscal_microk_power8.c" #endif diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index 52b7f50dad..795bb10b48 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "dswap_microk_power8.c" #endif diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index 5908347d3d..b259d7d76f 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "sasum_microk_power8.c" #endif diff --git a/kernel/power/scopy.c b/kernel/power/scopy.c index 5e3fe45a57..5207d386e4 100644 --- a/kernel/power/scopy.c +++ b/kernel/power/scopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "scopy_microk_power8.c" #endif diff --git a/kernel/power/sdot.c b/kernel/power/sdot.c index ae527dde9d..8de434e418 100644 --- a/kernel/power/sdot.c +++ b/kernel/power/sdot.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "sdot_microk_power8.c" #endif diff --git a/kernel/power/srot.c b/kernel/power/srot.c index 6af813c161..9638a59eb8 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "srot_microk_power8.c" #endif diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index 4f3ba56980..ddd5b2c5b3 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "sscal_microk_power8.c" #endif diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 23d13280fb..a564344448 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "sswap_microk_power8.c" #endif diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c index f61c62e75b..8383e39ab9 100644 --- a/kernel/power/zasum.c +++ b/kernel/power/zasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "zasum_microk_power8.c" #endif diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c index f0f8c69108..4a7c26c694 100644 --- a/kernel/power/zaxpy.c +++ b/kernel/power/zaxpy.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "zaxpy_microk_power8.c" #endif diff --git a/kernel/power/zcopy.c b/kernel/power/zcopy.c index b21d6ef15c..bb80decd27 100644 --- a/kernel/power/zcopy.c +++ b/kernel/power/zcopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "zcopy_microk_power8.c" #endif diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c index fd36c7f448..9086ef35bc 100644 --- a/kernel/power/zdot.c +++ b/kernel/power/zdot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "zdot_microk_power8.c" #endif diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index a1b441d2c8..16b584bca0 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 1d8826f414..c6508f0321 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) || defined(POWER9) || defined(POWER10) #include "zswap_microk_power8.c" #endif diff --git a/param.h b/param.h index 04928277c2..fd0ea75992 100644 --- a/param.h +++ b/param.h @@ -2260,7 +2260,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(POWER9) +#if defined(POWER9) || defined(POWER10) #define SNUMOPT 16 #define DNUMOPT 8 From 1c53e1366d5441ee7fa22b77be7bea8c5eabef32 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Jun 2020 22:04:37 +0200 Subject: [PATCH 004/349] Increment version to 0.3.10.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 12621d6b8b..bb5322a1d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 10) +set(OpenBLAS_PATCH_VERSION 10.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") From 1bd3cd66c270134d138f7b61cd158407a07086cf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 14 Jun 2020 22:05:19 +0200 Subject: [PATCH 005/349] Increment version to 0.3.10.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 551c094cac..2c12177ee6 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.10 +VERSION = 0.3.10.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From a2d13ea61183099c05aa31e23ef59e1411d77177 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 16 Jun 2020 14:40:50 +0200 Subject: [PATCH 006/349] Fix gcc version detection for zarch Employ common variables for gcc version detection and fix the broken check for gcc >= 5.2. 
Fixes #2668 Signed-off-by: Marius Hillenbrand --- Makefile.system | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/Makefile.system b/Makefile.system index 8d78b420fb..5738b14ecb 100644 --- a/Makefile.system +++ b/Makefile.system @@ -282,9 +282,11 @@ endif ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) +GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) endif @@ -570,20 +572,27 @@ ifeq ($(ARCH), zarch) DYNAMIC_CORE = ZARCH_GENERIC # Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer -GCC_GE_52 := $(subst 0,,$(shell expr `$(CC) -dumpversion` \>= "5.2")) +ifeq ($(GCCVERSIONGT5), 1) + ZARCH_SUPPORT_Z13 := 1 +else ifeq ($(GCCVERSIONEQ5), 1) +ifeq ($(GCCMINORVERSIONGTEQ2), 1) + ZARCH_SUPPORT_Z13 := 1 +endif +endif ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) -RHEL_WITH_Z13 := $(subst 0,,$(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3")) +ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1) + ZARCH_SUPPORT_Z13 := 1 +endif endif -ifeq ($(or $(GCC_GE_52),$(RHEL_WITH_Z13)), 1) +ifeq ($(ZARCH_SUPPORT_Z13), 1) DYNAMIC_CORE += Z13 else $(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) endif -GCC_MAJOR_GE_7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) -ifeq ($(GCC_MAJOR_GE_7), 1) +ifeq ($(GCCVERSIONGTEQ7), 1) DYNAMIC_CORE += Z14 else $(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) From 
23892917667d87072eef2f18b6120f5d3c029f90 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 16 Jun 2020 14:45:09 +0200 Subject: [PATCH 007/349] Makefile.system: remove duplicate variable GCCVERSIONGT5 ... to bring unified gcc version detection with common variables to the one remaining spot in Makefile.system. Signed-off-by: Marius Hillenbrand --- Makefile.system | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 5738b14ecb..63cdbccd85 100644 --- a/Makefile.system +++ b/Makefile.system @@ -606,7 +606,6 @@ ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 endif ifeq ($(C_COMPILER), GCC) -GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) ifeq ($(GCCVERSIONGT5), 1) DYNAMIC_CORE += POWER9 else From cde4690721ad54043239db000a46537a9169ca02 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 16 Jun 2020 15:45:59 +0200 Subject: [PATCH 008/349] RFC: Use gcc -dumpfullversion to get minor version with gcc-7.x In gcc-7.1, the behavior of -dumpversion changed to be configured at compile-time. On some distributions it only dumps the major version (e.g., Ubuntu), so the current checks for the gcc minor version report false negatives. As a replacement, gcc-7.1 introduced -dumpfullversion which always prints the full version. Update the gcc version detection in Makefile.system to employ -dumpfullversion with gcc-7 and newer. Posting this patch for discussion, since it emerged from discussions around issue #2668 and PR #2669. It is not solving a problem right now, but may be useful in the future. 
Signed-off-by: Marius Hillenbrand --- Makefile.system | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 63cdbccd85..7e0b2757eb 100644 --- a/Makefile.system +++ b/Makefile.system @@ -286,8 +286,15 @@ GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) -GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) -GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) +# Note that the behavior of -dumpversion is compile-time-configurable for +# gcc-7.x and newer. Use -dumpfullversion there +ifeq ($(GCCVERSIONGTEQ7),1) + GCCDUMPVERSION_PARAM := -dumpfullversion +else + GCCDUMPVERSION_PARAM := -dumpversion +endif +GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) +GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) endif # From 478898b37a91836a39d046f8c70e26c6c9fc06c7 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Wed, 17 Jun 2020 16:08:48 +0200 Subject: [PATCH 009/349] cpp_thread_test/dgemv: cap concurrency to number of hw threads on small systems ... instead of (number of hw threads - 4) to avoid invalid numbers on smaller systems. Currently, systems with 4 or fewer CPUs (e.g., small CI VMs) would fail the test. 
Fixes one of the issues discussed in #2668 Signed-off-by: Marius Hillenbrand --- cpp_thread_test/dgemv_thread_safety.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp_thread_test/dgemv_thread_safety.cpp b/cpp_thread_test/dgemv_thread_safety.cpp index 5411fec297..277594ff0b 100644 --- a/cpp_thread_test/dgemv_thread_safety.cpp +++ b/cpp_thread_test/dgemv_thread_safety.cpp @@ -18,7 +18,7 @@ int main(int argc, char* argv[]){ uint32_t maxHwThreads = omp_get_max_threads(); if (maxHwThreads < 52) - numConcurrentThreads = maxHwThreads -4; + numConcurrentThreads = maxHwThreads; if (argc > 4){ std::cout<<"ERROR: too many arguments for thread safety tester"< Date: Wed, 17 Jun 2020 16:15:44 +0200 Subject: [PATCH 010/349] cpp_thread_test/dgemv: fail early if concurrency is zero The two test cases dgemv_tester and dgemm_tester accept the degree of concurrency as command line argument (amongst others). Fail early if value 0 has been specified, instead of later with less-clear symptoms. 
Signed-off-by: Marius Hillenbrand --- cpp_thread_test/cpp_thread_safety_common.h | 8 ++++++++ cpp_thread_test/dgemm_thread_safety.cpp | 2 ++ cpp_thread_test/dgemv_thread_safety.cpp | 2 ++ 3 files changed, 12 insertions(+) diff --git a/cpp_thread_test/cpp_thread_safety_common.h b/cpp_thread_test/cpp_thread_safety_common.h index 60ab5bb2f4..8005369a8a 100644 --- a/cpp_thread_test/cpp_thread_safety_common.h +++ b/cpp_thread_test/cpp_thread_safety_common.h @@ -5,6 +5,14 @@ inline void pauser(){ std::getline(std::cin, dummy); } +void FailIfThreadsAreZero(uint32_t numConcurrentThreads) { + if(numConcurrentThreads == 0) { + std::cout<<"ERROR: Invalid parameter 0 for number of concurrent calls into OpenBLAS!"<>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){ for(uint32_t i=0; i(randomMatSize*randomMatSize); j++){ diff --git a/cpp_thread_test/dgemm_thread_safety.cpp b/cpp_thread_test/dgemm_thread_safety.cpp index 1c5287524f..104c64f2ac 100644 --- a/cpp_thread_test/dgemm_thread_safety.cpp +++ b/cpp_thread_test/dgemm_thread_safety.cpp @@ -46,6 +46,8 @@ int main(int argc, char* argv[]){ std::cout<<"Number of concurrent calls into OpenBLAS : "<(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast(1024*1024)<<" MiB of RAM\n"<(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast(randomMatSize)*numConcurrentThreads*8*2))/static_cast(1024*1024)<<" MiB of RAM\n"< Date: Sat, 20 Jun 2020 00:07:43 +0800 Subject: [PATCH 011/349] AVX512 dgemm tcopy_16 function --- kernel/x86_64/KERNEL.SKYLAKEX | 5 +- kernel/x86_64/dgemm_tcopy_16_skylakex.c | 129 ++++++++++++++++++++++++ 2 files changed, 133 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dgemm_tcopy_16_skylakex.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 65f031d033..9b8b84c301 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ 
b/kernel/x86_64/KERNEL.SKYLAKEX @@ -14,7 +14,7 @@ STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c DGEMMINCOPY = ../generic/gemm_ncopy_16.c -DGEMMITCOPY = ../generic/gemm_tcopy_16.c +DGEMMITCOPY = dgemm_tcopy_16_skylakex.c DGEMMONCOPY = ../generic/gemm_ncopy_2.c DGEMMOTCOPY = ../generic/gemm_tcopy_2.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c @@ -24,3 +24,6 @@ DGEMM_BETA = dgemm_beta_skylakex.c CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c + +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c diff --git a/kernel/x86_64/dgemm_tcopy_16_skylakex.c b/kernel/x86_64/dgemm_tcopy_16_skylakex.c new file mode 100644 index 0000000000..a1da60f8f6 --- /dev/null +++ b/kernel/x86_64/dgemm_tcopy_16_skylakex.c @@ -0,0 +1,129 @@ +#include <stdio.h> +#include "common.h" +#include <immintrin.h> + +int CNAME(BLASLONG dim_second, BLASLONG dim_first, double *src, BLASLONG lead_dim, double *dst){ + double *src1, *src2, *src3, *src4, *dst1; + __m512d z1,z2,z3,z4,z5,z6,z7,z8; __m256d y1,y2,y3,y4; __m128d x1,x2,x3,x4; double s1,s2,s3,s4; + BLASLONG dim1_count, dim2_count, src_inc; + src_inc = 4 * lead_dim - dim_first; + src1 = src; src2 = src + lead_dim; src3 = src2 + lead_dim; src4 = src3 + lead_dim; + for(dim2_count=dim_second; dim2_count>3; dim2_count-=4){ + dst1 = dst + 16 * (dim_second - dim2_count); + for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ + z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; + z3 = _mm512_loadu_pd(src2); z4 = _mm512_loadu_pd(src2+8); src2 += 16; + z5 = _mm512_loadu_pd(src3); z6 = _mm512_loadu_pd(src3+8); src3 += 16; + z7 = _mm512_loadu_pd(src4); z8 = _mm512_loadu_pd(src4+8); src4 += 16; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); + _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); + _mm512_storeu_pd(dst1+32,z5); _mm512_storeu_pd(dst1+40,z6); + _mm512_storeu_pd(dst1+48,z7);
_mm512_storeu_pd(dst1+56,z8); dst1 += 16 * dim_second; + } + dst1 -= 8 * (dim_second - dim2_count); + if(dim1_count>7){ + z1 = _mm512_loadu_pd(src1); src1 += 8; + z2 = _mm512_loadu_pd(src2); src2 += 8; + z3 = _mm512_loadu_pd(src3); src3 += 8; + z4 = _mm512_loadu_pd(src4); src4 += 8; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); + _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); dst1 += 8 * dim_second; + dim1_count -= 8; + } + dst1 -= 4 * (dim_second - dim2_count); + if(dim1_count>3){ + y1 = _mm256_loadu_pd(src1); src1 += 4; + y2 = _mm256_loadu_pd(src2); src2 += 4; + y3 = _mm256_loadu_pd(src3); src3 += 4; + y4 = _mm256_loadu_pd(src4); src4 += 4; + _mm256_storeu_pd(dst1+ 0,y1); _mm256_storeu_pd(dst1+ 4,y2); + _mm256_storeu_pd(dst1+ 8,y3); _mm256_storeu_pd(dst1+12,y4); dst1 += 4 * dim_second; + dim1_count -= 4; + } + dst1 -= 2 * (dim_second - dim2_count); + if(dim1_count>1){ + x1 = _mm_loadu_pd(src1); src1 += 2; + x2 = _mm_loadu_pd(src2); src2 += 2; + x3 = _mm_loadu_pd(src3); src3 += 2; + x4 = _mm_loadu_pd(src4); src4 += 2; + _mm_storeu_pd(dst1+0,x1); _mm_storeu_pd(dst1+2,x2); + _mm_storeu_pd(dst1+4,x3); _mm_storeu_pd(dst1+6,x4); dst1 += 2 * dim_second; + dim1_count -= 2; + } + dst1 -= dim_second - dim2_count; + if(dim1_count>0){ + s1 = *src1; src1++; s2 = *src2; src2++; s3 = *src3; src3++; s4 = *src4; src4++; + dst1[0] = s1; dst1[1] = s2; dst1[2] = s3; dst1[3] = s4; + } + src1 += src_inc; src2 += src_inc; src3 += src_inc; src4 += src_inc; + } + src_inc -= 2 * lead_dim; + for(; dim2_count>1; dim2_count-=2){ + dst1 = dst + 16 * (dim_second - dim2_count); + for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ + z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; + z3 = _mm512_loadu_pd(src2); z4 = _mm512_loadu_pd(src2+8); src2 += 16; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); + _mm512_storeu_pd(dst1+16,z3); _mm512_storeu_pd(dst1+24,z4); dst1 += 16 * dim_second; + } + dst1 -= 8 * (dim_second - 
dim2_count); + if(dim1_count>7){ + z1 = _mm512_loadu_pd(src1); src1 += 8; + z2 = _mm512_loadu_pd(src2); src2 += 8; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); dst1 += 8 * dim_second; + dim1_count -= 8; + } + dst1 -= 4 * (dim_second - dim2_count); + if(dim1_count>3){ + y1 = _mm256_loadu_pd(src1); src1 += 4; + y2 = _mm256_loadu_pd(src2); src2 += 4; + _mm256_storeu_pd(dst1+ 0,y1); _mm256_storeu_pd(dst1+ 4,y2); dst1 += 4 * dim_second; + dim1_count -= 4; + } + dst1 -= 2 * (dim_second - dim2_count); + if(dim1_count>1){ + x1 = _mm_loadu_pd(src1); src1 += 2; + x2 = _mm_loadu_pd(src2); src2 += 2; + _mm_storeu_pd(dst1+0,x1); _mm_storeu_pd(dst1+2,x2); dst1 += 2 * dim_second; + dim1_count -= 2; + } + dst1 -= dim_second - dim2_count; + if(dim1_count>0){ + s1 = *src1; src1++; s2 = *src2; src2++; + dst1[0] = s1; dst1[1] = s2; + } + src1 += src_inc; src2 += src_inc; + } + src_inc -= lead_dim; + for(; dim2_count>0; dim2_count--){ + dst1 = dst + 16 * (dim_second - dim2_count); + for(dim1_count=dim_first; dim1_count>15; dim1_count-=16){ + z1 = _mm512_loadu_pd(src1); z2 = _mm512_loadu_pd(src1+8); src1 += 16; + _mm512_storeu_pd(dst1+ 0,z1); _mm512_storeu_pd(dst1+ 8,z2); dst1 += 16 * dim_second; + } + dst1 -= 8 * (dim_second - dim2_count); + if(dim1_count>7){ + z1 = _mm512_loadu_pd(src1); src1 += 8; + _mm512_storeu_pd(dst1+ 0,z1); dst1 += 8 * dim_second; + dim1_count -= 8; + } + dst1 -= 4 * (dim_second - dim2_count); + if(dim1_count>3){ + y1 = _mm256_loadu_pd(src1); src1 += 4; + _mm256_storeu_pd(dst1+ 0,y1); dst1 += 4 * dim_second; + dim1_count -= 4; + } + dst1 -= 2 * (dim_second - dim2_count); + if(dim1_count>1){ + x1 = _mm_loadu_pd(src1); src1 += 2; + _mm_storeu_pd(dst1+0,x1); dst1 += 2 * dim_second; + dim1_count -= 2; + } + dst1 -= dim_second - dim2_count; + if(dim1_count>0){ + s1 = *src1; src1++; + dst1[0] = s1; + } + src1 += src_inc; + } +} From e6b92750349e273d6bb7b28673f10c39cff90c26 Mon Sep 17 00:00:00 2001 From: User User-User Date: Wed, 24 Jun 2020 09:12:23 
+0300 Subject: [PATCH 012/349] address vs2019 C4293 --- driver/others/dynamic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 1bf0e4a6d6..38eb766430 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -332,7 +332,7 @@ int support_avx512(){ if((ebx & (1<<7)) == 0){ ret=0; //OS does not even support AVX2 } - if((ebx & (1<<31)) != 0){ + if((ebx & (1u<<31)) != 0){ xgetbv(0, &eax, &edx); if((eax & 0xe0) == 0xe0) ret=1; //OS supports AVX512VL @@ -632,7 +632,7 @@ static gotoblas_t *get_coretype(void){ cpuid(0x80000000, &eax, &ebx, &ecx, &edx); if ( (eax & 0xffff) >= 0x01) { cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0) + if ((edx & (1 << 30)) == 0 || (edx & (1u << 31)) == 0) return NULL; } else From df4ade070f745d5c542067b5fd5bab3e29d39dcf Mon Sep 17 00:00:00 2001 From: Kavana Bhat Date: Wed, 24 Jun 2020 04:25:47 -0500 Subject: [PATCH 013/349] Fix for #2671 --- kernel/Makefile.L3 | 94 +++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 0cb02ef855..86772cb22a 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -483,7 +483,7 @@ $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmotcopy.s + $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmotcopy.s m4 shgemmotcopy.s > shgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ rm shgemmotcopy.s shgemmotcopy_nomacros.s @@ -498,7 +498,7 @@ $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) $(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmitcopy.s + $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX 
$< -o - > shgemmitcopy.s m4 shgemmitcopy.s > shgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ rm shgemmitcopy.s shgemmitcopy_nomacros.s @@ -514,7 +514,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ rm sgemmotcopy.s sgemmotcopy_nomacros.s @@ -530,7 +530,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ rm sgemmitcopy.s sgemmitcopy_nomacros.s @@ -542,7 +542,7 @@ endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ rm dgemm_ncopy.s dgemm_ncopy_nomacros.s @@ -560,7 +560,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ rm dgemm_itcopy.s dgemm_itcopy_nomacros.s @@ -603,7 +603,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) 
-UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s + $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ rm cgemm_itcopy.s cgemm_itcopy_nomacros.s @@ -626,7 +626,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ rm zgemm_itcopy.s zgemm_itcopy_nomacros.s @@ -658,7 +658,7 @@ endif $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemm_kernel$(TSUFFIX).s m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s @@ -670,7 +670,7 @@ ifeq ($(BUILD_HALF), 1) $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemm_kernel$(TSUFFIX).s + $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemm_kernel$(TSUFFIX).s m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s @@ -681,7 +681,7 @@ endif $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s m4 dgemm_kernel$(TSUFFIX).s 
> dgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s @@ -694,7 +694,7 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s @@ -704,7 +704,7 @@ endif $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s @@ -714,7 +714,7 @@ endif $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s @@ -724,7 +724,7 @@ endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ rm cgemm_kernel_b.s 
cgemm_kernel_b_nomacros.s @@ -734,7 +734,7 @@ endif $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s @@ -744,7 +744,7 @@ endif $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s @@ -754,7 +754,7 @@ endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s @@ -764,7 +764,7 @@ endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s @@ -788,7 +788,7 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - 
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ rm strmmkernel_ln.s strmmkernel_ln_nomacros.s @@ -798,7 +798,7 @@ endif $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ rm strmmkernel_lt.s strmmkernel_lt_nomacros.s @@ -808,7 +808,7 @@ endif $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ rm strmmkernel_rn.s strmmkernel_rn_nomacros.s @@ -818,7 +818,7 @@ endif $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s @@ -828,7 +828,7 @@ endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - 
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s @@ -838,7 +838,7 @@ endif $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s @@ -848,7 +848,7 @@ endif $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s @@ -858,7 +858,7 @@ endif $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s @@ -880,7 +880,7 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) 
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_ln.s m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s @@ -890,7 +890,7 @@ endif $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s @@ -900,7 +900,7 @@ endif $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s @@ -910,7 +910,7 @@ endif $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c 
-DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s @@ -920,7 +920,7 @@ endif $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s @@ -930,7 +930,7 @@ endif $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s @@ -940,7 +940,7 @@ endif $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s @@ -950,7 +950,7 @@ endif $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s + $(CC) 
$(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s @@ -960,7 +960,7 @@ endif $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s @@ -970,7 +970,7 @@ endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s @@ -980,7 +980,7 @@ endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s @@ -990,7 +990,7 @@ endif 
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s @@ -1000,7 +1000,7 @@ endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s @@ -1010,7 +1010,7 @@ endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s @@ -1020,7 +1020,7 @@ endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) 
-c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s @@ -1030,7 +1030,7 @@ endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s @@ -1050,7 +1050,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s @@ -1184,7 +1184,7 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s + $(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s @@ -2460,7 +2460,7 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM 
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s + $(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s @@ -2506,7 +2506,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s From 571eadb88063c91ea9b5b1bcb2ae33cd8fbc5762 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 24 Jun 2020 14:48:15 -0500 Subject: [PATCH 014/349] powerpc: Optimized SGEMM/DGEMM/CGEMM for POWER10 This patch introduces new optimized versions of SGEMM, CGEMM and DGEMM using the POWER10 Matrix-Multiply Assist (MMA) feature introduced in POWER ISA v3.1. It makes use of the new POWER10 compute instructions for matrix multiplication. Tested on a simulator; there are no new test failures. Cycle count is reduced by 30-50% compared to the POWER9 version, depending on M/N/K sizes.
MMA GCC patch for reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=8ee2640bfdc62f835ec9740278f948034bc7d9f1 --- kernel/power/KERNEL.POWER10 | 12 +- kernel/power/cgemm_kernel_power10.S | 286 +++ kernel/power/cgemm_logic_power10.S | 2814 +++++++++++++++++++++++++++ kernel/power/cgemm_macros_power10.S | 2131 ++++++++++++++++++++ kernel/power/dgemm_kernel_power10.c | 864 ++++++++ kernel/power/sgemm_kernel_power10.c | 1334 +++++++++++++ 6 files changed, 7435 insertions(+), 6 deletions(-) create mode 100644 kernel/power/cgemm_kernel_power10.S create mode 100644 kernel/power/cgemm_logic_power10.S create mode 100644 kernel/power/cgemm_macros_power10.S create mode 100644 kernel/power/dgemm_kernel_power10.c create mode 100644 kernel/power/sgemm_kernel_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index ab8fbfcd93..00d31f8b6a 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -7,12 +7,12 @@ else #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -STRMMKERNEL = sgemm_kernel_power9.S -DTRMMKERNEL = dgemm_kernel_power9.S -CTRMMKERNEL = cgemm_kernel_power9.S +STRMMKERNEL = sgemm_kernel_power10.c +DTRMMKERNEL = dgemm_kernel_power10.c +CTRMMKERNEL = cgemm_kernel_power10.S ZTRMMKERNEL = zgemm_kernel_power9.S -SGEMMKERNEL = sgemm_kernel_power9.S +SGEMMKERNEL = sgemm_kernel_power10.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c @@ -22,7 +22,7 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = dgemm_kernel_power9.S +DGEMMKERNEL = dgemm_kernel_power10.c DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMITCOPY = dgemm_tcopy_16_power8.S DGEMMONCOPY = dgemm_ncopy_4_power8.S @@ -32,7 +32,7 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = 
dgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMKERNEL = cgemm_kernel_power9.S +CGEMMKERNEL = cgemm_kernel_power10.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_4.c diff --git a/kernel/power/cgemm_kernel_power10.S b/kernel/power/cgemm_kernel_power10.S new file mode 100644 index 0000000000..e04f948dd5 --- /dev/null +++ b/kernel/power/cgemm_kernel_power10.S @@ -0,0 +1,286 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + + +#define LOAD ld +#define STACKSIZE (512 ) /* bytes of stack frame allocated in the prologue; the register save area below uses offsets 0..464 */ +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 /* LDC (always) and OFFSET (TRMMKERNEL builds only) are loaded from the caller's frame via FRAMESLOT in the prologue */ + + +#define alpha_r vs51 +#define alpha_i vs55 /* alpha_r and alpha_i are filled from vs1 and vs2 below, converted to single precision and splatted */ +#define save_permute_1 vs59 +#define permute_mask vs63 +#define o0 0 + + +#define T1 r11 +#define T2 r12 +#define T3 r14 +#define T4 r15 +#define T5 r16 +#define T6 r17 +#define L r18 +#define T7 r19 +#define T8 r20 +#define TEMP_REG r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T9 r27 +#define T10 r28 +#define PRE r29 /* set to 512 below; used as the dcbt offset register in cgemm_logic_power10.S */ + +#define T12 r30 +#define T13 r31 + +#include "cgemm_macros_power10.S" + +.equ perm_const1, 0x0405060700010203 +.equ perm_const2, 0x0c0d0e0f08090a0b +.equ save_permute_12, 0x0c0d0e0f1c1d1e1f +.equ save_permute_11, 0x0405060714151617 /* the four .equ values are the 64-bit halves packed into permute_mask and save_permute_1 below */ + + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + + addi SP, SP, -STACKSIZE /* allocate the STACKSIZE-byte frame */ + mflr r0 /* r0 = return address; stored to FLINK_SAVE(SP) after the register saves */ + + + /* save f14-f31 at 0..136(SP) */ stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + + /* save r14-r31 at 144..280(SP) */ std r31,
144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + /* save vs52-vs63 at 288..464(SP) */ stxv vs52, 288(SP) + stxv vs53, 304(SP) + stxv vs54, 320(SP) + stxv vs55, 336(SP) + stxv vs56, 352(SP) + stxv vs57, 368(SP) + stxv vs58, 384(SP) + stxv vs59, 400(SP) + stxv vs60, 416(SP) + stxv vs61, 432(SP) + stxv vs62, 448(SP) + stxv vs63, 464(SP) + std r0, FLINK_SAVE(SP) /* store the return address captured by mflr above */ + + + + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) /* LDC is read from the caller's frame; FRAMESLOT is defined outside this file */ + + + +#ifdef TRMMKERNEL + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + slwi LDC, LDC, ZBASE_SHIFT /* LDC <<= ZBASE_SHIFT; presumably converts an element count to bytes - ZBASE_SHIFT is defined elsewhere, TODO confirm */ + + + + /* alpha is read from vs1 (into alpha_r) and vs2 (into alpha_i): convert each to single precision, then splat word 0 across the register */ + xscvdpspn alpha_r,vs1 + xscvdpspn alpha_i,vs2 + xxspltw alpha_r,alpha_r,0 + xxspltw alpha_i,alpha_i,0 +/*load reverse permute mask for big endian + uint128 = 0xc0d0e0f08090a0b0405060700010203 +*/ + + /* assemble each 64-bit constant in a GPR 16 bits at a time: @highest, @higher, shift left by 32 (rldicr), then @h and @l */ lis T2, perm_const2@highest + lis T1, perm_const1@highest + lis T3, save_permute_12@highest + lis T4, save_permute_11@highest + + + ori T2, T2, perm_const2@higher + ori T1, T1, perm_const1@higher + ori T3, T3, save_permute_12@higher + ori T4, T4, save_permute_11@higher + + + rldicr T2, T2, 32, 31 + rldicr T1, T1, 32, 31 + rldicr T3, T3, 32, 31 + rldicr T4, T4, 32, 31 + + oris T2, T2, perm_const2@h + oris T1, T1, perm_const1@h + oris T3, T3, save_permute_12@h + oris T4, T4, save_permute_11@h + + + ori T2, T2, perm_const2@l + ori T1, T1, perm_const1@l + ori T3, T3, save_permute_12@l + ori T4, T4, save_permute_11@l + + + li r0,0 + li PRE,512 /* PRE = 512: offset handed to dcbt in the included logic file */ + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegsp alpha_r,alpha_r + xvnegsp alpha_i,alpha_i +#endif + + mtvsrdd permute_mask,T2,T1 /* pack the two GPR halves into one 128-bit vector register */ + mtvsrdd save_permute_1,T3,T4 + + /*mask is reverse permute so we have to make it inner permute */ + xxpermdi
permute_mask, permute_mask, permute_mask,2 + +#include "cgemm_logic_power10.S" /* the kernel's loop code is textually included here */ + +.L999: /* epilogue: restore everything saved in the prologue, release the frame and return */ + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) /* reload the saved return address */ + + lxv vs52, 288(SP) + lxv vs53, 304(SP) + lxv vs54, 320(SP) + lxv vs55, 336(SP) + lxv vs56, 352(SP) + lxv vs57, 368(SP) + lxv vs58, 384(SP) + lxv vs59, 400(SP) + mtlr r0 /* return address goes back into LR, interleaved with the vector restores */ + lxv vs60, 416(SP) + lxv vs61, 432(SP) + lxv vs62, 448(SP) + lxv vs63, 464(SP) + + addi SP, SP, STACKSIZE /* release the frame allocated in the prologue */ + blr + + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_power10.S b/kernel/power/cgemm_logic_power10.S new file mode 100644 index 0000000000..3700ac87bc --- /dev/null +++ b/kernel/power/cgemm_logic_power10.S @@ -0,0 +1,2814 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3.
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define MY_ALIGN .align 3 +b CGEMM_L4 +/* MINI SUBROUTINES */ +/* 4x8 MAIN 128x+2 LOOP */ + + +CGEMM_L4x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x8_2 + MY_ALIGN +CGEMM_L4x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 +CGEMM_L4x8_K128: +/*----------------------------------------*/ + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + 
KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_L2 128,64,31,0 + KERNEL4x8_L2 128,64,32,0 + KERNEL4x8_L2 128,64,33,0 + KERNEL4x8_L2 128,64,34,0 + KERNEL4x8_L2 128,64,35,0 + KERNEL4x8_L2 128,64,36,0 + KERNEL4x8_L2 128,64,37,0 + KERNEL4x8_L2 128,64,38,0 + KERNEL4x8_L2 128,64,39,0 + KERNEL4x8_L2 128,64,40,0 + KERNEL4x8_L2 128,64,41,0 + KERNEL4x8_L2 128,64,42,0 + KERNEL4x8_L2 128,64,43,0 + KERNEL4x8_L2 128,64,44,0 + KERNEL4x8_L2 128,64,45,0 + KERNEL4x8_L2 128,64,46,0 + KERNEL4x8_L2 128,64,47,0 + KERNEL4x8_L2 128,64,48,0 + KERNEL4x8_L2 128,64,49,0 + KERNEL4x8_L2 128,64,50,0 + KERNEL4x8_L2 128,64,51,0 + KERNEL4x8_L2 128,64,52,0 + KERNEL4x8_L2 128,64,53,0 + KERNEL4x8_L2 128,64,54,0 + KERNEL4x8_L2 128,64,55,0 + KERNEL4x8_L2 128,64,56,0 + KERNEL4x8_L2 128,64,57,0 + KERNEL4x8_L2 128,64,58,0 + KERNEL4x8_L2 128,64,59,0 + KERNEL4x8_L2 128,64,60,0 + KERNEL4x8_L2 128,64,61,0 + KERNEL4x8_L2 128,64,62,0 + KERNEL4x8_L2 128,64,63,1 + bdnz CGEMM_L4x8_LOOP + MY_ALIGN +CGEMM_L4x8_LOOP_END: +/*----------------------------------------*/ + END4x8_2 + blr + MY_ALIGN + + +CGEMM_4x8_L64_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_L2 128,64,15,0 + KERNEL4x8_L2 
128,64,16,0 + KERNEL4x8_L2 128,64,17,0 + KERNEL4x8_L2 128,64,18,0 + KERNEL4x8_L2 128,64,19,0 + KERNEL4x8_L2 128,64,20,0 + KERNEL4x8_L2 128,64,21,0 + KERNEL4x8_L2 128,64,22,0 + KERNEL4x8_L2 128,64,23,0 + KERNEL4x8_L2 128,64,24,0 + KERNEL4x8_L2 128,64,25,0 + KERNEL4x8_L2 128,64,26,0 + KERNEL4x8_L2 128,64,27,0 + KERNEL4x8_L2 128,64,28,0 + KERNEL4x8_L2 128,64,29,0 + KERNEL4x8_L2 128,64,30,0 + KERNEL4x8_E2 128,64,31,1 + blr + MY_ALIGN + + +CGEMM_4x8_L32_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_L2 128,64,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL4x8_L2 128,64,8,0 + KERNEL4x8_L2 128,64,9,0 + KERNEL4x8_L2 128,64,10,0 + KERNEL4x8_L2 128,64,11,0 + dcbt BO, T4 + KERNEL4x8_L2 128,64,12,0 + KERNEL4x8_L2 128,64,13,0 + KERNEL4x8_L2 128,64,14,0 + KERNEL4x8_E2 128,64,15,1 + blr + MY_ALIGN + + +CGEMM_4x8_L16_SUB: +/*----------------------------------------*/ + LOAD4x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL4x8_L2 128,64,0,0 + KERNEL4x8_L2 128,64,1,0 + dcbt AO, T2 + KERNEL4x8_L2 128,64,2,0 + KERNEL4x8_L2 128,64,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL4x8_L2 128,64,4,0 + KERNEL4x8_L2 128,64,5,0 + dcbt AO, T4 + KERNEL4x8_L2 128,64,6,0 + KERNEL4x8_E2 128,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x4_2 + MY_ALIGN +CGEMM_L4x4_LOOP: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,0,0 +CGEMM_L4x4_K32: +/*----------------------------------------*/ + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_L2 64,64,7,0 + KERNEL4x4_L2 64,64,8,0 + KERNEL4x4_L2 64,64,9,0 + KERNEL4x4_L2 64,64,10,0 + KERNEL4x4_L2 
64,64,11,0 + KERNEL4x4_L2 64,64,12,0 + KERNEL4x4_L2 64,64,13,0 + KERNEL4x4_L2 64,64,14,0 + KERNEL4x4_L2 64,64,15,1 + bdnz CGEMM_L4x4_LOOP + MY_ALIGN +CGEMM_L4x4_LOOP_END: +/*----------------------------------------*/ + END4x4_2 + blr + MY_ALIGN + + +CGEMM_4x4_L16_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_L2 64,64,3,0 + KERNEL4x4_L2 64,64,4,0 + KERNEL4x4_L2 64,64,5,0 + KERNEL4x4_L2 64,64,6,0 + KERNEL4x4_E2 64,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x4_L8_SUB: +/*----------------------------------------*/ + LOAD4x4_2 + KERNEL4x4_L2 64,64,0,0 + KERNEL4x4_L2 64,64,1,0 + KERNEL4x4_L2 64,64,2,0 + KERNEL4x4_E2 64,64,3,1 + blr + + +CGEMM_4x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x2_2 + MY_ALIGN +CGEMM_L4x2_LOOP: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,0,0 +CGEMM_L4x2_K32: +/*----------------------------------------*/ + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_L2 32,64,7,0 + KERNEL4x2_L2 32,64,8,0 + KERNEL4x2_L2 32,64,9,0 + KERNEL4x2_L2 32,64,10,0 + KERNEL4x2_L2 32,64,11,0 + KERNEL4x2_L2 32,64,12,0 + KERNEL4x2_L2 32,64,13,0 + KERNEL4x2_L2 32,64,14,0 + KERNEL4x2_L2 32,64,15,1 + bdnz CGEMM_L4x2_LOOP + MY_ALIGN + + +CGEMM_L4x2_LOOP_END: +/*----------------------------------------*/ + END4x2_2 + blr + MY_ALIGN +CGEMM_4x2_L16_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_L2 32,64,3,0 + KERNEL4x2_L2 32,64,4,0 + KERNEL4x2_L2 32,64,5,0 + KERNEL4x2_L2 32,64,6,0 + KERNEL4x2_E2 32,64,7,1 + blr + MY_ALIGN +CGEMM_4x2_L8_SUB: +/*----------------------------------------*/ + LOAD4x2_2 + KERNEL4x2_L2 32,64,0,0 + KERNEL4x2_L2 32,64,1,0 + KERNEL4x2_L2 32,64,2,0 + KERNEL4x2_E2 32,64,3,1 + blr + + 
+CGEMM_4x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD4x1_2 + MY_ALIGN +CGEMM_L4x1_LOOP: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,0,0 +CGEMM_L4x1_K32: +/*----------------------------------------*/ + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_L2 16,64,7,0 + KERNEL4x1_L2 16,64,8,0 + KERNEL4x1_L2 16,64,9,0 + KERNEL4x1_L2 16,64,10,0 + KERNEL4x1_L2 16,64,11,0 + KERNEL4x1_L2 16,64,12,0 + KERNEL4x1_L2 16,64,13,0 + KERNEL4x1_L2 16,64,14,0 + KERNEL4x1_L2 16,64,15,1 + bdnz CGEMM_L4x1_LOOP + MY_ALIGN +CGEMM_L4x1_LOOP_END: +/*----------------------------------------*/ + END4x1_2 + blr + + MY_ALIGN +CGEMM_4x1_L16_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_L2 16,64,3,0 + KERNEL4x1_L2 16,64,4,0 + KERNEL4x1_L2 16,64,5,0 + KERNEL4x1_L2 16,64,6,0 + KERNEL4x1_E2 16,64,7,1 + blr + MY_ALIGN + + +CGEMM_4x1_L8_SUB: +/*----------------------------------------*/ + LOAD4x1_2 + KERNEL4x1_L2 16,64,0,0 + KERNEL4x1_L2 16,64,1,0 + KERNEL4x1_L2 16,64,2,0 + KERNEL4x1_E2 16,64,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L4: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + /* Pre set value in vs57 as 0xffff0000ffff0000 for masking */ + vspltisb v24, -1 + vspltisb v25, 0 + xxsldwi vs57, vs56, vs57, 1 + xxpermdi vs57, vs57, vs57, 3 + srawi. J, N, 2 + ble CGEMM_L4_END + + +CGEMM_L4_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 2 + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. 
I, M, 3 + ble CGEMM_L4x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L4x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO4x8 + ble CGEMM_L4x8_SUB0 + bl CGEMM_L4x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 + + +CGEMM_L4x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP4x8_128K + addi BO,BO,-32 + addi AO,AO,-64 + LOAD4x8O 64,32 + END4x8_WITHOUT_ADD + LOAD4x8_2O 128, 64 + mtctr T8 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + CMP4x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L4x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-128 + LOAD4x8_2O 128,64 + bl CGEMM_L4x8_K128 + b CGEMM_L4x8_SAVE + MY_ALIGN + + +CGEMM_L4x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L4x8_SUB2_32 + bl CGEMM_4x8_L64_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L4x8_SUB2_16 + bl CGEMM_4x8_L32_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x8_SUB2_8 + bl CGEMM_4x8_L16_SUB + MY_ALIGN + + +CGEMM_L4x8_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L4x8_SUB2_4 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_L2 128,64, 1,0 + KERNEL4x8_L2 128,64, 2,0 + KERNEL4x8_E2 128,64, 3,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x8_SUB2_2 + LOAD4x8_2 + KERNEL4x8_L2 128,64, 0,0 + KERNEL4x8_E2 128,64, 1,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x8_SUB2_1 + LOAD4x8_2 + KERNEL4x8_E2 128,64, 0,1 + MY_ALIGN + + +CGEMM_L4x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x8_SAVE + KERNEL4x8 + + MY_ALIGN +CGEMM_L4x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE4x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 +#endif + bgt CGEMM_L4x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END + b CGEMM_L4x4_BEGIN + MY_ALIGN + + +CGEMM_L4x8_END: +/*----------------------------------------*/ + + +CGEMM_L4x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L4x1_END + andi. T1, M, 4 + ble CGEMM_L4x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x4 + ble CGEMM_L4x4_SUB0 + bl CGEMM_4x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 + + +CGEMM_L4x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. 
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x4_32K + addi BO,BO,-32 + addi AO,AO,-32 + LOAD4x4O 32,32 + END4x4_WITHOUT_ADD + LOAD4x4_2O 64, 64 + mtctr T8 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + CMP4x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-64 + LOAD4x4_2O 64,64 + bl CGEMM_L4x4_K32 + b CGEMM_L4x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x4_SUB2_8 + bl CGEMM_4x4_L16_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x4_SUB2_4 + bl CGEMM_4x4_L8_SUB + MY_ALIGN + + +CGEMM_L4x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x4_SUB2_2 + LOAD4x4_2 + KERNEL4x4_L2 64,64, 0,0 + KERNEL4x4_E2 64,64, 1,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x4_SUB2_1 + LOAD4x4_2 + KERNEL4x4_E2 64,64, 0,1 + MY_ALIGN + + +CGEMM_L4x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x4_SAVE + KERNEL4x4 + + +CGEMM_L4x4_SAVE: +/*----------------------------------------*/ + SAVE4x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 +#endif + + +CGEMM_L4x4_END: +/*----------------------------------------*/ + + +CGEMM_L4x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L4x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x2 + ble CGEMM_L4x2_SUB0 + bl CGEMM_4x2_LMAIN_SUB + andi. 
L, T1, 31 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 + + +CGEMM_L4x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x2_32K + addi BO,BO,-32 + addi AO,AO,-16 + LOAD4x2O 16,32 + END4x2_WITHOUT_ADD + LOAD4x2_2O 32, 64 + mtctr T8 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + CMP4x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-32 + LOAD4x2_2O 32,64 + bl CGEMM_L4x2_K32 + b CGEMM_L4x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x2_SUB2_8 + bl CGEMM_4x2_L16_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x2_SUB2_4 + bl CGEMM_4x2_L8_SUB + MY_ALIGN + + +CGEMM_L4x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x2_SUB2_2 + LOAD4x2_2 + KERNEL4x2_L2 32,64, 0,0 + KERNEL4x2_E2 32,64, 1,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x2_SUB2_1 + LOAD4x2_2 + KERNEL4x2_E2 32,64, 0,1 + MY_ALIGN + + +CGEMM_L4x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x2_SAVE + KERNEL4x2 + + MY_ALIGN +CGEMM_L4x2_SAVE: +/*----------------------------------------*/ + SAVE4x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 +#endif + + +CGEMM_L4x2_END: +/*----------------------------------------*/ + + +CGEMM_L4x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L4x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 + mr T1, T6 + addi T1,T1, -2 + srawi. 
T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO4x1 + ble CGEMM_L4x1_SUB0 + bl CGEMM_4x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 + + +CGEMM_L4x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP4x1_32K + addi BO,BO,-32 + addi AO,AO,-8 + LOAD4x1O 8,32 + END4x1_WITHOUT_ADD + LOAD4x1_2O 16, 64 + mtctr T8 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + CMP4x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L4x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-64 + addi AO,AO,-16 + LOAD4x1_2O 16,64 + bl CGEMM_L4x1_K32 + b CGEMM_L4x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L4x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L4x1_SUB2_8 + bl CGEMM_4x1_L16_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L4x1_SUB2_4 + bl CGEMM_4x1_L8_SUB + MY_ALIGN + + +CGEMM_L4x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L4x1_SUB2_2 + LOAD4x1_2 + KERNEL4x1_L2 16,64, 0,0 + KERNEL4x1_E2 16,64, 1,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L4x1_SUB2_1 + LOAD4x1_2 + KERNEL4x1_E2 16,64, 0,1 + MY_ALIGN + + +CGEMM_L4x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L4x1_SAVE + KERNEL4x1 + + MY_ALIGN +CGEMM_L4x1_SAVE: +/*----------------------------------------*/ + + SAVE4x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 +#endif + + +CGEMM_L4x1_END: +/*----------------------------------------*/ + slwi T1, K, 5 + addic. 
J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 4 +#endif + bgt CGEMM_L4_BEGIN + + +CGEMM_L4_END: + +b CGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +CGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x8_2 + MY_ALIGN +CGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 +CGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_L2 128,32,31,0 + KERNEL2x8_L2 128,32,32,0 + KERNEL2x8_L2 128,32,33,0 + KERNEL2x8_L2 128,32,34,0 + KERNEL2x8_L2 128,32,35,0 + KERNEL2x8_L2 128,32,36,0 + KERNEL2x8_L2 128,32,37,0 + KERNEL2x8_L2 128,32,38,0 + KERNEL2x8_L2 128,32,39,0 + KERNEL2x8_L2 128,32,40,0 + KERNEL2x8_L2 128,32,41,0 + KERNEL2x8_L2 128,32,42,0 + KERNEL2x8_L2 128,32,43,0 + KERNEL2x8_L2 128,32,44,0 + KERNEL2x8_L2 128,32,45,0 + KERNEL2x8_L2 128,32,46,0 + KERNEL2x8_L2 128,32,47,0 + KERNEL2x8_L2 128,32,48,0 + KERNEL2x8_L2 128,32,49,0 + KERNEL2x8_L2 128,32,50,0 + KERNEL2x8_L2 128,32,51,0 + KERNEL2x8_L2 128,32,52,0 + 
KERNEL2x8_L2 128,32,53,0 + KERNEL2x8_L2 128,32,54,0 + KERNEL2x8_L2 128,32,55,0 + KERNEL2x8_L2 128,32,56,0 + KERNEL2x8_L2 128,32,57,0 + KERNEL2x8_L2 128,32,58,0 + KERNEL2x8_L2 128,32,59,0 + KERNEL2x8_L2 128,32,60,0 + KERNEL2x8_L2 128,32,61,0 + KERNEL2x8_L2 128,32,62,0 + KERNEL2x8_L2 128,32,63,1 + bdnz CGEMM_L2x8_LOOP + MY_ALIGN +CGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + END2x8_2 + blr + MY_ALIGN + + +CGEMM_2x8_L64_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_L2 128,32,15,0 + KERNEL2x8_L2 128,32,16,0 + KERNEL2x8_L2 128,32,17,0 + KERNEL2x8_L2 128,32,18,0 + KERNEL2x8_L2 128,32,19,0 + KERNEL2x8_L2 128,32,20,0 + KERNEL2x8_L2 128,32,21,0 + KERNEL2x8_L2 128,32,22,0 + KERNEL2x8_L2 128,32,23,0 + KERNEL2x8_L2 128,32,24,0 + KERNEL2x8_L2 128,32,25,0 + KERNEL2x8_L2 128,32,26,0 + KERNEL2x8_L2 128,32,27,0 + KERNEL2x8_L2 128,32,28,0 + KERNEL2x8_L2 128,32,29,0 + KERNEL2x8_L2 128,32,30,0 + KERNEL2x8_E2 128,32,31,1 + blr + MY_ALIGN + + +CGEMM_2x8_L32_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_L2 128,32,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_L2 128,32,8,0 + KERNEL2x8_L2 128,32,9,0 + KERNEL2x8_L2 128,32,10,0 + 
KERNEL2x8_L2 128,32,11,0 + dcbt BO, T4 + KERNEL2x8_L2 128,32,12,0 + KERNEL2x8_L2 128,32,13,0 + KERNEL2x8_L2 128,32,14,0 + KERNEL2x8_E2 128,32,15,1 + blr + MY_ALIGN + + +CGEMM_2x8_L16_SUB: +/*----------------------------------------*/ + LOAD2x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_L2 128,32,0,0 + KERNEL2x8_L2 128,32,1,0 + dcbt AO, T2 + KERNEL2x8_L2 128,32,2,0 + KERNEL2x8_L2 128,32,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_L2 128,32,4,0 + KERNEL2x8_L2 128,32,5,0 + dcbt AO, T4 + KERNEL2x8_L2 128,32,6,0 + KERNEL2x8_E2 128,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x4_2 + MY_ALIGN +CGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,0,0 +CGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_L2 64,32,7,0 + KERNEL2x4_L2 64,32,8,0 + KERNEL2x4_L2 64,32,9,0 + KERNEL2x4_L2 64,32,10,0 + KERNEL2x4_L2 64,32,11,0 + KERNEL2x4_L2 64,32,12,0 + KERNEL2x4_L2 64,32,13,0 + KERNEL2x4_L2 64,32,14,0 + KERNEL2x4_L2 64,32,15,1 + bdnz CGEMM_L2x4_LOOP + MY_ALIGN +CGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + END2x4_2 + blr + MY_ALIGN + + +CGEMM_2x4_L16_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_L2 64,32,3,0 + KERNEL2x4_L2 64,32,4,0 + KERNEL2x4_L2 64,32,5,0 + KERNEL2x4_L2 64,32,6,0 + KERNEL2x4_E2 64,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x4_L8_SUB: +/*----------------------------------------*/ + LOAD2x4_2 + KERNEL2x4_L2 64,32,0,0 + KERNEL2x4_L2 64,32,1,0 + KERNEL2x4_L2 64,32,2,0 + KERNEL2x4_E2 64,32,3,1 + blr + + +CGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x2_2 + MY_ALIGN +CGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_L2 
32,32,0,0 +CGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_L2 32,32,7,0 + KERNEL2x2_L2 32,32,8,0 + KERNEL2x2_L2 32,32,9,0 + KERNEL2x2_L2 32,32,10,0 + KERNEL2x2_L2 32,32,11,0 + KERNEL2x2_L2 32,32,12,0 + KERNEL2x2_L2 32,32,13,0 + KERNEL2x2_L2 32,32,14,0 + KERNEL2x2_L2 32,32,15,1 + bdnz CGEMM_L2x2_LOOP + MY_ALIGN + + +CGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + END2x2_2 + blr + MY_ALIGN +CGEMM_2x2_L16_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_L2 32,32,3,0 + KERNEL2x2_L2 32,32,4,0 + KERNEL2x2_L2 32,32,5,0 + KERNEL2x2_L2 32,32,6,0 + KERNEL2x2_E2 32,32,7,1 + blr + MY_ALIGN +CGEMM_2x2_L8_SUB: +/*----------------------------------------*/ + LOAD2x2_2 + KERNEL2x2_L2 32,32,0,0 + KERNEL2x2_L2 32,32,1,0 + KERNEL2x2_L2 32,32,2,0 + KERNEL2x2_E2 32,32,3,1 + blr + + +CGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +CGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,0,0 +CGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_L2 16,32,7,0 + KERNEL2x1_L2 16,32,8,0 + KERNEL2x1_L2 16,32,9,0 + KERNEL2x1_L2 16,32,10,0 + KERNEL2x1_L2 16,32,11,0 + KERNEL2x1_L2 16,32,12,0 + KERNEL2x1_L2 16,32,13,0 + KERNEL2x1_L2 16,32,14,0 + KERNEL2x1_L2 16,32,15,1 + bdnz CGEMM_L2x1_LOOP + MY_ALIGN +CGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN +CGEMM_2x1_L16_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + 
KERNEL2x1_L2 16,32,3,0 + KERNEL2x1_L2 16,32,4,0 + KERNEL2x1_L2 16,32,5,0 + KERNEL2x1_L2 16,32,6,0 + KERNEL2x1_E2 16,32,7,1 + blr + MY_ALIGN + + +CGEMM_2x1_L8_SUB: +/*----------------------------------------*/ + LOAD2x1_2 + KERNEL2x1_L2 16,32,0,0 + KERNEL2x1_L2 16,32,1,0 + KERNEL2x1_L2 16,32,2,0 + KERNEL2x1_E2 16,32,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L2: +/*----------------------------------------*/ + + andi. J, N, 2 /* J = N & 2; the branch below skips this whole section when that bit is clear */ + ble CGEMM_L2_END + + +CGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC , 1 /* T1 = LDC << 1 */ + add T2,C,LDC /* T2 = C plus LDC; only used as a dcbt address below before T2 is reloaded */ + mr AO, A + add C, C, T1 /* advance C by T1 for whatever section runs next */ +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 /* I = M >> 3; no 8-wide tiles when the result is <= 0 */ + ble CGEMM_L2x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* T8 = (T6-2) >> 7 (previously noted as: (T1-2) % 128x); CGEMM_L2x8_LMAIN_SUB moves T8 into CTR */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* T8 = (K-2) >> 7 (previously noted as: (K-2) % 128x); CGEMM_L2x8_LMAIN_SUB moves T8 into CTR */ +#endif + ZERO2x8 + ble CGEMM_L2x8_SUB0 + bl CGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 /* L = low seven bits of T1; zero means go straight to the save step */ + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 + + +CGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi.
L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP2x8_128K + addi BO,BO,-16 + addi AO,AO,-64 + LOAD2x8O 64,16 + END2x8_WITHOUT_ADD + LOAD2x8_2O 128, 32 + mtctr T8 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-128 + LOAD2x8_2O 128,32 + bl CGEMM_L2x8_K128 + b CGEMM_L2x8_SAVE + MY_ALIGN + + +CGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L2x8_SUB2_32 + bl CGEMM_2x8_L64_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L2x8_SUB2_16 + bl CGEMM_2x8_L32_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x8_SUB2_8 + bl CGEMM_2x8_L16_SUB + MY_ALIGN + + +CGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x8_SUB2_4 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_L2 128,32, 1,0 + KERNEL2x8_L2 128,32, 2,0 + KERNEL2x8_E2 128,32, 3,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x8_SUB2_2 + LOAD2x8_2 + KERNEL2x8_L2 128,32, 0,0 + KERNEL2x8_E2 128,32, 1,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x8_SUB2_1 + LOAD2x8_2 + KERNEL2x8_E2 128,32, 0,1 + MY_ALIGN + + +CGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x8_SAVE + KERNEL2x8 + + MY_ALIGN +CGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 +#endif + bgt CGEMM_L2x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. 
T1, M, 4 + ble CGEMM_L2x4_END + b CGEMM_L2x4_BEGIN + MY_ALIGN + + +CGEMM_L2x8_END: +/*----------------------------------------*/ + + +CGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L2x1_END + andi. T1, M, 4 + ble CGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x4 + ble CGEMM_L2x4_SUB0 + bl CGEMM_2x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 + + +CGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x4_32K + addi BO,BO,-16 + addi AO,AO,-32 + LOAD2x4O 32,16 + END2x4_WITHOUT_ADD + LOAD2x4_2O 64, 32 + mtctr T8 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-64 + LOAD2x4_2O 64,32 + bl CGEMM_L2x4_K32 + b CGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x4_SUB2_8 + bl CGEMM_2x4_L16_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x4_SUB2_4 + bl CGEMM_2x4_L8_SUB + MY_ALIGN + + +CGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x4_SUB2_2 + LOAD2x4_2 + KERNEL2x4_L2 64,32, 0,0 + KERNEL2x4_E2 64,32, 1,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. 
T1,L, 2 + ble CGEMM_L2x4_SUB2_1 + LOAD2x4_2 + KERNEL2x4_E2 64,32, 0,1 + MY_ALIGN + + +CGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x4_SAVE + KERNEL2x4 + + +CGEMM_L2x4_SAVE: +/*----------------------------------------*/ + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 +#endif + + +CGEMM_L2x4_END: +/*----------------------------------------*/ + + +CGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x2 + ble CGEMM_L2x2_SUB0 + bl CGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 + + +CGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x2_32K + addi BO,BO,-16 + addi AO,AO,-16 + LOAD2x2O 16,16 + END2x2_WITHOUT_ADD + LOAD2x2_2O 32, 32 + mtctr T8 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-32 + LOAD2x2_2O 32,32 + bl CGEMM_L2x2_K32 + b CGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x2_SUB2_8 + bl CGEMM_2x2_L16_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L2x2_SUB2_4 + bl CGEMM_2x2_L8_SUB + MY_ALIGN + + +CGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. 
T1,L, 4 + ble CGEMM_L2x2_SUB2_2 + LOAD2x2_2 + KERNEL2x2_L2 32,32, 0,0 + KERNEL2x2_E2 32,32, 1,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x2_SUB2_1 + LOAD2x2_2 + KERNEL2x2_E2 32,32, 0,1 + MY_ALIGN + + +CGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x2_SAVE + KERNEL2x2 + + MY_ALIGN +CGEMM_L2x2_SAVE: +/*----------------------------------------*/ + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 +#endif + + +CGEMM_L2x2_END: +/*----------------------------------------*/ + + +CGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /**(T1-2) % 32x */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /**(K-2) % 32x */ +#endif + ZERO2x1 + ble CGEMM_L2x1_SUB0 + bl CGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 + + +CGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP2x1_32K + addi BO,BO,-16 + addi AO,AO,-8 + LOAD2x1O 8,16 + END2x1_WITHOUT_ADD + LOAD2x1_2O 16, 32 + mtctr T8 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-32 + addi AO,AO,-16 + LOAD2x1_2O 16,32 + bl CGEMM_L2x1_K32 + b CGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L2x1_SUB2_8 + bl CGEMM_2x1_L16_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi. 
T1,L, 8 + ble CGEMM_L2x1_SUB2_4 + bl CGEMM_2x1_L8_SUB + MY_ALIGN + + +CGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 16,32, 0,0 + KERNEL2x1_E2 16,32, 1,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 16,32, 0,1 + MY_ALIGN + + +CGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L2x1_SAVE + KERNEL2x1 + + MY_ALIGN +CGEMM_L2x1_SAVE: +/*----------------------------------------*/ + + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 +#endif + + +CGEMM_L2x1_END: +/*----------------------------------------*/ + slwi T1, K, 4 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + +CGEMM_L2_END: + + +b CGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +CGEMM_L1x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x8_2 + MY_ALIGN +CGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 +CGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + 
KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_L2 128,16,31,0 + KERNEL1x8_L2 128,16,32,0 + KERNEL1x8_L2 128,16,33,0 + KERNEL1x8_L2 128,16,34,0 + KERNEL1x8_L2 128,16,35,0 + KERNEL1x8_L2 128,16,36,0 + KERNEL1x8_L2 128,16,37,0 + KERNEL1x8_L2 128,16,38,0 + KERNEL1x8_L2 128,16,39,0 + KERNEL1x8_L2 128,16,40,0 + KERNEL1x8_L2 128,16,41,0 + KERNEL1x8_L2 128,16,42,0 + KERNEL1x8_L2 128,16,43,0 + KERNEL1x8_L2 128,16,44,0 + KERNEL1x8_L2 128,16,45,0 + KERNEL1x8_L2 128,16,46,0 + KERNEL1x8_L2 128,16,47,0 + KERNEL1x8_L2 128,16,48,0 + KERNEL1x8_L2 128,16,49,0 + KERNEL1x8_L2 128,16,50,0 + KERNEL1x8_L2 128,16,51,0 + KERNEL1x8_L2 128,16,52,0 + KERNEL1x8_L2 128,16,53,0 + KERNEL1x8_L2 128,16,54,0 + KERNEL1x8_L2 128,16,55,0 + KERNEL1x8_L2 128,16,56,0 + KERNEL1x8_L2 128,16,57,0 + KERNEL1x8_L2 128,16,58,0 + KERNEL1x8_L2 128,16,59,0 + KERNEL1x8_L2 128,16,60,0 + KERNEL1x8_L2 128,16,61,0 + KERNEL1x8_L2 128,16,62,0 + KERNEL1x8_L2 128,16,63,1 + bdnz CGEMM_L1x8_LOOP + MY_ALIGN +CGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + END1x8_2 + blr + MY_ALIGN + + +CGEMM_1x8_L64_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_L2 128,16,15,0 + KERNEL1x8_L2 128,16,16,0 + KERNEL1x8_L2 128,16,17,0 + KERNEL1x8_L2 128,16,18,0 + KERNEL1x8_L2 128,16,19,0 + KERNEL1x8_L2 128,16,20,0 + KERNEL1x8_L2 128,16,21,0 + KERNEL1x8_L2 
128,16,22,0 + KERNEL1x8_L2 128,16,23,0 + KERNEL1x8_L2 128,16,24,0 + KERNEL1x8_L2 128,16,25,0 + KERNEL1x8_L2 128,16,26,0 + KERNEL1x8_L2 128,16,27,0 + KERNEL1x8_L2 128,16,28,0 + KERNEL1x8_L2 128,16,29,0 + KERNEL1x8_L2 128,16,30,0 + KERNEL1x8_E2 128,16,31,1 + blr + MY_ALIGN + + +CGEMM_1x8_L32_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_L2 128,16,7,0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_L2 128,16,8,0 + KERNEL1x8_L2 128,16,9,0 + KERNEL1x8_L2 128,16,10,0 + KERNEL1x8_L2 128,16,11,0 + dcbt BO, T4 + KERNEL1x8_L2 128,16,12,0 + KERNEL1x8_L2 128,16,13,0 + KERNEL1x8_L2 128,16,14,0 + KERNEL1x8_E2 128,16,15,1 + blr + MY_ALIGN + + +CGEMM_1x8_L16_SUB: +/*----------------------------------------*/ + LOAD1x8_2 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_L2 128,16,0,0 + KERNEL1x8_L2 128,16,1,0 + dcbt AO, T2 + KERNEL1x8_L2 128,16,2,0 + KERNEL1x8_L2 128,16,3,0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_L2 128,16,4,0 + KERNEL1x8_L2 128,16,5,0 + dcbt AO, T4 + KERNEL1x8_L2 128,16,6,0 + KERNEL1x8_E2 128,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x4_2 + MY_ALIGN +CGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,0,0 +CGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_L2 64,16,7,0 + KERNEL1x4_L2 64,16,8,0 + KERNEL1x4_L2 64,16,9,0 + KERNEL1x4_L2 64,16,10,0 + KERNEL1x4_L2 64,16,11,0 + KERNEL1x4_L2 64,16,12,0 + KERNEL1x4_L2 64,16,13,0 + KERNEL1x4_L2 64,16,14,0 + KERNEL1x4_L2 64,16,15,1 + bdnz CGEMM_L1x4_LOOP + MY_ALIGN 
+CGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + END1x4_2 + blr + MY_ALIGN + + +CGEMM_1x4_L16_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_L2 64,16,3,0 + KERNEL1x4_L2 64,16,4,0 + KERNEL1x4_L2 64,16,5,0 + KERNEL1x4_L2 64,16,6,0 + KERNEL1x4_E2 64,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x4_L8_SUB: +/*----------------------------------------*/ + LOAD1x4_2 + KERNEL1x4_L2 64,16,0,0 + KERNEL1x4_L2 64,16,1,0 + KERNEL1x4_L2 64,16,2,0 + KERNEL1x4_E2 64,16,3,1 + blr + + +CGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x2_2 + MY_ALIGN +CGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,0,0 +CGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_L2 32,16,7,0 + KERNEL1x2_L2 32,16,8,0 + KERNEL1x2_L2 32,16,9,0 + KERNEL1x2_L2 32,16,10,0 + KERNEL1x2_L2 32,16,11,0 + KERNEL1x2_L2 32,16,12,0 + KERNEL1x2_L2 32,16,13,0 + KERNEL1x2_L2 32,16,14,0 + KERNEL1x2_L2 32,16,15,1 + bdnz CGEMM_L1x2_LOOP + MY_ALIGN + + +CGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + END1x2_2 + blr + MY_ALIGN +CGEMM_1x2_L16_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_L2 32,16,3,0 + KERNEL1x2_L2 32,16,4,0 + KERNEL1x2_L2 32,16,5,0 + KERNEL1x2_L2 32,16,6,0 + KERNEL1x2_E2 32,16,7,1 + blr + MY_ALIGN +CGEMM_1x2_L8_SUB: +/*----------------------------------------*/ + LOAD1x2_2 + KERNEL1x2_L2 32,16,0,0 + KERNEL1x2_L2 32,16,1,0 + KERNEL1x2_L2 32,16,2,0 + KERNEL1x2_E2 32,16,3,1 + blr + + +CGEMM_1x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD1x1_2 + MY_ALIGN +CGEMM_L1x1_LOOP: 
+/*----------------------------------------*/ + KERNEL1x1_L2 16,16,0,0 +CGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_L2 16,16,7,0 + KERNEL1x1_L2 16,16,8,0 + KERNEL1x1_L2 16,16,9,0 + KERNEL1x1_L2 16,16,10,0 + KERNEL1x1_L2 16,16,11,0 + KERNEL1x1_L2 16,16,12,0 + KERNEL1x1_L2 16,16,13,0 + KERNEL1x1_L2 16,16,14,0 + KERNEL1x1_L2 16,16,15,1 + bdnz CGEMM_L1x1_LOOP + MY_ALIGN +CGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + + MY_ALIGN +CGEMM_1x1_L16_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_L2 16,16,3,0 + KERNEL1x1_L2 16,16,4,0 + KERNEL1x1_L2 16,16,5,0 + KERNEL1x1_L2 16,16,6,0 + KERNEL1x1_E2 16,16,7,1 + blr + MY_ALIGN + + +CGEMM_1x1_L8_SUB: +/*----------------------------------------*/ + LOAD1x1_2 + KERNEL1x1_L2 16,16,0,0 + KERNEL1x1_L2 16,16,1,0 + KERNEL1x1_L2 16,16,2,0 + KERNEL1x1_E2 16,16,3,1 + blr + + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +CGEMM_L1: +/*----------------------------------------*/ + + andi. J, N, 1 /* J = N & 1; the branch below skips this whole section when that bit is clear */ + ble CGEMM_L1_END + +CGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + add T2,C,LDC /* T2 = C plus LDC; only used as a dcbt address below before T2 is reloaded */ + mr AO, A + add C, C, T1 /* NOTE(review): unlike CGEMM_L2_BEGIN, no slwi T1, LDC, n precedes this add, so T1 holds whatever an earlier block left in it; presumably harmless if C is not read again after this section - TODO confirm */ +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 /* I = M >> 3; no 8-wide tiles when the result is <= 0 */ + ble CGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +CGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi.
T8, T1, 7 /**(T1-2) % 128x */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /**(K-2) % 128x */ +#endif + ZERO1x8 + ble CGEMM_L1x8_SUB0 + bl CGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 + + +CGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6,129 +#else + andi. L, K, 255 + cmpwi K,129 +#endif + li T8,1 + bne CMP1x8_128K + addi BO,BO,-8 + addi AO,AO,-64 + LOAD1x8O 64,8 + END1x8_WITHOUT_ADD + LOAD1x8_2O 128, 16 + mtctr T8 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,128 +#else + cmpwi K,128 +#endif + bne CGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-128 + LOAD1x8_2O 128,16 + bl CGEMM_L1x8_K128 + b CGEMM_L1x8_SAVE + MY_ALIGN + + +CGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble CGEMM_L1x8_SUB2_32 + bl CGEMM_1x8_L64_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble CGEMM_L1x8_SUB2_16 + bl CGEMM_1x8_L32_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x8_SUB2_8 + bl CGEMM_1x8_L16_SUB + MY_ALIGN + + +CGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x8_SUB2_4 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_L2 128,16, 1,0 + KERNEL1x8_L2 128,16, 2,0 + KERNEL1x8_E2 128,16, 3,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x8_SUB2_2 + LOAD1x8_2 + KERNEL1x8_L2 128,16, 0,0 + KERNEL1x8_E2 128,16, 1,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. 
T1,L, 2 + ble CGEMM_L1x8_SUB2_1 + LOAD1x8_2 + KERNEL1x8_E2 128,16, 0,1 + MY_ALIGN + + +CGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x8_SAVE + KERNEL1x8 + + MY_ALIGN +CGEMM_L1x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 + MY_ALIGN + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 +#endif + bgt CGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END + b CGEMM_L1x4_BEGIN + MY_ALIGN + + +CGEMM_L1x8_END: +/*----------------------------------------*/ + + +CGEMM_L1x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 + ble CGEMM_L1x1_END + andi. T1, M, 4 + ble CGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = (T1-2) >> 5, i.e. (T1-2)/32 rounded down */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = (K-2) >> 5, i.e. (K-2)/32 rounded down */ +#endif + ZERO1x4 + ble CGEMM_L1x4_SUB0 + bl CGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 + + +CGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x4_32K + addi BO,BO,-8 + addi AO,AO,-32 + LOAD1x4O 32,8 + END1x4_WITHOUT_ADD + LOAD1x4_2O 64, 16 + mtctr T8 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-64 + LOAD1x4_2O 64,16 + bl CGEMM_L1x4_K32 + b CGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x4_SUB2_8 + bl CGEMM_1x4_L16_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi.
T1,L, 8 + ble CGEMM_L1x4_SUB2_4 + bl CGEMM_1x4_L8_SUB + MY_ALIGN + + +CGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x4_SUB2_2 + LOAD1x4_2 + KERNEL1x4_L2 64,16, 0,0 + KERNEL1x4_E2 64,16, 1,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x4_SUB2_1 + LOAD1x4_2 + KERNEL1x4_E2 64,16, 0,1 + MY_ALIGN + + +CGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x4_SAVE + KERNEL1x4 + + +CGEMM_L1x4_SAVE: +/*----------------------------------------*/ + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 +#endif + + +CGEMM_L1x4_END: +/*----------------------------------------*/ + + +CGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble CGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = (T1-2) >> 5, i.e. (T1-2)/32 rounded down */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = (K-2) >> 5, i.e. (K-2)/32 rounded down */ +#endif + ZERO1x2 + ble CGEMM_L1x2_SUB0 + bl CGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 + + +CGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi. L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x2_32K + addi BO,BO,-8 + addi AO,AO,-16 + LOAD1x2O 16,8 + END1x2_WITHOUT_ADD + LOAD1x2_2O 32, 16 + mtctr T8 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-32 + LOAD1x2_2O 32,16 + bl CGEMM_L1x2_K32 + b CGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi.
T1,L, 16 + ble CGEMM_L1x2_SUB2_8 + bl CGEMM_1x2_L16_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x2_SUB2_4 + bl CGEMM_1x2_L8_SUB + MY_ALIGN + + +CGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x2_SUB2_2 + LOAD1x2_2 + KERNEL1x2_L2 32,16, 0,0 + KERNEL1x2_E2 32,16, 1,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x2_SUB2_1 + LOAD1x2_2 + KERNEL1x2_E2 32,16, 0,1 + MY_ALIGN + + +CGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x2_SAVE + KERNEL1x2 + + MY_ALIGN +CGEMM_L1x2_SAVE: +/*----------------------------------------*/ + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 +#endif + + +CGEMM_L1x2_END: +/*----------------------------------------*/ + + +CGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble CGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = (T1-2) >> 5, i.e. (T1-2)/32 rounded down */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = (K-2) >> 5, i.e. (K-2)/32 rounded down */ +#endif + ZERO1x1 + ble CGEMM_L1x1_SUB0 + bl CGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 + + +CGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6,33 +#else + andi.
L, K, 63 + cmpwi K,33 +#endif + li T8,1 + bne CMP1x1_32K + addi BO,BO,-8 + addi AO,AO,-8 + LOAD1x1O 8,8 + END1x1_WITHOUT_ADD + LOAD1x1_2O 16, 16 + mtctr T8 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6,32 +#else + cmpwi K,32 +#endif + bne CGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO,BO,-16 + addi AO,AO,-16 + LOAD1x1_2O 16,16 + bl CGEMM_L1x1_K32 + b CGEMM_L1x1_SAVE + MY_ALIGN + MY_ALIGN + + +CGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble CGEMM_L1x1_SUB2_8 + bl CGEMM_1x1_L16_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble CGEMM_L1x1_SUB2_4 + bl CGEMM_1x1_L8_SUB + MY_ALIGN + + +CGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble CGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 16,16, 0,0 + KERNEL1x1_E2 16,16, 1,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble CGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 16,16, 0,1 + MY_ALIGN + + +CGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble CGEMM_L1x1_SAVE + KERNEL1x1 + + MY_ALIGN +CGEMM_L1x1_SAVE: +/*----------------------------------------*/ + + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 +#endif + + +CGEMM_L1x1_END: +/*----------------------------------------*/ + slwi T1, K, 3 + + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + +CGEMM_L1_END: + + + + diff --git a/kernel/power/cgemm_macros_power10.S b/kernel/power/cgemm_macros_power10.S new file mode 100644 index 0000000000..b66e934050 --- /dev/null +++ b/kernel/power/cgemm_macros_power10.S @@ -0,0 +1,2131 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 8 +#define DISP32(ind, disp) (ind*unit_size*32+disp) +#define DISP16(ind, disp) (ind*unit_size*16+disp) +#define DISP8(ind, disp) (ind*unit_size*8+disp) +#define DISP4(ind, disp) (ind*unit_size*4+disp) +#define DISP2(ind, disp) (ind*unit_size*2+disp) +#define DISP1(ind, disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) + +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1, VSINR, VSINI_OUT2, VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1 + /*we will negate alpha image instead to fix sign*/ + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#endif +.endm + +.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1, VSINR, VSINI_OUT2, VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI, \VSINI_OUT2 +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvaddsp \VSINR_OUT1, \VSINR_OUT1, \VSINR + xvsubsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#else // CC || CR || RC || RR + /*we will assume {-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubsp \VSINR_OUT1, \VSINR, \VSINR_OUT1 + /*we will negate alpha 
image instead to fix sign*/ + xvaddsp \VSINI_OUT2, \VSINI_OUT2, \VSINI +#endif +.endm + +/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */ + +.macro MULT_APLHA_PART1 VSINRR, VSINII, VSOUT1, VSOUT2 + xvmulsp \VSOUT1, \VSINII, alpha_i + xvmulsp \VSOUT2, \VSINRR, alpha_i +.endm + +/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + +.macro MULT_APLHA_PART2 VSINRR, VSINII, VSOUT1, VSOUT2 + xvmsubasp \VSOUT1, \VSINRR, alpha_r + xvmaddasp \VSOUT2, \VSINII, alpha_r +.endm + +.macro PERMUTE1 OUT, R1, R2, R3, R4 + xxsel vs62, \R1, \R2, vs57 + xxsel \OUT, \R3, \R4, vs57 + xxpermdi \OUT, \OUT, vs62, 1 +.endm +.macro PERMUTE2 OUT, R1, R2, R3, R4 + xxsel vs62, \R2, \R1, vs57 + xxsel \OUT, \R4, \R3, vs57 + xxpermdi \OUT, vs62, \OUT, 1 + xxperm \OUT, \OUT, permute_mask +.endm +.macro PERMUTE3 OUT, R1, R2, R3, R4 + xxsel vs62, \R1, \R2, vs57 + xxsel \OUT, \R3, \R4, vs57 + xxpermdi \OUT, vs62, \OUT, 2 +.endm +.macro PERMUTE4 OUT, R1, R2, R3, R4 + xxsel vs62, \R2, \R1, vs57 + xxsel \OUT, \R4, \R3, vs57 + xxpermdi \OUT, \OUT, vs62, 2 + xxperm \OUT, \OUT, permute_mask +.endm +.macro GROUP1 + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + xxperm vs8, vs36, permute_mask + xxperm vs12, vs44, permute_mask + xxperm vs9, vs37, permute_mask + xxperm vs13, vs45, permute_mask +.endm +.macro AGG_GROUP1 + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + AGGREGATE_REALS_IMAGES vs36, vs8, vs44, vs12 + AGGREGATE_REALS_IMAGES vs37, vs9, vs45, vs13 +.endm +.macro GROUP2 + xxperm vs0, vs34, permute_mask + xxperm vs4, vs42, permute_mask + xxperm vs1, vs35, permute_mask + xxperm vs5, vs43, permute_mask + xxperm vs8, vs38, permute_mask + xxperm vs12, vs46, permute_mask + xxperm vs9, vs39, permute_mask + xxperm vs13, vs47, permute_mask +.endm +.macro AGG_GROUP2 + AGGREGATE_REALS_IMAGES vs34, vs0, vs42, vs4 + 
AGGREGATE_REALS_IMAGES vs35, vs1, vs43, vs5 + AGGREGATE_REALS_IMAGES vs38, vs8, vs46, vs12 + AGGREGATE_REALS_IMAGES vs39, vs9, vs47, vs13 +.endm +.macro MULTIPLY_GROUP1 + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART1 vs36, vs44, vs8, vs9 + MULT_APLHA_PART1 vs37, vs45, vs10, vs11 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs37, vs45, vs10, vs11 +.endm +.macro MULTIPLY_GROUP2 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART1 vs38, vs46, vs12, vs13 + MULT_APLHA_PART1 vs39, vs47, vs14, vs15 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs38, vs46, vs12, vs13 + MULT_APLHA_PART2 vs39, vs47, vs14, vs15 +.endm +/* reconstruct r, i pairs*/ +.macro RECONSTRUCT_PAIR1 + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 + xxperm vs8, vs9, save_permute_1 + xxperm vs10, vs11, save_permute_1 +.endm +.macro RECONSTRUCT_PAIR2 + xxperm vs4, vs5, save_permute_1 + xxperm vs6, vs7, save_permute_1 + xxperm vs12, vs13, save_permute_1 + xxperm vs14, vs15, save_permute_1 +.endm +.macro SHUFFLE_ACC ACC, R0, R1, R2, R3, O1, O2, O3, O4 + xxmfacc \ACC + PERMUTE1 \O1, \R3, \R2, \R1, \R0 + PERMUTE2 \O2, \R1, \R0, \R3, \R2 + PERMUTE3 \O3, \R1, \R0, \R3, \R2 + PERMUTE4 \O4, \R3, \R2, \R1, \R0 +.endm +/* macros for N=4 and M=8 +**********************************************************************************************/ +.macro ZERO4x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 + xxsetaccz 4 + xxsetaccz 5 + xxsetaccz 6 + xxsetaccz 7 +.endm + +.macro LOAD4x8 + LOAD4x8O 0, 0 +.endm + +.macro LOAD4x8O OffsetA, OffsetB + lxvp vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END4x8_NORMAL + END4x8 AO, BO, 64, 32 +.endm + +.macro END4x8_WITHOUT_ADD + END4x8 AO, BO, 0, 0 +.endm + 
+.macro END4x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 3, 36, 35 + xvf32gerpp 2, 37, 35 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 + xvf32gerpp 7, 36, 34 + xvf32gerpp 6, 37, 34 + xvf32gerpp 5, 32, 34 + xvf32gerpp 4, 33, 34 +.endm +/* LOAD4x8_2 / LOAD4x8_2O: lxvp loads at 0 and 32 from BO+OffsetB into vs34, vs38 and at 0, 32, 64 and 96 from AO+OffsetA into vs32, vs36, vs40, vs42 */ +.macro LOAD4x8_2 + LOAD4x8_2O 0, 0 +.endm + +.macro LOAD4x8_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs38, (32+\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + lxvp vs40, (64+\OffsetA)(AO) + lxvp vs42, (64+32+\OffsetA)(AO) +.endm + +.macro END4x8_2 + /*for load2 offset will be 128 and 64*/ + KERNEL4x8_2 AO, BO, 128, 64, 0, 1, 1 +.endm +/* KERNEL4x8_E2: KERNEL4x8_2 on AO/BO with Complete=1 (no operand reloads) */ +.macro KERNEL4x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm +/* KERNEL4x8_L2: KERNEL4x8_2 on AO/BO with Complete=0 (operands reloaded after each group) */ +.macro KERNEL4x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm +/* KERNEL4x8_2: two groups of eight xvf32gerpp ops on accumulators 0-7. Complete==0: each group is followed by lxvp reloads of the registers it just used (vs32-vs37, then vs38-vs43). IsLast==1: advance BREG/AREG, by DISP8(Index,OffsetB)/DISP16(Index,OffsetA) when Complete==1, otherwise by DISP8(Index,64)/DISP16(Index,128). */ +.macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 3, 36, 35 + xvf32gerpp 2, 37, 35 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 + xvf32gerpp 7, 36, 34 + xvf32gerpp 6, 37, 34 + xvf32gerpp 5, 32, 34 + xvf32gerpp 4, 33, 34 +.if \Complete==0 + lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 3, 42, 39 + xvf32gerpp 2, 43, 39 + xvf32gerpp 1, 40, 39 + xvf32gerpp 0, 41, 39 + xvf32gerpp 7, 42, 38 + xvf32gerpp 6, 43, 38 + xvf32gerpp 5, 40, 38 + xvf32gerpp 4, 41, 38 +.if \Complete==0 + lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG) + lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) + lxvp vs42, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index, 64) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif +.endm +
+.macro KERNEL4x8 + LOAD4x8 + END4x8 AO, BO, 64, 32 +.endm + +.macro SAVE4x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + SHUFFLE_ACC 4, vs16, vs17, vs18, vs19, vs48, vs56, vs52, vs60 + SHUFFLE_ACC 5, vs20, vs21, vs22, vs23, vs49, vs16, vs53, vs61 + SHUFFLE_ACC 7, vs28, vs29, vs30, vs31, vs17, vs19, vs18, vs20 + SHUFFLE_ACC 6, vs24, vs25, vs26, vs27, vs50, vs58, vs54, vs21 + add T4, LDC, LDC + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs28, 0(T1) +#endif + xxperm vs2, vs34, permute_mask + xxperm vs6, vs42, permute_mask +#ifndef TRMMKERNEL + lxvp vs30, 32(T1) +#endif + xxperm vs3, vs35, permute_mask + xxperm vs7, vs43, permute_mask + add T2, CO, T4 + add T3, T1, T4 + GROUP1 + AGG_GROUP1 + AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6 + xxperm vs10, vs38, permute_mask + xxperm vs14, vs46, permute_mask + AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7 + xxperm vs11, vs39, permute_mask + xxperm vs15, vs47, permute_mask + xxperm vs0, vs48, permute_mask + xxperm vs4, vs56, permute_mask + xxperm vs1, vs49, permute_mask + xxperm vs5, vs16, permute_mask + AGGREGATE_REALS_IMAGES vs38, vs10, vs46, vs14 + xxperm vs2, vs50, permute_mask + xxperm vs6, vs58, permute_mask + AGGREGATE_REALS_IMAGES vs39, vs11, vs47, vs15 + xxperm vs3, vs17, permute_mask + xxperm vs7, vs19, permute_mask + AGGREGATE_REALS_IMAGES vs48, vs0, vs56, vs4 + xxperm vs8, vs52, permute_mask + xxperm vs12, vs60, permute_mask + AGGREGATE_REALS_IMAGES vs49, vs1, vs16, vs5 + xxperm vs9, vs53, permute_mask + xxperm vs13, vs61, permute_mask + AGGREGATE_REALS_IMAGES vs50, vs2, vs58, vs6 + xxperm vs10, vs54, permute_mask + xxperm vs14, vs21, permute_mask + AGGREGATE_REALS_IMAGES vs17, vs3, vs19, vs7 + xxperm vs11, vs18, 
permute_mask + xxperm vs15, vs20, permute_mask + AGGREGATE_REALS_IMAGES vs52, vs8, vs60, vs12 + AGGREGATE_REALS_IMAGES vs53, vs9, vs61, vs13 +/*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + AGGREGATE_REALS_IMAGES vs54, vs10, vs21, vs14 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + AGGREGATE_REALS_IMAGES vs18, vs11, vs20, vs15 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 +#ifndef TRMMKERNEL + lxvp vs32, 0(T2) +#endif + MULT_APLHA_PART1 vs36, vs44, vs8, vs9 + MULT_APLHA_PART1 vs37, vs45, vs10, vs11 +#ifndef TRMMKERNEL + lxvp vs40, 32(T2) +#endif + MULT_APLHA_PART1 vs38, vs46, vs12, vs13 + MULT_APLHA_PART1 vs39, vs47, vs14, vs15 +#ifndef TRMMKERNEL + lxvp vs34, 0(T3) +#endif + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs37, vs45, vs10, vs11 +#ifndef TRMMKERNEL + lxvp vs42, 32(T3) +#endif + MULT_APLHA_PART2 vs38, vs46, vs12, vs13 + MULT_APLHA_PART2 vs39, vs47, vs14, vs15 + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs26, vs26, vs7 + xvaddsp vs27, vs27, vs5 + xvaddsp vs28, vs28, vs11 + xvaddsp vs29, vs29, vs9 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs12, vs4, 2 + xxpermdi vs26, vs14, vs6, 2 + xxpermdi vs29, vs0, vs8, 2 + xxpermdi vs28, vs2, vs10, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + MULT_APLHA_PART1 vs48, vs56, vs0, vs1 + MULT_APLHA_PART1 vs49, vs16, vs2, vs3 + stxvp vs26, 
32(CO) + MULT_APLHA_PART1 vs50, vs58, vs4, vs5 + MULT_APLHA_PART1 vs17, vs19, vs6, vs7 + stxvp vs28, 0(T1) + MULT_APLHA_PART2 vs48, vs56, vs0, vs1 + MULT_APLHA_PART2 vs49, vs16, vs2, vs3 + stxvp vs30, 32(T1) + MULT_APLHA_PART2 vs50, vs58, vs4, vs5 + MULT_APLHA_PART2 vs17, vs19, vs6, vs7 + MULT_APLHA_PART1 vs52, vs60, vs8, vs9 + MULT_APLHA_PART1 vs53, vs61, vs10, vs11 + MULT_APLHA_PART1 vs54, vs21, vs12, vs13 + MULT_APLHA_PART1 vs18, vs20, vs14, vs15 + MULT_APLHA_PART2 vs52, vs60, vs8, vs9 + MULT_APLHA_PART2 vs53, vs61, vs10, vs11 + MULT_APLHA_PART2 vs54, vs21, vs12, vs13 + MULT_APLHA_PART2 vs18, vs20, vs14, vs15 + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs32, vs32, vs3 + xvaddsp vs33, vs33, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs40, vs40, vs7 + xvaddsp vs41, vs41, vs5 + xvaddsp vs34, vs34, vs11 + xvaddsp vs35, vs35, vs9 + xvaddsp vs42, vs42, vs15 + xvaddsp vs43, vs43, vs13 +#else + xxpermdi vs33, vs8, vs0, 2 + xxpermdi vs32, vs10, vs2, 2 + xxpermdi vs41, vs12, vs4, 2 + xxpermdi vs40, vs14, vs6, 2 + xxpermdi vs35, vs0, vs8, 2 + xxpermdi vs34, vs2, vs10, 2 + xxpermdi vs43, vs4, vs12, 2 + xxpermdi vs42, vs6, vs14, 2 +#endif + stxvp vs32, 0(T2) + stxvp vs40, 32(T2) + stxvp vs34, 0(T3) + stxvp vs42, 32(T3) + addi CO, CO, 64 +.endm + +/* macros for N=4 and M=4 +**********************************************************************************************/ + +.macro ZERO4x4 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD4x4 + LOAD4x4O 0, 0 +.endm + +.macro LOAD4x4O OffsetA, OffsetB + lxvp vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END4x4_NORMAL + END4x4 AO, BO, 32, 32 +.endm + +.macro END4x4_WITHOUT_ADD + END4x4 AO, BO, 0, 0 +.endm + +.macro END4x4 AREG, BREG, OffsetA, 
OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 3, 32, 34 + xvf32gerpp 2, 33, 34 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 +.endm + +.macro LOAD4x4_2 + LOAD4x4_2O 0, 0 +.endm + +.macro LOAD4x4_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs38, (32+\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END4x4_2 + /*for load2 offset will be 64 and 64*/ + KERNEL4x4_2 AO, BO, 64, 64, 0, 1, 1 +.endm + +.macro KERNEL4x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 3, 32, 34 + xvf32gerpp 2, 33, 34 + xvf32gerpp 1, 32, 35 + xvf32gerpp 0, 33, 35 +.if \Complete==0 + lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 3, 36, 38 + xvf32gerpp 2, 37, 38 + xvf32gerpp 1, 36, 39 + xvf32gerpp 0, 37, 39 +.if \Complete==0 + lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP8(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP8(\Index, 64) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x4 + LOAD4x4 + END4x4 AO, BO, 32, 32 +.endm + +.macro SAVE4x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + add T4, LDC, LDC + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxvp vs26, 0(T1) +#endif 
+ #ifndef TRMMKERNEL + lxvp vs28, 0(T2) +#endif +#ifndef TRMMKERNEL + lxvp vs30, 0(T3) +#endif + GROUP1 + AGG_GROUP1 + GROUP2 + AGG_GROUP2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 + MULTIPLY_GROUP2 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xvaddsp vs26, vs26, vs11 + xvaddsp vs27, vs27, vs9 + xvaddsp vs28, vs28, vs7 + xvaddsp vs29, vs29, vs5 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs0, vs8, 2 + xxpermdi vs26, vs2, vs10, 2 + xxpermdi vs29, vs12, vs4, 2 + xxpermdi vs28, vs14, vs6, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 0(T1) + stxvp vs28, 0(T2) + stxvp vs30, 0(T3) + addi CO, CO, 32 +.endm + +/* macros for N=4 and M=2 +**********************************************************************************************/ + +.macro ZERO4x2 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD4x2 + LOAD4x2O 0, 0 +.endm + +.macro LOAD4x2O OffsetA, OffsetB + lxv vs32, (\OffsetA+0)(AO) + lxvp vs34, (\OffsetB+0)(BO) +.endm + +.macro END4x2_NORMAL + END4x2 AO, BO, 16, 32 +.endm + +.macro END4x2_WITHOUT_ADD + END4x2 AO, BO, 0, 0 +.endm + +.macro END4x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 1, 34, 32 + xvf32gerpp 0, 35, 32 +.endm + +.macro LOAD4x2_2 + LOAD4x2_2O 0, 0 +.endm + +.macro LOAD4x2_2O OffsetA, OffsetB + lxvp vs32, (\OffsetA)(AO) + lxvp vs34, (0+\OffsetB)(BO) + lxvp vs36, (32+\OffsetB)(BO) +.endm + +.macro END4x2_2 + /*for load2 offset will be 32 and 
64*/ + KERNEL4x2_2 AO, BO, 32, 64, 0, 1, 1 +.endm + +.macro KERNEL4x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 1, 34, 33 + xvf32gerpp 0, 35, 33 +.if \Complete==0 + lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) +.endif + xvf32gerpp 1, 36, 32 + xvf32gerpp 0, 37, 32 +.if \Complete==0 + lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) + lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index, \OffsetA) + addi \BREG, \BREG, DISP8(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index, 32) + addi \BREG, \BREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x2 + LOAD4x2 + END4x2 AO, BO, 16, 32 +.endm + +.macro SAVE4x2 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + add T4, LDC, LDC + add T1, CO, LDC + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs25, 0(T1) +#endif +#ifndef TRMMKERNEL + lxv vs26, 0(T2) +#endif +#ifndef TRMMKERNEL + lxv vs27, 0(T3) +#endif + GROUP1 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs37, vs9, vs45, vs13 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 0 + xxpermdi vs9, vs10, vs2, 0 + xxpermdi vs3, vs0, vs8, 3 + xxpermdi vs11, vs2, vs10, 3 + xvaddsp vs24, vs24, vs1 + xvaddsp vs26, vs26, vs9 + xvaddsp vs25, vs25, vs3 + xvaddsp vs27, vs27, vs11 +#else + xxpermdi vs24, vs8, vs0, 0 + xxpermdi vs26, vs10, vs2, 0 + 
xxpermdi vs25, vs0, vs8, 3 + xxpermdi vs27, vs2, vs10, 3 +#endif + stxv vs24, 0(CO) + stxv vs25, 0(T1) + stxv vs26, 0(T2) + stxv vs27, 0(T3) + addi CO, CO, 16 +.endm + +/* macros for N=4 and M=1 +**********************************************************************************************/ + +.macro ZERO4x1 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD4x1 + LOAD4x1O 0, 0 +.endm + +.macro LOAD4x1O OffsetA, OffsetB + lxsd v0, (\OffsetA+0)(AO) + lxvp vs34, (\OffsetB+0)(BO) +.endm + +.macro END4x1_NORMAL + END4x1 AO, BO,8, 32 +.endm + +.macro END4x1_WITHOUT_ADD + END4x1 AO, BO, 0, 0 +.endm + +.macro END4x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 35, 32 + xvf32gerpp 1, 34, 32 +.endm + +.macro LOAD4x1_2 + LOAD4x1_2O 0, 0 +.endm + +.macro LOAD4x1_2O OffsetA, OffsetB + lxv vs32, (\OffsetA)(AO) + vspltisb v6, 0 + xxpermdi vs33, vs32, vs38, 0 + xxpermdi vs32, vs32, vs38, 2 + lxvp vs34, (0+\OffsetB)(BO) + lxvp vs36, (32+\OffsetB)(BO) +.endm + +.macro END4x1_2 + /*for load2 offset will be 16 and 64*/ + KERNEL4x1_2 AO, BO, 16, 64, 0, 1, 1 +.endm + +.macro KERNEL4x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL4x1_L2 OffsetA, OffsetB, Index, IsLast + KERNEL4x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 35, 32 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) +.endif + xvf32gerpp 0, 37, 33 + xvf32gerpp 1, 36, 33 +.if \Complete==0 + lxv vs32, DISP2(\Index, \OffsetA)(\AREG) + lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) + xxpermdi vs33, vs32, vs38, 0 + xxpermdi vs32, vs32, vs38, 2 +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index, \OffsetA) + addi \BREG, \BREG, DISP8(\Index, \OffsetB) +.else + addi \AREG, \AREG,
DISP2(\Index, 16) + addi \BREG, \BREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL4x1 + LOAD4x1 + END4x1 AO, BO, 8, 32 +.endm + +.macro SAVE4x1 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + xxpermdi vs32, vs32, vs36, 1 + xxpermdi vs40, vs40, vs44, 1 + xxpermdi vs33, vs33, vs37, 1 + xxpermdi vs41, vs41, vs45, 1 + add T4, LDC, LDC + add T1, CO, LDC + add T2, CO, T4 + add T3, T1, T4 +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5, 0(T1) +#endif +#ifndef TRMMKERNEL + lxsd v6, 0(T2) +#endif +#ifndef TRMMKERNEL + lxsd v7, 0(T3) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 + xxperm vs2, vs3, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1, vs0, 0 + xxspltd vs3, vs0, 1 + xxspltd vs9, vs2, 0 + xxspltd vs11, vs2, 1 + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xvaddsp vs36, vs36, vs1 + xvaddsp vs37, vs37, vs3 + xvaddsp vs38, vs38, vs9 + xvaddsp vs39, vs39, vs11 +#else + /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ + xxspltd vs36, vs0, 0 + xxspltd vs37, vs0, 1 + xxspltd vs38, vs2, 0 + xxspltd vs39, vs2, 1 +#endif + stxsd v4, 0(CO) + stxsd v5, 0(T1) + stxsd v6, 0(T2) + stxsd v7, 0(T3) + addi CO, CO, 8 +.endm + +/* macros for N=2 and M=8 +**********************************************************************************************/ + +.macro ZERO2x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD2x8 + LOAD2x8O 0, 0 +.endm + +.macro LOAD2x8O OffsetA, OffsetB + 
lxv vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END2x8_NORMAL + END2x8 AO, BO, 64, 16 +.endm + +.macro END2x8_WITHOUT_ADD + END2x8 AO, BO, 0, 0 +.endm + +.macro END2x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 2, 37, 34 + xvf32gerpp 3, 36, 34 + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +.endm + +.macro LOAD2x8_2 + LOAD2x8_2O 0, 0 +.endm + +.macro LOAD2x8_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + lxvp vs38, (64+\OffsetA)(AO) + lxvp vs40, (64+32+\OffsetA)(AO) +.endm + +.macro END2x8_2 + /*for load2 offset will be 128 and 32*/ + KERNEL2x8_2 AO, BO, 128, 32, 0, 1, 1 +.endm + +.macro KERNEL2x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 2, 37, 35 + xvf32gerpp 3, 36, 35 + xvf32gerpp 0, 33, 35 + xvf32gerpp 1, 32, 35 + +.if \Complete==0 + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 41, 34 + xvf32gerpp 3, 40, 34 + xvf32gerpp 0, 39, 34 + xvf32gerpp 1, 38, 34 + +.if \Complete==0 + lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) + lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG) + lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index, 32) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif +.endm + +.macro KERNEL2x8 + LOAD2x8 + END2x8 AO, BO, 64, 16 +.endm + +.macro SAVE2x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + 
SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs28, 0(T1) +#endif +#ifndef TRMMKERNEL + lxvp vs30, 32(T1) +#endif + add T2, CO, T4 + add T3, T1, T4 + GROUP1 + AGG_GROUP1 + GROUP2 + AGG_GROUP2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 + MULTIPLY_GROUP2 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 + RECONSTRUCT_PAIR2 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs5, vs12, vs4, 2 + xxpermdi vs7, vs14, vs6, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xxpermdi vs13, vs4, vs12, 2 + xxpermdi vs15, vs6, vs14, 2 + xvaddsp vs26, vs26, vs7 + xvaddsp vs27, vs27, vs5 + xvaddsp vs28, vs28, vs11 + xvaddsp vs29, vs29, vs9 + xvaddsp vs30, vs30, vs15 + xvaddsp vs31, vs31, vs13 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs12, vs4, 2 + xxpermdi vs26, vs14, vs6, 2 + xxpermdi vs29, vs0, vs8, 2 + xxpermdi vs28, vs2, vs10, 2 + xxpermdi vs31, vs4, vs12, 2 + xxpermdi vs30, vs6, vs14, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 32(CO) + stxvp vs28, 0(T1) + stxvp vs30, 32(T1) + addi CO, CO, 64 +.endm + +/* macros for N=2 and M=4 +**********************************************************************************************/ + +.macro ZERO2x4 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD2x4 + LOAD2x4O 0, 0 +.endm + +.macro LOAD2x4O OffsetA, OffsetB + lxv vs34, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END2x4_NORMAL + END2x4 AO, BO, 32, 16 +.endm + +.macro END2x4_WITHOUT_ADD + END2x4 AO, BO, 0, 0 +.endm + +.macro END2x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA 
!= 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 33, 34 + xvf32gerpp 1, 32, 34 +.endm + +.macro LOAD2x4_2 + LOAD2x4_2O 0, 0 +.endm + +.macro LOAD2x4_2O OffsetA, OffsetB + lxvp vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END2x4_2 + /*for load2 offset will be 64 and 32*/ + KERNEL2x4_2 AO, BO, 64, 32, 0, 1, 1 +.endm + +.macro KERNEL2x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 33, 35 + xvf32gerpp 1, 32, 35 +.if \Complete==0 + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 37, 34 + xvf32gerpp 1, 36, 34 +.if \Complete==0 + lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP4(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP4(\Index, 32) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL2x4 + LOAD2x4 + END2x4 AO, BO, 32, 16 +.endm + +.macro SAVE2x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxvp vs26, 0(T1) +#endif + GROUP1 + AGG_GROUP1 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULTIPLY_GROUP1 +/* reconstruct r, i pairs*/ + RECONSTRUCT_PAIR1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 2 + xxpermdi vs3, vs10, vs2, 2 + xxpermdi vs9, vs0, vs8, 2 + xxpermdi vs11, vs2, vs10, 2 + xvaddsp vs24, vs24, vs3 + xvaddsp vs25, vs25, vs1 + xvaddsp vs26, vs26, vs11 + xvaddsp vs27, vs27, vs9 +#else + xxpermdi vs25, vs8, vs0, 2 + xxpermdi vs24, vs10, vs2, 2 + xxpermdi vs27, vs0, vs8, 2 
+ xxpermdi vs26, vs2, vs10, 2 +#endif + stxvp vs24, 0(CO) + stxvp vs26, 0(T1) + addi CO, CO, 32 +.endm + +/* macros for N=2 and M=2 +**********************************************************************************************/ + +.macro ZERO2x2 + xxsetaccz 0 +.endm + +.macro LOAD2x2 + LOAD2x2O 0, 0 +.endm + +.macro LOAD2x2O OffsetA, OffsetB + lxv vs32, (\OffsetA+0)(AO) + lxv vs34, (\OffsetB+0)(BO) +.endm + +.macro END2x2_NORMAL + END2x2 AO, BO, 16, 16 +.endm + +.macro END2x2_WITHOUT_ADD + END2x2 AO, BO, 0, 0 +.endm + +.macro END2x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 32 +.endm + +.macro LOAD2x2_2 + LOAD2x2_2O 0, 0 +.endm + +.macro LOAD2x2_2O OffsetA, OffsetB + lxvp vs32, (\OffsetA)(AO) + lxvp vs34, (0+\OffsetB)(BO) +.endm + +.macro END2x2_2 + /*for load2 offset will be 32 and 32*/ + KERNEL2x2_2 AO, BO, 32, 32, 0, 1, 1 +.endm + +.macro KERNEL2x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL2x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 32 /* first step: acc0 += vs34 x vs32 */ + xvf32gerpp 0, 35, 33 /* second step: acc0 += vs35 x vs33 */ +.if \Complete==0 + lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) /* reload the A pair; skipped when Complete==1 */ + lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) /* reload the B pair: displacement comes from OffsetB, not OffsetA */ +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP4(\Index, \OffsetA) + addi \BREG, \BREG, DISP4(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP4(\Index, 32) + addi \BREG, \BREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL2x2 + LOAD2x2 + END2x2 AO, BO, 16, 16 +.endm + +.macro SAVE2x2 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif +#ifndef TRMMKERNEL + lxv vs26, 0(T1) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask +
xxperm vs8, vs36, permute_mask + xxperm vs12, vs44, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs36, vs8, vs44, vs12 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs36, vs44, vs8, vs9 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs36, vs44, vs8, vs9 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 + xxperm vs8, vs9, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxpermdi vs1, vs8, vs0, 0 + xxpermdi vs9, vs0, vs8, 3 + xvaddsp vs24, vs24, vs1 + xvaddsp vs26, vs26, vs9 +#else + xxpermdi vs24, vs8, vs0, 0 + xxpermdi vs26, vs0, vs8, 3 +#endif + stxv vs24, 0(CO) + stxv vs26, 0(T1) + addi CO, CO, 16 +.endm + +/* macros for N=2 and M=1 +**********************************************************************************************/ + +.macro ZERO2x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD2x1 + LOAD2x1O 0, 0 +.endm + +.macro LOAD2x1O OffsetA, OffsetB + lxsd v4, (\OffsetA+0)(AO) + lxv vs0, (\OffsetB+0)(BO) + xxspltd vs24, vs36, 0 + xxperm vs26, vs24, permute_mask +.endm + +.macro END2x1_NORMAL + END2x1 AO, BO,8, 16 +.endm + +.macro END2x1_WITHOUT_ADD + END2x1 AO, BO, 0, 0 +.endm + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddasp vs32, vs0, vs24 + xvmaddasp vs40, vs0, vs26 +.endm + +.macro LOAD2x1_2 + LOAD2x1_2O 0, 0 +.endm + +.macro LOAD2x1_2O OffsetA, OffsetB + lxv vs27, (\OffsetA)(AO) + lxvp vs4, (0+\OffsetB)(BO) + xxspltd vs8, vs27, 1 + xxspltd vs24, vs27, 0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + +.macro END2x1_2 + /*for load2 offset will be 16 and 32*/ + KERNEL2x1_2 AO, BO, 16, 32, 0, 1, 1 +.endm + +.macro KERNEL2x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL2x1_L2 
OffsetA, OffsetB, Index, IsLast + KERNEL2x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL2x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvmaddasp vs32, vs5, vs8 + xvmaddasp vs40, vs5, vs10 +.if \Complete==0 + lxv vs27, DISP2(\Index, \OffsetA)(\AREG) + xxspltd vs8, vs27, 1 +.endif +.if \Complete==0 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs4, vs24 + xvmaddasp vs40, vs4, vs26 +.if \Complete==0 + xxspltd vs24, vs27, 0 + xxperm vs26, vs24, permute_mask +.endif +.if \Complete==0 + lxvp vs4, DISP4(\Index, 0+\OffsetB)(\BREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index, \OffsetA) + addi \BREG, \BREG, DISP4(\Index, \OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index, 16) + addi \BREG, \BREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 8, 16 +.endm + +.macro SAVE2x1 + add T1, CO, LDC +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif +#ifndef TRMMKERNEL + lxsd v5, 0(T1) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES_A_PERMUTE vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES_A_PERMUTE vs33, vs1, vs41, vs5 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, save_permute_1 +#ifndef TRMMKERNEL + /* add */ + xxspltd vs1, vs0, 0 + xxspltd vs3, vs0, 1 + /*--v4==vs36 v5==vs37---*/ + xvaddsp vs36, vs36, vs1 + xvaddsp vs37, vs37, vs3 +#else + /*--v4==vs36 v5==vs37---*/ + xxspltd vs36, vs0, 0 + xxspltd vs37, vs0, 1 +#endif + stxsd v4, 0(CO) + stxsd v5, 0(T1) + addi CO, CO, 8 +.endm + +/* macros for N=1 and M=8 +**********************************************************************************************/ + +.macro ZERO1x8 + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + +.macro LOAD1x8 + LOAD1x8O 0, 0 +.endm + +.macro LOAD1x8O OffsetA, OffsetB + lxsd v2, (\OffsetB+0)(BO) + lxvp 
vs32, (\OffsetA+0)(AO) + lxvp vs36, (\OffsetA+32)(AO) +.endm + +.macro END1x8_NORMAL + END1x8 AO, BO, 64,8 +.endm + +.macro END1x8_WITHOUT_ADD + END1x8 AO, BO, 0, 0 +.endm + +.macro END1x8 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 + xvf32gerpp 2, 34, 37 + xvf32gerpp 3, 34, 36 +.endm + +.macro LOAD1x8_2 + LOAD1x8_2O 0, 0 +.endm + +.macro LOAD1x8_2O OffsetA, OffsetB + lxv vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + lxvp vs36, (32+\OffsetA)(AO) + vspltisb v10, 0 + xxpermdi vs35, vs34, vs42, 0 + xxpermdi vs34, vs34, vs42, 2 + lxvp vs38, (64+\OffsetA)(AO) + lxvp vs40, (64+32+\OffsetA)(AO) +.endm + +.macro END1x8_2 + /*for load2 offset will be 128 and 16*/ + KERNEL1x8_2 AO, BO, 128, 16, 0, 1, 1 +.endm + +.macro KERNEL1x8_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x8_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x8_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 34, 37 + xvf32gerpp 3, 34, 36 +.if \Complete==0 + lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 35, 39 + xvf32gerpp 1, 35, 38 +.if \Complete==0 + lxvp vs38, DISP16(\Index, 64+\OffsetA)(\AREG) +.endif + xvf32gerpp 2, 35, 41 + xvf32gerpp 3, 35, 40 +.if \Complete==0 + lxv vs34, DISP2(\Index, \OffsetB)(\BREG) + xxpermdi vs35, vs34, vs42, 0 + xxpermdi vs34, vs34, vs42, 2 + lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP16(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP16(\Index, 128) +.endif +.endif 
+.endm + +.macro KERNEL1x8 + LOAD1x8 + END1x8 AO, BO, 64,8 +.endm + +.macro SAVE1x8 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + SHUFFLE_ACC 2, vs8, vs9, vs10, vs11, vs34, vs42, vs38, vs46 + SHUFFLE_ACC 3, vs12, vs13, vs14, vs15, vs35, vs43, vs39, vs47 + xxpermdi vs32, vs32, vs36, 0 + xxpermdi vs33, vs33, vs37, 0 + xxpermdi vs34, vs34, vs38, 0 + xxpermdi vs35, vs35, vs39, 0 + xxpermdi vs40, vs40, vs44, 0 + xxperm vs40, vs40, permute_mask + xxpermdi vs41, vs41, vs45, 0 + xxperm vs41, vs41, permute_mask + xxpermdi vs42, vs42, vs46, 0 + xxperm vs42, vs42, permute_mask + xxpermdi vs43, vs43, vs47, 0 + xxperm vs43, vs43, permute_mask +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask +#ifndef TRMMKERNEL + lxvp vs26, 32(CO) +#endif + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + xxperm vs2, vs34, permute_mask + xxperm vs6, vs42, permute_mask + xxperm vs3, vs35, permute_mask + xxperm vs7, vs43, permute_mask + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + AGGREGATE_REALS_IMAGES vs34, vs2, vs42, vs6 + AGGREGATE_REALS_IMAGES vs35, vs3, vs43, vs7 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART1 vs34, vs42, vs4, vs5 + MULT_APLHA_PART1 vs35, vs43, vs6, vs7 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs34, vs42, vs4, vs5 + MULT_APLHA_PART2 vs35, vs43, vs6, vs7 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 + xxperm vs2, vs3, vs28 + xxperm vs4, vs5, vs28 + xxperm vs6, vs7, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs2 + xvaddsp vs25, vs25, vs0 + xvaddsp vs26, vs26, vs6 + xvaddsp vs27, vs27, vs4 + stxvp vs24, 0(CO) + 
stxvp vs26, 32(CO) +#else +/* reconstruct r, i pairs*/ + stxv vs0, 0(CO) + stxv vs2, 16(CO) + stxv vs4, 32(CO) + stxv vs6, 48(CO) +#endif + addi CO, CO, 64 +.endm + +/* macros for N=1 and M=4 +**********************************************************************************************/ + +.macro ZERO1x4 + xxsetaccz 0 + xxsetaccz 1 +.endm + +.macro LOAD1x4 + LOAD1x4O 0, 0 +.endm + +.macro LOAD1x4O OffsetA, OffsetB + lxsd v2, (\OffsetB+0)(BO) + lxvp vs32, (\OffsetA+0)(AO) +.endm + +.macro END1x4_NORMAL + END1x4 AO, BO, 32,8 +.endm + +.macro END1x4_WITHOUT_ADD + END1x4 AO, BO, 0, 0 +.endm + +.macro END1x4 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif + +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.endm + +.macro LOAD1x4_2 + LOAD1x4_2O 0, 0 +.endm + +.macro LOAD1x4_2O OffsetA, OffsetB + lxv vs34, (\OffsetB)(BO) + lxvp vs32, (0+\OffsetA)(AO) + vspltisb v6, 0 + xxpermdi vs35, vs34, vs38, 0 + xxpermdi vs34, vs34, vs38, 2 + lxvp vs36, (32+\OffsetA)(AO) +.endm + +.macro END1x4_2 + /*for load2 offset will be 64 and 16*/ + KERNEL1x4_2 AO, BO, 64, 16, 0, 1, 1 +.endm + +.macro KERNEL1x4_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x4_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x4_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvf32gerpp 0, 34, 33 + xvf32gerpp 1, 34, 32 +.if \Complete==0 + lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) +.endif + xvf32gerpp 0, 35, 37 + xvf32gerpp 1, 35, 36 +.if \Complete==0 + lxv vs34, DISP2(\Index, \OffsetB)(\BREG) + xxpermdi vs35, vs34, vs38, 0 + xxpermdi vs34, vs34, vs38, 2 + lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP8(\Index, \OffsetA) +.else + addi \BREG, \BREG, 
DISP2(\Index, 16) + addi \AREG, \AREG, DISP8(\Index, 64) +.endif +.endif +.endm + +.macro KERNEL1x4 + LOAD1x4 + END1x4 AO, BO, 32,8 +.endm + +.macro SAVE1x4 + SHUFFLE_ACC 0, vs0, vs1, vs2, vs3, vs32, vs40, vs36, vs44 + SHUFFLE_ACC 1, vs4, vs5, vs6, vs7, vs33, vs41, vs37, vs45 + xxpermdi vs32, vs32, vs36, 0 + xxpermdi vs40, vs40, vs44, 0 + xxpermdi vs33, vs33, vs37, 0 + xxpermdi vs41, vs41, vs45, 0 + xxperm vs40, vs40, permute_mask + xxperm vs41, vs41, permute_mask +#ifndef TRMMKERNEL + lxvp vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + xxperm vs1, vs33, permute_mask + xxperm vs5, vs41, permute_mask + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + AGGREGATE_REALS_IMAGES vs33, vs1, vs41, vs5 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART1 vs33, vs41, vs2, vs3 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs33, vs41, vs2, vs3 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 + xxperm vs2, vs3, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs2 + xvaddsp vs25, vs25, vs0 + stxvp vs24, 0(CO) +#else +/* reconstruct r, i pairs*/ + stxv vs0, 0(CO) + stxv vs2, 16(CO) +#endif + addi CO, CO, 32 +.endm + +/* macros for N=1 and M=2 +**********************************************************************************************/ + +.macro ZERO1x2 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD1x2 + LOAD1x2O 0, 0 +.endm + +.macro LOAD1x2O OffsetA, OffsetB + lxsd v4, (\OffsetB+0)(BO) /* lxsd targets a VR: v4 is vs36, which xxspltd reads next */ + lxv vs0, (\OffsetA+0)(AO) + xxspltd vs24, vs36, 0 /* splat the doubleword just loaded from BO */ + xxperm vs26, vs24, permute_mask +.endm + +.macro END1x2_NORMAL + END1x2 AO, BO, 16,8 +.endm + +.macro END1x2_WITHOUT_ADD + END1x2 AO, BO, 0, 0 +.endm + +.macro END1x2 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif +
xvmaddasp vs32, vs0, vs24 + xvmaddasp vs40, vs0, vs26 +.endm + +.macro LOAD1x2_2 + LOAD1x2_2O 0, 0 +.endm + +.macro LOAD1x2_2O OffsetA, OffsetB + lxv vs27, (\OffsetB)(BO) + lxvp vs4, (0+\OffsetA)(AO) + xxspltd vs8, vs27, 1 + xxspltd vs24, vs27, 0 + xxperm vs10, vs8, permute_mask + xxperm vs26, vs24, permute_mask +.endm + +.macro END1x2_2 + /*for load2 offset will be 32 and 16*/ + KERNEL1x2_2 AO, BO, 32, 16, 0, 1, 1 +.endm + +.macro KERNEL1x2_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x2_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x2_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete +.if \Complete==0 + lxv vs27, DISP2(\Index, \OffsetB)(\BREG) +.endif + xvmaddasp vs32, vs5, vs8 + xvmaddasp vs40, vs5, vs10 + +.if \Complete==0 + xxspltd vs8, vs27, 1 + xxperm vs10, vs8, permute_mask +.endif + xvmaddasp vs32, vs4, vs24 + xvmaddasp vs40, vs4, vs26 +.if \Complete==0 + lxvp vs4, DISP4(\Index, 0+\OffsetA)(\AREG) +.endif + +.if \Complete==0 + xxspltd vs24, vs27, 0 + xxperm vs26, vs24, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP4(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP4(\Index, 32) +.endif +.endif +.endm + +.macro KERNEL1x2 + LOAD1x2 + END1x2 AO, BO, 16,8 +.endm + +.macro SAVE1x2 +#ifndef TRMMKERNEL + lxv vs24, 0(CO) +#endif + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs0, vs1 + MULT_APLHA_PART2 vs32, vs40, vs0, vs1 +/* reconstruct r, i pairs*/ + xxperm vs0, vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs24, vs24, vs0 + stxv vs24, 0(CO) +#else +/* 
reconstruct r, i pairs*/ + stxv vs0, 0(CO) +#endif + addi CO, CO, 16 +.endm + +/* macros for N=1 and M=1 +**********************************************************************************************/ +.macro ZERO1x1 + xxlxor vs32, vs32, vs32 + xxlxor vs40, vs40, vs40 +.endm + +.macro LOAD1x1 + LOAD1x1O 0, 0 +.endm + +.macro LOAD1x1O OffsetA, OffsetB + lxsd v4, (\OffsetB+0)(BO) + lxsd v5, (\OffsetA+0)(AO) + xxperm vs38, vs36, permute_mask +.endm + +.macro END1x1_NORMAL + END1x1 AO, BO,8,8 +.endm + +.macro END1x1_WITHOUT_ADD + END1x1 AO, BO, 0, 0 +.endm + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddasp vs32, vs37, vs36 + xvmaddasp vs40, vs37, vs38 +.endm + +.macro LOAD1x1_2 + LOAD1x1_2O 0, 0 +.endm + +.macro LOAD1x1_2O OffsetA, OffsetB + lxv vs8, (\OffsetB)(BO) + lxv vs4, (0+\OffsetA)(AO) + xxperm vs10, vs8, permute_mask +.endm + +.macro END1x1_2 + /*for load2 offset will be 16 and 16*/ + KERNEL1x1_2 AO, BO, 16, 16, 0, 1, 1 +.endm + +.macro KERNEL1x1_E2 OffsetA, OffsetB, Index, IsLast + KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1 +.endm + +.macro KERNEL1x1_L2 OffsetA, OffsetB, Index, IsLast + KERNEL1x1_2 AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0 +.endm + +.macro KERNEL1x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete + xvmaddasp vs32, vs4, vs8 /* vs32 += vs4 * vs8 */ + xvmaddasp vs40, vs4, vs10 /* vs40 += vs4 * permuted copy of vs8 */ +.if \Complete==0 + lxv vs8, DISP2(\Index, \OffsetB)(\BREG) /* reload B; skipped when Complete==1 */ + lxv vs4, DISP2(\Index, \OffsetA)(\AREG) /* reload A: displacement comes from OffsetA, not OffsetB */ + xxperm vs10, vs8, permute_mask +.endif +.if \IsLast==1 +.if \Complete==1 + addi \BREG, \BREG, DISP2(\Index, \OffsetB) + addi \AREG, \AREG, DISP2(\Index, \OffsetA) +.else + addi \BREG, \BREG, DISP2(\Index, 16) + addi \AREG, \AREG, DISP2(\Index, 16) +.endif +.endif +.endm + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 8,8 +.endm + +.macro SAVE1x1 +#ifndef TRMMKERNEL + lxsd v4, 0(CO) +#endif + /*aggregate x2*/ + xxpermdi vs33, vs32, vs32, 2 +
xxpermdi vs41, vs40, vs40, 2 + xvaddsp vs32, vs32, vs33 + xvaddsp vs40, vs40, vs41 + + xxperm vs0, vs32, permute_mask + xxperm vs4, vs40, permute_mask + AGGREGATE_REALS_IMAGES vs32, vs0, vs40, vs4 + /*inner reverse save_permute and store vs28 */ + xxpermdi vs28,save_permute_1,save_permute_1, 2 + /*VSINRR, VSINII, VSOUT1, VSOUT2*/ + MULT_APLHA_PART1 vs32, vs40, vs37, vs1 + MULT_APLHA_PART2 vs32, vs40, vs37, vs1 +/* reconstruct r, i pairs*/ + xxperm vs37, vs1, vs28 +#ifndef TRMMKERNEL + /* add */ + xvaddsp vs36, vs36, vs37 + stxsd v4, 0(CO) +#else +/* vs37 is v5 */ + stxsd v5, 0(CO) +#endif + addi CO, CO, 8 +.endm + +/****************************TRMM POINTER REFRESH MACROS*************************/ +.macro SHIFT_REG REG1,REG2,SHIFT_VAL +.if \SHIFT_VAL==16 + slwi \REG1, \REG2, 7 /* REG1 = REG2 * 16 * 8 */ +.elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 6 /* REG1 = REG2 * 8 * 8 */ +.elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 5 /* REG1 = REG2 * 4 * 8 */ +.elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 4 /* REG1 = REG2 * 2 * 8 */ +.elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 3 /* REG1 = REG2 * 1 * 8; any other SHIFT_VAL emits nothing */ +.endif +.endm + +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*8; +// ptrbb = bb + off*4; +// #endif +*/ +.macro REFRESH_POINTERS PTR_A,PTR_B, OFF_VAL, B_VAL, C_A, C_B +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +/* ptrbb = bb;*/ + mr \PTR_B, \B_VAL /* refresh BPOINT */ +#else +/* +// ptrba =ptrba+ off*C_A; +// ptrbb = bb + off*C_B; +*/ + SHIFT_REG T4, \OFF_VAL, \C_B /* Number of values in B shifted */ + SHIFT_REG T2, \OFF_VAL, \C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL, T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ +#endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+8; // number of values in A +// #else +// temp = off+4; // number of values in B +// #endif +*/ +.macro REFRESH_TEMP_BK TEMP_BK, BK_VAL, OFF_VAL, INCR_A, INCR_B + #if
(defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK, \BK_VAL, \OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK, \OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 8; // number of values in A +// #else +// temp -= 4; // number of values in B +// #endif +// ptrba += temp*8; +// ptrbb += temp*4; +// #endif + +// #ifdef LEFT +// off += 8; // number of values in A +// #endif +*/ +.macro REFRESH_AFTER_SAVE TEMP_BK, BK_VAL, OFF_VAL,PTR_B,PTR_A, C_A, C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK, \BK_VAL, \OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK, \TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK, \TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + SHIFT_REG T4, \TEMP_BK, \C_A + SHIFT_REG T2, \TEMP_BK, \C_B + add \PTR_A, \PTR_A, T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B, T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL, \OFF_VAL, \C_A + #endif +.endm diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c new file mode 100644 index 0000000000..b3ee301be8 --- /dev/null +++ b/kernel/power/dgemm_kernel_power10.c @@ -0,0 +1,864 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************/ +#include "common.h" +#include <altivec.h> + +typedef unsigned char vec_t __attribute__ ((vector_size (16))); /* raw 16-byte vector */ +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); /* 16 bytes of FLOAT lanes */ +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); /* 8 bytes of FLOAT lanes */ + +#ifdef TRMMKERNEL /* TRMM build: C rows are overwritten with result * alpha */ +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[2] * alpha; +#else /* GEMM build: result * alpha is accumulated into the existing C rows */ +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ +
__builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[2] * alpha; +#endif + +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); + +#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory"); /* dcbt RA,RB: RA encoded as r0 means literal 0, so x (RA) must use the "b" base-register constraint */ + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * x; \ + BO = B + off * y; \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * x; \ + BO += temp * y; +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel
+*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG N = n; + BLASLONG i1; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + v4sf_t valpha = { alpha, alpha }; + N = n >> 2; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; + FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 2; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + BLASLONG l = 0; + PREFETCH1 (CO, 0); + PREFETCH1 (CO + ldc, 0); + PREFETCH1 (CO + ldc + ldc, 0); + PREFETCH1 (CO + ldc + ldc + ldc, 0); + PREFETCH1 (CO, 128); + PREFETCH1 (CO + ldc, 128); + PREFETCH1 (CO + ldc + ldc, 128); + PREFETCH1 (CO + ldc + ldc + ldc, 128); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); + __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC (&acc1, 2); + SAVE_ACC (&acc3, 6); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC (&acc5, 10); + SAVE_ACC (&acc7, 14); 
+ AO += temp << 4; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 4) +#endif + CO += 16; + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC (&acc1, 2); + SAVE_ACC (&acc3, 6); + CO += 8; + AO += temp << 3; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 2); + CO += 4; + AO += temp << 2; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 
0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[l << 2]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + } + SAVE_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] }; + v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t1[0]; + CO[3 * ldc] = t1[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t1[0]; + CO[3 * ldc] += t1[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + B += k << 2; + } + N = (n & 3) >> 1; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], 
rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 4]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); + __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + SAVE2x4_ACC (&acc2, 4); + SAVE2x4_ACC (&acc3, 6); + SAVE2x4_ACC (&acc4, 8); + SAVE2x4_ACC (&acc5, 10); + SAVE2x4_ACC (&acc6, 12); + SAVE2x4_ACC (&acc7, 14); + CO += 16; + AO += temp << 4; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 2) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 3]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + SAVE2x4_ACC (&acc2, 4); + SAVE2x4_ACC (&acc3, 6); + CO += 8; + AO += temp << 3; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + 
__builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 2]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + CO += 4; + AO += temp << 2; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[l << 1]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + } + SAVE2x4_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + B += k << 1; + } + N = 
(n & 1) >> 0; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + while (i >= 16) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + v4sf_t t3 = { 0, 0 }; + v4sf_t t4 = { 0, 0 }; + v4sf_t t5 = { 0, 0 }; + v4sf_t t6 = { 0, 0 }; + v4sf_t t7 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] }; + v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] }; + v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] }; + v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] }; + v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] }; + v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] }; + v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] }; + v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + t4 += rowA4 * rowB; + t5 += rowA5 * rowB; + t6 += rowA6 * rowB; + t7 += rowA7 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; + t4 = t4 * valpha; + t5 = t5 * valpha; + t6 = t6 * valpha; + t7 = t7 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; + CO[4] = t2[0]; + CO[5] = t2[1]; + CO[6] = t3[0]; + CO[7] = t3[1]; + CO[8] = t4[0]; + CO[9] = t4[1]; + CO[10] = t5[0]; + CO[11] = t5[1]; + CO[12] = t6[0]; + CO[13] = t6[1]; + CO[14] = t7[0]; + CO[15] = t7[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; + CO[4] += t2[0]; + CO[5] += t2[1]; + CO[6] += t3[0]; + CO[7] += t3[1]; + CO[8] += t4[0]; + CO[9] += t4[1]; + CO[10] += t5[0]; + CO[11] += t5[1]; + CO[12] += t6[0]; + CO[13] += t6[1]; + CO[14] += t7[0]; + CO[15] += t7[1]; +#endif 
+ AO += temp << 4; + BO += temp; + CO += 16; + i -= 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 1) +#endif + } + while (i >= 8) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + v4sf_t t3 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] }; + v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] }; + v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] }; + v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; + CO[4] = t2[0]; + CO[5] = t2[1]; + CO[6] = t3[0]; + CO[7] = t3[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; + CO[4] += t2[0]; + CO[5] += t2[1]; + CO[6] += t3[0]; + CO[7] += t3[1]; +#endif + AO += temp << 3; + BO += temp; + CO += 8; + i -= 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + while (i >= 4) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] }; + v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; +#endif + AO += temp << 2; + BO += temp; + CO += 4; + i -= 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + while (i 
>= 2) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; +#endif + AO += temp << 1; + BO += temp; + CO += 2; + i -= 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + while (i >= 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < temp; l++) + { + t += AO[l] * BO[l]; + } + AO += temp; + BO += temp; +#if defined(TRMMKERNEL) + CO[0] = t * alpha; +#else + CO[0] += t * alpha; +#endif + CO += 1; + i -= 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k; + } + return 0; +} diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c new file mode 100644 index 0000000000..01c122c6d4 --- /dev/null +++ b/kernel/power/sgemm_kernel_power10.c @@ -0,0 +1,1334 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ +#include "common.h" +#include <altivec.h> + +typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); +#if defined(TRMMKERNEL) +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] = result[0] * alpha; +#define
SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[6] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[4] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[6] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ + rowC[0] = result[4] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] = result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[2] * alpha; +#else +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[6] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] += result[0] 
* alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[6] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[2] * alpha; +#endif +#define KERNEL(i, j) \ + __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \ + __builtin_mma_xvf32gerpp (&acc1, rowB[i+1], rowA[j]); \ + __builtin_mma_xvf32gerpp (&acc2, rowB[i], rowA[j+1]); \ + __builtin_mma_xvf32gerpp (&acc3, rowB[i+1], rowA[j+1]); \ + __builtin_mma_xvf32gerpp (&acc4, rowB[i], rowA[j+2]); \ + __builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \ + __builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \ + __builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]); +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); + +#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +#define REFRESH_TEMP_BK(x, y) \ + temp = k - off; +#elif defined(LEFT) +#define REFRESH_TEMP_BK(x, y) \ + temp = off + x; +#else +#define REFRESH_TEMP_BK(x, y) \ + temp = off + y; +#endif +#if (defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_POINTERS(x, y) \ + BO = B; \ + REFRESH_TEMP_BK(x, y) +#else +#define REFRESH_POINTERS(x, y) \ + AO += off * x; \ + BO = B + off * y; \ + REFRESH_TEMP_BK(x, y) +#endif + +#ifdef LEFT +#define REFRESH_OFF(x) \ + off += x; +#else +#define REFRESH_OFF(x) +#endif + +#ifdef LEFT +#define UPDATE_TEMP(x, y) \ + temp -= x; +#else +#define UPDATE_TEMP(x, y) \ + temp -= y; +#endif + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +#define REFRESH_TMP_AFTER_SAVE(x, y) \ + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * x; \ + BO += temp * y; +#else +#define REFRESH_TMP_AFTER_SAVE(x, y) +#endif + +#define REFRESH_AFTER_SAVE(x,y) \ + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) +/************************************************************************************* +* GEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, + FLOAT * C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG N = n; + BLASLONG i1; +#if defined(TRMMKERNEL) + BLASLONG off; +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + v4sf_t valpha = { alpha, alpha, alpha, alpha }; + N = n >> 3; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; + FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 3; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + BLASLONG K = temp / 64; + for (l = 0; l < K; l++) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + 
KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + KERNEL (32, 64); + KERNEL (34, 68); + KERNEL (36, 72); + KERNEL (38, 76); + KERNEL (40, 80); + KERNEL (42, 84); + KERNEL (44, 88); + KERNEL (46, 92); + KERNEL (48, 96); + KERNEL (50, 100); + KERNEL (52, 104); + KERNEL (54, 108); + KERNEL (56, 112); + KERNEL (58, 116); + KERNEL (60, 120); + KERNEL (62, 124); + KERNEL (64, 128); + KERNEL (66, 132); + KERNEL (68, 136); + KERNEL (70, 140); + KERNEL (72, 144); + KERNEL (74, 148); + KERNEL (76, 152); + KERNEL (78, 156); + KERNEL (80, 160); + KERNEL (82, 164); + KERNEL (84, 168); + KERNEL (86, 172); + KERNEL (88, 176); + KERNEL (90, 180); + KERNEL (92, 184); + KERNEL (94, 188); + KERNEL (96, 192); + KERNEL (98, 196); + KERNEL (100, 200); + KERNEL (102, 204); + KERNEL (104, 208); + KERNEL (106, 212); + KERNEL (108, 216); + KERNEL (110, 220); + KERNEL (112, 224); + KERNEL (114, 228); + KERNEL (116, 232); + KERNEL (118, 236); + KERNEL (120, 240); + KERNEL (122, 244); + KERNEL (124, 248); + KERNEL (126, 252); + AO += 1024; + BO += 512; + } + if ((temp & 63) >> 5) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + KERNEL (32, 64); + KERNEL (34, 68); + KERNEL (36, 72); + KERNEL (38, 76); + KERNEL (40, 80); + KERNEL (42, 84); + KERNEL (44, 88); + KERNEL (46, 92); + KERNEL (48, 96); + KERNEL (50, 100); + KERNEL (52, 104); + KERNEL (54, 108); + KERNEL (56, 112); + KERNEL (58, 116); + KERNEL (60, 120); + KERNEL (62, 124); + AO += 512; + BO += 
256; + } + if ((temp & 31) >> 4) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + KERNEL (16, 32); + KERNEL (18, 36); + KERNEL (20, 40); + KERNEL (22, 44); + KERNEL (24, 48); + KERNEL (26, 52); + KERNEL (28, 56); + KERNEL (30, 60); + AO += 256; + BO += 128; + } + if ((temp & 15) >> 3) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + KERNEL (8, 16); + KERNEL (10, 20); + KERNEL (12, 24); + KERNEL (14, 28); + AO += 128; + BO += 64; + } + if ((temp & 7) >> 2) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + KERNEL (4, 8); + KERNEL (6, 12); + AO += 64; + BO += 32; + } + if ((temp & 3) >> 1) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + KERNEL (2, 4); + AO += 32; + BO += 16; + } + if ((temp & 1) >> 0) + { + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + KERNEL (0, 0); + AO += 16; + BO += 8; + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC1 (&acc5, 8); + SAVE_ACC1 (&acc7, 12); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 8) +#endif + CO += 16; + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], 
rowA[1]); + __builtin_mma_xvf32gerpp (&acc3, rowB[1], rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + AO += (temp << 3); + BO += (temp << 3); + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 8) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 4; + AO += (temp << 2); + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 8) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 8); +#else + BO = B; + temp = k; +#endif + + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); + } + SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC1 (&acc1, 0); + CO += 2; + AO += (temp << 1); + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 8) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 8); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { 
+ v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1], BO[(l << 3) + 2], + BO[(l << 3) + 3] + }; + v4sf_t rowB1 = + { BO[(l << 3) + 4], BO[(l << 3) + 5], BO[(l << 3) + 6], + BO[(l << 3) + 7] + }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t[2]; + CO[3 * ldc] = t[3]; + CO[4 * ldc] = t1[0]; + CO[5 * ldc] = t1[1]; + CO[6 * ldc] = t1[2]; + CO[7 * ldc] = t1[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + CO[4 * ldc] += t1[0]; + CO[5 * ldc] += t1[1]; + CO[6 * ldc] += t1[2]; + CO[7 * ldc] += t1[3]; +#endif + CO += 1; + AO += temp; + BO += (temp << 3); +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 8) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; // number of values in A +#endif + + B += k << 3; + } + N = (n & 7) >> 2; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 2; + AO = A; +#if !defined(TRMMKERNEL) + i = m >> 5; + for (j = 0; j < i; j++) + { + FLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + FLOAT *A1; + A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowA1 = (vec_t *) & A1[l << 4]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32gerpp (&acc7, rowB[0], 
rowA1[3]); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + SAVE_ACC (&acc4, 0); + SAVE_ACC (&acc5, 4); + CO += 8; + SAVE_ACC (&acc6, 0); + SAVE_ACC (&acc7, 4); + CO += 8; + AO += k << 5; + BO += k << 2; + } + i = (m & 31) >> 4; +#else + i = m >> 4; +#endif + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + AO += temp << 4; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 4) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 3]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + AO += temp << 3; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + 
__vector_quad acc0; + v4sf_t result[4]; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + vec_t *rowA = (vec_t *) & AO[l << 2]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE_ACC (&acc0, 0); + CO += 4; + AO += temp << 2; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); +#else + BO = B; + temp = k; +#endif + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE4x2_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1], BO[(l << 2) + 2], + BO[(l << 2) + 3] + }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t[2]; + CO[3 * ldc] = t[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; +#endif + CO += 1; + AO += temp; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 4) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 4; // number of values in A +#endif + + B += k << 2; + } + N = (n & 3) >> 1; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j, temp; +#if defined(TRMMKERNEL) && 
defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; +#if !defined(TRMMKERNEL) + i = m >> 5; + for (j = 0; j < i; j++) + { + FLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + FLOAT *A1; + A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 4]; + vec_t *rowA1 = (vec_t *) & A1[l << 4]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32gerpp (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32gerpp (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32gerpp (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32gerpp (&acc7, rowB[0], rowA1[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + SAVE2x4_ACC (&acc4, 0); + SAVE2x4_ACC (&acc5, 4); + SAVE2x4_ACC (&acc6, 8); + SAVE2x4_ACC (&acc7, 12); + CO += 16; + AO += k << 5; + BO += k << 1; + } + i = (m & 31) >> 4; +#else + i = m >> 4; +#endif + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 2) +#else + BO = B; + temp = k; +#endif + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 4]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32gerpp (&acc3, rowB[0], rowA[3]); + } + SAVE2x4_ACC (&acc0, 
0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + AO += temp << 4; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 2) +#endif + } + i = (m & 15) >> 3; + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 3]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + CO += 8; + AO += temp << 3; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + i = (m & 7) >> 2; + for (j = 0; j < i; j++) + { + FLOAT *BO; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2) +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + for (l = 0; l < temp; l++) + { + FLOAT t[4] = { 0 }; + t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[l << 2]; + __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); + } + SAVE2x4_ACC (&acc0, 0); + CO += 4; + AO += temp << 2; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + i = (m & 3) >> 1; + for (j = 0; j < i; j++) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2) +#else + BO = B; + temp = k; +#endif + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < (temp << 1); l += 2) + { + v4sf_t rowA = { AO[l], AO[l], AO[l + 1], AO[l + 1] }; + v4sf_t rowB = { BO[l], BO[l + 1], BO[l], BO[l + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if 
defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[0 * ldc + 1] = t[2]; + CO[1 * ldc + 1] = t[3]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[0 * ldc + 1] += t[2]; + CO[1 * ldc + 1] += t[3]; +#endif + CO += 2; + AO += temp << 1; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + i = (m & 1) >> 0; + for (j = 0; j < i; j++) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2) +#else + BO = B; + temp = k; +#endif + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l], 0, 0 }; + v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1], 0, 0 }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 2) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 2; // number of values in A +#endif + + B += k << 1; + } + N = (n & 1) >> 0; + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, temp; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + while (i >= 16) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (16, 1) +#else + BO = B; + temp = k; +#endif + + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + v4sf_t t2 = { 0, 0, 0, 0 }; + v4sf_t t3 = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1], AO[(l << 4) + 2], + AO[(l << 4) + 3] + }; + v4sf_t rowA1 = + { AO[(l << 4) + 4], AO[(l << 4) + 5], AO[(l << 4) + 6], + AO[(l << 4) + 7] + }; + v4sf_t rowA2 = + { AO[(l << 4) + 8], AO[(l << 4) + 9], AO[(l << 4) + 10], + AO[(l << 4) + 11] + }; + v4sf_t rowA3 = + { AO[(l << 4) + 12], AO[(l << 4) + 13], AO[(l << 4) + 14], + AO[(l << 4) + 15] 
+ }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; + CO[4] = t1[0]; + CO[5] = t1[1]; + CO[6] = t1[2]; + CO[7] = t1[3]; + CO[8] = t2[0]; + CO[9] = t2[1]; + CO[10] = t2[2]; + CO[11] = t2[3]; + CO[12] = t3[0]; + CO[13] = t3[1]; + CO[14] = t3[2]; + CO[15] = t3[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + CO[8] += t2[0]; + CO[9] += t2[1]; + CO[10] += t2[2]; + CO[11] += t2[3]; + CO[12] += t3[0]; + CO[13] += t3[1]; + CO[14] += t3[2]; + CO[15] += t3[3]; +#endif + AO += temp << 4; + BO += temp; + CO += 16; + i -= 16; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (16, 1) +#endif + } + while (i >= 8) + { + FLOAT *BO; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) +#else + BO = B; + temp = k; +#endif + + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1], AO[(l << 3) + 2], + AO[(l << 3) + 3] + }; + v4sf_t rowA1 = + { AO[(l << 3) + 4], AO[(l << 3) + 5], AO[(l << 3) + 6], + AO[(l << 3) + 7] + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; + CO[4] = t1[0]; + CO[5] = t1[1]; + CO[6] = t1[2]; + CO[7] = t1[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; +#endif + AO += temp << 3; + BO += temp; + CO += 8; + i -= 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + while (i >= 4) + { + FLOAT *BO; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; +#if defined(TRMMKERNEL) + 
REFRESH_POINTERS (4, 1) +#else + BO = B; + temp = k; +#endif + + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1], AO[(l << 2) + 2], + AO[(l << 2) + 3] + }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t[2]; + CO[3] = t[3]; +#else + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; +#endif + AO += temp << 2; + BO += temp; + CO += 4; + i -= 4; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 1) +#endif + } + while (i >= 2) + { + FLOAT *BO; + BLASLONG l = 0; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 1) +#else + BO = B; + temp = k; +#endif + + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowB = { BO[l], BO[l], 0, 0 }; + v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1], 0, 0 }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; +#else + CO[0] += t[0]; + CO[1] += t[1]; +#endif + AO += temp << 1; + BO += temp; + CO += 2; + i -= 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 1) +#endif + } + while (i >= 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 1) +#else + BO = B; + temp = k; +#endif + + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < temp; l++) + { + t += AO[l] * BO[l]; + } + AO += temp; + BO += temp; +#if defined(TRMMKERNEL) + CO[0] = t * alpha; +#else + CO[0] += t * alpha; +#endif + CO += 1; + i -= 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 1) +#endif + } + +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 1; // number of values in A +#endif + B += k; + } + return 0; +} From bb2f52844bbcd5c786d7b37f8c4d88dbf7a3b89e Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Wed, 24 Jun 2020 14:50:12 -0500 Subject: [PATCH 015/349] powerpc: Optimized ZGEMM kernel for POWER10 This patch introduces new optimized version of ZGEMM kernel using power10 Matrix-Multiply Assist (MMA) feature introduced in POWER ISA 
v3.1. This patch makes use of new POWER10 compute instructions for matrix multiplication operation. Tested on simulator and there are no new test failures. Cycles count reduced by 30-50% compared to POWER9 version depending on M/N/K sizes. --- kernel/power/KERNEL.POWER10 | 4 +- kernel/power/zgemm_kernel_power10.S | 245 ++++ kernel/power/zgemm_logic_power10.S | 1735 +++++++++++++++++++++++++++ kernel/power/zgemm_macros_power10.S | 1138 ++++++++++++++++++ 4 files changed, 3120 insertions(+), 2 deletions(-) create mode 100644 kernel/power/zgemm_kernel_power10.S create mode 100644 kernel/power/zgemm_logic_power10.S create mode 100644 kernel/power/zgemm_macros_power10.S diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 00d31f8b6a..4fc7190b0b 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -10,7 +10,7 @@ else STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c CTRMMKERNEL = cgemm_kernel_power10.S -ZTRMMKERNEL = zgemm_kernel_power9.S +ZTRMMKERNEL = zgemm_kernel_power10.S SGEMMKERNEL = sgemm_kernel_power10.c SGEMMINCOPY = ../generic/gemm_ncopy_16.c @@ -42,7 +42,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -ZGEMMKERNEL = zgemm_kernel_power9.S +ZGEMMKERNEL = zgemm_kernel_power10.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c diff --git a/kernel/power/zgemm_kernel_power10.S b/kernel/power/zgemm_kernel_power10.S new file mode 100644 index 0000000000..fca389e691 --- /dev/null +++ b/kernel/power/zgemm_kernel_power10.S @@ -0,0 +1,245 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#define LOAD ld + +#define STACKSIZE 512 /* bytes reserved below the entry SP for the register save area */ + +#define FZERO 312+192(SP) + +#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ + +#define M r3 +#define N r4 +#define K r5 + + +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 + + + +#define o0 0 +#define alpha_r vs62 +#define alpha_i vs63 + +#define VECSAVE r11 + +#define FRAMEPOINTER r12 + +#define T10 r14 + +#define L r15 +#define T8 r16 +#define T5 r17 +#define T2 r19 +#define TEMP_REG r20 +#define T6 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define T7 r27 +#define T3 r28 +#define T4 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + mr FRAMEPOINTER, SP /* keep the entry SP: stack-passed arguments are loaded through it below */ + addi SP, SP, -STACKSIZE /* reserve the STACKSIZE-byte save area */ + mflr r0 /* LR -> r0; stored at FLINK_SAVE(SP) after the other saves */ + stfd f14, 0(SP) /* save f14-f31 at 0..136(SP); reloaded at L999 */ + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + + xxspltd alpha_r,vs1,0 /*copy from register f1 */ + xxspltd alpha_i,vs2,0 /*copy from register f2 */ + + std r31, 144(SP) /* save r14-r31 at 144..280(SP); reloaded at L999 */ + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + + + stxv vs20, 288(SP) /* save vs20-vs31 at 288..464(SP); NOTE(review): vs62/vs63 (alpha_r/alpha_i) are written above but are not in this saved set - confirm they are volatile under the target ABI */ + stxv vs21, 304(SP) + stxv vs22, 320(SP) + stxv vs23, 336(SP) + stxv vs24, 352(SP) + stxv vs25, 368(SP) + stxv vs26, 384(SP) + stxv vs27, 400(SP) + stxv vs28, 416(SP) + stxv vs29, 432(SP) + stxv vs30, 448(SP) + stxv vs31, 464(SP) + + std r0, FLINK_SAVE(SP) /* store the return address copied from LR above */ + + +#if 
defined(linux) || defined(__FreeBSD__) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) /* LDC is loaded from memory relative to the entry SP */ +#endif + + +#ifdef TRMMKERNEL +#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) /* OFFSET is only loaded in TRMM builds */ +#endif +#endif + + +#include "zgemm_macros_power10.S" + + + + slwi LDC, LDC, ZBASE_SHIFT /* LDC <<= ZBASE_SHIFT; presumably elements -> bytes, ZBASE_SHIFT is defined outside this file - confirm */ + li PRE, 512 /* offset register for the "dcbt AO, PRE" / "dcbt BO, PRE" prefetches in the logic file */ + li r0, 0 /* r0 = 0; the logic file uses it as the index register of dcbt (e.g. "dcbt CO,r0") */ + + +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +/*negate for this case as we will use addition -1*(a+b) */ + xvnegdp alpha_r,alpha_r + xvnegdp alpha_i,alpha_i +#endif + .align 4 + +#include "zgemm_logic_power10.S" + +L999: /* epilogue: reload everything stored in the prologue, then return */ + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + + + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + + ld r0, FLINK_SAVE(SP) /* reload the saved return address */ + + lxv vs20, 288(SP) + lxv vs21, 304(SP) + lxv vs22, 320(SP) + lxv vs23, 336(SP) + lxv vs24, 352(SP) + lxv vs25, 368(SP) + lxv vs26, 384(SP) + lxv vs27, 400(SP) + mtlr r0 /* LR <- saved return address, consumed by the blr below */ + lxv vs28, 416(SP) + lxv vs29, 432(SP) + lxv vs30, 448(SP) + lxv vs31, 464(SP) + + addi SP, SP, STACKSIZE /* release the save area reserved in the prologue */ + blr + + EPILOGUE +#endif diff --git a/kernel/power/zgemm_logic_power10.S b/kernel/power/zgemm_logic_power10.S new file mode 100644 index 0000000000..1143733e0a --- /dev/null +++ b/kernel/power/zgemm_logic_power10.S @@ -0,0 +1,1735 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#define MY_ALIGN .align 3 +b ZGEMM_L2 +/* MINI SUBROUTINES */ +/* 2x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L2x8_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 +ZGEMM_L2x8_K128: +/*----------------------------------------*/ + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 0 + KERNEL2x8_2 16, 0 + KERNEL2x8_2 17, 0 + KERNEL2x8_2 18, 0 + KERNEL2x8_2 19, 0 + KERNEL2x8_2 20, 0 + KERNEL2x8_2 21, 0 + KERNEL2x8_2 22, 0 + KERNEL2x8_2 23, 0 + KERNEL2x8_2 24, 0 + KERNEL2x8_2 25, 0 + KERNEL2x8_2 26, 0 + KERNEL2x8_2 27, 0 + KERNEL2x8_2 28, 0 + KERNEL2x8_2 29, 0 + KERNEL2x8_2 30, 0 + KERNEL2x8_2 31, 0 + KERNEL2x8_2 32, 0 + KERNEL2x8_2 33, 0 + KERNEL2x8_2 34, 0 + KERNEL2x8_2 35, 0 + KERNEL2x8_2 36, 0 + KERNEL2x8_2 37, 0 + KERNEL2x8_2 38, 0 + KERNEL2x8_2 39, 0 + KERNEL2x8_2 40, 0 + KERNEL2x8_2 41, 0 + KERNEL2x8_2 42, 0 + KERNEL2x8_2 43, 0 + KERNEL2x8_2 44, 0 + KERNEL2x8_2 45, 0 + KERNEL2x8_2 46, 0 + KERNEL2x8_2 47, 0 + KERNEL2x8_2 48, 0 + KERNEL2x8_2 49, 0 + KERNEL2x8_2 50, 0 + KERNEL2x8_2 51, 0 + KERNEL2x8_2 52, 0 + KERNEL2x8_2 53, 0 + KERNEL2x8_2 54, 0 + KERNEL2x8_2 55, 0 + KERNEL2x8_2 56, 0 + KERNEL2x8_2 57, 0 + KERNEL2x8_2 58, 0 + KERNEL2x8_2 59, 0 + KERNEL2x8_2 60, 0 + KERNEL2x8_2 61, 0 + KERNEL2x8_2 62, 0 + KERNEL2x8_2 63, 1 + bdz ZGEMM_L2x8_LOOP_END + b ZGEMM_L2x8_LOOP + MY_ALIGN + +ZGEMM_L2x8_LOOP_END: +/*----------------------------------------*/ + KERNEL2x8_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_2x4_LMAIN_SUB: 
+/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x4_LOOP: +/*----------------------------------------*/ + KERNEL2x4_2 0, 0 +ZGEMM_L2x4_K32: +/*----------------------------------------*/ + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 0 + KERNEL2x4_2 4, 0 + KERNEL2x4_2 5, 0 + KERNEL2x4_2 6, 0 + KERNEL2x4_2 7, 0 + KERNEL2x4_2 8, 0 + KERNEL2x4_2 9, 0 + KERNEL2x4_2 10, 0 + KERNEL2x4_2 11, 0 + KERNEL2x4_2 12, 0 + KERNEL2x4_2 13, 0 + KERNEL2x4_2 14, 0 + KERNEL2x4_2 15, 1 + bdnz ZGEMM_L2x4_LOOP + MY_ALIGN +ZGEMM_L2x4_LOOP_END: +/*----------------------------------------*/ + KERNEL2x4_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_2x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN +ZGEMM_L2x2_LOOP: +/*----------------------------------------*/ + KERNEL2x2_2 0, 0 +ZGEMM_L2x2_K32: +/*----------------------------------------*/ + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 0 + KERNEL2x2_2 4, 0 + KERNEL2x2_2 5, 0 + KERNEL2x2_2 6, 0 + KERNEL2x2_2 7, 0 + KERNEL2x2_2 8, 0 + KERNEL2x2_2 9, 0 + KERNEL2x2_2 10, 0 + KERNEL2x2_2 11, 0 + KERNEL2x2_2 12, 0 + KERNEL2x2_2 13, 0 + KERNEL2x2_2 14, 0 + KERNEL2x2_2 15, 1 + bdnz ZGEMM_L2x2_LOOP + MY_ALIGN + + +ZGEMM_L2x2_LOOP_END: +/*----------------------------------------*/ + KERNEL2x2_2 0, 1 + blr + MY_ALIGN + +ZGEMM_2x1_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + LOAD2x1_2 + MY_ALIGN +ZGEMM_L2x1_LOOP: +/*----------------------------------------*/ + KERNEL2x1_L2 32, 64, 0, 0 +ZGEMM_L2x1_K32: +/*----------------------------------------*/ + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_L2 32, 64, 3, 0 + KERNEL2x1_L2 32, 64, 4, 0 + KERNEL2x1_L2 32, 64, 5, 0 + KERNEL2x1_L2 32, 64, 6, 0 + KERNEL2x1_L2 32, 64, 7, 0 + KERNEL2x1_L2 32, 64, 8, 0 + KERNEL2x1_L2 32, 64, 9, 0 + KERNEL2x1_L2 32, 64, 10, 0 + KERNEL2x1_L2 32, 64, 11, 0 + KERNEL2x1_L2 32, 64, 12, 0 + KERNEL2x1_L2 32, 64, 13, 0 + KERNEL2x1_L2 32, 64, 14, 0 + KERNEL2x1_L2 32, 64, 15, 
1 + bdnz ZGEMM_L2x1_LOOP + MY_ALIGN +ZGEMM_L2x1_LOOP_END: +/*----------------------------------------*/ + END2x1_2 + blr + + MY_ALIGN + + +/* MAIN LOOP BEGINS */ + MY_ALIGN + + +ZGEMM_L2: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) && !defined(LEFT) + neg TEMP_REG, OFFSET +#endif + srawi. J, N, 1 /* J <- N >> 1; the section is skipped unless the result is positive */ + bgt ZGEMM_L2_BEGIN + b ZGEMM_L2_END + +ZGEMM_L2_BEGIN: +/*----------------------------------------*/ + mr CO, C + slwi T1, LDC, 1 /* T1 <- 2 * LDC, added to C below */ + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 /* I <- M >> 3; the 2x8 loop is skipped unless the result is positive */ + bgt ZGEMM_L2_BEGIN_CONTINUE + b ZGEMM_L2x8_END + +ZGEMM_L2_BEGIN_CONTINUE: + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L2x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 8, 2 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 8, 2 + mr T1, T6 +#else + mr T1, K +#endif +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 /* T1 <- K - 2 (T6 - 2 in TRMM builds) */ +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* T8 <- T1 >> 7 (arithmetic shift, i.e. T1 / 128 rounded down), not a remainder; the ble after the two macros tests this result - NOTE(review): assumes the macros leave CR0 untouched */ + + KERNEL2x8_PRELOAD + KERNEL2x8_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x8_SUB0 + bl ZGEMM_L2x8_LMAIN_SUB + andi. L, T1, 127 /* L <- T1 & 127: the remainder that the shift above dropped */ + + bgt ZGEMM_L2x8_BEGIN_CONTINUE + b ZGEMM_L2x8_SAVE + +ZGEMM_L2x8_BEGIN_CONTINUE: + b ZGEMM_L2x8_SUB2 + + +ZGEMM_L2x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6, 129 +#else + andi. 
L, K, 255 + cmpwi K, 129 +#endif + li T8, 1 + bne CMP2x8_128K + LOAD_END_2x8 128, 32 + KERNEL2x8_PRELOAD + addi BO, BO, -64 + addi AO,AO, -256 + mtctr T8 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + +CMP2x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 128 +#else + cmpwi K, 128 +#endif + bne ZGEMM_L2x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -256 + bl ZGEMM_L2x8_K128 + b ZGEMM_L2x8_SAVE + MY_ALIGN + + +ZGEMM_L2x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L2x8_SUB2_32 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 0 + KERNEL2x8_2 16, 0 + KERNEL2x8_2 17, 0 + KERNEL2x8_2 18, 0 + KERNEL2x8_2 19, 0 + KERNEL2x8_2 20, 0 + KERNEL2x8_2 21, 0 + KERNEL2x8_2 22, 0 + KERNEL2x8_2 23, 0 + KERNEL2x8_2 24, 0 + KERNEL2x8_2 25, 0 + KERNEL2x8_2 26, 0 + KERNEL2x8_2 27, 0 + KERNEL2x8_2 28, 0 + KERNEL2x8_2 29, 0 + KERNEL2x8_2 30, 0 + KERNEL2x8_2 31, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_32: +/*----------------------------------------*/ + andi. T1,L, 32 + ble ZGEMM_L2x8_SUB2_16 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL2x8_2 8, 0 + KERNEL2x8_2 9, 0 + KERNEL2x8_2 10, 0 + KERNEL2x8_2 11, 0 + dcbt BO, T4 + KERNEL2x8_2 12, 0 + KERNEL2x8_2 13, 0 + KERNEL2x8_2 14, 0 + KERNEL2x8_2 15, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_16: +/*----------------------------------------*/ + andi. 
T1,L, 16 + ble ZGEMM_L2x8_SUB2_8 + dcbt AO, PRE + dcbt BO, PRE + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + dcbt AO, T2 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL2x8_2 4, 0 + KERNEL2x8_2 5, 0 + dcbt AO, T4 + KERNEL2x8_2 6, 0 + KERNEL2x8_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x8_SUB2_4 + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 0 + KERNEL2x8_2 2, 0 + KERNEL2x8_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x8_SUB2_2 + KERNEL2x8_2 0, 0 + KERNEL2x8_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x8_SUB2_1 + KERNEL2x8_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x8_SAVE + LOAD_END_2x8 128, 32 + + +ZGEMM_L2x8_SAVE: +/*----------------------------------------*/ + addic. I, I, -1 /* I <- I - 1; the ble below tests this result - NOTE(review): assumes the macros in between leave CR0 untouched */ + KERNEL2x8_UNPRIME_MMA + SAVE2x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 8, 2 +#endif + + ble ZGEMM_L2x8_SAVE_CONTINUE + b ZGEMM_L2x8_BEGIN + +ZGEMM_L2x8_SAVE_CONTINUE: + andi. T2, M, 7 + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END + b ZGEMM_L2x4_BEGIN + MY_ALIGN + + +ZGEMM_L2x8_END: +/*----------------------------------------*/ + + +ZGEMM_L2x4_BEGIN: +/*----------------------------------------*/ + andi. T2, M, 7 /* nothing left in M & 7: skip the 4/2/1-wide tails */ + ble ZGEMM_L2x1_END + andi. T1, M, 4 + ble ZGEMM_L2x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 4, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 4, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 <- (T6-2) >> 5, i.e. (T6-2) / 32 rounded down; not a remainder */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 <- (K-2) >> 5, i.e. (K-2) / 32 rounded down; not a remainder */ +#endif + KERNEL2x4_PRELOAD + KERNEL2x4_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x4_SUB0 + bl ZGEMM_2x4_LMAIN_SUB + andi. 
L, T1, 31 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 + + +ZGEMM_L2x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x4_32K + LOAD_END_2x4 64, 32 + KERNEL2x4_PRELOAD + addi BO, BO, -64 + addi AO,AO, -128 + mtctr T8 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + CMP2x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -128 + bl ZGEMM_L2x4_K32 + b ZGEMM_L2x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x4_SUB2_8 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 0 + KERNEL2x4_2 4, 0 + KERNEL2x4_2 5, 0 + KERNEL2x4_2 6, 0 + KERNEL2x4_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x4_SUB2_4 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 0 + KERNEL2x4_2 2, 0 + KERNEL2x4_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x4_SUB2_2 + KERNEL2x4_2 0, 0 + KERNEL2x4_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x4_SUB2_1 + KERNEL2x4_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x4_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x4_SAVE + LOAD_END_2x4 64, 32 + + +ZGEMM_L2x4_SAVE: +/*----------------------------------------*/ + KERNEL2x4_UNPRIME_MMA + SAVE2x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 4, 2 +#endif + + +ZGEMM_L2x4_END: +/*----------------------------------------*/ + + +ZGEMM_L2x2_BEGIN: +/*----------------------------------------*/ + andi. 
T1, M, 2 + ble ZGEMM_L2x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 2, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 2, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 <- (T6-2) >> 5, i.e. (T6-2) / 32 rounded down; not a remainder */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 <- (K-2) >> 5, i.e. (K-2) / 32 rounded down; not a remainder */ +#endif + KERNEL2x2_PRELOAD + KERNEL2x2_ZERO_AND_PRIME_MMA + ble ZGEMM_L2x2_SUB0 + bl ZGEMM_2x2_LMAIN_SUB + andi. L, T1, 31 /* L <- T1 & 31: the remainder that the shift above dropped */ + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 + + +ZGEMM_L2x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x2_32K + LOAD_END_2x2 32, 32 + KERNEL2x2_PRELOAD + addi BO, BO, -64 + addi AO,AO, -64 + mtctr T8 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + CMP2x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -64 + bl ZGEMM_L2x2_K32 + b ZGEMM_L2x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x2_SUB2_8 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 0 + KERNEL2x2_2 4, 0 + KERNEL2x2_2 5, 0 + KERNEL2x2_2 6, 0 + KERNEL2x2_2 7, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L2x2_SUB2_4 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 0 + KERNEL2x2_2 2, 0 + KERNEL2x2_2 3, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x2_SUB2_2 + KERNEL2x2_2 0, 0 + KERNEL2x2_2 1, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x2_SUB2_1 + KERNEL2x2_2 0, 1 + MY_ALIGN + + +ZGEMM_L2x2_SUB2_1: +/*----------------------------------------*/ + andi. 
T1,L, 1 + ble ZGEMM_L2x2_SAVE + LOAD_END_2x2 32, 32 + + +ZGEMM_L2x2_SAVE: +/*----------------------------------------*/ + KERNEL2x2_UNPRIME_MMA + SAVE2x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 2, 2 +#endif + + +ZGEMM_L2x2_END: +/*----------------------------------------*/ + + +ZGEMM_L2x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L2x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 1, 2 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 1, 2 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = (T6-2) >> 5 (divide by 32, not modulo) */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = (K-2) >> 5 (divide by 32, not modulo) */ +#endif + ZERO2x1 + ble ZGEMM_L2x1_SUB0 + bl ZGEMM_2x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 + + +ZGEMM_L2x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP2x1_32K + addi BO, BO, -32 + addi AO,AO, -16 + LOAD2x1O 16, 32 + END2x1_WITHOUT_ADD + LOAD2x1_2O 32, 64 + mtctr T8 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + CMP2x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L2x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -64 + addi AO,AO, -32 + LOAD2x1_2O 32, 64 + bl ZGEMM_L2x1_K32 + b ZGEMM_L2x1_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L2x1_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L2x1_SUB2_8 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_L2 32, 64, 3, 0 + KERNEL2x1_L2 32, 64, 4, 0 + KERNEL2x1_L2 32, 64, 5, 0 + KERNEL2x1_L2 32, 64, 6, 0 + KERNEL2x1_E2 32, 64, 7, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_8: +/*----------------------------------------*/ + andi.
T1,L, 8 + ble ZGEMM_L2x1_SUB2_4 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_L2 32, 64, 1, 0 + KERNEL2x1_L2 32, 64, 2, 0 + KERNEL2x1_E2 32, 64, 3, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L2x1_SUB2_2 + LOAD2x1_2 + KERNEL2x1_L2 32, 64, 0, 0 + KERNEL2x1_E2 32, 64, 1, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L2x1_SUB2_1 + LOAD2x1_2 + KERNEL2x1_E2 32, 64, 0, 1 + MY_ALIGN + + +ZGEMM_L2x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L2x1_SAVE + KERNEL2x1 + + +ZGEMM_L2x1_SAVE: +/*----------------------------------------*/ + SAVE2x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 1, 2 +#endif + + +ZGEMM_L2x1_END: +/* B += K << 5 bytes and J -= 1; the ble/b pair below tests CR0 from addic. and loops to ZGEMM_L2_BEGIN while J > 0 */ + slwi T1, K, 5 + addic. J, J, -1 + add B, B, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 2 +#endif + ble ZGEMM_L2_END + b ZGEMM_L2_BEGIN + +ZGEMM_L2_END: + +b ZGEMM_L1 +/* MINI SUBROUTINES */ +/* 1x8 MAIN 128x+2 LOOP */ + + +ZGEMM_L1x8_LMAIN_SUB: +/* subroutine entry: CTR is loaded from T8 before the unrolled loop body */ + mtctr T8 + MY_ALIGN +ZGEMM_L1x8_LOOP: +/*----------------------------------------*/ + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 +ZGEMM_L1x8_K128: +/*----------------------------------------*/ + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 0 + KERNEL1x8_2 16, 0 + KERNEL1x8_2 17, 0 + KERNEL1x8_2 18, 0 + KERNEL1x8_2 19, 0 + KERNEL1x8_2 20, 0 + KERNEL1x8_2 21, 0 + KERNEL1x8_2 22, 0 + KERNEL1x8_2 23, 0 + KERNEL1x8_2 24, 0 + KERNEL1x8_2 25, 0 + KERNEL1x8_2 26, 0 +
KERNEL1x8_2 27, 0 + KERNEL1x8_2 28, 0 + KERNEL1x8_2 29, 0 + KERNEL1x8_2 30, 0 + KERNEL1x8_2 31, 0 + KERNEL1x8_2 32, 0 + KERNEL1x8_2 33, 0 + KERNEL1x8_2 34, 0 + KERNEL1x8_2 35, 0 + KERNEL1x8_2 36, 0 + KERNEL1x8_2 37, 0 + KERNEL1x8_2 38, 0 + KERNEL1x8_2 39, 0 + KERNEL1x8_2 40, 0 + KERNEL1x8_2 41, 0 + KERNEL1x8_2 42, 0 + KERNEL1x8_2 43, 0 + KERNEL1x8_2 44, 0 + KERNEL1x8_2 45, 0 + KERNEL1x8_2 46, 0 + KERNEL1x8_2 47, 0 + KERNEL1x8_2 48, 0 + KERNEL1x8_2 49, 0 + KERNEL1x8_2 50, 0 + KERNEL1x8_2 51, 0 + KERNEL1x8_2 52, 0 + KERNEL1x8_2 53, 0 + KERNEL1x8_2 54, 0 + KERNEL1x8_2 55, 0 + KERNEL1x8_2 56, 0 + KERNEL1x8_2 57, 0 + KERNEL1x8_2 58, 0 + KERNEL1x8_2 59, 0 + KERNEL1x8_2 60, 0 + KERNEL1x8_2 61, 0 + KERNEL1x8_2 62, 0 + KERNEL1x8_2 63, 1 + bdnz ZGEMM_L1x8_LOOP + MY_ALIGN +ZGEMM_L1x8_LOOP_END: +/*----------------------------------------*/ + KERNEL1x8_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x4_LMAIN_SUB: +/* subroutine: CTR = T8; each loop pass issues 16 KERNEL1x4_2 calls; after the loop one more KERNEL1x4_2 runs and blr returns to the caller */ + mtctr T8 + MY_ALIGN + + +ZGEMM_L1x4_LOOP: +/*----------------------------------------*/ + KERNEL1x4_2 0, 0 + + +ZGEMM_L1x4_K32: +/*----------------------------------------*/ + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 0 + KERNEL1x4_2 4, 0 + KERNEL1x4_2 5, 0 + KERNEL1x4_2 6, 0 + KERNEL1x4_2 7, 0 + KERNEL1x4_2 8, 0 + KERNEL1x4_2 9, 0 + KERNEL1x4_2 10, 0 + KERNEL1x4_2 11, 0 + KERNEL1x4_2 12, 0 + KERNEL1x4_2 13, 0 + KERNEL1x4_2 14, 0 + KERNEL1x4_2 15, 1 + bdnz ZGEMM_L1x4_LOOP + MY_ALIGN + + +ZGEMM_L1x4_LOOP_END: +/*----------------------------------------*/ + KERNEL1x4_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x2_LMAIN_SUB: +/*----------------------------------------*/ + mtctr T8 + MY_ALIGN + + +ZGEMM_L1x2_LOOP: +/*----------------------------------------*/ + KERNEL1x2_2 0, 0 + + +ZGEMM_L1x2_K32: +/*----------------------------------------*/ + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 0 + KERNEL1x2_2 4, 0 + KERNEL1x2_2 5, 0 + KERNEL1x2_2 6, 0 + KERNEL1x2_2 7, 0 + KERNEL1x2_2 8, 0 + KERNEL1x2_2 9, 0 + KERNEL1x2_2 10, 0 +
KERNEL1x2_2 11, 0 + KERNEL1x2_2 12, 0 + KERNEL1x2_2 13, 0 + KERNEL1x2_2 14, 0 + KERNEL1x2_2 15, 1 + bdnz ZGEMM_L1x2_LOOP + MY_ALIGN + + +ZGEMM_L1x2_LOOP_END: +/*----------------------------------------*/ + KERNEL1x2_2 0, 1 + blr + MY_ALIGN + + +ZGEMM_1x1_LMAIN_SUB: +/* subroutine: CTR = T8, LOAD1x1_2 once, then 16 KERNEL1x1_L2 calls per loop pass; ends with END1x1_2 and blr */ + mtctr T8 + LOAD1x1_2 + MY_ALIGN + + +ZGEMM_L1x1_LOOP: +/*----------------------------------------*/ + KERNEL1x1_L2 32, 32, 0, 0 + + +ZGEMM_L1x1_K32: +/*----------------------------------------*/ + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_L2 32, 32, 3, 0 + KERNEL1x1_L2 32, 32, 4, 0 + KERNEL1x1_L2 32, 32, 5, 0 + KERNEL1x1_L2 32, 32, 6, 0 + KERNEL1x1_L2 32, 32, 7, 0 + KERNEL1x1_L2 32, 32, 8, 0 + KERNEL1x1_L2 32, 32, 9, 0 + KERNEL1x1_L2 32, 32, 10, 0 + KERNEL1x1_L2 32, 32, 11, 0 + KERNEL1x1_L2 32, 32, 12, 0 + KERNEL1x1_L2 32, 32, 13, 0 + KERNEL1x1_L2 32, 32, 14, 0 + KERNEL1x1_L2 32, 32, 15, 1 + bdnz ZGEMM_L1x1_LOOP + MY_ALIGN + + +ZGEMM_L1x1_LOOP_END: +/*----------------------------------------*/ + END1x1_2 + blr + MY_ALIGN + + +/*----------------------N1 BEGINS---------*/ +ZGEMM_L1: +/* the N=1 section runs only when bit 0 of N is set; otherwise branch to ZGEMM_L1_END */ + andi. T1, N, 1 + ble ZGEMM_L1_END + +ZGEMM_L1_BEGIN: +/*----------------------------------------*/ + mr CO, C + + add T2,C,LDC + mr AO, A + add C, C, T1 +#if defined(TRMMKERNEL) && defined(LEFT) + mr TEMP_REG, OFFSET /*off = offset;*/ +#endif + srawi. I, M, 3 + ble ZGEMM_L1x8_END + dcbt CO,r0 /*just prefetch*/ + dcbt T2,r0 + + +ZGEMM_L1x8_BEGIN: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 8, 1 +#else + mr BO, B + dcbt B, r0 +#endif + dcbt AO, r0 +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 8, 1 + mr T1, T6 +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi.
T8, T1, 7 /* T8 = (T6-2) >> 7 (divide by 128, not modulo) */ +#else + mr T1, K +/* TEMPS FOR PREFETCH */ + li T2, 1024 + li T3, 1024+512 + addi T1,T1, -2 +/* TEMPS FOR PREFETCH */ + li T4, 2048 + li T5, 2048+512 + srawi. T8, T1, 7 /* T8 = (K-2) >> 7 (divide by 128, not modulo) */ +#endif + KERNEL1x8_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x8_SUB0 + bl ZGEMM_L1x8_LMAIN_SUB + andi. L, T1, 127 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 + + +ZGEMM_L1x8_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 255 + cmpwi T6, 129 +#else + andi. L, K, 255 + cmpwi K, 129 +#endif + li T8, 1 + bne CMP1x8_128K + LOAD_END_1x8 -128, -16 + mtctr T8 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + CMP1x8_128K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 128 +#else + cmpwi K, 128 +#endif + bne ZGEMM_L1x8_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -256 + bl ZGEMM_L1x8_K128 + b ZGEMM_L1x8_SAVE + MY_ALIGN + + +ZGEMM_L1x8_SUB2: +/*----------------------------------------*/ + andi. T1,L, 64 + ble ZGEMM_L1x8_SUB2_32 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 0 + KERNEL1x8_2 16, 0 + KERNEL1x8_2 17, 0 + KERNEL1x8_2 18, 0 + KERNEL1x8_2 19, 0 + KERNEL1x8_2 20, 0 + KERNEL1x8_2 21, 0 + KERNEL1x8_2 22, 0 + KERNEL1x8_2 23, 0 + KERNEL1x8_2 24, 0 + KERNEL1x8_2 25, 0 + KERNEL1x8_2 26, 0 + KERNEL1x8_2 27, 0 + KERNEL1x8_2 28, 0 + KERNEL1x8_2 29, 0 + KERNEL1x8_2 30, 0 + KERNEL1x8_2 31, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_32: +/*----------------------------------------*/ + andi.
T1,L, 32 + ble ZGEMM_L1x8_SUB2_16 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 0 + dcbt AO, T5 + dcbt BO, T3 + KERNEL1x8_2 8, 0 + KERNEL1x8_2 9, 0 + KERNEL1x8_2 10, 0 + KERNEL1x8_2 11, 0 + dcbt BO, T4 + KERNEL1x8_2 12, 0 + KERNEL1x8_2 13, 0 + KERNEL1x8_2 14, 0 + KERNEL1x8_2 15, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_16: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x8_SUB2_8 + dcbt AO, PRE + dcbt BO, PRE + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + dcbt AO, T2 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 0 + dcbt AO, T3 + dcbt BO, T2 + KERNEL1x8_2 4, 0 + KERNEL1x8_2 5, 0 + dcbt AO, T4 + KERNEL1x8_2 6, 0 + KERNEL1x8_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x8_SUB2_4 + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 0 + KERNEL1x8_2 2, 0 + KERNEL1x8_2 3, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x8_SUB2_2 + KERNEL1x8_2 0, 0 + KERNEL1x8_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x8_SUB2_1 + KERNEL1x8_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x8_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x8_SAVE + LOAD_END_1x8 128, 16 + + +ZGEMM_L1x8_SAVE: +/* I is decremented first; the bgt after the save tests CR0 from this addic. - assumes the macros in between leave CR0 untouched, TODO confirm */ + addic. I, I, -1 + KERNEL1x8_UNPRIME_MMA + SAVE1x8 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 8, 1 +#endif + bgt ZGEMM_L1x8_BEGIN + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi. T1, M, 4 + ble ZGEMM_L1x4_END + b ZGEMM_L1x4_BEGIN + MY_ALIGN + + +ZGEMM_L1x8_END: +/*----------------------------------------*/ + + +ZGEMM_L1x4_BEGIN: +/* M & 7 == 0: nothing left for this column, go to ZGEMM_L1x1_END; bit 2 of M clear: skip the 1x4 tile */ + andi. T2, M, 7 + ble ZGEMM_L1x1_END + andi.
T1, M, 4 + ble ZGEMM_L1x4_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 4, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 4, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = (T6-2) >> 5 (divide by 32, not modulo) */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = (K-2) >> 5 (divide by 32, not modulo) */ +#endif + KERNEL1x4_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x4_SUB0 + bl ZGEMM_1x4_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 + + +ZGEMM_L1x4_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x4_32K + LOAD_END_1x4 -64, -16 + mtctr T8 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + CMP1x4_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x4_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -128 + bl ZGEMM_L1x4_K32 + b ZGEMM_L1x4_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x4_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x4_SUB2_8 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 0 + KERNEL1x4_2 4, 0 + KERNEL1x4_2 5, 0 + KERNEL1x4_2 6, 0 + KERNEL1x4_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x4_SUB2_4 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 0 + KERNEL1x4_2 2, 0 + KERNEL1x4_2 3, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x4_SUB2_2 + KERNEL1x4_2 0, 0 + KERNEL1x4_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x4_SUB2_1 + KERNEL1x4_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x4_SUB2_1: +/*----------------------------------------*/ + andi.
T1,L, 1 + ble ZGEMM_L1x4_SAVE + LOAD_END_1x4 64,16 + + + +ZGEMM_L1x4_SAVE: +/*----------------------------------------*/ + KERNEL1x4_UNPRIME_MMA + SAVE1x4 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 4, 1 +#endif + + +ZGEMM_L1x4_END: +/*----------------------------------------*/ + + +ZGEMM_L1x2_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 2 + ble ZGEMM_L1x2_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 2, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 2, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = (T6-2) >> 5 (divide by 32, not modulo) */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = (K-2) >> 5 (divide by 32, not modulo) */ +#endif + KERNEL1x2_ZERO_AND_PRIME_MMA + ble ZGEMM_L1x2_SUB0 + bl ZGEMM_1x2_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 + + +ZGEMM_L1x2_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x2_32K + LOAD_END_1x2 -32, -16 + mtctr T8 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + CMP1x2_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x2_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -64 + bl ZGEMM_L1x2_K32 + b ZGEMM_L1x2_SAVE + MY_ALIGN + MY_ALIGN + + +ZGEMM_L1x2_SUB2: +/*----------------------------------------*/ + andi. T1,L, 16 + ble ZGEMM_L1x2_SUB2_8 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 0 + KERNEL1x2_2 4, 0 + KERNEL1x2_2 5, 0 + KERNEL1x2_2 6, 0 + KERNEL1x2_2 7, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_8: +/*----------------------------------------*/ + andi. T1,L, 8 + ble ZGEMM_L1x2_SUB2_4 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 0 + KERNEL1x2_2 2, 0 + KERNEL1x2_2 3, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_4: +/*----------------------------------------*/ + andi.
T1,L, 4 + ble ZGEMM_L1x2_SUB2_2 + KERNEL1x2_2 0, 0 + KERNEL1x2_2 1, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x2_SUB2_1 + KERNEL1x2_2 0, 1 + MY_ALIGN + + +ZGEMM_L1x2_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x2_SAVE + LOAD_END_1x2 32,16 + + +ZGEMM_L1x2_SAVE: +/*----------------------------------------*/ + KERNEL1x2_UNPRIME_MMA + SAVE1x2 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 2, 1 +#endif + + +ZGEMM_L1x2_END: +/*----------------------------------------*/ + + +ZGEMM_L1x1_BEGIN: +/*----------------------------------------*/ + andi. T1, M, 1 + ble ZGEMM_L1x1_END +#if defined(TRMMKERNEL) + REFRESH_POINTERS AO, BO,TEMP_REG, B, 1, 1 +#else + mr BO, B +#endif +#if defined(TRMMKERNEL) + REFRESH_TEMP_BK T6,K,TEMP_REG, 1, 1 + mr T1, T6 + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = (T6-2) >> 5 (divide by 32, not modulo) */ +#else + mr T1, K + addi T1,T1, -2 + srawi. T8, T1, 5 /* T8 = (K-2) >> 5 (divide by 32, not modulo) */ +#endif + ZERO1x1 + ble ZGEMM_L1x1_SUB0 + bl ZGEMM_1x1_LMAIN_SUB + andi. L, T1, 31 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 + + +ZGEMM_L1x1_SUB0: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + andi. L, T6, 63 + cmpwi T6, 33 +#else + andi. L, K, 63 + cmpwi K, 33 +#endif + li T8, 1 + bne CMP1x1_32K + addi BO, BO, -16 + addi AO,AO, -16 + LOAD1x1O 16, 16 + END1x1_WITHOUT_ADD + LOAD1x1_2O 32, 32 + mtctr T8 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + CMP1x1_32K: +/*----------------------------------------*/ +#if defined(TRMMKERNEL) + cmpwi T6, 32 +#else + cmpwi K, 32 +#endif + bne ZGEMM_L1x1_SUB2 + MY_ALIGN + mtctr T8 + addi BO, BO, -32 + addi AO,AO, -32 + LOAD1x1_2O 32, 32 + bl ZGEMM_L1x1_K32 + b ZGEMM_L1x1_SAVE + MY_ALIGN + + +ZGEMM_L1x1_SUB2: +/*----------------------------------------*/ + andi.
T1, L, 16 + ble ZGEMM_L1x1_SUB2_8 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_L2 32, 32, 3, 0 + KERNEL1x1_L2 32, 32, 4, 0 + KERNEL1x1_L2 32, 32, 5, 0 + KERNEL1x1_L2 32, 32, 6, 0 + KERNEL1x1_E2 32, 32, 7, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_8: +/*----------------------------------------*/ + andi. T1, L, 8 + ble ZGEMM_L1x1_SUB2_4 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_L2 32, 32, 1, 0 + KERNEL1x1_L2 32, 32, 2, 0 + KERNEL1x1_E2 32, 32, 3, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_4: +/*----------------------------------------*/ + andi. T1,L, 4 + ble ZGEMM_L1x1_SUB2_2 + LOAD1x1_2 + KERNEL1x1_L2 32, 32, 0, 0 + KERNEL1x1_E2 32, 32, 1, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_2: +/*----------------------------------------*/ + andi. T1,L, 2 + ble ZGEMM_L1x1_SUB2_1 + LOAD1x1_2 + KERNEL1x1_E2 32, 32, 0, 1 + MY_ALIGN + + +ZGEMM_L1x1_SUB2_1: +/*----------------------------------------*/ + andi. T1,L, 1 + ble ZGEMM_L1x1_SAVE + KERNEL1x1 + + +ZGEMM_L1x1_SAVE: +/*----------------------------------------*/ + SAVE1x1 +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE T6,K,TEMP_REG, BO,AO, 1, 1 +#endif + + +ZGEMM_L1x1_END: +/* end of the N=1 column; TEMP_REG is advanced by 1 only for TRMMKERNEL built without LEFT */ +#if defined(TRMMKERNEL) && !defined(LEFT) + addi TEMP_REG, TEMP_REG, 1 +#endif + + +ZGEMM_L1_END: +/*----------------------------------------*/ diff --git a/kernel/power/zgemm_macros_power10.S b/kernel/power/zgemm_macros_power10.S new file mode 100644 index 0000000000..42f9c5ad48 --- /dev/null +++ b/kernel/power/zgemm_macros_power10.S @@ -0,0 +1,1138 @@ +/*************************************************************************** +Copyright (c) 2013-2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define unit_size 16 +#define DISP32(ind,disp) (ind*unit_size*32+disp) +#define DISP16(ind,disp) (ind*unit_size*16+disp) +#define DISP8(ind,disp) (ind*unit_size*8+disp) +#define DISP4(ind,disp) (ind*unit_size*4+disp) +#define DISP2(ind,disp) (ind*unit_size*2+disp) +#define DISP1(ind,disp) (ind*unit_size+disp) +#define DISPX(disp) (disp) +/* HELPERS FOR SAVE */ +/* load {r0,i0} and {r1,i1} from REG+LOFFSET and REG+LOFFSET+16, rearrange into {r0,r1} {i0,i1}; expands to nothing when TRMMKERNEL is defined */ + + +.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET +#ifndef TRMMKERNEL + lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) + lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) + xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 + xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 +#endif +.endm +/* from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi}, pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi} */ + + +.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ +.endm +/* from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br}, pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br} */ + + +.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ + xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ +.endm +/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}; "op" is add or subtract depending on the NN/CN/NC/CC variant selected below */ + + +.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR + xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2 +#else // CC || CR || RC || RR + /*we will assume
{-alpha_r,-alpha_i} for this case */ + /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ + xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1 + /*we will negate alpha imaginary instead to fix sign*/ + xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI +#endif +.endm +/* VSOUT1 = {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ; VSOUT2 = VSOUT2 + {r0,r1} * {alpha_i,alpha_i} ; under TRMMKERNEL both are plain products (xvmuldp) */ + + +.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 +#ifndef TRMMKERNEL + xvmsubadp \VSOUT1,\VSINII, alpha_i + xvmaddadp \VSOUT2,\VSINRR, alpha_i +#else + xvmuldp \VSOUT1,\VSINII, alpha_i + xvmuldp \VSOUT2,\VSINRR, alpha_i +#endif +.endm +/* VSOUT1 = {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ; VSOUT2 = VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ + + +.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 + xvmsubadp \VSOUT1,\VSINRR, alpha_r + xvmaddadp \VSOUT2,\VSINII, alpha_r +.endm +/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ + + +.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 + xxmrghd \VSOUT1,\VSIN2,\VSIN1 + xxmrgld \VSOUT2,\VSIN2,\VSIN1 +.endm + + +.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 + stxv \VSIN1, DISPX(\LOFFSET)(\REG) + stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) +.endm + + +.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39 + LOAD_COUPLE_AS_RR_II vs56,vs57,vs50,vs51,\BASE_REG,(\LOFFSET +64) + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41 + LOAD_COUPLE_AS_RR_II vs58,vs59,vs52,vs53,\BASE_REG,(\LOFFSET+96) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs42,vs43 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs44,vs45 + AGGREGATE_REALS_IMAGES
vs38,vs39,vs40,vs41 + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 + MULT_APLHA_PART1 vs38,vs40,vs48,vs49 + MULT_APLHA_PART2 vs34,vs36,vs46,vs47 + AGGREGATE_REALS_IMAGES vs42,vs43,vs44,vs45 + MULT_APLHA_PART2 vs38,vs40,vs48,vs49 + AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + MULT_APLHA_PART1 vs42,vs44, vs56,vs57 + UNPACK_FOR_STORE vs48,vs49,vs35,vs37 + MULT_APLHA_PART1 \VSRes1,\VSRes3, vs58,vs59 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 + MULT_APLHA_PART2 vs42,vs44,vs56,vs57 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37 + MULT_APLHA_PART2 \VSRes1,\VSRes3, vs58,vs59 + UNPACK_FOR_STORE vs56,vs57,vs42,vs44 + UNPACK_FOR_STORE vs58,vs59,\VSRes1,\VSRes3 + STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs42,vs44 + STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 +.endm + + +.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32) + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39 + RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART1 vs38,vs40, vs48,vs49 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs38,vs40,vs48,vs49 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + UNPACK_FOR_STORE vs48,vs49,vs35,vs37 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 + STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37 /* SAVE4 stores two couples, at LOFFSET and LOFFSET+32; vs34-vs41 and vs46-vs53 are used as scratch */ +.endm + + +.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35 + LOAD_COUPLE_AS_RR_II
vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41 +.endm + + +.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET + RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35 +#ifndef TRMMKERNEL + lxv vs50, (\LOFFSET)(\BASE_REG) + xxmrgld vs46,vs50,vs50 + xxmrghd vs47,vs50,vs50 +#endif + RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37 + AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 + MULT_APLHA_PART1 vs34,vs36, vs46,vs47 + MULT_APLHA_PART2 vs34,vs36, vs46,vs47 + UNPACK_FOR_STORE vs46,vs47,vs39,vs41 + xxmrghd vs39,vs47,vs46 /* NOTE(review): same merge that UNPACK_FOR_STORE on the previous line already wrote into vs39 - looks redundant, confirm before removing */ + stxv vs39, (\LOFFSET)(\BASE_REG) +.endm + +/********************************************************************************************** +* + +.macros for N=2 and M=8 +**********************************************************************************************/ + +.macro KERNEL2x8_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 + xxsetaccz 4 + xxsetaccz 5 + xxsetaccz 6 + xxsetaccz 7 +.endm + + +.macro KERNEL2x8_PRELOAD + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxvp vs36, 64(AO) // load real,imag from A + lxvp vs38, 96(AO) // load real,imag from A + lxvp vs48, 0(BO) // load real imag from B +.endm + + +.macro KERNEL2x8_2 Index, IsLast + lxvp vs40, DISP16(\Index,128)(AO) // load real,imag from A + lxvp vs42, DISP16(\Index,160)(AO) // load real,imag from A + lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A + lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A + lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 4, vs32, vs48 +
xvf64gerpp 5, vs34, vs48 + xvf64gerpp 6, vs36, vs48 + xvf64gerpp 7, vs38, vs48 + lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A + lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A + lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A + lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A + lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs42, vs51 + xvf64gerpp 2, vs44, vs51 + xvf64gerpp 3, vs46, vs51 + xvf64gerpp 4, vs40, vs50 + xvf64gerpp 5, vs42, vs50 + xvf64gerpp 6, vs44, vs50 + xvf64gerpp 7, vs46, vs50 +.if \IsLast==1 + addi AO, AO, DISP16(\Index,256) + addi BO, BO, DISP4(\Index,64) +.endif +.endm + + +.macro LOAD_END_2x8 OffsetA,OffsetB + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 4, vs32, vs48 + xvf64gerpp 5, vs34, vs48 + xvf64gerpp 6, vs36, vs48 + xvf64gerpp 7, vs38, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL2x8_UNPRIME_MMA + /* "unprime" MMA accumulators 0-7: xxmfacc moves each accumulator back into its vector registers (vs0-vs31, which SAVE2x8 reads) */ + xxmfacc 0 + xxmfacc 1 + xxmfacc 2 + xxmfacc 3 + xxmfacc 4 + xxmfacc 5 + xxmfacc 6 + xxmfacc 7 +.endm + + +.macro SAVE2x8 + add T1, CO ,LDC + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + xxpermdi vs40, vs8, vs9, 0b01 + xxpermdi vs41, vs8, vs9, 0b10 + xxpermdi vs42, vs10, vs11, 0b01 + xxpermdi vs43, vs10, vs11, 0b10 + xxpermdi vs44, vs12, vs13, 0b01 + xxpermdi vs45, vs12, vs13, 0b10 + xxpermdi vs46, vs14, vs15, 0b01 + xxpermdi vs47, vs14, vs15, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + xxlor vs10, vs40, vs40 + xxlor vs11, vs41, vs41 + xxlor
vs8, vs42, vs42 + xxlor vs9, vs43, vs43 + xxlor vs14, vs44, vs44 + xxlor vs15, vs45, vs45 + xxlor vs12, vs46, vs46 + xxlor vs13, vs47, vs47 + + xxpermdi vs32, vs16, vs17, 0b01 + xxpermdi vs33, vs16, vs17, 0b10 + xxpermdi vs34, vs18, vs19, 0b01 + xxpermdi vs35, vs18, vs19, 0b10 + xxpermdi vs36, vs20, vs21, 0b01 + xxpermdi vs37, vs20, vs21, 0b10 + xxpermdi vs38, vs22, vs23, 0b01 + xxpermdi vs39, vs22, vs23, 0b10 + xxpermdi vs40, vs24, vs25, 0b01 + xxpermdi vs41, vs24, vs25, 0b10 + xxpermdi vs42, vs26, vs27, 0b01 + xxpermdi vs43, vs26, vs27, 0b10 + xxpermdi vs44, vs28, vs29, 0b01 + xxpermdi vs45, vs28, vs29, 0b10 + xxpermdi vs46, vs30, vs31, 0b01 + xxpermdi vs47, vs30, vs31, 0b10 + + xxlor vs18, vs32, vs32 + xxlor vs19, vs33, vs33 + xxlor vs16, vs34, vs34 + xxlor vs17, vs35, vs35 + xxlor vs22, vs36, vs36 + xxlor vs23, vs37, vs37 + xxlor vs20, vs38, vs38 + xxlor vs21, vs39, vs39 + xxlor vs26, vs40, vs40 + xxlor vs27, vs41, vs41 + xxlor vs24, vs42, vs42 + xxlor vs25, vs43, vs43 + xxlor vs30, vs44, vs44 + xxlor vs31, vs45, vs45 + xxlor vs28, vs46, vs46 + xxlor vs29, vs47, vs47 + + SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 + SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0 + addi CO, CO, 128 +.endm + +/********************************************************************************************** +* + +.macros for N=2 and M=4 +**********************************************************************************************/ + +.macro KERNEL2x4_ZERO_AND_PRIME_MMA + /* zero out and prime MMA accumulators 0-3 (the four that the 2x4 kernel macros accumulate into) */ + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + + +.macro KERNEL2x4_PRELOAD + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxvp vs48, 0(BO) // load real imag from B +.endm + + +.macro KERNEL2x4_2 Index, IsLast + lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A + lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A +
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 + lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A + lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A + lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs42, vs51 + xvf64gerpp 2, vs40, vs50 + xvf64gerpp 3, vs42, vs50 +.if \IsLast==1 + addi AO, AO, DISP8(\Index,128) + addi BO, BO, DISP4(\Index,64) +.endif +.endm + + +.macro LOAD_END_2x4 OffsetA, OffsetB + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 2, vs32, vs48 + xvf64gerpp 3, vs34, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL2x4_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 + xxmfacc 2 + xxmfacc 3 +.endm + + +.macro SAVE2x4 + add T1, CO ,LDC + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + xxpermdi vs40, vs8, vs9, 0b01 + xxpermdi vs41, vs8, vs9, 0b10 + xxpermdi vs42, vs10, vs11, 0b01 + xxpermdi vs43, vs10, vs11, 0b10 + xxpermdi vs44, vs12, vs13, 0b01 + xxpermdi vs45, vs12, vs13, 0b10 + xxpermdi vs46, vs14, vs15, 0b01 + xxpermdi vs47, vs14, vs15, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + xxlor vs10, vs40, vs40 + xxlor vs11, vs41, vs41 + xxlor vs8, vs42, vs42 + xxlor vs9, vs43, vs43 + xxlor vs14, vs44, vs44 + xxlor vs15, vs45, vs45 + xxlor vs12, vs46, vs46 + xxlor vs13, vs47, vs47 + + SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 + SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0 + addi CO, CO, 64 +.endm + 
+/********************************************************************************************** +* + +.macros for N=2 and M=2 +**********************************************************************************************/ + +.macro KERNEL2x2_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 +.endm + + +.macro KERNEL2x2_PRELOAD + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs48, 0(BO) // load real imag from B +.endm + + +.macro KERNEL2x2_2 Index, IsLast + lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A + lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 + lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A + lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B + xvf64gerpp 0, vs40, vs51 + xvf64gerpp 1, vs40, vs50 +.if \IsLast==1 + addi AO, AO, DISP4(\Index,64) + addi BO, BO, DISP4(\Index,64) +.endif +.endm + + +.macro LOAD_END_2x2 OffsetA,OffsetB + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs32, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL2x2_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 +.endm + + +.macro SAVE2x2 + add T1, CO ,LDC + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + + SAVE2 vs0,vs1,vs2,vs3,CO,0 + SAVE2 vs4,vs5,vs6,vs7,T1,0 + addi CO, CO, 32 +.endm + +/********************************************************************************************** +* + +.macros for N=2 and M=1 
+**********************************************************************************************/ + +.macro ZERO2x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxlxor vs2, vs2, vs2 + xxlxor vs3, vs3, vs3 + +.endm + + +.macro LOAD2x1 + LOAD2x1O 0,0 +.endm + + +.macro LOAD2x1O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs50, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs49, vs48 + xxswapd vs51, vs50 + lxv vs32, (0+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_WITHOUT_ADD + END2x1 AO,BO,0,0 +.endm + + +.macro END2x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs2, vs32, vs50 + xvmaddadp vs1, vs32, vs49 + xvmaddadp vs3, vs32, vs51 +.endm + + +.macro LOAD2x1_2 + LOAD2x1_2O 0,0 +.endm + + +.macro LOAD2x1_2O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs50, (\OffsetB+16)(BO) // load real,imag from B + lxv vs52, (\OffsetB+32)(BO) // load real,imag from B + lxv vs54, (\OffsetB+48)(BO) // load real,imag from B + xxswapd vs49, vs48 + xxswapd vs51, vs50 + lxv vs32, (0+\OffsetA)(AO) // load real,imag from A + lxv vs40, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END2x1_2 + /*for load2 offset will be 32 and 64*/ + KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 +.endm + + +.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs53, vs52 + xxswapd vs55, vs54 + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs2, vs32, vs50 + xvmaddadp vs1, vs32, vs49 + xvmaddadp vs3, vs32, vs51 +.if \Complete==0 + lxv vs32, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + 
lxv vs48, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B + lxv vs50, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \Complete==0 + xxswapd vs49, vs48 + xxswapd vs51, vs50 +.endif + xvmaddadp vs0, vs40, vs52 + xvmaddadp vs2, vs40, vs54 + xvmaddadp vs1, vs40, vs53 + xvmaddadp vs3, vs40, vs55 +.if \Complete==0 + lxv vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs52, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B + lxv vs54, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP4(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP4(\Index,64) +.endif +.endif +.endm + + +.macro KERNEL2x1 + LOAD2x1 + END2x1 AO, BO, 16,32 +.endm + + +.macro SAVE2x1 + add T1, CO ,LDC + SAVE1 vs0,vs1,CO,0 + SAVE1 vs2,vs3,T1,0 + addi CO, CO, 16 +.endm + +/********************************************************************************************** +* + +.macros for N=1 and M=8 +**********************************************************************************************/ + +.macro KERNEL1x8_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 + xxsetaccz 2 + xxsetaccz 3 +.endm + + +.macro KERNEL1x8_2 Index,IsLast + lxvp vs32, DISP16(\Index, 0)(AO) // load real,imag from A + lxvp vs34, DISP16(\Index, 32)(AO) // load real,imag from A + lxvp vs36, DISP16(\Index, 64)(AO) // load real,imag from A + lxvp vs38, DISP16(\Index, 96)(AO) // load real,imag from A + lxvp vs40, DISP16(\Index, 128)(AO) // load real,imag from A + lxvp vs42, DISP16(\Index, 160)(AO) // load real,imag from A + lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A + lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A + lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, 
vs34, vs49 + xvf64gerpp 2, vs36, vs49 + xvf64gerpp 3, vs38, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 + xvf64gerpp 2, vs44, vs48 + xvf64gerpp 3, vs46, vs48 +.if \IsLast==1 + addi AO, AO, DISP16(\Index,256) + addi BO, BO, DISP2(\Index,32) +.endif +.endm + + +.macro LOAD_END_1x8 OffsetA,OffsetB + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxvp vs36, 64(AO) // load real,imag from A + lxvp vs38, 96(AO) // load real,imag from A + lxv vs48, 0(BO) // load real imag from B + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + xvf64gerpp 2, vs36, vs48 + xvf64gerpp 3, vs38, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL1x8_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 + xxmfacc 2 + xxmfacc 3 +.endm + + +.macro SAVE1x8 + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + xxpermdi vs40, vs8, vs9, 0b01 + xxpermdi vs41, vs8, vs9, 0b10 + xxpermdi vs42, vs10, vs11, 0b01 + xxpermdi vs43, vs10, vs11, 0b10 + xxpermdi vs44, vs12, vs13, 0b01 + xxpermdi vs45, vs12, vs13, 0b10 + xxpermdi vs46, vs14, vs15, 0b01 + xxpermdi vs47, vs14, vs15, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + xxlor vs10, vs40, vs40 + xxlor vs11, vs41, vs41 + xxlor vs8, vs42, vs42 + xxlor vs9, vs43, vs43 + xxlor vs14, vs44, vs44 + xxlor vs15, vs45, vs45 + xxlor vs12, vs46, vs46 + xxlor vs13, vs47, vs47 + + SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 + addi CO, CO, 128 +.endm + +/********************************************************************************************** +* + +.macros for N=1 and M=4 
+**********************************************************************************************/ + +.macro KERNEL1x4_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + xxsetaccz 0 + xxsetaccz 1 +.endm + + +.macro KERNEL1x4_2 Index,IsLast + lxvp vs32, DISP8(\Index, 0)(AO) // load real,imag from A + lxvp vs34, DISP8(\Index, 32)(AO) // load real,imag from A + lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A + lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A + lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 1, vs34, vs49 + xvf64gerpp 0, vs40, vs48 + xvf64gerpp 1, vs42, vs48 +.if \IsLast==1 + addi AO, AO, DISP8(\Index,128) + addi BO, BO, DISP2(\Index,32) +.endif +.endm + + +.macro LOAD_END_1x4 OffsetA,OffsetB + lxvp vs32, 0(AO) // load real,imag from A + lxvp vs34, 32(AO) // load real,imag from A + lxv vs48, 0(BO) // load real imag from B + xvf64gerpp 0, vs32, vs48 + xvf64gerpp 1, vs34, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL1x4_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 + xxmfacc 1 +.endm + + +.macro SAVE1x4 + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + xxpermdi vs36, vs4, vs5, 0b01 + xxpermdi vs37, vs4, vs5, 0b10 + xxpermdi vs38, vs6, vs7, 0b01 + xxpermdi vs39, vs6, vs7, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + xxlor vs6, vs36, vs36 + xxlor vs7, vs37, vs37 + xxlor vs4, vs38, vs38 + xxlor vs5, vs39, vs39 + + SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 + addi CO, CO, 64 +.endm + +/********************************************************************************************** +* + +.macros for N=1 and M=2 +**********************************************************************************************/ + +.macro KERNEL1x2_ZERO_AND_PRIME_MMA + /* zero out and prime the MMA accumulators */ + 
xxsetaccz 0 +.endm + + +.macro KERNEL1x2_2 Index,IsLast + lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A + lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A + lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B + xvf64gerpp 0, vs32, vs49 + xvf64gerpp 0, vs40, vs48 +.if \IsLast==1 + addi AO, AO, DISP4(\Index,64) + addi BO, BO, DISP2(\Index,32) +.endif +.endm + + +.macro LOAD_END_1x2 OffsetA,OffsetB + lxvp vs32, 0(AO) // load real,imag from A + lxv vs48, 0(BO) // load real imag from B + xvf64gerpp 0, vs32, vs48 + addi BO, BO, \OffsetB + addi AO, AO, \OffsetA +.endm + + +.macro KERNEL1x2_UNPRIME_MMA + /* "unprime" MMA accumulators */ + xxmfacc 0 +.endm + + +.macro SAVE1x2 + xxpermdi vs32, vs0, vs1, 0b01 + xxpermdi vs33, vs0, vs1, 0b10 + xxpermdi vs34, vs2, vs3, 0b01 + xxpermdi vs35, vs2, vs3, 0b10 + + xxlor vs2, vs32, vs32 + xxlor vs3, vs33, vs33 + xxlor vs0, vs34, vs34 + xxlor vs1, vs35, vs35 + + SAVE2 vs0,vs1,vs2,vs3,CO,0 + addi CO, CO, 32 +.endm + +/********************************************************************************************** +* + +.macros for N=1 and M=1 +**********************************************************************************************/ + +.macro ZERO1x1 + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +.endm + + +.macro LOAD1x1 + LOAD1x1O 0,0 +.endm + + +.macro LOAD1x1O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs32, (0+\OffsetA)(AO) // load real,imag from A + xxswapd vs49, vs48 + +.endm + + +.macro END1x1_WITHOUT_ADD + END1x1 AO,BO,0,0 +.endm + + +.macro END1x1 AREG, BREG, OffsetA, OffsetB +.if \OffsetB != 0 + addi \BREG, \BREG, \OffsetB +.endif +.if \OffsetA != 0 + addi \AREG, \AREG, \OffsetA +.endif + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs1, vs32, vs49 +.endm + + +.macro LOAD1x1_2 + LOAD1x1_2O 0,0 +.endm + + +.macro LOAD1x1_2O OffsetA,OffsetB + lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B + lxv vs52, (\OffsetB+16)(BO) // load real,imag from B + xxswapd vs49, vs48 + + 
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A + lxv vs40, (16+\OffsetA)(AO) // load real,imag from A +.endm + + +.macro END1x1_2 + /*for load2 offset will be 32 and 32*/ + KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 +.endm + + + +.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 +.endm + + +.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast + KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 +.endm + + +.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete + xxswapd vs53, vs52 + xvmaddadp vs0, vs32, vs48 + xvmaddadp vs1, vs32, vs49 +.if \Complete==0 + lxv vs32, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A +.endif +.if \Complete==0 + lxv vs48, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B +.endif +.if \Complete==0 + xxswapd vs49, vs48 +.endif + xvmaddadp vs0, vs40, vs52 + xvmaddadp vs1, vs40, vs53 +.if \Complete==0 + lxv vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A +.endif + +.if \Complete==0 + lxv vs52, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B +.endif +.if \IsLast==1 +.if \Complete==1 + addi \AREG, \AREG, DISP2(\Index,\OffsetA) + addi \BREG, \BREG, DISP2(\Index,\OffsetB) +.else + addi \AREG, \AREG, DISP2(\Index,32) + addi \BREG, \BREG, DISP2(\Index,32) +.endif +.endif +.endm + + + +.macro KERNEL1x1 + LOAD1x1 + END1x1 AO, BO, 16,16 +.endm + + + +.macro SAVE1x1 + SAVE1 vs0,vs1,CO,0 + addi CO, CO, 16 +.endm + +/****************************TRMM POINTER REFRESH + +.macroSES*************************/ + + +.macro SHIFT_REG REG1,REG2,SHIFT_VAL + .if \SHIFT_VAL==16 + slwi \REG1, \REG2, 8 + .elseif \SHIFT_VAL==8 + slwi \REG1, \REG2, 7 + .elseif \SHIFT_VAL==4 + slwi \REG1, \REG2, 6 + .elseif \SHIFT_VAL==2 + slwi \REG1, \REG2, 5 + .elseif \SHIFT_VAL==1 + slwi \REG1, \REG2, 4 + .endif +.endm +/* +//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// ptrbb = bb; +// #else +// ptrba += off*16; +// ptrbb = bb + 
off*2; +// #endif +*/ + + +.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B + #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /* ptrbb = bb;*/ + mr \PTR_B,\B_VAL /* refresh BPOINT */ + #else + /* + // ptrba =ptrba+ off*C_A; + // ptrbb = bb + off*C_B; + */ + SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ + SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ + add \PTR_B, \B_VAL , T4 /* Add values to BO */ + add \PTR_A, \PTR_A, T2 /* Add values to AO */ + #endif +.endm + +/* +// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) +// temp = bk-off; +// #elif defined(LEFT) +// temp = off+16; // number of values in A +// #else +// temp = off+2; // number of values in B +// #endif +*/ + + +.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B + #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + /* temp = bk-off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #elif defined(LEFT) + /* temp = off+INCR_A; // number of values in A */ + addi \TEMP_BK, \OFF_VAL, \INCR_A + #else + /* temp = off+INCR_B // number of values in B*/ + addi \TEMP_BK,\OFF_VAL, \INCR_B + #endif +.endm +/* +// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) +// temp = bk - off; +// #ifdef LEFT +// temp -= 16; // number of values in A +// #else +// temp -= 2; // number of values in B +// #endif +// ptrba += temp*16; +// ptrbb += temp*2; +// #endif +// #ifdef LEFT +// off += 16; // number of values in A +// #endif +*/ + + + +.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B + #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + /*temp = bk - off;*/ + sub \TEMP_BK,\BK_VAL,\OFF_VAL + #ifdef LEFT + /*temp -= 8; // number of values in A*/ + addi \TEMP_BK,\TEMP_BK,-\C_A + #else + /*temp -= 4; // number of values in B*/ + addi \TEMP_BK,\TEMP_BK,-\C_B + #endif + /*ptrba += temp*C_A; + ptrbb += temp*C_B;*/ + 
SHIFT_REG T4,\TEMP_BK,\C_A + SHIFT_REG T2,\TEMP_BK,\C_B + add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ + add \PTR_B, \PTR_B,T2 + #endif + #ifdef LEFT + /*off += 8; // number of values in A*/ + addi \OFF_VAL,\OFF_VAL,\C_A + #endif +.endm + From 3446e58dafd054ec7bf1736272c32c73f56fc5be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 25 Jun 2020 12:31:35 +0200 Subject: [PATCH 016/349] Fix handling of uname output on AIX --- c_check | 1 + 1 file changed, 1 insertion(+) diff --git a/c_check b/c_check index 8234c20811..dd700b8b48 100644 --- a/c_check +++ b/c_check @@ -6,6 +6,7 @@ # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); +$hostarch = `uname -p` if ($hostos eq "AIX"); $hostarch = "x86_64" if ($hostarch eq "amd64"); $hostarch = "arm" if ($hostarch =~ /^arm.*/); $hostarch = "arm64" if ($hostarch eq "aarch64"); From 72a0ec8e757a8db7323295585fd28f309a36d575 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 25 Jun 2020 12:55:10 +0200 Subject: [PATCH 017/349] Fix reading of CPU name from prtconf output on AIX --- cpuid_power.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/cpuid_power.c b/cpuid_power.c index b36aa4945c..ed51df2116 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -57,7 +57,6 @@ #define CPUTYPE_PPCG4 7 #define CPUTYPE_POWER8 8 #define CPUTYPE_POWER9 9 -#define CPUTYPE_POWER10 10 char *cpuname[] = { "UNKNOWN", @@ -83,8 +82,8 @@ char *lowercpuname[] = { "cell", "ppcg4", "power8", - "power9", - "power10" + "power9", + "power10" }; char *corename[] = { @@ -97,8 +96,8 @@ char *corename[] = { "CELL", "PPCG4", "POWER8", - "POWER9", - "POWER10" + "POWER9", + "POWER10" }; int detect(void){ @@ -154,17 +153,17 @@ int detect(void){ pclose(infile); - if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3; - if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4; - if (!strncasecmp(p, "PPC970", 6)) return 
CPUTYPE_PPC970; - if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; - if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; - if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; - if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; - if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; - if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10; - if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; - if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; + if (strstr(p, "POWER3")) return CPUTYPE_POWER3; + if (strstr(p, "POWER4")) return CPUTYPE_POWER4; + if (strstr(p, "PPC970")) return CPUTYPE_PPC970; + if (strstr(p, "POWER5")) return CPUTYPE_POWER5; + if (strstr(p, "POWER6")) return CPUTYPE_POWER6; + if (strstr(p, "POWER7")) return CPUTYPE_POWER6; + if (strstr(p, "POWER8")) return CPUTYPE_POWER8; + if (strstr(p, "POWER9")) return CPUTYPE_POWER9; + if (strstr(p, "POWER10")) return CPUTYPE_POWER10; + if (strstr(p, "Cell")) return CPUTYPE_CELL; + if (strstr(p, "7447")) return CPUTYPE_PPCG4; return CPUTYPE_POWER5; #endif From 3f613b130114ffe226b4068b35793eb46e072a48 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 25 Jun 2020 12:57:00 +0200 Subject: [PATCH 018/349] Tentative changes for building on AIX --- Makefile.power | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Makefile.power b/Makefile.power index 5c431860f6..b2fa043864 100644 --- a/Makefile.power +++ b/Makefile.power @@ -34,8 +34,11 @@ ifeq ($(USE_OPENMP), 1) COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp else -COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math +FCOMMON_OPT += -O2 
-frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +ifeq ($(OSNAME), AIX) +FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +endif endif endif @@ -78,6 +81,9 @@ CCOMMON_OPT += -mpowerpc64 -maix64 ifeq ($(COMPILER_F77), g77) FCOMMON_OPT += -mpowerpc64 -maix64 endif +ifeq ($(F_COMPILER), GFORTRAN) +FCOMMON_OPT += -mpowerpc64 -maix64 +endif ifeq ($(COMPILER_F77), xlf) FCOMMON_OPT += -q64 endif From c592f0f80a75251e9ddda7c4b00dcc0b263083d4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 25 Jun 2020 12:58:13 +0200 Subject: [PATCH 019/349] Fix utest build on AIX --- utest/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utest/Makefile b/utest/Makefile index 0b98924114..31d4ccf002 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -34,6 +34,9 @@ endif ifeq ($(C_COMPILER), PGI) OBJS = utest_main2.o endif +ifeq ($(OSNAME), AIX) +OBJS = utest_main2.o +endif all : run_test From c0afc11742a388fbc7ad91928b1566cd6bd28388 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 25 Jun 2020 13:12:36 +0200 Subject: [PATCH 020/349] Fix POWERPC builds on AIX (gcc/gfortran 7) 1. macro preprocessing for POWER8 and later kernels only 2. 
default buffer size used by AIX version of m4 is too small --- kernel/Makefile.L3 | 144 ++++++++++++++++++++++----------------------- 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 0cb02ef855..c7865480f6 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -482,7 +482,7 @@ $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmotcopy.s m4 shgemmotcopy.s > shgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ @@ -497,7 +497,7 @@ $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmitcopy.s m4 shgemmitcopy.s > shgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ @@ -513,7 +513,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ @@ -529,7 +529,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE 
-UCOMPLEX $< -o sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ @@ -541,7 +541,7 @@ endif endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ @@ -559,7 +559,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ @@ -602,7 +602,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ @@ -625,7 +625,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ @@ -657,7 +657,7 @@ endif endif $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) -ifeq ($(OS), 
AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ @@ -669,7 +669,7 @@ endif ifeq ($(BUILD_HALF), 1) $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemm_kernel$(TSUFFIX).s m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ @@ -680,7 +680,7 @@ endif endif $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ @@ -693,9 +693,9 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s - m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s + m4 -B 16384 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s else @@ -703,9 +703,9 @@ else endif $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) 
+ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s - m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s + m4 -B 16384 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s else @@ -713,9 +713,9 @@ else endif $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s - m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s + m4 -B 16384 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s else @@ -723,9 +723,9 @@ else endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s - m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s + m4 -B 16384 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s else @@ -733,9 +733,9 @@ else endif $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s - m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s + m4 -B 16384 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s else @@ -743,9 +743,9 @@ else 
endif $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s - m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s + m4 -B 16384 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s else @@ -753,9 +753,9 @@ else endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s - m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s + m4 -B 16384 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s else @@ -763,9 +763,9 @@ else endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s - m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s + m4 -B 16384 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s else @@ -787,7 +787,7 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s $(CC) 
$(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ @@ -797,7 +797,7 @@ else endif $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ @@ -807,7 +807,7 @@ else endif $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ @@ -817,7 +817,7 @@ else endif $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ @@ -827,7 +827,7 @@ else endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ @@ -837,7 +837,7 @@ else endif $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ @@ -847,7 +847,7 @@ else endif $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ @@ -857,7 +857,7 @@ else endif $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ @@ -879,9 +879,9 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s - m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s + m4 -B 16384 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ rm ctrmm_kernel_ln.s 
ctrmm_kernel_ln_nomacros.s else @@ -889,9 +889,9 @@ else endif $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s - m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s + m4 -B 16384 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s else @@ -899,9 +899,9 @@ else endif $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s - m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s + m4 -B 16384 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s else @@ -909,9 +909,9 @@ else endif $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s - m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s + m4 -B 16384 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s else @@ -919,9 +919,9 @@ else endif $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 
POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s - m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s + m4 -B 16384 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s else @@ -929,9 +929,9 @@ else endif $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s - m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s + m4 -B 16384 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s else @@ -939,9 +939,9 @@ else endif $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s - m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s + m4 -B 16384 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s else @@ -949,9 +949,9 @@ else endif $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s - m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s + m4 -B 16384 ctrmm_kernel_RC.s > 
ctrmm_kernel_RC_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s else @@ -959,9 +959,9 @@ else endif $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s - m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s + m4 -B 16384 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s else @@ -969,9 +969,9 @@ else endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s - m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s + m4 -B 16384 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s else @@ -979,9 +979,9 @@ else endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s - m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s + m4 -B 16384 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s else @@ -989,9 +989,9 @@ else 
endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s - m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s + m4 -B 16384 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s else @@ -999,9 +999,9 @@ else endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s - m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s + m4 -B 16384 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s else @@ -1009,9 +1009,9 @@ else endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s - m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s + m4 -B 16384 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s else @@ -1019,7 +1019,7 @@ else endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL 
-DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ @@ -1029,9 +1029,9 @@ else endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s - m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s + m4 -B 16384 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s else @@ -1049,9 +1049,9 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s - m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s + m4 -B 16384 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s else @@ -1183,9 +1183,9 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o 
dtrsm_kernel_lt.s - m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s + m4 -B 16384 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s else @@ -2459,7 +2459,7 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ @@ -2505,7 +2505,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) -ifeq ($(OS), AIX) +ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ From c854ef5471e7b1673b408673239ee1b917518496 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 25 Jun 2020 13:29:52 +0200 Subject: [PATCH 021/349] Fix variable names in conditional --- kernel/Makefile.L3 | 94 +++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index c7865480f6..3d63ff861b 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -482,7 +482,7 @@ $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) $(KDIR)$(SHGEMMOTCOPYOBJ) : 
$(KERNELDIR)/$(SHGEMMOTCOPY) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmotcopy.s m4 shgemmotcopy.s > shgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ @@ -497,7 +497,7 @@ $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmitcopy.s m4 shgemmitcopy.s > shgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ @@ -513,7 +513,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ @@ -529,7 +529,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ @@ -541,7 +541,7 @@ endif 
endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ @@ -559,7 +559,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ @@ -602,7 +602,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ @@ -625,7 +625,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ @@ -657,7 +657,7 @@ endif endif 
$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ @@ -669,7 +669,7 @@ endif ifeq ($(BUILD_HALF), 1) $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemm_kernel$(TSUFFIX).s m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ @@ -680,7 +680,7 @@ endif endif $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ @@ -693,7 +693,7 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s m4 -B 
16384 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ @@ -703,7 +703,7 @@ else endif $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s m4 -B 16384 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ @@ -713,7 +713,7 @@ else endif $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s m4 -B 16384 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ @@ -723,7 +723,7 @@ else endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s m4 -B 16384 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ @@ -733,7 +733,7 @@ else endif $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s m4 -B 16384 zgemm_kernel_n.s > 
zgemm_kernel_n_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ @@ -743,7 +743,7 @@ else endif $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s m4 -B 16384 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ @@ -753,7 +753,7 @@ else endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s m4 -B 16384 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ @@ -763,7 +763,7 @@ else endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s m4 -B 16384 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ @@ -787,7 +787,7 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT 
-UTRANSA $< -o strmmkernel_ln.s m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ @@ -797,7 +797,7 @@ else endif $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ @@ -807,7 +807,7 @@ else endif $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ @@ -817,7 +817,7 @@ else endif $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ @@ -827,7 +827,7 @@ else endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 
POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ @@ -837,7 +837,7 @@ else endif $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ @@ -847,7 +847,7 @@ else endif $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ @@ -857,7 +857,7 @@ else endif $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ @@ -879,7 +879,7 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ 
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s m4 -B 16384 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ @@ -889,7 +889,7 @@ else endif $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s m4 -B 16384 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ @@ -899,7 +899,7 @@ else endif $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s m4 -B 16384 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ @@ -909,7 +909,7 @@ else endif $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o 
ctrmm_kernel_lc.s m4 -B 16384 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ @@ -919,7 +919,7 @@ else endif $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s m4 -B 16384 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ @@ -929,7 +929,7 @@ else endif $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s m4 -B 16384 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ @@ -939,7 +939,7 @@ else endif $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s m4 -B 16384 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ @@ -949,7 +949,7 @@ else endif $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter 
$(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s m4 -B 16384 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ @@ -959,7 +959,7 @@ else endif $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s m4 -B 16384 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ @@ -969,7 +969,7 @@ else endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s m4 -B 16384 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ @@ -979,7 +979,7 @@ else endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s m4 -B 16384 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX 
-DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ @@ -989,7 +989,7 @@ else endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s m4 -B 16384 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ @@ -999,7 +999,7 @@ else endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s m4 -B 16384 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ @@ -1009,7 +1009,7 @@ else endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s m4 -B 16384 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ @@ -1019,7 +1019,7 @@ else endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) 
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ @@ -1029,7 +1029,7 @@ else endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s m4 -B 16384 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ @@ -1049,7 +1049,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s m4 -B 16384 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ @@ -1183,7 +1183,7 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT 
-UCONJ $< -o dtrsm_kernel_lt.s m4 -B 16384 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ @@ -2459,7 +2459,7 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ @@ -2505,7 +2505,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) -ifeq ($(findstring AIXPOW, $(MYOS)$(filter $(MYTARGET), POWER8 POWER9 POWER10)),AIXPOW) +ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW) $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ From 2a91452bdd1d735b11156add482b9f35c3d01c69 Mon Sep 17 00:00:00 2001 From: Matthew Treinish Date: Thu, 25 Jun 2020 11:32:09 -0400 Subject: [PATCH 022/349] Add cpu detection support for comet lake U Comet Lake U CPUs have family: 6, model: 6, extended family: 0, and extended model: 10 were not being correctly detected by GETARCH during openblas builds and would show CORE=UNKNOWN and LIBCORE=unknown. This commit adds the necessary information to cpuid_x86 to detect extended family 10 model 6 and return the proper core information. 
It's essentially just a skylake cpu, not skylake x, so I just used the same return fields as skylake. --- cpuid_x86.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index e29adecae9..1fe5ca152c 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1955,6 +1955,19 @@ int get_coretype(void){ return CORE_NEHALEM; } break; + case 10: + switch (model) { + case 6: + // Comet Lake U + if(support_avx()) + #ifndef NO_AVX2 + return CORE_HASWELL; + #else + return CORE_SANDYBRIDGE; + #endif + else + return CORE_NEHALEM; + } case 5: switch (model) { case 6: From f37e941d5270e396ed27e4ad5fd484fb257b742b Mon Sep 17 00:00:00 2001 From: Matthew Treinish Date: Thu, 25 Jun 2020 11:56:49 -0400 Subject: [PATCH 023/349] Add support to driver/others/dynamic.c too --- driver/others/dynamic.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 38eb766430..7677f265a8 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -618,6 +618,18 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } + case 10: + if (model == 6) { + if(support_avx2()) + return &gotoblas_HASWELL; + if(support_avx()) { + openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); + return &gotoblas_SANDYBRIDGE; + } else { + openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); + return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
+ } + } return NULL; } case 0xf: From 2f9c10810c932fc015cb4e5078cab7117bc120b6 Mon Sep 17 00:00:00 2001 From: Matthew Treinish Date: Thu, 25 Jun 2020 15:53:56 -0400 Subject: [PATCH 024/349] Also set CPUTYPE in get_cpuname() --- cpuid_x86.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 1fe5ca152c..3538690b92 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1406,6 +1406,16 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; + } + case 10: //family 6 exmodel 10 + switch (model) { + case 6: // Comet Lake U + if(support_avx2()) + return CPUTYPE_HASWELL; + if(support_avx()) + return CPUTYPE_SANDYBRIDGE; + else + return CPUTYPE_NEHALEM; } break; } From d23419accc2f60a27b95cb29f11f76443a82d111 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 25 Jun 2020 22:19:08 -0500 Subject: [PATCH 025/349] powerpc: Optimized SHGEMM kernel for POWER10 This patch introduces new optimized version of SHGEMM kernel using power10 Matrix-Multiply Assist (MMA) feature introduced in POWER ISA v3.1. This patch makes use of new POWER10 compute instructions for matrix multiplication operation. Tested on simulator and there are no new test failures. 
--- kernel/generic/gemm_ncopy_16.c | 32 +- kernel/generic/gemm_ncopy_8.c | 44 +- kernel/generic/gemm_tcopy_16.c | 26 +- kernel/generic/gemm_tcopy_8.c | 46 +- kernel/power/KERNEL.POWER10 | 11 + kernel/power/shgemm_kernel_power10.c | 1044 ++++++++++++++++++++++++++ param.h | 13 + 7 files changed, 1142 insertions(+), 74 deletions(-) create mode 100644 kernel/power/shgemm_kernel_power10.c diff --git a/kernel/generic/gemm_ncopy_16.c b/kernel/generic/gemm_ncopy_16.c index 5f91d0dbe2..d3ab464724 100644 --- a/kernel/generic/gemm_ncopy_16.c +++ b/kernel/generic/gemm_ncopy_16.c @@ -39,24 +39,24 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; - FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; - FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; - FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; - - FLOAT *boffset; - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; + IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; + + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, 
ctemp32; aoffset = a; boffset = b; diff --git a/kernel/generic/gemm_ncopy_8.c b/kernel/generic/gemm_ncopy_8.c index a49a778e65..aaf9c8917b 100644 --- a/kernel/generic/gemm_ncopy_8.c +++ b/kernel/generic/gemm_ncopy_8.c @@ -39,30 +39,30 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; - FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; - - FLOAT *boffset; - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; - FLOAT ctemp33, ctemp34, ctemp35, ctemp36; - FLOAT ctemp37, ctemp38, ctemp39, ctemp40; - FLOAT ctemp41, ctemp42, ctemp43, ctemp44; - FLOAT ctemp45, ctemp46, ctemp47, ctemp48; - FLOAT ctemp49, ctemp50, ctemp51, ctemp52; - FLOAT ctemp53, ctemp54, ctemp55, ctemp56; - FLOAT ctemp57, ctemp58, ctemp59, ctemp60; - FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT ctemp33, ctemp34, ctemp35, ctemp36; + IFLOAT ctemp37, ctemp38, ctemp39, ctemp40; + IFLOAT ctemp41, ctemp42, ctemp43, ctemp44; + IFLOAT ctemp45, ctemp46, ctemp47, ctemp48; + IFLOAT ctemp49, ctemp50, ctemp51, 
ctemp52; + IFLOAT ctemp53, ctemp54, ctemp55, ctemp56; + IFLOAT ctemp57, ctemp58, ctemp59, ctemp60; + IFLOAT ctemp61, ctemp62, ctemp63, ctemp64; aoffset = a; diff --git a/kernel/generic/gemm_tcopy_16.c b/kernel/generic/gemm_tcopy_16.c index 56268ebf26..14252599a2 100644 --- a/kernel/generic/gemm_tcopy_16.c +++ b/kernel/generic/gemm_tcopy_16.c @@ -39,22 +39,22 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2; - FLOAT *boffset; - - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2; + IFLOAT *boffset; + + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; aoffset = a; boffset = b; diff --git a/kernel/generic/gemm_tcopy_8.c b/kernel/generic/gemm_tcopy_8.c index b28f3d2190..3e8a839db5 100644 --- a/kernel/generic/gemm_tcopy_8.c +++ b/kernel/generic/gemm_tcopy_8.c @@ -39,32 +39,32 @@ #include #include "common.h" -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG i, j; - FLOAT *aoffset; - FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; - FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; - - FLOAT *boffset, *boffset1, *boffset2, *boffset3, 
*boffset4; - - FLOAT ctemp01, ctemp02, ctemp03, ctemp04; - FLOAT ctemp05, ctemp06, ctemp07, ctemp08; - FLOAT ctemp09, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - FLOAT ctemp17, ctemp18, ctemp19, ctemp20; - FLOAT ctemp21, ctemp22, ctemp23, ctemp24; - FLOAT ctemp25, ctemp26, ctemp27, ctemp28; - FLOAT ctemp29, ctemp30, ctemp31, ctemp32; - FLOAT ctemp33, ctemp34, ctemp35, ctemp36; - FLOAT ctemp37, ctemp38, ctemp39, ctemp40; - FLOAT ctemp41, ctemp42, ctemp43, ctemp44; - FLOAT ctemp45, ctemp46, ctemp47, ctemp48; - FLOAT ctemp49, ctemp50, ctemp51, ctemp52; - FLOAT ctemp53, ctemp54, ctemp55, ctemp56; - FLOAT ctemp57, ctemp58, ctemp59, ctemp60; - FLOAT ctemp61, ctemp62, ctemp63, ctemp64; + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; + IFLOAT ctemp33, ctemp34, ctemp35, ctemp36; + IFLOAT ctemp37, ctemp38, ctemp39, ctemp40; + IFLOAT ctemp41, ctemp42, ctemp43, ctemp44; + IFLOAT ctemp45, ctemp46, ctemp47, ctemp48; + IFLOAT ctemp49, ctemp50, ctemp51, ctemp52; + IFLOAT ctemp53, ctemp54, ctemp55, ctemp56; + IFLOAT ctemp57, ctemp58, ctemp59, ctemp60; + IFLOAT ctemp61, ctemp62, ctemp63, ctemp64; aoffset = a; boffset = b; diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 4fc7190b0b..39f5e94145 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -7,6 +7,17 @@ else #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c +SHGEMM_BETA = ../generic/gemm_beta.c +SHGEMMKERNEL = 
shgemm_kernel_power10.c +SHGEMMINCOPY = ../generic/gemm_ncopy_16.c +SHGEMMITCOPY = ../generic/gemm_tcopy_16.c +SHGEMMONCOPY = ../generic/gemm_ncopy_8.c +SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) +SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) +SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) +SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) + STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c CTRMMKERNEL = cgemm_kernel_power10.S diff --git a/kernel/power/shgemm_kernel_power10.c b/kernel/power/shgemm_kernel_power10.c new file mode 100644 index 0000000000..7455f925c7 --- /dev/null +++ b/kernel/power/shgemm_kernel_power10.c @@ -0,0 +1,1044 @@ +/********************************************************************************* +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ +#include "common.h" +#include +#if defined(HALF) && defined(HALFCONVERSION) +static float +bfloat16tof32 (bfloat16 f16) +{ + float result = 0; + unsigned short *q = (unsigned short *) (&result); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + q[0] = f16; +#else + q[1] = f16; +#endif + return result; +} + +#define BF16TOF32(x) (bfloat16tof32(x)) +#else +#define BF16TOF32(x) x +#endif + +typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); + +vector char mask = + { 0x0, 0x1, 0x8, 0x9, 0x2, 0x3, 0xa, 0xb, 0x4, 0x5, 0xc, 0xd, 0x6, 0x7, 0xe, + 0xf +}; + +/* + * BFLOAT16 xvbf16ger2pp instruction needs 4×2 matrix of + * bfloat16 floating-point values as input. Hence this + * merging is needed on A and B matrices. 
+ */ +#define MERGE_ROW(x) vec_perm(x, x, mask) +#define MERGE_HIGH(x, y) (vec_t) vec_mergeh ((vector short)x, (vector short)y) +#define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) + +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE4x2_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[6] * alpha; \ + rowC = (v2sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[3* ldc+J]; \ + rowC[0] += result[0] * alpha; +#define SAVE4x2_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v2sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[6] * alpha; \ + rowC = (v2sf_t *) &CO[5* ldc+J]; \ + rowC[0] += result[4] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[7* ldc+J]; \ + rowC[0] += result[0] * alpha; + +#define MMA __builtin_mma_xvbf16ger2pp + +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc (result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[3] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[2] * alpha; + +#define SET_ACC_ZERO4() \ + __builtin_mma_xxsetaccz (&acc0); \ + 
__builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); + +#define SET_ACC_ZERO8() \ + __builtin_mma_xxsetaccz (&acc0); \ + __builtin_mma_xxsetaccz (&acc1); \ + __builtin_mma_xxsetaccz (&acc2); \ + __builtin_mma_xxsetaccz (&acc3); \ + __builtin_mma_xxsetaccz (&acc4); \ + __builtin_mma_xxsetaccz (&acc5); \ + __builtin_mma_xxsetaccz (&acc6); \ + __builtin_mma_xxsetaccz (&acc7); + +#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); +/************************************************************************************* +* SHGEMM Kernel +*************************************************************************************/ +int +CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, + IFLOAT * B, FLOAT * C, BLASLONG ldc) +{ + BLASLONG N = n; + BLASLONG i1; + v4sf_t valpha = { alpha, alpha, alpha, alpha }; + vector short vzero = { 0, 0, 0, 0, 0, 0, 0, 0 }; + N = n >> 3; + /* Loop for n >= 8. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc << 3; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + i = m >> 4; + /* Loop for m >= 16. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowB = (vec_t *) & (BO[l << 4]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], rowA[2]); + vec_t rowA_l = MERGE_LOW (rowA[0], rowA[2]); + vec_t rowA2_h = MERGE_HIGH (rowA[1], rowA[3]); + vec_t rowA2_l = MERGE_LOW (rowA[1], rowA[3]); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + MMA (&acc4, rowB_h, rowA2_h); + MMA (&acc5, rowB_l, rowA2_h); + MMA (&acc6, rowB_h, rowA2_l); + MMA (&acc7, rowB_l, rowA2_l); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 4; + vec_t *rowA = (vec_t *) & (AO[l << 1]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); + vec_t rowA_l = MERGE_LOW (rowA[0], vzero); + vec_t rowA2_h = MERGE_HIGH (rowA[1], vzero); + vec_t rowA2_l = MERGE_LOW (rowA[1], vzero); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + MMA (&acc4, rowB_h, rowA2_h); + MMA (&acc5, rowB_l, rowA2_h); + MMA (&acc6, rowB_h, rowA2_l); + MMA (&acc7, rowB_l, rowA2_l); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + SAVE_ACC (&acc4, 8); + SAVE_ACC (&acc6, 12); + SAVE_ACC1 (&acc5, 8); + SAVE_ACC1 (&acc7, 12); + CO += 16; + + AO += (k << 4); + BO += (k << 3); + } + i = (m & 15) >> 3; + /* Loop for m >= 8. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 4]); + vec_t *rowB = (vec_t *) & (BO[l << 4]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], rowA[1]); + vec_t rowA_l = MERGE_LOW (rowA[0], rowA[1]); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 4; + vec_t *rowA = (vec_t *) & (AO[l]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); + vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); + vec_t rowA_h = MERGE_HIGH (rowA[0], vzero); + vec_t rowA_l = MERGE_LOW (rowA[0], vzero); + MMA (&acc0, rowB_h, rowA_h); + MMA (&acc1, rowB_l, rowA_h); + MMA (&acc2, rowB_h, rowA_l); + MMA (&acc3, rowB_l, rowA_l); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC1 (&acc3, 4); + CO += 8; + AO += (k << 3); + BO += (k << 3); + } + i = (m & 7) >> 2; + /* Loop for m >= 4. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 3]); + vec_t *rowB = (vec_t *) & (BO[l << 4]); + vec_t rowA_mrg = MERGE_ROW (rowA[0]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), rowA_mrg); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), rowA_mrg); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vector short rowA = + { AO[l + 0], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; + vec_t *rowB = (vec_t *) & (BO[l << 1]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 4; + AO += (k << 2); + BO += (k << 3); + } + i = (m & 3) >> 1; + /* Loop for m >= 2. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowA = + { AO[(l << 2) + 0], AO[(l << 2) + 2], AO[(l << 2) + 1], + AO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowB = (vec_t *) & (BO[l << 4]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowA = { AO[l + 0], 0, AO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowB = (vec_t *) & (BO[(l << 2)]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC1 (&acc1, 0); + CO += 2; + AO += (k << 1); + BO += (k << 3); + } + i = (m & 1) >> 0; + /* Loop for m = 1. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 } + , t1 = + { + 0, 0, 0, 0}; + for (l = 0; l < k; l++) + { + v4sf_t rowA = + { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), + BF16TOF32 (AO[l]) + }; + v4sf_t rowB = + { BF16TOF32 (BO[l << 3]), BF16TOF32 (BO[(l << 3) + 1]), + BF16TOF32 (BO[(l << 3) + 2]), + BF16TOF32 (BO[(l << 3) + 3]) + }; + v4sf_t rowB1 = + { BF16TOF32 (BO[(l << 3) + 4]), BF16TOF32 (BO[(l << 3) + 5]), + BF16TOF32 (BO[(l << 3) + 6]), + BF16TOF32 (BO[(l << 3) + 7]) + }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + CO[4 * ldc] += t1[0]; + CO[5 * ldc] += t1[1]; + CO[6 * ldc] += t1[2]; + CO[7 * ldc] += t1[3]; + CO += 1; + AO += k; + BO += (k << 3); + } + B += k << 3; + } + N = (n & 7) >> 2; + /* Loop for n >= 4. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc << 2; + AO = A; + i = m >> 5; + /* Loop for m >= 32. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + IFLOAT *A1 = AO + (16 * k); + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowA1 = (vec_t *) & (A1[l << 5]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], rowA1[2])); + MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], rowA1[2])); + MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], rowA1[3])); + MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], rowA1[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + vec_t *rowA1 = (vec_t *) & (A1[(l << 2)]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); + MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], vzero)); + MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], vzero)); + MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], vzero)); + MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], vzero)); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + SAVE_ACC (&acc4, 0); + SAVE_ACC (&acc5, 4); + CO += 8; + SAVE_ACC (&acc6, 0); + SAVE_ACC (&acc7, 4); + CO += 8; + AO += k << 5; + BO += k << 2; + } + i = (m & 31) >> 4; + /* Loop for m >= 16. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], rowA[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], vzero)); + MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], vzero)); + } + + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + SAVE_ACC (&acc2, 0); + SAVE_ACC (&acc3, 4); + CO += 8; + AO += k << 4; + BO += k << 2; + } + i = (m & 15) >> 3; + /* Loop for m >= 8. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 4]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[1])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vec_t *rowA = (vec_t *) & (AO[l << 1]); + vec_t *rowB = (vec_t *) & (BO[l]); + vec_t rowB_mrg = MERGE_ROW (rowB[0]); + MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], vzero)); + MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], vzero)); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 4); + CO += 8; + AO += k << 3; + BO += k << 2; + } + i = (m & 7) >> 2; + /* Loop for m >= 4. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + __vector_quad acc0; + v4sf_t result[4]; + BLASLONG l = 0; + __builtin_mma_xxsetaccz (&acc0); + for (l = 0; l < k / 2; l++) + { + vec_t *rowA = (vec_t *) & (AO[l << 3]); + vec_t *rowB = (vec_t *) & (BO[l << 3]); + MMA (&acc0, MERGE_ROW (rowB[0]), MERGE_ROW (rowA[0])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 3; + vector short rowA = + { AO[l], 0, AO[l + 1], 0, AO[l + 2], 0, AO[l + 3], 0 }; + vec_t *rowB = (vec_t *) & (BO[l]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + SAVE_ACC (&acc0, 0); + CO += 4; + AO += k << 2; + BO += k << 2; + } + i = (m & 3) >> 1; + /* Loop for m >= 2. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0; + BLASLONG l = 0; + __builtin_mma_xxsetaccz (&acc0); + for (l = 0; l < k / 2; l++) + { + vector short rowA = + { AO[(l << 2) + 0], AO[(l << 2) + 2], AO[(l << 2) + 1], + AO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowB = (vec_t *) & (BO[l << 3]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowA = { AO[l], 0, AO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowB = (vec_t *) & (BO[l << 1]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + SAVE4x2_ACC (&acc0, 0); + CO += 2; + AO += k << 1; + BO += k << 2; + } + i = (m & 1) >> 0; + /* Loop for m = 1. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowA = + { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), + BF16TOF32 (AO[l]) + }; + v4sf_t rowB = + { BF16TOF32 (BO[l << 2]), BF16TOF32 (BO[(l << 2) + 1]), + BF16TOF32 (BO[(l << 2) + 2]), + BF16TOF32 (BO[(l << 2) + 3]) + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t[2]; + CO[3 * ldc] += t[3]; + AO += k; + BO += (k << 2); + CO += 1; + } + + B += k << 2; + } + N = (n & 3) >> 1; + /* Loop for n >= 2. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i, j; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; + i = m >> 5; + /* Loop for m >= 32. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + IFLOAT *A1 = AO + (16 * k); + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + SET_ACC_ZERO8 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 5]); + vec_t *rowA1 = (vec_t *) & (A1[l << 5]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); + MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); + MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); + MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + vec_t *rowA1 = (vec_t *) & (A1[l << 3]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); + MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); + MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); + MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + SAVE2x4_ACC (&acc4, 0); + SAVE2x4_ACC (&acc5, 4); + SAVE2x4_ACC (&acc6, 8); + SAVE2x4_ACC (&acc7, 12); + CO += 16; + AO += k << 5; + BO += k << 1; + } + i = (m & 31) >> 4; + /* Loop for m >= 16. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 5]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + SAVE2x4_ACC (&acc2, 8); + SAVE2x4_ACC (&acc3, 12); + CO += 16; + AO += k << 4; + BO += k << 1; + } + i = (m & 15) >> 3; + /* Loop for m >= 8. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 4]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 4); + CO += 8; + AO += k << 3; + BO += k << 1; + } + i = (m & 7) >> 2; + /* Loop for m >= 4. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); + BLASLONG l = 0; + for (l = 0; l < k / 2; l++) + { + vector short rowB = + { BO[(l << 2) + 0], BO[(l << 2) + 2], BO[(l << 2) + 1], + BO[(l << 2) + 3], + 0, 0, 0, 0 + }; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + } + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 2; + vector short rowB = { BO[l + 0], 0, BO[l + 1], 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[l << 1]); + MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + } + SAVE2x4_ACC (&acc0, 0); + CO += 4; + AO += k << 2; + BO += k << 1; + } + i = (m & 3) >> 1; + /* Loop for m >= 2. 
*/ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < (k << 1); l += 2) + { + v4sf_t rowA = + { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l + 1]), + BF16TOF32 (AO[l + 1]) + }; + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l + 1]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l + 1]) + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[0 * ldc + 1] += t[2]; + CO[1 * ldc + 1] += t[3]; + CO += 2; + AO += k << 1; + BO += k << 1; + } + i = (m & 1) >> 0; + /* Loop for m = 1. */ + for (j = 0; j < i; j++) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowA = { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), 0, 0 }; + v4sf_t rowB = + { BF16TOF32 (BO[l << 1]), BF16TOF32 (BO[(l << 1) + 1]), 0, + 0 + }; + t += rowA * rowB; + } + CO[0 * ldc] += t[0] * alpha; + CO[1 * ldc] += t[1] * alpha; + CO += 1; + AO += k; + BO += k << 1; + } + B += k << 1; + } + N = (n & 1) >> 0; + /* Loop for n = 1. */ + for (i1 = 0; i1 < N; i1++) + { + BLASLONG i; + FLOAT *CO; + IFLOAT *AO; + CO = C; + C += ldc; + AO = A; + i = m; + /* Loop for m >= 16. 
*/ + while (i >= 16) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + v4sf_t t2 = { 0, 0, 0, 0 }; + v4sf_t t3 = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l]) + }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 4]), BF16TOF32 (AO[(l << 4) + 1]), + BF16TOF32 (AO[(l << 4) + 2]), + BF16TOF32 (AO[(l << 4) + 3]) + }; + v4sf_t rowA1 = + { BF16TOF32 (AO[(l << 4) + 4]), BF16TOF32 (AO[(l << 4) + 5]), + BF16TOF32 (AO[(l << 4) + 6]), + BF16TOF32 (AO[(l << 4) + 7]) + }; + v4sf_t rowA2 = + { BF16TOF32 (AO[(l << 4) + 8]), BF16TOF32 (AO[(l << 4) + 9]), + BF16TOF32 (AO[(l << 4) + 10]), + BF16TOF32 (AO[(l << 4) + 11]) + }; + v4sf_t rowA3 = { BF16TOF32 (AO[(l << 4) + 12]), + BF16TOF32 (AO[(l << 4) + 13]), BF16TOF32 (AO[(l << 4) + 14]), + BF16TOF32 (AO[(l << 4) + 15]) + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + CO[8] += t2[0]; + CO[9] += t2[1]; + CO[10] += t2[2]; + CO[11] += t2[3]; + CO[12] += t3[0]; + CO[13] += t3[1]; + CO[14] += t3[2]; + CO[15] += t3[3]; + AO += k << 4; + BO += k; + CO += 16; + i -= 16; + } + /* Loop for m >= 8. 
*/ + while (i >= 8) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + v4sf_t t1 = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l]) + }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 3]), BF16TOF32 (AO[(l << 3) + 1]), + BF16TOF32 (AO[(l << 3) + 2]), + BF16TOF32 (AO[(l << 3) + 3]) + }; + v4sf_t rowA1 = + { BF16TOF32 (AO[(l << 3) + 4]), BF16TOF32 (AO[(l << 3) + 5]), + BF16TOF32 (AO[(l << 3) + 6]), + BF16TOF32 (AO[(l << 3) + 7]) + }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + CO[4] += t1[0]; + CO[5] += t1[1]; + CO[6] += t1[2]; + CO[7] += t1[3]; + AO += k << 3; + BO += k; + CO += 8; + i -= 8; + } + /* Loop for m >= 4. */ + while (i >= 4) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = + { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), + BF16TOF32 (BO[l]) + }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 2]), BF16TOF32 (AO[(l << 2) + 1]), + BF16TOF32 (AO[(l << 2) + 2]), + BF16TOF32 (AO[(l << 2) + 3]) + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t[2]; + CO[3] += t[3]; + AO += k << 2; + BO += k; + CO += 4; + i -= 4; + } + /* Loop for m >= 2. */ + while (i >= 2) + { + IFLOAT *BO = B; + BLASLONG l = 0; + v4sf_t t = { 0, 0, 0, 0 }; + for (l = 0; l < k; l++) + { + v4sf_t rowB = { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), 0, 0 }; + v4sf_t rowA = + { BF16TOF32 (AO[l << 1]), BF16TOF32 (AO[(l << 1) + 1]), 0, + 0 + }; + t += rowA * rowB; + } + t = t * valpha; + CO[0] += t[0]; + CO[1] += t[1]; + AO += k << 1; + BO += k; + CO += 2; + i -= 2; + } + /* Loop for m = 1. 
*/ + while (i >= 1) + { + IFLOAT *BO = B; + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < k; l++) + { + t += BF16TOF32 (AO[l]) * BF16TOF32 (BO[l]); + } + AO += k; + BO += k; + CO[0] += t * alpha; + CO += 1; + i -= 1; + } + + B += k; + } + + return 0; +} diff --git a/param.h b/param.h index fd0ea75992..e8cf53f0a0 100644 --- a/param.h +++ b/param.h @@ -2297,6 +2297,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(POWER10) +#undef SHGEMM_DEFAULT_UNROLL_N +#undef SHGEMM_DEFAULT_UNROLL_M +#undef SHGEMM_DEFAULT_P +#undef SHGEMM_DEFAULT_R +#undef SHGEMM_DEFAULT_Q +#define SHGEMM_DEFAULT_UNROLL_M 16 +#define SHGEMM_DEFAULT_UNROLL_N 8 +#define SHGEMM_DEFAULT_P 832 +#define SHGEMM_DEFAULT_Q 1026 +#define SHGEMM_DEFAULT_R 4096 +#endif + #if defined(SPARC) && defined(V7) #define SNUMOPT 4 From e30ad0e521e77d3b72b8d46c18434cc911374f8d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 26 Jun 2020 09:00:43 +0200 Subject: [PATCH 026/349] Strip UTF8 byte order marker from source --- kernel/x86_64/sgemm_kernel_8x4_haswell_2.c | 2 +- kernel/x86_64/strsm_kernel_8x4_haswell_LN.c | 2 +- kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c b/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c index 5ab3e6d1f9..a2e78c58db 100644 --- a/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c +++ b/kernel/x86_64/sgemm_kernel_8x4_haswell_2.c @@ -1,4 +1,4 @@ -/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */ /* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */ /* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ diff --git 
a/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c b/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c index 4131debb19..5410bd4ae4 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c @@ -1,4 +1,4 @@ -#include "common.h" +#include "common.h" #include #include "strsm_kernel_8x4_haswell_L_common.h" diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h b/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h index cfa56da97e..2862a5b8dc 100644 --- a/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h @@ -1,4 +1,4 @@ -/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ +/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ /* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ /* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ From 584ef8d4ae57d9eda3a8e27b84d2d37c60e8e4a5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 27 Jun 2020 14:36:37 +0200 Subject: [PATCH 027/349] Add support for Comet Lake H & S --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 7677f265a8..c03b0b21de 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -619,7 +619,7 @@ static gotoblas_t *get_coretype(void){ } } case 10: - if (model == 6) { + if (model == 5 || model == 6) { if(support_avx2()) return &gotoblas_HASWELL; if(support_avx()) { From 83f47468254c5bca8e86a659e709de3f2cc4ffd4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 27 Jun 2020 14:41:24 +0200 Subject: [PATCH 028/349] Add support for Comet Lake H and S --- cpuid_x86.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 3538690b92..356800b781 100644 --- a/cpuid_x86.c 
+++ b/cpuid_x86.c @@ -1409,6 +1409,7 @@ int get_cpuname(void){ } case 10: //family 6 exmodel 10 switch (model) { + case 5: // Comet Lake H and S case 6: // Comet Lake U if(support_avx2()) return CPUTYPE_HASWELL; @@ -1967,16 +1968,16 @@ int get_coretype(void){ break; case 10: switch (model) { - case 6: - // Comet Lake U + case 5: // Comet Lake H and S + case 6: // Comet Lake U if(support_avx()) #ifndef NO_AVX2 return CORE_HASWELL; #else - return CORE_SANDYBRIDGE; + return CORE_SANDYBRIDGE; #endif else - return CORE_NEHALEM; + return CORE_NEHALEM; } case 5: switch (model) { From 634e1305f9caf640dfa42e61d4da564d8aedf16b Mon Sep 17 00:00:00 2001 From: EGuesnet <51407514+EGuesnet@users.noreply.github.com> Date: Tue, 30 Jun 2020 15:16:39 +0200 Subject: [PATCH 029/349] Update cgemm_kernel_8x4_power8.S --- kernel/power/cgemm_kernel_8x4_power8.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index 2bc99974f9..6be8c128c5 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -424,7 +424,7 @@ L999: lwz r16, 204(SP) lwz r15, 208(SP) lwz r14, 212(SP) - addi r11, 224 + addi r11, SP, 224 #endif lvx v20, r11, r0 addi r11, r11, 16 @@ -459,4 +459,4 @@ L999: blr EPILOGUE -#endif^ +#endif From 4ab3651591d231c69f0f16dbeae26e2cc7ee819f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 2 Jul 2020 17:00:15 +0200 Subject: [PATCH 030/349] Option -mavx2 requires at least gcc 4.7 --- Makefile.x86_64 | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f2de51ef4c..2676bd258d 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -31,14 +31,24 @@ ifeq ($(CORE), HASWELL) ifndef DYNAMIC_ARCH ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) +# AVX2 support was added in 4.7.0 +GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) +GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion 
| cut -f2 -d.` \>= 7) +ifeq ($(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4 \| $(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) = 11), 1) CCOMMON_OPT += -mavx2 endif +endif ifeq ($(F_COMPILER), GFORTRAN) +# AVX2 support was added in 4.7.0 (accept any major > 4, or 4.x with x >= 7) +GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4) +GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7) +ifeq ($(shell expr `$(FC) -dumpversion | cut -f1 -d.` \> 4 \| $(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) = 11), 1) FCOMMON_OPT += -mavx2 endif endif endif endif +endif From 10a2923f640e9b1aa3f8bca34e71481586aa3acd Mon Sep 17 00:00:00 2001 From: Jussi Enkovaara Date: Tue, 7 Jul 2020 13:35:43 +0300 Subject: [PATCH 031/349] fixes #2238 Always obey omp_get_max_threads() when build with USE_OPENMP --- common_thread.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/common_thread.h b/common_thread.h index 6ec40e096d..ec0c65b220 100644 --- a/common_thread.h +++ b/common_thread.h @@ -132,18 +132,18 @@ extern int blas_server_avail; static __inline int num_cpu_avail(int level) { #ifdef USE_OPENMP - int openmp_nthreads=0; + int openmp_nthreads=omp_get_max_threads(); #endif +#ifndef USE_OPENMP if (blas_cpu_number == 1 - +#endif #ifdef USE_OPENMP - || omp_in_parallel() + if (openmp_nthreads == 1 || omp_in_parallel() #endif - ) return 1; + ) return 1; #ifdef USE_OPENMP - openmp_nthreads=omp_get_max_threads(); if (blas_cpu_number != openmp_nthreads) { goto_set_num_threads(openmp_nthreads); } From 8751a69271721b0593eafecd1cdd974d2839c864 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 7 Jul 2020 15:46:32 +0200 Subject: [PATCH 032/349] Obtain actual cpu count on AIX and suppress spurious NO_AVX512 on non-x86 --- getarch.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/getarch.c b/getarch.c index 164947f3e1..2cdf772599 100644 --- a/getarch.c +++ b/getarch.c @@ -90,11 +90,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include #include #endif +#if defined(AIX) +#include +#endif +#if defined(__x86_64__) || defined(_M_X64) #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) #else #define NO_AVX512 #endif +#endif /* #define FORCE_P2 */ /* #define FORCE_KATMAI */ /* #define FORCE_COPPERMINE */ @@ -1297,6 +1302,11 @@ static int get_num_cores(void) { sysctl(m, 2, &count, &len, NULL, 0); return count; + +#elif defined(AIX) + //returns the number of processors which are currently online + return sysconf(_SC_NPROCESSORS_ONLN); + #else return 2; #endif From 45d819ca82f6a562de04cc5cfd3b70fd513fd4b8 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 7 Jul 2020 11:25:20 -0500 Subject: [PATCH 033/349] Changing mcpu option as power10 As compiler enabled mcpu option as power10, changing it from future. --- Makefile.power | 8 ++++---- driver/others/dynamic_power.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile.power b/Makefile.power index 5c431860f6..beb311945f 100644 --- a/Makefile.power +++ b/Makefile.power @@ -11,11 +11,11 @@ endif ifeq ($(CORE), POWER10) ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=future -mtune=future -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp -FCOMMON_OPT += -O2 -frecursive -mcpu=future -mtune=future -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp else -COMMON_OPT += -Ofast -mcpu=future -mtune=future -mvsx -malign-power -fno-fast-math -FCOMMON_OPT += -O2 -frecursive -mcpu=future -mtune=future -malign-power -fno-fast-math +COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -fno-fast-math endif endif diff --git 
a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 811a5fae33..f625b9431f 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -45,7 +45,7 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_POWER9; #endif #if (!defined __GNUC__) || ( __GNUC__ >= 11) - if (__builtin_cpu_is("isa_3_1") && __builtin_cpu_supports ("mma")) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) return &gotoblas_POWER10; #endif return NULL; From 1d63631afe3da02ade6aa7ca7698b08754c148a8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 Jul 2020 11:42:02 +0200 Subject: [PATCH 034/349] Add lapack-test --- CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bb5322a1d9..7cdc4181a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,8 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 10.dev) - +set(OpenBLAS_PATCH_VERSION 9.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -236,6 +235,10 @@ if (NOT MSVC AND NOT NOFORTRAN) endif() endif() +if (NOT NOFORTRAN) + add_subdirectory(lapack-netlib/TESTING) +endif() + set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION} SOVERSION ${OpenBLAS_MAJOR_VERSION} From 60188a8c82398281794956f41c3e7232f0004532 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 Jul 2020 11:44:31 +0200 Subject: [PATCH 035/349] Append crude hack for enabling lapack tests in the OpenBLAS build --- lapack-netlib/TESTING/CMakeLists.txt | 393 +++++++++++++++++++++++++++ 1 file changed, 393 insertions(+) diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index d5ca950131..755826bfe8 100644 --- 
a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -1,3 +1,5 @@ +enable_testing() + if(MSVC_VERSION) # string(REPLACE "/STACK:10000000" "/STACK:900000000000000000" # CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") @@ -168,3 +170,394 @@ if(PYTHONINTERP_FOUND) COMMAND ${PYTHON_EXECUTABLE} "lapack_testing.py" ) endif() + + + +# $1 exec, $2 input, $3 output_result (stdout+stderr of $1 is captured into $3 so the FATAL scan below has something to read) +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh +"rm -f $3\n" +"$1 < $2 2>&1 | tee $3\n" +"grep -q FATAL $3\n" +"if [ $? -eq 0 ]; then\n" +"echo Error\n" +"exit 1\n" +"else\n" +"exit 0\n" +"fi\n" +) + + +add_test(NAME "REAL_LAPACK_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out" +) +add_test(NAME "COMPLEX_LAPACK_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out" +) +add_test(NAME "DOUBLE_PRECISION_LAPACK_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out" +) +add_test(NAME "COMPLEX16_LAPACK_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out" +) + +add_test(NAME "SINGLE-DOUBLE_PRECISION_LAPACK_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" "${CMAKE_CURRENT_BINARY_DIR}/dstest.out" +) +# ======== COMPLEX-COMPLEX16 LIN TESTS ======================== +
+add_test(NAME "Testing_COMPLEX-COMPLEX16_LAPACK_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" "${CMAKE_CURRENT_BINARY_DIR}/zctest.out" +) + +# ======== SINGLE RFP LIN TESTS ======================== + +add_test(NAME "Testing_REAL_LAPACK_RFP_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out" +) + +# ======== DOUBLE RFP LIN TESTS ======================== + +add_test(NAME "Testing_DOUBLE_PRECISION_LAPACK_RFP_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out" +) +# ======== COMPLEX RFP LIN TESTS ======================== + +add_test(NAME "Testing_COMPLEX_LAPACK_RFP_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out" +) + +# ======== COMPLEX16 RFP LIN TESTS ======================== + +add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out" +) +# +# +# ======== SINGLE EIG TESTS =========================== +# + +add_test(NAME "SNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in"
"${CMAKE_CURRENT_BINARY_DIR}/snep.out" +) + +add_test(NAME "SSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" "${CMAKE_CURRENT_BINARY_DIR}/ssep.out" +) + +add_test(NAME "SSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" "${CMAKE_CURRENT_BINARY_DIR}/sse2.out" +) + +add_test(NAME "SSVD:_Testing_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" "${CMAKE_CURRENT_BINARY_DIR}/ssvd.out" +) + +add_test(NAME "SSEC:_Testing_REAL_Eigen_Condition_Routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" "${CMAKE_CURRENT_BINARY_DIR}/sec.out" +) + +add_test(NAME "SSEV:_Testing_REAL_Nonsymmetric_Eigenvalue_Driver" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" "${CMAKE_CURRENT_BINARY_DIR}/sed.out" +) + +add_test(NAME "SGG:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" "${CMAKE_CURRENT_BINARY_DIR}/sgg.out" +) + +add_test(NAME "SGD:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" "${CMAKE_CURRENT_BINARY_DIR}/sgd.out" +) + +add_test(NAME
"SSB:_Testing_REAL_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" "${CMAKE_CURRENT_BINARY_DIR}/ssb.out" +) + +add_test(NAME "SSG:_Testing_REAL_Symmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" "${CMAKE_CURRENT_BINARY_DIR}/ssg.out" +) + +add_test(NAME "SGEBAL:_Testing_the_balancing_of_a_REAL_general_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" "${CMAKE_CURRENT_BINARY_DIR}/sbal.out" +) + +add_test(NAME "SGEBAK:_Testing_the_back_transformation_of_a_REAL_balanced_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" "${CMAKE_CURRENT_BINARY_DIR}/sbak.out" +) + +add_test(NAME "SGGBAL:_Testing_the_balancing_of_a_pair_of_REAL_general_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" "${CMAKE_CURRENT_BINARY_DIR}/sgbal.out" +) + +add_test(NAME "SGGBAK:_Testing_the_back_transformation_of_a_pair_of_REAL_balanced_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" "${CMAKE_CURRENT_BINARY_DIR}/sgbak.out" +) + +add_test(NAME "SBB:_Testing_banded_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" "${CMAKE_CURRENT_BINARY_DIR}/sbb.out" +) + +add_test(NAME
"SGLM:_Testing_Generalized_Linear_Regression_Model_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" "${CMAKE_CURRENT_BINARY_DIR}/sglm.out" +) + +add_test(NAME "SGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" "${CMAKE_CURRENT_BINARY_DIR}/sgqr.out" +) + +add_test(NAME "SGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out" +) + +add_test(NAME "SCSD:_Testing_CS_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" "${CMAKE_CURRENT_BINARY_DIR}/scsd.out" +) + +add_test(NAME "SLSE:_Testing_Constrained_Linear_Least_Squares_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" "${CMAKE_CURRENT_BINARY_DIR}/slse.out" +) + +# ======== COMPLEX EIG TESTS =========================== + +add_test(NAME "CNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" "${CMAKE_CURRENT_BINARY_DIR}/cnep.out" +) + +add_test(NAME "CSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" "${CMAKE_CURRENT_BINARY_DIR}/csep.out" +) + +add_test(NAME
"CSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" "${CMAKE_CURRENT_BINARY_DIR}/cse2.out" +) + +add_test(NAME "CSVD:_Testing_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" "${CMAKE_CURRENT_BINARY_DIR}/csvd.out" +) + +add_test(NAME "CEC:_Testing_COMPLEX_Eigen_Condition_Routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" "${CMAKE_CURRENT_BINARY_DIR}/cec.out" +) + +add_test(NAME "CES:_Testing_COMPLEX_Nonsymmetric_Schur_Form_Driver" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" "${CMAKE_CURRENT_BINARY_DIR}/ced.out" +) + +add_test(NAME "CGG:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" "${CMAKE_CURRENT_BINARY_DIR}/cgg.out" +) + +add_test(NAME "CGD:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" "${CMAKE_CURRENT_BINARY_DIR}/cgd.out" +) + +add_test(NAME "CHB:_Testing_Hermitian_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" "${CMAKE_CURRENT_BINARY_DIR}/csb.out" +) + +add_test(NAME "CSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh
"${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out" +) + +add_test(NAME "CGEBAL:_Testing_the_balancing_of_a_COMPLEX_general_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out" +) + +add_test(NAME "CGEBAK:_Testing_the_back_transformation_of_a_COMPLEX_balanced_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out" +) + +add_test(NAME "CGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out" +) + +add_test(NAME "CGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX_balanced_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out" +) + +add_test(NAME "CBB:_Testing_banded_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out" +) + +add_test(NAME "CGLM:_Testing_Generalized_Linear_Regression_Model_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out" +) + +add_test(NAME "CGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" + COMMAND sh 
"${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out" +) + +add_test(NAME "CGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out" +) + +add_test(NAME "CCSD:_Testing_CS_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out" +) + +add_test(NAME "CLSE:_Testing_Constrained_Linear_Least_Squares_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out" +) + +# ======== DOUBLE EIG TESTS =========================== + +add_test(NAME "DNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out" +) + +add_test(NAME "DSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out" +) + +add_test(NAME "DSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out" +) + +add_test(NAME "DSVD:_Testing_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" 
"${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out" +) + +add_test(NAME "DEC:_Testing_DOUBLE_PRECISION_Eigen_Condition_Routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out" +) + +add_test(NAME "DEV:_Testing_DOUBLE_PRECISION_Nonsymmetric_Eigenvalue_Driver" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out" +) + +add_test(NAME "DGG:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out" +) + +add_test(NAME "DGD:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out" +) + +add_test(NAME "DSB:_Testing_DOUBLE_PRECISION_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out" +) + +add_test(NAME "DSG:_Testing_DOUBLE_PRECISION_Symmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out" +) + +add_test(NAME "DGEBAL:_Testing_the_balancing_of_a_DOUBLE_PRECISION_general_matrix" + COMMAND sh 
"${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out" +) + +add_test(NAME "DGEBAK:_Testing_the_back_transformation_of_a_DOUBLE_PRECISION_balanced_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out" +) + +add_test(NAME "DGGBAL:_Testing_the_balancing_of_a_pair_of_DOUBLE_PRECISION_general_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out" +) + +add_test(NAME "DGGBAK:_Testing_the_back_transformation_of_a_pair_of_DOUBLE_PRECISION_balanced_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out" +) + +add_test(NAME "DBB:_Testing_banded_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out" +) + +add_test(NAME "DGLM:_Testing_Generalized_Linear_Regression_Model_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out" +) + +add_test(NAME "DGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out" +) + +add_test(NAME "DGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" + COMMAND 
sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out" +) + +add_test(NAME "DCSD:_Testing_CS_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out" +) + +add_test(NAME "DLSE:_Testing_Constrained_Linear_Least_Squares_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out" +) + +# ======== COMPLEX16 EIG TESTS =========================== + +add_test(NAME "ZNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out" +) + +add_test(NAME "ZSEP:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out" +) + +add_test(NAME "ZSE2:_Testing_Symmetric_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out" +) + +add_test(NAME "ZSVD:_Testing_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out" +) + +add_test(NAME "ZEC:_Testing_COMPLEX16_Eigen_Condition_Routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" 
"${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out" +) + +add_test(NAME "ZES:_Testing_COMPLEX16_Nonsymmetric_Schur_Form_Driver" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out" +) + +add_test(NAME "ZGG:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out" +) + +add_test(NAME "ZGD:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out" +) + +add_test(NAME "ZHB:_Testing_Hermitian_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out" +) + +add_test(NAME "ZSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out" +) + +add_test(NAME "ZGEBAL:_Testing_the_balancing_of_a_COMPLEX16_general_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out" +) + +add_test(NAME "ZGEBAK:_Testing_the_back_transformation_of_a_COMPLEX16_balanced_matrix" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" 
"${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out" +) + +add_test(NAME "ZGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out" +) + +add_test(NAME "ZGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX16_balanced_matrices" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out" +) + +add_test(NAME "ZBB:_Testing_banded_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out" +) + +add_test(NAME "ZGLM:_Testing_Generalized_Linear_Regression_Model_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out" +) + +add_test(NAME "ZGQR:_Testing_Generalized_QR_and_RQ_factorization_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out" +) + +add_test(NAME "ZGSV:_Testing_Generalized_Singular_Value_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out" +) + +add_test(NAME "ZCSD:_Testing_CS_Decomposition_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" 
"${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out" +) + +add_test(NAME "Constrained_Linear_Least_Squares_routines" + COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out" +) From 29b5887d5f00bd94478fe84ac4518c4cb0391941 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 Jul 2020 13:12:35 +0200 Subject: [PATCH 036/349] Modify for building with OpenBLAS --- lapack-netlib/TESTING/EIG/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index 20fd25b4ac..70eea84430 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -98,7 +98,7 @@ set(ZEIGTST zchkee.f macro(add_eig_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} tmglib ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) + target_link_libraries(${name} openblas) endmacro() if(BUILD_SINGLE) From c502760befbb25e6a9415dbd6b1e811f711e7cf3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 Jul 2020 13:13:16 +0200 Subject: [PATCH 037/349] Modify for building with OpenBLAS --- lapack-netlib/TESTING/LIN/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index c941d3577b..954cab193c 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -239,7 +239,8 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr macro(add_lin_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} tmglib ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) + target_link_libraries(${name} openblas) +#${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() if(BUILD_SINGLE) From 
f76602474945ce7d5f930080e2b3fd016e945bc9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 Jul 2020 13:44:25 +0200 Subject: [PATCH 038/349] enable fortran for cmake --- lapack-netlib/TESTING/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt index 755826bfe8..80e6b32320 100644 --- a/lapack-netlib/TESTING/CMakeLists.txt +++ b/lapack-netlib/TESTING/CMakeLists.txt @@ -1,3 +1,5 @@ +enable_language(Fortran) + enable_testing() if(MSVC_VERSION) From d4a0299e166b33ed9d61a488018f3e1bb5491d30 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 9 Jul 2020 13:57:27 +0200 Subject: [PATCH 039/349] Do not build lapack-test on MSVC for now (same as with BLAS test) --- CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7cdc4181a0..7e51e7e38c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -233,9 +233,6 @@ if (NOT MSVC AND NOT NOFORTRAN) if(NOT NO_CBLAS) add_subdirectory(ctest) endif() -endif() - -if (NOT NOFORTRAN) add_subdirectory(lapack-netlib/TESTING) endif() From af1e140e35cbdbb4d1f98addf3b817b1369460a3 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 9 Jul 2020 21:46:06 -0500 Subject: [PATCH 040/349] Change minimum gcc version for POWER10 As the MMA patches for POWER10 are backported to gcc10.2, changing the minimum gcc version needed to build OpenBLAS for POWER10. 
--- Makefile.system | 7 ++++++- driver/others/dynamic_power.c | 12 ++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/Makefile.system b/Makefile.system index 1b473c59d9..61ae264bf5 100644 --- a/Makefile.system +++ b/Makefile.system @@ -286,6 +286,8 @@ GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) +GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) # Note that the behavior of -dumpversion is compile-time-configurable for # gcc-7.x and newer. Use -dumpfullversion there ifeq ($(GCCVERSIONGTEQ7),1) @@ -619,9 +621,12 @@ DYNAMIC_CORE += POWER9 else $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif -GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) ifeq ($(GCCVERSIONGTEQ11), 1) DYNAMIC_CORE += POWER10 +else ifeq ($(GCCVERSIONEQ10), 1) +ifeq ($(GCCMINORVERSIONGTEQ2), 1) +DYNAMIC_CORE += POWER10 +endif else $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) 
endif diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index f625b9431f..ca1d42408d 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -6,7 +6,11 @@ extern gotoblas_t gotoblas_POWER8; #if (!defined __GNUC__) || ( __GNUC__ >= 6) extern gotoblas_t gotoblas_POWER9; #endif -#if (!defined __GNUC__) || ( __GNUC__ >= 11) +#if (!defined __GNUC__) || ( __GNUC__ >= 11) \ + || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) +#define HAVE_P10_SUPPORT 1 +#endif +#ifdef HAVE_P10_SUPPORT extern gotoblas_t gotoblas_POWER10; #endif @@ -28,7 +32,7 @@ char *gotoblas_corename(void) { #if (!defined __GNUC__) || ( __GNUC__ >= 6) if (gotoblas == &gotoblas_POWER9) return corename[3]; #endif -#if (!defined __GNUC__) || ( __GNUC__ >= 11) +#ifdef HAVE_P10_SUPPORT if (gotoblas == &gotoblas_POWER10) return corename[4]; #endif return corename[0]; @@ -44,7 +48,7 @@ static gotoblas_t *get_coretype(void) { if (__builtin_cpu_is("power9")) return &gotoblas_POWER9; #endif -#if (!defined __GNUC__) || ( __GNUC__ >= 11) +#ifdef HAVE_P10_SUPPORT if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) return &gotoblas_POWER10; #endif @@ -73,7 +77,7 @@ static gotoblas_t *force_coretype(char * coretype) { #if (!defined __GNUC__) || ( __GNUC__ >= 6) case 3: return (&gotoblas_POWER9); #endif -#if (!defined __GNUC__) || ( __GNUC__ >= 11) +#ifdef HAVE_P10_SUPPORT case 4: return (&gotoblas_POWER10); #endif default: return NULL; From ae3a90f78f7c34a7d53b3650637f5c442b19940c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Jul 2020 18:51:58 +0200 Subject: [PATCH 041/349] merge overwritten part of power10 support --- cpuid_power.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cpuid_power.c b/cpuid_power.c index ed51df2116..8f578d68f8 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -57,6 +57,7 @@ #define CPUTYPE_PPCG4 7 #define CPUTYPE_POWER8 8 #define CPUTYPE_POWER9 9 +#define CPUTYPE_POWER10 10 char *cpuname[] = { "UNKNOWN", From 
5865c7d4d6bc3a5a32a477d181a1568e95b7c167 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 12 Jul 2020 18:59:01 +0200 Subject: [PATCH 042/349] Make 32bit POWER8 use POWER6 kernels for now --- Makefile.system | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile.system b/Makefile.system index 1b473c59d9..e3b644cf37 100644 --- a/Makefile.system +++ b/Makefile.system @@ -109,6 +109,9 @@ endif ifeq ($(TARGET), ARMV8) GETARCH_FLAGS := -DFORCE_ARMV7 endif +ifeq ($(TARGET), POWER8) +GETARCH_FLAGS := -DFORCE_POWER6 +endif endif From bd2498c88643834f49f6d0bc764754631a71ee50 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Jul 2020 18:07:58 +0200 Subject: [PATCH 043/349] Use POWER6 GEMM parameters on 32bit POWER8 --- param.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index e8cf53f0a0..efe0e1096e 100644 --- a/param.h +++ b/param.h @@ -2225,7 +2225,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 65536 #define GEMM_DEFAULT_ALIGN 0x0ffffUL - +#if defined(__32BIT__) +#warning using BINARY32==POWER6 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#else #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 16 @@ -2234,7 +2244,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 - +#endif #define SGEMM_DEFAULT_P 1280UL #define DGEMM_DEFAULT_P 640UL #define CGEMM_DEFAULT_P 640UL From b144423f0f4d91e0f642b4c4c66b1cf919fcae0e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Jul 2020 18:10:12 +0200 Subject: [PATCH 044/349] Do not define USE_TRMM for 32bit POWER8 --- kernel/Makefile.L3 | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index dfdaf5cf4f..1904264bec 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -44,8 +44,10 @@ USE_TRMM = 1 endif ifeq ($(CORE), POWER8) +ifeq ($(BINARY64),1) USE_TRMM = 1 endif +endif ifeq ($(CORE), POWER9) USE_TRMM = 1 @@ -514,7 +516,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ rm sgemmotcopy.s sgemmotcopy_nomacros.s @@ -530,7 +532,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ rm sgemmitcopy.s sgemmitcopy_nomacros.s From f8c2697701dfbcc3cba307245aab06134c86f53f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Jul 2020 18:11:19 +0200 Subject: [PATCH 045/349] Use POWER6 GEMM, TRMM and DTRSM on 32bit POWER8 --- kernel/power/KERNEL.POWER8 | 84 ++++++++++++++++++++++++++++++-------- 1 file changed, 68 insertions(+), 16 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 
b/kernel/power/KERNEL.POWER8 index 7fba5b4d6c..dc6646d509 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,3 +1,51 @@ +ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) +$(info baue power6) +SGEMMKERNEL = gemm_kernel_power6.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = gemm_ncopy_4.S +SGEMMOTCOPY = gemm_tcopy_4.S +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_power6.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4.S +DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_power6.S +CGEMMINCOPY = ../generic/zgemm_ncopy_2.c +CGEMMITCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_power6.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +DTRSMKERNEL_LN = trsm_kernel_power6_LN.S +DTRSMKERNEL_LT = trsm_kernel_power6_LT.S +DTRSMKERNEL_RN = trsm_kernel_power6_LT.S +DTRSMKERNEL_RT = trsm_kernel_power6_RT.S + +CAXPYKERNEL = zaxpy.S + +else + +$(info baue power8) #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -47,16 +95,21 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = 
zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +#DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +#DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +#DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +#DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -153,15 +206,15 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) -ifneq ($(GCCVERSIONGTEQ9),1) -CAXPYKERNEL = caxpy_power8.S -else -CAXPYKERNEL = caxpy.c -endif -else -CAXPYKERNEL = caxpy.c -endif +#ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +#ifneq ($(GCCVERSIONGTEQ9),1) +#CAXPYKERNEL = caxpy_power8.S +#else +#CAXPYKERNEL = caxpy.c +#endif +#else +#CAXPYKERNEL = caxpy.c +#endif # ZAXPYKERNEL = zaxpy.c # @@ -173,7 +226,7 @@ ZCOPYKERNEL = zcopy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c DSDOTKERNEL = sdot.c -CDOTKERNEL = cdot.c +CDOTKERNEL = ../arm/zdot.c ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c @@ -183,7 +236,7 @@ ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c -CROTKERNEL = crot.c +#CROTKERNEL = crot.c ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c @@ -239,4 +292,3 @@ IDAMINKERNEL = ../arm/iamin.c IZAMAXKERNEL = ../arm/izamax.c IZAMINKERNEL = ../arm/izamin.c endif - From 
da17abec871ed96e1c959eee4ad11a1346d25b2d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 14 Jul 2020 18:20:03 +0200 Subject: [PATCH 046/349] fix trailing whitespace --- kernel/Makefile.L3 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 1904264bec..d5de070a5d 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -516,7 +516,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s m4 sgemmotcopy.s > sgemmotcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ rm sgemmotcopy.s sgemmotcopy_nomacros.s @@ -532,7 +532,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s m4 sgemmitcopy.s > sgemmitcopy_nomacros.s $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ rm sgemmitcopy.s sgemmitcopy_nomacros.s From 417c4e8af8ab1a985ddd8d7fe15cf13d47cd82a3 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 14 Jul 2020 11:54:04 -0500 Subject: [PATCH 047/349] Add new linker option for POWER10 While building with DYNAMIC_ARCH on POWER9 with POWER10 aware toolchain, new LDFLAG is needed to avoid POWER10 instructions on PLT calls . 
--- Makefile.system | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 4f17c25b9d..3312a0be37 100644 --- a/Makefile.system +++ b/Makefile.system @@ -617,6 +617,7 @@ DYNAMIC_CORE += POWER8 ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 DYNAMIC_CORE += POWER10 +override LDFLAGS += -Wl,-no-power10-stubs endif ifeq ($(C_COMPILER), GCC) ifeq ($(GCCVERSIONGT5), 1) @@ -626,9 +627,11 @@ $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif ifeq ($(GCCVERSIONGTEQ11), 1) DYNAMIC_CORE += POWER10 -else ifeq ($(GCCVERSIONEQ10), 1) +override LDFLAGS += -Wl,-no-power10-stubs +else ifeq ($(GCCVERSIONGTEQ10), 1) ifeq ($(GCCMINORVERSIONGTEQ2), 1) DYNAMIC_CORE += POWER10 +override LDFLAGS += -Wl,-no-power10-stubs endif else $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) From f308e741b2cad79196b096fde3aad9b562b1410a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 15 Jul 2020 10:00:07 +0200 Subject: [PATCH 048/349] remove debug output and revert changes to cdot and crot --- kernel/power/KERNEL.POWER8 | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index dc6646d509..bb93a6a230 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,5 +1,4 @@ ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) -$(info baue power6) SGEMMKERNEL = gemm_kernel_power6.S SGEMMINCOPY = SGEMMITCOPY = @@ -45,7 +44,6 @@ CAXPYKERNEL = zaxpy.S else -$(info baue power8) #SGEMM_BETA = ../generic/gemm_beta.c #DGEMM_BETA = ../generic/gemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c @@ -226,7 +224,7 @@ ZCOPYKERNEL = zcopy.c SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c DSDOTKERNEL = sdot.c -CDOTKERNEL = ../arm/zdot.c +CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c # SNRM2KERNEL = ../arm/nrm2.c @@ -236,7 +234,7 @@ ZNRM2KERNEL = ../arm/znrm2.c # SROTKERNEL = srot.c DROTKERNEL = drot.c -#CROTKERNEL = 
crot.c +CROTKERNEL = crot.c ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c From 0033f8be0d8fcc5c8ae9ba8f0cae556297015c81 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 16 Jul 2020 23:32:54 +0200 Subject: [PATCH 049/349] Use vec_vsx_ld/st to fix misaligned accesses flagged by asan --- kernel/power/saxpy.c | 96 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 3 deletions(-) diff --git a/kernel/power/saxpy.c b/kernel/power/saxpy.c index 393cdfadc4..360d641465 100644 --- a/kernel/power/saxpy.c +++ b/kernel/power/saxpy.c @@ -28,6 +28,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +#define offset_0 0 +#define offset_1 16 +#define offset_2 32 +#define offset_3 48 +#define offset_4 64 +#define offset_5 80 +#define offset_6 96 +#define offset_7 112 +#define offset_8 128 +#define offset_9 144 +#define offset_10 160 +#define offset_11 176 +#define offset_12 192 +#define offset_13 208 +#define offset_14 224 +#define offset_15 240 @@ -37,12 +53,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) { BLASLONG i = 0; - __vector float v_a = {alpha,alpha,alpha,alpha}; - __vector float * v_y=(__vector float *)y; - __vector float * v_x=(__vector float *)x; + __vector float v_a __attribute((aligned(16))) = {alpha,alpha,alpha,alpha}; + __vector float * vptr_y =(__vector float *)y; + __vector float * vptr_x =(__vector float *)x; for(; i Date: Thu, 16 Jul 2020 22:17:39 +0000 Subject: [PATCH 050/349] handle lack of fortran compiler more gracefully --- cmake/f_check.cmake | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index f877fc3e1c..1fd6c2ad24 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -21,7 +21,14 @@ # NEED2UNDERSCORES if (NOT NO_LAPACK) - enable_language(Fortran) + check_language(Fortran) + if(CMAKE_Fortran_COMPILER) + enable_language(Fortran) + else() + message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") + set (NOFORTRAN 1) + set (NO_LAPACK 1) + endif() else() include(CMakeForceCompiler) CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) From 9d000ecaa2c888d2e777c7223602e5811858f8a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 16 Jul 2020 22:36:35 +0000 Subject: [PATCH 051/349] include CheckLanguage module --- cmake/f_check.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake index 1fd6c2ad24..0f5d0e15dd 100644 --- a/cmake/f_check.cmake +++ b/cmake/f_check.cmake @@ -21,6 +21,7 @@ # NEED2UNDERSCORES if (NOT NO_LAPACK) + include(CheckLanguage) check_language(Fortran) if(CMAKE_Fortran_COMPILER) enable_language(Fortran) From 26b7f24d164150d80f3672018c836e8a4f20260b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 12:51:37 +0000 Subject: [PATCH 052/349] Update cross-compiling example to reflect change in Loongson gcc for #2723 --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md
b/README.md index 6dc3c7b425..4e5e3e9564 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,10 @@ Examples: ```sh make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A ``` + or same with the newer mips-crosscompiler put out by Loongson that defaults to the 32bit ABI: + ```sh + make HOSTCC=gcc CC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gcc -mabi=64' FC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gfortran -mabi=64' TARGET=LOONGSON3A + ``` * On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler: ```sh From 4afd11dae5c254b3c78cd0fa241fe14305e599dd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 16:57:41 +0000 Subject: [PATCH 053/349] Add a check for C11 atomics and stdatomic.h --- c_check | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/c_check b/c_check index dd700b8b48..314c2b157c 100644 --- a/c_check +++ b/c_check @@ -249,6 +249,28 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { } } +$c11_atomics = 0; +if ($data =~ /HAVE_C11/) { + eval "use File::Temp qw(tempfile)"; + if ($@){ + warn "could not load PERL module File::Temp, so could not check compiler compatibility with C11"; + $c11_atomics = 0; + } else { + ($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 ); + print $tmpf "#include \nint main(void){}\n"; + $args = " -c -o $tmpf.o $tmpf"; + my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); + system(@cmd) == 0; + if ($? 
!= 0) { + $c11_atomics = 0; + } else { + $c11_atomics = 1; + } + unlink("$tmpf.o"); + } +} + + $data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; $data =~ /globl\s([_\.]*)(.*)/; @@ -352,6 +374,8 @@ print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32; print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; +print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; + if ($os eq "LINUX") { From 97d6eb97b15d2ece319da9c741ca13b2976013cc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 16:59:33 +0000 Subject: [PATCH 054/349] Report availability of C11 support --- ctest.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ctest.c b/ctest.c index 5e869b901f..cd84ab1bb9 100644 --- a/ctest.c +++ b/ctest.c @@ -153,3 +153,6 @@ ARCH_ARM ARCH_ARM64 #endif +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) +HAVE_C11 +#endif From 94bab9d1f92325aec79aecc9daacfaef8903d359 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 17:03:31 +0000 Subject: [PATCH 055/349] Update conditional for atomics to use HAVE_C11 --- driver/others/blas_server.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 04b614a6e7..756e51b5dc 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -141,7 +141,7 @@ typedef struct { } thread_status_t; -#if (__STDC_VERSION__ >= 201112L) +#ifdef HAVE_C11 #define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED) #define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) #else From 791e046744116bbf06649ae43adf0febdcebb6a9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 17:05:59 +0000 Subject: [PATCH 056/349] Update conditional for atomics to use HAVE_C11 --- 
driver/others/blas_server_omp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 4255852c81..b4eb27c251 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -55,7 +55,7 @@ int blas_server_avail = 0; static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; #else static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; @@ -320,7 +320,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ while(true) { for(i=0; i < MAX_PARALLEL_NUMBER; i++) { -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 _Bool inuse = false; if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) { #else @@ -345,7 +345,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ exec_threads(&queue[i], buf_index); } -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 atomic_store(&blas_buffer_inuse[buf_index], false); #else blas_buffer_inuse[buf_index] = false; From 09eb9d2584bd978815571b2860f06dedc9f606d2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 17:07:38 +0000 Subject: [PATCH 057/349] Update conditional for atomics to HAVE_C11 --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index a5595aed43..9b6c226a1e 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1095,7 +1095,7 @@ static BLASULONG base_address = 0UL; static BLASULONG base_address = BASE_ADDRESS; #endif -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 static _Atomic int memory_initialized = 0; #else static volatile int memory_initialized = 0; From 6f38de06d2a0ce372c631b01380be58932ec159a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 17:09:01 +0000 Subject: [PATCH 058/349] Update conditional for atomics to use HAVE_C11 --- 
driver/level3/level3_gemm3m_thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c index 9216daaed6..39824fc5ab 100644 --- a/driver/level3/level3_gemm3m_thread.c +++ b/driver/level3/level3_gemm3m_thread.c @@ -91,7 +91,7 @@ #endif typedef struct { -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 _Atomic #else volatile From ce45af8151c96fcb1c75d3985d96c5b64a68f823 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 17:09:56 +0000 Subject: [PATCH 059/349] Update conditional for atomics to use HAVE_C11 --- driver/level3/level3_syrk_threaded.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index 574f825b0f..a041abac31 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -67,7 +67,7 @@ #endif typedef struct { -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 _Atomic #else volatile From a36eb19ae0dfab6714f82abf90b1394012888ff3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 17:13:24 +0000 Subject: [PATCH 060/349] Update conditional for C11 atomics to use HAVE_C11 --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 00b34a3f76..d6637abe49 100644 --- a/common.h +++ b/common.h @@ -681,7 +681,7 @@ __declspec(dllimport) int __cdecl omp_in_parallel(void); __declspec(dllimport) int __cdecl omp_get_num_procs(void); #endif -#if (__STDC_VERSION__ >= 201112L) +#ifdef HAVE_C11 #if defined(C_GCC) && ( __GNUC__ < 7) // workaround for GCC bug 65467 #ifndef _Atomic From f4f74941bd5fadfe3fd662f4da8355f2c6250949 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 17:14:50 +0000 Subject: [PATCH 061/349] Update conditional for atomics to use HAVE_C11 --- lapack/getrf/getrf_parallel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index c602822a85..fc410b0e70 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -68,7 +68,7 @@ double sqrt(double); #define GETRF_FACTOR 1.00 -#if (__STDC_VERSION__ >= 201112L) +#ifdef HAVE_C11 #define atomic_load_long(p) __atomic_load_n(p, __ATOMIC_RELAXED) #define atomic_store_long(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) #else From bbe119ee3bc0393dbc1d3422690c5f628576a3b4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 18 Jul 2020 17:19:59 +0000 Subject: [PATCH 062/349] Update conditional for atomics to use HAVE_C11 --- lapack/getrf/potrf_parallel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack/getrf/potrf_parallel.c b/lapack/getrf/potrf_parallel.c index 3125096850..008fcb8cc0 100644 --- a/lapack/getrf/potrf_parallel.c +++ b/lapack/getrf/potrf_parallel.c @@ -101,7 +101,7 @@ static FLOAT dm1 = -1.; #endif typedef struct { -#if __STDC_VERSION__ >= 201112L +#ifdef HAVE_C11 _Atomic #else volatile From 9e21a100e32059adf102b300d2f52085cc25adb3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 20 Jul 2020 22:52:09 +0000 Subject: [PATCH 063/349] Add trivial check for stdatomic.h --- cmake/system_check.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 94eb0a9c6b..4382ffc4e2 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -116,3 +116,10 @@ set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") endif() file(REMOVE "avx512.c" "avx512.o") endif() + +include(CheckIncludeFile) +CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) +if (HAVE_C11 EQUAL 1) +message (STATUS found stdatomic.h) +set (CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_C11") +endif() From 9ae154ba899c0c2b98a999432c8b480f8ec2da53 Mon Sep 17 00:00:00 2001 From: Wileam Phan Date: Mon, 20 Jul 2020 23:30:28 -0400 Subject: [PATCH 064/349] Patch for building on Summit --- Makefile.power | 43 
++++++++++++++++++++++++++++++++++++++----- Makefile.system | 22 ++++++++++++++++++++++ exports/Makefile | 4 ++++ f_check | 3 +++ 4 files changed, 67 insertions(+), 5 deletions(-) diff --git a/Makefile.power b/Makefile.power index ea84f59458..bf7037995d 100644 --- a/Makefile.power +++ b/Makefile.power @@ -21,23 +21,56 @@ endif ifeq ($(CORE), POWER9) ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +else +CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp +endif +ifneq ($(F_COMPILER), PGI) FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp else -COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math +FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp +endif +else +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math +else +CCOMMON_OPT += -fast -Mvect=simd -Mcache_align +endif +ifneq ($(F_COMPILER), PGI) FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math +else +FCOMMON_OPT += -O2 -Mrecursive +endif endif endif ifeq ($(CORE), POWER8) ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +else +CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp +endif +ifneq ($(F_COMPILER), PGI) FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp else -COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power 
-fno-fast-math +FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp +endif +else +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math +else +CCOMMON_OPT += -fast -Mvect=simd -Mcache_align +endif +ifneq ($(F_COMPILER), PGI) ifeq ($(OSNAME), AIX) FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +else +FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +endif +else +FCOMMON_OPT += -O2 -Mrecursive endif endif endif diff --git a/Makefile.system b/Makefile.system index 3312a0be37..d62c66ad3e 100644 --- a/Makefile.system +++ b/Makefile.system @@ -796,8 +796,19 @@ endif ifeq ($(C_COMPILER), PGI) ifdef BINARY64 +ifeq ($(ARCH), x86_64) CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm else +ifeq ($(ARCH), power) +ifeq ($(CORE), POWER8) +CCOMMON_OPT += -tp pwr8 +endif +ifeq ($(CORE), POWER9) +CCOMMON_OPT += -tp pwr9 +endif +endif +endif +else CCOMMON_OPT += -tp p7 endif endif @@ -960,8 +971,19 @@ ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -i8 endif endif +ifeq ($(ARCH), x86_64) FCOMMON_OPT += -tp p7-64 else +ifeq ($(ARCH), power) +ifeq ($(CORE), POWER8) +FCOMMON_OPT += -tp pwr8 +endif +ifeq ($(CORE), POWER9) +FCOMMON_OPT += -tp pwr9 +endif +endif +endif +else FCOMMON_OPT += -tp p7 endif FCOMMON_OPT += -Mrecursive diff --git a/exports/Makefile b/exports/Makefile index 01a313b357..75901586c6 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -55,6 +55,10 @@ endif endif endif +ifeq ($(C_COMPILER), PGI) +EXTRALIB += -pgf90libs +endif + ifneq (,$(filter 1 2,$(NOFORTRAN))) FEXTRALIB = endif diff --git a/f_check b/f_check index 17d863224c..dd4d3475c9 100644 --- a/f_check +++ b/f_check @@ -82,6 +82,9 @@ if ($compiler eq "") { if ($compiler =~ /flang/) { $vendor = FLANG; $openmp = "-fopenmp"; + } elsif ($compiler =~ /pgf/) { + $vendor = PGI; + $openmp = "-mp"; } else { $vendor = G77; $openmp = ""; From 6c33764ca43c7311bdd61e2371b08395cf3e3f01 Mon Sep 17 
00:00:00 2001 From: Martin Kroeker Date: Wed, 22 Jul 2020 17:30:55 +0000 Subject: [PATCH 065/349] Unify BUFFER_SIZE settings for x86_64 again to fix potentially fatal mismatch in DYNAMIC_ARCH builds --- common_x86_64.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/common_x86_64.h b/common_x86_64.h index 15d0c30aa9..bee7e8cdbd 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -228,14 +228,8 @@ static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){ #define HUGE_PAGESIZE ( 2 << 20) #ifndef BUFFERSIZE -#if defined(SKYLAKEX) -#define BUFFER_SIZE (32 << 21) -#elif defined(HASWELL) || defined(ZEN) #define BUFFER_SIZE (32 << 22) #else -#define BUFFER_SIZE (32 << 20) -#endif -#else #define BUFFER_SIZE (32 << BUFFERSIZE) #endif From 9796e552eaa8dff68bba3bbb45f2039032a1fb99 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 17:03:28 +0200 Subject: [PATCH 066/349] Avoid undefining NAME,CNAME etc for pgcc as it makes it ignore the new definitions --- Makefile.system | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.system b/Makefile.system index d62c66ad3e..cc72c02e89 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1241,7 +1241,9 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH) include $(TOPDIR)/Makefile.$(ARCH) +ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME +endif CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" ifeq ($(CORE), PPC440) From 661c6bfa5a245fdcfd0788d29dff4ce83a508e1e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 17:08:20 +0200 Subject: [PATCH 067/349] Exclude altivec code paths if the compiler does not support them --- kernel/power/casum.c | 2 ++ kernel/power/ccopy.c | 2 ++ kernel/power/cdot.c | 4 ++++ kernel/power/cgemv_n.c | 7 +++++-- kernel/power/cgemv_t.c | 7 +++++-- kernel/power/crot.c | 2 ++ kernel/power/cswap.c | 2 ++
kernel/power/dasum.c | 2 ++ kernel/power/daxpy.c | 2 ++ kernel/power/dcopy.c | 2 ++ kernel/power/ddot.c | 2 ++ kernel/power/dgemv_n.c | 2 ++ kernel/power/dgemv_t.c | 7 ++++++- kernel/power/drot.c | 2 ++ kernel/power/dscal.c | 2 ++ kernel/power/dswap.c | 2 ++ kernel/power/idamax.c | 9 +++++++++ kernel/power/idamin.c | 7 ++++++- kernel/power/izamax.c | 6 +++++- kernel/power/izamin.c | 8 +++++--- kernel/power/sasum.c | 2 ++ kernel/power/saxpy.c | 4 ++++ kernel/power/scopy.c | 2 ++ kernel/power/sdot.c | 3 +++ kernel/power/sgemv_n.c | 4 ++++ kernel/power/sgemv_t.c | 5 +++++ kernel/power/srot.c | 2 ++ kernel/power/sscal.c | 2 ++ kernel/power/sswap.c | 2 ++ kernel/power/zasum.c | 2 ++ kernel/power/zaxpy.c | 2 ++ kernel/power/zcopy.c | 2 ++ kernel/power/zdot.c | 12 +++++++++++- kernel/power/zgemv_n_4.c | 3 +++ kernel/power/zgemv_t_4.c | 3 +++ kernel/power/zrot.c | 5 ++++- kernel/power/zscal.c | 2 ++ kernel/power/zswap.c | 2 ++ 38 files changed, 126 insertions(+), 12 deletions(-) diff --git a/kernel/power/casum.c b/kernel/power/casum.c index 3478a39ef3..06982bfba5 100644 --- a/kernel/power/casum.c +++ b/kernel/power/casum.c @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "casum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/ccopy.c b/kernel/power/ccopy.c index cbe5b48d2f..5e58034dd5 100644 --- a/kernel/power/ccopy.c +++ b/kernel/power/ccopy.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "ccopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c index d5b18729ad..ef5e4710f2 100644 --- a/kernel/power/cdot.c +++ b/kernel/power/cdot.c @@ -23,6 +23,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zdot.c" +#else #include "common.h" #ifndef HAVE_KERNEL_8 @@ -168,3 +171,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA return (result); } +#endif diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c index eec3fa37c8..8663039c57 100644 --- a/kernel/power/cgemv_n.c +++ b/kernel/power/cgemv_n.c @@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *****************************************************************************/ +*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zgemv_n.c" +#else #include #include @@ -591,4 +594,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, return (0); } - +#endif diff --git a/kernel/power/cgemv_t.c b/kernel/power/cgemv_t.c index 691f7a3d3d..1bfc235db5 100644 --- a/kernel/power/cgemv_t.c +++ b/kernel/power/cgemv_t.c @@ -23,7 +23,10 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zgemv_t.c" +#else #include "common.h" @@ -595,4 +598,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, return (0); } - +#endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 5c1d44620d..fb4860dcdf 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -28,6 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) static void crot_kernel_8 (long n, float *x, float *y, float c, float s) { @@ -169,6 +170,7 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) } #endif +#endif int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index 88cb1d6387..5144a2e93a 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "cswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 09e06d9091..999dc677a3 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dasum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c index 018beafd17..2de4e09110 100644 --- a/kernel/power/daxpy.c +++ b/kernel/power/daxpy.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "daxpy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/dcopy.c b/kernel/power/dcopy.c index cf203e71e3..24279f8a2b 100644 --- a/kernel/power/dcopy.c +++ b/kernel/power/dcopy.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dcopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c index bd9e1fb97d..c5493015a9 100644 --- a/kernel/power/ddot.c +++ b/kernel/power/ddot.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "ddot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c index b4dfda5509..ac365b3b2a 100644 --- a/kernel/power/dgemv_n.c +++ b/kernel/power/dgemv_n.c @@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dgemv_n_microk_power8.c" #endif +#endif #define NBMAX 4096 diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c index 5d43f673f2..09abd5a439 100644 --- a/kernel/power/dgemv_t.c +++ b/kernel/power/dgemv_t.c @@ -25,15 +25,19 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_t.c" +#else + #include "common.h" #define NBMAX 1024 //#define PREFETCH 1 + #include #define HAVE_KERNEL4x8_ASM 1 - #if defined(HAVE_KERNEL4x8_ASM) static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { @@ -883,4 +887,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO return (0); } +#endif diff --git a/kernel/power/drot.c b/kernel/power/drot.c index b808ab5665..951c2f9c99 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -40,8 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "drot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 7e0fe48c0f..39293252b0 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dscal_microk_power8.c" #endif +#endif #if !defined(HAVE_KERNEL_8) diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index 795bb10b48..ff3f95c797 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "dswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 195a8c68e7..5016f67dde 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ #include "common.h" #include +#if defined(__VEC__) || defined(__ALTIVEC__) #include +#endif + #if defined(DOUBLE) #define ABS fabs @@ -37,6 +40,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(__VEC__) || defined(__ALTIVEC__) + /** * Find maximum index * Warning: requirements n>0 and n % 32 == 0 @@ -313,6 +318,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { return index; } +#endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; @@ -326,12 +332,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG n1 = n & -32; #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + if (n1 > 0) { max = diamax_kernel_32(n1, x, &maxf); i = n1; } +#endif #endif while (i < n) { if (ABS(x[i]) > maxf) { diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c index 8a5538821a..e37718c485 100644 --- a/kernel/power/idamin.c +++ b/kernel/power/idamin.c @@ -37,6 +37,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#if defined(__VEC__) || defined(__ALTIVEC__) + /** * Find minimum index * Warning: requirements n>0 and n % 32 == 0 @@ -313,7 +315,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { return index; } - +#endif BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { @@ -327,12 +329,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (inc_x == 1) { #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -32; if (n1 > 0) { min = diamin_kernel_32(n1, x, &minf); i = n1; } +#endif #endif while (i < n) { if (ABS(x[i]) < minf) { diff --git a/kernel/power/izamax.c b/kernel/power/izamax.c index 7149da28b8..fe9d5bf955 100644 --- a/kernel/power/izamax.c +++ b/kernel/power/izamax.c @@ -34,6 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#if defined(__VEC__) || defined(__ALTIVEC__) /** * Find maximum index @@ -299,7 +300,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { } - +#endif @@ -317,6 +318,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (inc_x == 1) { #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -16; if (n1 > 0) { @@ -324,6 +327,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; ix = n1 << 1; } +#endif #endif while(i < n) diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c index 692315b891..94f2383e01 100644 --- a/kernel/power/izamin.c +++ b/kernel/power/izamin.c @@ -24,7 +24,6 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" #include @@ -32,6 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ABS fabs #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +#if defined(__VEC__) || defined(__ALTIVEC__) /** * Find minimum index @@ -296,6 +296,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { return index; } +#endif @@ -316,6 +317,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) minf = CABS1(x,0); //index will not be incremented #if defined(_CALL_ELF) && (_CALL_ELF == 2) +#if defined(__VEC__) || defined(__ALTIVEC__) + BLASLONG n1 = n & -16; if (n1 > 0) { @@ -323,6 +326,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; ix = n1 << 1; } +#endif #endif while(i < n) @@ -359,5 +363,3 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } } - - diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index b259d7d76f..7331370121 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "sasum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/saxpy.c b/kernel/power/saxpy.c index 393cdfadc4..d005427b5f 100644 --- a/kernel/power/saxpy.c +++ b/kernel/power/saxpy.c @@ -30,6 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#if defined(__VEC__) || defined(__ALTIVEC__) #ifndef HAVE_KERNEL_8 #include @@ -62,6 +63,7 @@ static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) } } #endif +#endif int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { @@ -74,11 +76,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS { BLASLONG n1 = n & -64; +#if defined(__VEC__) || defined(__ALTIVEC__) if ( n1 ) saxpy_kernel_64(n1, x, y, da); i = n1; +#endif while(i < n) { diff --git a/kernel/power/scopy.c b/kernel/power/scopy.c index 5207d386e4..8ff8cb3295 100644 --- a/kernel/power/scopy.c +++ b/kernel/power/scopy.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "scopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/sdot.c b/kernel/power/sdot.c index 8de434e418..ffeab6638e 100644 --- a/kernel/power/sdot.c +++ b/kernel/power/sdot.c @@ -36,8 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) + #include "sdot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c index 81ac031a34..5dfb18f5b9 100644 --- a/kernel/power/sgemv_n.c +++ b/kernel/power/sgemv_n.c @@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_n.c" +#else #include "common.h" @@ -463,4 +466,5 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO return(0); } +#endif diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c index 3d8a442dc5..62c517a9d6 100644 --- a/kernel/power/sgemv_t.c +++ b/kernel/power/sgemv_t.c @@ -24,6 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/gemv_t.c" + +#else #include "common.h" @@ -477,3 +481,4 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } +#endif diff --git a/kernel/power/srot.c b/kernel/power/srot.c index 9638a59eb8..a53342f612 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -40,8 +40,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "srot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index ddd5b2c5b3..de37e10a55 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "sscal_microk_power8.c" #endif +#endif #if !defined(HAVE_KERNEL_16) diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index a564344448..44522f0a0e 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "sswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_32 diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c index 8383e39ab9..305e50edef 100644 --- a/kernel/power/zasum.c +++ b/kernel/power/zasum.c @@ -47,8 +47,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zasum_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c index 4a7c26c694..3064d54358 100644 --- a/kernel/power/zaxpy.c +++ b/kernel/power/zaxpy.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zaxpy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_4 diff --git a/kernel/power/zcopy.c b/kernel/power/zcopy.c index bb80decd27..453f4e5511 100644 --- a/kernel/power/zcopy.c +++ b/kernel/power/zcopy.c @@ -36,8 +36,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zcopy_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c index 9086ef35bc..6907657973 100644 --- a/kernel/power/zdot.c +++ b/kernel/power/zdot.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zdot_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_8 @@ -93,9 +95,11 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; if ( n <= 0 ) - { + { /* __real__ result = 0.0 ; __imag__ result = 0.0 ; + */ + result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); return(result); } @@ -149,11 +153,17 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in } #if !defined(CONJ) + /* __real__ result = dot[0] - dot[1]; __imag__ result = dot[2] + dot[3]; + */ + result = OPENBLAS_MAKE_COMPLE_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); #else + /* __real__ result = dot[0] + dot[1]; __imag__ result = dot[2] - dot[3]; + */ + result = OPENBLAS_MAKE_COMPLE_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); #endif diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c index ba019d6a5a..1f7199c89c 100644 --- a/kernel/power/zgemv_n_4.c +++ b/kernel/power/zgemv_n_4.c @@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" +#if defined(__VEC__) || defined(__ALTIVEC__) + #define HAVE_KERNEL_4x4_VEC 1 #define HAVE_KERNEL_4x2_VEC 1 #define HAVE_KERNEL_4x1_VEC 1 @@ -37,6 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) #include #endif +#endif // #define NBMAX 4096 diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index b34199af64..4ed27d96b9 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -28,10 +28,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #define NBMAX 4096 +#if defined(__VEC__) || defined(__ALTIVEC__) + #define HAVE_KERNEL_4x4_VEC 1 #define HAVE_KERNEL_4x2_VEC 1 #define HAVE_KERNEL_4x1_VEC 1 +#endif #if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) #include #endif diff --git a/kernel/power/zrot.c b/kernel/power/zrot.c index c6d666178c..5e7ca3b233 100644 --- a/kernel/power/zrot.c +++ b/kernel/power/zrot.c @@ -24,6 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +#if !defined(__VEC__) || !defined(__ALTIVEC__) +#include "../arm/zrot.c" +#else #include "common.h" @@ -262,4 +265,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } - \ No newline at end of file +#endif diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index 16b584bca0..5526f4d67a 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -39,10 +39,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#pragma GCC optimize "O1" #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif #endif +#endif #ifndef HAVE_KERNEL_8 diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index c6508f0321..3a5a8eb833 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -37,8 +37,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) || defined(POWER9) || defined(POWER10) +#if defined(__VEC__) || defined(__ALTIVEC__) #include "zswap_microk_power8.c" #endif +#endif #ifndef HAVE_KERNEL_16 From 7c6e56b5dfa0dee6e39eef9cc17c10ea92c39ac2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 17:10:59 +0200 Subject: [PATCH 068/349] Rewrite assignment to complex for better portability --- kernel/arm/zdot.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index 733c235c64..a9f46dde75 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,10 +48,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; - +/* CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; - +*/ + result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); if ( n < 1 ) return(result); inc_x2 = 2 * inc_x ; @@ -71,8 +72,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } - CREAL(result) = dot[0]; - CIMAG(result) = dot[1]; + /*CREAL(result) = dot[0]; + CIMAG(result) = dot[1];*/ + result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0],dot[1]); return(result); } From 21072e502ae620186dea2293e91b5685906bdc25 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 17:34:56 +0000 Subject: [PATCH 069/349] Typo fix --- kernel/power/zdot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c index 6907657973..fe0e9284ee 100644 --- 
a/kernel/power/zdot.c +++ b/kernel/power/zdot.c @@ -157,13 +157,13 @@ FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG in __real__ result = dot[0] - dot[1]; __imag__ result = dot[2] + dot[3]; */ - result = OPENBLAS_MAKE_COMPLE_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); + result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]); #else /* __real__ result = dot[0] + dot[1]; __imag__ result = dot[2] - dot[3]; */ - result = OPENBLAS_MAKE_COMPLE_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); + result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]); #endif From ca3561cab9d698b7816544a08848306853c17c01 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 18:30:42 +0000 Subject: [PATCH 070/349] Add ifdefs around call to altivec microkernel --- kernel/power/crot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/crot.c b/kernel/power/crot.c index fb4860dcdf..84ba5d913a 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -185,7 +185,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { - +#if defined(__VEC__) || defined(__ALTIVEC__) BLASLONG n1 = n & -8; if ( n1 > 0 ) { @@ -193,7 +193,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT i=n1; ix=2*n1; } - +#endif while(i < n) { temp[0] = c*x[ix] + s*y[ix] ; From bf1f0734ff8c90261bc0f3b0f3887b489a10f8b6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 23 Jul 2020 20:40:13 +0000 Subject: [PATCH 071/349] Use OPENBLAS_MAKE_COMPLEX_FLOAT on PPC only --- kernel/arm/zdot.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index a9f46dde75..ba0e57eb52 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,11 +48,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; -/* +#if !defined(__PPC__) CREAL(result) = 
0.0 ; CIMAG(result) = 0.0 ; -*/ +#else result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); +#endif if ( n < 1 ) return(result); inc_x2 = 2 * inc_x ; @@ -72,9 +73,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } - /*CREAL(result) = dot[0]; - CIMAG(result) = dot[1];*/ +#if !defined(__POWER__) + CREAL(result) = dot[0]; + CIMAG(result) = dot[1]; +#else result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0],dot[1]); +#endif return(result); } From 95d37e15754955f5c73195d2ca09208e99600ab9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 Jul 2020 10:13:46 +0000 Subject: [PATCH 072/349] Regroup the 32 and 64bit sections and restore 64bit CAXPY --- kernel/power/KERNEL.POWER8 | 49 ++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index bb93a6a230..cbf285913e 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -1,3 +1,4 @@ +# Big-endian 32bit (AIX) is supported through the POWER6 GEMM kernels, no separate TRMM ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) SGEMMKERNEL = gemm_kernel_power6.S SGEMMINCOPY = @@ -35,12 +36,6 @@ ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -DTRSMKERNEL_LN = trsm_kernel_power6_LN.S -DTRSMKERNEL_LT = trsm_kernel_power6_LT.S -DTRSMKERNEL_RN = trsm_kernel_power6_LT.S -DTRSMKERNEL_RT = trsm_kernel_power6_RT.S - -CAXPYKERNEL = zaxpy.S else @@ -93,10 +88,6 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = 
../generic/trsm_kernel_RT.c endif STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c @@ -104,10 +95,17 @@ STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -#DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -#DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -#DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -#DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) +DTRSMKERNEL_LN = trsm_kernel_power6_LN.S +DTRSMKERNEL_LT = trsm_kernel_power6_LT.S +DTRSMKERNEL_RN = trsm_kernel_power6_LT.S +DTRSMKERNEL_RT = trsm_kernel_power6_RT.S +else +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -204,15 +202,20 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -#ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) -#ifneq ($(GCCVERSIONGTEQ9),1) -#CAXPYKERNEL = caxpy_power8.S -#else -#CAXPYKERNEL = caxpy.c -#endif -#else -#CAXPYKERNEL = caxpy.c -#endif +ä +ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) +CAXPYKERNEL = zaxpy.S +else +ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) +ifneq ($(GCCVERSIONGTEQ9),1) +CAXPYKERNEL = caxpy_power8.S +else +CAXPYKERNEL = caxpy.c +endif +else +CAXPYKERNEL = caxpy.c +endif +endif # ZAXPYKERNEL = zaxpy.c # From 251a09ec903fb05a93bbd36bd4138a73b330f09a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 24 Jul 2020 16:04:58 +0000 Subject: [PATCH 073/349] Typo fix --- kernel/power/KERNEL.POWER8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index cbf285913e..c2f4cd2046 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -202,7 +202,7 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL 
= saxpy.c DAXPYKERNEL = daxpy.c -ä +# ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) CAXPYKERNEL = zaxpy.S else From 9be2688c78e1646e406e425b4c79e6f82db9f94e Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 24 Jul 2020 23:08:11 -0500 Subject: [PATCH 074/349] Fix to store results in correct order for POWER10 GEMM kernels A recent compiler change in __builtin_mma_disassemble_acc() affects the order in which results are stored on POWER10. Also remove the new LDFLAGS entry -Wl,-no-power10-stubs, as it is handled by the linker automatically. --- Makefile.system | 3 - kernel/power/dgemm_kernel_power10.c | 54 ++++++++-------- kernel/power/sgemm_kernel_power10.c | 94 ++++++++++++++-------------- kernel/power/shgemm_kernel_power10.c | 48 +++++++------- 4 files changed, 98 insertions(+), 101 deletions(-) diff --git a/Makefile.system b/Makefile.system index cc72c02e89..db651ef997 100644 --- a/Makefile.system +++ b/Makefile.system @@ -617,7 +617,6 @@ DYNAMIC_CORE += POWER8 ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 DYNAMIC_CORE += POWER10 -override LDFLAGS += -Wl,-no-power10-stubs endif ifeq ($(C_COMPILER), GCC) ifeq ($(GCCVERSIONGT5), 1) @@ -627,11 +626,9 @@ $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif ifeq ($(GCCVERSIONGTEQ11), 1) DYNAMIC_CORE += POWER10 -override LDFLAGS += -Wl,-no-power10-stubs else ifeq ($(GCCVERSIONGTEQ10), 1) ifeq ($(GCCMINORVERSIONGTEQ2), 1) DYNAMIC_CORE += POWER10 -override LDFLAGS += -Wl,-no-power10-stubs endif else $(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index b3ee301be8..a0bc1a777a 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -27,64 +27,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include -typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); #ifdef TRMMKERNEL #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] = result[2] * alpha; \ - rowC = (v4sf_t *) &CO[2*ldc+J]; \ rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[2] * alpha; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[3] * alpha; #define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] = result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[3] * alpha; #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] = result[2] * alpha; + rowC[0] = result[1] * alpha; #else #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) 
&CO[2*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[2] * alpha; + rowC[0] += result[1] * alpha; #endif #define SET_ACC_ZERO4() \ diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c index 01c122c6d4..81a5ec76b2 100644 --- a/kernel/power/sgemm_kernel_power10.c +++ b/kernel/power/sgemm_kernel_power10.c @@ -27,103 +27,103 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); #if defined(TRMMKERNEL) #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] = result[2] * alpha; \ - rowC = (v4sf_t *) &CO[2*ldc+J]; \ rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[2] * alpha; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[3] * alpha; #define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] = result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[3] * alpha; #define SAVE4x2_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v2sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[6] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v2sf_t *) &CO[1* ldc+J]; \ - rowC[0] = result[4] * alpha; \ - rowC = (v2sf_t *) &CO[2* ldc+J]; \ rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] = result[4] * alpha; \ rowC = (v2sf_t *) &CO[3* ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[6] * alpha; #define SAVE4x2_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void 
*)result, ACC); \ rowC = (v2sf_t *) &CO[4* ldc+J]; \ - rowC[0] = result[6] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v2sf_t *) &CO[5* ldc+J]; \ - rowC[0] = result[4] * alpha; \ - rowC = (v2sf_t *) &CO[6* ldc+J]; \ rowC[0] = result[2] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] = result[4] * alpha; \ rowC = (v2sf_t *) &CO[7* ldc+J]; \ - rowC[0] = result[0] * alpha; + rowC[0] = result[6] * alpha; #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[3] * alpha; \ + rowC[0] = result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] = result[2] * alpha; + rowC[0] = result[1] * alpha; #else #define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[2*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE4x2_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v2sf_t *) 
&CO[0* ldc+J]; \ - rowC[0] += result[6] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v2sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[4] * alpha; \ - rowC = (v2sf_t *) &CO[2* ldc+J]; \ rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] += result[4] * alpha; \ rowC = (v2sf_t *) &CO[3* ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[6] * alpha; #define SAVE4x2_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v2sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[6] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v2sf_t *) &CO[5* ldc+J]; \ - rowC[0] += result[4] * alpha; \ - rowC = (v2sf_t *) &CO[6* ldc+J]; \ rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[4] * alpha; \ rowC = (v2sf_t *) &CO[7* ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[6] * alpha; #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[2] * alpha; + rowC[0] += result[1] * alpha; #endif #define KERNEL(i, j) \ __builtin_mma_xvf32gerpp (&acc0, rowB[i], rowA[j]); \ diff --git a/kernel/power/shgemm_kernel_power10.c b/kernel/power/shgemm_kernel_power10.c index 7455f925c7..1ae9e04bf0 100644 --- a/kernel/power/shgemm_kernel_power10.c +++ b/kernel/power/shgemm_kernel_power10.c @@ -45,7 +45,7 @@ bfloat16tof32 (bfloat16 f16) #define BF16TOF32(x) x #endif -typedef unsigned char vec_t __attribute__ ((vector_size (16))); +typedef __vector unsigned char vec_t; typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); @@ -64,54 +64,54 @@ vector char mask = #define MERGE_LOW(x, y) (vec_t) vec_mergel ((vector short)x, (vector short)y) #define 
SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[2*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[3] * alpha; #define SAVE4x2_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v2sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[6] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v2sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[4] * alpha; \ - rowC = (v2sf_t *) &CO[2* ldc+J]; \ rowC[0] += result[2] * alpha; \ + rowC = (v2sf_t *) &CO[2* ldc+J]; \ + rowC[0] += result[4] * alpha; \ rowC = (v2sf_t *) &CO[3* ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[6] * alpha; #define SAVE4x2_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v2sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[6] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v2sf_t *) &CO[5* ldc+J]; \ - rowC[0] += result[4] * alpha; \ - rowC = (v2sf_t *) &CO[6* ldc+J]; \ rowC[0] += result[2] * alpha; \ + 
rowC = (v2sf_t *) &CO[6* ldc+J]; \ + rowC[0] += result[4] * alpha; \ rowC = (v2sf_t *) &CO[7* ldc+J]; \ - rowC[0] += result[0] * alpha; + rowC[0] += result[6] * alpha; #define MMA __builtin_mma_xvbf16ger2pp #define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc (result, ACC); \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[3] * alpha; \ + rowC[0] += result[0] * alpha; \ rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[2] * alpha; + rowC[0] += result[1] * alpha; #define SET_ACC_ZERO4() \ __builtin_mma_xxsetaccz (&acc0); \ From 4fda217f99f611df04f4dcec8378ee0441fdf6e8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Jul 2020 06:42:39 +0000 Subject: [PATCH 075/349] Delete potrf_parallel.c (moving it to ../potrf) --- lapack/getrf/potrf_parallel.c | 667 ---------------------------------- 1 file changed, 667 deletions(-) delete mode 100644 lapack/getrf/potrf_parallel.c diff --git a/lapack/getrf/potrf_parallel.c b/lapack/getrf/potrf_parallel.c deleted file mode 100644 index 008fcb8cc0..0000000000 --- a/lapack/getrf/potrf_parallel.c +++ /dev/null @@ -1,667 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. 
*/ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include -#include "common.h" - -#ifndef USE_SIMPLE_THREADED_LEVEL3 - -//The array of job_t may overflow the stack. -//Instead, use malloc to alloc job_t. 
-#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD -#define USE_ALLOC_HEAP -#endif - - -static FLOAT dm1 = -1.; - -#ifndef KERNEL_FUNC -#ifndef LOWER -#define KERNEL_FUNC SYRK_KERNEL_U -#else -#define KERNEL_FUNC SYRK_KERNEL_L -#endif -#endif - -#ifndef LOWER -#ifndef COMPLEX -#define TRSM_KERNEL TRSM_KERNEL_LT -#else -#define TRSM_KERNEL TRSM_KERNEL_LC -#endif -#else -#ifndef COMPLEX -#define TRSM_KERNEL TRSM_KERNEL_RN -#else -#define TRSM_KERNEL TRSM_KERNEL_RR -#endif -#endif - -#ifndef CACHE_LINE_SIZE -#define CACHE_LINE_SIZE 8 -#endif - -#ifndef DIVIDE_RATE -#define DIVIDE_RATE 2 -#endif - -#ifndef SWITCH_RATIO -#define SWITCH_RATIO 2 -#endif - -#ifndef LOWER -#define TRANS -#endif - -#ifndef SYRK_LOCAL -#if !defined(LOWER) && !defined(TRANS) -#define SYRK_LOCAL SYRK_UN -#elif !defined(LOWER) && defined(TRANS) -#define SYRK_LOCAL SYRK_UT -#elif defined(LOWER) && !defined(TRANS) -#define SYRK_LOCAL SYRK_LN -#else -#define SYRK_LOCAL SYRK_LT -#endif -#endif - -typedef struct { -#ifdef HAVE_C11 - _Atomic -#else - volatile -#endif - BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; -} job_t; - - -#ifndef KERNEL_OPERATION -#ifndef COMPLEX -#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ - KERNEL_FUNC(M, N, K, ALPHA[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) -#else -#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \ - KERNEL_FUNC(M, N, K, ALPHA[0], ALPHA[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * LDC) * COMPSIZE, LDC, (X) - (Y)) -#endif -#endif - -#ifndef ICOPY_OPERATION -#ifndef TRANS -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); -#else -#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); -#endif -#endif - -#ifndef OCOPY_OPERATION -#ifdef TRANS -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + 
(Y) * (LDA)) * COMPSIZE, LDA, BUFFER); -#else -#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); -#endif -#endif - -#ifndef S -#define S args -> a -#endif -#ifndef A -#define A args -> b -#endif -#ifndef C -#define C args -> c -#endif -#ifndef LDA -#define LDA args -> lda -#endif -#ifndef N -#define N args -> m -#endif -#ifndef K -#define K args -> k -#endif - -static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ - - FLOAT *buffer[DIVIDE_RATE]; - - BLASLONG k, lda; - BLASLONG m_from, m_to; - - FLOAT *alpha; - FLOAT *a, *c; - job_t *job = (job_t *)args -> common; - BLASLONG xxx, bufferside; - - BLASLONG jjs, min_jj; - BLASLONG is, min_i, div_n; - - BLASLONG i, current; - - k = K; - - a = (FLOAT *)A; - c = (FLOAT *)C; - - lda = LDA; - - alpha = (FLOAT *)args -> alpha; - - m_from = range_n[mypos + 0]; - m_to = range_n[mypos + 1]; - -#if 0 - fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld\n", mypos, m_from, m_to); -#endif - - div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - - buffer[0] = (FLOAT *)((((BLASULONG)(sb + k * k * COMPSIZE) + GEMM_ALIGN) & ~GEMM_ALIGN) + GEMM_OFFSET_B); - for (i = 1; i < DIVIDE_RATE; i++) { - buffer[i] = buffer[i - 1] + GEMM_Q * div_n * COMPSIZE; - } - -#ifndef LOWER - TRSM_IUNCOPY(k, k, (FLOAT *)S, lda, 0, sb); -#else - TRSM_OLTCOPY(k, k, (FLOAT *)S, lda, 0, sb); -#endif - - for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) { - - for(jjs = xxx; jjs < MIN(m_to, xxx + div_n); jjs += min_jj){ - - min_jj = MIN(m_to, xxx + div_n) - jjs; - -#ifndef LOWER - if (min_jj > GEMM_UNROLL_MN) min_jj = GEMM_UNROLL_MN; -#else - if (min_jj > GEMM_P) min_jj = GEMM_P; -#endif - -#ifndef LOWER - OCOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); - - TRSM_KERNEL (k, min_jj, k, 
dm1, -#ifdef COMPLEX - ZERO, -#endif - sb, - buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, - a + jjs * lda * COMPSIZE, lda, 0); -#else - ICOPY_OPERATION (k, min_jj, a, lda, 0, jjs, buffer[bufferside] + k * (jjs - xxx) * COMPSIZE); - - TRSM_KERNEL (min_jj, k, k, dm1, -#ifdef COMPLEX - ZERO, -#endif - buffer[bufferside] + k * (jjs - xxx) * COMPSIZE, - sb, - a + jjs * COMPSIZE, lda, 0); -#endif - } - -#ifndef LOWER - for (i = 0; i <= mypos; i++) - job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; -#else - for (i = mypos; i < args -> nthreads; i++) - job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside]; -#endif - - WMB; - } - - min_i = m_to - m_from; - - if (min_i >= GEMM_P * 2) { - min_i = GEMM_P; - } else - if (min_i > GEMM_P) { - min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - } - -#ifndef LOWER - ICOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); -#else - OCOPY_OPERATION(k, min_i, a, lda, 0, m_from, sa); -#endif - - current = mypos; - -#ifndef LOWER - while (current < args -> nthreads) -#else - while (current >= 0) -#endif - { - div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - - for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - - /* thread has to wait */ - if (current != mypos) while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;}; - - KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, - sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], - c, lda, m_from, xxx); - - if (m_from + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; - WMB; - } - } - -#ifndef LOWER - current ++; -#else - current --; -#endif - } - - for(is = m_from + min_i; is < m_to; is += min_i){ - min_i = m_to - is; - - if (min_i >= GEMM_P * 2) { - min_i 
= GEMM_P; - } else - if (min_i > GEMM_P) { - min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - } - -#ifndef LOWER - ICOPY_OPERATION(k, min_i, a, lda, 0, is, sa); -#else - OCOPY_OPERATION(k, min_i, a, lda, 0, is, sa); -#endif - - current = mypos; - -#ifndef LOWER - while (current < args -> nthreads) -#else - while (current >= 0) -#endif - { - div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN; - - for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) { - - KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - xxx, div_n), k, alpha, - sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], - c, lda, is, xxx); - - if (is + min_i >= m_to) { - job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0; - WMB; - } - } -#ifndef LOWER - current ++; -#else - current --; -#endif - } - } - - for (i = 0; i < args -> nthreads; i++) { - if (i != mypos) { - for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { - while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;}; - } - } - } - - return 0; - } - -static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ - - blas_arg_t newarg; - -#ifndef USE_ALLOC_HEAP - job_t job[MAX_CPU_NUMBER]; -#else - job_t * job = NULL; -#endif - - blas_queue_t queue[MAX_CPU_NUMBER]; - - BLASLONG range[MAX_CPU_NUMBER + 100]; - - BLASLONG num_cpu; - - BLASLONG nthreads = args -> nthreads; - - BLASLONG width, i, j, k; - BLASLONG n, n_from, n_to; - int mode, mask; - double dnum; - -#ifndef COMPLEX -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_REAL; - mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_REAL; - mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; -#elif defined(HALF) - mode = BLAS_HALF | BLAS_REAL; - mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; -#else - mode = BLAS_SINGLE | BLAS_REAL; - mask = 
MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; -#endif -#else -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_COMPLEX; - mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_COMPLEX; - mask = MAX(ZGEMM_UNROLL_M, ZGEMM_UNROLL_N) - 1; -#else - mode = BLAS_SINGLE | BLAS_COMPLEX; - mask = MAX(CGEMM_UNROLL_M, CGEMM_UNROLL_N) - 1; -#endif -#endif - - newarg.m = args -> m; - newarg.k = args -> k; - newarg.a = args -> a; - newarg.b = args -> b; - newarg.c = args -> c; - newarg.lda = args -> lda; - newarg.alpha = args -> alpha; - -#ifdef USE_ALLOC_HEAP - job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t)); - if(job==NULL){ - fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__); - exit(1); - } -#endif - - newarg.common = (void *)job; - - n_from = 0; - n_to = args -> m; - -#ifndef LOWER - - range[MAX_CPU_NUMBER] = n_to - n_from; - range[0] = 0; - num_cpu = 0; - i = 0; - n = n_to - n_from; - - dnum = (double)n * (double)n /(double)nthreads; - - while (i < n){ - - if (nthreads - num_cpu > 1) { - - double di = (double)i; - - width = ((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); - - if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1)); - - if ((width > n - i) || (width < mask)) width = n - i; - - } else { - width = n - i; - } - - range[MAX_CPU_NUMBER - num_cpu - 1] = range[MAX_CPU_NUMBER - num_cpu] - width; - - queue[num_cpu].mode = mode; - queue[num_cpu].routine = inner_thread; - queue[num_cpu].args = &newarg; - queue[num_cpu].range_m = NULL; - - queue[num_cpu].sa = NULL; - queue[num_cpu].sb = NULL; - queue[num_cpu].next = &queue[num_cpu + 1]; - - num_cpu ++; - i += width; - } - - for (i = 0; i < num_cpu; i ++) queue[i].range_n = &range[MAX_CPU_NUMBER - num_cpu]; - -#else - - range[0] = 0; - num_cpu = 0; - i = 0; - n = n_to - n_from; - - dnum = (double)n * (double)n /(double)nthreads; - - while (i < n){ - - if (nthreads - num_cpu > 1) { - - double di = (double)i; - - width = 
((((BLASLONG)(sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); - - if ((width > n - i) || (width < mask)) width = n - i; - - } else { - width = n - i; - } - - range[num_cpu + 1] = range[num_cpu] + width; - - queue[num_cpu].mode = mode; - queue[num_cpu].routine = inner_thread; - queue[num_cpu].args = &newarg; - queue[num_cpu].range_m = NULL; - queue[num_cpu].range_n = range; - queue[num_cpu].sa = NULL; - queue[num_cpu].sb = NULL; - queue[num_cpu].next = &queue[num_cpu + 1]; - - num_cpu ++; - i += width; - } - -#endif - - newarg.nthreads = num_cpu; - - if (num_cpu) { - - for (j = 0; j < num_cpu; j++) { - for (i = 0; i < num_cpu; i++) { - for (k = 0; k < DIVIDE_RATE; k++) { - job[j].working[i][CACHE_LINE_SIZE * k] = 0; - } - } - } - - queue[0].sa = sa; - queue[0].sb = sb; - queue[num_cpu - 1].next = NULL; - - exec_blas(num_cpu, queue); - } - -#ifdef USE_ALLOC_HEAP - free(job); -#endif - - return 0; -} - -#endif - -blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG myid) { - - BLASLONG n, bk, i, blocking, lda; - BLASLONG info; - int mode; - blas_arg_t newarg; - FLOAT *a; - FLOAT alpha[2] = { -ONE, ZERO}; - -#ifndef COMPLEX -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_REAL; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_REAL; -#else - mode = BLAS_SINGLE | BLAS_REAL; -#endif -#else -#ifdef XDOUBLE - mode = BLAS_XDOUBLE | BLAS_COMPLEX; -#elif defined(DOUBLE) - mode = BLAS_DOUBLE | BLAS_COMPLEX; -#else - mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif -#endif - - if (args -> nthreads == 1) { -#ifndef LOWER - info = POTRF_U_SINGLE(args, NULL, NULL, sa, sb, 0); -#else - info = POTRF_L_SINGLE(args, NULL, NULL, sa, sb, 0); -#endif - return info; - } - - n = args -> n; - a = (FLOAT *)args -> a; - lda = args -> lda; - - if (range_n) n = range_n[1] - range_n[0]; - - if (n <= GEMM_UNROLL_N * 2) { -#ifndef LOWER - info = POTRF_U_SINGLE(args, NULL, range_n, sa, sb, 0); -#else - info = POTRF_L_SINGLE(args, NULL, range_n, sa, 
sb, 0); -#endif - return info; - } - - newarg.lda = lda; - newarg.ldb = lda; - newarg.ldc = lda; - newarg.alpha = alpha; - newarg.beta = NULL; - newarg.nthreads = args -> nthreads; - - blocking = ((n / 2 + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N; - if (blocking > GEMM_Q) blocking = GEMM_Q; - - for (i = 0; i < n; i += blocking) { - bk = n - i; - if (bk > blocking) bk = blocking; - - newarg.m = bk; - newarg.n = bk; - newarg.a = a + (i + i * lda) * COMPSIZE; - - info = CNAME(&newarg, NULL, NULL, sa, sb, 0); - if (info) return info + i; - - if (n - i - bk > 0) { -#ifndef USE_SIMPLE_THREADED_LEVEL3 - newarg.m = n - i - bk; - newarg.k = bk; -#ifndef LOWER - newarg.b = a + ( i + (i + bk) * lda) * COMPSIZE; -#else - newarg.b = a + ((i + bk) + i * lda) * COMPSIZE; -#endif - newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; - - thread_driver(&newarg, sa, sb); -#else - -#ifndef LOWER - newarg.m = bk; - newarg.n = n - i - bk; - newarg.a = a + (i + i * lda) * COMPSIZE; - newarg.b = a + (i + (i + bk) * lda) * COMPSIZE; - - gemm_thread_n(mode | BLAS_TRANSA_T, - &newarg, NULL, NULL, (void *)TRSM_LCUN, sa, sb, args -> nthreads); - - newarg.n = n - i - bk; - newarg.k = bk; - newarg.a = a + ( i + (i + bk) * lda) * COMPSIZE; - newarg.c = a + ((i + bk) + (i + bk) * lda) * COMPSIZE; - -#if 0 - HERK_THREAD_UC(&newarg, NULL, NULL, sa, sb, 0); -#else - syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T, - &newarg, NULL, NULL, (void *)HERK_UC, sa, sb, args -> nthreads); -#endif -#else - newarg.m = n - i - bk; - newarg.n = bk; - newarg.a = a + (i + i * lda) * COMPSIZE; - newarg.b = a + (i + bk + i * lda) * COMPSIZE; - - gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)TRSM_RCLN, sa, sb, args -> nthreads); - - newarg.n = n - i - bk; - newarg.k = bk; - newarg.a = a + (i + bk + i * lda) * COMPSIZE; - newarg.c = a + (i + bk + (i + bk) * lda) * COMPSIZE; - -#if 0 - HERK_THREAD_LN(&newarg, NULL, NULL, sa, sb, 0); -#else - syrk_thread(mode | 
BLAS_TRANSA_N | BLAS_TRANSB_T | BLAS_UPLO, - &newarg, NULL, NULL, (void *)HERK_LN, sa, sb, args -> nthreads); -#endif -#endif - -#endif - } - } - return 0; -} From f194ad59e1399a7fc99e877a3ec26a8d7ff5c585 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 25 Jul 2020 08:52:24 +0200 Subject: [PATCH 076/349] Use _Atomic instead of volatile where available (file moved from ../getrf) must have misplaced this in ../getrf when I made that change in March 2018 (40160ff) the only changes since then were RFC : Add half precision gemm for bfloat16 in OpenBLAS Rajalakshmi Srinivasaraghavan Rajalakshmi Srinivasaraghavan committed on 14 Apr 2020 as 7ebbb50 Change _STDC_VERSION__ to __STDC_VERSION__ Zhiyong Dang committed on 11 May 2018 as 3716267 --- lapack/potrf/potrf_parallel.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c index e61e8decbf..008fcb8cc0 100644 --- a/lapack/potrf/potrf_parallel.c +++ b/lapack/potrf/potrf_parallel.c @@ -101,7 +101,12 @@ static FLOAT dm1 = -1.; #endif typedef struct { - volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; +#ifdef HAVE_C11 + _Atomic +#else + volatile +#endif + BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE]; } job_t; @@ -375,6 +380,9 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; +#elif defined(HALF) + mode = BLAS_HALF | BLAS_REAL; + mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; #else mode = BLAS_SINGLE | BLAS_REAL; mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; From 4e1be0e4813df72c26d94f8a452611b62576fcf9 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 11 Jun 2020 04:12:49 -0700 Subject: [PATCH 077/349] ARM64: Add THUNDERX3T110 Target --- Makefile.arm64 | 10 ++ Makefile.system | 1 + TargetList.txt | 1 + cmake/arch.cmake | 2 +- cmake/prebuild.cmake | 27 +++++ 
cpuid_arm64.c | 27 ++++- driver/others/dynamic_arm64.c | 8 +- getarch.c | 18 +++ interface/swap.c | 2 +- interface/zswap.c | 2 +- kernel/arm64/KERNEL.THUNDERX3T110 | 184 ++++++++++++++++++++++++++++++ param.h | 29 +++++ 12 files changed, 305 insertions(+), 6 deletions(-) create mode 100644 kernel/arm64/KERNEL.THUNDERX3T110 diff --git a/Makefile.arm64 b/Makefile.arm64 index a7cd82e3aa..1091edfe55 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -56,6 +56,16 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif +ifeq ($(CORE), THUNDERX3T110) +ifeq ($(GCCVERSIONGTEQ10), 1) +CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 +else +CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 +endif +endif + ifeq ($(GCCVERSIONGTEQ9), 1) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 diff --git a/Makefile.system b/Makefile.system index db651ef997..d7e71d00af 100644 --- a/Makefile.system +++ b/Makefile.system @@ -578,6 +578,7 @@ DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 DYNAMIC_CORE += TSV110 DYNAMIC_CORE += EMAG8180 +DYNAMIC_CORE += THUNDERX3T110 endif ifeq ($(ARCH), zarch) diff --git a/TargetList.txt b/TargetList.txt index 4e54e30773..8ea2df9b72 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -96,6 +96,7 @@ FALKOR THUNDERX THUNDERX2T99 TSV110 +THUNDERX3T110 9.System Z: ZARCH_GENERIC diff --git a/cmake/arch.cmake b/cmake/arch.cmake index d56ba99cb6..5388156bc2 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -45,7 +45,7 @@ endif () if (DYNAMIC_ARCH) if (ARM64) - set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1) + set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) endif () if (POWER) diff --git a/cmake/prebuild.cmake 
b/cmake/prebuild.cmake index 30256870ca..e50483a2f9 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -338,6 +338,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(ZGEMM_UNROLL_M 4) set(ZGEMM_UNROLL_N 4) set(SYMV_P 16) + elseif ("${TCORE}" STREQUAL "THUNDERX3T110") + file(APPEND ${TARGET_CONF_TEMP} + "#define THUNDERX3T110\n" + "#define L1_CODE_SIZE\t65536\n" + "#define L1_CODE_LINESIZE\t64\n" + "#define L1_CODE_ASSOCIATIVE\t8\n" + "#define L1_DATA_SIZE\t65536\n" + "#define L1_DATA_LINESIZE\t64\n" + "#define L1_DATA_ASSOCIATIVE\t8\n" + "#define L2_SIZE\t524288\n" + "#define L2_LINESIZE\t64\n" + "#define L2_ASSOCIATIVE\t8\n" + "#define L3_SIZE\t94371840\n" + "#define L3_LINESIZE\t64\n" + "#define L3_ASSOCIATIVE\t32\n" + "#define DTB_DEFAULT_ENTRIES\t64\n" + "#define DTB_SIZE\t4096\n" + "#define ARMV8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 8) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 4) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 16) elseif ("${TCORE}" STREQUAL "TSV110") file(APPEND ${TARGET_CONF_TEMP} "#define ARMV8\n" diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 4103216e6f..6f41be6048 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -40,6 +40,7 @@ // Cavium #define CPU_THUNDERX 7 #define CPU_THUNDERX2T99 8 +#define CPU_THUNDERX3T110 12 //Hisilicon #define CPU_TSV110 9 // Ampere @@ -57,7 +58,8 @@ static char *cpuname[] = { "THUNDERX2T99", "TSV110", "EMAG8180", - "NEOVERSEN1" + "NEOVERSEN1", + "THUNDERX3T110" }; static char *cpuname_lower[] = { @@ -72,7 +74,8 @@ static char *cpuname_lower[] = { "thunderx2t99", "tsv110", "emag8180", - "neoversen1" + "neoversen1", + "thunderx3t110" }; int get_feature(char *search) @@ -158,6 +161,8 @@ int detect(void) return CPU_THUNDERX; else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) return CPU_THUNDERX2T99; + else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, 
"0x0b8")) + return CPU_THUNDERX3T110; // HiSilicon else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) return CPU_TSV110; @@ -372,7 +377,25 @@ void get_cpuconfig(void) printf("#define L2_LINESIZE 64\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); + break; + case CPU_THUNDERX3T110: + printf("#define THUNDERX3T110 \n"); + printf("#define L1_CODE_SIZE 65536 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 8 \n"); + printf("#define L1_DATA_SIZE 32768 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 8 \n"); + printf("#define L2_SIZE 524288 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define L3_SIZE 94371840 \n"); + printf("#define L3_LINESIZE 64 \n"); + printf("#define L3_ASSOCIATIVE 32 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; } get_cpucount(); } diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 11ef2725c9..157b03365b 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -53,10 +53,11 @@ extern gotoblas_t gotoblas_THUNDERX2T99; extern gotoblas_t gotoblas_TSV110; extern gotoblas_t gotoblas_EMAG8180; extern gotoblas_t gotoblas_NEOVERSEN1; +extern gotoblas_t gotoblas_THUNDERX3T110; extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 11 +#define NUM_CORETYPES 12 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -82,6 +83,7 @@ static char *corename[] = { "tsv110", "emag8180", "neoversen1", + "thunderx3t110", "unknown" }; @@ -97,6 +99,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_TSV110) return corename[ 8]; if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; + if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; return 
corename[NUM_CORETYPES]; } @@ -127,6 +130,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 8: return (&gotoblas_TSV110); case 9: return (&gotoblas_EMAG8180); case 10: return (&gotoblas_NEOVERSEN1); + case 11: return (&gotoblas_THUNDERX3T110); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -190,6 +194,8 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_THUNDERX; case 0x0af: // ThunderX2 return &gotoblas_THUNDERX2T99; + case 0x0b8: // ThunderX3 + return &gotoblas_THUNDERX3T110; } break; case 0x48: // HiSilicon diff --git a/getarch.c b/getarch.c index 2cdf772599..51c9a84e51 100644 --- a/getarch.c +++ b/getarch.c @@ -1174,6 +1174,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "EMAG8180" #endif +#ifdef FORCE_THUNDERX3T110 +#define ARMV8 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "THUNDERX3T110" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DTHUNDERX3T110 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ + "-DL3_SIZE=94371840 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "thunderx3t110" +#define CORENAME "THUNDERX3T110" +#else +#endif + #ifdef FORCE_ZARCH_GENERIC #define FORCE #define ARCHITECTURE "ZARCH" diff --git a/interface/swap.c b/interface/swap.c index 17a9868a9c..ea40b1fc2a 100644 --- a/interface/swap.c +++ b/interface/swap.c @@ -42,7 +42,7 @@ #include "functable.h" #endif -#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) // Multithreaded swap gives performance benefits in ThunderX2T99 #else // Disable multi-threading as it does not show 
any performance diff --git a/interface/zswap.c b/interface/zswap.c index 372b15447a..43971b73e0 100644 --- a/interface/zswap.c +++ b/interface/zswap.c @@ -42,7 +42,7 @@ #include "functable.h" #endif -#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) +#if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) // Multithreaded swap gives performance benefits in ThunderX2T99 #else // Disable multi-threading as it does not show any performance diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110 new file mode 100644 index 0000000000..a20d0d4a6d --- /dev/null +++ b/kernel/arm64/KERNEL.THUNDERX3T110 @@ -0,0 +1,184 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = 
zamax.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) 
+CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c + +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c + +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S + +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c + +SNRM2KERNEL = scnrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +#DNRM2KERNEL = dznrm2_thunderx2t99_fast.c +#ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c + + +DDOTKERNEL = dot_thunderx2t99.c +SDOTKERNEL = dot_thunderx2t99.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c +DSDOTKERNEL = dot.S + +ifeq 
($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) +DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) +SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S +endif + +ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) +CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S +endif + +ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) +ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S +endif diff --git a/param.h b/param.h index efe0e1096e..476f237a1f 100644 --- a/param.h +++ b/param.h @@ -2779,6 +2779,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(THUNDERX3T110) + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 320 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #elif defined(NEOVERSEN1) #define SGEMM_DEFAULT_UNROLL_M 16 From d557584b71578620520e7bcdea7e0f029d0a76e7 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Mon, 27 Jul 2020 14:11:07 -0500 Subject: [PATCH 078/349] Fix compilation issues with clang on POWER As gcc defaults to -malign-power, removing that option. Also adding -fno-integrated-as to use GNU assembler for powerpc assembly optimization files. Fixed other compilation errors reported in dgemv_t.c file. 
--- Makefile.power | 26 +++++++++++++------------- kernel/Makefile | 5 +++++ kernel/power/dgemv_t.c | 4 ++-- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/Makefile.power b/Makefile.power index bf7037995d..c1556fe82a 100644 --- a/Makefile.power +++ b/Makefile.power @@ -11,34 +11,34 @@ endif ifeq ($(CORE), POWER10) ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp -FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -DUSE_OPENMP -fno-fast-math -fopenmp else -COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -malign-power -fno-fast-math -FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -malign-power -fno-fast-math +COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif endif ifeq ($(CORE), POWER9) ifeq ($(USE_OPENMP), 1) ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp endif ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -DUSE_OPENMP -fno-fast-math -fopenmp else FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp endif else ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math +CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif 
ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -fno-fast-math else FCOMMON_OPT += -O2 -Mrecursive endif @@ -48,26 +48,26 @@ endif ifeq ($(CORE), POWER8) ifeq ($(USE_OPENMP), 1) ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp endif ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp +FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -DUSE_OPENMP -fno-fast-math -fopenmp else FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp endif else ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math +CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) ifeq ($(OSNAME), AIX) -FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math else -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math endif else FCOMMON_OPT += -O2 -Mrecursive diff --git a/kernel/Makefile b/kernel/Makefile index 9b468a6afe..db3282c050 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,6 +10,11 @@ ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) endif +ifeq ($(ARCH), power) +ifeq ($(C_COMPILER), CLANG) + override CFLAGS += -fno-integrated-as +endif +endif AVX2OPT = ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 diff 
--git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c index 09abd5a439..c07b3c223e 100644 --- a/kernel/power/dgemv_t.c +++ b/kernel/power/dgemv_t.c @@ -359,7 +359,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "stxvd2x 39, %[off], %[y] \n\t" "stxvd2x 40, %[off2], %[y] \n\t" - : [memy] "+m" (*(const double (*)[8])y), + : [memy] "+m" (*(double (*)[8])y), [n] "+&r" (n), [a0] "=b" (a0), [a1] "=&b" (a1), @@ -373,7 +373,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do [off2]"=&b" (off2), [temp] "=&b" (tempR) : [memx] "m" (*(const double (*)[n])x), - [mem_ap] "m" (*(const double (*)[]) ap), + [mem_ap] "m" (*(const double (*)[n*8]) ap), [alpha] "d" (alpha), "[a0]" (ap), [x] "b" (x), From 921ec4e9e2ae5b1d32bcad04a19cdffe06e145c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 27 Jul 2020 19:54:46 +0000 Subject: [PATCH 079/349] Adjust A53 SGEMM parameters to reflect move to 8x8 kernel --- cmake/prebuild.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 30256870ca..ff7715c4b4 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -195,8 +195,13 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS "#define HAVE_VFP\n" "#define HAVE_NEON\n" "#define ARMV8\n") +if ("${TCORE}" STREQUAL "CORTEXA57") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) +else + set(SGEMM_UNROLL_M 8) + set(SGEMM_UNROLL_N 8) +endif set(DGEMM_UNROLL_M 8) set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) From 64e2e4aaf3d396740c0c0b66b5a10baf8fdef167 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 27 Jul 2020 20:19:22 +0000 Subject: [PATCH 080/349] missing braces --- cmake/prebuild.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index ff7715c4b4..4067138b4a 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -198,10 +198,10 @@ if (DEFINED 
CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS if ("${TCORE}" STREQUAL "CORTEXA57") set(SGEMM_UNROLL_M 16) set(SGEMM_UNROLL_N 4) -else +else () set(SGEMM_UNROLL_M 8) set(SGEMM_UNROLL_N 8) -endif +endif () set(DGEMM_UNROLL_M 8) set(DGEMM_UNROLL_N 4) set(CGEMM_UNROLL_M 8) From 200f5c44cc14f356d7dba6af257044016a0573da Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Jul 2020 13:45:23 +0000 Subject: [PATCH 081/349] Add AMD Renoir models and preliminary support for ZEN3 as ZEN2 also remap erroneous family 16 entry to BOBCAT and reclaim erroneous family 25 "Barcelona" for Zen3 --- cpuid_x86.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 356800b781..ea846a392a 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1454,10 +1454,11 @@ int get_cpuname(void){ return CPUTYPE_OPTERON; case 1: case 3: - case 7: - case 10: +// case 7: +// case 10: return CPUTYPE_BARCELONA; case 5: + case 7: return CPUTYPE_BOBCAT; case 6: switch (model) { @@ -1507,6 +1508,8 @@ int get_cpuname(void){ // AMD Ryzen case 8: // AMD Ryzen2 + default: + // Matisse/Renoir and other recent Ryzen2 if(support_avx()) #ifndef NO_AVX2 return CPUTYPE_ZEN; @@ -1516,6 +1519,16 @@ int get_cpuname(void){ else return CPUTYPE_BARCELONA; } + break; + case 10: // Zen3 + if(support_avx()) +#ifndef NO_AVX2 + return CPUTYPE_ZEN; +#else + return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator +#endif + else + return CPUTYPE_BARCELONA; } break; } @@ -2107,7 +2120,7 @@ int get_coretype(void){ return CORE_PILEDRIVER; else return CORE_BARCELONA; //OS don't support AVX. 
- case 5: // New EXCAVATOR + case 5: // New EXCAVATOR if(support_avx()) return CORE_EXCAVATOR; else @@ -2135,12 +2148,14 @@ int get_coretype(void){ } break; } - } else if (exfamily == 8) { + } else if (exfamily == 8 || exfamily == 10) { switch (model) { case 1: // AMD Ryzen case 8: - // Ryzen 2 + // Ryzen 2 + default: + // Matisse,Renoir Ryzen2 models if(support_avx()) #ifndef NO_AVX2 return CORE_ZEN; From 12918358aa52aa9cdc194057d5e4b556933988aa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Jul 2020 13:53:17 +0000 Subject: [PATCH 082/349] Add AMD Renoir/Matisse and preliminary support for Zen3 as Zen2 also support AMD family 22 Jaguar/Puma as Bobcat --- driver/others/dynamic.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index c03b0b21de..5d71b1b2c2 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -656,7 +656,7 @@ static gotoblas_t *get_coretype(void){ if ((exfamily == 0) || (exfamily == 2)) { if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; else return &gotoblas_OPTERON; - } else if (exfamily == 5) { + } else if (exfamily == 5 || exfamily == 7) { return &gotoblas_BOBCAT; } else if (exfamily == 6) { if(model == 1){ @@ -710,7 +710,7 @@ static gotoblas_t *get_coretype(void){ } } } else if (exfamily == 8) { - if (model == 1 || model == 8) { + /* if (model == 1 || model == 8) */ { if(support_avx()) return &gotoblas_ZEN; else{ @@ -718,16 +718,24 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } } - } else if (exfamily == 9) { + } else if (exfamily == 9) { if(support_avx()) return &gotoblas_ZEN; else{ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. 
- } + } + } else if (exfamily == 10) { + if(support_avx()) + return &gotoblas_ZEN; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } }else { return &gotoblas_BARCELONA; } + } } From 5fa581c87e0f3979d0fc70b4ea485fc0d898ffb3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 28 Jul 2020 14:22:41 +0000 Subject: [PATCH 083/349] Put hint to use git develop rather than master branch in README --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4e5e3e9564..f8226f5cb1 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,8 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge ## Installation from Source Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code -using Git from https://github.com/xianyi/OpenBLAS.git. +using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up to date version, be +sure to use the develop branch - master is several years out of date due to a change of maintainership.) Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option. Most can also be given directly on the make or cmake command line. 
From 39724e8128cee3ab49aaa1f508e97bf9f56db61e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 30 Jul 2020 01:14:08 +0200 Subject: [PATCH 084/349] Separate OpenMP handling and allow compilation of Power9 code with older gcc --- Makefile.power | 54 ++++++++++++++++++++++---------------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/Makefile.power b/Makefile.power index c1556fe82a..37a02d6922 100644 --- a/Makefile.power +++ b/Makefile.power @@ -10,54 +10,36 @@ USE_OPENMP = 1 endif ifeq ($(CORE), POWER10) -ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp -FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -DUSE_OPENMP -fno-fast-math -fopenmp -else COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math endif -endif ifeq ($(CORE), POWER9) -ifeq ($(USE_OPENMP), 1) ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp -else -CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp -endif -ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -DUSE_OPENMP -fno-fast-math -fopenmp +CCOMMON_OPT += -Ofast -mvsx -fno-fast-math +ifneq ($(GCCVERSIONGT4), 1) +$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) +CCOMMON_OPT += -mcpu=power8 -mtune=power8 else -FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp +CCOMMON_OPT += -mcpu=power9 -mtune=power9 endif else -ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math -else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -fno-fast-math +FCOMMON_OPT += -O2 -frecursive -fno-fast-math +ifneq ($(GCCVERSIONGT4), 1) +$(warning your compiler is too old 
to fully support POWER9, getting a newer version of gcc is recommended) +FCOMMON_OPT += -mcpu=power8 -mtune=power8 else -FCOMMON_OPT += -O2 -Mrecursive +FCOMMON_OPT += -mcpu=power9 -mtune=power9 endif +else +FCOMMON_OPT += -O2 -Mrecursive endif endif ifeq ($(CORE), POWER8) -ifeq ($(USE_OPENMP), 1) -ifneq ($(C_COMPILER), PGI) -CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp -else -CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp -endif -ifneq ($(F_COMPILER), PGI) -FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -DUSE_OPENMP -fno-fast-math -fopenmp -else -FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp -endif -else ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math else @@ -73,6 +55,18 @@ else FCOMMON_OPT += -O2 -Mrecursive endif endif + +ifeq ($(USE_OPENMP), 1) +ifneq ($(C_COMPILER), PGI) +CCOMMON_OPT += -DUSE_OPENMP -fopenmp +else +CCOMMON_OPT += -DUSE_OPENMP -mp +endif +ifneq ($(F_COMPILER), PGI) +FCOMMON_OPT += -DUSE_OPENMP -fopenmp +else +FCOMMON_OPT += -DUSE_OPENMP -mp +endif endif # workaround for C->FORTRAN ABI violation in LAPACKE From f77b6a83f4c20ca4e4769a999a69b0f47f7f4bb1 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Wed, 29 Jul 2020 18:59:32 -0500 Subject: [PATCH 085/349] dgemv optimization for POWER10 Making use of new vector pair POWER10 instructions in dgemv_n and dgemv_t. Also adding a new block 4x128 to make use of Matrix-Multiply Assist (MMA) feature introduced in POWER ISA v3.1. Tested on simulator and there are no new test failures. 
--- kernel/power/KERNEL.POWER10 | 4 +- kernel/power/dgemv_n_microk_power10.c | 268 ++++++++ kernel/power/dgemv_n_power10.c | 565 +++++++++++++++++ kernel/power/dgemv_t_power10.c | 840 ++++++++++++++++++++++++++ 4 files changed, 1675 insertions(+), 2 deletions(-) create mode 100644 kernel/power/dgemv_n_microk_power10.c create mode 100644 kernel/power/dgemv_n_power10.c create mode 100644 kernel/power/dgemv_t_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 39f5e94145..f390fac61d 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -187,12 +187,12 @@ ZSWAPKERNEL = zswap.c # SGEMVNKERNEL = sgemv_n.c -DGEMVNKERNEL = dgemv_n.c +DGEMVNKERNEL = dgemv_n_power10.c CGEMVNKERNEL = cgemv_n.c ZGEMVNKERNEL = zgemv_n_4.c # SGEMVTKERNEL = sgemv_t.c -DGEMVTKERNEL = dgemv_t.c +DGEMVTKERNEL = dgemv_t_power10.c CGEMVTKERNEL = cgemv_t.c ZGEMVTKERNEL = zgemv_t_4.c diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c new file mode 100644 index 0000000000..4be8a5f9b8 --- /dev/null +++ b/kernel/power/dgemv_n_microk_power10.c @@ -0,0 +1,268 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/30 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_4x4 1 + +static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha) +{ + double *a0; + double *a1; + double *a2; + double *a3; + + __asm__ + ( + "lxvp 40, 0(%10) \n\t" // x0, x1 + XXSPLTD_S(32,%x9,0) // alpha, alpha + + "sldi %6, %13, 3 \n\t" // lda * sizeof (double) + + "xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha + "xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha + + "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda + "add %6, %6, %6 \n\t" // 2 * lda + + XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha + + "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda + "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda + + "dcbt 0, %3 \n\t" + "dcbt 0, %4 \n\t" + "dcbt 
0, %5 \n\t" + "dcbt 0, %6 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "dcbt 0, %2 \n\t" + + "addi %3, %3, 32 \n\t" + "addi %4, %4, 32 \n\t" + "addi %5, %5, 32 \n\t" + "addi %6, %6, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. 
%1, %1, -4 \n\t" + "ble two%= \n\t" + + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "lxvp 40, 0(%3) \n\t" // a0[0], a0[1] + + "xvmaddadp 36, 42, 33 \n\t" + "addi %3, %3, 32 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "lxvp 42, 0(%4) \n\t" // a1[0], a1[1] + + "xvmaddadp 36, 44, 34 \n\t" + "addi %4, %4, 32 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "lxvp 44, 0(%5) \n\t" // a2[0], a2[1] + + "xvmaddadp 36, 46, 35 \n\t" + "addi %5, %5, 32 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "lxvp 46, 0(%6) \n\t" // a3[0], a3[1] + + "addi %6, %6, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. 
%1, %1, -4 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "lxvp 36, 0(%2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 32 \n\t" + "xvmaddadp 37, 41, 32 \n\t" + + "xvmaddadp 36, 42, 33 \n\t" + "xvmaddadp 37, 43, 33 \n\t" + + "xvmaddadp 36, 44, 34 \n\t" + "xvmaddadp 37, 45, 34 \n\t" + + "xvmaddadp 36, 46, 35 \n\t" + "xvmaddadp 37, 47, 35 \n\t" + + "stxvp 36, 0(%2) \n\t" // y0, y1 + + "#n=%1 ap=%8=%12 lda=%13 x=%7=%10 y=%0=%2 alpha=%9 o16=%11\n" + "#a0=%3 a1=%4 a2=%5 a3=%6" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (y), // 2 + "=b" (a0), // 3 + "=b" (a1), // 4 + "=&b" (a2), // 5 + "=&b" (a3) // 6 + : + "m" (*x), + "m" (*ap), + "d" (alpha), // 9 + "r" (x), // 10 + "b" (16), // 11 + "3" (ap), // 12 + "4" (lda) // 13 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" + ); +} diff --git a/kernel/power/dgemv_n_power10.c b/kernel/power/dgemv_n_power10.c new file mode 100644 index 0000000000..ad5f1ba0d9 --- /dev/null +++ b/kernel/power/dgemv_n_power10.c @@ -0,0 +1,565 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); +typedef __vector_pair __attribute__((aligned(8))) vecp_t; + +#include "dgemv_n_microk_power10.c" + +#define MMA(X, APTR, ACC) \ + rX = (vec_t *) & X; \ + rowA = *((vecp_t*)((void*)&APTR)); \ + __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]); + +#define SAVE(ACC, Z) \ + rowC = (v4sf_t *) &y[Z]; \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][1] = result[1][0]; \ + result[2][1] = result[3][0]; \ + rowC[0] += valpha * result[0]; \ + rowC[1] += valpha * result[2]; + +void +dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo, + FLOAT * y, FLOAT alpha) +{ + BLASLONG i, j, tmp; + FLOAT *a0 = a_ptr; + FLOAT *x1 = xo; + vector double valpha = { alpha, alpha }; + v4sf_t *rowC; + __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; + v4sf_t result[4]; + vecp_t rowA; + vec_t *rX; + tmp = (n / 32) * 32; + for (i = 0; i < tmp; i += 32) + { + xo = x1; + a0 = a_ptr; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); + __builtin_mma_xxsetaccz (&acc2); + 
__builtin_mma_xxsetaccz (&acc3); + __builtin_mma_xxsetaccz (&acc4); + __builtin_mma_xxsetaccz (&acc5); + __builtin_mma_xxsetaccz (&acc6); + __builtin_mma_xxsetaccz (&acc7); + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], &acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + 0 + j * lda], &acc0); + MMA (xo[j], a0[i + 4 + j * lda], &acc1); + MMA (xo[j], a0[i + 8 + j * lda], &acc2); + MMA (xo[j], a0[i + 12 + j * lda], &acc3); + MMA (xo[j], a0[i + 16 + j * lda], &acc4); + MMA (xo[j], a0[i + 20 + j * lda], &acc5); + MMA (xo[j], a0[i + 24 + j * lda], 
&acc6); + MMA (xo[j], a0[i + 28 + j * lda], &acc7); + } + xo += 32; + a0 += lda << 5; + SAVE (&acc0, i + 0); + SAVE (&acc1, i + 4); + SAVE (&acc2, i + 8); + SAVE (&acc3, i + 12); + SAVE (&acc4, i + 16); + SAVE (&acc5, i + 20); + SAVE (&acc6, i + 24); + SAVE (&acc7, i + 28); + + } + for (i = tmp; i < n; i += 4) + { + xo = x1; + a0 = a_ptr; + __builtin_mma_xxsetaccz (&acc0); + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + for (j = 0; j < 32; j++) + { + __builtin_prefetch (xo+j); + __builtin_prefetch (a0+i+j+lda); + MMA (xo[j], a0[i + j * lda], &acc0); + } + xo += 32; + a0 += lda << 5; + SAVE (&acc0, i); + } +} + + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_4x4 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT *a_ptr, BLASLONG lda, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + BLASLONG i; + FLOAT x[4] __attribute__ ((aligned (16)));; + FLOAT *a0 = a_ptr; + FLOAT *a1 = a0 + lda; + FLOAT *a2 = a1 + lda; + FLOAT *a3 = a2 + lda; + + + for ( i=0; i<4; i++) + x[i] = xo[i] * alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + BLASLONG i; + FLOAT x[4] __attribute__ ((aligned (16)));; + + for ( i=0; i<2; i++) + x[i] = xo[i] * alpha; 
+ + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; + } +} + + +#endif + +#ifndef HAVE_KERNEL_4x1 + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT alpha) +{ + BLASLONG i; + FLOAT x[4] __attribute__ ((aligned (16)));; + + for ( i=0; i<1; i++) + x[i] = xo[i] * alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0]; + y[i+1] += a0[i+1]*x[0]; + y[i+2] += a0[i+2]*x[0]; + y[i+3] += a0[i+3]*x[0]; + } +} + + +#endif + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + if ( inc_dest != 1 ) + { + for ( i=0; i> 7; + n1 = (n - (n128 * 128)) >> 2; + n2 = (n - (n128 * 128)) & 3; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + for( i = 0; i < n128 ; i++) + { + dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha); + a_ptr += lda128; + x_ptr += 128; + } + + for( i = 0; i < n1 ; i++) + { + dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha); + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + dgemv_kernel_4x2(NB,a_ptr,a_ptr+lda,x_ptr,ybuffer,alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + for( i = 0; i < n128 ; i++) + { + FLOAT xbuffer[128] __attribute__ ((aligned (16))); + BLASLONG j; + for ( j = 0; j < 128 ; j++) + { + xbuffer[j] = x_ptr[0]; + x_ptr += inc_x; + } + dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha); + a_ptr += lda128; + } + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr 
+= inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB,a_ptr,lda,xbuffer,ybuffer,alpha); + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] 
* x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/power/dgemv_t_power10.c b/kernel/power/dgemv_t_power10.c new file mode 100644 index 0000000000..3db4d57853 --- /dev/null +++ b/kernel/power/dgemv_t_power10.c @@ -0,0 +1,840 @@ +/*************************************************************************** +Copyright (c) 2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +#include "common.h" + +#define NBMAX 1024 +//#define PREFETCH 1 +#include + +#define HAVE_KERNEL4x8_ASM 1 + + +#if defined(HAVE_KERNEL4x8_ASM) +static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) { + + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + BLASLONG off2; + BLASLONG tempR; + __asm__( + + "sldi %[temp],%[off], 4 \n\t" // lda * sizeof (double) *2 + "sldi %[off], %[off], 3 \n\t" // lda * sizeof (double) + "xxlxor 34,34,34 \n\t" + "xxlxor 35,34,34 \n\t" + "add %[a2], %[a0], %[temp] \n\t" + "add %[a1], %[a0], %[off] \n\t" + "xxlxor 4,34,34 \n\t" + "xxlxor 5,34,34 \n\t" + "xxlxor 6,34,34 \n\t" + "xxlxor 7,34,34 \n\t" + "add %[a3], %[a2], %[off] \n\t" + "add %[a4], %[a2], %[temp] \n\t" + + "xxlxor 8,34,34 \n\t" + "xxlxor 9,34,34 \n\t" + "add %[a5], %[a3], %[temp] \n\t" + "li %[off],0 \n\t" + "li %[off2],16 \n\t" + + "add %[a6], %[a4], %[temp] \n\t" + "add %[a7], %[a5], %[temp] \n\t" + + + + + "lxvp 32, 0(%[x]) \n\t" + "lxvp 36, 0(%[a0]) \n\t" + "lxvp 38, 0(%[a1]) \n\t" + "lxvp 40, 0(%[a2]) \n\t" + "lxvp 42, 0(%[a3]) \n\t" + "lxvp 44, 0(%[a4]) \n\t" + "lxvp 46, 0(%[a5]) \n\t" + 
"lxvp 48, 0(%[a6]) \n\t" + "lxvp 50, 0(%[a7]) \n\t" +#if defined(PREFETCH) + "li %[temp],896 \n\t" +#endif + "addic. %[n],%[n],-4 \n\t" + + "li %[off],32 \n\t" + + + "ble- two%= \n\t" + + //-------------------------------------------------- + ".align 5 \n\t" + "one%=: \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "addi %[off2], %[off2],32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 32(%[a0]) \n\t" + "lxvp 38, 32(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvp 40, 32(%[a2]) \n\t" + "lxvp 42, 32(%[a3]) \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 32(%[a4]) \n\t" + "lxvp 46, 32(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "addic. %[n],%[n],-4 \n\t" + "lxvp 48, 32(%[a6]) \n\t" + "lxvp 50, 32(%[a7]) \n\t" + "lxvp 32, 32(%[x]) \n\t" + "ble- two%= \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "addi %[off2], %[off2],32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 64(%[a0]) \n\t" + "lxvp 38, 64(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvp 40, 64(%[a2]) \n\t" + "lxvp 42, 64(%[a3]) \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 64(%[a4]) \n\t" + "lxvp 46, 64(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "addic. 
%[n],%[n],-4 \n\t" + "lxvp 48, 64(%[a6]) \n\t" + "lxvp 50, 64(%[a7]) \n\t" + "lxvp 32, 64(%[x]) \n\t" + "ble- two%= \n\t" + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" +#if defined(PREFETCH) + "addi %[temp],%[temp],128 \n\t" +#endif + "addi %[off2], %[off2],32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a0] \n\t" +#endif + + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 96(%[a0]) \n\t" + "lxvp 38, 96(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a1] \n\t" +#endif + "lxvp 40, 96(%[a2]) \n\t" + "lxvp 42, 96(%[a3]) \n\t" + "addi %[off], %[off],32 \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 96(%[a4]) \n\t" + "lxvp 46, 96(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a3] \n\t" +#endif + "lxvp 48, 96(%[a6]) \n\t" + "lxvp 50, 96(%[a7]) \n\t" + "lxvp 32, 96(%[x]) \n\t" + + "addic. 
%[n],%[n],-4 \n\t" + "ble- two%= \n\t" + + "addi %[off2], %[off2],32 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a2] \n\t" +#endif + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a4] \n\t" +#endif + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + +#if defined(PREFETCH) + "dcbt %[temp],%[a5] \n\t" +#endif + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "lxvp 36, 128(%[a0]) \n\t" + "lxvp 38, 128(%[a1]) \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "addi %[off], %[off],32 \n\t" + "lxvp 40, 128(%[a2]) \n\t" + "lxvp 42, 128(%[a3]) \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[a6] \n\t" +#endif + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "lxvp 44, 128(%[a4]) \n\t" + "lxvp 46, 128(%[a5]) \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + +#if defined(PREFETCH) + "dcbt %[temp],%[a7] \n\t" +#endif + "addic. 
%[n],%[n],-4 \n\t" + "lxvp 48, 128(%[a6]) \n\t" + "lxvp 50, 128(%[a7]) \n\t" + "lxvp 32, 128(%[x]) \n\t" +#if defined(PREFETCH) + "dcbt %[temp],%[x] \n\t" +#endif + "addi %[a0], %[a0], 128 \n\t" + "addi %[a1], %[a1], 128 \n\t" + "addi %[a2], %[a2], 128 \n\t" + "addi %[a3], %[a3], 128 \n\t" + "addi %[a4], %[a4], 128 \n\t" + "addi %[a5], %[a5], 128 \n\t" + "addi %[a6], %[a6], 128 \n\t" + "addi %[a7], %[a7], 128 \n\t" + "addi %[x], %[x], 128 \n\t" + "bgt+ one%= \n\t" + ".align 5 \n\t" + "two%=: \n\t" + //-------------------------------------------- + + "xvmaddadp 34,36,32 \n\t" + "xvmaddadp 35,38,32 \n\t" + "xvmaddadp 4,40,32 \n\t" + "xvmaddadp 5,42,32 \n\t" + "xvmaddadp 6,44,32 \n\t" + "xvmaddadp 7,46,32 \n\t" + "xvmaddadp 8,48,32 \n\t" + "xvmaddadp 9,50,32 \n\t" + XXSPLTD_S(36,%x[alpha],0) + "xvmaddadp 34,37,33 \n\t" + "xvmaddadp 35,39,33 \n\t" + "xvmaddadp 4,41,33 \n\t" + "xvmaddadp 5,43,33 \n\t" + "xvmaddadp 6,45,33 \n\t" + "xvmaddadp 7,47,33 \n\t" + "xvmaddadp 8,49,33 \n\t" + "xvmaddadp 9,51,33 \n\t" + + "lxvp 38, 0(%[y]) \n\t" + "lxvp 40, 32(%[y]) \n\t" + + + + XXMRGLD_S(42,35,34) + XXMRGHD_S(43,35,34) + + XXMRGLD_S(44,5,4) + XXMRGHD_S(45,5,4) + + "xvadddp 42,42,43 \n\t" + + XXMRGLD_S(46,7,6) + XXMRGHD_S(47,7,6) + + "xvadddp 44,44,45 \n\t" + + XXMRGLD_S(48,9,8) + XXMRGHD_S(49,9,8) + + "xvadddp 46,46,47 \n\t" + + "xvmaddadp 39,42,36 \n\t" + "xvmaddadp 38,44,36 \n\t" + + "xvadddp 48,48,49 \n\t" + + "xvmaddadp 41,46,36 \n\t" + + "stxvp 38, 0(%[y]) \n\t" + "xvmaddadp 40,48,36 \n\t" + "stxvp 40, 32(%[y]) \n\t" + + : [memy] "+m" (*(double (*)[8])y), + [n] "+&r" (n), + [a0] "=b" (a0), + [a1] "=&b" (a1), + [a2] "=&b" (a2), + [a3] "=&b" (a3), + [a4] "=&b" (a4), + [a5] "=&b" (a5), + [a6] "=&b" (a6), + [a7] "=&b" (a7), + [off] "+&b" (lda), + [off2]"=&b" (off2), + [temp] "=&b" (tempR) + : [memx] "m" (*(const double (*)[n])x), + [mem_ap] "m" (*(const double (*)[n*8]) ap), + [alpha] "d" (alpha), + "[a0]" (ap), + [x] "b" (x), + [y] "b" (y) + : 
"cc","vs4","vs5","vs6","vs7","vs8","vs9" ,"vs32","vs33","vs34","vs35", "vs36", "vs37", "vs38", "vs39", + "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + ); + return; +} +#else +static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i; +#if defined(PREFETCH) + BLASLONG j, c, k; +#endif + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector double *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; + register __vector double temp0 = {0, 0}; + register __vector double temp1 = {0, 0}; + register __vector double temp2 = {0, 0}; + register __vector double temp3 = {0, 0}; + register __vector double temp4 = {0, 0}; + register __vector double temp5 = {0, 0}; + register __vector double temp6 = {0, 0}; + register __vector double temp7 = {0, 0}; + + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + va0 = (__vector double*) a0; + va1 = (__vector double*) a1; + va2 = (__vector double*) a2; + va3 = (__vector double*) a3; + va4 = (__vector double*) a4; + va5 = (__vector double*) a5; + va6 = (__vector double*) a6; + va7 = (__vector double*) a7; + v_x = (__vector double*) x; + +#if defined(PREFETCH) + + c = n >> 1; + + for (j = 0; j < c; j += 64) { + k = (c - j) > 64 ? 
64 : (c - j); + __builtin_prefetch(v_x + 64); + __builtin_prefetch(va0 + 64); + __builtin_prefetch(va1 + 64); + __builtin_prefetch(va2 + 64); + __builtin_prefetch(va3 + 64); + __builtin_prefetch(va4 + 64); + __builtin_prefetch(va5 + 64); + __builtin_prefetch(va6 + 64); + __builtin_prefetch(va7 + 64); + for (i = 0; i < k; i += 2) { +#else + + for (i = 0; i < n/2; i += 2) { +#endif + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i] * va4[i]; + temp5 += v_x[i] * va5[i]; + temp6 += v_x[i] * va6[i]; + temp7 += v_x[i] * va7[i]; + temp0 += v_x[i + 1] * va0[i + 1]; + temp1 += v_x[i + 1] * va1[i + 1]; + temp2 += v_x[i + 1] * va2[i + 1]; + temp3 += v_x[i + 1] * va3[i + 1]; + + temp4 += v_x[i + 1] * va4[i + 1]; + temp5 += v_x[i + 1] * va5[i + 1]; + temp6 += v_x[i + 1] * va6[i + 1]; + temp7 += v_x[i + 1] * va7[i + 1]; + } +#if defined(PREFETCH) + va0 += 64; + va1 += 64; + va2 += 64; + va3 += 64; + va4 += 64; + va5 += 64; + va6 += 64; + va7 += 64; + v_x += 64; + + } +#endif + y[0] += alpha * (temp0[0] + temp0[1]); + y[1] += alpha * (temp1[0] + temp1[1]); + y[2] += alpha * (temp2[0] + temp2[1]); + y[3] += alpha * (temp3[0] + temp3[1]); + + y[4] += alpha * (temp4[0] + temp4[1]); + y[5] += alpha * (temp5[0] + temp5[1]); + y[6] += alpha * (temp6[0] + temp6[1]); + y[7] += alpha * (temp7[0] + temp7[1]); + +} + +#endif + + +static void dgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + BLASLONG i = 0; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + __vector double* va0 = (__vector double*) a0; + __vector double* va1 = (__vector double*) a1; + __vector double* va2 = (__vector double*) a2; + __vector double* va3 = (__vector double*) a3; + __vector double* v_x = (__vector double*) x; + register __vector double temp0 = {0, 0}; + register __vector double temp1 = {0, 0}; + register __vector double temp2 = {0, 0}; + register 
__vector double temp3 = {0, 0}; + register __vector double temp4 = {0, 0}; + register __vector double temp5 = {0, 0}; + register __vector double temp6 = {0, 0}; + register __vector double temp7 = {0, 0}; + + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i]; + temp1 += v_x[i] * va1[i]; + temp2 += v_x[i] * va2[i]; + temp3 += v_x[i] * va3[i]; + temp4 += v_x[i + 1] * va0[i + 1]; + temp5 += v_x[i + 1] * va1[i + 1]; + temp6 += v_x[i + 1] * va2[i + 1]; + temp7 += v_x[i + 1] * va3[i + 1]; + } + + temp0 += temp4; + temp1 += temp5; + temp2 += temp6; + temp3 += temp7; + y[0] += alpha * (temp0[0] + temp0[1]); + y[1] += alpha * (temp1[0] + temp1[1]); + y[2] += alpha * (temp2[0] + temp2[1]); + y[3] += alpha * (temp3[0] + temp3[1]); + +} + + +static void dgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { + + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + __vector double* va0 = (__vector double*) a0; + __vector double* va1 = (__vector double*) a1; + __vector double* v_x = (__vector double*) x; + __vector double temp0 = {0, 0}; + __vector double temp1 = {0, 0}; + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1]; + temp1 += v_x[i] * va1[i] + v_x[i + 1] * va1[i + 1]; + } + + + + y[0] += alpha * (temp0[0] + temp0[1]); + y[inc_y] += alpha * (temp1[0] + temp1[1]); +} + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { + + BLASLONG i; + FLOAT *a0; + a0 = ap; + __vector double* va0 = (__vector double*) a0; + __vector double* v_x = (__vector double*) x; + __vector double temp0 = {0, 0}; + for (i = 0; i < n / 2; i += 2) { + temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1]; + } + + *y += alpha * (temp0[0] + temp0[1]); + +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest++ = *src; + src += inc_src; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, 
FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[8] __attribute__((aligned(16))); + FLOAT *xbuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + xbuffer = buffer; + + n1 = n >> 3; + n2 = n & 7; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if (inc_x != 1) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + BLASLONG lda8 = lda << 3; + + + if (inc_y == 1) { + + for (i = 0; i < n1; i++) { + + dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); + + y_ptr += 8; + a_ptr += lda8; +#if defined(PREFETCH) + __builtin_prefetch(y_ptr+64); +#endif + } + + } else { + + for (i = 0; i < n1; i++) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + ybuffer[4] = 0; + ybuffer[5] = 0; + ybuffer[6] = 0; + ybuffer[7] = 0; + dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + + *y_ptr += ybuffer[4]; + y_ptr += inc_y; + *y_ptr += ybuffer[5]; + y_ptr += inc_y; + *y_ptr += ybuffer[6]; + y_ptr += inc_y; + *y_ptr += ybuffer[7]; + y_ptr += inc_y; + + a_ptr += lda8; + } + + } + + + if (n2 & 4) { + ybuffer[0] = 0; + ybuffer[1] = 0; + ybuffer[2] = 0; + ybuffer[3] = 0; + dgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); + + a_ptr += lda<<2; + + *y_ptr += ybuffer[0]; + y_ptr += inc_y; + *y_ptr += ybuffer[1]; + y_ptr += inc_y; + *y_ptr += ybuffer[2]; + y_ptr += inc_y; + *y_ptr += ybuffer[3]; + y_ptr += inc_y; + } + + if (n2 & 2) { + dgemv_kernel_4x2(NB, lda, 
a_ptr, xbuffer, y_ptr, alpha, inc_y); + a_ptr += lda << 1; + y_ptr += 2 * inc_y; + + } + + if (n2 & 1) { + dgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + a_ptr += lda; + y_ptr += inc_y; + + } + + a += NB; + x += NB * inc_x; + + + } + + if (m3 == 0) return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 3 && inc_y == 1) { + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + aj += 3; + } + + } else { + + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + aj += lda; + } + + } else { + + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; + y_ptr += inc_y; + aj += lda; + } + + } + + } + return (0); + } + + if (m3 == 2) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if (lda == 2 && inc_y == 1) { + + for (j = 0; j 
< (n & -4); j += 4) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; + y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; + y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; + aj += 8; + + } + + for (; j < n; j++) { + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; + aj += 2; + } + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + + for (j = 0; j < (n & -4); j += 4) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; + y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; + y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; + aj += lda4; + } + + for (; j < n; j++) { + + y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; + y_ptr += inc_y; + aj += lda; + } + } + + } + return (0); + + } + + FLOAT xtemp = *x_ptr * alpha; + FLOAT *aj = a_ptr; + y_ptr = y; + if (lda == 1 && inc_y == 1) { + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += aj[j] * xtemp; + y_ptr[j + 1] += aj[j + 1] * xtemp; + y_ptr[j + 2] += aj[j + 2] * xtemp; + y_ptr[j + 3] += aj[j + 3] * xtemp; + } + for (; j < n; j++) { + y_ptr[j] += aj[j] * xtemp; + } + + + } else { + if (inc_y == 1) { + + BLASLONG register lda2 = lda << 1; + BLASLONG register lda4 = lda << 2; + BLASLONG register lda3 = lda2 + lda; + for (j = 0; j < (n & -4); j += 4) { + y_ptr[j] += *aj * xtemp; + y_ptr[j + 1] += *(aj + lda) * xtemp; + y_ptr[j + 2] += *(aj + lda2) * xtemp; + y_ptr[j + 3] += *(aj + lda3) * xtemp; + aj += lda4; + } + + for (; j < n; j++) { + y_ptr[j] += *aj * xtemp; + aj += lda; + } + + } else { + for (j = 0; j < n; j++) { + *y_ptr += *aj * xtemp; + y_ptr += inc_y; + aj += lda; + } + + } + } + + return (0); + +} + From 104aa678b0f4bc4dd9f65959d0b6f1aeb7b6f6d3 Mon Sep 17 00:00:00 2001 
From: Martin Kroeker Date: Thu, 30 Jul 2020 11:40:52 +0200 Subject: [PATCH 086/349] Fix inadvertent version number reversal to 0.3.9.dev caused by #2710 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e51e7e38c..4bef6570c1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 9.dev) +set(OpenBLAS_PATCH_VERSION 10.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 589c74aed38bb7923d6653fa9370b81e4fe95b4a Mon Sep 17 00:00:00 2001 From: Kevin Adler Date: Thu, 30 Jul 2020 20:52:16 -0500 Subject: [PATCH 087/349] Use systemcfg APIs for CPU detection on AIX AIX libc already provides ready access to an integer that contains a bit identifying the CPU it's running on, so there's no need to call a program and grep its output. Additionally, prtconf is not available in the PASE runtime, which provides an AIX emulation layer on the IBM i operating system. The AIX systemcfg.h also provides macro definitions like POWER_8, POWER_9, etc for all the bits defining the CPUs as well as macros like __power_8(), __power_9_andup() that return booleans, but I did not use them. Since these macros depend on the level of the OS in which it is built, they may not be defined and instead the associated hex literals are used directly. 
--- cpuid_power.c | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/cpuid_power.c b/cpuid_power.c index 8f578d68f8..df3dc86686 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -38,6 +38,7 @@ #include #ifdef _AIX +#include #include #endif #ifdef __APPLE__ @@ -137,35 +138,19 @@ int detect(void){ #endif #ifdef _AIX - FILE *infile; - char buffer[512], *p; - - p = (char *)NULL; - infile = popen("prtconf|grep 'Processor Type'", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("Pro", buffer, 3)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - pclose(infile); + // Cast from int to unsigned to ensure comparisons work for all bits in + // the bit mask, even the top bit + unsigned implementation = (unsigned) _system_configuration.implementation; - if (strstr(p, "POWER3")) return CPUTYPE_POWER3; - if (strstr(p, "POWER4")) return CPUTYPE_POWER4; - if (strstr(p, "PPC970")) return CPUTYPE_PPC970; - if (strstr(p, "POWER5")) return CPUTYPE_POWER5; - if (strstr(p, "POWER6")) return CPUTYPE_POWER6; - if (strstr(p, "POWER7")) return CPUTYPE_POWER6; - if (strstr(p, "POWER8")) return CPUTYPE_POWER8; - if (strstr(p, "POWER9")) return CPUTYPE_POWER9; - if (strstr(p, "POWER10")) return CPUTYPE_POWER10; - if (strstr(p, "Cell")) return CPUTYPE_CELL; - if (strstr(p, "7447")) return CPUTYPE_PPCG4; - return CPUTYPE_POWER5; + if (implementation >= 0x40000u) return CPUTYPE_POWER10; + else if (implementation & 0x20000) return CPUTYPE_POWER9; + else if (implementation & 0x10000) return CPUTYPE_POWER8; + else if (implementation & 0x08000) return CPUTYPE_POWER7; // POWER 7 + else if (implementation & 0x04000) return CPUTYPE_POWER6; + else if (implementation & 0x02000) return CPUTYPE_POWER5; + else if (implementation & 0x01000) return CPUTYPE_POWER4; // MPC7450 + else if (implementation & 0x00800) return CPUTYPE_POWER4; + else return CPUTYPE_POWER3; #endif #ifdef 
__APPLE__ From da9e2a7adafc2e0d321e6f2f90beaffed2853372 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 31 Jul 2020 16:03:33 +0200 Subject: [PATCH 088/349] Add SYMBOLPREFIX and/or SYMBOLSUFFIX to cblas prototypes --- Makefile | 3 ++- Makefile.install | 12 ++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index e113026dd5..c1d943facd 100644 --- a/Makefile +++ b/Makefile @@ -365,11 +365,12 @@ clean :: @$(MAKE) -C kernel clean #endif @$(MAKE) -C reference clean - @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h + @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0 ifeq ($(OSNAME), Darwin) @rm -rf getarch.dSYM getarch_2nd.dSYM endif @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib + @rm -f cblas.tmp cblas.tmp2 @touch $(NETLIB_LAPACK_DIR)/make.inc @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h diff --git a/Makefile.install b/Makefile.install index dad869f4cc..12713a6db3 100644 --- a/Makefile.install +++ b/Makefile.install @@ -45,7 +45,16 @@ install : lib.grd ifndef NO_CBLAS @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) - @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" + @cp cblas.h cblas.tmp +ifdef SYMBOLPREFIX + @sed 's/cblas/$(SYMBOLPREFIX)cblas/g' cblas.tmp > cblas.tmp2 + @sed 's/openblas/$(SYMBOLPREFIX)openblas/g' cblas.tmp2 > cblas.tmp +endif +ifdef SYMBOLSUFFIX + @sed 's/(OPENBLAS/$(SYMBOLSUFFIX)(OPENBLAS/g' cblas.tmp > cblas.tmp2 + @sed 's/(void)/$(SYMBOLSUFFIX)(void)/g' cblas.tmp2 > cblas.tmp +endif + @sed 's/common/openblas_config/g' 
cblas.tmp > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif ifneq ($(OSNAME), AIX) @@ -168,4 +177,3 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! - From 60cd5e55fc2b8d50b52ebc54c701cb7315ad74ca Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 Aug 2020 12:31:39 +0200 Subject: [PATCH 089/349] Protect against inadvertent activation of USE_CUDA --- driver/others/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/Makefile b/driver/others/Makefile index 5653f3c25d..7558ec0581 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -47,8 +47,10 @@ endif endif ifdef USE_CUDA +ifeq ($(USE_CUDA), 1) COMMONOBJS += cuda_init.$(SUFFIX) endif +endif ifdef FUNCTION_PROFILE COMMONOBJS += profile.$(SUFFIX) From ecf4b9e0fca35ed15e3b0354002584fbd29a6166 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 1 Aug 2020 17:06:03 +0200 Subject: [PATCH 090/349] Improve substitution rules for SYMBOLPREFIX and -SUFFIX addition --- Makefile.install | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/Makefile.install b/Makefile.install index 12713a6db3..01c0b1226e 100644 --- a/Makefile.install +++ b/Makefile.install @@ -47,12 +47,18 @@ ifndef NO_CBLAS @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @cp cblas.h cblas.tmp ifdef SYMBOLPREFIX - @sed 's/cblas/$(SYMBOLPREFIX)cblas/g' cblas.tmp > cblas.tmp2 - @sed 's/openblas/$(SYMBOLPREFIX)openblas/g' cblas.tmp2 > cblas.tmp + @sed 's/cblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp > cblas.tmp2 + @sed 's/openblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp + #change back any openblas_complex_float and double that got hit + @sed 's/$(SYMBOLPREFIX)openblas_complex_/openblas_complex_/g' cblas.tmp > cblas.tmp2 + @sed 's/goto[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp endif ifdef SYMBOLSUFFIX - @sed 
's/(OPENBLAS/$(SYMBOLSUFFIX)(OPENBLAS/g' cblas.tmp > cblas.tmp2 - @sed 's/(void)/$(SYMBOLSUFFIX)(void)/g' cblas.tmp2 > cblas.tmp + @sed 's/cblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp > cblas.tmp2 + @sed 's/openblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp + #change back any openblas_complex_float and double that got hit + @sed 's/\(openblas_complex_\)\([^ ]*\)$(SYMBOLSUFFIX)/\1\2 /g' cblas.tmp > cblas.tmp2 + @sed 's/goto[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp endif @sed 's/common/openblas_config/g' cblas.tmp > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" endif From 6794ac34153d9def9a1056738090160868417702 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Aug 2020 11:20:08 +0200 Subject: [PATCH 091/349] Add SYMBOLPREFIX and/or -SUFFIX to cblas.h if needed --- CMakeLists.txt | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e51e7e38c..c324e22419 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 9.dev) +set(OpenBLAS_PATCH_VERSION 10.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -249,7 +249,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) endif() endif() -if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") +if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") if (NOT DEFINED ARCH) set(ARCH_IN "x86_64") else() @@ -358,10 +358,21 @@ endif() if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + if (NOT ${SYMBOLPREFIX} STREQUAL "") + 
string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() + if (NOT ${SYMBOLSUFFIX} STREQUAL "") + string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") + string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") + endif() file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() From 53add6a80df77fecac8b2b2e0c81a913a50eda42 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Aug 2020 17:57:12 +0200 Subject: [PATCH 092/349] Apply library name suffix to openblas if any --- lapack-netlib/TESTING/EIG/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt index 70eea84430..e877b14223 100644 --- a/lapack-netlib/TESTING/EIG/CMakeLists.txt +++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt @@ -98,7 +98,7 @@ set(ZEIGTST zchkee.f macro(add_eig_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} openblas) + target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) endmacro() if(BUILD_SINGLE) From aaf1a17168f50ce689b69a87b6643abcd0c1de51 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Aug 2020 17:58:33 +0200 Subject: [PATCH 093/349] Apply current library name suffix --- 
lapack-netlib/TESTING/LIN/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index 954cab193c..0d0bb54185 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -239,7 +239,7 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr macro(add_lin_executable name) add_executable(${name} ${ARGN}) - target_link_libraries(${name} openblas) + target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE}) #${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES}) endmacro() From aa3a1e7d8ce7049605807375fb52331d000cd0cf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Aug 2020 18:22:31 +0200 Subject: [PATCH 094/349] Multiply by two rather than left shift by one place fixes GCC ubsan report of "left shift of negative value -2" in the BLAS tests --- kernel/x86_64/cdot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index 93fca0a0d9..f71d7b6b4a 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -141,8 +141,8 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i=0; ix=0; iy=0; - inc_x <<= 1; - inc_y <<= 1; + inc_x *= 2; + inc_y *= 2; while(i < n) { From aa53a8a5cb8cfadb4b1230c4b4596dec7fcd75ac Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Aug 2020 18:25:09 +0200 Subject: [PATCH 095/349] Multiply by two instead of left-shifting one place fixes GCC ubsan report of "left shift of negative value -2" in the BLAS tests --- kernel/x86_64/zdot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 01169e8e6d..423a6f23e5 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -140,8 +140,8 @@ static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLO i=0; ix=0; iy=0; - inc_x <<= 
1; - inc_y <<= 1; + inc_x *= 2; + inc_y *= 2; while(i < n) { From 0ef4b3f1f2b8c4ea20afbd50c35d29971ea1c3e1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Aug 2020 18:27:40 +0200 Subject: [PATCH 096/349] Multiply instead of doing a left shift of a potentially negative number fixes GCC ubsan report in the BLAS tests --- kernel/x86_64/cgemv_t_4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index 6bdea67871..f44fe72477 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -233,9 +233,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, if ( m < 1 ) return(0); if ( n < 1 ) return(0); - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; + inc_x *= 2; + inc_y *= 2; + lda *= 2; lda4 = lda << 2; xbuffer = buffer; From 81dcfdcf397dd93b03376ea1e17bd7d0d0c7a335 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 2 Aug 2020 18:29:56 +0200 Subject: [PATCH 097/349] Multiply by 2 instead of left-shifting a potentially negative number fixes GCC ubsan warning in the BLAS tests --- kernel/x86_64/zgemv_t_4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 2ab7a671bb..6221471f73 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -235,9 +235,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, if ( m < 1 ) return(0); if ( n < 1 ) return(0); - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; + inc_x *= 2; + inc_y *= 2; + lda <<= 1; lda4 = lda << 2; xbuffer = buffer; From 475b5c95b9ffb6a249bb8d8f99a8b9a6d5ec7441 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 7 Aug 2020 15:27:44 -0500 Subject: [PATCH 098/349] Remove extra symbol in Makefile While trying out different unroll values, noted that make failed due to this extra symbol. 
--- kernel/Makefile.L3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index d5de070a5d..8df306d5f9 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -2351,7 +2351,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY) endif -$(D Date: Sat, 8 Aug 2020 18:05:20 +0200 Subject: [PATCH 099/349] Create Jenkinsfile for OSUOSL PowerCI --- Jenkinsfile | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 Jenkinsfile diff --git a/Jenkinsfile b/Jenkinsfile new file mode 100644 index 0000000000..2b61bed9fb --- /dev/null +++ b/Jenkinsfile @@ -0,0 +1,9 @@ +node { + stage('Checkout') { + checkout + } + + stage('Build') { + sh("make") + } +} From 6f5ca44c1afd3fe39cb3e18e34af7ad733b513e0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Aug 2020 12:59:20 +0200 Subject: [PATCH 100/349] Expand TAU array as SGEMQR/DGEMQR read elements 2 and 3 --- lapack-netlib/TESTING/LIN/derrtsqr.f | 4 +++- lapack-netlib/TESTING/LIN/serrtsqr.f | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/derrtsqr.f b/lapack-netlib/TESTING/LIN/derrtsqr.f index c8ad302571..d1d0ff02d3 100644 --- a/lapack-netlib/TESTING/LIN/derrtsqr.f +++ b/lapack-netlib/TESTING/LIN/derrtsqr.f @@ -77,7 +77,7 @@ SUBROUTINE DERRTSQR( PATH, NUNIT ) * .. * .. Local Arrays .. DOUBLE PRECISION A( NMAX, NMAX ), T( NMAX, NMAX ), W( NMAX ), - $ C( NMAX, NMAX ), TAU(NMAX) + $ C( NMAX, NMAX ), TAU(NMAX*2) * .. * .. External Subroutines .. EXTERNAL ALAESM, CHKXER, DGEQR, @@ -137,6 +137,8 @@ SUBROUTINE DERRTSQR( PATH, NUNIT ) * TAU(1)=1 TAU(2)=1 + TAU(3)=1 + TAU(4)=1 SRNAMT = 'DGEMQR' NB=1 INFOT = 1 diff --git a/lapack-netlib/TESTING/LIN/serrtsqr.f b/lapack-netlib/TESTING/LIN/serrtsqr.f index f00f3e14b3..7f91a3c394 100644 --- a/lapack-netlib/TESTING/LIN/serrtsqr.f +++ b/lapack-netlib/TESTING/LIN/serrtsqr.f @@ -77,7 +77,7 @@ SUBROUTINE SERRTSQR( PATH, NUNIT ) * .. * .. Local Arrays .. 
REAL A( NMAX, NMAX ), T( NMAX, NMAX ), W( NMAX ), - $ C( NMAX, NMAX ), TAU(NMAX) + $ C( NMAX, NMAX ), TAU(NMAX*2) * .. * .. External Subroutines .. EXTERNAL ALAESM, CHKXER, SGEQR, @@ -137,6 +137,8 @@ SUBROUTINE SERRTSQR( PATH, NUNIT ) * TAU(1)=1 TAU(2)=1 + TAU(3)=1 + TAU(4)=1 SRNAMT = 'SGEMQR' NB=1 INFOT = 1 From 64259d521a29514a77eea9ca8681884e7c59eb8f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Aug 2020 13:02:27 +0200 Subject: [PATCH 101/349] Fix use of unallocated array in workspace query and wrong type of argument to xSCAL --- lapack-netlib/TESTING/LIN/cdrvls.f | 6 +++--- lapack-netlib/TESTING/LIN/zdrvls.f | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lapack-netlib/TESTING/LIN/cdrvls.f b/lapack-netlib/TESTING/LIN/cdrvls.f index d24e3885bd..f43c10b721 100644 --- a/lapack-netlib/TESTING/LIN/cdrvls.f +++ b/lapack-netlib/TESTING/LIN/cdrvls.f @@ -372,13 +372,13 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, END IF * Compute workspace needed for CGELSY CALL CGELSY( M, N, NRHS, A, LDA, B, LDB, - $ IWQ, RCOND, CRANK, WQ, -1, RWORK, + $ IWQ, RCOND, CRANK, WQ, -1, RWQ, $ INFO ) LWORK_CGELSY = INT( WQ( 1 ) ) LRWORK_CGELSY = 2*N * Compute workspace needed for CGELSS CALL CGELSS( M, N, NRHS, A, LDA, B, LDB, S, - $ RCOND, CRANK, WQ, -1, RWORK, INFO ) + $ RCOND, CRANK, WQ, -1, RWQ, INFO ) LWORK_CGELSS = INT( WQ( 1 ) ) LRWORK_CGELSS = 5*MNMIN * Compute workspace needed for CGELSD @@ -564,7 +564,7 @@ SUBROUTINE CDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, CALL CLARNV( 2, ISEED, NCOLS*NRHS, $ WORK ) CALL CSCAL( NCOLS*NRHS, - $ ONE / REAL( NCOLS ), WORK, + $ CONE / REAL( NCOLS ), WORK, $ 1 ) END IF CALL CGEMM( TRANS, 'No transpose', NROWS, diff --git a/lapack-netlib/TESTING/LIN/zdrvls.f b/lapack-netlib/TESTING/LIN/zdrvls.f index 4587c56866..1313c853b1 100644 --- a/lapack-netlib/TESTING/LIN/zdrvls.f +++ b/lapack-netlib/TESTING/LIN/zdrvls.f @@ -372,12 +372,12 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, 
NSVAL, NNB, END IF * Compute workspace needed for ZGELSY CALL ZGELSY( M, N, NRHS, A, LDA, B, LDB, IWQ, - $ RCOND, CRANK, WQ, -1, RWORK, INFO ) + $ RCOND, CRANK, WQ, -1, RWQ, INFO ) LWORK_ZGELSY = INT( WQ( 1 ) ) LRWORK_ZGELSY = 2*N * Compute workspace needed for ZGELSS CALL ZGELSS( M, N, NRHS, A, LDA, B, LDB, S, - $ RCOND, CRANK, WQ, -1 , RWORK, + $ RCOND, CRANK, WQ, -1 , RWQ, $ INFO ) LWORK_ZGELSS = INT( WQ( 1 ) ) LRWORK_ZGELSS = 5*MNMIN @@ -564,7 +564,7 @@ SUBROUTINE ZDRVLS( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL, NNB, CALL ZLARNV( 2, ISEED, NCOLS*NRHS, $ WORK ) CALL ZSCAL( NCOLS*NRHS, - $ ONE / DBLE( NCOLS ), WORK, + $ CONE / DBLE( NCOLS ), WORK, $ 1 ) END IF CALL ZGEMM( TRANS, 'No transpose', NROWS, From c9d32674eaa2602184c2719dde15ac3fbebf41b7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 9 Aug 2020 19:17:04 +0200 Subject: [PATCH 102/349] Add memory barrier to the blas_lock implementation for Linux as recommended by cparrott73 in #2760 --- common_power.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_power.h b/common_power.h index aa19794b50..e0685f760e 100644 --- a/common_power.h +++ b/common_power.h @@ -105,6 +105,7 @@ static void INLINE blas_lock(volatile unsigned long *address){ " bne- 1f\n" " stwcx. %2,0, %1\n" " bne- 0b\n" + " isync\n" "1: " : "=&r"(ret) : "r"(address), "r" (val) From e2828e30aa5fc5670d0f4d4d42fc26649a4c3c64 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 11 Aug 2020 12:55:42 +0200 Subject: [PATCH 103/349] s390x: Optimize SGEMM/DGEMM blocks for z14 with explicit loop unrolling/interleaving Improve performance of SGEMM and DGEMM on z14 and z15 by unrolling and interleaving the inner loop of the SGEMM 16x4 and DGEMM 8x4 blocks. Specifically, we explicitly interleave vector register loads and computation of two iterations. 
Note that this change only adds one C function, since SGEMM 16x4 and DGEMM 8x4 actually map to the same C code: they both hold intermediate results in a 4x4 grid of vector registers, and the C implementation is built around that. Signed-off-by: Marius Hillenbrand --- kernel/zarch/gemm_vec.c | 213 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 212 insertions(+), 1 deletion(-) diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index eb6d7700b0..eae2e4d69f 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -249,7 +249,6 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) { #if UNROLL_M == 16 -VECTOR_BLOCK(16, 4) VECTOR_BLOCK(16, 2) VECTOR_BLOCK(16, 1) #endif @@ -257,7 +256,9 @@ VECTOR_BLOCK(16, 1) VECTOR_BLOCK(8, 8) VECTOR_BLOCK(4, 8) #endif +#ifndef DOUBLE VECTOR_BLOCK(8, 4) +#endif VECTOR_BLOCK(8, 2) VECTOR_BLOCK(8, 1) VECTOR_BLOCK(4, 4) @@ -267,8 +268,218 @@ VECTOR_BLOCK(4, 1) #ifdef DOUBLE VECTOR_BLOCK(2, 4) VECTOR_BLOCK(2, 2) +VECTOR_BLOCK(2, 1) +#endif + + +/** + * Calculate a row-block that fits 4x4 vector registers using a loop + * unrolled-by-2 with explicit interleaving to better overlap loads and + * computation. + * This function fits 16x4 blocks for SGEMM and 8x4 blocks for DGEMM. + */ +#ifdef DOUBLE +static inline void GEBP_block_8_4( +#else // float +static inline void GEBP_block_16_4( +#endif + FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B, + FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { +#define VEC_ROWS 4 +#define VEC_COLS 4 +#define ROWS VEC_ROWS * VLEN_FLOATS +#define COLS (VEC_COLS) + + /* + * Hold intermediate results in vector registers. + * Since we need to force the compiler's hand in places, we need to use + * individual variables in contrast to the generic implementation's + * arrays. 
+ */ +#define INIT_ROW_OF_C(ROW) \ + vector_float A##ROW = vec_load_hinted(A + ROW * VLEN_FLOATS); \ + vector_float C_##ROW##_0 = A##ROW * B[0]; \ + vector_float C_##ROW##_1 = A##ROW * B[1]; \ + vector_float C_##ROW##_2 = A##ROW * B[2]; \ + vector_float C_##ROW##_3 = A##ROW * B[3]; + + INIT_ROW_OF_C(0) + INIT_ROW_OF_C(1) + INIT_ROW_OF_C(2) + INIT_ROW_OF_C(3) +#undef INIT_ROW_OF_C + + if (bk > 1) { + BLASLONG k = 1; + vector_float Ak[VEC_ROWS], Aknext[VEC_ROWS]; + vector_float Bk[VEC_COLS], Bknext[VEC_COLS]; + + /* + * Note that in several places, we enforce an instruction + * sequence that we identified empirically by utilizing dummy + * asm statements. + */ + + for (BLASLONG j = 0; j < VEC_COLS; j++) + Bk[j] = vec_splats(B[j + k * COLS]); + asm(""); + + for (BLASLONG i = 0; i < VEC_ROWS; i++) + Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + k * ROWS); + + for (; k < (bk - 2); k += 2) { + /* + * Load inputs for (k+1) into registers. + * Loading from B first is advantageous. + */ + for (BLASLONG j = 0; j < VEC_COLS; j++) + Bknext[j] = vec_splats(B[j + (k + 1) * COLS]); + asm(""); + for (BLASLONG i = 0; i < VEC_ROWS; i++) + Aknext[i] = vec_load_hinted(A + i * VLEN_FLOATS + + (k + 1) * ROWS); + + /* + * To achieve better instruction-level parallelism, + * make sure to first load input data for (k+1) before + * initiating compute for k. We enforce that ordering + * with a pseudo asm statement. + * Note that we need to massage this particular "barrier" + * depending on the gcc version. 
+ */ +#if __GNUC__ > 7 +#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \ + do { \ + asm("" \ + : "+v"(C_0_0), "+v"(C_0_1), "+v"(C_0_2), "+v"(C_0_3), "+v"(C_1_0), \ + "+v"(C_1_1), "+v"(C_1_2), "+v"(C_1_3) \ + : "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \ + "v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \ + "v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \ + asm("" \ + : "+v"(C_2_0), "+v"(C_2_1), "+v"(C_2_2), "+v"(C_2_3), "+v"(C_3_0), \ + "+v"(C_3_1), "+v"(C_3_2), "+v"(C_3_3) \ + : "v"(B##SUFFIX[0]), "v"(B##SUFFIX[1]), "v"(B##SUFFIX[2]), \ + "v"(B##SUFFIX[3]), "v"(A##SUFFIX[0]), "v"(A##SUFFIX[1]), \ + "v"(A##SUFFIX[2]), "v"(A##SUFFIX[3])); \ + } while (0) +#else // __GNUC__ <= 7 +#define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \ + do { \ + asm(""); \ + } while (0) #endif + BARRIER_READ_BEFORE_COMPUTE(knext); + + /* Compute for (k) */ + C_0_0 += Ak[0] * Bk[0]; + C_1_0 += Ak[1] * Bk[0]; + C_2_0 += Ak[2] * Bk[0]; + C_3_0 += Ak[3] * Bk[0]; + + C_0_1 += Ak[0] * Bk[1]; + C_1_1 += Ak[1] * Bk[1]; + C_2_1 += Ak[2] * Bk[1]; + C_3_1 += Ak[3] * Bk[1]; + + C_0_2 += Ak[0] * Bk[2]; + C_1_2 += Ak[1] * Bk[2]; + C_2_2 += Ak[2] * Bk[2]; + C_3_2 += Ak[3] * Bk[2]; + + C_0_3 += Ak[0] * Bk[3]; + C_1_3 += Ak[1] * Bk[3]; + C_2_3 += Ak[2] * Bk[3]; + C_3_3 += Ak[3] * Bk[3]; + + asm(""); + + /* + * Load inputs for (k+2) into registers. + * First load from B. + */ + for (BLASLONG j = 0; j < VEC_COLS; j++) + Bk[j] = vec_splats(B[j + (k + 2) * COLS]); + asm(""); + for (BLASLONG i = 0; i < VEC_ROWS; i++) + Ak[i] = vec_load_hinted(A + i * VLEN_FLOATS + (k + 2) * ROWS); + + /* + * As above, make sure to first schedule the loads for (k+2) + * before compute for (k+1). 
+ */ + BARRIER_READ_BEFORE_COMPUTE(k); + + /* Compute on (k+1) */ + C_0_0 += Aknext[0] * Bknext[0]; + C_1_0 += Aknext[1] * Bknext[0]; + C_2_0 += Aknext[2] * Bknext[0]; + C_3_0 += Aknext[3] * Bknext[0]; + + C_0_1 += Aknext[0] * Bknext[1]; + C_1_1 += Aknext[1] * Bknext[1]; + C_2_1 += Aknext[2] * Bknext[1]; + C_3_1 += Aknext[3] * Bknext[1]; + + C_0_2 += Aknext[0] * Bknext[2]; + C_1_2 += Aknext[1] * Bknext[2]; + C_2_2 += Aknext[2] * Bknext[2]; + C_3_2 += Aknext[3] * Bknext[2]; + + C_0_3 += Aknext[0] * Bknext[3]; + C_1_3 += Aknext[1] * Bknext[3]; + C_2_3 += Aknext[2] * Bknext[3]; + C_3_3 += Aknext[3] * Bknext[3]; + } + + /* Wrapup remaining k's */ + for (; k < bk; k++) { + vector_float Ak; + +#define COMPUTE_WRAPUP_ROW(ROW) \ + Ak = vec_load_hinted(A + ROW * VLEN_FLOATS + k * ROWS); \ + C_##ROW##_0 += Ak * B[0 + k * COLS]; \ + C_##ROW##_1 += Ak * B[1 + k * COLS]; \ + C_##ROW##_2 += Ak * B[2 + k * COLS]; \ + C_##ROW##_3 += Ak * B[3 + k * COLS]; + + COMPUTE_WRAPUP_ROW(0) + COMPUTE_WRAPUP_ROW(1) + COMPUTE_WRAPUP_ROW(2) + COMPUTE_WRAPUP_ROW(3) +#undef COMPUTE_WRAPUP_ROW + } + } + + /* + * Unpack row-block of C_aux into outer C_i, multiply by + * alpha and add up (or assign for TRMM). 
+ */ +#define WRITE_BACK_C(ROW, COL) \ + do { \ + vector_float *Cij = \ + (vector_float *)(C + ROW * VLEN_FLOATS + COL * ldc); \ + if (trmm) { \ + *Cij = alpha * C_##ROW##_##COL; \ + } else { \ + *Cij += alpha * C_##ROW##_##COL; \ + } \ + } while (0) + + WRITE_BACK_C(0, 0); WRITE_BACK_C(0, 1); WRITE_BACK_C(0, 2); WRITE_BACK_C(0, 3); + WRITE_BACK_C(1, 0); WRITE_BACK_C(1, 1); WRITE_BACK_C(1, 2); WRITE_BACK_C(1, 3); + WRITE_BACK_C(2, 0); WRITE_BACK_C(2, 1); WRITE_BACK_C(2, 2); WRITE_BACK_C(2, 3); + WRITE_BACK_C(3, 0); WRITE_BACK_C(3, 1); WRITE_BACK_C(3, 2); WRITE_BACK_C(3, 3); +#undef WRITE_BACK_C + +#undef ROWS +#undef VEC_ROWS +#undef COLS +#undef VEC_COLS +#undef BARRIER_READ_BEFORE_COMPUTE +} + /** * Handle calculation for row blocks in C_i of any size by dispatching into * macro-defined (inline) functions or by deferring to a simple generic From 07c334e7be2f30a07263f0f827cb92fd257704dc Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 11 Aug 2020 12:55:53 +0200 Subject: [PATCH 104/349] s390x: Factor out small block sizes for SGEMM/DGEMM on z14 For small register blockings that are too small to fill up vector registers with column vectors, we currently use a generic code block. Replace that with instantiations of the generic code as individual functions, so that the compiler can optimize each one separately. Signed-off-by: Marius Hillenbrand --- kernel/zarch/gemm_vec.c | 78 +++++++++++++++++++++++++++-------------- 1 file changed, 51 insertions(+), 27 deletions(-) diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index eae2e4d69f..741c094314 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -265,12 +265,58 @@ VECTOR_BLOCK(4, 4) VECTOR_BLOCK(4, 2) VECTOR_BLOCK(4, 1) +/** + * Calculate for a row-block in C_i of size ROWSxCOLS using scalar operations. + * Simple implementation for smaller block sizes + * + * @param[in] A Pointer current block of input matrix A. + * @param[in] k Number of columns in A. 
+ * @param[in] B Pointer current block of input matrix B. + * @param[inout] C Pointer current block of output matrix C. + * @param[in] ldc Offset between elements in adjacent columns in C. + * @param[in] alpha Scalar factor. + */ +#define SCALAR_BLOCK(ROWS, COLS) \ + static inline void GEBP_block_##ROWS##_##COLS( \ + FLOAT const *restrict A, BLASLONG k, FLOAT const *restrict B, \ + FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \ + FLOAT Caux[ROWS][COLS] __attribute__((aligned(16))); \ + \ + /* \ + * Peel off first iteration (i.e., column of A) for \ + * initializing Caux \ + */ \ + for (BLASLONG i = 0; i < ROWS; i++) \ + for (BLASLONG j = 0; j < COLS; j++) Caux[i][j] = A[i] * B[j]; \ + \ + for (BLASLONG kk = 1; kk < k; kk++) \ + for (BLASLONG i = 0; i < ROWS; i++) \ + for (BLASLONG j = 0; j < COLS; j++) \ + Caux[i][j] += A[i + kk * ROWS] * B[j + kk * COLS]; \ + \ + for (BLASLONG i = 0; i < ROWS; i++) \ + for (BLASLONG j = 0; j < COLS; j++) \ + if (trmm) { \ + C[i + j * ldc] = alpha * Caux[i][j]; \ + } else { \ + C[i + j * ldc] += alpha * Caux[i][j]; \ + } \ + } + #ifdef DOUBLE VECTOR_BLOCK(2, 4) VECTOR_BLOCK(2, 2) VECTOR_BLOCK(2, 1) +#else +SCALAR_BLOCK(2, 4) +SCALAR_BLOCK(2, 2) +SCALAR_BLOCK(2, 1) #endif +SCALAR_BLOCK(1, 4) +SCALAR_BLOCK(1, 2) +SCALAR_BLOCK(1, 1) + /** * Calculate a row-block that fits 4x4 vector registers using a loop @@ -526,6 +572,8 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n, } } + /* Dispatch into the implementation for each block size: */ + #define BLOCK(bm, bn) \ if (m == bm && n == bn) { \ GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \ @@ -541,35 +589,11 @@ static inline void GEBP_block(BLASLONG m, BLASLONG n, BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1); BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1); - #ifdef DOUBLE - BLOCK(2, 4); - BLOCK(2, 2); - #endif - -#undef BLOCK + BLOCK(2, 4); BLOCK(2, 2); BLOCK(2, 1); - /* simple implementation for smaller block sizes: */ - FLOAT Caux[m][n] __attribute__ ((aligned (16))); + BLOCK(1, 4); 
BLOCK(1, 2); BLOCK(1, 1); - /* - * Peel off first iteration (i.e., column of A) for initializing Caux - */ - for (BLASLONG i = 0; i < m; i++) - for (BLASLONG j = 0; j < n; j++) - Caux[i][j] = A[i] * B[j]; - - for (BLASLONG kk = 1; kk < k; kk++) - for (BLASLONG i = 0; i < m; i++) - for (BLASLONG j = 0; j < n; j++) - Caux[i][j] += A[i + kk * m] * B[j + kk * n]; - - for (BLASLONG i = 0; i < m; i++) - for (BLASLONG j = 0; j < n; j++) - if (trmm) { - C[i + j * ldc] = alpha * Caux[i][j]; - } else { - C[i + j * ldc] += alpha * Caux[i][j]; - } +#undef BLOCK } /** From e115c97e05889fc2e8edf041cdfd92d00d63a884 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 11 Aug 2020 12:55:59 +0200 Subject: [PATCH 105/349] s390x/SGEMM: adjust default P and Q to multiples of M We recently changed the register blocking for SGEMM on s390x to 16x4. However, we did not adjust Q to a multiple of 16 and thus fell back to the 8x4 kernel at each block's margin, without need. Adjust P and Q to multiples of 16 to employ the faster 16x4 kernel for complete full-sized blocks. 
Signed-off-by: Marius Hillenbrand --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index 476f237a1f..3e539a2b8a 100644 --- a/param.h +++ b/param.h @@ -3092,12 +3092,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 456 +#define SGEMM_DEFAULT_P 480 #define DGEMM_DEFAULT_P 320 #define CGEMM_DEFAULT_P 480 #define ZGEMM_DEFAULT_P 224 -#define SGEMM_DEFAULT_Q 488 +#define SGEMM_DEFAULT_Q 512 #define DGEMM_DEFAULT_Q 384 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 352 From fee361ae64f2d02552713ade0ee972e6efdb1ed4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Aug 2020 13:27:19 +0200 Subject: [PATCH 106/349] fix another source of NO_CBLAS=0 surprise --- interface/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/Makefile b/interface/Makefile index 44a9fdcf02..2dbd600731 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -367,7 +367,7 @@ CZBLAS3OBJS += cblas_zgemm3m.$(SUFFIX) endif -ifndef NO_CBLAS +ifneq ($(NO_CBLAS), 1) override CFLAGS += -I. 
From 619343278d6d6e8ec3989fb883da333ee087d351 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 11 Aug 2020 13:40:40 +0200 Subject: [PATCH 107/349] Fix mishandling of NO_CBLAS=0 and NO_LAPACKE=0 --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index c1d943facd..7a03b08f01 100644 --- a/Makefile +++ b/Makefile @@ -141,7 +141,7 @@ ifndef NO_FBLAS $(MAKE) -C test all endif $(MAKE) -C utest all -ifndef NO_CBLAS +ifneq ($(NO_CBLAS), 1) $(MAKE) -C ctest all ifeq ($(CPP_THREAD_SAFETY_TEST), 1) $(MAKE) -C cpp_thread_test all @@ -244,7 +244,7 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib endif -ifndef NO_LAPACKE +ifneq ($(NO_LAPACKE), 1) @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib endif endif From efdd237a91646f0ce58815ef6507c04e393813a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 12 Aug 2020 23:08:38 +0200 Subject: [PATCH 108/349] Add a dedicated POWER9 build to the Travis CI (#2774) * Add dedicated POWER9 build (using new syntax to ensure it runs as a P9-only containerized job rather than a VM that might end up on P8 hardware half of the time) * Bump gcc version for POWER9 build --- .travis.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.travis.yml b/.travis.yml index 101147353f..307010e40e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -75,6 +75,23 @@ matrix: - TARGET_BOX=LINUX32 - BTYPE="BINARY=32" + - os: linux + arch: ppc64le + dist: bionic + compiler: gcc + before_script: + - sudo add-apt-repository 'ppa:ubuntu-toolchain-r/test' -y + - sudo apt-get update + - sudo apt-get install gcc-9 gfortran-9 -y + script: + - make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 + - make -C test $COMMON_FLAGS $BTYPE + - make -C ctest $COMMON_FLAGS $BTYPE + - make -C utest $COMMON_FLAGS $BTYPE + env: + # for matrix annotation only + - TARGET_BOX=PPC64LE_LINUX_P9 + - os: linux compiler: gcc 
addons: From e740c4873d5b66851580ca53d9dce427325b8b9b Mon Sep 17 00:00:00 2001 From: "Chen, Guobing" Date: Thu, 13 Aug 2020 06:17:34 +0800 Subject: [PATCH 109/349] Enable COOPERLAKE build target Enable new build target platform -- COOPERLAKE. This target platform supports all the SKYLAKEX supported ISAs + avx512bf16. So all the SKYLAKEX specific kernels/drivers and related code are now extended to be also active on COOPERLAKE. Besides, new BF16 related kernels are active under this target. --- Makefile.system | 8 +- Makefile.x86_64 | 19 ++++ TargetList.txt | 1 + cmake/arch.cmake | 4 +- cmake/cc.cmake | 8 ++ cmake/system.cmake | 5 +- cpuid.h | 15 +-- cpuid_x86.c | 37 +++++++- driver/level3/level3.c | 2 +- driver/level3/level3_thread.c | 2 +- driver/level3/trmm_L.c | 8 +- driver/level3/trmm_R.c | 12 +-- driver/others/parameter.c | 11 ++- getarch.c | 30 ++++++ kernel/CMakeLists.txt | 2 +- kernel/Makefile | 12 ++- kernel/Makefile.L3 | 4 + kernel/setparam-ref.c | 2 +- kernel/x86/trsm_kernel_LN_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LN_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_4x4_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LN_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_2x2_penryn.S | 2 +- kernel/x86_64/KERNEL.COOPERLAKE | 1 + kernel/x86_64/caxpy.c | 2 +- kernel/x86_64/cdot.c | 2 +- kernel/x86_64/cgemv_n_4.c | 2 +- kernel/x86_64/cgemv_t_4.c | 2 +- kernel/x86_64/cscal.c | 2 +- kernel/x86_64/daxpy.c | 2 +- kernel/x86_64/ddot.c | 2 +- kernel/x86_64/dgemv_n_4.c | 2 +- kernel/x86_64/dgemv_t_4.c | 2 +- kernel/x86_64/dscal.c | 2 +- kernel/x86_64/dsymv_L.c | 2 +- kernel/x86_64/dsymv_U.c | 2 +- kernel/x86_64/saxpy.c | 2 +- kernel/x86_64/sdot.c | 2 +- kernel/x86_64/sgemv_n_4.c | 2 +- 
kernel/x86_64/sgemv_t_4.c | 2 +- kernel/x86_64/ssymv_L.c | 2 +- kernel/x86_64/ssymv_U.c | 2 +- kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/zaxpy.c | 2 +- kernel/x86_64/zdot.c | 2 +- kernel/x86_64/zgemv_n_4.c | 2 +- kernel/x86_64/zgemv_t_4.c | 2 +- kernel/x86_64/zscal.c | 2 +- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 2 +- kernel/x86_64/zsymv_U_sse.S | 2 +- kernel/x86_64/zsymv_U_sse2.S | 2 +- param.h | 118 ++++++++++++++++++++++++ 62 files changed, 309 insertions(+), 76 deletions(-) create mode 100644 kernel/x86_64/KERNEL.COOPERLAKE diff --git a/Makefile.system b/Makefile.system index d7e71d00af..2286d14f22 100644 --- a/Makefile.system +++ b/Makefile.system @@ -88,6 +88,9 @@ endif ifeq ($(TARGET), SKYLAKEX) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET), COOPERLAKE) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -130,6 +133,9 @@ endif ifeq ($(TARGET_CORE), SKYLAKEX) GETARCH_FLAGS := -DFORCE_NEHALEM endif +ifeq ($(TARGET_CORE), COOPERLAKE) +GETARCH_FLAGS := -DFORCE_NEHALEM +endif ifeq ($(TARGET_CORE), SANDYBRIDGE) GETARCH_FLAGS := -DFORCE_NEHALEM endif @@ -553,7 +559,7 @@ DYNAMIC_CORE += HASWELL ZEN endif ifneq ($(NO_AVX512), 1) ifneq ($(NO_AVX2), 1) -DYNAMIC_CORE += SKYLAKEX +DYNAMIC_CORE += SKYLAKEX COOPERLAKE endif endif endif diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 2676bd258d..96e9dbe446 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -27,6 +27,25 @@ endif endif endif +ifeq ($(CORE), COOPERLAKE) +ifndef DYNAMIC_ARCH +ifndef NO_AVX512 +CCOMMON_OPT += -march=cooperlake +FCOMMON_OPT += -march=cooperlake +ifeq ($(OSNAME), CYGWIN_NT) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT += -fno-asynchronous-unwind-tables +endif +ifeq ($(OSNAME), WINNT) +ifeq ($(C_COMPILER), GCC) +CCOMMON_OPT += -fno-asynchronous-unwind-tables +FCOMMON_OPT 
+= -fno-asynchronous-unwind-tables +endif +endif +endif +endif +endif + ifeq ($(CORE), HASWELL) ifndef DYNAMIC_ARCH ifndef NO_AVX2 diff --git a/TargetList.txt b/TargetList.txt index 8ea2df9b72..5934f30128 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -22,6 +22,7 @@ SANDYBRIDGE HASWELL SKYLAKEX ATOM +COOPERLAKE b)AMD CPU: ATHLON diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 5388156bc2..c00f8fe71d 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -76,9 +76,9 @@ if (DYNAMIC_ARCH) set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) endif () if (NOT NO_AVX512) - set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) + set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE) string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") - endif () + endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) endif () diff --git a/cmake/cc.cmake b/cmake/cc.cmake index d5551147c8..88cf9f5736 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -103,3 +103,11 @@ if (${CORE} STREQUAL "SKYLAKEX") endif () endif () endif () + +if (${CORE} STREQUAL "COOPERLAKE") + if (NOT DYNAMIC_ARCH) + if (NOT NO_AVX512) + set (CCOMMON_OPT = "${CCOMMON_OPT} -march=cooperlake") + endif () + endif () +endif () diff --git a/cmake/system.cmake b/cmake/system.cmake index d8dcc3cf30..2838e279fb 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -33,7 +33,7 @@ endif () if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) message(STATUS "Compiling a ${BINARY}-bit binary.") set(NO_AVX 1) - if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX") + if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE") set(TARGET "NEHALEM") endif () if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") @@ -45,6 +45,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) endif () if (DEFINED TARGET) + if 
(${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + endif() if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() diff --git a/cpuid.h b/cpuid.h index 697f43133e..824e0bc70f 100644 --- a/cpuid.h +++ b/cpuid.h @@ -118,6 +118,7 @@ #define CORE_ZEN 27 #define CORE_SKYLAKEX 28 #define CORE_DHYANA 29 +#define CORE_COOPERLAKE 30 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -137,11 +138,12 @@ #define HAVE_MISALIGNSSE (1 << 15) #define HAVE_128BITFPU (1 << 16) #define HAVE_FASTMOVU (1 << 17) -#define HAVE_AVX (1 << 18) -#define HAVE_FMA4 (1 << 19) -#define HAVE_FMA3 (1 << 20) -#define HAVE_AVX512VL (1 << 21) -#define HAVE_AVX2 (1 << 22) +#define HAVE_AVX (1 << 18) +#define HAVE_FMA4 (1 << 19) +#define HAVE_FMA3 (1 << 20) +#define HAVE_AVX512VL (1 << 21) +#define HAVE_AVX2 (1 << 22) +#define HAVE_AVX512BF16 (1 << 23) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 @@ -218,7 +220,8 @@ typedef struct { #define CPUTYPE_ZEN 51 #define CPUTYPE_SKYLAKEX 52 #define CPUTYPE_DHYANA 53 +#define CPUTYPE_COOPERLAKE 54 -#define CPUTYPE_HYGON_UNKNOWN 54 +#define CPUTYPE_HYGON_UNKNOWN 99 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index ea846a392a..728d459d1b 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -249,6 +249,22 @@ int support_avx512(){ #endif } +int support_avx512_bf16(){ +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx512()) + return 0; + cpuid_count(7, 1, &eax, &ebx, &ecx, &edx); + if((eax & 32) == 32){ + ret=1; // CPUID.7.1:EAX[bit 5] indicates whether avx512_bf16 supported or not + } + return ret; +#else + return 0; +#endif +} int get_vendor(void){ int eax, ebx, ecx, edx; @@ -335,6 +351,7 @@ int get_cputype(int gettype){ if (support_avx()) feature |= HAVE_AVX; if (support_avx2()) feature |= HAVE_AVX2; if (support_avx512()) feature |= HAVE_AVX512VL; + if 
(support_avx512_bf16()) feature |= HAVE_AVX512BF16; if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; #endif @@ -1337,6 +1354,8 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 5: // Skylake X + if(support_avx512_bf16()) + return CPUTYPE_COOPERLAKE; if(support_avx512()) return CPUTYPE_SKYLAKEX; if(support_avx2()) @@ -1677,7 +1696,8 @@ static char *cpuname[] = { "EXCAVATOR", "ZEN", "SKYLAKEX", - "DHYANA" + "DHYANA", + "COOPERLAKE" }; static char *lowercpuname[] = { @@ -1733,7 +1753,8 @@ static char *lowercpuname[] = { "excavator", "zen", "skylakex", - "dhyana" + "dhyana", + "cooperlake" }; static char *corename[] = { @@ -1766,7 +1787,8 @@ static char *corename[] = { "EXCAVATOR", "ZEN", "SKYLAKEX", - "DHYANA" + "DHYANA", + "COOPERLAKE" }; static char *corename_lower[] = { @@ -1799,7 +1821,8 @@ static char *corename_lower[] = { "excavator", "zen", "skylakex", - "dhyana" + "dhyana", + "cooperlake" }; @@ -2007,7 +2030,9 @@ int get_coretype(void){ case 5: // Skylake X #ifndef NO_AVX512 - return CORE_SKYLAKEX; + if(support_avx512_bf16()) + return CORE_COOPERLAKE; + return CORE_SKYLAKEX; #else if(support_avx()) #ifndef NO_AVX2 @@ -2276,6 +2301,7 @@ void get_cpuconfig(void){ if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); + if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); @@ -2346,6 +2372,7 @@ void get_sse(void){ if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); + if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) 
printf("HAVE_3DNOW=1\n"); if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); diff --git a/driver/level3/level3.c b/driver/level3/level3.c index c6bbb9ca94..a38506585f 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 5a8d497d24..6e1fd9e99a 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, /* Split local region of B into parts */ for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ min_jj = MIN(n_to, js + div_n) - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c index 9117090b5a..1027c0c737 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -135,7 +135,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -205,7 +205,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = 
min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -300,7 +300,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -370,7 +370,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index 62c6a24427..e8df7fb210 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < ls - js; jjs += min_jj){ min_jj = ls - js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 
6*GEMM_UNROLL_N; #else @@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ min_jj = js - ls - min_l - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else @@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; -#ifdef SKYLAKEX +#if defined(SKYLAKEX) || defined(COOPERLAKE) /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else diff --git a/driver/others/parameter.c b/driver/others/parameter.c index b1f3befae3..5d312fa87d 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -180,9 +180,10 @@ int get_L2_size(void){ int eax, ebx, ecx, edx; #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ - 
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ - defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ - defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) + defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ + defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ + defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \ + defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -266,7 +267,9 @@ int get_L2_size(void){ void blas_set_parameter(void){ int factor; -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \ + defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \ + defined(SKYLAKEX) || defined(COOPERLAKE) int size = 16; #else int size = get_L2_size(); diff --git a/getarch.c b/getarch.c index 51c9a84e51..83043bdf22 100644 --- a/getarch.c +++ b/getarch.c @@ -365,6 +365,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif +#ifdef FORCE_COOPERLAKE +#ifdef NO_AVX512 +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "HASWELL" +#define ARCHCONFIG "-DHASWELL " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DFMA3" +#define LIBNAME "haswell" +#define CORENAME "HASWELL" +#else +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "COOPERLAKE" +#define ARCHCONFIG "-DCOOPERLAKE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake" +#define LIBNAME "cooperlake" +#define CORENAME "COOPERLAKE" +#endif +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index d1349c5f83..2f448e8f89 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -127,7 +127,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.L3 set(USE_TRMM false) - if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) ) + if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE)) set(USE_TRMM true) endif () if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) diff --git a/kernel/Makefile b/kernel/Makefile index db3282c050..0c883cd964 
100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -37,7 +37,17 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE -ifeq ($(TARGET_CORE), SKYLAKEX) +ifeq ($(TARGET_CORE), COOPERLAKE) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=cooperlake + ifeq ($(OSNAME), CYGWIN_NT) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + ifeq ($(OSNAME), WINNT) + ifeq ($(C_COMPILER), GCC) + override CFLAGS += -fno-asynchronous-unwind-tables + endif + endif +else ifeq ($(TARGET_CORE), SKYLAKEX) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 ifeq ($(OSNAME), CYGWIN_NT) override CFLAGS += -fno-asynchronous-unwind-tables diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 8df306d5f9..bee8b216a0 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -39,6 +39,10 @@ ifeq ($(CORE), SKYLAKEX) USE_TRMM = 1 endif +ifeq ($(CORE), COOPERLAKE) +USE_TRMM = 1 +endif + ifeq ($(CORE), ZEN) USE_TRMM = 1 endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index d3aa030c15..a0bdc7e142 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1166,7 +1166,7 @@ static void init_parameter(void) { #endif #endif -#ifdef SKYLAKEX +#if defined (SKYLAKEX) || defined (COOPERLAKE) #ifdef DEBUG fprintf(stderr, "SkylakeX\n"); diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index 34653d400a..fde9eba8e0 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index 492f343447..fddf7560f9 100644 --- 
a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 6840c54adf..33afd2a613 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index e2f731fca8..b05bd6ee52 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index 11825429ef..f960559a67 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define 
PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index 4c054f3992..cf842c9b54 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index e674967365..63c44c27a2 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index 498057697b..4cb01e50a0 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index f3072983d0..09d5d8e43d 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || 
defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 879ae9c383..7d129e54c1 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index 6c308197b7..d335993174 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE new file mode 100644 index 0000000000..0b2f3c0ed4 --- /dev/null +++ b/kernel/x86_64/KERNEL.COOPERLAKE @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.SKYLAKEX diff --git a/kernel/x86_64/caxpy.c b/kernel/x86_64/caxpy.c index 586d05ac2d..c19b98f02b 100644 --- a/kernel/x86_64/caxpy.c +++ b/kernel/x86_64/caxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "caxpy_microk_steamroller-2.c" #elif defined(BULLDOZER) #include "caxpy_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) #include "caxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "caxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/cdot.c b/kernel/x86_64/cdot.c index f71d7b6b4a..f2bf19dcd1 100644 --- a/kernel/x86_64/cdot.c +++ b/kernel/x86_64/cdot.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "cdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "cdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "cdot_microk_sandy-2.c" diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c index d81766cd40..0ed02b8d8f 100644 --- a/kernel/x86_64/cgemv_n_4.c +++ b/kernel/x86_64/cgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "cgemv_n_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_n_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c index f44fe72477..c2903b11fb 100644 --- a/kernel/x86_64/cgemv_t_4.c +++ b/kernel/x86_64/cgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "cgemv_t_microk_haswell-4.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "cgemv_t_microk_bulldozer-4.c" diff --git a/kernel/x86_64/cscal.c b/kernel/x86_64/cscal.c index 72af998092..6d75358a69 100644 --- a/kernel/x86_64/cscal.c +++ b/kernel/x86_64/cscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "cscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "cscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index cde5bdaa68..d84c0c2218 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "daxpy_microk_piledriver-2.c" #elif defined(HASWELL) || defined(ZEN) #include "daxpy_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "daxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "daxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index 9693576142..e4b6622e69 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "ddot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "ddot_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "ddot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "ddot_microk_sandy-2.c" diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 6d33641e91..da68db0cd1 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dgemv_n_microk_nehalem-4.c" #elif defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dgemv_n_microk_haswell-4.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "dgemv_n_microk_skylakex-4.c" #endif diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index ed672a7579..a3bf28dc8d 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "dgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index e2436f7890..d1270d20bc 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "dscal_microk_sandy-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dscal_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "dscal_microk_skylakex-2.c" #endif diff --git a/kernel/x86_64/dsymv_L.c b/kernel/x86_64/dsymv_L.c index a722cc9df0..573377ee08 100644 --- a/kernel/x86_64/dsymv_L.c +++ b/kernel/x86_64/dsymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dsymv_L_microk_bulldozer-2.c" #elif defined(HASWELL) || defined(ZEN) #include "dsymv_L_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "dsymv_L_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/dsymv_U.c b/kernel/x86_64/dsymv_U.c index 431e4bb3fc..530ac8b1d5 100644 --- a/kernel/x86_64/dsymv_U.c +++ b/kernel/x86_64/dsymv_U.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "dsymv_U_microk_bulldozer-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "dsymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "dsymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/saxpy.c b/kernel/x86_64/saxpy.c index e1349da58c..7b2845636f 100644 --- a/kernel/x86_64/saxpy.c +++ b/kernel/x86_64/saxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "saxpy_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "saxpy_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "saxpy_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "saxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index 3536afc9ed..e816c67e9f 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sdot_microk_nehalem-2.c" #elif defined(HASWELL) || defined(ZEN) #include "sdot_microk_haswell-2.c" -#elif defined (SKYLAKEX) +#elif defined (SKYLAKEX) || defined (COOPERLAKE) #include "sdot_microk_skylakex-2.c" #elif defined(SANDYBRIDGE) #include "sdot_microk_sandy-2.c" diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 63697970fe..3eec21774d 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemv_n_microk_nehalem-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_n_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "sgemv_n_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 86ecaf516e..fe886f57f7 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -34,7 +34,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "sgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 73ae001ea8..c9d698eb76 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "ssymv_L_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_L_microk_sandy-2.c" diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index f37c251a18..4d8aac1ab1 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "ssymv_U_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "ssymv_U_microk_sandy-2.c" diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 8a5c44c9ba..fea4fc7462 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index 0c40a3435e..b853ef3659 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 7a2eeace59..bad367e91e 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff 
--git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index 0408b577c7..147201751d 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index 53866cf954..25e9f6d422 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zaxpy_microk_bulldozer-2.c" #elif defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zaxpy_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zaxpy_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zaxpy_microk_sandy-2.c" diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 423a6f23e5..90fd86daf5 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "zdot_microk_bulldozer-2.c" #elif defined(STEAMROLLER) || defined(PILEDRIVER) || defined(EXCAVATOR) #include "zdot_microk_steamroller-2.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zdot_microk_haswell-2.c" #elif defined(SANDYBRIDGE) #include "zdot_microk_sandy-2.c" diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c index 0fedc496b9..1f9d41859d 100644 --- a/kernel/x86_64/zgemv_n_4.c +++ b/kernel/x86_64/zgemv_n_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zgemv_n_microk_haswell-4.c" #elif defined(SANDYBRIDGE) #include "zgemv_n_microk_sandy-4.c" diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 6221471f73..34f28b224a 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -31,7 +31,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) || defined(EXCAVATOR) #include "zgemv_t_microk_bulldozer-4.c" -#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#elif defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zgemv_t_microk_haswell-4.c" #endif diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c index 2a6d0e4c79..09a702a815 100644 --- a/kernel/x86_64/zscal.c +++ b/kernel/x86_64/zscal.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #include "zscal_microk_haswell-2.c" #elif defined(BULLDOZER) || defined(PILEDRIVER) #include "zscal_microk_bulldozer-2.c" diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index e44bd75506..83ed41ba14 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index e9f330c365..7ed2faf0fd 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 9f0dead180..5945f3f811 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S 
b/kernel/x86_64/zsymv_U_sse2.S index b6106a37d7..484d74f149 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) +#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/param.h b/param.h index 3e539a2b8a..1ab982dc5e 100644 --- a/param.h +++ b/param.h @@ -1748,6 +1748,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef COOPERLAKE + +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#define GEMM_PREFERED_SIZE 8 +#else +#define SWITCH_RATIO 16 +#define GEMM_PREFERED_SIZE 16 +#endif +#define USE_SGEMM_KERNEL_DIRECT 1 + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#else + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 16 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#define SGEMM_DEFAULT_UNROLL_MN 32 
+#define DGEMM_DEFAULT_UNROLL_MN 32 +#endif + +#ifdef ARCH_X86 + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define CGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_R 1024 +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#else + +#define SGEMM_DEFAULT_P 640 +#define DGEMM_DEFAULT_P 192 +#define CGEMM_DEFAULT_P 384 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 320 +#define DGEMM_DEFAULT_Q 384 +#define CGEMM_DEFAULT_Q 192 +#define ZGEMM_DEFAULT_Q 128 + +#define SGEMM_DEFAULT_R sgemm_r +#define DGEMM_DEFAULT_R 8640 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r + +#define QGEMM_DEFAULT_Q 128 +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r +#define XGEMM_DEFAULT_Q 128 + +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 + +#define CGEMM3M_DEFAULT_P 320 +#define ZGEMM3M_DEFAULT_P 256 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 320 +#define ZGEMM3M_DEFAULT_Q 256 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#endif +#endif #ifdef ATOM From c62aad62e551cc238cee2e4f78169c62df88bc63 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 Aug 2020 00:35:45 +0200 Subject: [PATCH 110/349] Fix incorrect calls to DLASET Reference-LAPACK issue 429 --- lapack-netlib/TESTING/EIG/cchkhb2stg.f | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cchkhb2stg.f 
b/lapack-netlib/TESTING/EIG/cchkhb2stg.f index 61537f44bc..cd884febfe 100644 --- a/lapack-netlib/TESTING/EIG/cchkhb2stg.f +++ b/lapack-netlib/TESTING/EIG/cchkhb2stg.f @@ -680,8 +680,8 @@ SUBROUTINE CCHKHB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, * the one from above. Compare it with D1 computed * using the DSBTRD. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) CALL CLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH @@ -753,8 +753,8 @@ SUBROUTINE CCHKHB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, * the one from above. Compare it with D1 computed * using the DSBTRD. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) CALL CLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH From d64f1ef26bc7c7f3ee6b54aaa2d394cf7842456d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 Aug 2020 00:40:24 +0200 Subject: [PATCH 111/349] Fix incorrect argument to SLASET Reference-LAPACK issue 425 (and 318) --- lapack-netlib/TESTING/EIG/schksb2stg.f | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/schksb2stg.f b/lapack-netlib/TESTING/EIG/schksb2stg.f index 07b6fa95cb..7308bb690a 100644 --- a/lapack-netlib/TESTING/EIG/schksb2stg.f +++ b/lapack-netlib/TESTING/EIG/schksb2stg.f @@ -670,8 +670,8 @@ SUBROUTINE SCHKSB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, * the one from above. Compare it with D1 computed * using the SSBTRD. 
* - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL SLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH From 597010a9688c9f5688dc459ba92ef8a28ea20769 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 14 Aug 2020 00:41:56 +0200 Subject: [PATCH 112/349] Fix incorrect argument to SLASET Reference-LAPACK issue 425 (and 318) --- lapack-netlib/TESTING/EIG/schkst2stg.f | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/schkst2stg.f b/lapack-netlib/TESTING/EIG/schkst2stg.f index f386ab43c1..83edb9dcee 100644 --- a/lapack-netlib/TESTING/EIG/schkst2stg.f +++ b/lapack-netlib/TESTING/EIG/schkst2stg.f @@ -999,8 +999,8 @@ SUBROUTINE SCHKST2STG( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, * the one from above. Compare it with D1 computed * using the 1-stage. * - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL SLACPY( "U", N, N, A, LDA, V, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH From f5fcc5baec1c5aea7dbd7a2a8fdd41ae8b422a6e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Aug 2020 13:30:29 +0200 Subject: [PATCH 113/349] Add trivial gemm test for multithread consistency --- cpp_thread_test/gemm64.cpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 cpp_thread_test/gemm64.cpp diff --git a/cpp_thread_test/gemm64.cpp b/cpp_thread_test/gemm64.cpp new file mode 100644 index 0000000000..2c3442a2e9 --- /dev/null +++ b/cpp_thread_test/gemm64.cpp @@ -0,0 +1,20 @@ +#include +#include +int main ( int argc, char* argv[] ) { + const long n = ((long)1 << 31) - 1; + std::cout << n < Date: Sat, 15 Aug 2020 13:31:28 +0200 Subject: [PATCH 114/349] Update gemm64.cpp --- cpp_thread_test/gemm64.cpp 
| 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp_thread_test/gemm64.cpp b/cpp_thread_test/gemm64.cpp index 2c3442a2e9..df38416fab 100644 --- a/cpp_thread_test/gemm64.cpp +++ b/cpp_thread_test/gemm64.cpp @@ -1,5 +1,6 @@ #include -#include +#include "common.h" +#include "cblas.h" int main ( int argc, char* argv[] ) { const long n = ((long)1 << 31) - 1; std::cout << n < Date: Sat, 15 Aug 2020 13:33:52 +0200 Subject: [PATCH 115/349] Add simple sgemm preicsion test --- cpp_thread_test/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile index 81e3470ef7..0dc7229d7e 100644 --- a/cpp_thread_test/Makefile +++ b/cpp_thread_test/Makefile @@ -10,5 +10,9 @@ dgemm_tester : dgemv_tester $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester ./dgemm_tester +gemm64 : gemm64 + $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 gemm64.cpp ../libopenblas.a -lpthread -o gemm64 + ./gemm64 + clean :: - rm -f dgemv_tester dgemm_tester + rm -f dgemv_tester dgemm_tester gemm64 From 37ac23e8a36049d875d01887b292ec11751fccc8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Aug 2020 13:38:05 +0200 Subject: [PATCH 116/349] Add simple MT sgemm precision test and INTERFACE64 build --- .drone.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/.drone.yml b/.drone.yml index b1c211d147..fb009d46e0 100644 --- a/.drone.yml +++ b/.drone.yml @@ -190,3 +190,29 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS - make -C cpp_thread_test dgemm_tester + - make -C cpp_thread_test gemm64 +--- +kind: pipeline +name: epyc_native_test_int64 + +platform: + os: linux + arch: amd64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: gcc + COMMON_FLAGS: 'USE_OPENMP=1 INTERFACE64=1' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get 
update -y + - apt-get install -y make $CC gfortran perl python g++ + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make -C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + - make -C cpp_thread_test dgemm_tester + - make -C cpp_thread_test gemm64 From d57d503c150bb40e1478b88735818c1b76d64ed2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Aug 2020 14:46:26 +0200 Subject: [PATCH 117/349] Update Makefile --- cpp_thread_test/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile index 0dc7229d7e..0d78990ebf 100644 --- a/cpp_thread_test/Makefile +++ b/cpp_thread_test/Makefile @@ -11,7 +11,7 @@ dgemm_tester : dgemv_tester ./dgemm_tester gemm64 : gemm64 - $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 gemm64.cpp ../libopenblas.a -lpthread -o gemm64 + $(CXX) $(COMMON_OPT) -I.. -Wall -Wextra -Wshadow -fopenmp -std=c++11 gemm64.cpp ../libopenblas.a -lpthread -o gemm64 ./gemm64 clean :: From 82f8a0aebabab6e81386b75b6f172abb692dd31c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 15 Aug 2020 15:46:18 +0200 Subject: [PATCH 118/349] Update .drone.yml --- .drone.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/.drone.yml b/.drone.yml index fb009d46e0..e8353eb5c8 100644 --- a/.drone.yml +++ b/.drone.yml @@ -166,6 +166,32 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS - make -C cpp_thread_test dgemm_tester + - make -C cpp_thread_test gemm64 +--- +kind: pipeline +name: arm64_native_test_int64 + +platform: + os: linux + arch: arm64 + +steps: +- name: Build and Test + image: ubuntu:18.04 + environment: + CC: gcc + COMMON_FLAGS: 'USE_OPENMP=1 INTERFACE64=1' + commands: + - echo "MAKE_FLAGS:= $COMMON_FLAGS" + - apt-get update -y + - apt-get install -y make $CC gfortran perl python g++ + - $CC --version + - make QUIET_MAKE=1 $COMMON_FLAGS + - make -C test $COMMON_FLAGS + - make 
-C ctest $COMMON_FLAGS + - make -C utest $COMMON_FLAGS + - make -C cpp_thread_test dgemm_tester + - make -C cpp_thread_test gemm64 --- kind: pipeline name: epyc_native_test From 5ec8f716cf181b70352fa15a7beb45fc886312de Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Aug 2020 15:19:40 +0200 Subject: [PATCH 119/349] revert --- .drone.yml | 52 ---------------------------------------------------- 1 file changed, 52 deletions(-) diff --git a/.drone.yml b/.drone.yml index e8353eb5c8..b1c211d147 100644 --- a/.drone.yml +++ b/.drone.yml @@ -166,32 +166,6 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS - make -C cpp_thread_test dgemm_tester - - make -C cpp_thread_test gemm64 ---- -kind: pipeline -name: arm64_native_test_int64 - -platform: - os: linux - arch: arm64 - -steps: -- name: Build and Test - image: ubuntu:18.04 - environment: - CC: gcc - COMMON_FLAGS: 'USE_OPENMP=1 INTERFACE64=1' - commands: - - echo "MAKE_FLAGS:= $COMMON_FLAGS" - - apt-get update -y - - apt-get install -y make $CC gfortran perl python g++ - - $CC --version - - make QUIET_MAKE=1 $COMMON_FLAGS - - make -C test $COMMON_FLAGS - - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS - - make -C cpp_thread_test dgemm_tester - - make -C cpp_thread_test gemm64 --- kind: pipeline name: epyc_native_test @@ -216,29 +190,3 @@ steps: - make -C ctest $COMMON_FLAGS - make -C utest $COMMON_FLAGS - make -C cpp_thread_test dgemm_tester - - make -C cpp_thread_test gemm64 ---- -kind: pipeline -name: epyc_native_test_int64 - -platform: - os: linux - arch: amd64 - -steps: -- name: Build and Test - image: ubuntu:18.04 - environment: - CC: gcc - COMMON_FLAGS: 'USE_OPENMP=1 INTERFACE64=1' - commands: - - echo "MAKE_FLAGS:= $COMMON_FLAGS" - - apt-get update -y - - apt-get install -y make $CC gfortran perl python g++ - - $CC --version - - make QUIET_MAKE=1 $COMMON_FLAGS - - make -C test $COMMON_FLAGS - - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS - - make -C 
cpp_thread_test dgemm_tester - - make -C cpp_thread_test gemm64 From a8c6fb9e1ce4d6cb3d4e8a782f9c4c69469aae91 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Aug 2020 15:20:16 +0200 Subject: [PATCH 120/349] revert --- cpp_thread_test/Makefile | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile index 0d78990ebf..81e3470ef7 100644 --- a/cpp_thread_test/Makefile +++ b/cpp_thread_test/Makefile @@ -10,9 +10,5 @@ dgemm_tester : dgemv_tester $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester ./dgemm_tester -gemm64 : gemm64 - $(CXX) $(COMMON_OPT) -I.. -Wall -Wextra -Wshadow -fopenmp -std=c++11 gemm64.cpp ../libopenblas.a -lpthread -o gemm64 - ./gemm64 - clean :: - rm -f dgemv_tester dgemm_tester gemm64 + rm -f dgemv_tester dgemm_tester From 6bfc66663c4b3bbd2c5f7ac05a150d2c4bd94af4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 17 Aug 2020 15:20:41 +0200 Subject: [PATCH 121/349] revert --- cpp_thread_test/gemm64.cpp | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 cpp_thread_test/gemm64.cpp diff --git a/cpp_thread_test/gemm64.cpp b/cpp_thread_test/gemm64.cpp deleted file mode 100644 index df38416fab..0000000000 --- a/cpp_thread_test/gemm64.cpp +++ /dev/null @@ -1,21 +0,0 @@ -#include -#include "common.h" -#include "cblas.h" -int main ( int argc, char* argv[] ) { - const long n = ((long)1 << 31) - 1; - std::cout << n < Date: Mon, 17 Aug 2020 15:32:14 +0200 Subject: [PATCH 122/349] Add typedef for bfloat16 if needed --- openblas_config_template.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/openblas_config_template.h b/openblas_config_template.h index 49aea1cab5..9955e5c73d 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -34,6 +34,10 @@ typedef long BLASLONG; typedef unsigned long BLASULONG; #endif +#ifndef BFLOAT16 +typedef unsigned short 
bfloat16; +#endif + #ifdef OPENBLAS_USE64BITINT typedef BLASLONG blasint; #else From 6b731d917f9049ba426a82dccf9b7bdbcfd1bab3 Mon Sep 17 00:00:00 2001 From: Albert Ziegenhagel Date: Tue, 18 Aug 2020 08:48:48 +0200 Subject: [PATCH 123/349] Do not require pkg-config to generate the *.pc file Generating the pkg-config file does not actually depend on pkg-config being available. --- CMakeLists.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c324e22419..4b82d76704 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -389,11 +389,9 @@ if(NOT NO_LAPACKE) install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) endif() -include(FindPkgConfig QUIET) -if(PKG_CONFIG_FOUND) - configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY) - install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) -endif() +# Install pkg-config files +configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY) +install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". 
From 75eeb265d7c5715f05b63e8706593ef6d8485627 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 14:51:09 +0200 Subject: [PATCH 124/349] [WIP] Refactor the driver code for direct SGEMM (#2782) Move "direct SGEMM" functionality out of the SkylakeX SGEMM kernel and make it available (on x86_64 targets only for now) in DYNAMIC_ARCH builds * Add sgemm_direct targets in the kernel Makefile.L3 and CMakeLists.txt * Add direct_sgemm functions to the gotoblas struct in common_param.h * Move sgemm_direct_performant helper to separate file * Update gemm.c to macros for sgemm_direct to support dynamic_arch naming via common_s,h * (Conditionally) add sgemm_direct functions in setparam-ref.c --- common_level3.h | 4 +-- common_param.h | 5 ++++ common_s.h | 12 ++++++++ interface/gemm.c | 4 +-- kernel/CMakeLists.txt | 14 +++++++++ kernel/Makefile.L3 | 24 ++++++++++++++++ kernel/setparam-ref.c | 5 ++++ kernel/x86_64/sgemm_direct_performant.c | 30 ++++++++++++++++++++ kernel/x86_64/sgemm_direct_skylakex.c | 17 +++++++---- kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c | 2 +- 10 files changed, 107 insertions(+), 10 deletions(-) create mode 100644 kernel/x86_64/sgemm_direct_performant.c diff --git a/common_level3.h b/common_level3.h index 4e44a5e735..671a7a0866 100644 --- a/common_level3.h +++ b/common_level3.h @@ -47,12 +47,12 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *); extern "C" { #endif -extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K, +void sgemm_direct(BLASLONG M, BLASLONG N, BLASLONG K, float * A, BLASLONG strideA, float * B, BLASLONG strideB, float * R, BLASLONG strideR); -extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); +int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, diff --git a/common_param.h b/common_param.h index c92609a761..0437482dc6 100644 --- a/common_param.h +++ b/common_param.h @@ -175,6 +175,11 
@@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#ifdef ARCH_X86_64 + void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); + int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); +#endif + int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); diff --git a/common_s.h b/common_s.h index 23c432f7c5..34903ec491 100644 --- a/common_s.h +++ b/common_s.h @@ -45,6 +45,10 @@ #define SSYMV_THREAD_U ssymv_thread_U #define SSYMV_THREAD_L ssymv_thread_L + +#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant +#define SGEMM_DIRECT sgemm_direct + #define SGEMM_ONCOPY sgemm_oncopy #define SGEMM_OTCOPY sgemm_otcopy @@ -204,6 +208,14 @@ #define SSYMV_THREAD_U ssymv_thread_U #define SSYMV_THREAD_L ssymv_thread_L +#ifdef ARCH_X86_64 +#define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant +#define SGEMM_DIRECT gotoblas -> sgemm_direct +#else +#define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant +#define SGEMM_DIRECT sgemm_direct +#endif + #define SGEMM_ONCOPY gotoblas -> sgemm_oncopy #define SGEMM_OTCOPY gotoblas -> sgemm_otcopy #define SGEMM_INCOPY gotoblas -> sgemm_incopy diff --git a/interface/gemm.c b/interface/gemm.c index 99388e7d9b..860e588fe2 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -275,8 +275,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #ifdef DYNAMIC_ARCH if (support_avx512() ) #endif - if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) { - 
sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc); + if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { + SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); return; } diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index d1349c5f83..d9fba6aca6 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -134,6 +134,20 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) set(USE_TRMM true) endif () + set(USE_DIRECT_SGEMM false) + if (X86_64) + set(USE_DIRECT_SGEMM true) + endif() + + if (USE_DIRECT_SGEMM) + # if (NOT DEFINED SGEMMDIRECTKERNEL) + set (SGEMMDIRECTKERNEL sgemm_direct_skylakex.c) + set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c) + # endif() + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) + endif() + foreach (float_type SINGLE DOUBLE HALF) string(SUBSTRING ${float_type} 0 1 float_char) if (${float_type} STREQUAL "HALF") diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 8df306d5f9..a176b47fe0 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -9,6 +9,10 @@ ifeq ($(ARCH), x86_64) USE_GEMM3M = 1 endif +ifeq ($(ARCH), x86_64) +USE_DIRECT_SGEMM = 1 +endif + ifeq ($(ARCH), ia64) USE_GEMM3M = 1 endif @@ -65,6 +69,13 @@ ifeq ($(CORE), Z14) USE_TRMM = 1 endif +ifdef USE_DIRECT_SGEMM +ifndef SGEMMDIRECTKERNEL +SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c +SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c +endif +endif + ifeq ($(BUILD_HALF), 1) ifndef SHGEMMKERNEL SHGEMM_BETA = ../generic/gemm_beta.c @@ -90,6 +101,12 @@ SKERNELOBJS += \ $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) +ifdef USE_DIRECT_SGEMM +SKERNELOBJS += \ + sgemm_direct$(TSUFFIX).$(SUFFIX) \ + sgemm_direct_performant$(TSUFFIX).$(SUFFIX) +endif + DKERNELOBJS 
+= \ dgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ @@ -668,6 +685,13 @@ else $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif +ifdef USE_DIRECT_SGEMM +$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + ifeq ($(BUILD_HALF), 1) $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index d3aa030c15..d3845003a4 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -135,6 +135,11 @@ gotoblas_t TABLE_NAME = { sgemv_nTS, sgemv_tTS, sger_kTS, ssymv_LTS, ssymv_UTS, +#ifdef ARCH_X86_64 + sgemm_directTS, + sgemm_direct_performantTS, +#endif + sgemm_kernelTS, sgemm_betaTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N sgemm_incopyTS, sgemm_itcopyTS, diff --git a/kernel/x86_64/sgemm_direct_performant.c b/kernel/x86_64/sgemm_direct_performant.c new file mode 100644 index 0000000000..5a20ce3955 --- /dev/null +++ b/kernel/x86_64/sgemm_direct_performant.c @@ -0,0 +1,30 @@ +#include "common.h" +/* helper for the direct sgemm code written by Arjan van der Ven */ + + + + +int CNAME(BLASLONG M, BLASLONG N, BLASLONG K) +{ + unsigned long long mnk = M * N * K; + /* large matrixes -> not performant */ + if (mnk >= 28 * 512 * 512) + return 0; + + /* + * if the B matrix is not a nice multiple if 4 we get many unaligned accesses, + * and the regular sgemm copy/realignment of data pays off much quicker + */ + if ((N & 3) != 0 && (mnk >= 8 * 512 * 512)) + return 0; + +#ifdef SMP + /* if we can run multithreaded, the threading changes the based threshold */ + if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1) + return 0; +#endif + + return 1; +} + + diff --git a/kernel/x86_64/sgemm_direct_skylakex.c 
b/kernel/x86_64/sgemm_direct_skylakex.c index 0e8f1318f7..a7cddbb3d8 100644 --- a/kernel/x86_64/sgemm_direct_skylakex.c +++ b/kernel/x86_64/sgemm_direct_skylakex.c @@ -1,7 +1,7 @@ - +#if defined(SKYLAKEX) || defined (COOPERLAKE) /* the direct sgemm code written by Arjan van der Ven */ -//#include <immintrin.h> - +#include <immintrin.h> +#include "common.h" /* * "Direct sgemm" code. This code operates directly on the inputs and outputs * of the sgemm call, avoiding the copies, memory realignments and threading, @@ -38,6 +38,7 @@ #define MATMUL_SCALAR(N,M) result##N##M += Aval##M * Bval##N; #define STORE_SCALAR(N,M) R[(i+M) * strideR + j + N] = result##N##M; +#if 0 int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) { unsigned long long mnk = M * N * K; @@ -61,9 +62,10 @@ int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) return 1; } +#endif - -void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) +//void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) +void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) { int i, j, k; @@ -465,3 +467,8 @@ void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict } } } +#else +#include "common.h" +void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) +{} +#endif diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c index 3b1af33c16..f3d6142429 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c @@ -512,4 +512,4 @@
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f return 0; } #include <immintrin.h> -#include "sgemm_direct_skylakex.c" +//#include "sgemm_direct_skylakex.c" From bb9cf766f5cfd5112adebfeb30f916350854b05d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 15:06:30 +0200 Subject: [PATCH 125/349] make march=cooperlake option conditional on gcc >= 10.1 --- Makefile.x86_64 | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 96e9dbe446..00975b25af 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -30,8 +30,15 @@ endif ifeq ($(CORE), COOPERLAKE) ifndef DYNAMIC_ARCH ifndef NO_AVX512 +ifeq ($(C_COMPILER), GCC) +# cooperlake support was added in 10.1 +GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) +GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1) +ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) CCOMMON_OPT += -march=cooperlake FCOMMON_OPT += -march=cooperlake +endif +endif ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables FCOMMON_OPT += -fno-asynchronous-unwind-tables From 81fbe8d08858ae0f1dd4de1bc5dfad864d8358f5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 16:10:15 +0200 Subject: [PATCH 126/349] -march=cooperlake only available in gcc >= 10 --- kernel/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 0c883cd964..d5078c5ba7 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,6 +8,7 @@ include $(TOPDIR)/Makefile.system ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) endif ifeq ($(ARCH), power) @@ -38,7 +39,12 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), COOPERLAKE) - override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=cooperlake + override CFLAGS +=
-DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) + ifeq ($(GCCVERSIONGTEQ10, 1) + override CFLAGS += -march=cooperlake + else + override CFLAGS += -march=skylake-avx512 + endif ifeq ($(OSNAME), CYGWIN_NT) override CFLAGS += -fno-asynchronous-unwind-tables endif From 6f4dc7445d220ffd38e0ceaa17f983e359713760 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 16:36:55 +0200 Subject: [PATCH 127/349] Fix typo --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index d5078c5ba7..16211218f4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -40,7 +40,7 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) - ifeq ($(GCCVERSIONGTEQ10, 1) + ifeq ($(GCCVERSIONGTEQ10), 1) override CFLAGS += -march=cooperlake else override CFLAGS += -march=skylake-avx512 From 430f741b302d98e0dab3eab2675cb7b4f7d096ed Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 17:17:53 +0200 Subject: [PATCH 128/349] -march=cooperlake requires gcc10 --- cmake/cc.cmake | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 88cf9f5736..d7608220c6 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -107,7 +107,10 @@ endif () if (${CORE} STREQUAL "COOPERLAKE") if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) - set (CCOMMON_OPT = "${CCOMMON_OPT} -march=cooperlake") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) + set (CCOMMON_OPT = "${CCOMMON_OPT} -march=cooperlake") + endif() endif () endif () endif () From 6a3c07478682770ad05a3046ac0523bdde7050b1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 17:22:12 +0200 Subject: [PATCH 129/349] -march=cooperlake requires gcc10 --- cmake/system.cmake | 9 ++++++++- 1 file changed, 8 insertions(+), 1 
deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 2838e279fb..b4ffc18037 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -46,7 +46,14 @@ endif () if (DEFINED TARGET) if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL10.1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + else() + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + endif() endif() if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") From 71d33c952da2ad57dbec3e4e48556db0f4f17610 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 17:44:23 +0200 Subject: [PATCH 130/349] Typo fix --- cmake/system.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index b4ffc18037..1b43685896 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -48,7 +48,7 @@ if (DEFINED TARGET) if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL10.1) + if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") else() set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") From 7c1986640b3be7ffd97f908cb6171f9b2b515c36 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 20:48:39 +0200 Subject: [PATCH 131/349] fallback from cooperlake to skylake if gcc<10 --- cmake/cc.cmake | 2 ++ 1 file changed, 2 
insertions(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index d7608220c6..c490dd9abf 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -110,6 +110,8 @@ if (${CORE} STREQUAL "COOPERLAKE") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) set (CCOMMON_OPT = "${CCOMMON_OPT} -march=cooperlake") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") endif() endif () endif () From b8ebfc933562cf2c55e6147a791d29aff0d4ef6d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 22:30:19 +0200 Subject: [PATCH 132/349] Update system.cmake --- cmake/system.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 1b43685896..827ff5adb5 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -46,13 +46,15 @@ endif () if (DEFINED TARGET) if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512) - if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") +# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") else() set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") endif() +# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") +# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) From bd3207b4b437bf6927043b3bcdd135ac29f2a6a7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 19 Aug 2020 22:51:10 +0200 Subject: [PATCH 133/349] Update system.cmake --- cmake/system.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 827ff5adb5..e3617c4e28 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -55,7 +55,7 @@ 
if (DEFINED TARGET) endif() # elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") # set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") - endif() +# endif() endif() if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") From 7c0977c267b19179a847b8fbe74b5ecfdadbaa48 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Aug 2020 13:53:44 +0200 Subject: [PATCH 134/349] Add OpenMP dependency to pkgconfig file if needed --- cmake/openblas.pc.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in index df4b2ab069..0bd49f9968 100644 --- a/cmake/openblas.pc.in +++ b/cmake/openblas.pc.in @@ -7,5 +7,5 @@ Name: OpenBLAS Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version Version: @OPENBLAS_VERSION@ URL: https://github.com/xianyi/OpenBLAS -Libs: -L${libdir} -lopenblas${libsuffix} +Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} Cflags: -I${includedir} From 1840bc5b523ff5dc17eebdbff3c0784a4ae1f03f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 22 Aug 2020 13:55:18 +0200 Subject: [PATCH 135/349] Add OpenMP dependency to pkgconfig file if needed --- Makefile.install | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Makefile.install b/Makefile.install index 01c0b1226e..7c1a3ca43a 100644 --- a/Makefile.install +++ b/Makefile.install @@ -13,6 +13,14 @@ OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig +PKG_EXTRALIB := $(EXTRALIB) +ifeq ($(USE_OPENMP), 1) + ifeq ($(C_COMPILER), PGI) + PKG_EXTRALIB += -lomp + else + PKG_EXTRALIB += -lgomp + endif +endif .PHONY : install .NOTPARALLEL : install @@ -147,7 +155,7 @@ endif @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> 
"$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" From b2053239fc36f9ca8c29286d8fc553d0200907b0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 23 Aug 2020 15:08:16 +0200 Subject: [PATCH 136/349] Fix mssing dummy parameter (imag part of alpha) of zdot_thread_function --- kernel/x86_64/zdot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c index 90fd86daf5..1bc785ac11 100644 --- a/kernel/x86_64/zdot.c +++ b/kernel/x86_64/zdot.c @@ -168,7 +168,7 @@ static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLO #if defined(SMP) static int zdot_thread_function(BLASLONG n, BLASLONG dummy0, -BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, +BLASLONG dummy1, FLOAT dummy2r, FLOAT dummy2i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) { zdot_compute(n, x, inc_x, y, inc_y, (void *)result); From 0c1c903f1eb79719aa159b497cc2089d9fe61556 Mon Sep 17 00:00:00 2001 From: "Chen, Guobing" Date: Wed, 12 Aug 2020 03:28:25 +0800 Subject: [PATCH 137/349] Fix OMP num specify issue In current code, no matter what number of threads specified, all available CPU count is used when invoking OMP, which leads to very bad performance if the workload is small while all available CPUs are big. 
Lots of time are wasted on inter-thread sync. Fix this issue by really using the number specified by the variable 'num' from calling API. Signed-off-by: Chen, Guobing --- driver/others/blas_server_omp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index b4eb27c251..d9969b5998 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -335,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ break; } -#pragma omp parallel for schedule(OMP_SCHED) +#pragma omp parallel for num_threads(num) schedule(OMP_SCHED) for (i = 0; i < num; i ++) { #ifndef USE_SIMPLE_THREADED_LEVEL3 From 48a1364e105fccc7162adeab0de22487d52d88d3 Mon Sep 17 00:00:00 2001 From: pkubaj Date: Sun, 23 Aug 2020 18:50:19 +0000 Subject: [PATCH 138/349] Add aliases for armv6, armv7 FreeBSD uses those names for 32-bit ARM variants. --- Makefile.system | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.system b/Makefile.system index 2286d14f22..e7d3dc4ce8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -25,6 +25,10 @@ else ifeq ($(ARCH), powerpc) override ARCH=power else ifeq ($(ARCH), i386) override ARCH=x86 +else ifeq ($(ARCH), armv6) +override ARCH=arm +else ifeq ($(ARCH), armv7) +override ARCH=arm else ifeq ($(ARCH), aarch64) override ARCH=arm64 else ifeq ($(ARCH), zarch) From 936966a42c1f2f0c63b49dc0a47e7e3039e520eb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Aug 2020 10:59:08 +0200 Subject: [PATCH 139/349] Make ILAENV and xGETRF2 functions available --- relapack/src/lapack.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/relapack/src/lapack.h b/relapack/src/lapack.h index 776b0589fa..9e9cdff7ee 100644 --- a/relapack/src/lapack.h +++ b/relapack/src/lapack.h @@ -4,6 +4,13 @@ extern blasint LAPACK(lsame)(const char *, const char *); extern blasint LAPACK(xerbla)(const char *, const blasint *, int); +extern const blasint LAPACK(ilaenv)(const 
blasint *, const char*, const char*, const blasint* , int , int, int ); + +extern void LAPACK(sgetrf2)(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(dgetrf2)(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(cgetrf2)(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); +extern void LAPACK(zgetrf2)(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); + extern void LAPACK(slaswp)(const blasint *, float *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); extern void LAPACK(dlaswp)(const blasint *, double *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); extern void LAPACK(claswp)(const blasint *, float *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); From 6797a3a1e0b3ad9f5df62e2b751c8d5ac50cbaf5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Aug 2020 11:15:12 +0200 Subject: [PATCH 140/349] Add early returns --- relapack/src/cgetrf.c | 9 +++++++-- relapack/src/chegst.c | 2 ++ relapack/src/chetrf_rook.c | 4 ++-- relapack/src/clauum.c | 2 ++ relapack/src/cpotrf.c | 3 +++ relapack/src/csytrf.c | 3 ++- relapack/src/csytrf_rook.c | 4 ++-- relapack/src/ctgsyl.c | 7 +++++++ relapack/src/ctrsyl.c | 5 +++++ relapack/src/ctrtri.c | 2 ++ 10 files changed, 34 insertions(+), 7 deletions(-) diff --git a/relapack/src/cgetrf.c b/relapack/src/cgetrf.c index 878c9ec15b..bf9ca53f48 100644 --- a/relapack/src/cgetrf.c +++ b/relapack/src/cgetrf.c @@ -30,6 +30,8 @@ void RELAPACK_cgetrf( return; } + if (*m == 0 || *n == 0) return; + const blasint sn = MIN(*m, *n); RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -62,9 +64,11 @@ static void RELAPACK_cgetrf_rec( blasint *info ) { - if (*n <= MAX(CROSSOVER_CGETRF, 1)) { + if (*m == 0 || *n == 0) return; + + if ( *n <= MAX(CROSSOVER_CGETRF, 1)) { 
// Unblocked - LAPACK(cgetf2)(m, n, A, ldA, ipiv, info); + LAPACK(cgetrf2)(m, n, A, ldA, ipiv, info); return; } @@ -96,6 +100,7 @@ static void RELAPACK_cgetrf_rec( // recursion(A_L, ipiv_T) RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); + if (*info) return; // apply pivots to A_R LAPACK(claswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); diff --git a/relapack/src/chegst.c b/relapack/src/chegst.c index fe77b03eae..8557c29523 100644 --- a/relapack/src/chegst.c +++ b/relapack/src/chegst.c @@ -40,6 +40,8 @@ void RELAPACK_chegst( return; } + if (*n == 0) return; + // Clean char * arguments const char cleanuplo = lower ? 'L' : 'U'; diff --git a/relapack/src/chetrf_rook.c b/relapack/src/chetrf_rook.c index 3d2fa32160..9ed1261cff 100644 --- a/relapack/src/chetrf_rook.c +++ b/relapack/src/chetrf_rook.c @@ -36,7 +36,7 @@ void RELAPACK_chetrf_rook( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork < 1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -56,7 +56,7 @@ void RELAPACK_chetrf_rook( if (*info) { const blasint minfo = -*info; - LAPACK(xerbla)("CHETRF", &minfo, strlen("CHETRF")); + LAPACK(xerbla)("CHETRF_ROOK", &minfo, strlen("CHETRF_ROOK")); return; } diff --git a/relapack/src/clauum.c b/relapack/src/clauum.c index 2bc93f182b..58a14e7da0 100644 --- a/relapack/src/clauum.c +++ b/relapack/src/clauum.c @@ -32,6 +32,8 @@ void RELAPACK_clauum( return; } + if (*n == 0) return; + // Clean char * arguments const char cleanuplo = lower ? 'L' : 'U'; diff --git a/relapack/src/cpotrf.c b/relapack/src/cpotrf.c index 0f8e7ebb06..db06c6fefc 100644 --- a/relapack/src/cpotrf.c +++ b/relapack/src/cpotrf.c @@ -32,6 +32,8 @@ void RELAPACK_cpotrf( return; } + if (*n == 0) return; + // Clean char * arguments const char cleanuplo = lower ? 
'L' : 'U'; @@ -46,6 +48,7 @@ static void RELAPACK_cpotrf_rec( float *A, const blasint *ldA, blasint *info ){ + if (*n == 0) return; if (*n <= MAX(CROSSOVER_CPOTRF, 1)) { // Unblocked diff --git a/relapack/src/csytrf.c b/relapack/src/csytrf.c index 2ebc310014..807c91eced 100644 --- a/relapack/src/csytrf.c +++ b/relapack/src/csytrf.c @@ -36,7 +36,7 @@ void RELAPACK_csytrf( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork < 1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -67,6 +67,7 @@ void RELAPACK_csytrf( blasint nout; // Recursive kernel +if (*n != 0) RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); #if XSYTRF_ALLOW_MALLOC diff --git a/relapack/src/csytrf_rook.c b/relapack/src/csytrf_rook.c index e8a9865cca..105c6b8b69 100644 --- a/relapack/src/csytrf_rook.c +++ b/relapack/src/csytrf_rook.c @@ -36,7 +36,7 @@ void RELAPACK_csytrf_rook( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork < 1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -56,7 +56,7 @@ void RELAPACK_csytrf_rook( if (*info) { const blasint minfo = -*info; - LAPACK(xerbla)("CSYTRF", &minfo, strlen("CSYTRF")); + LAPACK(xerbla)("CSYTRF_ROOK", &minfo, strlen("CSYTRF_ROOK")); return; } diff --git a/relapack/src/ctgsyl.c b/relapack/src/ctgsyl.c index 704f3ef232..632bbc14ee 100644 --- a/relapack/src/ctgsyl.c +++ b/relapack/src/ctgsyl.c @@ -68,6 +68,13 @@ void RELAPACK_ctgsyl( return; } + if ( *m == 0 || *n == 0) { + *scale = 1.; + if (notran && (*ijob != 0)) + *dif = 0.; + return; + } + // Clean char * arguments const char cleantrans = notran ? 
'N' : 'C'; diff --git a/relapack/src/ctrsyl.c b/relapack/src/ctrsyl.c index fed6e847e5..f7b841cb00 100644 --- a/relapack/src/ctrsyl.c +++ b/relapack/src/ctrsyl.c @@ -47,6 +47,11 @@ void RELAPACK_ctrsyl( return; } + if (*m == 0 || *n == 0) { + *scale = 1.; + return; + } + // Clean char * arguments const char cleantranA = notransA ? 'N' : 'C'; const char cleantranB = notransB ? 'N' : 'C'; diff --git a/relapack/src/ctrtri.c b/relapack/src/ctrtri.c index 5201a24c73..8d736007b6 100644 --- a/relapack/src/ctrtri.c +++ b/relapack/src/ctrtri.c @@ -36,6 +36,8 @@ void RELAPACK_ctrtri( return; } + if (*n == 0) return; + // Clean char * arguments const char cleanuplo = lower ? 'L' : 'U'; const char cleandiag = nounit ? 'N' : 'U'; From c9b67141f0827ccdfefd0197b6f0daba50f35dc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Aug 2020 11:20:31 +0200 Subject: [PATCH 141/349] Add early returns --- relapack/src/dgetrf.c | 12 ++++++------ relapack/src/dsytrf.c | 3 ++- relapack/src/dsytrf_rook.c | 4 ++-- relapack/src/dtrsyl.c | 5 +++++ relapack/src/zgetrf.c | 9 +++++++-- relapack/src/zhetrf_rook.c | 4 ++-- relapack/src/zsytrf.c | 3 ++- relapack/src/zsytrf_rook.c | 5 +++-- relapack/src/ztrsyl.c | 5 +++++ relapack/src/ztrtri.c | 4 ++-- 10 files changed, 36 insertions(+), 18 deletions(-) diff --git a/relapack/src/dgetrf.c b/relapack/src/dgetrf.c index be960fde9e..3ebfb18d2d 100644 --- a/relapack/src/dgetrf.c +++ b/relapack/src/dgetrf.c @@ -29,15 +29,16 @@ void RELAPACK_dgetrf( return; } - const blasint sn = MIN(*m, *n); + if (*m == 0 || *n == 0) return; + const blasint sn = MIN(*m, *n); RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info); // Right remainder if (*m < *n) { // Constants const double ONE[] = { 1. }; - const blasint iONE[] = { 1. 
}; + const blasint iONE[] = { 1 }; // Splitting const blasint rn = *n - *m; @@ -60,13 +61,11 @@ static void RELAPACK_dgetrf_rec( double *A, const blasint *ldA, blasint *ipiv, blasint *info ) { - - if (*n <= MAX(CROSSOVER_DGETRF, 1)) { + if ( *n <= MAX(CROSSOVER_DGETRF, 1)) { // Unblocked - LAPACK(dgetf2)(m, n, A, ldA, ipiv, info); + LAPACK(dgetrf2)(m, n, A, ldA, ipiv, info); return; } - // Constants const double ONE[] = { 1. }; const double MONE[] = { -1. }; @@ -95,6 +94,7 @@ static void RELAPACK_dgetrf_rec( // recursion(A_L, ipiv_T) RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); + if (*info) return; // apply pivots to A_R LAPACK(dlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); diff --git a/relapack/src/dsytrf.c b/relapack/src/dsytrf.c index 43d28f94eb..ba869ad118 100644 --- a/relapack/src/dsytrf.c +++ b/relapack/src/dsytrf.c @@ -36,7 +36,7 @@ void RELAPACK_dsytrf( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork < 1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -67,6 +67,7 @@ void RELAPACK_dsytrf( blasint nout; // Recursive kernel +if (*n != 0) RELAPACK_dsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); #if XSYTRF_ALLOW_MALLOC diff --git a/relapack/src/dsytrf_rook.c b/relapack/src/dsytrf_rook.c index 78fa652abe..fcdc2809ff 100644 --- a/relapack/src/dsytrf_rook.c +++ b/relapack/src/dsytrf_rook.c @@ -36,7 +36,7 @@ void RELAPACK_dsytrf_rook( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork <1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -56,7 +56,7 @@ void RELAPACK_dsytrf_rook( if (*info) { const blasint minfo = -*info; - LAPACK(xerbla)("DSYTRF", &minfo, strlen("DSYTRF")); + LAPACK(xerbla)("DSYTRF_ROOK", &minfo, strlen("DSYTRF_ROOK")); return; } diff --git a/relapack/src/dtrsyl.c b/relapack/src/dtrsyl.c 
index 7663773007..4948c49776 100644 --- a/relapack/src/dtrsyl.c +++ b/relapack/src/dtrsyl.c @@ -49,6 +49,11 @@ void RELAPACK_dtrsyl( return; } + if (*m == 0 || *n == 0) { + *scale = 1.; + return; + } + // Clean char * arguments const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C'); const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C'); diff --git a/relapack/src/zgetrf.c b/relapack/src/zgetrf.c index b0d14ffb1e..8c3e8a8e87 100644 --- a/relapack/src/zgetrf.c +++ b/relapack/src/zgetrf.c @@ -30,6 +30,7 @@ void RELAPACK_zgetrf( return; } + if (*m == 0 || *n == 0) return; const blasint sn = MIN(*m, *n); RELAPACK_zgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -62,9 +63,11 @@ static void RELAPACK_zgetrf_rec( blasint *info ) { - if (*n <= MAX(CROSSOVER_ZGETRF, 1)) { + if (*m == 0 || *n == 0) return; + + if ( *n <= MAX(CROSSOVER_ZGETRF, 1)) { // Unblocked - LAPACK(zgetf2)(m, n, A, ldA, ipiv, info); + LAPACK(zgetrf2)(m, n, A, ldA, ipiv, info); return; } @@ -96,6 +99,8 @@ static void RELAPACK_zgetrf_rec( // recursion(A_L, ipiv_T) RELAPACK_zgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); +if (*info) return; + // apply pivots to A_R LAPACK(zlaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); diff --git a/relapack/src/zhetrf_rook.c b/relapack/src/zhetrf_rook.c index 285aea96e8..605e3a77f7 100644 --- a/relapack/src/zhetrf_rook.c +++ b/relapack/src/zhetrf_rook.c @@ -36,7 +36,7 @@ void RELAPACK_zhetrf_rook( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork < 1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -56,7 +56,7 @@ void RELAPACK_zhetrf_rook( if (*info) { const blasint minfo = -*info; - LAPACK(xerbla)("ZHETRF", &minfo, strlen("ZHETRF")); + LAPACK(xerbla)("ZHETRF_ROOK", &minfo, strlen("ZHETRF_ROOK")); return; } diff --git a/relapack/src/zsytrf.c b/relapack/src/zsytrf.c index f3412ad8f3..59daba02f7 100644 --- a/relapack/src/zsytrf.c +++ b/relapack/src/zsytrf.c 
@@ -36,7 +36,7 @@ void RELAPACK_zsytrf( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork < 1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -67,6 +67,7 @@ void RELAPACK_zsytrf( blasint nout; // Recursive kernel + if (*n != 0) RELAPACK_zsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); #if XSYTRF_ALLOW_MALLOC diff --git a/relapack/src/zsytrf_rook.c b/relapack/src/zsytrf_rook.c index fc6d736455..0fd8e70335 100644 --- a/relapack/src/zsytrf_rook.c +++ b/relapack/src/zsytrf_rook.c @@ -36,7 +36,7 @@ void RELAPACK_zsytrf_rook( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork < 1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -56,7 +56,7 @@ void RELAPACK_zsytrf_rook( if (*info) { const blasint minfo = -*info; - LAPACK(xerbla)("ZSYTRF", &minfo, strlen("ZSYTRF")); + LAPACK(xerbla)("ZSYTRF_ROOK", &minfo, strlen("ZSYTRF_ROOK")); return; } @@ -67,6 +67,7 @@ void RELAPACK_zsytrf_rook( blasint nout; // Recursive kernel + if (*n != 0) RELAPACK_zsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); #if XSYTRF_ALLOW_MALLOC diff --git a/relapack/src/ztrsyl.c b/relapack/src/ztrsyl.c index 567ef115a8..9d0107526b 100644 --- a/relapack/src/ztrsyl.c +++ b/relapack/src/ztrsyl.c @@ -47,6 +47,11 @@ void RELAPACK_ztrsyl( return; } + if (*m == 0 || *n == 0) { + *scale = 1.; + return; + } + // Clean char * arguments const char cleantranA = notransA ? 'N' : 'C'; const char cleantranB = notransB ? 'N' : 'C'; diff --git a/relapack/src/ztrtri.c b/relapack/src/ztrtri.c index 3f6606d84b..54854f5253 100644 --- a/relapack/src/ztrtri.c +++ b/relapack/src/ztrtri.c @@ -69,8 +69,8 @@ static void RELAPACK_ztrtri_rec( } // Constants - const double ONE[] = { 1. }; - const double MONE[] = { -1. }; + const double ONE[] = { 1., 0. 
}; + const double MONE[] = { -1. , 0. }; // Splitting const blasint n1 = ZREC_SPLIT(*n); From d64cc2be8143225330bbc5b7877b155a1df3a90f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Aug 2020 11:22:50 +0200 Subject: [PATCH 142/349] Add early returns --- relapack/src/sgetrf.c | 15 +++++++++++---- relapack/src/ssytrf.c | 3 ++- relapack/src/ssytrf_rook.c | 5 +++-- relapack/src/strsyl.c | 5 +++++ 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/relapack/src/sgetrf.c b/relapack/src/sgetrf.c index 0231cc166f..a0c7015fd0 100644 --- a/relapack/src/sgetrf.c +++ b/relapack/src/sgetrf.c @@ -14,7 +14,6 @@ void RELAPACK_sgetrf( float *A, const blasint *ldA, blasint *ipiv, blasint *info ) { - // Check arguments *info = 0; if (*m < 0) @@ -28,6 +27,9 @@ void RELAPACK_sgetrf( LAPACK(xerbla)("SGETRF", &minfo, strlen("SGETRF")); return; } + + if (*m == 0 || *n == 0) return; + const blasint sn = MIN(*m, *n); RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -35,7 +37,7 @@ void RELAPACK_sgetrf( if (*m < *n) { // Constants const float ONE[] = { 1. }; - const blasint iONE[] = { 1. 
}; + const blasint iONE[] = { 1 }; // Splitting const blasint rn = *n - *m; @@ -58,9 +60,12 @@ static void RELAPACK_sgetrf_rec( float *A, const blasint *ldA, blasint *ipiv, blasint *info ) { - if (*n <= MAX(CROSSOVER_SGETRF, 1)) { + + if (*m == 0 || *n == 0) return; + + if ( *n <= MAX(CROSSOVER_SGETRF, 1)) { // Unblocked - LAPACK(sgetf2)(m, n, A, ldA, ipiv, info); + LAPACK(sgetrf2)(m, n, A, ldA, ipiv, info); return; } @@ -91,6 +96,8 @@ static void RELAPACK_sgetrf_rec( // recursion(A_L, ipiv_T) RELAPACK_sgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); + if (*info) + return; // apply pivots to A_R LAPACK(slaswp)(&n2, A_R, ldA, iONE, &n1, ipiv_T, iONE); diff --git a/relapack/src/ssytrf.c b/relapack/src/ssytrf.c index 9fe7ce4a6e..5f8e033913 100644 --- a/relapack/src/ssytrf.c +++ b/relapack/src/ssytrf.c @@ -35,7 +35,7 @@ void RELAPACK_ssytrf( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork <1 || *lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -66,6 +66,7 @@ void RELAPACK_ssytrf( blasint nout; // Recursive kernel +if (*n != 0) RELAPACK_ssytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); #if XSYTRF_ALLOW_MALLOC diff --git a/relapack/src/ssytrf_rook.c b/relapack/src/ssytrf_rook.c index abcf29d1cb..b40f12271a 100644 --- a/relapack/src/ssytrf_rook.c +++ b/relapack/src/ssytrf_rook.c @@ -36,7 +36,7 @@ void RELAPACK_ssytrf_rook( *info = -2; else if (*ldA < MAX(1, *n)) *info = -4; - else if (*lWork < minlWork && *lWork != -1) + else if ((*lWork < 1 ||*lWork < minlWork) && *lWork != -1) *info = -7; else if (*lWork == -1) { // Work size query @@ -56,7 +56,7 @@ void RELAPACK_ssytrf_rook( if (*info) { const blasint minfo = -*info; - LAPACK(xerbla)("SSYTRF", &minfo, strlen("SSYTRF")); + LAPACK(xerbla)("SSYTRF_ROOK", &minfo, strlen("SSYTRF_ROOK")); return; } @@ -67,6 +67,7 @@ void RELAPACK_ssytrf_rook( blasint nout; // Recursive kernel +if (*n != 0) 
RELAPACK_ssytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); #if XSYTRF_ALLOW_MALLOC diff --git a/relapack/src/strsyl.c b/relapack/src/strsyl.c index 012fb35486..d85963fcc1 100644 --- a/relapack/src/strsyl.c +++ b/relapack/src/strsyl.c @@ -49,6 +49,11 @@ void RELAPACK_strsyl( return; } + if (*m == 0 || *n == 0) { + *scale = 1.; + return; + } + // Clean char * arguments const char cleantranA = notransA ? 'N' : (transA ? 'T' : 'C'); const char cleantranB = notransB ? 'N' : (transB ? 'T' : 'C'); From de636757173680ba0a936588ca7b42cdf7ff6c9a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 27 Aug 2020 11:25:18 +0200 Subject: [PATCH 143/349] Add early returns and fix sign errors in workspace calculations --- relapack/src/cgbtrf.c | 11 ++++++----- relapack/src/cpbtrf.c | 10 ++++++---- relapack/src/dgbtrf.c | 5 ++++- relapack/src/dpbtrf.c | 10 ++++++---- relapack/src/sgbtrf.c | 16 ++++++++++++---- relapack/src/spbtrf.c | 13 +++++++++---- relapack/src/zgbtrf.c | 16 +++++++++++++++- relapack/src/zpbtrf.c | 11 +++++++---- 8 files changed, 65 insertions(+), 27 deletions(-) diff --git a/relapack/src/cgbtrf.c b/relapack/src/cgbtrf.c index 61332c6a6c..e52f2e6c15 100644 --- a/relapack/src/cgbtrf.c +++ b/relapack/src/cgbtrf.c @@ -36,6 +36,7 @@ void RELAPACK_cgbtrf( return; } + if (*m == 0 || *n == 0) return; // Constant const float ZERO[] = { 0., 0. }; @@ -56,10 +57,10 @@ void RELAPACK_cgbtrf( // Allocate work space const blasint n1 = CREC_SPLIT(*n); - const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const blasint nWorkl = (kv > n1) ? n1 : kv; - const blasint mWorku = (*kl > n1) ? n1 : *kl; - const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const blasint mWorkl = abs ( (kv > n1) ? MAX(1, *m - *kl) : kv); + const blasint nWorkl = abs ( (kv > n1) ? n1 : kv); + const blasint mWorku = abs ((*kl > n1) ? n1 : *kl); + const blasint nWorku = abs ((*kl > n1) ? 
MAX(0, *n - *kl) : *kl); float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float)); float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float)); LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -82,7 +83,7 @@ static void RELAPACK_cgbtrf_rec( blasint *info ) { - if (*n <= MAX(CROSSOVER_CGBTRF, 1)) { + if (*n <= MAX(CROSSOVER_CGBTRF, 1)|| *n > *kl || *ldAb == 1) { // Unblocked LAPACK(cgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); return; diff --git a/relapack/src/cpbtrf.c b/relapack/src/cpbtrf.c index 971e547c64..a0fa138509 100644 --- a/relapack/src/cpbtrf.c +++ b/relapack/src/cpbtrf.c @@ -35,6 +35,8 @@ void RELAPACK_cpbtrf( return; } + if (*n == 0) return; + // Clean char * arguments const char cleanuplo = lower ? 'L' : 'U'; @@ -43,8 +45,8 @@ void RELAPACK_cpbtrf( // Allocate work space const blasint n1 = CREC_SPLIT(*n); - const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint mWork = abs((*kd > n1) ? (lower ? *n - *kd : n1) : *kd); + const blasint nWork = abs((*kd > n1) ? (lower ? n1 : *n - *kd) : *kd); float *Work = malloc(mWork * nWork * 2 * sizeof(float)); LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -64,7 +66,7 @@ static void RELAPACK_cpbtrf_rec( blasint *info ){ - if (*n <= MAX(CROSSOVER_CPBTRF, 1)) { + if (*n <= MAX(CROSSOVER_CPBTRF, 1) || *ldAb==1) { // Unblocked LAPACK(cpbtf2)(uplo, n, kd, Ab, ldAb, info); return; @@ -148,7 +150,7 @@ static void RELAPACK_cpbtrf_rec( } // recursion(A_BR) - if (*kd > n1) + if (*kd > n1 && ldA != 0) RELAPACK_cpotrf(uplo, &n2, A_BR, ldA, info); else RELAPACK_cpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); diff --git a/relapack/src/dgbtrf.c b/relapack/src/dgbtrf.c index cdf06ad5be..aac10f251e 100644 --- a/relapack/src/dgbtrf.c +++ b/relapack/src/dgbtrf.c @@ -36,6 +36,8 @@ void RELAPACK_dgbtrf( return; } + if (*m == 0 || *n == 0) return; + // Constant const double ZERO[] = { 0. 
}; @@ -83,7 +85,7 @@ static void RELAPACK_dgbtrf_rec( blasint *info ) { - if (*n <= MAX(CROSSOVER_DGBTRF, 1)) { + if (*n <= MAX(CROSSOVER_DGBTRF, 1) || *n > *kl || *ldAb == 1) { // Unblocked LAPACK(dgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); return; @@ -195,6 +197,7 @@ static void RELAPACK_dgbtrf_rec( // Worku = A_TRr LAPACK(dlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); // Worku = A_TL \ Worku + if (ldWorku <= 0) return; BLAS(dtrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); // A_TRr = Worku LAPACK(dlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); diff --git a/relapack/src/dpbtrf.c b/relapack/src/dpbtrf.c index 9380b28ad6..94e9b80e28 100644 --- a/relapack/src/dpbtrf.c +++ b/relapack/src/dpbtrf.c @@ -35,6 +35,8 @@ void RELAPACK_dpbtrf( return; } + if (*n == 0) return; + // Clean char * arguments const char cleanuplo = lower ? 'L' : 'U'; @@ -43,8 +45,8 @@ void RELAPACK_dpbtrf( // Allocate work space const blasint n1 = DREC_SPLIT(*n); - const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint mWork = abs((*kd > n1) ? (lower ? *n - *kd : n1) : *kd); + const blasint nWork = abs((*kd > n1) ? (lower ? 
n1 : *n - *kd) : *kd); double *Work = malloc(mWork * nWork * sizeof(double)); LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -64,7 +66,7 @@ static void RELAPACK_dpbtrf_rec( blasint *info ){ - if (*n <= MAX(CROSSOVER_DPBTRF, 1)) { + if (*n <= MAX(CROSSOVER_DPBTRF, 1) || *ldAb == 1) { // Unblocked LAPACK(dpbtf2)(uplo, n, kd, Ab, ldAb, info); return; @@ -148,7 +150,7 @@ static void RELAPACK_dpbtrf_rec( } // recursion(A_BR) - if (*kd > n1) + if (*kd > n1 && ldA != 0) RELAPACK_dpotrf(uplo, &n2, A_BR, ldA, info); else RELAPACK_dpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); diff --git a/relapack/src/sgbtrf.c b/relapack/src/sgbtrf.c index 3e3fdf4555..76e84e6715 100644 --- a/relapack/src/sgbtrf.c +++ b/relapack/src/sgbtrf.c @@ -35,6 +35,13 @@ void RELAPACK_sgbtrf( return; } + if (*m == 0 || *n == 0) return; + + if (*ldAb == 1) { + LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); + return; + } + // Constant const float ZERO[] = { 0. }; @@ -82,8 +89,9 @@ static void RELAPACK_sgbtrf_rec( blasint *info ) { + if (*m == 0 || *n == 0) return; - if (*n <= MAX(CROSSOVER_SGBTRF, 1)) { + if ( *n <= MAX(CROSSOVER_SGBTRF, 1) || *n > *kl || *ldAb == 1) { // Unblocked LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); return; @@ -160,7 +168,7 @@ static void RELAPACK_sgbtrf_rec( // recursion(Ab_L, ipiv_T) RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); - + if (*info) return; // Workl = A_BLb LAPACK(slacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); @@ -222,8 +230,8 @@ static void RELAPACK_sgbtrf_rec( // recursion(Ab_BR, ipiv_B) //cause of infinite recursion here ? 
-// RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); - LAPACK(sgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); + RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); +// LAPACK(sgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); if (*info) *info += n1; // shift pivots diff --git a/relapack/src/spbtrf.c b/relapack/src/spbtrf.c index 26804dcc2f..330276312a 100644 --- a/relapack/src/spbtrf.c +++ b/relapack/src/spbtrf.c @@ -35,6 +35,9 @@ void RELAPACK_spbtrf( return; } + + if (*n == 0) return; + // Clean char * arguments const char cleanuplo = lower ? 'L' : 'U'; @@ -43,8 +46,8 @@ void RELAPACK_spbtrf( // Allocate work space const blasint n1 = SREC_SPLIT(*n); - const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint mWork = abs( (*kd > n1) ? (lower ? *n - *kd : n1) : *kd); + const blasint nWork = abs((*kd > n1) ? (lower ? n1 : *n - *kd) : *kd); float *Work = malloc(mWork * nWork * sizeof(float)); LAPACK(slaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -64,7 +67,9 @@ static void RELAPACK_spbtrf_rec( blasint *info ){ - if (*n <= MAX(CROSSOVER_SPBTRF, 1)) { + if (*n == 0 ) return; + + if ( *n <= MAX(CROSSOVER_SPBTRF, 1) || *ldAb == 1) { // Unblocked LAPACK(spbtf2)(uplo, n, kd, Ab, ldAb, info); return; @@ -148,7 +153,7 @@ static void RELAPACK_spbtrf_rec( } // recursion(A_BR) - if (*kd > n1) + if (*kd > n1 && ldA != 0) RELAPACK_spotrf(uplo, &n2, A_BR, ldA, info); else RELAPACK_spbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); diff --git a/relapack/src/zgbtrf.c b/relapack/src/zgbtrf.c index d4ba417531..5d7dfd3c70 100644 --- a/relapack/src/zgbtrf.c +++ b/relapack/src/zgbtrf.c @@ -36,6 +36,8 @@ void RELAPACK_zgbtrf( return; } + if (*m == 0 || *n == 0) return; + // Constant const double ZERO[] = { 0., 0. 
}; @@ -82,7 +84,7 @@ static void RELAPACK_zgbtrf_rec( blasint *info ) { - if (*n <= MAX(CROSSOVER_ZGBTRF, 1)) { + if (*n <= MAX(CROSSOVER_ZGBTRF, 1) || *n > *kl || *ldAb == 1) { // Unblocked LAPACK(zgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); return; @@ -92,6 +94,7 @@ static void RELAPACK_zgbtrf_rec( const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; const blasint iONE[] = { 1 }; + const blasint min11 = -11; // Loop iterators blasint i, j; @@ -158,6 +161,7 @@ static void RELAPACK_zgbtrf_rec( // recursion(Ab_L, ipiv_T) RELAPACK_zgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); +if (*info) return; // Workl = A_BLb LAPACK(zlacpy)("U", &m22, &n1, A_BLb, ldA, Workl, ldWorkl); @@ -193,11 +197,21 @@ static void RELAPACK_zgbtrf_rec( } // A_TRl = A_TL \ A_TRl + if (*ldA < MAX(1,m1)) { + LAPACK(xerbla)("ZGBTRF", &min11, strlen("ZGBTRF")); + return; + } else { BLAS(ztrsm)("L", "L", "N", "U", &m1, &n21, ONE, A_TL, ldA, A_TRl, ldA); + } // Worku = A_TRr LAPACK(zlacpy)("L", &m1, &n22, A_TRr, ldA, Worku, ldWorku); // Worku = A_TL \ Worku + if (*ldWorku < MAX(1,m1)) { + LAPACK(xerbla)("ZGBTRF", &min11, strlen("ZGBTRF")); + return; + } else { BLAS(ztrsm)("L", "L", "N", "U", &m1, &n22, ONE, A_TL, ldA, Worku, ldWorku); + } // A_TRr = Worku LAPACK(zlacpy)("L", &m1, &n22, Worku, ldWorku, A_TRr, ldA); // A_BRtl = A_BRtl - A_BLt * A_TRl diff --git a/relapack/src/zpbtrf.c b/relapack/src/zpbtrf.c index fb0e1e97b5..8b094380c0 100644 --- a/relapack/src/zpbtrf.c +++ b/relapack/src/zpbtrf.c @@ -35,6 +35,8 @@ void RELAPACK_zpbtrf( return; } + if (*n == 0) return; + // Clean char * arguments const char cleanuplo = lower ? 'L' : 'U'; @@ -43,9 +45,10 @@ void RELAPACK_zpbtrf( // Allocate work space const blasint n1 = ZREC_SPLIT(*n); - const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const blasint mWork = abs((*kd > n1) ? (lower ? 
*n - *kd : n1) : *kd); + const blasint nWork = abs((*kd > n1) ? (lower ? n1 : *n - *kd) : *kd); double *Work = malloc(mWork * nWork * 2 * sizeof(double)); + LAPACK(zlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); // Recursive kernel @@ -64,7 +67,7 @@ static void RELAPACK_zpbtrf_rec( blasint *info ){ - if (*n <= MAX(CROSSOVER_ZPBTRF, 1)) { + if (*n <= MAX(CROSSOVER_ZPBTRF, 1) || *ldAb == 1) { // Unblocked LAPACK(zpbtf2)(uplo, n, kd, Ab, ldAb, info); return; @@ -148,7 +151,7 @@ static void RELAPACK_zpbtrf_rec( } // recursion(A_BR) - if (*kd > n1) + if (*kd > n1 && ldA != 0) RELAPACK_zpotrf(uplo, &n2, A_BR, ldA, info); else RELAPACK_zpbtrf_rec(uplo, &n2, kd, Ab_BR, ldAb, Work, ldWork, info); From 085aae8bdb137ed2156f2bb4f005a17cd3106384 Mon Sep 17 00:00:00 2001 From: Kevin Adler Date: Thu, 27 Aug 2020 23:08:33 -0500 Subject: [PATCH 144/349] Fix compile error on AIX cpuid detection In 589c74a the cpuid detection was changed to use systemcfg, but a copy and paste error was introduced during some refactoring that caused POWER7 detection to reference CPUTYPE_POWER7 (which doesn't exist) instead of CPUTYPE_POWER6. 
--- cpuid_power.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_power.c b/cpuid_power.c index df3dc86686..b17493bc8f 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -145,7 +145,7 @@ int detect(void){ if (implementation >= 0x40000u) return CPUTYPE_POWER10; else if (implementation & 0x20000) return CPUTYPE_POWER9; else if (implementation & 0x10000) return CPUTYPE_POWER8; - else if (implementation & 0x08000) return CPUTYPE_POWER7; // POWER 7 + else if (implementation & 0x08000) return CPUTYPE_POWER6; // POWER 7 else if (implementation & 0x04000) return CPUTYPE_POWER6; else if (implementation & 0x02000) return CPUTYPE_POWER5; else if (implementation & 0x01000) return CPUTYPE_POWER4; // MPC7450 From 317ff27cda58fbd06f195bea27cab2448b55a0ac Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 28 Aug 2020 10:42:54 -0500 Subject: [PATCH 145/349] POWER10: Avoid setting accumulators to zero in gemm kernels For the first iteration, it is better to use xvf*ger instead of xvf*gerpp builtins which helps to avoid setting accumulators to zero. This helps to reduce a few instructions.
--- kernel/power/dgemm_kernel_power10.c | 156 ++++++++++++--------- kernel/power/sgemm_kernel_power10.c | 204 +++++++++++++++++----------- 2 files changed, 222 insertions(+), 138 deletions(-) diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index a0bc1a777a..b2a29140e7 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -87,22 +87,6 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); rowC[0] += result[1] * alpha; #endif -#define SET_ACC_ZERO4() \ - __builtin_mma_xxsetaccz (&acc0); \ - __builtin_mma_xxsetaccz (&acc1); \ - __builtin_mma_xxsetaccz (&acc2); \ - __builtin_mma_xxsetaccz (&acc3); - -#define SET_ACC_ZERO8() \ - __builtin_mma_xxsetaccz (&acc0); \ - __builtin_mma_xxsetaccz (&acc1); \ - __builtin_mma_xxsetaccz (&acc2); \ - __builtin_mma_xxsetaccz (&acc3); \ - __builtin_mma_xxsetaccz (&acc4); \ - __builtin_mma_xxsetaccz (&acc5); \ - __builtin_mma_xxsetaccz (&acc6); \ - __builtin_mma_xxsetaccz (&acc7); - #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) @@ -210,12 +194,22 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, PREFETCH1 (CO + ldc + ldc, 128); PREFETCH1 (CO + ldc + ldc + ldc, 128); __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - SET_ACC_ZERO8 (); - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); + __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); + for 
(l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 4]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 4]; + rb = (vec_t *) & BO[l << 2]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -254,13 +248,19 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; - SET_ACC_ZERO4 (); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 3]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 3]; + rb = (vec_t *) & BO[l << 2]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -291,14 +291,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 2]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 2]; + rb = (vec_t *) & BO[l << 2]; __builtin_mma_assemble_pair 
(&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); @@ -325,13 +328,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0; - __builtin_mma_xxsetaccz (&acc0); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 1]; + rb = (vec_t *) & BO[l << 2]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } @@ -414,16 +420,27 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - SET_ACC_ZERO8 (); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[0], t[1] = BO[1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); + __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); + __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); + __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); + __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0, 0, 0, 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; + rb = (vec_t *) & t[0]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - vec_t *rowA = (vec_t *) & AO[l << 4]; + rowA = (vec_t *) & AO[l 
<< 4]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); @@ -461,16 +478,23 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; - SET_ACC_ZERO4 (); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[0], t[1] = BO[1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0, 0, 0, 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; + rb = (vec_t *) & t[0]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - vec_t *rowA = (vec_t *) & AO[l << 3]; + rowA = (vec_t *) & AO[l << 3]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); @@ -500,17 +524,21 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[0], t[1] = BO[1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0, 0, 0, 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; + rb = (vec_t *) & 
t[0]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - vec_t *rowA = (vec_t *) & AO[l << 2]; + rowA = (vec_t *) & AO[l << 2]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); } @@ -536,16 +564,20 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0; - __builtin_mma_xxsetaccz (&acc0); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0, 0, 0, 0 }; + t[0] = BO[0], t[1] = BO[1]; + __vector_pair rowB; + vec_t *rb = (vec_t *) & t[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0, 0, 0, 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; + rb = (vec_t *) & t[0]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - vec_t *rowA = (vec_t *) & AO[l << 1]; + rowA = (vec_t *) & AO[l << 1]; __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } SAVE2x4_ACC (&acc0, 0); diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c index 81a5ec76b2..9fbf84695a 100644 --- a/kernel/power/sgemm_kernel_power10.c +++ b/kernel/power/sgemm_kernel_power10.c @@ -134,21 +134,6 @@ typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); __builtin_mma_xvf32gerpp (&acc5, rowB[i+1], rowA[j+2]); \ __builtin_mma_xvf32gerpp (&acc6, rowB[i], rowA[j+3]); \ __builtin_mma_xvf32gerpp (&acc7, rowB[i+1], rowA[j+3]); -#define SET_ACC_ZERO4() \ - __builtin_mma_xxsetaccz (&acc0); \ - __builtin_mma_xxsetaccz (&acc1); \ - __builtin_mma_xxsetaccz (&acc2); \ - __builtin_mma_xxsetaccz (&acc3); - -#define SET_ACC_ZERO8() \ - __builtin_mma_xxsetaccz (&acc0); \ - __builtin_mma_xxsetaccz (&acc1); \ - __builtin_mma_xxsetaccz (&acc2); \ - __builtin_mma_xxsetaccz (&acc3); \ - __builtin_mma_xxsetaccz (&acc4); \ - __builtin_mma_xxsetaccz (&acc5); \ - 
__builtin_mma_xxsetaccz (&acc6); \ - __builtin_mma_xxsetaccz (&acc7); #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); @@ -249,8 +234,20 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - SET_ACC_ZERO8 (); BLASLONG l = 0; + vec_t *rowA1 = (vec_t *) & AO[0]; + vec_t *rowB1 = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB1[0], rowA1[0]); + __builtin_mma_xvf32ger (&acc1, rowB1[1], rowA1[0]); + __builtin_mma_xvf32ger (&acc2, rowB1[0], rowA1[1]); + __builtin_mma_xvf32ger (&acc3, rowB1[1], rowA1[1]); + __builtin_mma_xvf32ger (&acc4, rowB1[0], rowA1[2]); + __builtin_mma_xvf32ger (&acc5, rowB1[1], rowA1[2]); + __builtin_mma_xvf32ger (&acc6, rowB1[0], rowA1[3]); + __builtin_mma_xvf32ger (&acc7, rowB1[1], rowA1[3]); + AO += 16; + BO += 8; + temp--; BLASLONG K = temp / 64; for (l = 0; l < K; l++) { @@ -454,12 +451,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; - SET_ACC_ZERO4 (); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]); + __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[1]); + __builtin_mma_xvf32ger (&acc3, rowB[1], rowA[1]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 3]; - vec_t *rowB = (vec_t *) & BO[l << 3]; + rowA = (vec_t *) & AO[l << 3]; + rowB = (vec_t *) & BO[l << 3]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[1]); @@ -489,13 +491,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1; - 
__builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 2]; - vec_t *rowB = (vec_t *) & BO[l << 3]; + rowA = (vec_t *) & AO[l << 2]; + rowB = (vec_t *) & BO[l << 3]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); } @@ -522,15 +526,18 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v2sf_t *rowC; v2sf_t result[8]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0 }; + t[0] = AO[0], t[1] = AO[1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[1], rowA[0]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0 }; t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; - vec_t *rowA = (vec_t *) & t[0]; - vec_t *rowB = (vec_t *) & BO[l << 3]; + rowA = (vec_t *) & t[0]; + rowB = (vec_t *) & BO[l << 3]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[1], rowA[0]); } @@ -625,13 +632,23 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, FLOAT *A1; A1 = AO + (16 * k); __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - SET_ACC_ZERO8 (); BLASLONG l = 0; - for (l = 0; l < k; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowA1 = (vec_t *) & A1[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32ger 
(&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]); + for (l = 1; l < k; l++) { - vec_t *rowA = (vec_t *) & AO[l << 4]; - vec_t *rowA1 = (vec_t *) & A1[l << 4]; - vec_t *rowB = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 4]; + rowA1 = (vec_t *) & A1[l << 4]; + rowB = (vec_t *) & BO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); @@ -673,12 +690,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; - SET_ACC_ZERO4 (); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 4]; - vec_t *rowB = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 4]; + rowB = (vec_t *) & BO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); @@ -710,13 +732,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 3]; - vec_t *rowB = (vec_t *) & BO[l << 2]; + 
rowA = (vec_t *) & AO[l << 3]; + rowB = (vec_t *) & BO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); } @@ -742,12 +766,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; __vector_quad acc0; v4sf_t result[4]; - __builtin_mma_xxsetaccz (&acc0); BLASLONG l = 0; - for (l = 0; l < temp; l++) + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + for (l = 1; l < temp; l++) { - vec_t *rowA = (vec_t *) & AO[l << 2]; - vec_t *rowB = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 2]; + rowB = (vec_t *) & BO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); } SAVE_ACC (&acc0, 0); @@ -771,14 +797,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v2sf_t *rowC; v2sf_t result[8]; __vector_quad acc0; - __builtin_mma_xxsetaccz (&acc0); BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0 }; + t[0] = AO[0], t[1] = AO[1]; + vec_t *rowA = (vec_t *) & t[0]; + vec_t *rowB = (vec_t *) & BO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0 }; t[0] = AO[l << 1], t[1] = AO[(l << 1) + 1]; - vec_t *rowA = (vec_t *) & t[0]; - vec_t *rowB = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & t[0]; + rowB = (vec_t *) & BO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); } SAVE4x2_ACC (&acc0, 0); @@ -856,15 +885,26 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, FLOAT *A1; A1 = AO + (16 * k); __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - SET_ACC_ZERO8 (); BLASLONG l = 0; - for (l = 0; l < k; l++) + FLOAT t[4] = { 0 }; + t[0] = BO[0], t[1] = BO[1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[0]; + vec_t *rowA1 = (vec_t *) & A1[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], 
rowA[1]); + __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]); + __builtin_mma_xvf32ger (&acc4, rowB[0], rowA1[0]); + __builtin_mma_xvf32ger (&acc5, rowB[0], rowA1[1]); + __builtin_mma_xvf32ger (&acc6, rowB[0], rowA1[2]); + __builtin_mma_xvf32ger (&acc7, rowB[0], rowA1[3]); + for (l = 1; l < k; l++) { - FLOAT t[4] = { 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - vec_t *rowB = (vec_t *) & t[0]; - vec_t *rowA = (vec_t *) & AO[l << 4]; - vec_t *rowA1 = (vec_t *) & A1[l << 4]; + rowB = (vec_t *) & t[0]; + rowA = (vec_t *) & AO[l << 4]; + rowA1 = (vec_t *) & A1[l << 4]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); @@ -897,7 +937,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1, acc2, acc3; - SET_ACC_ZERO4 (); BLASLONG l = 0; #if defined(TRMMKERNEL) REFRESH_POINTERS (16, 2) @@ -905,12 +944,19 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BO = B; temp = k; #endif - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0 }; + t[0] = BO[0], t[1] = BO[1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + __builtin_mma_xvf32ger (&acc2, rowB[0], rowA[2]); + __builtin_mma_xvf32ger (&acc3, rowB[0], rowA[3]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - vec_t *rowB = (vec_t *) & t[0]; - vec_t *rowA = (vec_t *) & AO[l << 4]; + rowB = (vec_t *) & t[0]; + rowA = (vec_t *) & AO[l << 4]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); __builtin_mma_xvf32gerpp (&acc2, rowB[0], rowA[2]); @@ -934,8 +980,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, 
FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0, acc1; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); #if defined(TRMMKERNEL) REFRESH_POINTERS (8, 2) #else @@ -943,12 +987,17 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, temp = k; #endif BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0 }; + t[0] = BO[0], t[1] = BO[1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + __builtin_mma_xvf32ger (&acc1, rowB[0], rowA[1]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - vec_t *rowB = (vec_t *) & t[0]; - vec_t *rowA = (vec_t *) & AO[l << 3]; + rowB = (vec_t *) & t[0]; + rowA = (vec_t *) & AO[l << 3]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); __builtin_mma_xvf32gerpp (&acc1, rowB[0], rowA[1]); } @@ -968,7 +1017,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, v4sf_t *rowC; v4sf_t result[4]; __vector_quad acc0; - __builtin_mma_xxsetaccz (&acc0); #if defined(TRMMKERNEL) REFRESH_POINTERS (4, 2) #else @@ -976,12 +1024,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, temp = k; #endif BLASLONG l = 0; - for (l = 0; l < temp; l++) + FLOAT t[4] = { 0 }; + t[0] = BO[0], t[1] = BO[1]; + vec_t *rowB = (vec_t *) & t[0]; + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf32ger (&acc0, rowB[0], rowA[0]); + for (l = 1; l < temp; l++) { - FLOAT t[4] = { 0 }; t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - vec_t *rowB = (vec_t *) & t[0]; - vec_t *rowA = (vec_t *) & AO[l << 2]; + rowB = (vec_t *) & t[0]; + rowA = (vec_t *) & AO[l << 2]; __builtin_mma_xvf32gerpp (&acc0, rowB[0], rowA[0]); } SAVE2x4_ACC (&acc0, 0); From cb3c190a3a46057782fb518e81b51fc7909e01d8 Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Fri, 21 Aug 2020 14:44:36 +0800 Subject: [PATCH 146/349] Implementaion of dasum, sasum with AVX2 & 
AVX512 intrinsic --- kernel/x86_64/KERNEL.HASWELL | 2 + kernel/x86_64/dasum.c | 96 ++++++++++++++++++++++ kernel/x86_64/dasum_microk_haswell-2.c | 35 ++++++++ kernel/x86_64/dasum_microk_skylakex-2.c | 27 ++++++ kernel/x86_64/sasum.c | 104 ++++++++++++++++++++++++ kernel/x86_64/sasum_microk_haswell-2.c | 36 ++++++++ kernel/x86_64/sasum_microk_skylakex-2.c | 27 ++++++ 7 files changed, 327 insertions(+) create mode 100644 kernel/x86_64/dasum.c create mode 100644 kernel/x86_64/dasum_microk_haswell-2.c create mode 100644 kernel/x86_64/dasum_microk_skylakex-2.c create mode 100644 kernel/x86_64/sasum.c create mode 100644 kernel/x86_64/sasum_microk_haswell-2.c create mode 100644 kernel/x86_64/sasum_microk_skylakex-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index ef8b36a573..b979fc0aed 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -100,3 +100,5 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c new file mode 100644 index 0000000000..31313416b6 --- /dev/null +++ b/kernel/x86_64/dasum.c @@ -0,0 +1,96 @@ +#include "common.h" +#include + +#define ABS fabs + +#if defined(SKYLAKEX) +#include "dasum_microk_skylakex-2.c" +#elif defined(HASWELL) +#include "dasum_microk_haswell-2.c" +#endif + +#ifndef HAVE_KERNEL_16 +static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) +{ + + BLASLONG i=0; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + + while ( i< n ) + { + + temp0 = ABS(x[0]); + temp1 = ABS(x[1]); + temp2 = ABS(x[2]); + temp3 = ABS(x[3]); + temp4 = ABS(x[4]); + temp5 = ABS(x[5]); + temp6 = ABS(x[6]); + temp7 = ABS(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += 
temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=8; + + } + + return sum0+sum1+sum2+sum3; +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) + { + + n1 = n & -16; + if ( n1 > 0 ) + { + + sumf = dasum_kernel_16(n1, x); + i=n1; + } + + while(i < n) + { + sumf += ABS(x[i]); + i++; + } + + } + else + { + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + + } + return(sumf); +} + diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c new file mode 100644 index 0000000000..bf9d85e73b --- /dev/null +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -0,0 +1,35 @@ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_KERNEL_16 1 + +#include + +static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) +{ + BLASLONG i = 0; + __m256d accum_0, accum_1, accum_2, accum_3; + + accum_0 = _mm256_setzero_pd(); + accum_1 = _mm256_setzero_pd(); + accum_2 = _mm256_setzero_pd(); + accum_3 = _mm256_setzero_pd(); + + __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); + for (; i < n; i += 16) { + accum_0 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask); + accum_1 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 4]), abs_mask); + accum_2 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask); + accum_3 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+12]), abs_mask); + } + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + + __m128d half_accum0; + half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); + + half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); + + return half_accum0[0]; + +} +#endif diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c new file mode 100644 
index 0000000000..2c959b1ad6 --- /dev/null +++ b/kernel/x86_64/dasum_microk_skylakex-2.c @@ -0,0 +1,27 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#if defined(__AVX512CD__) +#define HAVE_KERNEL_16 1 + +#include + +static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) +{ + BLASLONG i = 0; + + __m512d accum_0, accum_1; + + accum_0 = _mm512_setzero_pd(); + accum_1 = _mm512_setzero_pd(); + + for (; i < n; i += 16) { + accum_0 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 0])); + accum_1 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 8])); + } + + accum_0 += accum_1; + return _mm512_reduce_add_pd(accum_0); +} +#endif +#endif diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c new file mode 100644 index 0000000000..6012555465 --- /dev/null +++ b/kernel/x86_64/sasum.c @@ -0,0 +1,104 @@ +#include "common.h" +#include + +#if defined(DOUBLE) + +#error supports float only + +#else + +#define ABS fabsf + +#endif + +#if defined(SKYLAKEX) +#include "sasum_microk_skylakex-2.c" +#elif defined(HASWELL) +#include "sasum_microk_haswell-2.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1) +{ + + BLASLONG i=0; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + + while ( i< n ) + { + + temp0 = ABS(x[0]); + temp1 = ABS(x[1]); + temp2 = ABS(x[2]); + temp3 = ABS(x[3]); + temp4 = ABS(x[4]); + temp5 = ABS(x[5]); + temp6 = ABS(x[6]); + temp7 = ABS(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=8; + + } + + return sum0+sum1+sum2+sum3; +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x 
== 1 ) + { + + n1 = n & -32; + if ( n1 > 0 ) + { + + sumf = sasum_kernel_32(n1, x); + i=n1; + } + + while(i < n) + { + sumf += ABS(x[i]); + i++; + } + + } + else + { + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + + } + return(sumf); +} diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c new file mode 100644 index 0000000000..f46e76ebfe --- /dev/null +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -0,0 +1,36 @@ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) + +#define HAVE_KERNEL_32 1 + +#include + +static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1) +{ + BLASLONG i = 0; + __m256 accum_0, accum_1, accum_2, accum_3; + + accum_0 = _mm256_setzero_ps(); + accum_1 = _mm256_setzero_ps(); + accum_2 = _mm256_setzero_ps(); + accum_3 = _mm256_setzero_ps(); + + __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); + for (; i < n; i += 32) { + accum_0 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask); + accum_1 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask); + accum_2 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+16]), abs_mask); + accum_3 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+24]), abs_mask); + } + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + + __m128 half_accum0; + half_accum0 = _mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1)); + + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + + return half_accum0[0]; + +} +#endif diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c new file mode 100644 index 0000000000..b1c49fd097 --- /dev/null +++ b/kernel/x86_64/sasum_microk_skylakex-2.c @@ -0,0 +1,27 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ 
>= 6)) + +#if defined(__AVX512CD__) +#define HAVE_KERNEL_32 1 + +#include + +static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1) +{ + BLASLONG i = 0; + + __m512 accum_0, accum_1; + + accum_0 = _mm512_setzero_ps(); + accum_1 = _mm512_setzero_ps(); + + for (; i < n; i += 32) { + accum_0 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 0])); + accum_1 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 16])); + } + + accum_0 += accum_1; + return _mm512_reduce_add_ps(accum_0); +} +#endif +#endif From 448152cdd809c6ab16f1767660e2f4b5b3aa4ef6 Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Mon, 31 Aug 2020 14:39:08 +0800 Subject: [PATCH 147/349] define __AVX2__ to ensure the haswell code compiled with avx2 --- kernel/x86_64/dasum_microk_haswell-2.c | 2 +- kernel/x86_64/sasum_microk_haswell-2.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c index bf9d85e73b..7639dfd041 100644 --- a/kernel/x86_64/dasum_microk_haswell-2.c +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -1,4 +1,4 @@ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) #define HAVE_KERNEL_16 1 diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c index f46e76ebfe..b628729f50 100644 --- a/kernel/x86_64/sasum_microk_haswell-2.c +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -1,4 +1,4 @@ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) #define HAVE_KERNEL_32 1 From 5feb087c05beff18208c31b369d74dc3badeada3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Aug 2020 20:02:08 +0200 Subject: [PATCH 148/349] Handle Apple labeling armv8 as 
arm64 rather than aarch64 --- cmake/system_check.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 4382ffc4e2..511a7c7d1e 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -54,14 +54,14 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") set(X86 1) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") - set(ARM 1) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)") if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") set(ARM64 1) else() set(ARM 1) endif() +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") + set(ARM 1) elseif (${CMAKE_CROSSCOMPILING}) if (${TARGET} STREQUAL "CORE2") if (NOT BINARY) From 3210a427345126112d3a1501d2ea8024aea861cc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 31 Aug 2020 20:03:21 +0200 Subject: [PATCH 149/349] Report cpu as ARMV8 instead of just giving up on non-Linux hosts --- cpuid_arm64.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 6f41be6048..1fd43148a3 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -197,6 +197,8 @@ int detect(void) } +#else + return CPU_ARMV8; #endif return CPU_UNKNOWN; From f42e84d46c52f4ee1e05af8f365cd85de8a77b95 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Sep 2020 10:44:48 +0200 Subject: [PATCH 150/349] Fix misnaming of LAPACK_?ggsvp function prototypes as LAPACKE_ (#2808) * Fix misnaming of LAPACK_?ggsvp and ?ggsvd function prototypes as LAPACKE_ * Drop the LAPACKE matrix_layout parameter from the argument lists, change ints to pointers and add missing work arguments. 
--- lapack-netlib/LAPACKE/include/lapack.h | 116 +++++++++++++------------ 1 file changed, 62 insertions(+), 54 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 36e53ec245..4f48b7c879 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -3650,45 +3650,45 @@ void LAPACK_zggrqf( lapack_int* info ); #define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD) -lapack_int LAPACKE_sggsvd( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int n, lapack_int p, +lapack_int LAPACK_sggsvd( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l, float* a, - lapack_int lda, float* b, lapack_int ldb, - float* alpha, float* beta, float* u, lapack_int ldu, - float* v, lapack_int ldv, float* q, lapack_int ldq, - lapack_int* iwork ); + lapack_int* lda, float* b, lapack_int* ldb, + float* alpha, float* beta, float* u, lapack_int* ldu, + float* v, lapack_int* ldv, float* q, lapack_int* ldq, + float* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD) -lapack_int LAPACKE_dggsvd( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int n, lapack_int p, +lapack_int LAPACK_dggsvd( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l, double* a, - lapack_int lda, double* b, lapack_int ldb, + lapack_int* lda, double* b, lapack_int* ldb, double* alpha, double* beta, double* u, - lapack_int ldu, double* v, lapack_int ldv, double* q, - lapack_int ldq, lapack_int* iwork ); + lapack_int* ldu, double* v, lapack_int* ldv, double* q, + lapack_int* ldq, float* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD) -lapack_int LAPACKE_cggsvd( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int n, 
lapack_int p, +lapack_int LAPACK_cggsvd( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l, - lapack_complex_float* a, lapack_int lda, - lapack_complex_float* b, lapack_int ldb, + lapack_complex_float* a, lapack_int* lda, + lapack_complex_float* b, lapack_int* ldb, float* alpha, float* beta, lapack_complex_float* u, - lapack_int ldu, lapack_complex_float* v, - lapack_int ldv, lapack_complex_float* q, - lapack_int ldq, lapack_int* iwork ); + lapack_int* ldu, lapack_complex_float* v, + lapack_int* ldv, lapack_complex_float* q, + lapack_int* ldq, float* work, lapack_int* rwork, lapack_int* iwork, lapack_int *info ); #define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD) -lapack_int LAPACKE_zggsvd( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int n, lapack_int p, +lapack_int LAPACK_zggsvd( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l, - lapack_complex_double* a, lapack_int lda, - lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* a, lapack_int* lda, + lapack_complex_double* b, lapack_int* ldb, double* alpha, double* beta, - lapack_complex_double* u, lapack_int ldu, - lapack_complex_double* v, lapack_int ldv, - lapack_complex_double* q, lapack_int ldq, - lapack_int* iwork ); + lapack_complex_double* u, lapack_int* ldu, + lapack_complex_double* v, lapack_int* ldv, + lapack_complex_double* q, lapack_int* ldq, + float* work, lapack_int* rwork, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd3 LAPACK_GLOBAL(cggsvd3,CGGSVD3) void LAPACK_cggsvd3( @@ -3753,41 +3753,49 @@ void LAPACK_zggsvd3( lapack_int* info ); #define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP) -lapack_int LAPACKE_sggsvp( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int p, lapack_int n, float* a, - lapack_int lda, float* b, lapack_int ldb, float tola, - float tolb, 
lapack_int* k, lapack_int* l, float* u, - lapack_int ldu, float* v, lapack_int ldv, float* q, - lapack_int ldq ); +lapack_int LAPACK_sggsvp( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* p, lapack_int* n, float* a, + lapack_int* lda, float* b, lapack_int* ldb, float* tola, + float* tolb, lapack_int* k, lapack_int* l, float* u, + lapack_int* ldu, float* v, lapack_int* ldv, float* q, + lapack_int* ldq, lapack_int* iwork, float* tau, + float* work, lapack_int* info); #define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP) -lapack_int LAPACKE_dggsvp( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int p, lapack_int n, double* a, - lapack_int lda, double* b, lapack_int ldb, - double tola, double tolb, lapack_int* k, - lapack_int* l, double* u, lapack_int ldu, double* v, - lapack_int ldv, double* q, lapack_int ldq ); +lapack_int LAPACK_dggsvp( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* p, lapack_int* n, double* a, + lapack_int* lda, double* b, lapack_int* ldb, + double* tola, double* tolb, lapack_int* k, + lapack_int* l, double* u, lapack_int* ldu, double* v, + lapack_int* ldv, double* q, lapack_int* ldq, + lapack_int* iwork, double* tau, double* work, + lapack_int* info); #define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP) -lapack_int LAPACKE_cggsvp( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int p, lapack_int n, - lapack_complex_float* a, lapack_int lda, - lapack_complex_float* b, lapack_int ldb, float tola, - float tolb, lapack_int* k, lapack_int* l, - lapack_complex_float* u, lapack_int ldu, - lapack_complex_float* v, lapack_int ldv, - lapack_complex_float* q, lapack_int ldq ); +lapack_int LAPACK_cggsvp( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* p, lapack_int* n, + lapack_complex_float* a, lapack_int* lda, + lapack_complex_float* b, lapack_int* ldb, float* tola, + float* tolb, lapack_int* k, 
lapack_int* l, + lapack_complex_float* u, lapack_int* ldu, + lapack_complex_float* v, lapack_int* ldv, + lapack_complex_float* q, lapack_int* ldq, + lapack_int* iwork, lapack_int* rwork, + lapack_complex_float* tau, lapack_complex_float* work, + lapack_int* info); #define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP) -lapack_int LAPACKE_zggsvp( int matrix_layout, char jobu, char jobv, char jobq, - lapack_int m, lapack_int p, lapack_int n, - lapack_complex_double* a, lapack_int lda, - lapack_complex_double* b, lapack_int ldb, - double tola, double tolb, lapack_int* k, +lapack_int LAPACK_zggsvp( char const* jobu, char const* jobv, char const* jobq, + lapack_int* m, lapack_int* p, lapack_int* n, + lapack_complex_double* a, lapack_int* lda, + lapack_complex_double* b, lapack_int* ldb, + double* tola, double* tolb, lapack_int* k, lapack_int* l, lapack_complex_double* u, - lapack_int ldu, lapack_complex_double* v, - lapack_int ldv, lapack_complex_double* q, - lapack_int ldq ); + lapack_int* ldu, lapack_complex_double* v, + lapack_int* ldv, lapack_complex_double* q, + lapack_int* ldq, lapack_int* iwork, lapack_int* rwork, + lapack_complex_double* tau, lapack_complex_double* work, + lapack_int* info); #define LAPACK_cggsvp3 LAPACK_GLOBAL(cggsvp3,CGGSVP3) void LAPACK_cggsvp3( From 68b1713c300ac152d1efcb3c02f0c59fafcd39e1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Sep 2020 17:19:14 +0200 Subject: [PATCH 151/349] Merge pull request #2811 from martin-frbg/issue2806 Make NO_AVX512 option override the AVX512 compile test in CMAKE builds as well --- cmake/system.cmake | 5 +++++ cmake/system_check.cmake | 2 ++ 2 files changed, 7 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index e3617c4e28..c0f3c6ed2e 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -110,6 +110,11 @@ if (NO_AVX2) set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX2") endif () +if (NO_AVX512) + message(STATUS "Disabling Advanced Vector Extensions 512 (AVX512).") + 
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX512") +endif () + if (CMAKE_BUILD_TYPE STREQUAL "Debug") set(GETARCH_FLAGS "${GETARCH_FLAGS} ${CMAKE_C_FLAGS_DEBUG}") endif () diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 511a7c7d1e..d06f4779fd 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -109,6 +109,7 @@ else() endif() if (X86_64 OR X86) +if (NOT NO_AVX512) file(WRITE ${PROJECT_BINARY_DIR}/avx512.c "#include \n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o ${PROJECT_BINARY_DIR}/avx512.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) @@ -116,6 +117,7 @@ set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") endif() file(REMOVE "avx512.c" "avx512.o") endif() +endif() include(CheckIncludeFile) CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) From e4900caa1180e9b13766a97e708992f9df61b1a1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Sep 2020 19:54:08 +0200 Subject: [PATCH 152/349] Fix c_check misinterpreting arm64 in uname output to mean armv7 additionla fix for upcoming OSX on ARM64 related to #2804, as suggested by fxcoudert in #2805 --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index 314c2b157c..5ea93b75ce 100644 --- a/c_check +++ b/c_check @@ -8,7 +8,7 @@ $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); $hostarch = `uname -p` if ($hostos eq "AIX"); $hostarch = "x86_64" if ($hostarch eq "amd64"); -$hostarch = "arm" if ($hostarch =~ /^arm.*/); +$hostarch = "arm" if ($hostarch ne "arm64" && $hostarch =~ /^arm.*/); $hostarch = "arm64" if ($hostarch eq "aarch64"); $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); $hostarch = "zarch" if ($hostarch eq "s390x"); From 60ef193258f580115640794e0c867ef45cb16974 Mon Sep 17 00:00:00 2001 From: 
Marius Hillenbrand Date: Tue, 1 Sep 2020 13:59:06 +0200 Subject: [PATCH 153/349] s390x: use "lghi" for immediate values to fix build with clang Some of the kernels written in assembly utilize a "load address" instruction for loading an immediate value into a register. That is both unnecessarily complex and LLVM's assembler does not understand that specific syntax. Thus, replace with the appropriate "load immediate" instruction, which is also clearer to read. Signed-off-by: Marius Hillenbrand --- kernel/zarch/ctrmm4x4V.S | 18 +++++++++--------- kernel/zarch/gemm8x4V.S | 24 ++++++++++++------------ kernel/zarch/strmm8x4V.S | 24 ++++++++++++------------ kernel/zarch/ztrmm4x4V.S | 18 +++++++++--------- 4 files changed, 42 insertions(+), 42 deletions(-) diff --git a/kernel/zarch/ctrmm4x4V.S b/kernel/zarch/ctrmm4x4V.S index c0e4df17d1..123f2ead0c 100644 --- a/kernel/zarch/ctrmm4x4V.S +++ b/kernel/zarch/ctrmm4x4V.S @@ -198,7 +198,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,4 nill LOCAL_VAR1,3 #else - la LOCAL_VAR1,3(0,0) + lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x4_BK_Store @@ -254,7 +254,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,4 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x4_BK_Store @@ -305,7 +305,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,4 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x4_BK_Store @@ -385,7 +385,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x2_BK_Store @@ -442,7 +442,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x2_BK_Store @@ -492,7 +492,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR 
LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x2_BK_Store @@ -568,7 +568,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x1_BK_Store @@ -620,7 +620,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x1_BK_Store @@ -670,7 +670,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x1_BK_Store diff --git a/kernel/zarch/gemm8x4V.S b/kernel/zarch/gemm8x4V.S index 27fd5f57b5..633e60ea6d 100644 --- a/kernel/zarch/gemm8x4V.S +++ b/kernel/zarch/gemm8x4V.S @@ -147,7 +147,7 @@ brctg LOCAL_VAR1,.L8x4_4_BK ALIGN_4 .L8x4_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L8x4_BK_Store @@ -183,7 +183,7 @@ brctg LOCAL_VAR1,.L4x4_4_BK ALIGN_4 .L4x4_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L4x4_BK_Store @@ -217,7 +217,7 @@ brctg LOCAL_VAR1,.L2x4_4_BK ALIGN_4 .L2x4_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L2x4_BK_Store @@ -252,7 +252,7 @@ brctg LOCAL_VAR1,.L1x4_4_BK ALIGN_4 .L1x4_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L1x4_BK_Store @@ -309,7 +309,7 @@ brctg LOCAL_VAR1,.L8x2_4_BK ALIGN_4 .L8x2_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L8x2_BK_Store @@ -346,7 +346,7 @@ brctg LOCAL_VAR1,.L4x2_4_BK ALIGN_4 .L4x2_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L4x2_BK_Store @@ -380,7 +380,7 @@ brctg LOCAL_VAR1,.L2x2_4_BK ALIGN_4 .L2x2_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L2x2_BK_Store @@ -415,7 +415,7 @@ brctg LOCAL_VAR1,.L1x2_4_BK ALIGN_4 .L1x2_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR 
LOCAL_VAR1,BK /*refresh BK*/ jz .L1x2_BK_Store @@ -471,7 +471,7 @@ brctg LOCAL_VAR1,.L8x1_4_BK ALIGN_4 .L8x1_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L8x1_BK_Store @@ -508,7 +508,7 @@ brctg LOCAL_VAR1,.L4x1_4_BK ALIGN_4 .L4x1_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L4x1_BK_Store @@ -542,7 +542,7 @@ brctg LOCAL_VAR1,.L2x1_4_BK ALIGN_4 .L2x1_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L2x1_BK_Store @@ -577,7 +577,7 @@ brctg LOCAL_VAR1,.L1x1_4_BK ALIGN_4 .L1x1_mod: -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ jz .L1x1_BK_Store diff --git a/kernel/zarch/strmm8x4V.S b/kernel/zarch/strmm8x4V.S index f8e7481670..e34a7a05ae 100644 --- a/kernel/zarch/strmm8x4V.S +++ b/kernel/zarch/strmm8x4V.S @@ -186,7 +186,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,8,4 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L8x4_BK_Store @@ -239,7 +239,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,4 nill LOCAL_VAR1,3 #else - la LOCAL_VAR1,3(0,0) + lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x4_BK_Store @@ -290,7 +290,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,4 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x4_BK_Store @@ -341,7 +341,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,4 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x4_BK_Store @@ -423,7 +423,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,8,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L8x2_BK_Store @@ -475,7 +475,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x2_BK_Store @@ -525,7 +525,7 @@ ALIGN_4 
RefreshTempBk LOCAL_VAR1,BK,OFF,2,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x2_BK_Store @@ -575,7 +575,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x2_BK_Store @@ -655,7 +655,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,8,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L8x1_BK_Store @@ -708,7 +708,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x1_BK_Store @@ -757,7 +757,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x1_BK_Store @@ -807,7 +807,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x1_BK_Store diff --git a/kernel/zarch/ztrmm4x4V.S b/kernel/zarch/ztrmm4x4V.S index 52ee15f06c..6fd7f25099 100644 --- a/kernel/zarch/ztrmm4x4V.S +++ b/kernel/zarch/ztrmm4x4V.S @@ -196,7 +196,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,4 nill LOCAL_VAR1,3 #else - la LOCAL_VAR1,3(0,0) + lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x4_BK_Store @@ -256,7 +256,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,4 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x4_BK_Store @@ -307,7 +307,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,4 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x4_BK_Store @@ -390,7 +390,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz 
.L4x2_BK_Store @@ -447,7 +447,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x2_BK_Store @@ -497,7 +497,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,2 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x2_BK_Store @@ -573,7 +573,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,4,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L4x1_BK_Store @@ -625,7 +625,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,2,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L2x1_BK_Store @@ -675,7 +675,7 @@ ALIGN_4 RefreshTempBk LOCAL_VAR1,BK,OFF,1,1 nill LOCAL_VAR1,3 #else -la LOCAL_VAR1,3(0,0) +lghi LOCAL_VAR1,3 NGR LOCAL_VAR1,BK /*refresh BK*/ #endif jz .L1x1_BK_Store From a1616a0b8653fb06d607c5f8efafa01b0106dded Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 1 Sep 2020 11:58:48 +0200 Subject: [PATCH 154/349] s390x: replace nop with "nop 0" in inline assembly ... as a bandaid for building with clang until LLVM's internal assembler supports nops without operand. 
Signed-off-by: Marius Hillenbrand --- kernel/zarch/dgemv_n_4.c | 6 +++--- kernel/zarch/dgemv_t_4.c | 2 +- kernel/zarch/icamax.c | 2 +- kernel/zarch/icamin.c | 2 +- kernel/zarch/idamax.c | 2 +- kernel/zarch/idamin.c | 2 +- kernel/zarch/idmax.c | 2 +- kernel/zarch/idmin.c | 2 +- kernel/zarch/isamax.c | 2 +- kernel/zarch/isamin.c | 2 +- kernel/zarch/ismax.c | 2 +- kernel/zarch/ismin.c | 2 +- kernel/zarch/izamax.c | 2 +- kernel/zarch/izamin.c | 2 +- kernel/zarch/sgemv_n_4.c | 6 +++--- kernel/zarch/sgemv_t_4.c | 2 +- 16 files changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index 502ba837ea..b2a3d1e8d3 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -169,7 +169,7 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" - "nop" + "nop 0" : "+m"(*(struct { FLOAT x[n]; } *) y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), @@ -274,7 +274,7 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" - "nop" + "nop 0" : "+m"(*(struct { FLOAT x[n]; } *) y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), @@ -351,7 +351,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" - "nop" + "nop 0" : "+m"(*(struct { FLOAT x[n]; } *) y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index de72a1798a..30cec14f7d 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -438,7 +438,7 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "agfi %%r1,32\n\t" 
"brctg %%r0,2b\n\t" "3:\n\t" - "nop" + "nop 0" : "+m"(*(struct { FLOAT x[n]; } *) dest) : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), [src] "a"(src),[n] "r"(n) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index a2546b8124..2d5c48407e 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -213,7 +213,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "ste %%f0,%[amax]\n\t" "vlgvg %[iamax],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 09654b7426..1d51bb2c2b 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -213,7 +213,7 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "ste %%f0,%[amin]\n\t" "vlgvg %[iamin],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index b292c1d151..f9bfe34945 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -160,7 +160,7 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "std %%f0,%[amax]\n\t" "vlgvg %[iamax],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index f9a8119e15..b7ce700275 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -160,7 +160,7 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "std %%f0,%[amin]\n\t" "vlgvg %[iamin],%%v1,0\n\t" "2:\n\t" - "nop" + 
"nop 0" : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 8f283bc170..55471ce506 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -140,7 +140,7 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { "std %%f0,%[max]\n\t" "vlgvg %[imax],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index e4b7bb4fe3..ec1c69822e 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -140,7 +140,7 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { "std %%f0,%[min]\n\t" "vlgvg %[imin],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index ac86435d77..6ea46c7162 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -204,7 +204,7 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) { "ste %%f0,%[amax]\n\t" "vlgvg %[iamax],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 3f2d039eb9..18cfa2a6e1 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -204,7 +204,7 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) { "ste %%f0,%[amin]\n\t" "vlgvg %[iamin],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [iamin] "=r"(iamin),[amin] 
"=Q"(*amin),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 41172c1bd3..be990b9d55 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -184,7 +184,7 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) { "ste %%f0,%[max]\n\t" "vlgvg %[imax],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index e2684df416..a27c8a743e 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -184,7 +184,7 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) { "ste %%f0,%[min]\n\t" "vlgvg %[imin],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index daca1d6f71..cb299cb246 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -157,7 +157,7 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) { "std %%f0,%[amax]\n\t" "vlgvg %[iamax],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 9ababb91fd..4dfa1a9db1 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -157,7 +157,7 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) { "std %%f0,%[amin]\n\t" "vlgvg %[iamin],%%v1,0\n\t" "2:\n\t" - "nop" + "nop 0" : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) : 
"m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c index a1efef373f..a0d522b831 100644 --- a/kernel/zarch/sgemv_n_4.c +++ b/kernel/zarch/sgemv_n_4.c @@ -159,7 +159,7 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "agfi %%r1,16\n\t" "brctg %%r0,2b\n\t" "3:\n\t" - "nop" + "nop 0" : "+m"(*(struct { FLOAT x[n]; } *) y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), @@ -258,7 +258,7 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "agfi %%r1,16\n\t" "brctg %%r0,2b\n\t" "3:\n\t" - "nop" + "nop 0" : "+m"(*(struct { FLOAT x[n]; } *) y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), @@ -331,7 +331,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "agfi %%r1,16\n\t" "brctg %%r0,2b\n\t" "3:\n\t" - "nop" + "nop 0" : "+m"(*(struct { FLOAT x[n]; } *) y) : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index 81d7c9fe74..81e600695f 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -431,7 +431,7 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "agfi %%r1,16\n\t" "brctg %%r0,2b\n\t" "3:\n\t" - "nop" + "nop 0" : "+m"(*(struct { FLOAT x[n]; } *) dest) : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), [src] "a"(src),[n] "r"(n) From b9b3265ec8a78762263f54944e35c849013e0cab Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 1 Sep 2020 12:04:28 +0200 Subject: [PATCH 155/349] s390x: avoid inline assembly for vector loads for clang ... 
since clang does not support the instruction format for inline assembly and also it is not required for current versions of clang. Signed-off-by: Marius Hillenbrand --- kernel/zarch/gemm_vec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index 741c094314..b7d7cc04b7 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -172,7 +172,7 @@ static inline vector_float vec_load_hinted(FLOAT const *restrict a) { vector_float const *restrict addr = (vector_float const *restrict)a; vector_float y; -#if __GNUC__ < 9 +#if __GNUC__ < 9 && !defined(__clang__) // hex-encode vl %[out],%[addr],3 asm(".insn vrx,0xe70000003006,%[out],%[addr],3" : [ out ] "=v"(y) From 87e5bbd88795d09f4bec0691d33f91e8109eb424 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 1 Sep 2020 12:08:05 +0200 Subject: [PATCH 156/349] s390x: avoid variable-length arrays in struct for asm operands ... since it is not required and clang does not support that gcc extension. Instead, use a variable-length array directly for these operands. Note that, while the actual inline assembly code does not directly use these memory operands, they serve to inform the compiler that it cannot reorder reads or writes to/from the input and output data across the inline asm statements. 
Signed-off-by: Marius Hillenbrand --- kernel/zarch/camax.c | 2 +- kernel/zarch/camin.c | 2 +- kernel/zarch/casum.c | 2 +- kernel/zarch/caxpy.c | 6 +++--- kernel/zarch/ccopy.c | 4 ++-- kernel/zarch/cdot.c | 6 +++--- kernel/zarch/cgemv_n_4.c | 30 +++++++++++++++--------------- kernel/zarch/cgemv_t_4.c | 32 ++++++++++++++++---------------- kernel/zarch/crot.c | 4 ++-- kernel/zarch/cscal.c | 14 +++++++------- kernel/zarch/csum.c | 2 +- kernel/zarch/cswap.c | 4 ++-- kernel/zarch/damax.c | 2 +- kernel/zarch/damax_z13.c | 2 +- kernel/zarch/damin.c | 2 +- kernel/zarch/damin_z13.c | 2 +- kernel/zarch/dasum.c | 2 +- kernel/zarch/daxpy.c | 4 ++-- kernel/zarch/dcopy.c | 4 ++-- kernel/zarch/ddot.c | 4 ++-- kernel/zarch/dgemv_n_4.c | 24 ++++++++++++------------ kernel/zarch/dgemv_t_4.c | 28 ++++++++++++++-------------- kernel/zarch/dmax.c | 2 +- kernel/zarch/dmax_z13.c | 2 +- kernel/zarch/dmin.c | 2 +- kernel/zarch/dmin_z13.c | 2 +- kernel/zarch/drot.c | 2 +- kernel/zarch/dscal.c | 4 ++-- kernel/zarch/dsdot.c | 4 ++-- kernel/zarch/dsum.c | 2 +- kernel/zarch/dswap.c | 2 +- kernel/zarch/icamax.c | 2 +- kernel/zarch/icamin.c | 2 +- kernel/zarch/idamax.c | 2 +- kernel/zarch/idamin.c | 2 +- kernel/zarch/idmax.c | 2 +- kernel/zarch/idmin.c | 2 +- kernel/zarch/isamax.c | 2 +- kernel/zarch/isamin.c | 2 +- kernel/zarch/ismax.c | 2 +- kernel/zarch/ismin.c | 2 +- kernel/zarch/izamax.c | 2 +- kernel/zarch/izamin.c | 2 +- kernel/zarch/samax.c | 2 +- kernel/zarch/samin.c | 2 +- kernel/zarch/sasum.c | 2 +- kernel/zarch/saxpy.c | 4 ++-- kernel/zarch/scopy.c | 4 ++-- kernel/zarch/sdot.c | 4 ++-- kernel/zarch/sgemv_n_4.c | 24 ++++++++++++------------ kernel/zarch/sgemv_t_4.c | 28 ++++++++++++++-------------- kernel/zarch/smax.c | 2 +- kernel/zarch/smin.c | 2 +- kernel/zarch/srot.c | 2 +- kernel/zarch/sscal.c | 4 ++-- kernel/zarch/ssum.c | 2 +- kernel/zarch/sswap.c | 2 +- kernel/zarch/zamax.c | 2 +- kernel/zarch/zamax_z13.c | 2 +- kernel/zarch/zamin.c | 2 +- kernel/zarch/zamin_z13.c | 2 +- 
kernel/zarch/zasum.c | 2 +- kernel/zarch/zaxpy.c | 6 +++--- kernel/zarch/zcopy.c | 4 ++-- kernel/zarch/zdot.c | 6 +++--- kernel/zarch/zgemv_n_4.c | 30 +++++++++++++++--------------- kernel/zarch/zgemv_t_4.c | 32 ++++++++++++++++---------------- kernel/zarch/zrot.c | 4 ++-- kernel/zarch/zscal.c | 14 +++++++------- kernel/zarch/zsum.c | 2 +- kernel/zarch/zswap.c | 4 ++-- 71 files changed, 212 insertions(+), 212 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index b10ca4752d..018a9a9c07 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -136,7 +136,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) { "wfmaxsb %%v0,%%v0,%%v16,0\n\t" "ler %[amax],%%f0" : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 40945fae81..7b3b366309 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -136,7 +136,7 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) { "wfminsb %%v0,%%v0,%%v16,0\n\t" "ler %[amin],%%f0" : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/casum.c b/kernel/zarch/casum.c index e28f2018c7..f3b9ed6285 100644 --- a/kernel/zarch/casum.c +++ b/kernel/zarch/casum.c @@ -108,7 +108,7 @@ static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) { "vfasb %%v24,%%v24,%%v25\n\t" "vstef %%v24,%[asum],0" : [asum] "=Q"(asum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", 
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c index 14a124ae25..c0a7a71f42 100644 --- a/kernel/zarch/caxpy.c +++ b/kernel/zarch/caxpy.c @@ -99,9 +99,9 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vst %%v19,112(%%r1,%[y])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c index 0a5e03992a..9e08edc3bf 100644 --- a/kernel/zarch/ccopy.c +++ b/kernel/zarch/ccopy.c @@ -36,9 +36,9 @@ static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y), + : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y), [n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x) + : "m"(*(const FLOAT (*)[n * 2]) x) : "cc"); } diff --git a/kernel/zarch/cdot.c b/kernel/zarch/cdot.c index d90f9c8712..0d6dfbeb1c 100644 --- a/kernel/zarch/cdot.c +++ b/kernel/zarch/cdot.c @@ -97,9 +97,9 @@ static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { "vstef %%v24,4(%[d]),1\n\t" "vstef %%v25,8(%[d]),1\n\t" "vstef %%v25,12(%[d]),0" - : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n) - : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y) + : "=m"(*(FLOAT (*)[4]) 
d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index 5c36bc3383..5fdf7717e3 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -146,12 +146,12 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vst %%v0,0(%%r1,%[y])\n\t" "agfi %%r1,16\n\t" "brctg %[n],0b\n\t" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x) + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n * 2]) ap2),[ap2] "a"(ap2), + "m"(*(const FLOAT (*)[n * 2]) ap3),[ap3] "a"(ap3), + "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -238,10 +238,10 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vst %%v0,0(%%r1,%[y])\n\t" "agfi %%r1,16\n\t" "brctg %[n],0b\n\t" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x) + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT 
(*)[n * 2]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } @@ -307,9 +307,9 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { "vst %%v0,0(%%r1,%[y])\n\t" "agfi %%r1,16\n\t" "brctg %[n],0b\n\t" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), - "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x) + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19"); } @@ -350,8 +350,8 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "vst %%v23,16(%%r1,%[dest])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) - : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), + : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src), [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); diff --git a/kernel/zarch/cgemv_t_4.c b/kernel/zarch/cgemv_t_4.c index e10edfab02..2bdac9ea13 100644 --- a/kernel/zarch/cgemv_t_4.c +++ b/kernel/zarch/cgemv_t_4.c @@ -159,13 +159,13 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmasb %%v23,%%v19,%%v21,%%v23\n\t" "vst %%v22,0(%[y])\n\t" "vst %%v23,16(%[y])" - : "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - 
"m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n * 2]) ap2),[ap2] "a"(ap2), + "m"(*(const FLOAT (*)[n * 2]) ap3),[ap3] "a"(ap3), + "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -271,11 +271,11 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmasb %%v20,%%v16,%%v18,%%v20\n\t" "vfmasb %%v20,%%v17,%%v19,%%v20\n\t" "vst %%v20,0(%[y])" - : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } @@ -361,10 +361,10 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vfmasb %%v0,%%v16,%%v18,%%v0\n\t" "vfmasb %%v0,%%v17,%%v19,%%v0\n\t" "vsteg %%v0,0(%[y]),0" - : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), 
+ "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); } diff --git a/kernel/zarch/crot.c b/kernel/zarch/crot.c index aab155f8b5..5a0990f3dc 100644 --- a/kernel/zarch/crot.c +++ b/kernel/zarch/crot.c @@ -169,8 +169,8 @@ static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vst %%v23, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x), - "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : "+m"(*(FLOAT (*)[n * 2]) x), + "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c index 9fc54cf295..f9e89a452e 100644 --- a/kernel/zarch/cscal.c +++ b/kernel/zarch/cscal.c @@ -80,8 +80,8 @@ static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), [alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", @@ -132,8 +132,8 @@ static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), [alpha] "a"(alpha) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); @@ -171,8 +171,8 @@ static 
void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), [alpha] "a"(alpha) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); @@ -194,7 +194,7 @@ static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) { "vst %%v0,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) : [x] "a"(x) : "cc", "r1", "v0"); } diff --git a/kernel/zarch/csum.c b/kernel/zarch/csum.c index e9413da8e3..b076501aa5 100644 --- a/kernel/zarch/csum.c +++ b/kernel/zarch/csum.c @@ -90,7 +90,7 @@ static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) { "vfasb %%v24,%%v24,%%v25\n\t" "vstef %%v24,%[sum],0" : [sum] "=Q"(sum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/cswap.c b/kernel/zarch/cswap.c index 198994e185..f3ab77ab5e 100644 --- a/kernel/zarch/cswap.c +++ b/kernel/zarch/cswap.c @@ -99,8 +99,8 @@ static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "vst %%v31, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x), - "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : "+m"(*(FLOAT (*)[n * 2]) x), + "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) : [x] "a"(x),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index caacb50dc1..d19181cbe6 100644 --- 
a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -76,7 +76,7 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { "wfmaxdb %%v0,%%v0,%%v16,8\n\t" "lpdr %[amax],%%f0" : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c index f3db4c108f..5bc0d17214 100644 --- a/kernel/zarch/damax_z13.c +++ b/kernel/zarch/damax_z13.c @@ -110,7 +110,7 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v0,%%v0,%%v16,%%v17\n\t" "ldr %[amax],%%f0" : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index 0163a144b3..4e0558af41 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -76,7 +76,7 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { "wfmindb %%v0,%%v0,%%v16,8\n\t" "lpdr %[amin],%%f0" : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c index 4196b2e15f..a7efd4b262 100644 --- a/kernel/zarch/damin_z13.c +++ b/kernel/zarch/damin_z13.c @@ -110,7 +110,7 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v0,%%v0,%%v16,%%v17\n\t" "ldr %[amin],%%f0" : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", 
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index aa1382b103..9703cd3bea 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -106,7 +106,7 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "vfadb %%v24,%%v24,%%v25\n\t" "vsteg %%v24,%[asum],0" : [asum] "=Q"(asum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index 5b0208c20e..4e59ef7c69 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -100,8 +100,8 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vst %%v27,240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), [alpha] "Q"(*alpha) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/dcopy.c b/kernel/zarch/dcopy.c index 691b90c64c..3c546568f7 100644 --- a/kernel/zarch/dcopy.c +++ b/kernel/zarch/dcopy.c @@ -36,8 +36,8 @@ static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x) + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x) : "cc"); } diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index 9cad68f4b6..c0ed8b72e1 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -80,8 +80,8 @@ static 
FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "adbr %%f0,%%f1\n\t" "ldr %[dot],%%f0" : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n]) y),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index b2a3d1e8d3..e1c5c44728 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -170,12 +170,12 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "brctg %%r0,2b\n\t" "3:\n\t" "nop 0" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n]) ap2),[ap2] "a"(ap2), + "m"(*(const FLOAT (*)[n]) ap3),[ap3] "a"(ap3), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "Q"(*alpha), [n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", @@ -275,10 +275,10 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "brctg %%r0,2b\n\t" "3:\n\t" "nop 0" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), 
+ : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "Q"(*alpha), [n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", @@ -352,8 +352,8 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "brctg %%r0,2b\n\t" "3:\n\t" "nop 0" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), [n] "r"(n) : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index 30cec14f7d..513cffe5a5 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -173,12 +173,12 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vrepg %%v4,%%v3,1\n\t" "adbr %%f3,%%f4\n\t" "std %%f3,24(%[y])" - : "=m"(*(struct { FLOAT x[4]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "=m"(*(FLOAT (*)[4]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n]) ap2),[ap2] "a"(ap2), + "m"(*(const FLOAT (*)[n]) ap3),[ap3] "a"(ap3), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -280,10 +280,10 
@@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vrepg %%v2,%%v1,1\n\t" "adbr %%f1,%%f2\n\t" "std %%f1,8(%[y])" - : "=m"(*(struct { FLOAT x[2]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "=m"(*(FLOAT (*)[2]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -360,8 +360,8 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { "adbr %%f0,%%f1\n\t" "std %%f0,0(%[y])" : "=m"(*(FLOAT (*)[1]) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -439,8 +439,8 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "brctg %%r0,2b\n\t" "3:\n\t" "nop 0" - : "+m"(*(struct { FLOAT x[n]; } *) dest) - : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), + : "+m"(*(FLOAT (*)[n]) dest) + : [dest] "a"(dest),[da] "Q"(da), "m"(*(const FLOAT (*)[n]) src), [src] "a"(src),[n] "r"(n) : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index cdc8d5d08f..4b76e0dd61 100644 --- a/kernel/zarch/dmax.c +++ 
b/kernel/zarch/dmax.c @@ -73,7 +73,7 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { "wfmaxdb %%v0,%%v0,%%v16,0\n\t" "ldr %[max],%%f0" : [max] "=f"(max),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c index c4e8d91f87..93acee2dbf 100644 --- a/kernel/zarch/dmax_z13.c +++ b/kernel/zarch/dmax_z13.c @@ -90,7 +90,7 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v0,%%v0,%%v16,%%v17\n\t" "ldr %[max],%%f0" : [max] "=f"(max),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index f9b129cbd9..21d55f3238 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -73,7 +73,7 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { "wfmindb %%v0,%%v0,%%v16,0\n\t" "ldr %[min],%%f0" : [min] "=f"(min),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c index 77f021c1d9..7d2dae3fb3 100644 --- a/kernel/zarch/dmin_z13.c +++ b/kernel/zarch/dmin_z13.c @@ -90,7 +90,7 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v0,%%v0,%%v16,%%v17\n\t" "ldr %[min],%%f0" : [min] "=f"(min),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", 
"v29", "v30", "v31"); diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index 11fbe15b6d..9d6d1a80db 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -169,7 +169,7 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vst %%v23, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y), [n] "+&r"(n) : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index 2961eff202..a5a5e34685 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -59,7 +59,7 @@ static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { "vst %%v31,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) : [x] "a"(x),[da] "Q"(da) : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -81,7 +81,7 @@ static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { "vst %%v0,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) : [x] "a"(x) : "cc", "r1", "v0"); } diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c index 5fa88c3b92..2952bcf42b 100644 --- a/kernel/zarch/dsdot.c +++ b/kernel/zarch/dsdot.c @@ -112,8 +112,8 @@ static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "adbr %%f0,%%f1\n\t" "ldr %[dot],%%f0" : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n]) y),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", 
"v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/dsum.c b/kernel/zarch/dsum.c index 8d44873c0b..69b9f9b41e 100644 --- a/kernel/zarch/dsum.c +++ b/kernel/zarch/dsum.c @@ -88,7 +88,7 @@ static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) { "vfadb %%v24,%%v24,%%v25\n\t" "vsteg %%v24,%[sum],0" : [sum] "=Q"(sum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index f0c9ded511..46cbbba232 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -99,7 +99,7 @@ static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "vst %%v31, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y), [n] "+&r"(n) : [x] "a"(x),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 2d5c48407e..459196d00f 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -215,7 +215,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "2:\n\t" "nop 0" : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 1d51bb2c2b..9bcf3646b1 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -215,7 +215,7 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "2:\n\t" "nop 0" : [iamin] "=r"(iamin),[amin] 
"=Q"(*amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index f9bfe34945..0f53488d32 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -162,7 +162,7 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "2:\n\t" "nop 0" : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index b7ce700275..f48bde8940 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -162,7 +162,7 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "2:\n\t" "nop 0" : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 55471ce506..1fdf1fa029 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -142,7 +142,7 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { "2:\n\t" "nop 0" : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", 
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index ec1c69822e..282f26bbd5 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -142,7 +142,7 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { "2:\n\t" "nop 0" : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 6ea46c7162..a30a96412d 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -206,7 +206,7 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) { "2:\n\t" "nop 0" : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT(*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 18cfa2a6e1..b29027ff49 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -206,7 +206,7 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) { "2:\n\t" "nop 0" : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index be990b9d55..3d751ff6b6 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -186,7 +186,7 @@ static 
BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) { "2:\n\t" "nop 0" : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index a27c8a743e..e57c0bfa65 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -186,7 +186,7 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) { "2:\n\t" "nop 0" : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index cb299cb246..fda76f4714 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -159,7 +159,7 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) { "2:\n\t" "nop 0" : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 4dfa1a9db1..412ab15ca7 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -159,7 +159,7 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) { "2:\n\t" "nop 0" : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", 
"v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index fdda6dd321..20da4406ad 100644 --- a/kernel/zarch/samax.c +++ b/kernel/zarch/samax.c @@ -78,7 +78,7 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) { "wfmaxsb %%v0,%%v0,%%v16,8\n\t" "lper %[amax],%%f0" : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index f05e851f96..e7e4fd9b76 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -78,7 +78,7 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) { "wfminsb %%v0,%%v0,%%v16,8\n\t" "lper %[amin],%%f0" : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/sasum.c b/kernel/zarch/sasum.c index d56f2697b1..4cf74f3516 100644 --- a/kernel/zarch/sasum.c +++ b/kernel/zarch/sasum.c @@ -108,7 +108,7 @@ static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) { "vfasb %%v24,%%v24,%%v25\n\t" "vstef %%v24,%[asum],0" : [asum] "=Q"(asum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/saxpy.c b/kernel/zarch/saxpy.c index ca34a47ff3..8bcb1a61b9 100644 --- a/kernel/zarch/saxpy.c +++ b/kernel/zarch/saxpy.c @@ -100,8 +100,8 @@ static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vst %%v27,240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg 
%[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), [alpha] "Q"(*alpha) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c index 5c453cfbb9..631c9f929e 100644 --- a/kernel/zarch/scopy.c +++ b/kernel/zarch/scopy.c @@ -36,8 +36,8 @@ static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x) + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x) : "cc"); } diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c index d870b30f07..d27c17162d 100644 --- a/kernel/zarch/sdot.c +++ b/kernel/zarch/sdot.c @@ -84,8 +84,8 @@ static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "aebr %%f0,%%f3\n\t" "ler %[dot],%%f0" : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n]) y),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c index a0d522b831..b4cfb61de8 100644 --- a/kernel/zarch/sgemv_n_4.c +++ b/kernel/zarch/sgemv_n_4.c @@ -160,12 +160,12 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "brctg %%r0,2b\n\t" "3:\n\t" "nop 0" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const 
struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n]) ap2),[ap2] "a"(ap2), + "m"(*(const FLOAT (*)[n]) ap3),[ap3] "a"(ap3), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "Q"(*alpha), [n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", @@ -259,10 +259,10 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "brctg %%r0,2b\n\t" "3:\n\t" "nop 0" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "Q"(*alpha), [n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", @@ -332,8 +332,8 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "brctg %%r0,2b\n\t" "3:\n\t" "nop 0" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), [n] "r"(n) : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index 81e600695f..3c708200cc 100644 
--- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -172,12 +172,12 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vrepg %%v4,%%v3,1\n\t" "aebr %%f3,%%f4\n\t" "ste %%f3,12(%[y])" - : "=m"(*(struct { FLOAT x[4]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "=m"(*(FLOAT (*)[4]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n]) ap2),[ap2] "a"(ap2), + "m"(*(const FLOAT (*)[n]) ap3),[ap3] "a"(ap3), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -278,10 +278,10 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vrepg %%v2,%%v1,1\n\t" "aebr %%f1,%%f2\n\t" "ste %%f1,4(%[y])" - : "=m"(*(struct { FLOAT x[2]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "=m"(*(FLOAT (*)[2]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -357,8 +357,8 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { "aebr %%f0,%%f1\n\t" "ste %%f0,0(%[y])" : "=m"(*(FLOAT (*)[1]) y) - : 
[y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -432,8 +432,8 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "brctg %%r0,2b\n\t" "3:\n\t" "nop 0" - : "+m"(*(struct { FLOAT x[n]; } *) dest) - : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), + : "+m"(*(FLOAT (*)[n]) dest) + : [dest] "a"(dest),[da] "Q"(da), "m"(*(const FLOAT (*)[n]) src), [src] "a"(src),[n] "r"(n) : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c index 7015aaa1da..0c7433cbc7 100644 --- a/kernel/zarch/smax.c +++ b/kernel/zarch/smax.c @@ -75,7 +75,7 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) { "wfmaxsb %%v0,%%v0,%%v16,0\n\t" "ler %[max],%%f0" : [max] "=f"(max),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT(*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index b6875c5c69..5e0f3860d9 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -75,7 +75,7 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) { "wfminsb %%v0,%%v0,%%v16,0\n\t" "ler %[min],%%f0" : [min] "=f"(min),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff 
--git a/kernel/zarch/srot.c b/kernel/zarch/srot.c index 4f471d8668..c235adcbed 100644 --- a/kernel/zarch/srot.c +++ b/kernel/zarch/srot.c @@ -169,7 +169,7 @@ static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vst %%v23, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y), [n] "+&r"(n) : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c index 9b9930dc87..da2f49eaf5 100644 --- a/kernel/zarch/sscal.c +++ b/kernel/zarch/sscal.c @@ -59,7 +59,7 @@ static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) { "vst %%v31,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) : [x] "a"(x),[da] "Q"(da) : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -81,7 +81,7 @@ static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) { "vst %%v0,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) : [x] "a"(x) : "cc", "r1", "v0"); } diff --git a/kernel/zarch/ssum.c b/kernel/zarch/ssum.c index 3f3f46a850..02aabdff65 100644 --- a/kernel/zarch/ssum.c +++ b/kernel/zarch/ssum.c @@ -91,7 +91,7 @@ static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) { "vfasb %%v24,%%v24,%%v25\n\t" "vstef %%v24,%[sum],0" : [sum] "=Q"(sum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/sswap.c b/kernel/zarch/sswap.c index 0c62f189d7..ec860765a9 100644 --- a/kernel/zarch/sswap.c +++ 
b/kernel/zarch/sswap.c @@ -99,7 +99,7 @@ static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { "vst %%v31, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y), [n] "+&r"(n) : [x] "a"(x),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index aa04ab91fe..98e40d0730 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -114,7 +114,7 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { "wfmaxdb %%v0,%%v0,%%v16,0\n\t" "ldr %[amax],%%f0" : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/zamax_z13.c b/kernel/zarch/zamax_z13.c index 37278d6dbb..f727ad67a2 100644 --- a/kernel/zarch/zamax_z13.c +++ b/kernel/zarch/zamax_z13.c @@ -123,7 +123,7 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { "vsel %%v0,%%v0,%%v16,%%v17\n\t" "ldr %[amax],%%f0" : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 0b54028532..2e43fefd9f 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -114,7 +114,7 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { "wfmindb %%v0,%%v0,%%v16,0\n\t" "ldr %[amin],%%f0" : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", 
"v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/zamin_z13.c b/kernel/zarch/zamin_z13.c index e37bb2236f..e528025956 100644 --- a/kernel/zarch/zamin_z13.c +++ b/kernel/zarch/zamin_z13.c @@ -123,7 +123,7 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { "vsel %%v0,%%v0,%%v16,%%v17\n\t" "ldr %[amin],%%f0" : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index aeef8d77e6..0003f38a5d 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -106,7 +106,7 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { "vfadb %%v24,%%v24,%%v25\n\t" "vsteg %%v24,%[asum],0" : [asum] "=Q"(asum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 9363ec32df..f2c115597c 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -95,9 +95,9 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vst %%v19,112(%%r1,%[y])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git 
a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index 5a46aec1c9..d91d9f367f 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -36,9 +36,9 @@ static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y), + : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y), [n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x) + : "m"(*(const FLOAT (*)[n * 2]) x) : "cc"); } diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index ac6e69c23f..6b71441015 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -93,9 +93,9 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { "vsteg %%v24,8(%[d]),1\n\t" "vsteg %%v25,16(%[d]),1\n\t" "vsteg %%v25,24(%[d]),0" - : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n) - : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y) + : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 13045a3591..2ef9b4de85 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -112,12 +112,12 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vst %%v1,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT 
x[8]; } *) x),[x] "a"(x) + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n * 2]) ap2),[ap2] "a"(ap2), + "m"(*(const FLOAT (*)[n * 2]) ap3),[ap3] "a"(ap3), + "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -172,10 +172,10 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vst %%v1,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x) + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); } @@ -210,9 +210,9 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { "vst %%v1,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), - "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x) + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21"); } @@ -261,8 +261,8 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "vst %%v31,48(%%r1,%[dest])\n\t" "agfi %%r1,64\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) 
- : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), + : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src), [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", diff --git a/kernel/zarch/zgemv_t_4.c b/kernel/zarch/zgemv_t_4.c index 031c31e29b..c10769266d 100644 --- a/kernel/zarch/zgemv_t_4.c +++ b/kernel/zarch/zgemv_t_4.c @@ -141,13 +141,13 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vst %%v27,16(%[y])\n\t" "vst %%v28,32(%[y])\n\t" "vst %%v29,48(%[y])" - : "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n * 2]) ap2),[ap2] "a"(ap2), + "m"(*(const FLOAT (*)[n * 2]) ap3),[ap3] "a"(ap3), + "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); @@ -229,11 +229,11 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v23,%%v19,%%v21,%%v23\n\t" "vst %%v22,0(%[y])\n\t" "vst %%v23,16(%[y])\n\t" - : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT 
x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap0),[ap0] "a"(ap0), + "m"(*(const FLOAT (*)[n * 2]) ap1),[ap1] "a"(ap1), + "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } @@ -294,10 +294,10 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vfmadb %%v0,%%v16,%%v18,%%v0\n\t" "vfmadb %%v0,%%v17,%%v19,%%v0\n\t" "vst %%v0,0(%[y])\n\t" - : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), + "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19"); } diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index 6284d5a474..3b87e356af 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -169,8 +169,8 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vst %%v23, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x), - "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : "+m"(*(FLOAT (*)[n * 2]) x), + "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index e497a6d7b9..a5a8f694d3 100644 --- a/kernel/zarch/zscal.c 
+++ b/kernel/zarch/zscal.c @@ -78,8 +78,8 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), [alpha] "a"(alpha) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", @@ -128,8 +128,8 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), [alpha] "a"(alpha) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); @@ -167,8 +167,8 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), [alpha] "a"(alpha) : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); @@ -190,7 +190,7 @@ static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { "vst %%v0,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) : [x] "a"(x) : "cc", "r1", "v0"); } diff --git a/kernel/zarch/zsum.c b/kernel/zarch/zsum.c index e0f978d87c..b35832af89 100644 --- a/kernel/zarch/zsum.c +++ b/kernel/zarch/zsum.c @@ -89,7 +89,7 @@ static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) { "vfadb 
%%v24,%%v24,%%v25\n\t" "vsteg %%v24,%[sum],0" : [sum] "=Q"(sum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index bc466866cb..7a2d1f8824 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -99,8 +99,8 @@ static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "vst %%v31, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x), - "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : "+m"(*(FLOAT (*)[n * 2]) x), + "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) : [x] "a"(x),[y] "a"(y) : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", From 095f4e6964ba150b1293747d842a60294836be45 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 1 Sep 2020 15:09:32 +0200 Subject: [PATCH 157/349] s390x: allow clang to emit fused multiply-adds (replicates gcc's default behavior) gcc's default setting for floating-point expression contraction is "fast", which allows the compiler to emit fused multiply adds instead of separate multiplies and adds (amongst others). Fused multiply-adds, which assembly kernels typically apply, also bring a significant performance advantage to the C implementation for matrix-matrix multiplication on s390x. To enable that performance advantage for builds with clang, add -ffp-contract=fast to the compiler options. 
Signed-off-by: Marius Hillenbrand --- Makefile.zarch | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile.zarch b/Makefile.zarch index be1e34f6dd..b841d9b4d5 100644 --- a/Makefile.zarch +++ b/Makefile.zarch @@ -8,3 +8,9 @@ ifeq ($(CORE), Z14) CCOMMON_OPT += -march=z14 -mzvector -O3 FCOMMON_OPT += -march=z14 -mzvector endif + +# Enable floating-point expression contraction for clang, since it is the +# default for gcc +ifeq ($(C_COMPILER), CLANG) +CCOMMON_OPT += -ffp-contract=fast +endif From 2ee5b899ce9777c63710de1ede75c362db5bcd47 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 1 Sep 2020 16:16:53 +0200 Subject: [PATCH 158/349] s390x: enable S/DGEMM block with explicit loop unrolling + interleaving with clang The code for SGEMM 16x4 and DGEMM 8x4 blocks on z14 and z15 uses explicit unrolling and interleaving to improve performance. The code employs an empty inline asm statement with operands that constrain the compiler's instruction scheduling and thereby enforce proper overlapping of load and compute phases. Fix an ifdef to apply that for clang builds, as well. Signed-off-by: Marius Hillenbrand --- kernel/zarch/gemm_vec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index b7d7cc04b7..ef0b1d1e31 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -393,7 +393,7 @@ static inline void GEBP_block_16_4( * Note that we need to massage this particular "barrier" * depending on the gcc version. 
*/ -#if __GNUC__ > 7 +#if __GNUC__ > 7 || defined(__clang__) #define BARRIER_READ_BEFORE_COMPUTE(SUFFIX) \ do { \ asm("" \ From 029fd01cfbcc0b18475faee8353585313c88a95b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Sep 2020 22:47:38 +0200 Subject: [PATCH 159/349] Detect AppleSilicon cpu on OSX --- cpuid_arm64.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 1fd43148a3..df1be85ba5 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -26,6 +26,11 @@ *****************************************************************************/ #include +#ifdef OS_DARWIN +#include +int32_t value; +size_t length=sizeof(value); +#endif #define CPU_UNKNOWN 0 #define CPU_ARMV8 1 @@ -45,6 +50,8 @@ #define CPU_TSV110 9 // Ampere #define CPU_EMAG8180 10 +// Apple +#define CPU_SILICON 11 static char *cpuname[] = { "UNKNOWN", @@ -59,7 +66,8 @@ static char *cpuname[] = { "TSV110", "EMAG8180", "NEOVERSEN1", - "THUNDERX3T110" + "THUNDERX3T110", + "SILICON" }; static char *cpuname_lower[] = { @@ -75,7 +83,8 @@ static char *cpuname_lower[] = { "tsv110", "emag8180", "neoversen1", - "thunderx3t110" + "thunderx3t110", + "silicon" }; int get_feature(char *search) @@ -198,6 +207,10 @@ int detect(void) } #else +#ifdef DARWIN + sysctlbyname("hw.cpufamily",&value,&length,NULL,0); + if (value ==131287967) return CPU_SILICON; +#endif return CPU_ARMV8; #endif @@ -247,7 +260,10 @@ int n=0; printf("#define NUM_CORES %d\n",n); #endif - +#ifdef DARWIN + sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); + printf("#define NUM_CORES %d\n",value); +#endif } @@ -398,6 +414,19 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; +#ifdef DARWIN + case CPU_SILICON: + printf("#define SILICON \n"); + sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); + printf("#define L1_CODE_SIZE %d \n",value); + 
sysctlbyname("hw.cachelinesize",&value,&length,NULL,0); + printf("#define L1_CODE_LINESIZE %d \n",value); + sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0); + printf("#define L1_DATA_SIZE %d \n",value); + sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); + printf("#define L2_DATA_SIZE %d \n",value); + break; +#endif } get_cpucount(); } From b37d17382a092905bb7c2a263ad0ca269e53f541 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Sep 2020 22:48:49 +0200 Subject: [PATCH 160/349] Add Apple Silicon --- TargetList.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/TargetList.txt b/TargetList.txt index 5934f30128..de907bdb34 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -98,6 +98,7 @@ THUNDERX THUNDERX2T99 TSV110 THUNDERX3T110 +SILICON 9.System Z: ZARCH_GENERIC From 4a4d1ca6e025de7c88b8d18794775c4114168359 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Sep 2020 22:52:12 +0200 Subject: [PATCH 161/349] Add AppleSIlicon cpu --- Makefile.arm64 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.arm64 b/Makefile.arm64 index 1091edfe55..78ba79aa0b 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -66,6 +66,11 @@ FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif endif +ifeq ($(CORE), SILICON) +CCOMMON_OPT += -march=armv8.3-a +FCOMMON_OPT += -march=armv8.3-a +endif + ifeq ($(GCCVERSIONGTEQ9), 1) ifeq ($(CORE), TSV110) CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 From 80794fe8fd2a877cd0387ffc64b21a786ae449f6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Sep 2020 22:56:58 +0200 Subject: [PATCH 162/349] Create KERNEL.SILICON --- kernel/arm64/KERNEL.SILICON | 1 + 1 file changed, 1 insertion(+) create mode 100644 kernel/arm64/KERNEL.SILICON diff --git a/kernel/arm64/KERNEL.SILICON b/kernel/arm64/KERNEL.SILICON new file mode 100644 index 0000000000..e3efef1f5f --- /dev/null +++ b/kernel/arm64/KERNEL.SILICON @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.ARMV8 From 0ce2aa3163fd2225c746cb5b8b1d82dc1a6fbceb Mon Sep 
17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Sep 2020 23:41:51 +0200 Subject: [PATCH 163/349] Fix data type of rwork array --- lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c index 91458136c9..c5eca535e9 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c @@ -47,8 +47,8 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, lapack_complex_float* cwork = NULL; lapack_complex_float cwork_query; lapack_int lrwork = -1; - double* rwork = NULL; - double rwork_query; + float* rwork = NULL; + float rwork_query; lapack_int i; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 ); @@ -84,7 +84,7 @@ lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; } - rwork = (double*)LAPACKE_malloc( sizeof(double) * lrwork ); + rwork = (float*)LAPACKE_malloc( sizeof(float) * lrwork ); if( rwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; From c31b72965ecf2b745c3a515f8abf889f9dd24473 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Sep 2020 23:44:44 +0200 Subject: [PATCH 164/349] Fix data type of work array in zgesvdq prototype --- lapack-netlib/LAPACKE/include/lapack.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 4f48b7c879..c045892df6 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -2513,7 +2513,7 @@ void LAPACK_zgesvdq( lapack_complex_double* U, lapack_int const* ldu, lapack_complex_double* V, lapack_int const* ldv, lapack_int* numrank, lapack_int* iwork, lapack_int const* liwork, - lapack_complex_float* cwork, lapack_int* lcwork, + 
lapack_complex_double* cwork, lapack_int* lcwork, double* rwork, lapack_int const* lrwork, lapack_int* info ); From 1b0f17eeed840d8e9642afd7d801259279d587cf Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Tue, 1 Sep 2020 15:41:48 +0800 Subject: [PATCH 165/349] align to 64, using SSE when input size is small --- kernel/x86_64/dasum.c | 140 ++++++++++------------- kernel/x86_64/dasum_microk_haswell-2.c | 91 +++++++++++---- kernel/x86_64/dasum_microk_skylakex-2.c | 79 ++++++++++--- kernel/x86_64/sasum.c | 146 +++++++++++------------- kernel/x86_64/sasum_microk_haswell-2.c | 88 ++++++++++---- kernel/x86_64/sasum_microk_skylakex-2.c | 72 +++++++++--- 6 files changed, 392 insertions(+), 224 deletions(-) diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c index 31313416b6..8a40ea4b9b 100644 --- a/kernel/x86_64/dasum.c +++ b/kernel/x86_64/dasum.c @@ -1,7 +1,8 @@ #include "common.h" -#include -#define ABS fabs +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? (a) : (-(a))) +#endif #if defined(SKYLAKEX) #include "dasum_microk_skylakex-2.c" @@ -9,88 +10,73 @@ #include "dasum_microk_haswell-2.c" #endif -#ifndef HAVE_KERNEL_16 -static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) +#ifndef HAVE_DASUM_KERNEL +static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) { - BLASLONG i=0; - FLOAT *x = x1; - FLOAT temp0, temp1, temp2, temp3; - FLOAT temp4, temp5, temp6, temp7; - FLOAT sum0 = 0.0; - FLOAT sum1 = 0.0; - FLOAT sum2 = 0.0; - FLOAT sum3 = 0.0; - - while ( i< n ) - { - - temp0 = ABS(x[0]); - temp1 = ABS(x[1]); - temp2 = ABS(x[2]); - temp3 = ABS(x[3]); - temp4 = ABS(x[4]); - temp5 = ABS(x[5]); - temp6 = ABS(x[6]); - temp7 = ABS(x[7]); - - sum0 += temp0; - sum1 += temp1; - sum2 += temp2; - sum3 += temp3; - - sum0 += temp4; - sum1 += temp5; - sum2 += temp6; - sum3 += temp7; - - x+=8; - i+=8; - - } - - return sum0+sum1+sum2+sum3; + BLASLONG i=0; + BLASLONG n_8 = n & -8; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT 
sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + FLOAT sum4 = 0.0; + + while (i < n_8) { + temp0 = ABS_K(x[0]); + temp1 = ABS_K(x[1]); + temp2 = ABS_K(x[2]); + temp3 = ABS_K(x[3]); + temp4 = ABS_K(x[4]); + temp5 = ABS_K(x[5]); + temp6 = ABS_K(x[6]); + temp7 = ABS_K(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=8; + } + + while (i < n) { + sum4 += ABS_K(x1[i]); + i++; + } + + return sum0+sum1+sum2+sum3+sum4; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; - FLOAT sumf = 0.0; - BLASLONG n1; - - if (n <= 0 || inc_x <= 0) return(sumf); - - if ( inc_x == 1 ) - { - - n1 = n & -16; - if ( n1 > 0 ) - { - - sumf = dasum_kernel_16(n1, x); - i=n1; - } - - while(i < n) - { - sumf += ABS(x[i]); - i++; - } - - } - else - { - - n *= inc_x; - while(i < n) - { - sumf += ABS(x[i]); - i += inc_x; - } - - } - return(sumf); + BLASLONG i=0; + FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) { + sumf = dasum_kernel(n, x); + } + else { + n *= inc_x; + + while(i < n) { + sumf += ABS_K(x[i]); + i += inc_x; + } + } + return(sumf); } diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c index 7639dfd041..4fc73ddd41 100644 --- a/kernel/x86_64/dasum_microk_haswell-2.c +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -1,35 +1,86 @@ #if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) -#define HAVE_KERNEL_16 1 +#define HAVE_DASUM_KERNEL #include +#include -static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? 
(a) : (-(a))) +#endif + +static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) { BLASLONG i = 0; - __m256d accum_0, accum_1, accum_2, accum_3; - - accum_0 = _mm256_setzero_pd(); - accum_1 = _mm256_setzero_pd(); - accum_2 = _mm256_setzero_pd(); - accum_3 = _mm256_setzero_pd(); - - __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); - for (; i < n; i += 16) { - accum_0 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 4]), abs_mask); - accum_2 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask); - accum_3 += (__m256d)_mm256_and_si256(_mm256_loadu_si256(&x1[i+12]), abs_mask); + FLOAT sumf = 0.0; + + if (n >= 256) { + BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 3) & 0x3; + + for (i = 0; i < align_256; i++) { + sumf += ABS_K(x1[i]); + } + + n -= align_256; + x1 += align_256; + } + + BLASLONG tail_index_SSE = n&(~7); + BLASLONG tail_index_AVX2 = n&(~255); + + if (n >= 256) { + __m256d accum_0, accum_1, accum_2, accum_3; + + accum_0 = _mm256_setzero_pd(); + accum_1 = _mm256_setzero_pd(); + accum_2 = _mm256_setzero_pd(); + accum_3 = _mm256_setzero_pd(); + + __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); + for (i = 0; i < tail_index_AVX2; i += 16) { + accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); + accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask); + accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); + accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask); + } + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + + __m128d half_accum0; + half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); + + half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); + + sumf += half_accum0[0]; } + + if (n >= 8) { + __m128d accum_20, accum_21, accum_22, accum_23; + accum_20 = _mm_setzero_pd(); + accum_21 = 
_mm_setzero_pd(); + accum_22 = _mm_setzero_pd(); + accum_23 = _mm_setzero_pd(); - accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); + for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + } - __m128d half_accum0; - half_accum0 = _mm_add_pd(_mm256_extractf128_pd(accum_0, 0), _mm256_extractf128_pd(accum_0, 1)); + accum_20 = accum_20 + accum_21 + accum_22 + accum_23; + __m128d half_accum20; + half_accum20 = _mm_hadd_pd(accum_20, accum_20); - half_accum0 = _mm_hadd_pd(half_accum0, half_accum0); + sumf += half_accum20[0]; + } + + for (i = tail_index_SSE; i < n; ++i) { + sumf += ABS_K(x1[i]); + } - return half_accum0[0]; + return sumf; } #endif diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c index 2c959b1ad6..aea8c02d9f 100644 --- a/kernel/x86_64/dasum_microk_skylakex-2.c +++ b/kernel/x86_64/dasum_microk_skylakex-2.c @@ -1,27 +1,80 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) -#if defined(__AVX512CD__) -#define HAVE_KERNEL_16 1 +#define HAVE_DASUM_KERNEL 1 #include -static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) +#include + +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? 
(a) : (-(a))) +#endif + +static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) { BLASLONG i = 0; + FLOAT sumf = 0.0; + + if (n >= 256) { + BLASLONG align_512 = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7; - __m512d accum_0, accum_1; + for (i = 0; i < align_512; i++) { + sumf += ABS_K(x1[i]); + } + + n -= align_512; + x1 += align_512; + } + + BLASLONG tail_index_SSE = n&(~7); + BLASLONG tail_index_AVX512 = n&(~255); - accum_0 = _mm512_setzero_pd(); - accum_1 = _mm512_setzero_pd(); + // + if ( n >= 256 ) { - for (; i < n; i += 16) { - accum_0 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 0])); - accum_1 += _mm512_abs_pd(_mm512_loadu_pd(&x1[i+ 8])); + __m512d accum_0, accum_1, accum_2, accum_3; + accum_0 = _mm512_setzero_pd(); + accum_1 = _mm512_setzero_pd(); + accum_2 = _mm512_setzero_pd(); + accum_3 = _mm512_setzero_pd(); + for (i = 0; i < tail_index_AVX512; i += 32) { + accum_0 += _mm512_abs_pd(_mm512_load_pd(&x1[i + 0])); + accum_1 += _mm512_abs_pd(_mm512_load_pd(&x1[i + 8])); + accum_2 += _mm512_abs_pd(_mm512_load_pd(&x1[i +16])); + accum_3 += _mm512_abs_pd(_mm512_load_pd(&x1[i +24])); + } + + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + sumf += _mm512_reduce_add_pd(accum_0); } - accum_0 += accum_1; - return _mm512_reduce_add_pd(accum_0); + if (n >= 8) { + __m128d accum_20, accum_21, accum_22, accum_23; + accum_20 = _mm_setzero_pd(); + accum_21 = _mm_setzero_pd(); + accum_22 = _mm_setzero_pd(); + accum_23 = _mm_setzero_pd(); + + __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); + for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { + accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); + accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); + accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); + } + + accum_20 = accum_20 + accum_21 + accum_22 + accum_23; + __m128d half_accum20; + half_accum20 = 
_mm_hadd_pd(accum_20, accum_20); + + sumf += half_accum20[0]; + } + + for (i = tail_index_SSE; i < n; ++i) { + sumf += ABS_K(x1[i]); + } + + return sumf; } #endif -#endif diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c index 6012555465..36ec4a7379 100644 --- a/kernel/x86_64/sasum.c +++ b/kernel/x86_64/sasum.c @@ -1,13 +1,11 @@ #include "common.h" -#include #if defined(DOUBLE) - #error supports float only - #else - -#define ABS fabsf +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? (a) : (-(a))) +#endif #endif @@ -17,88 +15,76 @@ #include "sasum_microk_haswell-2.c" #endif -#ifndef HAVE_KERNEL_32 +#ifndef HAVE_SASUM_KERNEL -static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1) +static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) { - BLASLONG i=0; - FLOAT *x = x1; - FLOAT temp0, temp1, temp2, temp3; - FLOAT temp4, temp5, temp6, temp7; - FLOAT sum0 = 0.0; - FLOAT sum1 = 0.0; - FLOAT sum2 = 0.0; - FLOAT sum3 = 0.0; - - while ( i< n ) - { - - temp0 = ABS(x[0]); - temp1 = ABS(x[1]); - temp2 = ABS(x[2]); - temp3 = ABS(x[3]); - temp4 = ABS(x[4]); - temp5 = ABS(x[5]); - temp6 = ABS(x[6]); - temp7 = ABS(x[7]); - - sum0 += temp0; - sum1 += temp1; - sum2 += temp2; - sum3 += temp3; - - sum0 += temp4; - sum1 += temp5; - sum2 += temp6; - sum3 += temp7; - - x+=8; - i+=8; - - } - - return sum0+sum1+sum2+sum3; + BLASLONG i=0; + BLASLONG n_8 = n & -8; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + FLOAT sum4 = 0.0; + + while (i < n_8) { + + temp0 = ABS_K(x[0]); + temp1 = ABS_K(x[1]); + temp2 = ABS_K(x[2]); + temp3 = ABS_K(x[3]); + temp4 = ABS_K(x[4]); + temp5 = ABS_K(x[5]); + temp6 = ABS_K(x[6]); + temp7 = ABS_K(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=8; + + } + + while (i < n) { + sum4 += ABS_K(x1[i]); + i++; + } + + return 
sum0+sum1+sum2+sum3+sum4; } #endif FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; - FLOAT sumf = 0.0; - BLASLONG n1; - - if (n <= 0 || inc_x <= 0) return(sumf); - - if ( inc_x == 1 ) - { - - n1 = n & -32; - if ( n1 > 0 ) - { - - sumf = sasum_kernel_32(n1, x); - i=n1; - } - - while(i < n) - { - sumf += ABS(x[i]); - i++; - } - - } - else - { - - n *= inc_x; - while(i < n) - { - sumf += ABS(x[i]); - i += inc_x; - } - - } - return(sumf); + BLASLONG i=0; + FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) { + sumf = sasum_kernel(n, x); + } + else { + + n *= inc_x; + while(i < n) { + sumf += ABS_K(x[i]); + i += inc_x; + } + + } + return(sumf); } diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c index b628729f50..8e6cb9a47e 100644 --- a/kernel/x86_64/sasum_microk_haswell-2.c +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -1,36 +1,82 @@ #if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) -#define HAVE_KERNEL_32 1 +#define HAVE_SASUM_KERNEL 1 #include +#include -static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1) +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? 
(a) : (-(a))) +#endif + +static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) { BLASLONG i = 0; - __m256 accum_0, accum_1, accum_2, accum_3; - - accum_0 = _mm256_setzero_ps(); - accum_1 = _mm256_setzero_ps(); - accum_2 = _mm256_setzero_ps(); - accum_3 = _mm256_setzero_ps(); - - __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); - for (; i < n; i += 32) { - accum_0 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 0]), abs_mask); - accum_1 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+ 8]), abs_mask); - accum_2 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+16]), abs_mask); - accum_3 += (__m256)_mm256_and_si256(_mm256_loadu_si256(&x1[i+24]), abs_mask); + FLOAT sumf = 0.0; + + if (n >= 256) { + BLASLONG align_256 = ((32 - ((uintptr_t)x1 & (uintptr_t)0x1f)) >> 2) & 0x7; + + for (i = 0; i < align_256; i++) { + sumf += ABS_K(x1[i]); + } + + n -= align_256; + x1 += align_256; } - accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + BLASLONG tail_index_SSE = n&(~7); + BLASLONG tail_index_AVX2 = n&(~255); + + if (n >= 256) { + __m256 accum_0, accum_1, accum_2, accum_3; + + accum_0 = _mm256_setzero_ps(); + accum_1 = _mm256_setzero_ps(); + accum_2 = _mm256_setzero_ps(); + accum_3 = _mm256_setzero_ps(); - __m128 half_accum0; - half_accum0 = _mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1)); + __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); + for (i = 0; i < tail_index_AVX2; i += 32) { + accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); + accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); + accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask); + accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask); + } - half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); - half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + __m128 half_accum0; + half_accum0 = 
_mm_add_ps(_mm256_extractf128_ps(accum_0, 0), _mm256_extractf128_ps(accum_0, 1)); - return half_accum0[0]; + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + half_accum0 = _mm_hadd_ps(half_accum0, half_accum0); + + sumf += half_accum0[0]; + + } + + if (n >= 8) { + __m128 accum_20, accum_21; + accum_20 = _mm_setzero_ps(); + accum_21 = _mm_setzero_ps(); + + __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); + for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + } + + accum_20 += accum_21; + accum_20 = _mm_hadd_ps(accum_20, accum_20); + accum_20 = _mm_hadd_ps(accum_20, accum_20); + + sumf += accum_20[0]; + } + + for (i = tail_index_SSE; i < n; ++i) { + sumf += ABS_K(x1[i]); + } + return sumf; } #endif diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c index b1c49fd097..c8c69d1e0e 100644 --- a/kernel/x86_64/sasum_microk_skylakex-2.c +++ b/kernel/x86_64/sasum_microk_skylakex-2.c @@ -1,27 +1,73 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) -#if defined(__AVX512CD__) -#define HAVE_KERNEL_32 1 +#define HAVE_SASUM_KERNEL 1 + +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? 
(a) : (-(a))) +#endif #include +#include -static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1) +static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) { BLASLONG i = 0; + FLOAT sumf = 0.0; + + if (n >= 256) { + BLASLONG align_512 = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf; + + for (i = 0; i < align_512; i++) { + sumf += ABS_K(x1[i]); + } + n -= align_512; + x1 += align_512; + } + + BLASLONG tail_index_SSE = n&(~7); + BLASLONG tail_index_AVX512 = n&(~255); - __m512 accum_0, accum_1; + if (n >= 256) { + __m512 accum_0, accum_1, accum_2, accum_3; + accum_0 = _mm512_setzero_ps(); + accum_1 = _mm512_setzero_ps(); + accum_2 = _mm512_setzero_ps(); + accum_3 = _mm512_setzero_ps(); - accum_0 = _mm512_setzero_ps(); - accum_1 = _mm512_setzero_ps(); + for (i = 0; i < tail_index_AVX512; i += 64) { + accum_0 += _mm512_abs_ps(_mm512_load_ps(&x1[i + 0])); + accum_1 += _mm512_abs_ps(_mm512_load_ps(&x1[i +16])); + accum_2 += _mm512_abs_ps(_mm512_load_ps(&x1[i +32])); + accum_3 += _mm512_abs_ps(_mm512_load_ps(&x1[i +48])); + } - for (; i < n; i += 32) { - accum_0 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 0])); - accum_1 += _mm512_abs_ps(_mm512_loadu_ps(&x1[i+ 16])); + accum_0 = accum_0 + accum_1 + accum_2 + accum_3; + sumf += _mm512_reduce_add_ps(accum_0); } - accum_0 += accum_1; - return _mm512_reduce_add_ps(accum_0); + if (n >= 8) { + __m128 accum_20, accum_21; + accum_20 = _mm_setzero_ps(); + accum_21 = _mm_setzero_ps(); + + __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); + for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { + accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); + accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); + } + + accum_20 += accum_21; + accum_20 = _mm_hadd_ps(accum_20, accum_20); + accum_20 = _mm_hadd_ps(accum_20, accum_20); + + sumf += accum_20[0]; + } + + for (i = tail_index_SSE; i < n; i++) { + sumf += ABS_K(x1[i]); + } + + return sumf; } #endif -#endif From 
17dca035de526d69d9639f16622e9aeda7cd7ffd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Sep 2020 08:38:08 +0200 Subject: [PATCH 166/349] rename SILICON to VORTEX --- Makefile.arm64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.arm64 b/Makefile.arm64 index 78ba79aa0b..62a877fff7 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -66,7 +66,7 @@ FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif endif -ifeq ($(CORE), SILICON) +ifeq ($(CORE), VORTEX) CCOMMON_OPT += -march=armv8.3-a FCOMMON_OPT += -march=armv8.3-a endif From ea3a58c8442c9327b43e5cd2109865782759e6e8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Sep 2020 08:38:53 +0200 Subject: [PATCH 167/349] Rename SILICON to VORTEX --- TargetList.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TargetList.txt b/TargetList.txt index de907bdb34..66eca45069 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -98,7 +98,7 @@ THUNDERX THUNDERX2T99 TSV110 THUNDERX3T110 -SILICON +VORTEX 9.System Z: ZARCH_GENERIC From af5bc955035ba1a590c3b8a72a403c414bee45ee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Sep 2020 08:43:26 +0200 Subject: [PATCH 168/349] Rename SILICON to VORTEX and fix duplicate numbering --- cpuid_arm64.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index df1be85ba5..a0d3e15b99 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -51,7 +51,7 @@ size_t length=sizeof(value); // Ampere #define CPU_EMAG8180 10 // Apple -#define CPU_SILICON 11 +#define CPU_VORTEX 13 static char *cpuname[] = { "UNKNOWN", @@ -67,7 +67,7 @@ static char *cpuname[] = { "EMAG8180", "NEOVERSEN1", "THUNDERX3T110", - "SILICON" + "VORTEX" }; static char *cpuname_lower[] = { @@ -84,7 +84,7 @@ static char *cpuname_lower[] = { "emag8180", "neoversen1", "thunderx3t110", - "silicon" + "vortex" }; int get_feature(char *search) @@ -209,7 +209,7 @@ int detect(void) #else #ifdef DARWIN 
sysctlbyname("hw.cpufamily",&value,&length,NULL,0); - if (value ==131287967) return CPU_SILICON; + if (value ==131287967) return CPU_VORTEX; #endif return CPU_ARMV8; #endif @@ -415,8 +415,8 @@ void get_cpuconfig(void) printf("#define DTB_SIZE 4096 \n"); break; #ifdef DARWIN - case CPU_SILICON: - printf("#define SILICON \n"); + case CPU_VORTEX: + printf("#define VORTEX \n"); sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); printf("#define L1_CODE_SIZE %d \n",value); sysctlbyname("hw.cachelinesize",&value,&length,NULL,0); From 775a87242d374e140fa784931a04bf01d4738e1f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Sep 2020 08:44:20 +0200 Subject: [PATCH 169/349] Rename KERNEL.SILICON to KERNEL.VORTEX --- kernel/arm64/{KERNEL.SILICON => KERNEL.VORTEX} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kernel/arm64/{KERNEL.SILICON => KERNEL.VORTEX} (100%) diff --git a/kernel/arm64/KERNEL.SILICON b/kernel/arm64/KERNEL.VORTEX similarity index 100% rename from kernel/arm64/KERNEL.SILICON rename to kernel/arm64/KERNEL.VORTEX From deaeb6c5b89f64bbe9d5ba0126690ae5d57ae0ce Mon Sep 17 00:00:00 2001 From: "Chen, Guobing" Date: Thu, 27 Aug 2020 06:42:28 +0800 Subject: [PATCH 170/349] Add bfloat16 based dot and conversion with single/double 1. Added bfloat16 based dot as new API: shdot 2. Implemented generic kernel and cooperlake-specific (AVX512-BF16) kernel for shdot 3. Added 4 conversion APIs for bfloat16 data type <=> single/double: shstobf16 shdtobf16 sbf16tos dbf16tod shstobf16 -- convert single float array to bfloat16 array shdtobf16 -- convert double float array to bfloat16 array sbf16tos -- convert bfloat16 array to single float array dbf16tod -- convert bfloat16 array to double float array 4. Implemented generic kernels for all 4 conversion APIs, and cooperlake-specific kernel for shstobf16 and shdtobf16 5. Update level1 thread facilitate functions and macros to support multi-threading for these new APIs 6. 
Fix Cooperlake platform detection/specify issue when under dynamic-arch building 7. Change the typedef of bfloat16 from unsigned short to more strict uint16_t Signed-off-by: Chen, Guobing --- Makefile.tail | 7 +- cblas.h | 11 ++ cmake/kernel.cmake | 4 +- common.h | 3 +- common_interface.h | 5 + common_level1.h | 6 + common_macro.h | 6 + common_param.h | 7 +- common_sh.h | 12 ++ common_thread.h | 19 ++- common_x86_64.h | 23 +++ driver/others/blas_l1_thread.c | 74 ++++++++-- driver/others/blas_server.c | 93 ++++++++---- driver/others/blas_server_omp.c | 71 +++++++-- driver/others/blas_server_win32.c | 69 +++++++-- driver/others/dynamic.c | 44 +++++- exports/gensymbol | 4 +- interface/Makefile | 38 ++++- interface/bf16dot.c | 52 +++++++ interface/bf16to.c | 62 ++++++++ interface/tobf16.c | 61 ++++++++ kernel/Makefile.L1 | 36 +++++ kernel/setparam-ref.c | 4 +- kernel/x86_64/KERNEL | 12 ++ kernel/x86_64/bf16to.c | 114 +++++++++++++++ kernel/x86_64/dtobf16_microk_cooperlake.c | 104 +++++++++++++ kernel/x86_64/shdot.c | 115 +++++++++++++++ kernel/x86_64/shdot_microk_cooperlake.c | 159 ++++++++++++++++++++ kernel/x86_64/stobf16_microk_cooperlake.c | 86 +++++++++++ kernel/x86_64/tobf16.c | 170 ++++++++++++++++++++++ openblas_config_template.h | 3 +- 31 files changed, 1392 insertions(+), 82 deletions(-) create mode 100644 interface/bf16dot.c create mode 100644 interface/bf16to.c create mode 100644 interface/tobf16.c create mode 100644 kernel/x86_64/bf16to.c create mode 100644 kernel/x86_64/dtobf16_microk_cooperlake.c create mode 100644 kernel/x86_64/shdot.c create mode 100644 kernel/x86_64/shdot_microk_cooperlake.c create mode 100644 kernel/x86_64/stobf16_microk_cooperlake.c create mode 100644 kernel/x86_64/tobf16.c diff --git a/Makefile.tail b/Makefile.tail index 39902982b6..cfc4a36fca 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -5,13 +5,14 @@ QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) CBLASOBJS_P = $(CBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) ZBLASOBJS_P = 
$(ZBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) XBLASOBJS_P = $(XBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) +SHEXTOBJS_P = $(SHEXTOBJS:.$(SUFFIX)=.$(PSUFFIX)) COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) -BLASOBJS = $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -BLASOBJS_P = $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) +BLASOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +BLASOBJS_P = $(SHEXTOBJS_P) $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) ifdef EXPRECISION BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) @@ -30,6 +31,7 @@ $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX +$(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX $(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) @@ -38,6 +40,7 @@ $(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(CBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(ZBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(XBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(SHEXTOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) libs :: $(BLASOBJS) $(COMMONOBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ diff --git a/cblas.h b/cblas.h index 4bc5588d87..21f3958f24 100644 --- a/cblas.h +++ b/cblas.h @@ -382,6 +382,17 @@ void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, double *c, OPENBLAS_CONST blasint 
cldc); +/*** BFLOAT16 and INT8 extensions ***/ +/* convert float array to BFLOAT16 array by rounding */ +void cblas_shstobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout); +/* convert double array to BFLOAT16 array by rounding */ +void cblas_shdtobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout); +/* convert BFLOAT16 array to float array */ +void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, float *out, OPENBLAS_CONST blasint incout); +/* convert BFLOAT16 array to double array */ +void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout); +/* dot production of BFLOAT16 input arrays, and output as float */ +float cblas_shdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); #ifdef __cplusplus } diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 4b505a1028..79eeaae6fe 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -126,12 +126,14 @@ if (BUILD_HALF) set(SHAXPYKERNEL ../arm/axpy.c) set(SHAXPBYKERNEL ../arm/axpby.c) set(SHCOPYKERNEL ../arm/copy.c) - set(SHDOTKERNEL ../arm/dot.c) + set(SHDOTKERNEL ../x86_64/shdot.c) set(SHROTKERNEL ../arm/rot.c) set(SHSCALKERNEL ../arm/scal.c) set(SHNRM2KERNEL ../arm/nrm2.c) set(SHSUMKERNEL ../arm/sum.c) set(SHSWAPKERNEL ../arm/swap.c) + set(TOBF16KERNEL ../x86_64/tobf16.c) + set(BF16TOKERNEL ../x86_64/bf16to.c) endif () endmacro () diff --git a/common.h b/common.h index d6637abe49..adc162536b 100644 --- a/common.h +++ b/common.h @@ -258,7 +258,8 @@ typedef unsigned long BLASULONG; #endif #ifndef BFLOAT16 -typedef unsigned short bfloat16; +#include +typedef uint16_t bfloat16; #define HALFCONVERSION 1 #endif diff --git 
a/common_interface.h b/common_interface.h index 78f5be6b0d..35a957aa17 100644 --- a/common_interface.h +++ b/common_interface.h @@ -54,6 +54,11 @@ double BLASFUNC(dsdot) (blasint *, float *, blasint *, float *, blasint *); double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *); xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *); +float BLASFUNC(shdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *); +void BLASFUNC(shstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *); +void BLASFUNC(shdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *); +void BLASFUNC(sbf16tos) (blasint *, bfloat16 *, blasint *, float *, blasint *); +void BLASFUNC(dbf16tod) (blasint *, bfloat16 *, blasint *, double *, blasint *); #ifdef RETURN_BY_STRUCT typedef struct { diff --git a/common_level1.h b/common_level1.h index 74cafb6dbb..88aa275a58 100644 --- a/common_level1.h +++ b/common_level1.h @@ -46,6 +46,12 @@ float sdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); +float shdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); + +void shstobf16_k(BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); +void shdtobf16_k(BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); +void sbf16tos_k (BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); +void dbf16tod_k (BLASLONG, bfloat16 *, BLASLONG, double *, BLASLONG); openblas_complex_float cdotc_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float cdotu_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 8fe1f156fa..3d6bcd9e84 100644 --- a/common_macro.h +++ b/common_macro.h @@ -646,6 +646,11 @@ #elif defined(HALF) +#define D_TO_BF16_K SHDTOBF16_K +#define D_BF16_TO_K DBF16TOD_K +#define 
S_TO_BF16_K SHSTOBF16_K +#define S_BF16_TO_K SBF16TOS_K + #define AMAX_K SAMAX_K #define AMIN_K SAMIN_K #define MAX_K SMAX_K @@ -657,6 +662,7 @@ #define ASUM_K SASUM_K #define DOTU_K SDOTU_K #define DOTC_K SDOTC_K +#define BF16_DOT_K SHDOT_K #define AXPYU_K SAXPYU_K #define AXPYC_K SAXPYC_K #define AXPBY_K SAXPBY_K diff --git a/common_param.h b/common_param.h index 0437482dc6..a52de98ab8 100644 --- a/common_param.h +++ b/common_param.h @@ -51,6 +51,11 @@ typedef struct { int shgemm_p, shgemm_q, shgemm_r; int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; + void (*shstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); + void (*shdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); + void (*sbf16tos_k) (BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); + void (*dbf16tod_k) (BLASLONG, bfloat16 *, BLASLONG, double *, BLASLONG); + float (*shamax_k) (BLASLONG, float *, BLASLONG); float (*shamin_k) (BLASLONG, float *, BLASLONG); float (*shmax_k) (BLASLONG, float *, BLASLONG); @@ -64,7 +69,7 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); float (*shasum_k) (BLASLONG, float *, BLASLONG); float (*shsum_k) (BLASLONG, float *, BLASLONG); int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - float (*shdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float (*shdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); diff --git a/common_sh.h b/common_sh.h index 7a0045762a..5dc99b3bde 100644 --- a/common_sh.h +++ b/common_sh.h @@ -3,6 +3,12 @@ #ifndef DYNAMIC_ARCH +#define SHDOT_K shdot_k +#define SHSTOBF16_K shstobf16_k +#define SHDTOBF16_K shdtobf16_k +#define SBF16TOS_K sbf16tos_k +#define DBF16TOD_K dbf16tod_k + #define SHGEMM_ONCOPY shgemm_oncopy #define SHGEMM_OTCOPY shgemm_otcopy @@ -18,6 +24,12 @@ #else +#define SHDOT_K gotoblas -> shdot_k +#define SHSTOBF16_K 
gotoblas -> shstobf16_k +#define SHDTOBF16_K gotoblas -> shdtobf16_k +#define SBF16TOS_K gotoblas -> sbf16tos_k +#define DBF16TOD_K gotoblas -> dbf16tod_k + #define SHGEMM_ONCOPY gotoblas -> shgemm_oncopy #define SHGEMM_OTCOPY gotoblas -> shgemm_otcopy #define SHGEMM_INCOPY gotoblas -> shgemm_incopy diff --git a/common_thread.h b/common_thread.h index ec0c65b220..a18df0d78b 100644 --- a/common_thread.h +++ b/common_thread.h @@ -59,12 +59,19 @@ extern int blas_omp_linked; #define BLAS_PTHREAD 0x4000U #define BLAS_NODE 0x2000U -#define BLAS_PREC 0x0003U -#define BLAS_SINGLE 0x0000U -#define BLAS_DOUBLE 0x0001U -#define BLAS_XDOUBLE 0x0002U -#define BLAS_REAL 0x0000U -#define BLAS_COMPLEX 0x0004U +#define BLAS_PREC 0x000FU +#define BLAS_INT8 0x0000U +#define BLAS_BFLOAT16 0x0001U +#define BLAS_SINGLE 0x0002U +#define BLAS_DOUBLE 0x0003U +#define BLAS_XDOUBLE 0x0004U +#define BLAS_STOBF16 0x0008U +#define BLAS_DTOBF16 0x0009U +#define BLAS_BF16TOS 0x000AU +#define BLAS_BF16TOD 0x000BU + +#define BLAS_REAL 0x0000U +#define BLAS_COMPLEX 0x1000U #define BLAS_TRANSA 0x0030U /* 2bit */ #define BLAS_TRANSA_N 0x0000U diff --git a/common_x86_64.h b/common_x86_64.h index bee7e8cdbd..b813336c6b 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -142,6 +142,29 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ #endif } +static __inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, int *edx) +{ +#ifdef C_MSVC + int cpuInfo[4] = {-1}; + __cpuidex(cpuInfo, op, count); + *eax = cpuInfo[0]; + *ebx = cpuInfo[1]; + *ecx = cpuInfo[2]; + *edx = cpuInfo[3]; +#else +#if defined(__i386__) && defined(__PIC__) + __asm__ __volatile__ + ("mov %%ebx, %%edi;" + "cpuid;" + "xchgl %%ebx, %%edi;" + : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (op), "2" (count) : "cc"); +#else + __asm__ __volatile__ + ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (op), "2" (count) : "cc"); +#endif +#endif +} + /* #define WHEREAMI */ 
diff --git a/driver/others/blas_l1_thread.c b/driver/others/blas_l1_thread.c index e405c74650..04acbcc5f4 100644 --- a/driver/others/blas_l1_thread.c +++ b/driver/others/blas_l1_thread.c @@ -49,9 +49,36 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha blas_arg_t args [MAX_CPU_NUMBER]; BLASLONG i, width, astride, bstride; - int num_cpu, calc_type; - - calc_type = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2; + int num_cpu, calc_type_a, calc_type_b; + + switch (mode & BLAS_PREC) { + case BLAS_INT8 : + case BLAS_BFLOAT16: + case BLAS_SINGLE : + case BLAS_DOUBLE : + case BLAS_XDOUBLE : + calc_type_a = calc_type_b = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0); + break; + case BLAS_STOBF16 : + calc_type_a = 2 + ((mode & BLAS_COMPLEX) != 0); + calc_type_b = 1 + ((mode & BLAS_COMPLEX) != 0); + break; + case BLAS_DTOBF16 : + calc_type_a = 3 + ((mode & BLAS_COMPLEX) != 0); + calc_type_b = 1 + ((mode & BLAS_COMPLEX) != 0); + break; + case BLAS_BF16TOS : + calc_type_a = 1 + ((mode & BLAS_COMPLEX) != 0); + calc_type_b = 2 + ((mode & BLAS_COMPLEX) != 0); + break; + case BLAS_BF16TOD : + calc_type_a = 1 + ((mode & BLAS_COMPLEX) != 0); + calc_type_b = 3 + ((mode & BLAS_COMPLEX) != 0); + break; + default: + calc_type_a = calc_type_b = 0; + break; + } mode |= BLAS_LEGACY; @@ -77,8 +104,8 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha bstride = width; } - astride <<= calc_type; - bstride <<= calc_type; + astride <<= calc_type_a; + bstride <<= calc_type_b; args[num_cpu].m = width; args[num_cpu].n = n; @@ -120,9 +147,36 @@ int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASL blas_arg_t args [MAX_CPU_NUMBER]; BLASLONG i, width, astride, bstride; - int num_cpu, calc_type; - - calc_type = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2; + int num_cpu, calc_type_a, calc_type_b; + + switch (mode & BLAS_PREC) { + case BLAS_INT8 : + case BLAS_BFLOAT16: + case BLAS_SINGLE : 
+ case BLAS_DOUBLE : + case BLAS_XDOUBLE : + calc_type_a = calc_type_b = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0); + break; + case BLAS_STOBF16 : + calc_type_a = 2 + ((mode & BLAS_COMPLEX) != 0); + calc_type_b = 1 + ((mode & BLAS_COMPLEX) != 0); + break; + case BLAS_DTOBF16 : + calc_type_a = 3 + ((mode & BLAS_COMPLEX) != 0); + calc_type_b = 1 + ((mode & BLAS_COMPLEX) != 0); + break; + case BLAS_BF16TOS : + calc_type_a = 1 + ((mode & BLAS_COMPLEX) != 0); + calc_type_b = 2 + ((mode & BLAS_COMPLEX) != 0); + break; + case BLAS_BF16TOD : + calc_type_a = 1 + ((mode & BLAS_COMPLEX) != 0); + calc_type_b = 3 + ((mode & BLAS_COMPLEX) != 0); + break; + default: + calc_type_a = calc_type_b = 0; + break; + } mode |= BLAS_LEGACY; @@ -148,8 +202,8 @@ int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASL bstride = width; } - astride <<= calc_type; - bstride <<= calc_type; + astride <<= calc_type_a; + bstride <<= calc_type_b; args[num_cpu].m = width; args[num_cpu].n = n; diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 756e51b5dc..8d3dda3bf6 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -192,7 +192,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)){ #ifdef EXPRECISION - if (mode & BLAS_XDOUBLE){ + if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ /* REAL / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, @@ -205,7 +205,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if (mode & BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE){ /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, @@ -216,21 +216,58 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args 
-> ldc, sb); - } else { - /* REAL / Single */ - void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, - float *, BLASLONG, float *, BLASLONG, - float *, BLASLONG, void *) = func; - - afunc(args -> m, args -> n, args -> k, - ((float *)args -> alpha)[0], - args -> a, args -> lda, - args -> b, args -> ldb, - args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) == BLAS_SINGLE){ + /* REAL / Single */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, float *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); +#ifdef BUILD_HALF + } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ + /* REAL / BFLOAT16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, + bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, + bfloat16 *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((bfloat16 *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) == BLAS_STOBF16){ + /* REAL / BLAS_STOBF16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, bfloat16 *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) == BLAS_DTOBF16){ + /* REAL / BLAS_DTOBF16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, bfloat16 *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); +#endif + } else { + /* REAL / Other types in future */ } } else { #ifdef EXPRECISION - if (mode & BLAS_XDOUBLE){ + if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ /* COMPLEX / 
Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, @@ -244,7 +281,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if (mode & BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE) { /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, @@ -256,7 +293,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else { + } else if ((mode & BLAS_PREC) == BLAS_SINGLE) { /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, @@ -268,7 +305,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } + } else { + /* COMPLEX / Other types in future */ + } } } @@ -414,33 +453,37 @@ blas_queue_t *tscq; if (sb == NULL) { if (!(queue -> mode & BLAS_COMPLEX)){ #ifdef EXPRECISION - if (queue -> mode & BLAS_XDOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif - if (queue -> mode & BLAS_DOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else { + } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } + } else { + /* Other types in future */ + } } else { #ifdef EXPRECISION - if (queue -> mode & BLAS_XDOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * 
sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif - if (queue -> mode & BLAS_DOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else { + } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } + } else { + /* Other types in future */ + } } queue->sb=sb; } diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index d9969b5998..d126955e4d 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -142,7 +142,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)){ #ifdef EXPRECISION - if (mode & BLAS_XDOUBLE){ + if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ /* REAL / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, @@ -155,7 +155,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if (mode & BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE){ /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, @@ -166,7 +166,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else { + } else if ((mode & BLAS_PREC) == BLAS_SINGLE){ /* REAL / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, @@ -177,10 +177,47 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); +#ifdef BUILD_HALF + } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ + /* REAL / BFLOAT16 */ + 
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, + bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, + bfloat16 *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((bfloat16 *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) == BLAS_STOBF16){ + /* REAL / BLAS_STOBF16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, bfloat16 *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) == BLAS_DTOBF16){ + /* REAL / BLAS_DTOBF16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, bfloat16 *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); +#endif + } else { + /* REAL / Other types in future */ } } else { #ifdef EXPRECISION - if (mode & BLAS_XDOUBLE){ + if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, @@ -194,7 +231,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if (mode & BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE){ /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, @@ -206,7 +243,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else { + } else if ((mode & BLAS_PREC) == BLAS_SINGLE){ /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, 
float *, BLASLONG, @@ -218,8 +255,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } - } + } else { + /* COMPLEX / Other types in future */ + } + } } static void exec_threads(blas_queue_t *queue, int buf_index){ @@ -255,32 +294,36 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ if (sb == NULL) { if (!(queue -> mode & BLAS_COMPLEX)){ #ifdef EXPRECISION - if (queue -> mode & BLAS_XDOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif - if (queue -> mode & BLAS_DOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else { + } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE){ sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else { + /* Other types in future */ } } else { #ifdef EXPRECISION - if (queue -> mode & BLAS_XDOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif - if (queue -> mode & BLAS_DOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else { + } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else { + /* Other types in future */ } } queue->sb=sb; diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 5ecc4428b7..d2cc917570 100644 --- a/driver/others/blas_server_win32.c +++ 
b/driver/others/blas_server_win32.c @@ -77,7 +77,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)){ #ifdef EXPRECISION - if (mode & BLAS_XDOUBLE){ + if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ /* REAL / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, @@ -90,7 +90,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if (mode & BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE){ /* REAL / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, @@ -101,7 +101,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else { + } else if ((mode & BLAS_PREC) == BLAS_SINGLE){ /* REAL / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, @@ -112,10 +112,47 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); +#ifdef BUILD_HALF + } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ + /* REAL / BFLOAT16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, + bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, + bfloat16 *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((bfloat16 *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) == BLAS_STOBF16){ + /* REAL / BLAS_STOBF16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, + float *, BLASLONG, bfloat16 *, BLASLONG, + float *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((float *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); + } else if ((mode & BLAS_PREC) 
== BLAS_DTOBF16){ + /* REAL / BLAS_DTOBF16 */ + void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, + double *, BLASLONG, bfloat16 *, BLASLONG, + double *, BLASLONG, void *) = func; + + afunc(args -> m, args -> n, args -> k, + ((double *)args -> alpha)[0], + args -> a, args -> lda, + args -> b, args -> ldb, + args -> c, args -> ldc, sb); +#endif + } else { + /* REAL / Other types in future */ } } else { #ifdef EXPRECISION - if (mode & BLAS_XDOUBLE){ + if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ /* COMPLEX / Extended Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, @@ -129,7 +166,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> c, args -> ldc, sb); } else #endif - if (mode & BLAS_DOUBLE){ + if ((mode & BLAS_PREC) == BLAS_DOUBLE){ /* COMPLEX / Double */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, @@ -141,7 +178,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } else { + } else if ((mode & BLAS_PREC) == BLAS_SINGLE) { /* COMPLEX / Single */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, @@ -153,7 +190,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); - } + } else { + /* COMPLEX / Other types in future */ + } } } @@ -233,32 +272,36 @@ static DWORD WINAPI blas_thread_server(void *arg){ if (sb == NULL) { if (!(queue -> mode & BLAS_COMPLEX)){ #ifdef EXPRECISION - if (queue -> mode & BLAS_XDOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif - if (queue -> mode & BLAS_DOUBLE){ + if ((queue -> mode & BLAS_PREC) == 
BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else { + } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else { + /* Other types in future */ } } else { #ifdef EXPRECISION - if (queue -> mode & BLAS_XDOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } else #endif - if (queue -> mode & BLAS_DOUBLE){ + if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else { + } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + } else { + /* Other types in future */ } } queue->sb=sb; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 5d71b1b2c2..21d2c79489 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -207,6 +207,19 @@ extern gotoblas_t gotoblas_SKYLAKEX; #else #define gotoblas_SKYLAKEX gotoblas_PRESCOTT #endif +#ifdef DYN_COOPERLAKE +extern gotoblas_t gotoblas_COOPERLAKE; +#elif defined(DYN_SKYLAKEX) +#define gotoblas_COOPERLAKE gotoblas_SKYLAKEX +#elif defined(DYN_HASWELL) +#define gotoblas_COOPERLAKE gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_COOPERLAKE gotoblas_NEHALEM +#else +#define gotoblas_COOPERLAKE gotoblas_PRESCOTT +#endif #else // not DYNAMIC_LIST @@ -247,14 +260,17 @@ extern gotoblas_t gotoblas_EXCAVATOR; #ifdef NO_AVX2 #define gotoblas_HASWELL gotoblas_SANDYBRIDGE #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE +#define gotoblas_COOPERLAKE 
gotoblas_SANDYBRIDGE #define gotoblas_ZEN gotoblas_SANDYBRIDGE #else extern gotoblas_t gotoblas_HASWELL; extern gotoblas_t gotoblas_ZEN; #ifndef NO_AVX512 extern gotoblas_t gotoblas_SKYLAKEX; +extern gotoblas_t gotoblas_COOPERLAKE; #else #define gotoblas_SKYLAKEX gotoblas_HASWELL +#define gotoblas_COOPERLAKE gotoblas_HASWELL #endif #endif #else @@ -262,6 +278,7 @@ extern gotoblas_t gotoblas_SKYLAKEX; #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM #define gotoblas_HASWELL gotoblas_NEHALEM #define gotoblas_SKYLAKEX gotoblas_NEHALEM +#define gotoblas_COOPERLAKE gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_STEAMROLLER gotoblas_BARCELONA @@ -343,6 +360,23 @@ int support_avx512(){ #endif } +int support_avx512_bf16(){ +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx512()) + return 0; + cpuid_count(7, 1, &eax, &ebx, &ecx, &edx); + if((eax & 32) == 32){ + ret=1; // CPUID.7.1:EAX[bit 5] indicates whether avx512_bf16 supported or not + } + return ret; +#else + return 0; +#endif +} + extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" @@ -524,7 +558,10 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
} } - if (model == 5) { + if (model == 5) { + // Intel Cooperlake + if(support_avx512_bf16()) + return &gotoblas_COOPERLAKE; // Intel Skylake X if (support_avx512()) return &gotoblas_SKYLAKEX; @@ -774,7 +811,8 @@ static char *corename[] = { "Steamroller", "Excavator", "Zen", - "SkylakeX" + "SkylakeX", + "Cooperlake" }; char *gotoblas_corename(void) { @@ -838,6 +876,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; if (gotoblas == &gotoblas_ZEN) return corename[23]; if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; + if (gotoblas == &gotoblas_COOPERLAKE) return corename[25]; return corename[0]; } @@ -868,6 +907,7 @@ static gotoblas_t *force_coretype(char *coretype){ switch (found) { + case 25: return (&gotoblas_COOPERLAKE); case 24: return (&gotoblas_SKYLAKEX); case 23: return (&gotoblas_ZEN); case 22: return (&gotoblas_EXCAVATOR); diff --git a/exports/gensymbol b/exports/gensymbol index 73b4be248b..ce4d9bb64a 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -46,7 +46,7 @@ ssum, dsum, scsum, dzsum ); -@halfblasobjs = (shgemm); +@halfblasobjs = (shgemm, shdot, shstobf16, shdtobf16, sbf16tos, dbf16tod); @cblasobjs = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -84,7 +84,7 @@ cblas_xerbla ); -@halfcblasobjs = (cblas_shgemm); +@halfcblasobjs = (cblas_shgemm, cblas_shdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, diff --git a/interface/Makefile b/interface/Makefile index 2dbd600731..fde6227bc4 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -47,7 +47,9 @@ SBLAS3OBJS = \ sgeadd.$(SUFFIX) ifeq ($(BUILD_HALF),1) +SHBLAS1OBJS = shdot.$(SUFFIX) SHBLAS3OBJS = shgemm.$(SUFFIX) +SHEXTOBJS = shstobf16.$(SUFFIX) shdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif 
DBLAS1OBJS = \ @@ -281,7 +283,9 @@ CSBLAS3OBJS = \ cblas_sgeadd.$(SUFFIX) ifeq ($(BUILD_HALF),1) +CSHBLAS1OBJS = cblas_shdot.$(SUFFIX) CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) +CSHEXTOBJS = cblas_shstobf16.$(SUFFIX) cblas_shdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif CDBLAS1OBJS = \ @@ -374,6 +378,7 @@ override CFLAGS += -I. SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS3OBJS += $(CSBLAS3OBJS) +SHBLAS1OBJS += $(CSHBLAS1OBJS) SHBLAS3OBJS += $(CSHBLAS3OBJS) DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS2OBJS += $(CDBLAS2OBJS) @@ -385,10 +390,11 @@ ZBLAS1OBJS += $(CZBLAS1OBJS) ZBLAS2OBJS += $(CZBLAS2OBJS) ZBLAS3OBJS += $(CZBLAS3OBJS) +SHEXTOBJS += $(CSHEXTOBJS) endif SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) -SHBLASOBJS = $(SHBLAS3OBJS) +SHBLASOBJS = $(SHBLAS1OBJS) $(SHBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) @@ -463,7 +469,7 @@ ZBLASOBJS += $(ZLAPACKOBJS) endif -FUNCOBJS = $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +FUNCOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) ifdef EXPRECISION FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) @@ -491,7 +497,7 @@ endif clean :: @rm -f functable.h -level1 : $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) +level1 : $(BEXTOBJS) $(SHBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) @@ -725,6 +731,19 @@ sdsdot.$(SUFFIX) sdsdot.$(PSUFFIX) : sdsdot.c dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c $(CC) $(CFLAGS) -c $< -o $(@F) +ifeq ($(BUILD_HALF),1) +shdot.$(SUFFIX) shdot.$(PSUFFIX) : bf16dot.c + $(CC) $(CFLAGS) -c $< -o $(@F) +shstobf16.$(SUFFIX) shstobf16.$(PSUFFIX) : tobf16.c + $(CC) $(CFLAGS) 
-DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) +shdtobf16.$(SUFFIX) shdtobf16.$(PSUFFIX) : tobf16.c + $(CC) $(CFLAGS) -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) +sbf16tos.$(SUFFIX) sbf16tos.$(PSUFFIX) : bf16to.c + $(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) +dbf16tod.$(SUFFIX) dbf16tod.$(PSUFFIX) : bf16to.c + $(CC) $(CFLAGS) -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) +endif + sdot.$(SUFFIX) sdot.$(PSUFFIX) : dot.c $(CC) $(CFLAGS) -c $< -o $(@F) @@ -1463,6 +1482,19 @@ cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +ifeq ($(BUILD_HALF),1) +cblas_shdot.$(SUFFIX) cblas_shdot.$(PSUFFIX) : bf16dot.c + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) +cblas_shstobf16.$(SUFFIX) cblas_shstobf16.$(PSUFFIX) : tobf16.c + $(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) +cblas_shdtobf16.$(SUFFIX) cblas_shdtobf16.$(PSUFFIX) : tobf16.c + $(CC) $(CFLAGS) -DCBLAS -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) +cblas_sbf16tos.$(SUFFIX) cblas_sbf16tos.$(PSUFFIX) : bf16to.c + $(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) +cblas_dbf16tod.$(SUFFIX) cblas_dbf16tod.$(PSUFFIX) : bf16to.c + $(CC) $(CFLAGS) -DCBLAS -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) +endif + cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) diff --git a/interface/bf16dot.c b/interface/bf16dot.c new file mode 100644 index 0000000000..33717e3748 --- /dev/null +++ b/interface/bf16dot.c @@ -0,0 +1,52 @@ +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#ifndef CBLAS +float NAME(blasint *N, bfloat16 *x, blasint *INCX, bfloat16 *y, blasint *INCY){ + BLASLONG n = *N; + BLASLONG incx = *INCX; + BLASLONG incy = *INCY; + float ret; + PRINT_DEBUG_NAME; + + if (n <= 0) return 0.; + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + ret = 
BF16_DOT_K(n, x, incx, y, incy); + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + IDEBUG_END; + + return ret; + } + +#else + +float CNAME(blasint n, bfloat16 *x, blasint incx, bfloat16 *y, blasint incy){ + + float ret; + PRINT_DEBUG_CNAME; + + if (n <= 0) return 0.; + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (incx < 0) x -= (n - 1) * incx; + if (incy < 0) y -= (n - 1) * incy; + ret = BF16_DOT_K(n, x, incx, y, incy); + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + IDEBUG_END; + + return ret; +} + +#endif diff --git a/interface/bf16to.c b/interface/bf16to.c new file mode 100644 index 0000000000..036c0b142f --- /dev/null +++ b/interface/bf16to.c @@ -0,0 +1,62 @@ +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#if defined(DOUBLE_PREC) +#define FLOAT_TYPE double +#elif defined(SINGLE_PREC) +#define FLOAT_TYPE float +#else +#endif + +#ifndef CBLAS +void NAME(blasint *N, bfloat16 *in, blasint *INC_IN, FLOAT_TYPE *out, blasint *INC_OUT){ + BLASLONG n = *N; + BLASLONG inc_in = *INC_IN; + BLASLONG inc_out = *INC_OUT; + + PRINT_DEBUG_NAME; + + if (n <= 0) return; + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (inc_in < 0) in -= (n - 1) * inc_in; + if (inc_out < 0) out -= (n - 1) * inc_out; + +#if defined(DOUBLE_PREC) + D_BF16_TO_K(n, in, inc_in, out, inc_out); +#elif defined(SINGLE_PREC) + S_BF16_TO_K(n, in, inc_in, out, inc_out); +#else +#endif + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + IDEBUG_END; +} +#else +void CNAME(blasint n, bfloat16 * in, blasint inc_in, FLOAT_TYPE * out, blasint inc_out){ + PRINT_DEBUG_CNAME; + + if (n <= 0) return; + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (inc_in < 0) in -= (n - 1) * inc_in; + if (inc_out < 0) out -= (n - 1) * inc_out; + +#if defined(DOUBLE_PREC) + D_BF16_TO_K(n, in, inc_in, out, inc_out); +#elif defined(SINGLE_PREC) + S_BF16_TO_K(n, in, inc_in, out, inc_out); +#else +#endif + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + IDEBUG_END; +} +#endif diff --git 
a/interface/tobf16.c b/interface/tobf16.c new file mode 100644 index 0000000000..787d9d689e --- /dev/null +++ b/interface/tobf16.c @@ -0,0 +1,61 @@ +#include +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#if defined(DOUBLE_PREC) +#define FLOAT_TYPE double +#elif defined(SINGLE_PREC) +#define FLOAT_TYPE float +#else +#endif + +#ifndef CBLAS +void NAME(blasint *N, FLOAT_TYPE *in, blasint *INC_IN, bfloat16 *out, blasint *INC_OUT){ + BLASLONG n = *N; + BLASLONG inc_in = *INC_IN; + BLASLONG inc_out = *INC_OUT; + + PRINT_DEBUG_NAME; + + if (n <= 0) return; + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (inc_in < 0) in -= (n - 1) * inc_in; + if (inc_out < 0) out -= (n - 1) * inc_out; + +#if defined(DOUBLE_PREC) + D_TO_BF16_K(n, in, inc_in, out, inc_out); +#elif defined(SINGLE_PREC) + S_TO_BF16_K(n, in, inc_in, out, inc_out); +#else +#endif + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + IDEBUG_END; +} +#else +void CNAME(blasint n, FLOAT_TYPE *in, blasint inc_in, bfloat16 *out, blasint inc_out){ + PRINT_DEBUG_CNAME; + + if (n <= 0) return; + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (inc_in < 0) in -= (n - 1) * inc_in; + if (inc_out < 0) out -= (n - 1) * inc_out; + +#if defined(DOUBLE_PREC) + D_TO_BF16_K(n, in, inc_in, out, inc_out); +#elif defined(SINGLE_PREC) + S_TO_BF16_K(n, in, inc_in, out, inc_out); +#endif + + FUNCTION_PROFILE_END(1, 2 * n, 2 * n); + IDEBUG_END; +} +#endif diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 970703230a..c6576ee07b 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -262,6 +262,20 @@ ifndef XDOTKERNEL XDOTKERNEL = zdot.S endif +ifeq ($(BUILD_HALF),1) +ifndef SHDOTKERNEL +SHDOTKERNEL = ../x86_64/shdot.c +endif + +ifndef TOBF16KERNEL +TOBF16KERNEL = ../x86_64/tobf16.c +endif + +ifndef BF16TOKERNEL +BF16TOKERNEL = ../x86_64/bf16to.c +endif +endif + ### NRM2 ### ifndef SNRM2KERNEL @@ -516,6 +530,15 @@ XBLASOBJS += \ xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) 
xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_HALF),1) +SHBLASOBJS += \ + shdot_k$(TSUFFIX).$(SUFFIX) +SHEXTOBJS += \ + shstobf16_k$(TSUFFIX).$(SUFFIX) shdtobf16_k$(TSUFFIX).$(SUFFIX) +SHEXTOBJS += \ + sbf16tos_k$(TSUFFIX).$(SUFFIX) dbf16tod_k$(TSUFFIX).$(SUFFIX) +endif + ### AMAX ### @@ -734,6 +757,19 @@ $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL $(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ +ifeq ($(BUILD_HALF),1) +$(KDIR)shdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)shdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHDOTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ +$(KDIR)shstobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) + $(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@ +$(KDIR)shdtobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) + $(CC) -c $(CFLAGS) -DDOUBLE -USINGLE $< -o $@ +$(KDIR)sbf16tos_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BF16TOKERNEL) + $(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@ +$(KDIR)dbf16tod_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BF16TOKERNEL) + $(CC) -c $(CFLAGS) -DDOUBLE -USINGLE $< -o $@ +endif + $(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 582a1dc016..c43520310b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -62,9 +62,11 @@ gotoblas_t TABLE_NAME = { MAX(SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N), #endif + shstobf16_kTS, shdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, + samax_kTS, samin_kTS, smax_kTS, smin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, - snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS, + snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, shdot_kTS, dsdot_kTS, srot_kTS, saxpy_kTS, 
sscal_kTS, sswap_kTS, sgemv_nTS, sgemv_tTS, sger_kTS, diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 4874711bbe..4a2e13bedb 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -146,6 +146,18 @@ ifndef XDOTKERNEL XDOTKERNEL = zdot.S endif +ifndef SHDOTKERNEL +SHDOTKERNEL = shdot.c +endif + +ifndef TOBF16KERNEL +TOBF16KERNEL = tobf16.c +endif + +ifndef BF16TOKERNEL +BF16TOKERNEL = bf16to.c +endif + ifndef ISAMAXKERNEL ISAMAXKERNEL = iamax_sse.S endif diff --git a/kernel/x86_64/bf16to.c b/kernel/x86_64/bf16to.c new file mode 100644 index 0000000000..fc6b5a5293 --- /dev/null +++ b/kernel/x86_64/bf16to.c @@ -0,0 +1,114 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if defined(DOUBLE) +#define FLOAT_TYPE double +#elif defined(SINGLE) +#define FLOAT_TYPE float +#else +#endif + +/* Notes for algorithm: + * - Input denormal treated as zero + * - Force to be QNAN + */ +static void bf16to_kernel_1(BLASLONG n, const bfloat16 * in, BLASLONG inc_in, FLOAT_TYPE * out, BLASLONG inc_out) +{ + BLASLONG register index_in = 0; + BLASLONG register index_out = 0; + BLASLONG register index = 0; + uint16_t * tmp = NULL; +#if defined(DOUBLE) + float float_out = 0.0; +#endif + + while(index= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_TOBF16_ACCL_KERNEL 1 +#include "common.h" +#include + +static void tobf16_accl_kernel(BLASLONG n, const double * in, bfloat16 * out) +{ + /* Get the 64-bytes unaligned header number targeting for avx512 + * processing (Assume input float array is natural aligned) */ + int align_header = ((64 - ((uintptr_t)in & (uintptr_t)0x3f)) >> 3) & 0x7; + + if (n < align_header) {align_header = n;} + + if (align_header != 0) { + unsigned char align_mask8 = (((unsigned char)0xff) >> (8-align_header)); + __m512d a = _mm512_maskz_loadu_pd(*((__mmask8*) &align_mask8), &in[0]); + _mm_mask_storeu_epi16(&out[0], *((__mmask8*) &align_mask8), (__m128i) _mm256_cvtneps_pbh(_mm512_cvtpd_ps(a))); + } + + if (n == align_header) { + return; + } else { + n -= 
align_header; + in += align_header; + out += align_header; + } + + int tail_index_8 = n&(~7); + int tail_index_32 = n&(~31); + int tail_index_128 = n&(~127); + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 -(n&7))); + + /* Processing the main chunk with 128-elements per round */ + for (int i = 0; i < tail_index_128; i += 128) { + // Fold 1 + __m512 data1_512_low = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+ 0]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+ 8])), 1); + __m512 data1_512_high = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+16]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+24])), 1); + _mm512_storeu_si512(&out[i+ 0], (__m512i) _mm512_cvtne2ps_pbh(data1_512_high, data1_512_low)); + + // Fold 2 + __m512 data2_512_low = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+32]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+40])), 1); + __m512 data2_512_high = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+48]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+56])), 1); + _mm512_storeu_si512(&out[i+32], (__m512i) _mm512_cvtne2ps_pbh(data2_512_high, data2_512_low)); + + // Fold 3 + __m512 data3_512_low = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+64]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+72])), 1); + __m512 data3_512_high = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+80]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+88])), 1); + _mm512_storeu_si512(&out[i+64], (__m512i) _mm512_cvtne2ps_pbh(data3_512_high, data3_512_low)); + + // Fold 4 + __m512 data4_512_low = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+96]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+104])), 1); + __m512 data4_512_high = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+112]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+120])), 1); + 
_mm512_storeu_si512(&out[i+96], (__m512i) _mm512_cvtne2ps_pbh(data4_512_high, data4_512_low)); + } + + /* Processing the remaining <128 chunk with 32-elements per round */ + for (int j = tail_index_128; j < tail_index_32; j += 32) { + __m512 data1_512_low = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[j+ 0]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[j+ 8])), 1); + __m512 data1_512_high = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[j+16]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[j+24])), 1); + _mm512_storeu_si512(&out[j], (__m512i) _mm512_cvtne2ps_pbh(data1_512_high, data1_512_low)); + } + + /* Processing the remaining <32 chunk with 8-elements per round */ + for (int j = tail_index_32; j < tail_index_8; j += 8) { + _mm_storeu_si128((__m128i *)&out[j], (__m128i) _mm256_cvtneps_pbh(_mm512_cvtpd_ps(_mm512_load_pd(&in[j])))); + } + + /* Processing the remaining <8 chunk with masked processing */ + if ((n&7) > 0) { + __m512d data_512 = _mm512_maskz_load_pd(*((__mmask8*) &tail_mask8), &in[tail_index_8]); + _mm_mask_storeu_epi16(&out[tail_index_8], *((__mmask8*) &tail_mask8), (__m128i) _mm256_cvtneps_pbh(_mm512_cvtpd_ps(data_512))); + } +} + +#endif diff --git a/kernel/x86_64/shdot.c b/kernel/x86_64/shdot.c new file mode 100644 index 0000000000..5073fda2a6 --- /dev/null +++ b/kernel/x86_64/shdot.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if defined(COOPERLAKE) +#include "shdot_microk_cooperlake.c" +#endif + +static float shdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y) +{ + float d = 0.0; + +#ifdef HAVE_SHDOT_ACCL_KERNEL + if ((inc_x == 1) && (inc_y == 1)) { + return shdot_accl_kernel(n, x, y); + } +#endif + + float * x_fp32 = malloc(sizeof(float)*n); + float * y_fp32 = malloc(sizeof(float)*n); + + SBF16TOS_K(n, x, inc_x, x_fp32, 1); + SBF16TOS_K(n, y, inc_y, y_fp32, 1); + + d = SDOTU_K(n, x_fp32, 1, y_fp32, 1); + + free(x_fp32); + free(y_fp32); + + return d; +} + +#if defined(SMP) +static int shdot_thread_func(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, bfloat16 dummy2, + bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y, + float *result, BLASLONG dummy3) +{ + *(float *)result = shdot_compute(n, x, inc_x, y, inc_y); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, + void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, + int (*function)(), int nthreads); +#endif + +float CNAME(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y) +{ + float dot_result = 0.0; + + if (n <= 0) return 0.0; + +#if defined(SMP) + int nthreads; + int thread_thres = 40960; + bfloat16 dummy_alpha; +#endif + +#if defined(SMP) + if (inc_x == 0 || inc_y == 0 || n <= thread_thres) + nthreads = 1; + else + nthreads = num_cpu_avail(1); + + int best_threads = (int) (n/(float)thread_thres + 0.5); + + if (best_threads < nthreads) { + nthreads = best_threads; + } + + if (nthreads <= 1) { + dot_result = shdot_compute(n, x, inc_x, y, inc_y); + } else { + char thread_result[MAX_CPU_NUMBER * sizeof(double) * 2]; + int mode = BLAS_BFLOAT16 | BLAS_REAL; + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, + x, inc_x, y, inc_y, thread_result, 0, + (void 
*)shdot_thread_func, nthreads); + float * ptr = (float *)thread_result; + for (int i = 0; i < nthreads; i++) { + dot_result += (*ptr); + ptr = (float *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + dot_result = shdot_compute(n, x, inc_x, y, inc_y); +#endif + + return dot_result; +} diff --git a/kernel/x86_64/shdot_microk_cooperlake.c b/kernel/x86_64/shdot_microk_cooperlake.c new file mode 100644 index 0000000000..e645296f18 --- /dev/null +++ b/kernel/x86_64/shdot_microk_cooperlake.c @@ -0,0 +1,159 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_SHDOT_ACCL_KERNEL 1 +#include "common.h" +#include + +static float shdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) +{ + __m128 accum128 = _mm_setzero_ps(); + if (n> 127) { /* n range from 128 to inf. */ + long tail_index_32 = n&(~31); + long tail_index_128 = n&(~127); + unsigned int tail_mask_uint = (((unsigned int)0xffffffff) >> (32-(n&31))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_uint); + + __m512 accum512_0 = _mm512_setzero_ps(); + __m512 accum512_1 = _mm512_setzero_ps(); + __m512 accum512_2 = _mm512_setzero_ps(); + __m512 accum512_3 = _mm512_setzero_ps(); + + /* Processing the main chunk with 128-elements per round */ + for (long i = 0; i < tail_index_128; i += 128) { + accum512_0 = _mm512_dpbf16_ps(accum512_0, (__m512bh) _mm512_loadu_si512(&x[i+ 0]), (__m512bh) _mm512_loadu_si512(&y[i+ 0])); + accum512_1 = _mm512_dpbf16_ps(accum512_1, (__m512bh) _mm512_loadu_si512(&x[i+32]), (__m512bh) _mm512_loadu_si512(&y[i+32])); + accum512_2 = _mm512_dpbf16_ps(accum512_2, (__m512bh) _mm512_loadu_si512(&x[i+64]), (__m512bh) _mm512_loadu_si512(&y[i+64])); + accum512_3 = _mm512_dpbf16_ps(accum512_3, (__m512bh) _mm512_loadu_si512(&x[i+96]), (__m512bh) _mm512_loadu_si512(&y[i+96])); + } 
+ + /* Processing the remaining <128 chunk with 32-elements per round */ + for (long j = tail_index_128; j < tail_index_32; j += 32) { + accum512_0 = _mm512_dpbf16_ps(accum512_0, (__m512bh) _mm512_loadu_si512(&x[j]), (__m512bh) _mm512_loadu_si512(&y[j])); + } + + /* Processing the remaining <32 chunk with masked 32-elements processing */ + if ((n&31) != 0) { + accum512_2 = _mm512_dpbf16_ps(accum512_2, + (__m512bh) _mm512_maskz_loadu_epi16(tail_mask, &x[tail_index_32]), + (__m512bh) _mm512_maskz_loadu_epi16(tail_mask, &y[tail_index_32])); + } + + /* Accumulate the 4 registers into 1 register */ + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_2 = _mm512_add_ps(accum512_2, accum512_3); + accum512_0 = _mm512_add_ps(accum512_0, accum512_2); + + __m256 accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); + } else if (n > 31) { /* n range from 32 to 127 */ + /* Processing <128 chunk with 32-elements per round */ + __m256 accum256 = _mm256_setzero_ps(); + __m256 accum256_1 = _mm256_setzero_ps(); + int tail_index_32 = n&(~31); + for (int j = 0; j < tail_index_32; j += 32) { + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[j+ 0]), (__m256bh) _mm256_loadu_si256(&y[j+ 0])); + accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256(&x[j+16]), (__m256bh) _mm256_loadu_si256(&y[j+16])); + } + accum256 = _mm256_add_ps(accum256, accum256_1); + + /* Processing the remaining <32 chunk with 16-elements processing */ + if ((n&16) != 0) { + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[tail_index_32]), (__m256bh) _mm256_loadu_si256(&y[tail_index_32])); + } + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); + + /* Processing the remaining <16 chunk with 8-elements processing */ + if ((n&8) != 0) { + int tail_index_16 = 
n&(~15); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16])); + } + + /* Processing the remaining <8 chunk with masked 8-elements processing */ + if ((n&7) != 0) { + unsigned char tail_mask_uint = (((unsigned char)0xff) >> (8-(n&7))); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_uint); + int tail_index_8 = n&(~7); + accum128 = _mm_dpbf16_ps(accum128, + (__m128bh) _mm_maskz_loadu_epi16(tail_mask, &x[tail_index_8]), + (__m128bh) _mm_maskz_loadu_epi16(tail_mask, &y[tail_index_8])); + } + } else if (n > 15) { /* n range from 16 to 31 */ + /* Processing <32 chunk with 16-elements processing */ + __m256 accum256 = _mm256_setzero_ps(); + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[0]), (__m256bh) _mm256_loadu_si256(&y[0])); + accum128 += _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); + + /* Processing the remaining <16 chunk with 8-elements processing */ + if ((n&8) != 0) { + int tail_index_16 = n&(~15); + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16])); + } + + /* Processing the remaining <8 chunk with masked 8-elements processing */ + if ((n&7) != 0) { + unsigned char tail_mask_uint = (((unsigned char)0xff) >> (8-(n&7))); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_uint); + int tail_index_8 = n&(~7); + accum128 = _mm_dpbf16_ps(accum128, + (__m128bh) _mm_maskz_loadu_epi16(tail_mask, &x[tail_index_8]), + (__m128bh) _mm_maskz_loadu_epi16(tail_mask, &y[tail_index_8])); + } + } else if (n > 7) { /* n range from 8 to 15 */ + /* Processing <16 chunk with 8-elements processing */ + accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[0]), (__m128bh) _mm_loadu_si128(&y[0])); + + /* Processing the remaining <8 chunk with masked 8-elements processing */ + if ((n&7) != 0) { + unsigned char tail_mask_uint = (((unsigned char)0xff) >> (8-(n&7))); + 
__mmask8 tail_mask = *((__mmask8*) &tail_mask_uint); + int tail_index_8 = n&(~7); + accum128 = _mm_dpbf16_ps(accum128, + (__m128bh) _mm_maskz_loadu_epi16(tail_mask, &x[tail_index_8]), + (__m128bh) _mm_maskz_loadu_epi16(tail_mask, &y[tail_index_8])); + } + } else { /* n range from 1 to 7 */ + unsigned char tail_mask_uint = (((unsigned char)0xff) >> (8-(n&7))); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_uint); + accum128 = _mm_dpbf16_ps(accum128, + (__m128bh) _mm_maskz_loadu_epi16(tail_mask, &x[0]), + (__m128bh) _mm_maskz_loadu_epi16(tail_mask, &y[0])); + } + + /* Add up the 4 elements into lowest entry */ + __m128 accum128_1 = _mm_shuffle_ps(accum128, accum128, 14); + accum128 = _mm_add_ps(accum128, accum128_1); + accum128_1 = _mm_shuffle_ps(accum128, accum128, 1); + accum128 = _mm_add_ps(accum128, accum128_1); + + return accum128[0]; +} + +#endif diff --git a/kernel/x86_64/stobf16_microk_cooperlake.c b/kernel/x86_64/stobf16_microk_cooperlake.c new file mode 100644 index 0000000000..2756a69345 --- /dev/null +++ b/kernel/x86_64/stobf16_microk_cooperlake.c @@ -0,0 +1,86 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_TOBF16_ACCL_KERNEL 1 +#include "common.h" +#include + +static void tobf16_accl_kernel(BLASLONG n, const float * in, bfloat16 * out) +{ + /* Get the 64-bytes unaligned header number targeting for avx512 + * processing (Assume input float array is natural aligned) */ + int align_header = ((64 - ((uintptr_t)in & (uintptr_t)0x3f)) >> 2) & 0xf; + + if (n < align_header) {align_header = n;} + + if (align_header != 0) { + uint16_t align_mask16 = (((uint16_t)0xffff) >> (16-align_header)); + __m512 a = _mm512_maskz_loadu_ps(*((__mmask16*) &align_mask16), &in[0]); + _mm256_mask_storeu_epi16(&out[0], *((__mmask16*) &align_mask16), (__m256i) _mm512_cvtneps_pbh(a)); + } + + if (n == align_header) { + return; + } else { + n -= align_header; + in += align_header; + out += align_header; + } + + int tail_index_32 = n&(~31); + int tail_index_128 = n&(~127); + uint32_t tail_mask32 = (((uint32_t) 0xffffffff) >> (32-(n&31))); + uint16_t tail_mask16 = (((uint16_t) 0xffff) 
>> (16-(n&15))); + + /* Processing the main chunk with 128-elements per round */ + for (int i = 0; i < tail_index_128; i += 128) { + _mm512_storeu_si512(&out[i+ 0], (__m512i) _mm512_cvtne2ps_pbh(_mm512_load_ps(&in[i+ 16]), _mm512_load_ps(&in[i+ 0]))); + _mm512_storeu_si512(&out[i+32], (__m512i) _mm512_cvtne2ps_pbh(_mm512_load_ps(&in[i+ 48]), _mm512_load_ps(&in[i+32]))); + _mm512_storeu_si512(&out[i+64], (__m512i) _mm512_cvtne2ps_pbh(_mm512_load_ps(&in[i+ 80]), _mm512_load_ps(&in[i+64]))); + _mm512_storeu_si512(&out[i+96], (__m512i) _mm512_cvtne2ps_pbh(_mm512_load_ps(&in[i+112]), _mm512_load_ps(&in[i+96]))); + } + + /* Processing the remaining <128 chunk with 32-elements per round */ + for (int j = tail_index_128; j < tail_index_32; j += 32) { + _mm512_storeu_si512(&out[j], (__m512i) _mm512_cvtne2ps_pbh(_mm512_load_ps(&in[j+ 16]), _mm512_load_ps(&in[j]))); + } + + /* Processing the remaining <32 chunk with masked processing */ + if ((n&31) > 15) { + __m512 b = _mm512_load_ps(&in[tail_index_32]); + __m512 a = _mm512_maskz_load_ps(*((__mmask16*) &tail_mask16), &in[tail_index_32+16]); + _mm512_mask_storeu_epi16(&out[tail_index_32], *((__mmask32*) &tail_mask32), (__m512i) _mm512_cvtne2ps_pbh(a, b)); + } else if ((n&31) > 0) { + __m512 a = _mm512_maskz_load_ps(*((__mmask16*) &tail_mask16), &in[tail_index_32]); + _mm256_mask_storeu_epi16(&out[tail_index_32], *((__mmask16*) &tail_mask16), (__m256i) _mm512_cvtneps_pbh(a)); + } +} + +#endif diff --git a/kernel/x86_64/tobf16.c b/kernel/x86_64/tobf16.c new file mode 100644 index 0000000000..3d17966214 --- /dev/null +++ b/kernel/x86_64/tobf16.c @@ -0,0 +1,170 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include "common.h" + +#if defined(DOUBLE) +#define FLOAT_TYPE double +#elif defined(SINGLE) +#define FLOAT_TYPE float +#else +#endif + +#if defined(COOPERLAKE) +#if defined(DOUBLE) +#include "dtobf16_microk_cooperlake.c" +#elif defined(SINGLE) +#include "stobf16_microk_cooperlake.c" +#endif +#endif + +/* Notes for algorithm: + * - Round to Nearest Even used generally + * - QNAN for NAN case + * - Input denormals are treated as zero + */ +static void tobf16_generic_kernel(BLASLONG n, const FLOAT_TYPE * in, BLASLONG inc_in, bfloat16 * out, BLASLONG inc_out) +{ + BLASLONG register index_in = 0; + BLASLONG register index_out = 0; + BLASLONG register index = 0; + float float_in = 0.0; + uint32_t * uint32_in = (uint32_t *)(&float_in); + uint16_t * uint16_in = (uint16_t *)(&float_in); + + while(index> 16) & 0x1u) + 0x7fffu); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + *(out+index_out) = uint16_in[1]; +#else + *(out+index_out) = uint16_in[0]; +#endif + break; + } + + index_in += inc_in; + index_out += inc_out; + index++; + } +} + +#ifndef HAVE_TOBF16_ACCL_KERNEL +static void tobf16_accl_kernel(BLASLONG n, const FLOAT_TYPE * in, bfloat16 * out) +{ + tobf16_generic_kernel(n, in, 1, out, 1); +} +#endif + +static void tobf16_compute(BLASLONG n, FLOAT_TYPE * in, BLASLONG inc_in, bfloat16 * out, BLASLONG inc_out) +{ + if ((inc_in == 1) && (inc_out == 1)) { + tobf16_accl_kernel(n, in, out); + } else { + tobf16_generic_kernel(n, in, inc_in, out, inc_out); + } +} + +#if defined(SMP) +static int tobf16_thread_func(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT_TYPE dummy2, + FLOAT_TYPE *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y, + FLOAT_TYPE *dummy3, BLASLONG dummy4) +{ + tobf16_compute(n, x, inc_x, y, inc_y); + return 0; +} + +extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, + void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, 
BLASLONG ldc, + int (*function)(), int nthreads); +#endif + +void CNAME(BLASLONG n, FLOAT_TYPE * in, BLASLONG inc_in, bfloat16 * out, BLASLONG inc_out) +{ + if (n <= 0) return; + +#if defined(SMP) + int nthreads; + FLOAT_TYPE dummy_alpha; + FLOAT_TYPE dummy_c; +#endif + +#if defined(SMP) + if (inc_in == 0 || inc_out == 0 || n <= 100000) { + nthreads = 1; + } else { + if (n/100000 < 100) { + nthreads = 4; + } else { + nthreads = 16; + } + } + + if (nthreads == 1) { + tobf16_compute(n, in, inc_in, out, inc_out); + } else { +#if defined(DOUBLE) + int mode = BLAS_REAL | BLAS_DTOBF16; +#elif defined(SINGLE) + int mode = BLAS_REAL | BLAS_STOBF16; +#endif + blas_level1_thread(mode, n, 0, 0, &dummy_alpha, + in, inc_in, out, inc_out, &dummy_c, 0, + (void *)tobf16_thread_func, nthreads); + } +#else + tobf16_compute(n, in, inc_in, out, inc_out); +#endif + +} diff --git a/openblas_config_template.h b/openblas_config_template.h index 9955e5c73d..858b8c5cb0 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -35,7 +35,8 @@ typedef unsigned long BLASULONG; #endif #ifndef BFLOAT16 -typedef unsigned short bfloat16; +#include +typedef uint16_t bfloat16; #endif #ifdef OPENBLAS_USE64BITINT From 860247b5da58debb2082353a730f64049018bf35 Mon Sep 17 00:00:00 2001 From: "H. 
Vetinari" Date: Wed, 2 Sep 2020 22:38:56 +0200 Subject: [PATCH 171/349] Follow-up to lapack#434 & lapack#409: fix signature mismatches --- lapack-netlib/LAPACKE/include/lapack.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index c045892df6..9a8e1a218a 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -3665,7 +3665,7 @@ lapack_int LAPACK_dggsvd( char const* jobu, char const* jobv, char const* jobq, lapack_int* lda, double* b, lapack_int* ldb, double* alpha, double* beta, double* u, lapack_int* ldu, double* v, lapack_int* ldv, double* q, - lapack_int* ldq, float* work, lapack_int* iwork, lapack_int* info ); + lapack_int* ldq, double* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD) lapack_int LAPACK_cggsvd( char const* jobu, char const* jobv, char const* jobq, @@ -3676,7 +3676,7 @@ lapack_int LAPACK_cggsvd( char const* jobu, char const* jobv, char const* jobq, float* alpha, float* beta, lapack_complex_float* u, lapack_int* ldu, lapack_complex_float* v, lapack_int* ldv, lapack_complex_float* q, - lapack_int* ldq, float* work, lapack_int* rwork, lapack_int* iwork, lapack_int *info ); + lapack_int* ldq, lapack_complex_float* work, float* rwork, lapack_int* iwork, lapack_int* info ); #define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD) lapack_int LAPACK_zggsvd( char const* jobu, char const* jobv, char const* jobq, @@ -3688,7 +3688,7 @@ lapack_int LAPACK_zggsvd( char const* jobu, char const* jobv, char const* jobq, lapack_complex_double* u, lapack_int* ldu, lapack_complex_double* v, lapack_int* ldv, lapack_complex_double* q, lapack_int* ldq, - float* work, lapack_int* rwork, lapack_int* iwork, lapack_int* info ); + lapack_complex_double* work, double* rwork, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd3 LAPACK_GLOBAL(cggsvd3,CGGSVD3) void 
LAPACK_cggsvd3( @@ -3780,7 +3780,7 @@ lapack_int LAPACK_cggsvp( char const* jobu, char const* jobv, char const* jobq, lapack_complex_float* u, lapack_int* ldu, lapack_complex_float* v, lapack_int* ldv, lapack_complex_float* q, lapack_int* ldq, - lapack_int* iwork, lapack_int* rwork, + lapack_int* iwork, float* rwork, lapack_complex_float* tau, lapack_complex_float* work, lapack_int* info); @@ -3793,7 +3793,7 @@ lapack_int LAPACK_zggsvp( char const* jobu, char const* jobv, char const* jobq, lapack_int* l, lapack_complex_double* u, lapack_int* ldu, lapack_complex_double* v, lapack_int* ldv, lapack_complex_double* q, - lapack_int* ldq, lapack_int* iwork, lapack_int* rwork, + lapack_int* ldq, lapack_int* iwork, double* rwork, lapack_complex_double* tau, lapack_complex_double* work, lapack_int* info); From 1c6c71fa853226073779aba4cc5c08a2ba22300c Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 2 Sep 2020 22:41:50 +0200 Subject: [PATCH 172/349] Follow-up to lapack#434 & lapack#409: add missing 'const' in signatures Based on how the surrounding functions in lapack.h are handling the parameters, particularly the ?ggsv?3-variants of the affected functions --- lapack-netlib/LAPACKE/include/lapack.h | 80 +++++++++++++------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 9a8e1a218a..f0af3795d0 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -3651,43 +3651,43 @@ void LAPACK_zggrqf( #define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD) lapack_int LAPACK_sggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* n, lapack_int* p, + lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l, float* a, - lapack_int* lda, float* b, lapack_int* ldb, - float* alpha, float* beta, float* u, lapack_int* ldu, - float* v, lapack_int* ldv, float* q, lapack_int* 
ldq, + lapack_int const* lda, float* b, lapack_int const* ldb, + float* alpha, float* beta, float* u, lapack_int const* ldu, + float* v, lapack_int const* ldv, float* q, lapack_int const* ldq, float* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD) lapack_int LAPACK_dggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* n, lapack_int* p, + lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l, double* a, - lapack_int* lda, double* b, lapack_int* ldb, + lapack_int const* lda, double* b, lapack_int const* ldb, double* alpha, double* beta, double* u, - lapack_int* ldu, double* v, lapack_int* ldv, double* q, - lapack_int* ldq, double* work, lapack_int* iwork, lapack_int* info ); + lapack_int const* ldu, double* v, lapack_int const* ldv, double* q, + lapack_int const* ldq, double* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD) lapack_int LAPACK_cggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* n, lapack_int* p, + lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb, + lapack_complex_float* a, lapack_int const* lda, + lapack_complex_float* b, lapack_int const* ldb, float* alpha, float* beta, lapack_complex_float* u, - lapack_int* ldu, lapack_complex_float* v, - lapack_int* ldv, lapack_complex_float* q, - lapack_int* ldq, lapack_complex_float* work, float* rwork, lapack_int* iwork, lapack_int* info ); + lapack_int const* ldu, lapack_complex_float* v, + lapack_int const* ldv, lapack_complex_float* q, + lapack_int const* ldq, lapack_complex_float* work, float* rwork, lapack_int* iwork, lapack_int* info ); #define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD) lapack_int LAPACK_zggsvd( char const* jobu, char const* jobv, char const* jobq, - 
lapack_int* m, lapack_int* n, lapack_int* p, + lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* b, lapack_int* ldb, + lapack_complex_double* a, lapack_int const* lda, + lapack_complex_double* b, lapack_int const* ldb, double* alpha, double* beta, - lapack_complex_double* u, lapack_int* ldu, - lapack_complex_double* v, lapack_int* ldv, - lapack_complex_double* q, lapack_int* ldq, + lapack_complex_double* u, lapack_int const* ldu, + lapack_complex_double* v, lapack_int const* ldv, + lapack_complex_double* q, lapack_int const* ldq, lapack_complex_double* work, double* rwork, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd3 LAPACK_GLOBAL(cggsvd3,CGGSVD3) @@ -3754,46 +3754,46 @@ void LAPACK_zggsvd3( #define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP) lapack_int LAPACK_sggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* p, lapack_int* n, float* a, - lapack_int* lda, float* b, lapack_int* ldb, float* tola, + lapack_int const* m, lapack_int const* p, lapack_int const* n, float* a, + lapack_int const* lda, float* b, lapack_int const* ldb, float* tola, float* tolb, lapack_int* k, lapack_int* l, float* u, - lapack_int* ldu, float* v, lapack_int* ldv, float* q, - lapack_int* ldq, lapack_int* iwork, float* tau, + lapack_int const* ldu, float* v, lapack_int const* ldv, float* q, + lapack_int const* ldq, lapack_int* iwork, float* tau, float* work, lapack_int* info); #define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP) lapack_int LAPACK_dggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* p, lapack_int* n, double* a, - lapack_int* lda, double* b, lapack_int* ldb, + lapack_int const* m, lapack_int const* p, lapack_int const* n, double* a, + lapack_int const* lda, double* b, lapack_int const* ldb, double* tola, double* tolb, lapack_int* k, - lapack_int* l, double* u, lapack_int* ldu, 
double* v, - lapack_int* ldv, double* q, lapack_int* ldq, + lapack_int* l, double* u, lapack_int const* ldu, double* v, + lapack_int const* ldv, double* q, lapack_int const* ldq, lapack_int* iwork, double* tau, double* work, lapack_int* info); #define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP) lapack_int LAPACK_cggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* p, lapack_int* n, - lapack_complex_float* a, lapack_int* lda, - lapack_complex_float* b, lapack_int* ldb, float* tola, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + lapack_complex_float* a, lapack_int const* lda, + lapack_complex_float* b, lapack_int const* ldb, float* tola, float* tolb, lapack_int* k, lapack_int* l, - lapack_complex_float* u, lapack_int* ldu, - lapack_complex_float* v, lapack_int* ldv, - lapack_complex_float* q, lapack_int* ldq, + lapack_complex_float* u, lapack_int const* ldu, + lapack_complex_float* v, lapack_int const* ldv, + lapack_complex_float* q, lapack_int const* ldq, lapack_int* iwork, float* rwork, lapack_complex_float* tau, lapack_complex_float* work, lapack_int* info); #define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP) lapack_int LAPACK_zggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int* m, lapack_int* p, lapack_int* n, - lapack_complex_double* a, lapack_int* lda, - lapack_complex_double* b, lapack_int* ldb, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + lapack_complex_double* a, lapack_int const* lda, + lapack_complex_double* b, lapack_int const* ldb, double* tola, double* tolb, lapack_int* k, lapack_int* l, lapack_complex_double* u, - lapack_int* ldu, lapack_complex_double* v, - lapack_int* ldv, lapack_complex_double* q, - lapack_int* ldq, lapack_int* iwork, double* rwork, + lapack_int const* ldu, lapack_complex_double* v, + lapack_int const* ldv, lapack_complex_double* q, + lapack_int const* ldq, lapack_int* iwork, double* rwork, lapack_complex_double* tau, 
lapack_complex_double* work, lapack_int* info); From 3426519ae2e4210dc6088b484ce7b8f1abd1d38d Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Wed, 2 Sep 2020 22:46:47 +0200 Subject: [PATCH 173/349] adapt ?ggsv?-functions to ambient code style in LAPACKE/include/lapack.h --- lapack-netlib/LAPACKE/include/lapack.h | 162 ++++++++++++++----------- 1 file changed, 92 insertions(+), 70 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index f0af3795d0..aedaa308dd 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -3650,45 +3650,58 @@ void LAPACK_zggrqf( lapack_int* info ); #define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD) -lapack_int LAPACK_sggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int const* m, lapack_int const* n, lapack_int const* p, - lapack_int* k, lapack_int* l, float* a, - lapack_int const* lda, float* b, lapack_int const* ldb, - float* alpha, float* beta, float* u, lapack_int const* ldu, - float* v, lapack_int const* ldv, float* q, lapack_int const* ldq, - float* work, lapack_int* iwork, lapack_int* info ); +lapack_int LAPACK_sggsvd( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* n, lapack_int const* p, + lapack_int* k, lapack_int* l, + float* a, lapack_int const* lda, + float* b, lapack_int const* ldb, + float* alpha, float* beta, + float* u, lapack_int const* ldu, + float* v, lapack_int const* ldv, + float* q, lapack_int const* ldq, + float* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD) -lapack_int LAPACK_dggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int const* m, lapack_int const* n, lapack_int const* p, - lapack_int* k, lapack_int* l, double* a, - lapack_int const* lda, double* b, lapack_int const* ldb, - double* alpha, double* beta, double* u, - lapack_int const* ldu, double* v, lapack_int const* 
ldv, double* q, - lapack_int const* ldq, double* work, lapack_int* iwork, lapack_int* info ); +lapack_int LAPACK_dggsvd( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* n, lapack_int const* p, + lapack_int* k, lapack_int* l, + double* a, lapack_int const* lda, + double* b, lapack_int const* ldb, + double* alpha, double* beta, + double* u, lapack_int const* ldu, + double* v, lapack_int const* ldv, + double* q, lapack_int const* ldq, + double* work, lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD) -lapack_int LAPACK_cggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int const* m, lapack_int const* n, lapack_int const* p, - lapack_int* k, lapack_int* l, - lapack_complex_float* a, lapack_int const* lda, - lapack_complex_float* b, lapack_int const* ldb, - float* alpha, float* beta, lapack_complex_float* u, - lapack_int const* ldu, lapack_complex_float* v, - lapack_int const* ldv, lapack_complex_float* q, - lapack_int const* ldq, lapack_complex_float* work, float* rwork, lapack_int* iwork, lapack_int* info ); +lapack_int LAPACK_cggsvd( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* n, lapack_int const* p, + lapack_int* k, lapack_int* l, + lapack_complex_float* a, lapack_int const* lda, + lapack_complex_float* b, lapack_int const* ldb, + float* alpha, float* beta, + lapack_complex_float* u, lapack_int const* ldu, + lapack_complex_float* v, lapack_int const* ldv, + lapack_complex_float* q, lapack_int const* ldq, + lapack_complex_float* work, float* rwork, + lapack_int* iwork, lapack_int* info ); #define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD) -lapack_int LAPACK_zggsvd( char const* jobu, char const* jobv, char const* jobq, - lapack_int const* m, lapack_int const* n, lapack_int const* p, - lapack_int* k, lapack_int* l, - lapack_complex_double* a, lapack_int const* lda, - lapack_complex_double* b, lapack_int const* 
ldb, - double* alpha, double* beta, - lapack_complex_double* u, lapack_int const* ldu, - lapack_complex_double* v, lapack_int const* ldv, - lapack_complex_double* q, lapack_int const* ldq, - lapack_complex_double* work, double* rwork, lapack_int* iwork, lapack_int* info ); +lapack_int LAPACK_zggsvd( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* n, lapack_int const* p, + lapack_int* k, lapack_int* l, + lapack_complex_double* a, lapack_int const* lda, + lapack_complex_double* b, lapack_int const* ldb, + double* alpha, double* beta, + lapack_complex_double* u, lapack_int const* ldu, + lapack_complex_double* v, lapack_int const* ldv, + lapack_complex_double* q, lapack_int const* ldq, + lapack_complex_double* work, double* rwork, + lapack_int* iwork, lapack_int* info ); #define LAPACK_cggsvd3 LAPACK_GLOBAL(cggsvd3,CGGSVD3) void LAPACK_cggsvd3( @@ -3753,49 +3766,58 @@ void LAPACK_zggsvd3( lapack_int* info ); #define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP) -lapack_int LAPACK_sggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int const* m, lapack_int const* p, lapack_int const* n, float* a, - lapack_int const* lda, float* b, lapack_int const* ldb, float* tola, - float* tolb, lapack_int* k, lapack_int* l, float* u, - lapack_int const* ldu, float* v, lapack_int const* ldv, float* q, - lapack_int const* ldq, lapack_int* iwork, float* tau, - float* work, lapack_int* info); +lapack_int LAPACK_sggsvp( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + float* a, lapack_int const* lda, + float* b, lapack_int const* ldb, + float* tola, float* tolb, + lapack_int* k, lapack_int* l, + float* u, lapack_int const* ldu, + float* v, lapack_int const* ldv, + float* q, lapack_int const* ldq, + lapack_int* iwork, float* tau, + float* work, lapack_int* info ); #define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP) -lapack_int LAPACK_dggsvp( char const* 
jobu, char const* jobv, char const* jobq, - lapack_int const* m, lapack_int const* p, lapack_int const* n, double* a, - lapack_int const* lda, double* b, lapack_int const* ldb, - double* tola, double* tolb, lapack_int* k, - lapack_int* l, double* u, lapack_int const* ldu, double* v, - lapack_int const* ldv, double* q, lapack_int const* ldq, - lapack_int* iwork, double* tau, double* work, - lapack_int* info); +lapack_int LAPACK_dggsvp( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + double* a, lapack_int const* lda, + double* b, lapack_int const* ldb, + double* tola, double* tolb, + lapack_int* k, lapack_int* l, + double* u, lapack_int const* ldu, + double* v, lapack_int const* ldv, + double* q, lapack_int const* ldq, + lapack_int* iwork, double* tau, + double* work, lapack_int* info ); #define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP) -lapack_int LAPACK_cggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int const* m, lapack_int const* p, lapack_int const* n, - lapack_complex_float* a, lapack_int const* lda, - lapack_complex_float* b, lapack_int const* ldb, float* tola, - float* tolb, lapack_int* k, lapack_int* l, - lapack_complex_float* u, lapack_int const* ldu, - lapack_complex_float* v, lapack_int const* ldv, - lapack_complex_float* q, lapack_int const* ldq, - lapack_int* iwork, float* rwork, - lapack_complex_float* tau, lapack_complex_float* work, - lapack_int* info); +lapack_int LAPACK_cggsvp( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + lapack_complex_float* a, lapack_int const* lda, + lapack_complex_float* b, lapack_int const* ldb, + float* tola, float* tolb, lapack_int* k, lapack_int* l, + lapack_complex_float* u, lapack_int const* ldu, + lapack_complex_float* v, lapack_int const* ldv, + lapack_complex_float* q, lapack_int const* ldq, + lapack_int* iwork, float* rwork, 
lapack_complex_float* tau, + lapack_complex_float* work, lapack_int* info ); #define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP) -lapack_int LAPACK_zggsvp( char const* jobu, char const* jobv, char const* jobq, - lapack_int const* m, lapack_int const* p, lapack_int const* n, - lapack_complex_double* a, lapack_int const* lda, - lapack_complex_double* b, lapack_int const* ldb, - double* tola, double* tolb, lapack_int* k, - lapack_int* l, lapack_complex_double* u, - lapack_int const* ldu, lapack_complex_double* v, - lapack_int const* ldv, lapack_complex_double* q, - lapack_int const* ldq, lapack_int* iwork, double* rwork, - lapack_complex_double* tau, lapack_complex_double* work, - lapack_int* info); +lapack_int LAPACK_zggsvp( + char const* jobu, char const* jobv, char const* jobq, + lapack_int const* m, lapack_int const* p, lapack_int const* n, + lapack_complex_double* a, lapack_int const* lda, + lapack_complex_double* b, lapack_int const* ldb, + double* tola, double* tolb, lapack_int* k, lapack_int* l, + lapack_complex_double* u, lapack_int const* ldu, + lapack_complex_double* v, lapack_int const* ldv, + lapack_complex_double* q, lapack_int const* ldq, + lapack_int* iwork, double* rwork, lapack_complex_double* tau, + lapack_complex_double* work, lapack_int* info ); #define LAPACK_cggsvp3 LAPACK_GLOBAL(cggsvp3,CGGSVP3) void LAPACK_cggsvp3( From 718f67421aaf83fb33722e4267a2be40185f63de Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Fri, 4 Sep 2020 10:36:19 -0500 Subject: [PATCH 174/349] POWER9: Fix mcpu option with clang Adding check for compiler type before checking GCC version in Makefile. This allows clang to use power9 instead of power8 when CORE is POWER9. 
--- Makefile.power | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Makefile.power b/Makefile.power index 37a02d6922..e766f84994 100644 --- a/Makefile.power +++ b/Makefile.power @@ -17,6 +17,7 @@ endif ifeq ($(CORE), POWER9) ifneq ($(C_COMPILER), PGI) CCOMMON_OPT += -Ofast -mvsx -fno-fast-math +ifeq ($(C_COMPILER), GCC) ifneq ($(GCCVERSIONGT4), 1) $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) CCOMMON_OPT += -mcpu=power8 -mtune=power8 @@ -24,10 +25,14 @@ else CCOMMON_OPT += -mcpu=power9 -mtune=power9 endif else +CCOMMON_OPT += -mcpu=power9 -mtune=power9 +endif +else CCOMMON_OPT += -fast -Mvect=simd -Mcache_align endif ifneq ($(F_COMPILER), PGI) FCOMMON_OPT += -O2 -frecursive -fno-fast-math +ifeq ($(C_COMPILER), GCC) ifneq ($(GCCVERSIONGT4), 1) $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) FCOMMON_OPT += -mcpu=power8 -mtune=power8 @@ -35,6 +40,9 @@ else FCOMMON_OPT += -mcpu=power9 -mtune=power9 endif else +FCOMMON_OPT += -mcpu=power9 -mtune=power9 +endif +else FCOMMON_OPT += -O2 -Mrecursive endif endif From 330044d82147a9a08fd10d503fec7f406cde2861 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Sep 2020 09:44:33 +0200 Subject: [PATCH 175/349] Fix potentiol domain error in sqrt --- driver/level3/level3_syrk_threaded.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c index a041abac31..d7dcd68a3b 100644 --- a/driver/level3/level3_syrk_threaded.c +++ b/driver/level3/level3_syrk_threaded.c @@ -526,7 +526,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO BLASLONG width, i, j, k; BLASLONG n, n_from, n_to; int mode, mask; - double dnum; + double dnum, di, dinum; if ((nthreads == 1) || (args -> n < nthreads * SWITCH_RATIO)) { SYRK_LOCAL(args, range_m, range_n, sa, sb, 0); @@ -601,9 
+601,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (nthreads - num_cpu > 1) { - double di = (double)i; + di = (double)i; - width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1) ); + dinum = di * di + dnum; + + if (dinum > 0) + width = (((BLASLONG)((sqrt(dinum) - di) + mask)/(mask+1)) * (mask+1) ); + else + width = (((BLASLONG)(- di + mask)/(mask+1)) * (mask+1) ); if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1) ); @@ -643,10 +648,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (nthreads - num_cpu > 1) { - double di = (double)i; + di = (double)i; - width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); + dinum = di * di +dnum; + if (dinum > 0) + width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1)); + else + width = (((BLASLONG)(- di + mask)/(mask+1)) * (mask+1)); + if ((width > n - i) || (width < mask)) width = n - i; } else { From 8a2a137a9e4e4ec657c5befe361061607489aaa2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Sep 2020 13:06:31 +0200 Subject: [PATCH 176/349] Correct argument to SLASET (Improves fix from PR2778) as explained by serguei-patchkovskii in Reference-LAPACK/lapack#438 (comment) , passing in an index of 1 instead of N leads to a standards violation accessing matrix A in SLASET, i.e. undefined behavior --- lapack-netlib/TESTING/EIG/cchkhb2stg.f | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cchkhb2stg.f b/lapack-netlib/TESTING/EIG/cchkhb2stg.f index cd884febfe..100f133abd 100644 --- a/lapack-netlib/TESTING/EIG/cchkhb2stg.f +++ b/lapack-netlib/TESTING/EIG/cchkhb2stg.f @@ -680,8 +680,8 @@ SUBROUTINE CCHKHB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, * the one from above. Compare it with D1 computed * using the DSBTRD. 
* - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL CLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH @@ -753,8 +753,8 @@ SUBROUTINE CCHKHB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, * the one from above. Compare it with D1 computed * using the DSBTRD. * - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL SLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL CLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH From 6f8fad87c5d272f3e01853906be0269d9b96b30a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 5 Sep 2020 19:44:01 +0200 Subject: [PATCH 177/349] Use POSIX2001 clock.gettime for higher resolution --- benchmark/asum.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/benchmark/asum.c b/benchmark/asum.c index 78ccdf47b9..e3d16acfd2 100644 --- a/benchmark/asum.c +++ b/benchmark/asum.c @@ -128,8 +128,13 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; +#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) struct timeval start, stop; double time1,timeg; +#else + struct timespec start = { 0, 0 }, stop = { 0, 0 }; + double time1, timeg; +#endif argc--;argv++; @@ -160,26 +165,30 @@ int main(int argc, char *argv[]){ fprintf(stderr, " %6d : ", (int)m); - for (l=0; l1) timeg /= loops; #ifdef COMPLEX From 7d9c77f421fd662f8e103f6fae8adefc49e42078 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 7 Sep 2020 22:03:46 +0200 Subject: [PATCH 178/349] Correct dimension argument to xLASET from Reference-LAPACK PR 438 --- lapack-netlib/TESTING/EIG/cchkst2stg.f | 8 ++++---- lapack-netlib/TESTING/EIG/dchksb2stg.f | 8 ++++---- lapack-netlib/TESTING/EIG/dchkst2stg.f | 8 ++++---- 
lapack-netlib/TESTING/EIG/zchkhb2stg.f | 8 ++++---- lapack-netlib/TESTING/EIG/zchkst2stg.f | 8 ++++---- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/lapack-netlib/TESTING/EIG/cchkst2stg.f b/lapack-netlib/TESTING/EIG/cchkst2stg.f index 5c478577f0..8c7f962b74 100644 --- a/lapack-netlib/TESTING/EIG/cchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/cchkst2stg.f @@ -1014,8 +1014,8 @@ SUBROUTINE CCHKST2STG( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, * the one from above. Compare it with D1 computed * using the 1-stage. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL CLACPY( 'U', N, N, A, LDA, V, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH @@ -1048,8 +1048,8 @@ SUBROUTINE CCHKST2STG( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, * the one from above. Compare it with D1 computed * using the 1-stage. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL CLACPY( 'L', N, N, A, LDA, V, LDU ) CALL CHETRD_2STAGE( 'N', "L", N, V, LDU, SD, SE, TAU, $ WORK, LH, WORK( LH+1 ), LW, IINFO ) diff --git a/lapack-netlib/TESTING/EIG/dchksb2stg.f b/lapack-netlib/TESTING/EIG/dchksb2stg.f index ee66f7ebb3..88f6e18d36 100644 --- a/lapack-netlib/TESTING/EIG/dchksb2stg.f +++ b/lapack-netlib/TESTING/EIG/dchksb2stg.f @@ -670,8 +670,8 @@ SUBROUTINE DCHKSB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, * the one from above. Compare it with D1 computed * using the DSBTRD. 
* - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL DLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH @@ -743,8 +743,8 @@ SUBROUTINE DCHKSB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, * the one from above. Compare it with D1 computed * using the DSBTRD. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL DLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH diff --git a/lapack-netlib/TESTING/EIG/dchkst2stg.f b/lapack-netlib/TESTING/EIG/dchkst2stg.f index ca31c9d1f0..7115175c29 100644 --- a/lapack-netlib/TESTING/EIG/dchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/dchkst2stg.f @@ -999,8 +999,8 @@ SUBROUTINE DCHKST2STG( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, * the one from above. Compare it with D1 computed * using the 1-stage. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL DLACPY( "U", N, N, A, LDA, V, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH @@ -1032,8 +1032,8 @@ SUBROUTINE DCHKST2STG( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, * the one from above. Compare it with D1 computed * using the 1-stage. 
* - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL DLACPY( "L", N, N, A, LDA, V, LDU ) CALL DSYTRD_2STAGE( 'N', "L", N, V, LDU, SD, SE, TAU, $ WORK, LH, WORK( LH+1 ), LW, IINFO ) diff --git a/lapack-netlib/TESTING/EIG/zchkhb2stg.f b/lapack-netlib/TESTING/EIG/zchkhb2stg.f index dbbb843480..05434e4e33 100644 --- a/lapack-netlib/TESTING/EIG/zchkhb2stg.f +++ b/lapack-netlib/TESTING/EIG/zchkhb2stg.f @@ -680,8 +680,8 @@ SUBROUTINE ZCHKHB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, * the one from above. Compare it with D1 computed * using the DSBTRD. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL ZLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH @@ -753,8 +753,8 @@ SUBROUTINE ZCHKHB2STG( NSIZES, NN, NWDTHS, KK, NTYPES, DOTYPE, * the one from above. Compare it with D1 computed * using the DSBTRD. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL ZLACPY( ' ', K+1, N, A, LDA, U, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH diff --git a/lapack-netlib/TESTING/EIG/zchkst2stg.f b/lapack-netlib/TESTING/EIG/zchkst2stg.f index 167e5f3591..4eadca4f30 100644 --- a/lapack-netlib/TESTING/EIG/zchkst2stg.f +++ b/lapack-netlib/TESTING/EIG/zchkst2stg.f @@ -1014,8 +1014,8 @@ SUBROUTINE ZCHKST2STG( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, * the one from above. Compare it with D1 computed * using the 1-stage. 
* - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL ZLACPY( 'U', N, N, A, LDA, V, LDU ) LH = MAX(1, 4*N) LW = LWORK - LH @@ -1048,8 +1048,8 @@ SUBROUTINE ZCHKST2STG( NSIZES, NN, NTYPES, DOTYPE, ISEED, THRESH, * the one from above. Compare it with D1 computed * using the 1-stage. * - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, 1 ) - CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, 1 ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SD, N ) + CALL DLASET( 'Full', N, 1, ZERO, ZERO, SE, N ) CALL ZLACPY( 'L', N, N, A, LDA, V, LDU ) CALL ZHETRD_2STAGE( 'N', "L", N, V, LDU, SD, SE, TAU, $ WORK, LH, WORK( LH+1 ), LW, IINFO ) From 0629d8ebdb98995b995ac4593c98f7721703c8fc Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Fri, 4 Sep 2020 16:32:45 +0200 Subject: [PATCH 179/349] s390x/DYNAMIC_ARCH: generalize detecting supported archs for clang Simplify detection of which kernels we can compile on s390x. Instead of decoding the gcc version in a complicated manner, just check if CC supports a given -march=archXY flag. Together with the next patch, we thereby gain support for builds with LLVM/clang with DYNAMIC_ARCH=1. 
Signed-off-by: Marius Hillenbrand --- Makefile.system | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/Makefile.system b/Makefile.system index e7d3dc4ce8..f4a42f7297 100644 --- a/Makefile.system +++ b/Makefile.system @@ -295,7 +295,6 @@ endif ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) -GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) @@ -594,34 +593,34 @@ endif ifeq ($(ARCH), zarch) DYNAMIC_CORE = ZARCH_GENERIC -# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer -ifeq ($(GCCVERSIONGT5), 1) - ZARCH_SUPPORT_Z13 := 1 -else ifeq ($(GCCVERSIONEQ5), 1) -ifeq ($(GCCMINORVERSIONGTEQ2), 1) - ZARCH_SUPPORT_Z13 := 1 -endif -endif - -ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) -ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1) - ZARCH_SUPPORT_Z13 := 1 -endif -endif - -ifeq ($(ZARCH_SUPPORT_Z13), 1) +# if the compiler accepts -march=arch11 or -march=z13 and can compile a file +# with z13-specific inline assembly, then we can include support for Z13. +# note: -march=z13 is equivalent to -march=arch11 yet some compiler releases +# only support one or the other. +# note: LLVM version 6.x supported -march=z13 yet could not handle vector +# registers in inline assembly, so the check for supporting the -march flag is +# not enough. 
+ZARCH_TEST_COMPILE=-c $(TOPDIR)/kernel/zarch/damin_z13.c -I$(TOPDIR) -o /dev/null > /dev/null 2> /dev/null +ZARCH_CC_SUPPORTS_ARCH11=$(shell $(CC) -march=arch11 $(ZARCH_TEST_COMPILE) && echo 1) +ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1) + +ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1) DYNAMIC_CORE += Z13 else -$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) +$(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it) endif -ifeq ($(GCCVERSIONGTEQ7), 1) +# as above for z13, check for -march=arch12 and z14 support in the compiler. +ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && echo 1) +ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1) +ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1) DYNAMIC_CORE += Z14 else -$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) -endif +$(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it) endif +endif # ARCH zarch + ifeq ($(ARCH), power) DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 From 4f34bcfb5e2da40ffe02c9f0765b9f4e18e8f6f5 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Mon, 7 Sep 2020 17:04:03 +0200 Subject: [PATCH 180/349] s390x/DYNAMIC_ARCH: pass supported arch levels from Makefile to run-time code ... instead of duplicating the (old) mechanism from the Makefile that aimed to derive supported architecture generations from the gcc version. To enable builds with DYNAMIC_ARCH with older compiler releases, the Makefile and drivers/other/dynamic_arch.c need a common view of the architecture support built into the library. We follow the notation from x86 when used with DYNAMIC_LIST, where defines DYN_ denote support for a given generation to be built in. 
Since there are far fewer architecture generations in OpenBLAS for s390x, that does not bloat command lines too much. Signed-off-by: Marius Hillenbrand --- Makefile.system | 2 ++ driver/others/dynamic_zarch.c | 48 ++++++++++++++++------------------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/Makefile.system b/Makefile.system index f4a42f7297..1b832ba418 100644 --- a/Makefile.system +++ b/Makefile.system @@ -606,6 +606,7 @@ ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1) ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1) DYNAMIC_CORE += Z13 +CCOMMON_OPT += -DDYN_Z13 else $(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it) endif @@ -615,6 +616,7 @@ ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && ec ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1) ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1) DYNAMIC_CORE += Z14 +CCOMMON_OPT += -DDYN_Z14 else $(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it) endif diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index 403b341110..dac8909fbb 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -1,18 +1,6 @@ #include "common.h" #include -// Gate kernels for z13 and z14 on gcc version -#if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \ - /* RHEL 7 since 7.3: */ \ - (__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \ - __GNUC_RH_RELEASE__ >= 11) -#define HAVE_Z13_SUPPORT -#endif - -#if __GNUC__ >= 7 -#define HAVE_Z14_SUPPORT -#endif - // Guard the use of getauxval() on glibc version >= 2.16 #ifdef __GLIBC__ #include @@ -47,10 +35,10 @@ static unsigned long get_hwcap(void) { #endif // __GLIBC extern gotoblas_t gotoblas_ZARCH_GENERIC; -#ifdef HAVE_Z13_SUPPORT +#ifdef DYN_Z13 extern gotoblas_t gotoblas_Z13; #endif -#ifdef 
HAVE_Z14_SUPPORT +#ifdef DYN_Z14 extern gotoblas_t gotoblas_Z14; #endif @@ -66,10 +54,10 @@ static char* corename[] = { }; char* gotoblas_corename(void) { -#ifdef HAVE_Z13_SUPPORT +#ifdef DYN_Z13 if (gotoblas == &gotoblas_Z13) return corename[1]; #endif -#ifdef HAVE_Z14_SUPPORT +#ifdef DYN_Z14 if (gotoblas == &gotoblas_Z14) return corename[2]; #endif if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; @@ -89,15 +77,15 @@ static gotoblas_t* get_coretype(void) { unsigned long hwcap __attribute__((unused)) = get_hwcap(); +#ifdef DYN_Z14 // z14 and z15 systems: exploit Vector Facility (SIMD) and // Vector-Enhancements Facility 1 (float SIMD instructions), if present. -#ifdef HAVE_Z14_SUPPORT if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) return &gotoblas_Z14; #endif +#ifdef DYN_Z13 // z13: Vector Facility (SIMD for double) -#ifdef HAVE_Z13_SUPPORT if (hwcap & HWCAP_S390_VX) return &gotoblas_Z13; #endif @@ -123,19 +111,27 @@ static gotoblas_t* force_coretype(char* coretype) { } } - switch (found) - { -#ifdef HAVE_Z13_SUPPORT - case 1: return (&gotoblas_Z13); + if (found == 1) { +#ifdef DYN_Z13 + return &gotoblas_Z13; +#else + openblas_warning(1, "Z13 support not compiled in"); + return NULL; #endif -#ifdef HAVE_Z14_SUPPORT - case 2: return (&gotoblas_Z14); + } else if (found == 2) { +#ifdef DYN_Z14 + return &gotoblas_Z14; +#else + openblas_warning(1, "Z14 support not compiled in"); + return NULL; #endif - case 3: return (&gotoblas_ZARCH_GENERIC); - default: return NULL; + } else if (found == 3) { + return &gotoblas_ZARCH_GENERIC; } + snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); + return NULL; } void gotoblas_dynamic_init(void) { From a55fe06f251ff6269f4a126dec27f59bf3ea67f0 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Mon, 7 Sep 2020 17:13:03 +0200 Subject: [PATCH 181/349] s390x/DYNAMIC_ARCH: define a HW_CAP flag to support slightly older glibc versions Enable building DYNAMIC_ARCH support with 
older versions of glibc that do not know about the hwcap flag HWCAP_S390_VXE yet. Signed-off-by: Marius Hillenbrand --- driver/others/dynamic_zarch.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c index dac8909fbb..bf5eab9b20 100644 --- a/driver/others/dynamic_zarch.c +++ b/driver/others/dynamic_zarch.c @@ -65,6 +65,10 @@ char* gotoblas_corename(void) { return corename[0]; } +#ifndef HWCAP_S390_VXE +#define HWCAP_S390_VXE 8192 +#endif + /** * Detect the fitting set of kernels by retrieving the CPU features supported by * OS from the auxiliary value AT_HWCAP and choosing the set of kernels From f7731a358af7871a72dad3ada5d35963bb454ed7 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 8 Sep 2020 15:15:15 +0200 Subject: [PATCH 182/349] Update CONTRIBUTERS.md - clang build fixes for IBM z Signed-off-by: Marius Hillenbrand --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index aba39e56f9..7b994885a1 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -187,6 +187,7 @@ In chronological order: * Marius Hillenbrand * [2020-05-12] Revise dynamic architecture detection for IBM z * [2020-05-12] Add new sgemm and strmm kernel for IBM z14 + * [2020-09-07] Fix builds with clang on IBM z, including dynamic architecture support * Danfeng Zhang * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 \ No newline at end of file From 047b8d7aff79d31c25c8c6a46fd917fafe4ca8c8 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 8 Sep 2020 19:30:37 +0200 Subject: [PATCH 183/349] Add an s390 build with clang to the Travis configuration Since clang builds have been fixed on s390x, including support for DYNAMIC_ARCH, cover that build type in Travis. Explicitly request Ubuntu 20.04 (codename focal) to get a recent LLVM/clang version 10.x and thereby cover all s390x architecture generations supported in OpenBLAS. 
Ubuntu 18.10's LLVM/clang 6.x cannot build the inline assembly in some of the Z13 and Z14 kernels. LLVM/clang currently does not support OpenMP on s390x, so disable that in the build. Signed-off-by: Marius Hillenbrand --- .travis.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.travis.yml b/.travis.yml index 307010e40e..3f8f766fe9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -43,6 +43,18 @@ matrix: - TARGET_BOX=IBMZ_LINUX - BTYPE="BINARY=64 USE_OPENMP=1" + - <<: *test-ubuntu + os: linux + dist: focal + arch: s390x + compiler: clang + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32" + env: + # for matrix annotation only + - TARGET_BOX=IBMZ_LINUX + - BTYPE="BINARY=64 USE_OPENMP=0 CC=clang" + - <<: *test-ubuntu env: - TARGET_BOX=LINUX64 From 746ad3bd190493a7219bc02547a050772d4a4e01 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 18:40:59 +0200 Subject: [PATCH 184/349] Fix vendor match for GCC gfortran --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index dd4d3475c9..f894aa9ac5 100644 --- a/f_check +++ b/f_check @@ -69,7 +69,7 @@ if ($compiler eq "") { $bu = "_"; } - if ($data =~ /GNU/) { + if ($data =~ /GNU/ || $data =~ /GCC/ ) { $data =~ /(\d+)\.(\d+).(\d+)/; $major = $1; From 26792d2096ce0736a53bef6b8bf4ff0206ac3efa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 21:47:55 +0200 Subject: [PATCH 185/349] Copy BUILD_* directives to the compiler options to allow ifdef in tests --- cmake/system.cmake | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index c0f3c6ed2e..aa342c3d20 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -393,6 +393,18 @@ set(REVISION "-r${OpenBLAS_VERSION}") set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CCOMMON_OPT}") +if (BUILD_SINGLE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE") +endif() +if 
(BUILD_DOUBLE) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE") +endif() +if (BUILD_COMPLEX) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX") +endif() +if (BUILD_COMPLEX16) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") +endif() if(NOT MSVC) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") endif() From 74e358bcd514cff2e9b32c13571c09176b56a3d8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 21:49:01 +0200 Subject: [PATCH 186/349] Remove spurious complex16 tests --- ctest/c_dblas1.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/ctest/c_dblas1.c b/ctest/c_dblas1.c index e49ae60075..8e13afcaaf 100644 --- a/ctest/c_dblas1.c +++ b/ctest/c_dblas1.c @@ -74,16 +74,6 @@ void F77_dswap( const int *N, double *X, const int *incX, return; } -double F77_dzasum(const int *N, void *X, const int *incX) -{ - return cblas_dzasum(*N, X, *incX); -} - -double F77_dznrm2(const int *N, OPENBLAS_CONST void *X, const int *incX) -{ - return cblas_dznrm2(*N, X, *incX); -} - int F77_idamax(const int *N, OPENBLAS_CONST double *X, const int *incX) { if (*N < 1 || *incX < 1) return(0); From 593ce9e23786796a483f44436e4aca57d042f05d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 21:50:12 +0200 Subject: [PATCH 187/349] Make building individual tests depend on BUILD_SINGLE etc defines --- test/CMakeLists.txt | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index adeee34525..f1f773cbaf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -3,11 +3,18 @@ include_directories(${PROJECT_BINARY_DIR}) enable_language(Fortran) -set(OpenBLAS_Tests - sblat1 sblat2 sblat3 - dblat1 dblat2 dblat3 - cblat1 cblat2 cblat3 - zblat1 zblat2 zblat3) +if (BUILD_SINGLE) + list( APPEND OpenBLAS_Tests sblat1 sblat2 sblat3) +endif() +if (BUILD_DOUBLE) + list (APPEND OpenBLAS_Tests dblat1 dblat2 dblat3) +endif() +if (BUILD_COMPLEX) + list (APPEND 
OpenBLAS_Tests cblat1 cblat2 cblat3) +endif() +if (BUILD_COMPLEX16) + list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3) +endif() foreach(test_bin ${OpenBLAS_Tests}) add_executable(${test_bin} ${test_bin}.f) From ce8939863626d3a194890e87edc9b7280f73b660 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 21:52:18 +0200 Subject: [PATCH 188/349] Make tests for individual variable types conditional on the respective BUILD_ option --- utest/test_amax.c | 6 +++++- utest/test_axpy.c | 9 +++++++++ utest/test_dotu.c | 3 +++ utest/test_ismin.c | 2 ++ utest/test_min.c | 13 +++++++++++-- utest/test_potrs.c | 39 ++++++++++++++++++++++++++++++--------- utest/test_rot.c | 9 +++++++++ utest/test_swap.c | 9 +++++++++ 8 files changed, 78 insertions(+), 12 deletions(-) diff --git a/utest/test_amax.c b/utest/test_amax.c index 8318040271..a9e5a1c858 100644 --- a/utest/test_amax.c +++ b/utest/test_amax.c @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "openblas_utest.h" +#ifdef BUILD_SINGLE CTEST(amax, samax){ blasint N=3, inc=1; float te_max=0.0, tr_max=0.0; @@ -43,7 +44,8 @@ CTEST(amax, samax){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); } - +#endif +#ifdef BUILD_DOUBLE CTEST(amax, damax){ blasint N=3, inc=1; double te_max=0.0, tr_max=0.0; @@ -54,3 +56,5 @@ CTEST(amax, damax){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); } +#endif + diff --git a/utest/test_axpy.c b/utest/test_axpy.c index 6030430735..5fd7c1b04d 100644 --- a/utest/test_axpy.c +++ b/utest/test_axpy.c @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "openblas_utest.h" +#ifdef BUILD_DOUBLE CTEST(axpy,daxpy_inc_0) { blasint i; @@ -52,7 +53,9 @@ CTEST(axpy,daxpy_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +#endif +#ifdef BUILD_COMPLEX16 CTEST(axpy,zaxpy_inc_0) { blasint i; @@ -71,7 +74,9 @@ CTEST(axpy,zaxpy_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +#endif +#ifdef BUILD_SINGLE CTEST(axpy,saxpy_inc_0) { blasint i; @@ -90,7 +95,9 @@ CTEST(axpy,saxpy_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +#endif +#ifdef BUILD_COMPLEX CTEST(axpy,caxpy_inc_0) { blasint i; @@ -109,3 +116,5 @@ CTEST(axpy,caxpy_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +#endif + diff --git a/utest/test_dotu.c b/utest/test_dotu.c index 918541848d..5422864038 100644 --- a/utest/test_dotu.c +++ b/utest/test_dotu.c @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "openblas_utest.h" +#ifdef BUILD_COMPLEX16 CTEST( zdotu,zdotu_n_1) { blasint N=1,incX=1,incY=1; @@ -80,3 +81,5 @@ CTEST(zdotu, zdotu_offset_1) #endif } +#endif + diff --git a/utest/test_ismin.c b/utest/test_ismin.c index f23d6b5457..af597807f7 100644 --- a/utest/test_ismin.c +++ b/utest/test_ismin.c @@ -36,6 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ELEMENTS 50 #define INCREMENT 2 +#ifdef BUILD_SINGLE CTEST(ismin, positive_step_2){ blasint i; blasint N = ELEMENTS, inc = INCREMENT; @@ -87,3 +88,4 @@ CTEST(ismax, negative_step_2){ blasint index = BLASFUNC(ismax)(&N, x, &inc); ASSERT_EQUAL(9, index); } +#endif diff --git a/utest/test_min.c b/utest/test_min.c index fd31b59828..a627674aec 100644 --- a/utest/test_min.c +++ b/utest/test_min.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
**********************************************************************************/ #include "openblas_utest.h" - +#ifdef BUILD_SINGLE CTEST(min, smin_negative){ blasint N=3, inc=1; float te_min=0.0, tr_min=0.0; @@ -43,7 +43,9 @@ CTEST(min, smin_negative){ ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); } +#endif +#ifdef BUILD_DOUBLE CTEST(min, dmin_positive){ blasint N=3, inc=1; double te_min=0.0, tr_min=0.0; @@ -54,7 +56,9 @@ CTEST(min, dmin_positive){ ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); } +#endif +#ifdef BUILD_SINGLE CTEST(min, smin_zero){ blasint N=3, inc=1; float te_min=0.0, tr_min=0.0; @@ -76,7 +80,9 @@ CTEST(max, smax_negative){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); } +#endif +#ifdef BUILD_DOUBLE CTEST(max, dmax_positive){ blasint N=3, inc=1; double te_max=0.0, tr_max=0.0; @@ -87,7 +93,8 @@ CTEST(max, dmax_positive){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); } - +#endif +#ifdef BUILD_SINGLE CTEST(max, smax_zero){ blasint N=3, inc=1; float te_max=0.0, tr_max=0.0; @@ -98,3 +105,5 @@ CTEST(max, smax_zero){ ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); } +#endif + diff --git a/utest/test_potrs.c b/utest/test_potrs.c index 7afeb4c9d8..05ce3037be 100644 --- a/utest/test_potrs.c +++ b/utest/test_potrs.c @@ -39,10 +39,10 @@ void BLASFUNC(zpotrs_(char*, BLASINT*, BLASINT*, complex double*, BLASINT*, complex double*, BLASINT*, BLASINT*); */ - //https://github.com/xianyi/OpenBLAS/issues/695 CTEST(potrf, bug_695){ +#ifdef BUILD_COMPLEX openblas_complex_float A1[100] = { openblas_make_complex_float(5.8525753, +0.0), @@ -153,7 +153,9 @@ CTEST(potrf, bug_695){ blasint info[1]; BLASFUNC(cpotrf)(&up, &n, (float*)(A1), &n, info); //printf("%g+%g*I\n", creal(A1[91]), cimag(A1[91])); +#endif +#ifdef BUILD_COMPLEX16 openblas_complex_double A2[100] = { openblas_make_complex_double(3.0607147216796875, +0.0), @@ -283,7 +285,8 @@ CTEST(potrf, 
bug_695){ char lo = 'L'; blasint nrhs = 2; BLASFUNC(zpotrs)(&lo, &n, &nrhs, (double*)(A2), &n, (double*)(B), &n, info); - +#endif +#ifdef BUILD_COMPLEX // note that this is exactly equal to A1 openblas_complex_float A3[100] = { @@ -393,9 +396,9 @@ CTEST(potrf, bug_695){ if(isnan(CREAL(A3[91])) || isnan(CIMAG(A3[91]))) { CTEST_ERR("%s:%d got NaN", __FILE__, __LINE__); } +#endif } - // Check potrf factorizes a small problem correctly CTEST(potrf, smoketest_trivial){ float A1s[4] = {2, 0.3, 0.3, 3}; @@ -439,31 +442,43 @@ CTEST(potrf, smoketest_trivial){ uplo = 'U'; } +#ifdef BUILD_SINGLE BLASFUNC(scopy)(&nv, A1s, &inc, As, &inc); +#endif +#ifdef BUILD_DOUBLE BLASFUNC(dcopy)(&nv, A1d, &inc, Ad, &inc); +#endif +#ifdef BUILD_COMPLEX BLASFUNC(ccopy)(&nv, (float *)A1c, &inc, (float *)Ac, &inc); +#endif +#ifdef BUILD_COMPLEX16 BLASFUNC(zcopy)(&nv, (double *)A1z, &inc, (double *)Az, &inc); +#endif +#ifdef BUILD_SINGLE BLASFUNC(spotrf)(&uplo, &n, As, &n, &info); if (info != 0) { CTEST_ERR("%s:%d info != 0", __FILE__, __LINE__); } - +#endif +#ifdef BUILD_DOUBLE BLASFUNC(dpotrf)(&uplo, &n, Ad, &n, &info); if (info != 0) { CTEST_ERR("%s:%d info != 0", __FILE__, __LINE__); } - +#endif +#ifdef BUILD_COMPLEX BLASFUNC(cpotrf)(&uplo, &n, (float *)Ac, &n, &info); if (info != 0) { CTEST_ERR("%s:%d info != 0", __FILE__, __LINE__); } - +#endif +#ifdef BUILD_COMPLEX16 BLASFUNC(zpotrf)(&uplo, &n, (double *)Az, &n, &info); if (info != 0) { CTEST_ERR("%s:%d info != 0", __FILE__, __LINE__); } - +#endif /* Fill the other triangle */ if (uplo == 'L') { for (i = 0; i < n; ++i) { @@ -495,14 +510,20 @@ CTEST(potrf, smoketest_trivial){ trans1 = 'C'; trans2 = 'N'; } - +#ifdef BUILD_SINGLE BLASFUNC(sgemm)(&trans1, &trans2, &n, &n, &n, &ones, As, &n, As, &n, &zeros, Bs, &n); +#endif +#ifdef BUILD_DOUBLE BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, Ad, &n, Ad, &n, &zerod, Bd, &n); +#endif +#ifdef BUILD_COMPLEX BLASFUNC(cgemm)(&trans1, &trans2, &n, &n, &n, (float *)&onec, (float *)Ac, &n, (float 
*)Ac, &n, (float *)&zeroc, (float *)Bc, &n); +#endif +#ifdef BUILD_COMPLEX16 BLASFUNC(zgemm)(&trans1, &trans2, &n, &n, &n, (double *)&onez, (double *)Az, &n, (double *)Az, &n, (double *)&zeroz, (double *)Bz, &n); - +#endif /* Check result is close to original */ for (i = 0; i < n; ++i) { for (j = 0; j < n; ++j) { diff --git a/utest/test_rot.c b/utest/test_rot.c index cf72ad22d7..0e74ecbb36 100644 --- a/utest/test_rot.c +++ b/utest/test_rot.c @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "openblas_utest.h" +#ifdef BUILD_DOUBLE CTEST(rot,drot_inc_0) { blasint i=0; @@ -52,7 +53,9 @@ CTEST(rot,drot_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +#endif +#ifdef BUILD_COMPLEX16 CTEST(rot,zdrot_inc_0) { blasint i=0; @@ -72,7 +75,9 @@ CTEST(rot,zdrot_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +#endif +#ifdef BUILD_SINGLE CTEST(rot,srot_inc_0) { blasint i=0; @@ -91,7 +96,9 @@ CTEST(rot,srot_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); } } +#endif +#ifdef BUILD_COMPLEX CTEST(rot, csrot_inc_0) { blasint i=0; @@ -110,3 +117,5 @@ CTEST(rot, csrot_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); } } +#endif + diff --git a/utest/test_swap.c b/utest/test_swap.c index 259c83a5c8..6d8ae80566 100644 --- a/utest/test_swap.c +++ b/utest/test_swap.c @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "openblas_utest.h" +#ifdef BUILD_DOUBLE CTEST(swap,dswap_inc_0) { blasint i=0; @@ -50,7 +51,9 @@ CTEST(swap,dswap_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +#endif +#ifdef BUILD_COMPLEX16 CTEST(swap,zswap_inc_0) { blasint i=0; @@ -68,7 +71,9 @@ CTEST(swap,zswap_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } +#endif +#ifdef BUILD_SINGLE CTEST(swap,sswap_inc_0) { blasint i=0; @@ -86,7 +91,9 @@ CTEST(swap,sswap_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); } } +#endif +#ifdef BUILD_COMPLEX CTEST(swap,cswap_inc_0) { blasint i=0; @@ -104,3 +111,5 @@ CTEST(swap,cswap_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); } } +#endif + From ec2948f14784c3559b11f9aed07646396c3527cf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 22:17:46 +0200 Subject: [PATCH 189/349] Make tests conditional on BUILD_DOUBLE --- utest/test_kernel_regress.c | 2 ++ utest/test_rotmg.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/utest/test_kernel_regress.c b/utest/test_kernel_regress.c index 93a30b30cb..5b131bb2cc 100644 --- a/utest/test_kernel_regress.c +++ b/utest/test_kernel_regress.c @@ -22,6 +22,7 @@ double m[DATASIZE*DATASIZE]; CTEST(kernel_regress,skx_avx) { +#ifdef BUILD_DOUBLE double norm; int i, j, info; srand(0); @@ -47,4 +48,5 @@ CTEST(kernel_regress,skx_avx) norm = cblas_dnrm2(DATASIZE*DATASIZE, X, 1); ASSERT_DBL_NEAR_TOL(0.0, norm, 1e-10); +#endif } diff --git a/utest/test_rotmg.c b/utest/test_rotmg.c index e5ec789835..ad435f6b0b 100644 --- a/utest/test_rotmg.c +++ b/utest/test_rotmg.c @@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "openblas_utest.h" +#ifdef BUILD_DOUBLE CTEST (drotmg,rotmg) { double te_d1, tr_d1; @@ -204,3 +205,4 @@ CTEST(drotmg, drotmg_D1_big_D2_big_flag_zero) ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); } } +#endif From de139337b8bcb1c76cd157afd4d5fd035a76efdf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 22:20:41 +0200 Subject: [PATCH 190/349] Remove spurious tests for complex ASUM and NRM2 --- ctest/c_sblas1.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/ctest/c_sblas1.c b/ctest/c_sblas1.c index 1a433b287b..a562014a4e 100644 --- a/ctest/c_sblas1.c +++ b/ctest/c_sblas1.c @@ -21,16 +21,6 @@ void F77_saxpy(blasint *N, const float *alpha, OPENBLAS_CONST float *X, return; } -float F77_scasum(blasint *N, float *X, blasint *incX) -{ - return cblas_scasum(*N, X, *incX); -} - -float F77_scnrm2(blasint *N, OPENBLAS_CONST float *X, blasint *incX) -{ - return cblas_scnrm2(*N, X, *incX); -} - void F77_scopy(blasint *N, OPENBLAS_CONST float *X, blasint *incX, float *Y, blasint *incY) { From 4d250d0cdf9f0d234aa9c3eeff246bbe1b9edd3b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 23:29:01 +0200 Subject: [PATCH 191/349] Rearrange ifdefs --- utest/test_potrs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utest/test_potrs.c b/utest/test_potrs.c index 05ce3037be..2681615f46 100644 --- a/utest/test_potrs.c +++ b/utest/test_potrs.c @@ -42,7 +42,6 @@ void BLASFUNC(zpotrs_(char*, BLASINT*, BLASINT*, complex double*, //https://github.com/xianyi/OpenBLAS/issues/695 CTEST(potrf, bug_695){ -#ifdef BUILD_COMPLEX openblas_complex_float A1[100] = { openblas_make_complex_float(5.8525753, +0.0), @@ -151,11 +150,11 @@ CTEST(potrf, bug_695){ blasint n=10; blasint info[1]; +#ifdef BUILD_COMPLEX BLASFUNC(cpotrf)(&up, &n, (float*)(A1), &n, info); //printf("%g+%g*I\n", creal(A1[91]), cimag(A1[91])); #endif -#ifdef BUILD_COMPLEX16 openblas_complex_double A2[100] = { 
openblas_make_complex_double(3.0607147216796875, +0.0), @@ -284,9 +283,9 @@ CTEST(potrf, bug_695){ }; char lo = 'L'; blasint nrhs = 2; +#ifdef BUILD_COMPLEX16 BLASFUNC(zpotrs)(&lo, &n, &nrhs, (double*)(A2), &n, (double*)(B), &n, info); #endif -#ifdef BUILD_COMPLEX // note that this is exactly equal to A1 openblas_complex_float A3[100] = { @@ -391,6 +390,7 @@ CTEST(potrf, bug_695){ openblas_make_complex_float(-0.9617417, -1.2486815), openblas_make_complex_float(3.4629636, +0.0) }; +#ifdef BUILD_COMPLEX BLASFUNC(cpotrf)(&up, &n, (float*)(A3), &n, info); // printf("%g+%g*I\n", creal(A3[91]), cimag(A3[91])); if(isnan(CREAL(A3[91])) || isnan(CIMAG(A3[91]))) { From 9e11c2d62f23ef2483d206aaf3952e0bd09d30cb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 13 Sep 2020 23:55:11 +0200 Subject: [PATCH 192/349] Add BUILD_SINGLE etc --- Makefile.rule | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.rule b/Makefile.rule index 2c12177ee6..40bd1a8541 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -277,5 +277,10 @@ COMMON_PROF = -pg # If you want to enable the experimental BFLOAT16 support # BUILD_HALF = 1 # +# the below is not yet configurable, use cmake if you need to build only select types +BUILD_SINGLE = 1 +BUILD_DOUBLE = 1 +BUILD_COMPLEX = 1 +BUILD_COMPLEX16 = 1 # End of user configuration # From ba644378dce720f6bb946aa2b585c9e71f257e1f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 14 Sep 2020 00:03:33 +0200 Subject: [PATCH 193/349] Copy BUILD_ options available to the compiler flags --- Makefile.system | 55 +++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/Makefile.system b/Makefile.system index 1b832ba418..0ccf9eaed7 100644 --- a/Makefile.system +++ b/Makefile.system @@ -295,6 +295,7 @@ endif ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) +GCCVERSIONEQ5 
:= $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) @@ -593,35 +594,33 @@ endif ifeq ($(ARCH), zarch) DYNAMIC_CORE = ZARCH_GENERIC -# if the compiler accepts -march=arch11 or -march=z13 and can compile a file -# with z13-specific inline assembly, then we can include support for Z13. -# note: -march=z13 is equivalent to -march=arch11 yet some compiler releases -# only support one or the other. -# note: LLVM version 6.x supported -march=z13 yet could not handle vector -# registers in inline assembly, so the check for supporting the -march flag is -# not enough. -ZARCH_TEST_COMPILE=-c $(TOPDIR)/kernel/zarch/damin_z13.c -I$(TOPDIR) -o /dev/null > /dev/null 2> /dev/null -ZARCH_CC_SUPPORTS_ARCH11=$(shell $(CC) -march=arch11 $(ZARCH_TEST_COMPILE) && echo 1) -ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1) - -ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1) +# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer +ifeq ($(GCCVERSIONGT5), 1) + ZARCH_SUPPORT_Z13 := 1 +else ifeq ($(GCCVERSIONEQ5), 1) +ifeq ($(GCCMINORVERSIONGTEQ2), 1) + ZARCH_SUPPORT_Z13 := 1 +endif +endif + +ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) +ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1) + ZARCH_SUPPORT_Z13 := 1 +endif +endif + +ifeq ($(ZARCH_SUPPORT_Z13), 1) DYNAMIC_CORE += Z13 -CCOMMON_OPT += -DDYN_Z13 else -$(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it) +$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) endif -# as above for z13, check for -march=arch12 and z14 support in the compiler. 
-ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && echo 1) -ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1) -ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1) +ifeq ($(GCCVERSIONGTEQ7), 1) DYNAMIC_CORE += Z14 -CCOMMON_OPT += -DDYN_Z14 else -$(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it) +$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) +endif endif - -endif # ARCH zarch ifeq ($(ARCH), power) DYNAMIC_CORE = POWER6 @@ -1223,6 +1222,18 @@ endif ifeq ($(BUILD_HALF), 1) CCOMMON_OPT += -DBUILD_HALF endif +ifeq ($(BUILD_SINGLE), 1) +CCOMMON_OPT += -DBUILD_SINGLE +endif +ifeq ($(BUILD_DOUBLE), 1) +CCOMMON_OPT += -DBUILD_DOUBLE +endif +ifeq ($(BUILD_COMPLEX), 1) +CCOMMON_OPT += -DBUILD_COMPLEX +endif +ifeq ($(BUILD_COMPLEX16), 1) +CCOMMON_OPT += -DBUILD_COMPLEX16 +endif CCOMMON_OPT += -DVERSION=\"$(VERSION)\" From 274d6e015b56a9f0ccad928232ed3bd88a063754 Mon Sep 17 00:00:00 2001 From: fossum Date: Mon, 14 Sep 2020 13:10:48 -0500 Subject: [PATCH 194/349] Fixing a performance bug in trsm_[LR].c. 
--- driver/level3/trsm_L.c | 4 ++-- driver/level3/trsm_R.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/driver/level3/trsm_L.c b/driver/level3/trsm_L.c index d8130ee7e1..d842efa930 100644 --- a/driver/level3/trsm_L.c +++ b/driver/level3/trsm_L.c @@ -131,7 +131,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -197,7 +197,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; diff --git a/driver/level3/trsm_R.c b/driver/level3/trsm_R.c index f6a57f93fd..f76a8f7f34 100644 --- a/driver/level3/trsm_R.c +++ b/driver/level3/trsm_R.c @@ -126,7 +126,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -182,7 +182,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_j - min_l - ls + js; jjs += min_jj){ min_jj = min_j - min_l - ls + js - jjs; - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -243,7 +243,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = js; jjs < js + min_j; jjs += 
min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; @@ -304,7 +304,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO for(jjs = 0; jjs < min_j - js + ls; jjs += min_jj){ min_jj = min_j - js + ls - jjs; - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; From dfeca46098ff7b3cc47aa053195fe1c82bce87e9 Mon Sep 17 00:00:00 2001 From: fossum Date: Tue, 15 Sep 2020 08:59:50 -0500 Subject: [PATCH 195/349] Adding performance patch for trmm, just like #2836 --- driver/level3/trmm_L.c | 8 ++++---- driver/level3/trmm_R.c | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c index 1027c0c737..ae8435d036 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -139,7 +139,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -209,7 +209,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -304,7 +304,7 @@ int CNAME(blas_arg_t 
*args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -374,7 +374,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index e8df7fb210..3be43edded 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -126,7 +126,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -150,7 +150,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -207,7 +207,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, 
FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -262,7 +262,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -287,7 +287,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif @@ -348,7 +348,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; #else - if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; + if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; else if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif From c4aeeeb9f4d59a28ca91382bc77e55d9abbaa6e7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 15 Sep 2020 23:15:34 +0200 Subject: [PATCH 196/349] Activate all BUILD_ options if none was specified --- cmake/system.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git 
a/cmake/system.cmake b/cmake/system.cmake index aa342c3d20..8908a18908 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -393,6 +393,13 @@ set(REVISION "-r${OpenBLAS_VERSION}") set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CCOMMON_OPT}") + +if (NOT BUILD_SINGLE AND NOT BUILD_DOUBLE AND NOT BUILD_COMPLEX AND NOT BUILD_COMPLEX16) + set (BUILD_SINGLE ON) + set (BUILD_DOUBLE ON) + set (BUILD_COMPLEX ON) + set (BUILD_COMPLEX16 ON) +endif() if (BUILD_SINGLE) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE") endif() From 2e3b15d68bc108c112abdc0ea3dc8074134b3815 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Sep 2020 13:43:55 +0200 Subject: [PATCH 197/349] Add CMakeLists.txt --- cpp_thread_test/CMakeLists.txt | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 cpp_thread_test/CMakeLists.txt diff --git a/cpp_thread_test/CMakeLists.txt b/cpp_thread_test/CMakeLists.txt new file mode 100644 index 0000000000..5eccb12ceb --- /dev/null +++ b/cpp_thread_test/CMakeLists.txt @@ -0,0 +1,23 @@ +include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${PROJECT_BINARY_DIR}) + +enable_language(CXX) + +set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") + +if (USE_OPENMP) +if (CPP_THREAD_SAFETY_TEST) + message(STATUS building thread safety test) + add_executable(dgemm_thread_safety dgemm_thread_safety.cpp) + target_link_libraries(dgemm_thread_safety ${OpenBLAS_LIBNAME}) + add_test( dgemm_thread_safety ${CMAKE_CURRENT_BINARY_DIR}/dgemm_thread_safety) +endif() + + +if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) + add_executable(dgemv_thread_safety dgemv_thread_safety.cpp) + target_link_libraries(dgemv_thread_safety ${OpenBLAS_LIBNAME}) + add_test(dgemv_thread_safety ${CMAKE_CURRENT_BINARY_DIR}/dgemv_thread_safety) +endif() + +endif() From 8c5c991bd7e4eb89fc46d6c5ac41bd5ab9363836 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Sep 2020 13:45:40 +0200 Subject: [PATCH 
198/349] Add cpp_thread_test options --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b82d76704..954c053e46 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,6 +29,8 @@ option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding proc else() set(NO_AFFINITY 1) endif() +option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) +option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) # Add a prefix or suffix to all exported symbol names in the shared library. # Avoids conflicts with other BLAS libraries, especially when using @@ -234,6 +236,9 @@ if (NOT MSVC AND NOT NOFORTRAN) add_subdirectory(ctest) endif() add_subdirectory(lapack-netlib/TESTING) + if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) + add_subdirectory(cpp_thread_test) + endif() endif() set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES From 84c00c3c6e3f8f1344d632a559610d03a861f9fb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Sep 2020 13:46:41 +0200 Subject: [PATCH 199/349] Support running just the GEMV version of the thread safety test --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index 7a03b08f01..93e8af2eb4 100644 --- a/Makefile +++ b/Makefile @@ -146,6 +146,9 @@ ifneq ($(NO_CBLAS), 1) ifeq ($(CPP_THREAD_SAFETY_TEST), 1) $(MAKE) -C cpp_thread_test all endif +ifeq ($(CPP_THREAD_SAFETY_GEMV), 1) + $(MAKE) -C cpp_thread_test dgemv_tester +endif endif endif From 6abca76c4e0171a598ffc7f3bef8279c13d71546 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Sep 2020 13:49:24 +0200 Subject: [PATCH 200/349] Add option for running only the less demanding GEMV version of the thread safety tests --- Makefile.rule | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile.rule 
b/Makefile.rule index 40bd1a8541..4d6f2d313f 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -272,6 +272,9 @@ COMMON_PROF = -pg # work at all. # # CPP_THREAD_SAFETY_TEST = 1 +# +# use this to run only the less memory-hungry GEMV test +# CPP_THREAD_SAFETY_GEMV = 1 # If you want to enable the experimental BFLOAT16 support From 75d440caa083a32ca3b30809f18f1e29c75a967b Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Thu, 17 Sep 2020 16:45:07 +0200 Subject: [PATCH 201/349] s390x/DYNAMIC_ARCH: fixup broken merge and reapply simplification An unrelated commit and merge inadvertently reverted our recent two changes for simplifying DYNAMIC_ARCH on s390x. Simply reapply the changes. Simplify detection of which kernels we can compile on s390x. Instead of decoding the gcc version in a complicated manner, just check if CC supports a given -march=archXY flag. Together with the next patch, we thereby gain support for builds with LLVM/clang with DYNAMIC_ARCH=1. To enable builds with DYNAMIC_ARCH with older compiler releases, the Makefile and drivers/other/dynamic_arch.c need a common view of the architecture support built into the library. We follow the notation from x86 when used with DYNAMIC_LIST, where defines DYN_ denote support for a given generation to be built in. Since there are far fewer architecture generations in OpenBLAS for s390x, that does not bloat command lines too much. 
Closes: #2842 Fixes: ba644378dce7 ("Copy BUILD_ options available to the compiler flags") Signed-off-by: Marius Hillenbrand --- Makefile.system | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/Makefile.system b/Makefile.system index 0ccf9eaed7..c46c88581f 100644 --- a/Makefile.system +++ b/Makefile.system @@ -295,7 +295,6 @@ endif ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) -GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) @@ -594,34 +593,36 @@ endif ifeq ($(ARCH), zarch) DYNAMIC_CORE = ZARCH_GENERIC -# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer -ifeq ($(GCCVERSIONGT5), 1) - ZARCH_SUPPORT_Z13 := 1 -else ifeq ($(GCCVERSIONEQ5), 1) -ifeq ($(GCCMINORVERSIONGTEQ2), 1) - ZARCH_SUPPORT_Z13 := 1 -endif -endif - -ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) -ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1) - ZARCH_SUPPORT_Z13 := 1 -endif -endif - -ifeq ($(ZARCH_SUPPORT_Z13), 1) +# if the compiler accepts -march=arch11 or -march=z13 and can compile a file +# with z13-specific inline assembly, then we can include support for Z13. +# note: -march=z13 is equivalent to -march=arch11 yet some compiler releases +# only support one or the other. +# note: LLVM version 6.x supported -march=z13 yet could not handle vector +# registers in inline assembly, so the check for supporting the -march flag is +# not enough. 
+ZARCH_TEST_COMPILE=-c $(TOPDIR)/kernel/zarch/damin_z13.c -I$(TOPDIR) -o /dev/null > /dev/null 2> /dev/null +ZARCH_CC_SUPPORTS_ARCH11=$(shell $(CC) -march=arch11 $(ZARCH_TEST_COMPILE) && echo 1) +ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1) + +ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1) DYNAMIC_CORE += Z13 +CCOMMON_OPT += -DDYN_Z13 else -$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) +$(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it) endif -ifeq ($(GCCVERSIONGTEQ7), 1) +# as above for z13, check for -march=arch12 and z14 support in the compiler. +ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && echo 1) +ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1) +ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1) DYNAMIC_CORE += Z14 +CCOMMON_OPT += -DDYN_Z14 else -$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) -endif +$(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it) endif +endif # ARCH zarch + ifeq ($(ARCH), power) DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 From be43d2cb9651d37aed44307037dc98b837f95358 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 17 Sep 2020 12:56:28 -0500 Subject: [PATCH 202/349] Optimize daxpy/zaxpy for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. Tested in simulator and no new failures. 
--- kernel/power/KERNEL.POWER10 | 4 +- kernel/power/daxpy_microk_power10.c | 131 ++++++++++++++++++ kernel/power/daxpy_power10.c | 121 +++++++++++++++++ kernel/power/zaxpy_microk_power10.c | 200 ++++++++++++++++++++++++++++ kernel/power/zaxpy_power10.c | 126 ++++++++++++++++++ 5 files changed, 580 insertions(+), 2 deletions(-) create mode 100644 kernel/power/daxpy_microk_power10.c create mode 100644 kernel/power/daxpy_power10.c create mode 100644 kernel/power/zaxpy_microk_power10.c create mode 100644 kernel/power/zaxpy_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index f390fac61d..ec02e09adb 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -142,13 +142,13 @@ CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c -DAXPYKERNEL = daxpy.c +DAXPYKERNEL = daxpy_power10.c ifneq ($(GCCVERSIONGTEQ9),1) CAXPYKERNEL = caxpy_power9.S else CAXPYKERNEL = caxpy.c endif -ZAXPYKERNEL = zaxpy.c +ZAXPYKERNEL = zaxpy_power10.c # SCOPYKERNEL = scopy.c DCOPYKERNEL = dcopy.c diff --git a/kernel/power/daxpy_microk_power10.c b/kernel/power/daxpy_microk_power10.c new file mode 100644 index 0000000000..bc9199efd3 --- /dev/null +++ b/kernel/power/daxpy_microk_power10.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) +{ + __vector double t0; + + __asm__ + ( + XXSPLTD_S(%x4,%x6,0) + + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 40, 64(%2) \n\t" + "lxvp 42, 96(%2) \n\t" + + "lxvp 36, 0(%3) \n\t" + "lxvp 38, 32(%3) \n\t" + "lxvp 44, 64(%3) \n\t" + "lxvp 46, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddadp 36, 32, %x4 \n\t" + "xvmaddadp 37, 33, %x4 \n\t" + + "lxvp 32, 0(%2) \n\t" + "stxvp 36, 0(%3) \n\t" + + "xvmaddadp 38, 34, %x4 \n\t" + "xvmaddadp 39, 35, %x4 \n\t" + + "lxvp 34, 32(%2) \n\t" + "stxvp 38, 32(%3) \n\t" + + + "lxvp 36, 128(%3) \n\t" + "lxvp 38, 160(%3) \n\t" + + "xvmaddadp 44, 40, %x4 \n\t" + "xvmaddadp 45, 41, %x4 \n\t" + + "lxvp 40, 64(%2) \n\t" + "stxvp 44, 64(%3) \n\t" + + "xvmaddadp 46, 42, %x4 \n\t" + "xvmaddadp 47, 43, %x4 \n\t" + + "lxvp 42, 96(%2) \n\t" + "stxvp 46, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "lxvp 44, 64(%3) \n\t" + "lxvp 46, 96(%3) \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddadp 36, 32, %x4 \n\t" + "xvmaddadp 37, 33, %x4 \n\t" + "xvmaddadp 38, 34, %x4 \n\t" + "xvmaddadp 39, 35, %x4 \n\t" + + "xvmaddadp 44, 40, %x4 \n\t" + "xvmaddadp 45, 41, %x4 \n\t" + "xvmaddadp 46, 42, %x4 \n\t" + "xvmaddadp 47, 43, %x4 \n\t" + + "stxvp 36, 0(%3) \n\t" + "stxvp 38, 32(%3) \n\t" + "stxvp 44, 64(%3) \n\t" + "stxvp 46, 96(%3) \n\t" + + "#n=%1 x=%5=%2 y=%0=%3 alpha=%6 t0=%x4\n" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y), // 3 + "=wa" (t0) // 4 + : + "m" (*x), + "d" (alpha) // 6 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", "vs38", "vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" + ); + +} + + diff --git a/kernel/power/daxpy_power10.c b/kernel/power/daxpy_power10.c new file mode 100644 index 0000000000..ebe91a80f4 --- /dev/null +++ b/kernel/power/daxpy_power10.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "daxpy_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) +{ + BLASLONG register i = 0; + + while(i < n) + { + y[i] += alpha * x[i]; + y[i+1] += alpha * x[i+1]; + y[i+2] += alpha * x[i+2]; + y[i+3] += alpha * x[i+3]; + y[i+4] += alpha * x[i+4]; + y[i+5] += alpha * x[i+5]; + y[i+6] += alpha * x[i+6]; + y[i+7] += alpha * x[i+7]; + i+=8 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + daxpy_kernel_8(n1, x, y, da); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return(0); + + + } + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/power/zaxpy_microk_power10.c b/kernel/power/zaxpy_microk_power10.c new file mode 100644 index 0000000000..8e593bbfab --- /dev/null +++ b/kernel/power/zaxpy_microk_power10.c @@ -0,0 +1,200 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_4 1 +static void zaxpy_kernel_4 (long n, double *x, double *y, + double alpha_r, double alpha_i) +{ +#if !defined(CONJ) + static const double mvec[2] = { 1.0, -1.0 }; +#else + static const double mvec[2] = { -1.0, 1.0 }; +#endif + const double *mvecp = mvec; + + __vector double t0; + __vector double t1; + __vector double t2; + __vector double t3; + __vector double t4; + __vector double t5; + __vector double t6; + __vector double t7; + long ytmp; + + __asm__ + ( + XXSPLTD_S(32,%x15,0) // alpha_r + XXSPLTD_S(33,%x16,0) // alpha_i + "lxvd2x 36, 0, %17 \n\t" // mvec + +#if !defined(CONJ) + "xvmuldp 33, 33, 36 \n\t" // alpha_i * mvec +#else + "xvmuldp 32, 32, 36 \n\t" // alpha_r * mvec +#endif + + "mr %12, %3 \n\t" + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + + "lxvp 40, 0(%2) \n\t" // x0 + "lxvp 42, 32(%2) \n\t" // x2 + "lxvp 48, 0(%3) \n\t" // y0 + "lxvp 50, 32(%3) \n\t" // y2 + + XXSWAPD_S(%x4,40) // exchange real and imag part + XXSWAPD_S(%x5,41) // exchange real and imag part + XXSWAPD_S(%x6,42) // exchange real and imag part + XXSWAPD_S(%x7,43) // exchange real and imag part + + "lxvp 44, 64(%2) \n\t" // x4 + "lxvp 46, 96(%2) \n\t" // x6 + "lxvp 34, 64(%3) \n\t" // y4 + "lxvp 38, 96(%3) \n\t" // y6 + + XXSWAPD_S(%x8,44) // exchange real and imag part + XXSWAPD_S(%x9,45) // exchange real and imag part + XXSWAPD_S(%x10,46) // exchange real and imag part + XXSWAPD_S(%x11,47) // exchange real and imag part + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -8 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddadp 49, 41, 32 \n\t" + "lxvp 40, 0(%2) \n\t" // x0 + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + "lxvp 42, 32(%2) \n\t" // x2 + + "xvmaddadp 34, 44, 32 \n\t" + "xvmaddadp 35, 45, 32 \n\t" + "lxvp 44, 64(%2) \n\t" // x4 + "xvmaddadp 38, 46, 32 \n\t" + "xvmaddadp 39, 47, 32 \n\t" + "lxvp 46, 96(%2) \n\t" // x6 + + "xvmaddadp 48, %x4, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "addi %2, %2, 128 \n\t" + "xvmaddadp 49, %x5, 33 \n\t" + "xvmaddadp 50, %x6, 33 \n\t" + "xvmaddadp 51, %x7, 33 \n\t" + + "xvmaddadp 34, %x8, 33 \n\t" + "xvmaddadp 35, %x9, 33 \n\t" + "xvmaddadp 38, %x10, 33 \n\t" + "xvmaddadp 39, %x11, 33 \n\t" + + "stxvp 48, 0(%12) \n\t" + "stxvp 50, 32(%12) \n\t" + "stxvp 34, 64(%12) \n\t" + "stxvp 38, 96(%12) \n\t" + + "addi %12, %12, 128 \n\t" + + XXSWAPD_S(%x4,40) // exchange real and imag part + XXSWAPD_S(%x5,41) // exchange real and imag part + "lxvp 48, 0(%3) \n\t" // y0 + XXSWAPD_S(%x6,42) // exchange real and imag part + XXSWAPD_S(%x7,43) // exchange real and imag part + "lxvp 50, 32(%3) \n\t" // y2 + + XXSWAPD_S(%x8,44) // exchange real and imag part + XXSWAPD_S(%x9,45) // exchange real and imag part + "lxvp 34, 64(%3) \n\t" // y4 + XXSWAPD_S(%x10,46) // exchange real and imag part + XXSWAPD_S(%x11,47) // exchange real and imag part + "lxvp 38, 96(%3) \n\t" // y6 + + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -8 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddadp 49, 41, 32 \n\t" + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + + "xvmaddadp 34, 44, 32 \n\t" + "xvmaddadp 35, 45, 32 \n\t" + "xvmaddadp 38, 46, 32 \n\t" + "xvmaddadp 39, 47, 32 \n\t" + + "xvmaddadp 48, %x4, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "xvmaddadp 49, %x5, 33 \n\t" + "xvmaddadp 50, %x6, 33 \n\t" + "xvmaddadp 51, %x7, 33 \n\t" + + "xvmaddadp 34, %x8, 33 \n\t" + "xvmaddadp 35, %x9, 33 \n\t" + "xvmaddadp 38, %x10, 33 \n\t" + "xvmaddadp 39, %x11, 33 \n\t" + + "stxvp 48, 0(%12) \n\t" + "stxvp 50, 32(%12) \n\t" + "stxvp 34, 64(%12) \n\t" + "stxvp 38, 96(%12) \n\t" + + "#n=%1 x=%13=%2 y=%0=%3 alpha=(%15,%16) mvecp=%14=%17 ytmp=%12\n" + "#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y), // 3 + "=wa" (t0), // 4 + "=wa" (t1), // 5 + "=wa" (t2), // 6 + "=wa" (t3), // 7 + "=wa" (t4), // 8 + "=wa" (t5), // 9 + "=wa" (t6), // 10 + "=wa" (t7), // 11 + "=b" (ytmp) // 12 + : + "m" (*x), + "m" (*mvecp), + "d" (alpha_r), // 15 + "d" (alpha_i), // 16 + "12" (mvecp) // 17 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51" + ); +} diff --git a/kernel/power/zaxpy_power10.c b/kernel/power/zaxpy_power10.c new file mode 100644 index 0000000000..54cfb8fd7f --- /dev/null +++ b/kernel/power/zaxpy_power10.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "zaxpy_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_4 + +static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) +{ + BLASLONG register i = 0; + BLASLONG register ix = 0; + + + + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; + y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; + y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; +#endif + + ix+=4 ; + i+=2 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + { + zaxpy_kernel_4 (n1, x, y, da_r, da_i); + ix = 2 * n1; + } + i = n1; + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + i++ ; + ix += 2; + + } + return(0); + + + } + + inc_x *=2; + inc_y *=2; + + while(i < n) + { + +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + From 7e4d5c237cb10642a9cbf3c173b06045dd10c230 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 
18 Sep 2020 09:19:46 +0200 Subject: [PATCH 203/349] Fix workspace query in xGELQ (Reference-LAPACK PR443) --- lapack-netlib/SRC/cgelq.f | 30 +++++++++++++++++++++--------- lapack-netlib/SRC/cgetsls.f | 2 +- lapack-netlib/SRC/dgelq.f | 30 +++++++++++++++++++++--------- lapack-netlib/SRC/dgetsls.f | 2 +- lapack-netlib/SRC/sgelq.f | 30 +++++++++++++++++++++--------- lapack-netlib/SRC/sgetsls.f | 2 +- lapack-netlib/SRC/zgelq.f | 30 +++++++++++++++++++++--------- lapack-netlib/SRC/zgetsls.f | 2 +- 8 files changed, 88 insertions(+), 40 deletions(-) diff --git a/lapack-netlib/SRC/cgelq.f b/lapack-netlib/SRC/cgelq.f index c3b2238bf8..f0ff3a20d6 100644 --- a/lapack-netlib/SRC/cgelq.f +++ b/lapack-netlib/SRC/cgelq.f @@ -26,7 +26,7 @@ *> where: *> *> Q is a N-by-N orthogonal matrix; -*> L is an lower-triangular M-by-M matrix; +*> L is a lower-triangular M-by-M matrix; *> 0 is a M-by-(N-M) zero matrix, if M < N. *> *> \endverbatim @@ -187,7 +187,7 @@ SUBROUTINE CGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWOPT, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -243,20 +243,32 @@ SUBROUTINE CGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, * * Determine if the workspace size satisfies minimal size * + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN + LWMIN = MAX( 1, N ) + LWOPT = MAX( 1, MB*N ) + ELSE + LWMIN = MAX( 1, M ) + LWOPT = MAX( 1, MB*M ) + END IF LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.MB*M ) - $ .AND. ( LWORK.GE.M ) .AND. ( TSIZE.GE.MINTSZ ) + IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.LWOPT ) + $ .AND. ( LWORK.GE.LWMIN ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN LMINWS = .TRUE. MB = 1 NB = N END IF - IF( LWORK.LT.MB*M ) THEN + IF( LWORK.LT.LWOPT ) THEN LMINWS = .TRUE. MB = 1 END IF END IF + IF( ( N.LE.M ) .OR. 
( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN + LWREQ = MAX( 1, MB*N ) + ELSE + LWREQ = MAX( 1, MB*M ) + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -267,7 +279,7 @@ SUBROUTINE CGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, M*MB ) ) .AND .( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND .( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -281,9 +293,9 @@ SUBROUTINE CGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -308,7 +320,7 @@ SUBROUTINE CGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/cgetsls.f b/lapack-netlib/SRC/cgetsls.f index 3d783be660..01de3c9847 100644 --- a/lapack-netlib/SRC/cgetsls.f +++ b/lapack-netlib/SRC/cgetsls.f @@ -261,7 +261,7 @@ SUBROUTINE CGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, TSZM = INT( TQ( 1 ) ) LWM = INT( WORKQ( 1 ) ) CALL CGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ, - $ TSZO, B, LDB, WORKQ, -1, INFO2 ) + $ TSZM, B, LDB, WORKQ, -1, INFO2 ) LWM = MAX( LWM, INT( WORKQ( 1 ) ) ) WSIZEO = TSZO + LWO WSIZEM = TSZM + LWM diff --git a/lapack-netlib/SRC/dgelq.f b/lapack-netlib/SRC/dgelq.f index fc14d892fb..7b2f808628 100644 --- a/lapack-netlib/SRC/dgelq.f +++ b/lapack-netlib/SRC/dgelq.f @@ -26,7 +26,7 @@ *> where: *> *> Q is a N-by-N orthogonal matrix; -*> L is an lower-triangular M-by-M matrix; +*> L is a lower-triangular M-by-M matrix; *> 0 is a M-by-(N-M) zero matrix, if M < N. *> *> \endverbatim @@ -187,7 +187,7 @@ SUBROUTINE DGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWOPT, LWREQ * .. * .. 
External Functions .. LOGICAL LSAME @@ -243,20 +243,32 @@ SUBROUTINE DGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, * * Determine if the workspace size satisfies minimal size * + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN + LWMIN = MAX( 1, N ) + LWOPT = MAX( 1, MB*N ) + ELSE + LWMIN = MAX( 1, M ) + LWOPT = MAX( 1, MB*M ) + END IF LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.MB*M ) - $ .AND. ( LWORK.GE.M ) .AND. ( TSIZE.GE.MINTSZ ) + IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.LWOPT ) + $ .AND. ( LWORK.GE.LWMIN ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN LMINWS = .TRUE. MB = 1 NB = N END IF - IF( LWORK.LT.MB*M ) THEN + IF( LWORK.LT.LWOPT ) THEN LMINWS = .TRUE. MB = 1 END IF END IF + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN + LWREQ = MAX( 1, MB*N ) + ELSE + LWREQ = MAX( 1, MB*M ) + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -267,7 +279,7 @@ SUBROUTINE DGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, M*MB ) ) .AND .( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND .( .NOT.LQUERY ) $ .AND. 
( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -281,9 +293,9 @@ SUBROUTINE DGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -308,7 +320,7 @@ SUBROUTINE DGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/dgetsls.f b/lapack-netlib/SRC/dgetsls.f index dfc72c8b2c..c2ba5e2b8e 100644 --- a/lapack-netlib/SRC/dgetsls.f +++ b/lapack-netlib/SRC/dgetsls.f @@ -258,7 +258,7 @@ SUBROUTINE DGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, TSZM = INT( TQ( 1 ) ) LWM = INT( WORKQ( 1 ) ) CALL DGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ, - $ TSZO, B, LDB, WORKQ, -1, INFO2 ) + $ TSZM, B, LDB, WORKQ, -1, INFO2 ) LWM = MAX( LWM, INT( WORKQ( 1 ) ) ) WSIZEO = TSZO + LWO WSIZEM = TSZM + LWM diff --git a/lapack-netlib/SRC/sgelq.f b/lapack-netlib/SRC/sgelq.f index 96c4097e80..e45c68db42 100644 --- a/lapack-netlib/SRC/sgelq.f +++ b/lapack-netlib/SRC/sgelq.f @@ -26,7 +26,7 @@ *> where: *> *> Q is a N-by-N orthogonal matrix; -*> L is an lower-triangular M-by-M matrix; +*> L is a lower-triangular M-by-M matrix; *> 0 is a M-by-(N-M) zero matrix, if M < N. *> *> \endverbatim @@ -187,7 +187,7 @@ SUBROUTINE SGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWOPT, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -243,20 +243,32 @@ SUBROUTINE SGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, * * Determine if the workspace size satisfies minimal size * + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN + LWMIN = MAX( 1, N ) + LWOPT = MAX( 1, MB*N ) + ELSE + LWMIN = MAX( 1, M ) + LWOPT = MAX( 1, MB*M ) + END IF LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.MB*M ) - $ .AND. 
( LWORK.GE.M ) .AND. ( TSIZE.GE.MINTSZ ) + IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.LWOPT ) + $ .AND. ( LWORK.GE.LWMIN ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN LMINWS = .TRUE. MB = 1 NB = N END IF - IF( LWORK.LT.MB*M ) THEN + IF( LWORK.LT.LWOPT ) THEN LMINWS = .TRUE. MB = 1 END IF END IF + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN + LWREQ = MAX( 1, MB*N ) + ELSE + LWREQ = MAX( 1, MB*M ) + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -267,7 +279,7 @@ SUBROUTINE SGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, M*MB ) ) .AND .( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND .( .NOT.LQUERY ) $ .AND. ( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -281,9 +293,9 @@ SUBROUTINE SGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -308,7 +320,7 @@ SUBROUTINE SGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ RETURN * * End of SGELQ diff --git a/lapack-netlib/SRC/sgetsls.f b/lapack-netlib/SRC/sgetsls.f index 53d2f9431d..3bf084515f 100644 --- a/lapack-netlib/SRC/sgetsls.f +++ b/lapack-netlib/SRC/sgetsls.f @@ -258,7 +258,7 @@ SUBROUTINE SGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, TSZM = INT( TQ( 1 ) ) LWM = INT( WORKQ( 1 ) ) CALL SGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ, - $ TSZO, B, LDB, WORKQ, -1, INFO2 ) + $ TSZM, B, LDB, WORKQ, -1, INFO2 ) LWM = MAX( LWM, INT( WORKQ( 1 ) ) ) WSIZEO = TSZO + LWO WSIZEM = TSZM + LWM diff --git a/lapack-netlib/SRC/zgelq.f b/lapack-netlib/SRC/zgelq.f index 4e7e7e38eb..beb054b87d 100644 --- a/lapack-netlib/SRC/zgelq.f +++ b/lapack-netlib/SRC/zgelq.f @@ -26,7 +26,7 @@ *> where: *> *> Q is a N-by-N 
orthogonal matrix; -*> L is an lower-triangular M-by-M matrix; +*> L is a lower-triangular M-by-M matrix; *> 0 is a M-by-(N-M) zero matrix, if M < N. *> *> \endverbatim @@ -187,7 +187,7 @@ SUBROUTINE ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, * .. * .. Local Scalars .. LOGICAL LQUERY, LMINWS, MINT, MINW - INTEGER MB, NB, MINTSZ, NBLCKS + INTEGER MB, NB, MINTSZ, NBLCKS, LWMIN, LWOPT, LWREQ * .. * .. External Functions .. LOGICAL LSAME @@ -243,20 +243,32 @@ SUBROUTINE ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, * * Determine if the workspace size satisfies minimal size * + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN + LWMIN = MAX( 1, N ) + LWOPT = MAX( 1, MB*N ) + ELSE + LWMIN = MAX( 1, M ) + LWOPT = MAX( 1, MB*M ) + END IF LMINWS = .FALSE. - IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.MB*M ) - $ .AND. ( LWORK.GE.M ) .AND. ( TSIZE.GE.MINTSZ ) + IF( ( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) .OR. LWORK.LT.LWOPT ) + $ .AND. ( LWORK.GE.LWMIN ) .AND. ( TSIZE.GE.MINTSZ ) $ .AND. ( .NOT.LQUERY ) ) THEN IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) ) THEN LMINWS = .TRUE. MB = 1 NB = N END IF - IF( LWORK.LT.MB*M ) THEN + IF( LWORK.LT.LWOPT ) THEN LMINWS = .TRUE. MB = 1 END IF END IF + IF( ( N.LE.M ) .OR. ( NB.LE.M ) .OR. ( NB.GE.N ) ) THEN + LWREQ = MAX( 1, MB*N ) + ELSE + LWREQ = MAX( 1, MB*M ) + END IF * IF( M.LT.0 ) THEN INFO = -1 @@ -267,7 +279,7 @@ SUBROUTINE ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, ELSE IF( TSIZE.LT.MAX( 1, MB*M*NBLCKS + 5 ) $ .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN INFO = -6 - ELSE IF( ( LWORK.LT.MAX( 1, M*MB ) ) .AND .( .NOT.LQUERY ) + ELSE IF( ( LWORK.LT.LWREQ ) .AND .( .NOT.LQUERY ) $ .AND. 
( .NOT.LMINWS ) ) THEN INFO = -8 END IF @@ -281,9 +293,9 @@ SUBROUTINE ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, T( 2 ) = MB T( 3 ) = NB IF( MINW ) THEN - WORK( 1 ) = MAX( 1, N ) + WORK( 1 ) = LWMIN ELSE - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ END IF END IF IF( INFO.NE.0 ) THEN @@ -308,7 +320,7 @@ SUBROUTINE ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK, $ LWORK, INFO ) END IF * - WORK( 1 ) = MAX( 1, MB*M ) + WORK( 1 ) = LWREQ * RETURN * diff --git a/lapack-netlib/SRC/zgetsls.f b/lapack-netlib/SRC/zgetsls.f index 1aab3c662f..11233785b8 100644 --- a/lapack-netlib/SRC/zgetsls.f +++ b/lapack-netlib/SRC/zgetsls.f @@ -261,7 +261,7 @@ SUBROUTINE ZGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB, TSZM = INT( TQ( 1 ) ) LWM = INT( WORKQ( 1 ) ) CALL ZGEMLQ( 'L', TRANS, N, NRHS, M, A, LDA, TQ, - $ TSZO, B, LDB, WORKQ, -1, INFO2 ) + $ TSZM, B, LDB, WORKQ, -1, INFO2 ) LWM = MAX( LWM, INT( WORKQ( 1 ) ) ) WSIZEO = TSZO + LWO WSIZEM = TSZM + LWM From f91057cbad196be09541eccf1ece5472531f63aa Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Tue, 15 Sep 2020 10:54:37 +0200 Subject: [PATCH 204/349] s390x: move common vector definitions and utils into header ... to facilitate reuse beyond gemm_vec.c and avoid code duplication. Signed-off-by: Marius Hillenbrand --- kernel/zarch/gemm_vec.c | 34 ++----------------- kernel/zarch/vector-common.h | 64 ++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 32 deletions(-) create mode 100644 kernel/zarch/vector-common.h diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c index ef0b1d1e31..30f3171d2d 100644 --- a/kernel/zarch/gemm_vec.c +++ b/kernel/zarch/gemm_vec.c @@ -30,12 +30,13 @@ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "common.h" -#include +#include "vector-common.h" #include #include #include + #ifdef COMPLEX #error "Handling for complex numbers is not supported in this kernel" #endif @@ -153,37 +154,6 @@ static const bool backwards = false; * 3, May 2008. 
*/ -#define VLEN_BYTES 16 -#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT)) - -typedef FLOAT vector_float __attribute__ ((vector_size (16))); - -/** - * Load a vector into register, and hint on 8-byte alignment to improve - * performance. gcc-9 and newer will create these hints by itself. For older - * compiler versions, use inline assembly to explicitly express the hint. - * Provide explicit hex encoding to cater for binutils versions that do not know - * about vector-load with alignment hints yet. - * - * Note that, for block sizes where we apply vectorization, vectors in A will - * always be 8-byte aligned. - */ -static inline vector_float vec_load_hinted(FLOAT const *restrict a) { - vector_float const *restrict addr = (vector_float const *restrict)a; - vector_float y; - -#if __GNUC__ < 9 && !defined(__clang__) - // hex-encode vl %[out],%[addr],3 - asm(".insn vrx,0xe70000003006,%[out],%[addr],3" - : [ out ] "=v"(y) - : [ addr ] "R"(*addr)); -#else - y = *addr; -#endif - - return y; -} - /** * Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics. * diff --git a/kernel/zarch/vector-common.h b/kernel/zarch/vector-common.h new file mode 100644 index 0000000000..140d39d7be --- /dev/null +++ b/kernel/zarch/vector-common.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) IBM Corporation 2020. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. 
Neither the name of the OpenBLAS project nor the names of + * its contributors may be used to endorse or promote products + * derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include + +#define VLEN_BYTES 16 +#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT)) + +typedef FLOAT vector_float __attribute__ ((vector_size (VLEN_BYTES))); + +/** + * Load a vector into register, and hint on 8-byte alignment to improve + * performance. gcc-9 and newer will create these hints by itself. For older + * compiler versions, use inline assembly to explicitly express the hint. + * Provide explicit hex encoding to cater for binutils versions that do not know + * about vector-load with alignment hints yet. + * + * Note that, for block sizes where we apply vectorization, vectors in A will + * always be 8-byte aligned. 
+ */ +static inline vector_float vec_load_hinted(FLOAT const *restrict a) { + vector_float const *restrict addr = (vector_float const *restrict)a; + vector_float y; + +#if __GNUC__ < 9 && !defined(__clang__) + // hex-encode vl %[out],%[addr],3 + asm(".insn vrx,0xe70000003006,%[out],%[addr],3" + : [ out ] "=v"(y) + : [ addr ] "R"(*addr)); +#else + y = *addr; +#endif + + return y; +} From 77ea73f5e5579ea35b6be03bac455643b84e343d Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Wed, 16 Sep 2020 15:55:38 +0200 Subject: [PATCH 205/349] s390x: for clang use fp-contract=on instead of fast Make clang slightly more cautious when contracting floating-point operations (e.g., when applying fused multiply add) by setting -ffp-contract=on (instead of fast). Signed-off-by: Marius Hillenbrand --- Makefile.zarch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.zarch b/Makefile.zarch index b841d9b4d5..092ca2589d 100644 --- a/Makefile.zarch +++ b/Makefile.zarch @@ -12,5 +12,5 @@ endif # Enable floating-point expression contraction for clang, since it is the # default for gcc ifeq ($(C_COMPILER), CLANG) -CCOMMON_OPT += -ffp-contract=fast +CCOMMON_OPT += -ffp-contract=on endif From 22aa81f3e587c85c5ccdcbbe2964cf5f89a00931 Mon Sep 17 00:00:00 2001 From: Marius Hillenbrand Date: Mon, 14 Sep 2020 18:36:31 +0200 Subject: [PATCH 206/349] s390x: fix cscal and zscal implementations The implementation of complex scalar * vector multiplication for Z14 makes some LAPACK tests fail because the numerical differences to the reference implementation exceed the threshold (as can be seen by running make lapack-test and replacing kernel/zarch/cscal.c with a generic implementation for comparison). The complex multiplication uses terms of the form a * b + c * d for both real and imaginary parts. The assembly code (and compiler-emitted code as well) uses fused multiply add operations for the second product and sum. 
The results can be "surprising", for example when both terms in the imaginary part nearly cancel each other out. In that case, the second product contributes more digits to the sum than the first product that has been rounded before. One option is to use separate multiplications (which then round the same way) and a distinct add. Change the code to pursue that path, by (1) requesting the compiler not to contract the operations into FMAs and (2) replacing the assembly kernel with corresponding vectorized C code (where change 1 also applies). Signed-off-by: Marius Hillenbrand --- kernel/zarch/cscal.c | 96 ++++++++++++++------------------------------ kernel/zarch/zscal.c | 94 ++++++++++++++----------------------------- 2 files changed, 60 insertions(+), 130 deletions(-) diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c index f9e89a452e..57bb89c0ae 100644 --- a/kernel/zarch/cscal.c +++ b/kernel/zarch/cscal.c @@ -25,67 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +/* + * Avoid contraction of floating point operations, specifically fused + * multiply-add, because they can cause unexpected results in complex + * multiplication. 
+ */ +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC optimize ("fp-contract=off") +#endif + +#if defined(__clang__) +#pragma clang fp contract(off) +#endif + #include "common.h" +#include "vector-common.h" -static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) { - __asm__("vlrepf %%v0,0(%[alpha])\n\t" - "vlef %%v1,4(%[alpha]),0\n\t" - "vlef %%v1,4(%[alpha]),2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,4(%[alpha]),1\n\t" - "vlef %%v1,4(%[alpha]),3\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "verllg %%v24,%%v16,32\n\t" - "verllg %%v25,%%v17,32\n\t" - "verllg %%v26,%%v18,32\n\t" - "verllg %%v27,%%v19,32\n\t" - "verllg %%v28,%%v20,32\n\t" - "verllg %%v29,%%v21,32\n\t" - "verllg %%v30,%%v22,32\n\t" - "verllg %%v31,%%v23,32\n\t" - "vfmsb %%v16,%%v16,%%v0\n\t" - "vfmsb %%v17,%%v17,%%v0\n\t" - "vfmsb %%v18,%%v18,%%v0\n\t" - "vfmsb %%v19,%%v19,%%v0\n\t" - "vfmsb %%v20,%%v20,%%v0\n\t" - "vfmsb %%v21,%%v21,%%v0\n\t" - "vfmsb %%v22,%%v22,%%v0\n\t" - "vfmsb %%v23,%%v23,%%v0\n\t" - "vfmasb %%v16,%%v24,%%v1,%%v16\n\t" - "vfmasb %%v17,%%v25,%%v1,%%v17\n\t" - "vfmasb %%v18,%%v26,%%v1,%%v18\n\t" - "vfmasb %%v19,%%v27,%%v1,%%v19\n\t" - "vfmasb %%v20,%%v28,%%v1,%%v20\n\t" - "vfmasb %%v21,%%v29,%%v1,%%v21\n\t" - "vfmasb %%v22,%%v30,%%v1,%%v22\n\t" - "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), - 
[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); +static void cscal_kernel_16(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x) { + vector_float da_r_vec = vec_splats(da_r); + vector_float da_i_vec = { -da_i, da_i, -da_i, da_i }; + + vector_float *x_vec_ptr = (vector_float *)x; + +#pragma GCC unroll 16 + for (size_t i = 0; i < n/2; i++) { + vector_float x_vec = vec_load_hinted(x + i * VLEN_FLOATS); + vector_float x_swapped = {x_vec[1], x_vec[0], x_vec[3], x_vec[2]}; + + x_vec_ptr[i] = x_vec * da_r_vec + x_swapped * da_i_vec; + } } static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { @@ -199,14 +167,12 @@ static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) { : "cc", "r1", "v0"); } -static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, +static void cscal_kernel_inc_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x) { BLASLONG i; BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x3 = inc_x2 + inc_x; FLOAT t0, t1, t2, t3; - FLOAT da_r = alpha[0]; - FLOAT da_i = alpha[1]; for (i = 0; i < n; i += 4) { t0 = da_r * x[0] - da_i * x[1]; @@ -324,9 +290,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -8; if (n1 > 0) { - alpha[0] = da_r; - alpha[1] = da_i; - cscal_kernel_inc_8(n1, alpha, x, inc_x); + cscal_kernel_inc_8(n1, da_r, da_i, x, inc_x); j = n1; i = n1 * inc_x; } @@ -362,7 +326,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, else if (da_i == 0) cscal_kernel_16_zero_i(n1, alpha, x); else - cscal_kernel_16(n1, alpha, x); + cscal_kernel_16(n1, da_r, da_i, x); i = n1 << 1; j = n1; diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index a5a8f694d3..d39b8447ec 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -25,65 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +/* + * Avoid contraction of floating point operations, specifically fused + * multiply-add, because they can cause unexpected results in complex + * multiplication. + */ +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC optimize ("fp-contract=off") +#endif + +#if defined(__clang__) +#pragma clang fp contract(off) +#endif + #include "common.h" +#include "vector-common.h" -static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { - __asm__("vlrepg %%v0,0(%[alpha])\n\t" - "vleg %%v1,8(%[alpha]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%[alpha]),1\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vpdi %%v24,%%v16,%%v16,4\n\t" - "vpdi %%v25,%%v17,%%v17,4\n\t" - "vpdi %%v26,%%v18,%%v18,4\n\t" - "vpdi %%v27,%%v19,%%v19,4\n\t" - "vpdi %%v28,%%v20,%%v20,4\n\t" - "vpdi %%v29,%%v21,%%v21,4\n\t" - "vpdi %%v30,%%v22,%%v22,4\n\t" - "vpdi %%v31,%%v23,%%v23,4\n\t" - "vfmdb %%v16,%%v16,%%v0\n\t" - "vfmdb %%v17,%%v17,%%v0\n\t" - "vfmdb %%v18,%%v18,%%v0\n\t" - "vfmdb %%v19,%%v19,%%v0\n\t" - "vfmdb %%v20,%%v20,%%v0\n\t" - "vfmdb %%v21,%%v21,%%v0\n\t" - "vfmdb %%v22,%%v22,%%v0\n\t" - "vfmdb %%v23,%%v23,%%v0\n\t" - "vfmadb %%v16,%%v24,%%v1,%%v16\n\t" - "vfmadb %%v17,%%v25,%%v1,%%v17\n\t" - "vfmadb %%v18,%%v26,%%v1,%%v18\n\t" - "vfmadb %%v19,%%v27,%%v1,%%v19\n\t" - "vfmadb %%v20,%%v28,%%v1,%%v20\n\t" - "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" - "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - 
"vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha), - [alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); +static void zscal_kernel_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x) { + vector_float da_r_vec = vec_splats(da_r); + vector_float da_i_vec = { -da_i, da_i }; + + vector_float * x_vec_ptr = (vector_float *)x; + +#pragma GCC unroll 16 + for (size_t i = 0; i < n; i++) { + vector_float x_vec = vec_load_hinted(x + i * VLEN_FLOATS); + vector_float x_swapped = {x_vec[1], x_vec[0]}; + + x_vec_ptr[i] = x_vec * da_r_vec + x_swapped * da_i_vec; + } } static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { @@ -195,14 +165,12 @@ static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { : "cc", "r1", "v0"); } -static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, +static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x) { BLASLONG i; BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x3 = inc_x2 + inc_x; FLOAT t0, t1, t2, t3; - FLOAT da_r = alpha[0]; - FLOAT da_i = alpha[1]; for (i = 0; i < n; i += 4) { t0 = da_r * x[0] - da_i * x[1]; @@ -320,9 +288,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -8; if (n1 > 0) { - alpha[0] = da_r; - alpha[1] = da_i; - zscal_kernel_inc_8(n1, alpha, x, inc_x); + zscal_kernel_inc_8(n1, da_r, da_i, x, inc_x); j = n1; i = n1 * inc_x; } @@ -358,7 +324,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, else if (da_i == 0) zscal_kernel_8_zero_i(n1, alpha, x); else - zscal_kernel_8(n1, alpha, x); + zscal_kernel_8(n1, da_r, da_i, x); i = n1 << 1; j = n1; From 
325b539c26414f05666c0b0bfb2d6fe3e95cb039 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 22 Sep 2020 10:38:35 +0800 Subject: [PATCH 207/349] Optimize the performance of daxpy by using universal intrinsics --- kernel/simd/intrin.h | 51 +++++++++++++++++++++++++++++++++++++ kernel/simd/intrin_avx.h | 19 ++++++++++++++ kernel/simd/intrin_avx512.h | 19 ++++++++++++++ kernel/simd/intrin_sse.h | 19 ++++++++++++++ kernel/x86_64/daxpy.c | 39 ++++++++++++++++------------ 5 files changed, 131 insertions(+), 16 deletions(-) create mode 100644 kernel/simd/intrin.h create mode 100644 kernel/simd/intrin_avx.h create mode 100644 kernel/simd/intrin_avx512.h create mode 100644 kernel/simd/intrin_sse.h diff --git a/kernel/simd/intrin.h b/kernel/simd/intrin.h new file mode 100644 index 0000000000..ef599f0659 --- /dev/null +++ b/kernel/simd/intrin.h @@ -0,0 +1,51 @@ +#ifndef _INTRIN_H_ +#define _INTRIN_H_ + +#ifdef __cplusplus +extern "C" { +#endif +// include head +/** SSE **/ +#ifdef HAVE_SSE +#include +#endif +/** SSE2 **/ +#ifdef HAVE_SSE2 +#include +#endif +/** SSE3 **/ +#ifdef HAVE_SSE3 +#include +#endif +/** SSSE3 **/ +#ifdef HAVE_SSSE3 +#include +#endif +/** SSE41 **/ +#ifdef HAVE_SSE4_1 +#include +#endif + +/** AVX **/ +#ifdef HAVE_AVX +#include +#endif + +// distribute +#if defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16) +#include "intrin_avx512.h" +#elif defined(HAVE_AVX2) +#include "intrin_avx.h" +#elif defined(HAVE_SSE2) +#include "intrin_sse.h" +#endif + +#ifndef V_SIMD + #define V_SIMD 0 + #define V_SIMD_F64 0 +#endif + +#ifdef __cplusplus +} +#endif +#endif // _INTRIN_H_ diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h new file mode 100644 index 0000000000..7262544297 --- /dev/null +++ b/kernel/simd/intrin_avx.h @@ -0,0 +1,19 @@ +#define V_SIMD 256 +#define V_SIMD_F64 1 +/* +Data Type +*/ +typedef __m256 v_f32; +#define v_nlanes_f32 8 +/* +arithmetic +*/ +#define v_add_f32 _mm256_add_ps +#define v_mul_f32 _mm256_mul_ps +/* +memory +*/ +// unaligned 
load +#define v_loadu_f32 _mm256_loadu_ps +#define v_storeu_f32 _mm256_storeu_ps +#define v_setall_f32(VAL) _mm256_set1_ps(VAL) \ No newline at end of file diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h new file mode 100644 index 0000000000..775fe7aa5a --- /dev/null +++ b/kernel/simd/intrin_avx512.h @@ -0,0 +1,19 @@ +#define V_SIMD 512 +#define V_SIMD_F64 1 +/* +Data Type +*/ +typedef __m512 v_f32; +#define v_nlanes_f32 16 +/* +arithmetic +*/ +#define v_add_f32 _mm512_add_ps +#define v_mul_f32 _mm512_mul_ps +/* +memory +*/ +// unaligned load +#define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR)) +#define v_storeu_f32(PTR) _mm512_storeu_ps((const __m512*)(PTR)) +#define v_setall_f32(VAL) _mm512_set1_ps(VAL) diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h new file mode 100644 index 0000000000..0cc159aa78 --- /dev/null +++ b/kernel/simd/intrin_sse.h @@ -0,0 +1,19 @@ +#define V_SIMD 128 +#define V_SIMD_F64 1 +/* +Data Type +*/ +typedef __m128 v_f32; +#define v_nlanes_f32 4 +/* +arithmetic +*/ +#define v_add_f32 _mm_add_ps +#define v_mul_f32 _mm_mul_ps +/* +memory +*/ +// unaligned load +#define v_loadu_f32 _mm_loadu_ps +#define v_storeu_f32 _mm_storeu_ps +#define v_setall_f32(VAL) _mm_set1_ps(VAL) \ No newline at end of file diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index d84c0c2218..9836faca10 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -45,28 +45,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "daxpy_microk_sandy-2.c" #endif - #ifndef HAVE_KERNEL_8 +#include"../simd/intrin.h" -static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; FLOAT a = *alpha; - +#if V_SIMD + v_f32 __alpha, tmp; + __alpha = v_setall_f32(*alpha); + const int vstep = v_nlanes_f32; + for (; i < n; i += vstep) { + tmp = v_add_f32(v_loadu_f32(y + i), v_mul_f32(__alpha, v_loadu_f32( x + i ))); + v_storeu_f32(y + i, tmp); + } +#else while(i < n) - { - y[i] += a * x[i]; - y[i+1] += a * x[i+1]; - y[i+2] += a * x[i+2]; - y[i+3] += a * x[i+3]; - y[i+4] += a * x[i+4]; - y[i+5] += a * x[i+5]; - y[i+6] += a * x[i+6]; - y[i+7] += a * x[i+7]; - i+=8 ; - - } - + { + y[i] += a * x[i]; + y[i+1] += a * x[i+1]; + y[i+2] += a * x[i+2]; + y[i+3] += a * x[i+3]; + y[i+4] += a * x[i+4]; + y[i+5] += a * x[i+5]; + y[i+6] += a * x[i+6]; + y[i+7] += a * x[i+7]; + i+=8 ; + } +#endif } #endif From 06cf73a239ab6cc997bcb29009eb52b28a817cc3 Mon Sep 17 00:00:00 2001 From: y00512012 Date: Tue, 22 Sep 2020 16:47:10 +0800 Subject: [PATCH 208/349] fix a bug of trmm --- driver/level3/trmm_L.c | 48 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c index ae8435d036..880de4df44 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -122,6 +122,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; + if( min_i > GEMM_UNROLL_M){ + min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } START_RPCC(); @@ -161,9 +164,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO } - for(is = min_i; is < min_l; is += GEMM_P){ + for(is = min_i; is < min_l; is += min_i){ min_i = min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; + if( min_i > GEMM_UNROLL_M){ + min_i = 
(min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } START_RPCC(); @@ -192,6 +198,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = ls; if (min_i > GEMM_P) min_i = GEMM_P; + if( min_i > GEMM_UNROLL_M){ + min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } + START_RPCC(); @@ -231,9 +241,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO STOP_RPCC(gemmcost); } - for(is = min_i; is < ls; is += GEMM_P){ + for(is = min_i; is < ls; is += min_i){ min_i = ls - is; if (min_i > GEMM_P) min_i = GEMM_P; + if( min_i > GEMM_UNROLL_M){ + min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } START_RPCC(); @@ -256,9 +269,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO STOP_RPCC(gemmcost); } - for(is = ls; is < ls + min_l; is += GEMM_P){ + for(is = ls; is < ls + min_l; is += min_i){ min_i = ls + min_l - is; if (min_i > GEMM_P) min_i = GEMM_P; + if( min_i > GEMM_UNROLL_M){ + min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } START_RPCC(); @@ -287,6 +303,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; + if (min_i > GEMM_UNROLL_M){ + min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } + START_RPCC(); @@ -327,9 +347,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO STOP_RPCC(trmmcost); } - for(is = m - min_l + min_i; is < m; is += GEMM_P){ + for(is = m - min_l + min_i; is < m; is += min_i){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; + if (min_i > GEMM_UNROLL_M){ + min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } + + START_RPCC(); @@ -357,6 +382,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO if (min_l > GEMM_Q) min_l = GEMM_Q; min_i = min_l; if (min_i > GEMM_P) min_i = GEMM_P; + if (min_i > GEMM_UNROLL_M){ + min_i = (min_i / 
GEMM_UNROLL_M) * GEMM_UNROLL_M; + } + START_RPCC(); @@ -397,9 +426,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO STOP_RPCC(trmmcost); } - for(is = ls - min_l + min_i; is < ls; is += GEMM_P){ + for(is = ls - min_l + min_i; is < ls; is += min_i){ min_i = ls - is; if (min_i > GEMM_P) min_i = GEMM_P; + if (min_i > GEMM_UNROLL_M){ + min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } + START_RPCC(); @@ -423,9 +456,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO } - for(is = ls; is < m; is += GEMM_P){ + for(is = ls; is < m; is += min_i){ min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; + if (min_i > GEMM_UNROLL_M){ + min_i = (min_i / GEMM_UNROLL_M) * GEMM_UNROLL_M; + } START_RPCC(); From 14f7dad3b7d728159bbeab72deb9e7878d108760 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 22 Sep 2020 16:52:15 +0800 Subject: [PATCH 209/349] performance improved --- kernel/simd/intrin.h | 20 ++++++++++++++++++++ kernel/simd/intrin_avx.h | 10 ++++++++++ kernel/simd/intrin_avx512.h | 4 +++- kernel/simd/intrin_sse.h | 11 +++++++++++ kernel/x86_64/daxpy.c | 4 ++-- 5 files changed, 46 insertions(+), 3 deletions(-) diff --git a/kernel/simd/intrin.h b/kernel/simd/intrin.h index ef599f0659..5997bb6ac3 100644 --- a/kernel/simd/intrin.h +++ b/kernel/simd/intrin.h @@ -1,6 +1,26 @@ #ifndef _INTRIN_H_ #define _INTRIN_H_ +#if defined(_MSC_VER) +#define BLAS_INLINE __inline +#elif defined(__GNUC__) +#if defined(__STRICT_ANSI__) +#define BLAS_INLINE __inline__ +#else +#define BLAS_INLINE inline +#endif +#else +#define BLAS_INLINE +#endif + +#ifdef _MSC_VER +#define BLAS_FINLINE static __forceinline +#elif defined(__GNUC__) +#define BLAS_FINLINE static BLAS_INLINE __attribute__((always_inline)) +#else +#define BLAS_FINLINE static +#endif + #ifdef __cplusplus extern "C" { #endif diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h index 7262544297..f6257ae987 100644 --- a/kernel/simd/intrin_avx.h +++ 
b/kernel/simd/intrin_avx.h @@ -10,6 +10,16 @@ arithmetic */ #define v_add_f32 _mm256_add_ps #define v_mul_f32 _mm256_mul_ps + +#ifdef HAVE_FMA3 + // multiply and add, a*b + c + #define v_muladd_f32 _mm256_fmadd_ps +#else + // multiply and add, a*b + c + BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_add_f32(v_mul_f32(a, b), c); } +#endif // !HAVE_FMA3 + /* memory */ diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h index 775fe7aa5a..cb116a9a31 100644 --- a/kernel/simd/intrin_avx512.h +++ b/kernel/simd/intrin_avx512.h @@ -10,10 +10,12 @@ arithmetic */ #define v_add_f32 _mm512_add_ps #define v_mul_f32 _mm512_mul_ps +// multiply and add, a*b + c +#define v_muladd_f32 _mm512_fmadd_ps /* memory */ // unaligned load #define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR)) -#define v_storeu_f32(PTR) _mm512_storeu_ps((const __m512*)(PTR)) +#define v_storeu_f32 _mm512_storeu_ps #define v_setall_f32(VAL) _mm512_set1_ps(VAL) diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 0cc159aa78..260112028b 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -10,6 +10,17 @@ arithmetic */ #define v_add_f32 _mm_add_ps #define v_mul_f32 _mm_mul_ps +#ifdef HAVE_FMA3 + // multiply and add, a*b + c + #define v_muladd_f32 _mm_fmadd_ps +#elif defined(HAVE_FMA4) + // multiply and add, a*b + c + #define v_muladd_f32 _mm_macc_ps +#else + // multiply and add, a*b + c + BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_add_f32(v_mul_f32(a, b), c); } +#endif // HAVE_FMA3 /* memory */ diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index 9836faca10..b62e3dcb3d 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -48,7 +48,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef HAVE_KERNEL_8 #include"../simd/intrin.h" -void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; FLOAT a = *alpha; @@ -57,7 +57,7 @@ void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) __alpha = v_setall_f32(*alpha); const int vstep = v_nlanes_f32; for (; i < n; i += vstep) { - tmp = v_add_f32(v_loadu_f32(y + i), v_mul_f32(__alpha, v_loadu_f32( x + i ))); + tmp = v_muladd_f32(__alpha, v_loadu_f32( x + i ), v_loadu_f32(y + i)); v_storeu_f32(y + i, tmp); } #else From 5ba01dd1a829c02cf7ccb1b790948570570eca05 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 17:26:19 +0200 Subject: [PATCH 210/349] Add an OSX build with xcode12 --- .travis.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.travis.yml b/.travis.yml index 3f8f766fe9..482b4f6481 100644 --- a/.travis.yml +++ b/.travis.yml @@ -204,6 +204,17 @@ matrix: env: - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8" + - <<: *test-macos + osx_image: xcode12 + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update + - brew install gcc@10 # for gfortran + script: + - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + env: + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" + - <<: *test-macos osx_image: xcode10.0 env: From b886bd672b6f7aa97cb0ac8372a1ec1029d64bff Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:18:55 +0200 Subject: [PATCH 211/349] add defines for building a subset of types --- common_param.h | 92 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 82 insertions(+), 10 deletions(-) diff --git a/common_param.h b/common_param.h index a52de98ab8..a689ddf7d9 100644 --- a/common_param.h +++ b/common_param.h @@ -146,26 +146,34 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, 
blasint *, float *); #endif + +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) int sgemm_p, sgemm_q, sgemm_r; int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; +#endif int exclusive_cache; +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) float (*samax_k) (BLASLONG, float *, BLASLONG); float (*samin_k) (BLASLONG, float *, BLASLONG); float (*smax_k) (BLASLONG, float *, BLASLONG); float (*smin_k) (BLASLONG, float *, BLASLONG); + BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG); BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); - float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG); +#endif +#ifdef BUILD_SINGLE float (*ssum_k) (BLASLONG, float *, BLASLONG); +#endif +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); @@ -175,6 +183,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#endif +#ifdef BUILD_SINGLE int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); @@ -185,6 +195,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); #endif +#endif +#if defined(BUILD_SINGLE) || 
defined(BUILD_COMPLEX) int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -193,7 +205,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - +#endif +#ifdef BUILD_SINGLE int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -245,10 +258,14 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +#endif +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int dgemm_p, dgemm_q, dgemm_r; int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; +#endif +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) double (*damax_k) (BLASLONG, double *, BLASLONG); double (*damin_k) (BLASLONG, double *, BLASLONG); double (*dmax_k) (BLASLONG, double *, BLASLONG); @@ -257,25 +274,37 @@ BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG); BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); +#endif +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG); +#endif +#ifdef BUILD_DOUBLE double (*dsum_k) (BLASLONG, double *, BLASLONG); +#endif +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int 
(*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); +#endif +#if defined (BUILD_SINGLE) || defined(BUILD_DOUBLE) + double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); +#endif +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); - int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +#endif +#ifdef BUILD_DOUBLE int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - +#endif +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -283,7 +312,8 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - +#endif +#ifdef BUILD_DOUBLE int 
(*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -335,7 +365,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); - +#endif #ifdef EXPRECISION int qgemm_p, qgemm_q, qgemm_r; @@ -430,6 +460,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); #endif +#ifdef BUILD_COMPLEX int cgemm_p, cgemm_q, cgemm_r; int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; @@ -593,7 +624,9 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +#endif +#ifdef BUILD_COMPLEX16 int zgemm_p, zgemm_q, zgemm_r; int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; @@ -757,6 +790,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*zneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); +#endif #ifdef EXPRECISION @@ -930,22 +964,34 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); void (*init)(void); int snum_opt, dnum_opt, qnum_opt; - +#ifdef BUILD_SINGLE int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); +#endif +#ifdef BUILD_DOUBLE int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); +#endif +#ifdef BUILD_COMPLEX int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG); +#endif +#ifdef BUILD_COMPLEX16 int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, 
BLASLONG); +#endif +#ifdef BUILD_SINGLE int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); +#endif +#ifdef BUILD_DOUBLE int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); +#endif +#ifdef BUILD_COMPLEX int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); @@ -955,7 +1001,9 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); +#endif +#ifdef BUILD_COMPLEX16 int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); @@ -965,17 +1013,23 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, 
double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); +#endif +#ifdef BUILD_SINGLE int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); +#endif +#ifdef BUILD_DOUBLE int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); +#endif +#ifdef BUILD_COMPLEX int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); @@ -985,7 +1039,9 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); +#endif +#ifdef BUILD_COMPLEX16 int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); @@ -995,12 +1051,20 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); +#endif +#ifdef BUILD_SINGLE int (*sgeadd_k) (BLASLONG, BLASLONG, float, float 
*, BLASLONG, float, float *, BLASLONG); +#endif +#ifdef BUILD_DOUBLE int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); +#endif +#ifdef BUILD_COMPLEX int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); +#endif +#ifdef BUILD_COMPLEX16 int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); - +#endif } gotoblas_t; extern gotoblas_t *gotoblas; @@ -1021,19 +1085,23 @@ extern gotoblas_t *gotoblas; #define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_R gotoblas -> sgemm_r #define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m #define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n #define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn +#endif +#if defined (BUILD_DOUBLE) || defined (BUILD_COMPLEX16) #define DGEMM_P gotoblas -> dgemm_p #define DGEMM_Q gotoblas -> dgemm_q #define DGEMM_R gotoblas -> dgemm_r #define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m #define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn +#endif #define QGEMM_P gotoblas -> qgemm_p #define QGEMM_Q gotoblas -> qgemm_q @@ -1042,19 +1110,23 @@ extern gotoblas_t *gotoblas; #define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n #define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn +#ifdef BUILD_COMPLEX #define CGEMM_P gotoblas -> cgemm_p #define CGEMM_Q gotoblas -> cgemm_q #define CGEMM_R gotoblas -> cgemm_r #define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m #define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n #define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn +#endif +#ifdef BUILD_COMPLEX16 #define ZGEMM_P gotoblas -> zgemm_p #define ZGEMM_Q gotoblas -> zgemm_q #define ZGEMM_R gotoblas -> zgemm_r #define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m #define ZGEMM_UNROLL_N gotoblas -> 
zgemm_unroll_n #define ZGEMM_UNROLL_MN gotoblas -> zgemm_unroll_mn +#endif #define XGEMM_P gotoblas -> xgemm_p #define XGEMM_Q gotoblas -> xgemm_q From 26611af8e1af43941ac02c642c16a64a37390304 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:20:05 +0200 Subject: [PATCH 212/349] fix grouping of sources used for more than one type --- cmake/lapack.cmake | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 18a74d18ef..73f2592ef0 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -1,11 +1,12 @@ # Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files. set(ALLAUX ilaenv.f ilaenv2stage.f ieeeck.f lsamen.f iparmq.f iparam2stage.F - ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f + ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f ../INSTALL/ilaver.f xerbla_array.f ../INSTALL/slamch.f) set(SCLAUX + scombssq.f sbdsvdx.f sstevx.f sstein.f sbdsdc.f sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f @@ -25,6 +26,7 @@ set(SCLAUX set(DZLAUX dbdsdc.f + dbdsvdx.f dstevx.f dstein.f dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f @@ -35,14 +37,14 @@ set(DZLAUX dlartg.f dlaruv.f dlas2.f dlascl.f dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f - dlaset.f dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f + dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f dlasr.f dlasrt.f dlassq.f dlasv2.f dpttrf.f dstebz.f dstedc.f dsteqr.f dsterf.f dlaisnan.f disnan.f dlartgp.f dlartgs.f ../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f) set(SLASRC - sbdsvdx.f sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f + sgbbrd.f sgbcon.f 
sgbequ.f sgbrfs.f sgbsv.f sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f @@ -83,8 +85,8 @@ set(SLASRC ssbev.f ssbevd.f ssbevx.f ssbgst.f ssbgv.f ssbgvd.f ssbgvx.f ssbtrd.f sspcon.f sspev.f sspevd.f sspevx.f sspgst.f sspgv.f sspgvd.f sspgvx.f ssprfs.f sspsv.f sspsvx.f ssptrd.f - ssptrf.f ssptri.f ssptrs.f sstegr.f sstein.f sstev.f sstevd.f sstevr.f - sstevx.f ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f + ssptrf.f ssptri.f ssptrs.f sstegr.f sstev.f sstevd.f sstevr.f + ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f ssygst.f ssygv.f ssygvd.f ssygvx.f ssyrfs.f ssysv.f ssysvx.f ssytd2.f ssytf2.f ssytrd.f ssytrf.f ssytri.f ssytri2.f ssytri2x.f ssyswapr.f ssytrs.f ssytrs2.f @@ -116,7 +118,7 @@ set(SLASRC ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f - scombssq.f sgesvdq.f slaorhr_col_getrfnp.f + sgesvdq.f slaorhr_col_getrfnp.f slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f ) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f @@ -229,7 +231,7 @@ set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_lin_berr.f clarscl2.f clascl2.f cla_wwaddw.f) set(DLASRC - dbdsvdx.f dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f + dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f @@ -270,8 +272,8 @@ set(DLASRC dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f - dsptrf.f dsptri.f dsptrs.f dstegr.f dstein.f dstev.f dstevd.f dstevr.f - dstevx.f dsycon.f dsyev.f dsyevd.f dsyevr.f + dsptrf.f dsptri.f dsptrs.f dstegr.f dstev.f 
dstevd.f dstevr.f + dsycon.f dsyev.f dsyevd.f dsyevr.f dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f dsysv.f dsysvx.f dsytd2.f dsytf2.f dsytrd.f dsytrf.f dsytri.f dsytrs.f dsytrs2.f @@ -474,12 +476,16 @@ endif() if(BUILD_COMPLEX) set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX}) SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN}) - message(STATUS "Building Complex Precision") + message(STATUS "Building Single Precision Complex") endif() if(BUILD_COMPLEX16) set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN}) - message(STATUS "Building Double Complex Precision") +# for zlange/zlanhe + if (NOT BUILD_DOUBLE) + set (LA_REL_SRC ${LA_REL_SRC} dcombssq.f) + endif () + message(STATUS "Building Double Precision Complex") endif() # add lapack-netlib folder to the sources From 3287848c8f45335b9672a3d8cded592451af0d61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:20:51 +0200 Subject: [PATCH 213/349] Support building only selected types --- driver/level2/CMakeLists.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index 8fceba9055..f72e707e18 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -197,6 +197,19 @@ foreach (float_type ${FLOAT_TYPES}) endif () endforeach () +if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) + if (USE_THREAD) + GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE") + GenerateNamedObjects("gemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "SINGLE") + endif () +endif () +if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + if (USE_THREAD) + GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "DOUBLE") + GenerateNamedObjects("gemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "DOUBLE") + endif () +endif () + if (USE_THREAD)
GenerateCombinationObjects("${UL_SMP_SOURCES}" "LOWER" "U" "" 2) endif () From e5e2fbd593f78f6113b0dcee88cb3b63b613e53b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:21:30 +0200 Subject: [PATCH 214/349] Support building only selected types --- driver/level3/CMakeLists.txt | 37 +++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index f788c45b98..46cbb0d6d1 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -14,6 +14,24 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) endif () endforeach () +if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) +foreach (GEMM_DEFINE ${GEMM_DEFINES}) + string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "DOUBLE") + if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "DOUBLE") + endif() +endforeach() +endif() +if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) +foreach (GEMM_DEFINE ${GEMM_DEFINES}) + string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "SINGLE") + if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "SINGLE") + endif() +endforeach() +endif() set(TRMM_TRSM_SOURCES trmm_L.c @@ -100,7 +118,24 @@ foreach (float_type ${FLOAT_TYPES}) endif() endif () endforeach () - + + if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + foreach (gemm_define ${GEMM_COMPLEX_DEFINES}) + string(TOLOWER ${gemm_define} gemm_define_LC) + if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "${gemm_define};THREADED_LEVEL3" "gemm_thread_${gemm_define_LC}" false "" "" false 
"DOUBLE" ) + endif() + endforeach() + endif () + if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) + foreach (gemm_define ${GEMM_COMPLEX_DEFINES}) + string(TOLOWER ${gemm_define} gemm_define_LC) + if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) + GenerateNamedObjects("gemm.c" "${gemm_define};THREADED_LEVEL3" "gemm_thread_${gemm_define_LC}" false "" "" false "SINGLE" ) + endif() + endforeach() + endif () + # for gemm3m if(USE_GEMM3M) foreach (GEMM_DEFINE ${GEMM_DEFINES}) From 988a6f429e9d16bb27e73a7a8c859d5aa6e04d58 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:23:33 +0200 Subject: [PATCH 215/349] Add BUILD_vartype defines --- driver/level3/syrk_thread.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c index b26d363c45..753cdb5ca7 100644 --- a/driver/level3/syrk_thread.c +++ b/driver/level3/syrk_thread.c @@ -56,12 +56,16 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( if (!(mode & BLAS_COMPLEX)) { switch (mode & BLAS_PREC) { +#ifdef BUILD_SINGLE case BLAS_SINGLE: mask = SGEMM_UNROLL_MN - 1; break; +#endif +#ifdef BUILD_DOUBLE case BLAS_DOUBLE: mask = DGEMM_UNROLL_MN - 1; break; +#endif #ifdef EXPRECISION case BLAS_XDOUBLE: mask = MAX(QGEMM_UNROLL_M, QGEMM_UNROLL_N) - 1; @@ -70,12 +74,16 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( } } else { switch (mode & BLAS_PREC) { +#ifdef BUILD_COMPLEX case BLAS_SINGLE: mask = CGEMM_UNROLL_MN - 1; break; +#endif +#ifdef BUILD_COMPLEX16 case BLAS_DOUBLE: mask = ZGEMM_UNROLL_MN - 1; break; +#endif #ifdef EXPRECISION case BLAS_XDOUBLE: mask = MAX(XGEMM_UNROLL_M, XGEMM_UNROLL_N) - 1; From 357bff06b5b9ab7f4f1de8084eceb37cdcffa250 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:24:22 +0200 Subject: [PATCH 216/349] Add BUILD_vartype defines --- driver/others/blas_server.c | 11 +++++++++-- driver/others/memory.c | 10 ++++++++-- 2 files 
changed, 17 insertions(+), 4 deletions(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 8d3dda3bf6..acfaed75d2 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -459,13 +459,16 @@ blas_queue_t *tscq; } else #endif if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) { +#ifdef BUILD_DOUBLE sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - +#endif } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_SINGLE sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - } else { +#endif + } else { /* Other types in future */ } } else { @@ -476,11 +479,15 @@ blas_queue_t *tscq; } else #endif if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#ifdef BUILD_COMPLEX16 sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_COMPLEX sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else { /* Other types in future */ } diff --git a/driver/others/memory.c b/driver/others/memory.c index 9b6c226a1e..08835ed6d9 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2201,9 +2201,15 @@ static void *alloc_mmap(void *address){ #endif #endif - +#ifdef BUILD_DOUBLE allocsize = DGEMM_P * DGEMM_Q * sizeof(double); - +#elif defined(BUILD_COMPLEX16) + allocsize = ZGEMM_P * ZGEMM_Q * sizeof(double); +#elif defined(BUILD_COMPLEX) + allocsize = CGEMM_P * CGEMM_Q * sizeof(double); +#else + allocsize = SGEMM_P * SGEMM_Q * sizeof(double); +#endif start = (BLASULONG)map_address; current = (SCALING - 1) * BUFFER_SIZE; From b475b4bd0dbc0f9c750e6a8a31769a47a777f199 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:25:04 +0200 Subject: [PATCH 217/349] Support 
building only a subset of types --- interface/CMakeLists.txt | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 7a8fc6698d..ad56c6dbaa 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -83,8 +83,12 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) #sdsdot, dsdot + if (BUILD_SINGLE OR BUILD_DOUBLE) GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE") +endif () +if (BUILD_DOUBLE) GenerateNamedObjects("dsdot.c" "" "dsdot" ${CBLAS_FLAG} "" "" true "SINGLE") +endif () # trmm is trsm with a compiler flag set GenerateNamedObjects("trsm.c" "TRMM" "trmm" ${CBLAS_FLAG}) @@ -167,4 +171,31 @@ if (NOT DEFINED NO_LAPACK) GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" 0 3) endif () +if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) + GenerateNamedObjects("scal.c" "" "scal" 0 "" "" false "SINGLE") + GenerateNamedObjects("copy.c" "" "copy" 0 "" "" false "SINGLE") + GenerateNamedObjects("dot.c" "" "dot" 0 "" "" false "SINGLE") + GenerateNamedObjects("rot.c" "" "rot" 0 "" "" false "SINGLE") + GenerateNamedObjects("nrm2.c" "" "nrm2" 0 "" "" false "SINGLE") + GenerateNamedObjects("gemv.c" "" "gemv" 0 "" "" false "SINGLE") + GenerateNamedObjects("gemm.c" "" "gemm" 0 "" "" false "SINGLE") + GenerateNamedObjects("asum.c" "" "asum" 0 "" "" false "SINGLE") + GenerateNamedObjects("swap.c" "" "swap" 0 "" "" false "SINGLE") + GenerateNamedObjects("axpy.c" "" "axpy" 0 "" "" false "SINGLE") + GenerateNamedObjects("imax.c" "USE_ABS" "i*amax" 0 "" "" false "SINGLE") +endif () +if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + GenerateNamedObjects("scal.c" "" "scal" 0 "" "" false "DOUBLE") + GenerateNamedObjects("copy.c" "" "copy" 0 "" "" false "DOUBLE") + GenerateNamedObjects("dot.c" "" "dot" 0 "" "" false "DOUBLE") + 
GenerateNamedObjects("rot.c" "" "rot" 0 "" "" false "DOUBLE") + GenerateNamedObjects("nrm2.c" "" "nrm2" 0 "" "" false "DOUBLE") + GenerateNamedObjects("gemv.c" "" "gemv" 0 "" "" false "DOUBLE") + GenerateNamedObjects("gemm.c" "" "gemm" 0 "" "" false "DOUBLE") + GenerateNamedObjects("asum.c" "" "asum" 0 "" "" false "DOUBLE") + GenerateNamedObjects("swap.c" "" "swap" 0 "" "" false "DOUBLE") + GenerateNamedObjects("axpy.c" "" "axpy" 0 "" "" false "DOUBLE") + GenerateNamedObjects("imax.c" "USE_ABS" "i*amax" 0 "" "" false "DOUBLE") +endif () + add_library(interface OBJECT ${OPENBLAS_SRC}) From dfbc62ef7e89e448f2a57f3aaf72a11dae61bbd2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:25:59 +0200 Subject: [PATCH 218/349] Support building only a subset of types --- kernel/CMakeLists.txt | 94 +++++++++++- kernel/setparam-ref.c | 345 +++++++++++++++++++++++++++++++++++++++--- 2 files changed, 421 insertions(+), 18 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 84dd949a43..c81f2bf255 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -91,6 +91,59 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE") + if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SASUMKERNEL}" "" "asum_k" false "" "" false "SINGLE") + if (DEFINED SMAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${SMAXKERNEL}" "" "max_k" false "" "" false "SINGLE") + endif () + if (DEFINED SMINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${SMINKERNEL}" "USE_MIN" "min_k" false "" "" false "SINGLE") + 
endif () + if (DEFINED ISMINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${ISMINKERNEL}" "USE_MIN" "i*min_k" false "" "" false "SINGLE") + endif () + if (DEFINED ISMAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${ISMAXKERNEL}" "" "i*max_k" false "" "" false "SINGLE") + endif () + GenerateNamedObjects("${KERNELDIR}/${ISAMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${ISAMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SSCALKERNEL}" "" "scal_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SCOPYKERNEL}" "" "copy_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SSWAPKERNEL}" "" "swap_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SAXPYKERNEL}" "" "axpy_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE") + endif () + if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DASUMKERNEL}" "" "asum_k" false "" "" false "DOUBLE") + if (DEFINED DMAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${DMAXKERNEL}" "" "max_k" false "" "" false "DOUBLE") + endif () + if (DEFINED DMINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${DMINKERNEL}" "USE_MIN" "min_k" false "" "" false "DOUBLE") + endif () + if (DEFINED IDMINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${IDMINKERNEL}" "USE_MIN" "i*min_k" false "" "" false "DOUBLE") + endif () + if (DEFINED IDMAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${IDMAXKERNEL}" "" 
"i*max_k" false "" "" false "DOUBLE") + endif () + GenerateNamedObjects("${KERNELDIR}/${IDAMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${IDAMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DSCALKERNEL}" "" "scal_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") + endif () + # Makefile.L2 GenerateCombinationObjects("generic/symv_k.c" "LOWER" "U" "" 1 "" "" 3) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) @@ -124,7 +177,14 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false ${float_type}) endif () endforeach () - + if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + GenerateNamedObjects("${KERNELDIR}/${DGEMVNKERNEL}" "" "gemv_n" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "DOUBLE") + endif () + if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") + endif () # Makefile.L3 set(USE_TRMM false) if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR 
(TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE)) @@ -159,6 +219,38 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) endforeach() + if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "" "gemm_kernel" false "" "" false "DOUBLE") + if (DGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${DGEMMINCOPY}" "DOUBLE" "${DGEMMINCOPYOBJ}" false "" "" true "DOUBLE") + endif () + if (DGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${DGEMMITCOPY}" "DOUBLE" "${DGEMMITCOPYOBJ}" false "" "" true "DOUBLE") + endif () + if (DGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${DGEMMONCOPY}" "DOUBLE" "${DGEMMONCOPYOBJ}" false "" "" true "DOUBLE") + endif () + if (DGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${DGEMMOTCOPY}" "DOUBLE" "${DGEMMOTCOPYOBJ}" false "" "" true "DOUBLE") + endif () + GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}" "" "gemm_beta" false "" "" false "DOUBLE") + endif () + if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMMKERNEL}" "" "gemm_kernel" false "" "" false "SINGLE") + if (SGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") + endif () + GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE") + endif () foreach (float_type ${FLOAT_TYPES})
string(SUBSTRING ${float_type} 0 1 float_char) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index c43520310b..550af86a6c 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -114,6 +114,7 @@ gotoblas_t TABLE_NAME = { #endif #endif +#if defined( BUILD_SINGLE) || defined(BUILD_COMPLEX) 0, 0, 0, SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, #ifdef SGEMM_DEFAULT_UNROLL_MN @@ -121,7 +122,7 @@ gotoblas_t TABLE_NAME = { #else MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N), #endif - +#endif #ifdef HAVE_EXCLUSIVE_CACHE 1, @@ -129,19 +130,34 @@ gotoblas_t TABLE_NAME = { 0, #endif +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) samax_kTS, samin_kTS, smax_kTS, smin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, - snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS, - dsdot_kTS, - srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, - sgemv_nTS, sgemv_tTS, sger_kTS, + snrm2_kTS, sasum_kTS, +#endif +#ifdef BUILD_SINGLE + ssum_kTS, +#endif + +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) + scopy_kTS, sdot_kTS, +// dsdot_kTS, + srot_kTS, saxpy_kTS, + sscal_kTS, + sswap_kTS, + sgemv_nTS, sgemv_tTS, +#endif +#ifdef BUILD_SINGLE + sger_kTS, ssymv_LTS, ssymv_UTS, #ifdef ARCH_X86_64 sgemm_directTS, sgemm_direct_performantTS, #endif - +#endif + +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) sgemm_kernelTS, sgemm_betaTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N sgemm_incopyTS, sgemm_itcopyTS, @@ -149,6 +165,9 @@ gotoblas_t TABLE_NAME = { sgemm_oncopyTS, sgemm_otcopyTS, #endif sgemm_oncopyTS, sgemm_otcopyTS, +#endif + +#ifdef BUILD_SINGLE strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N strsm_iunucopyTS, strsm_iunncopyTS, strsm_iutucopyTS, strsm_iutncopyTS, @@ -182,6 +201,9 @@ gotoblas_t TABLE_NAME = { NULL,NULL, #endif +#endif + +#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) 0, 0, 0, DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, #ifdef 
DGEMM_DEFAULT_UNROLL_MN @@ -189,14 +211,36 @@ gotoblas_t TABLE_NAME = { #else MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N), #endif +#endif + +#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS, - dnrm2_kTS, dasum_kTS, dsum_kTS, dcopy_kTS, ddot_kTS, - drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS, - dgemv_nTS, dgemv_tTS, dger_kTS, + dnrm2_kTS, dasum_kTS, +#endif +#if defined (BUILD_DOUBLE) + dsum_kTS, +#endif +#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) + dcopy_kTS, ddot_kTS, +#endif +#if defined (BUILD_SINGLE) || defined(BUILD_DOUBLE) + dsdot_kTS, +#endif +#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) + drot_kTS, + daxpy_kTS, + dscal_kTS, + dswap_kTS, + dgemv_nTS, dgemv_tTS, +#endif +#if defined (BUILD_DOUBLE) + dger_kTS, dsymv_LTS, dsymv_UTS, +#endif +#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) dgemm_kernelTS, dgemm_betaTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dgemm_incopyTS, dgemm_itcopyTS, @@ -204,6 +248,9 @@ gotoblas_t TABLE_NAME = { dgemm_oncopyTS, dgemm_otcopyTS, #endif dgemm_oncopyTS, dgemm_otcopyTS, +#endif + +#if defined (BUILD_DOUBLE) dtrsm_kernel_LNTS, dtrsm_kernel_LTTS, dtrsm_kernel_RNTS, dtrsm_kernel_RTTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dtrsm_iunucopyTS, dtrsm_iunncopyTS, dtrsm_iutucopyTS, dtrsm_iutncopyTS, @@ -237,6 +284,8 @@ gotoblas_t TABLE_NAME = { NULL, NULL, #endif +#endif + #ifdef EXPRECISION 0, 0, 0, @@ -291,6 +340,7 @@ gotoblas_t TABLE_NAME = { #endif +#ifdef BUILD_COMPLEX 0, 0, 0, CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N, #ifdef CGEMM_DEFAULT_UNROLL_MN @@ -426,6 +476,9 @@ gotoblas_t TABLE_NAME = { NULL, NULL, #endif +#endif + +#ifdef BUILD_COMPLEX16 0, 0, 0, ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, #ifdef ZGEMM_DEFAULT_UNROLL_MN @@ -560,6 +613,8 @@ gotoblas_t TABLE_NAME = { NULL, NULL, #endif +#endif + #ifdef EXPRECISION 0, 0, 0, @@ -691,25 +746,61 @@ gotoblas_t 
TABLE_NAME = { init_parameter, SNUMOPT, DNUMOPT, QNUMOPT, +#ifdef BUILD_SINGLE + saxpby_kTS, +#endif +#ifdef BUILD_DOUBLE + daxpby_kTS, +#endif +#ifdef BUILD_COMPLEX + caxpby_kTS, +#endif +#ifdef BUILD_COMPLEX16 + zaxpby_kTS, +#endif - saxpby_kTS, daxpby_kTS, caxpby_kTS, zaxpby_kTS, - +#ifdef BUILD_SINGLE somatcopy_k_cnTS, somatcopy_k_ctTS, somatcopy_k_rnTS, somatcopy_k_rtTS, +#endif +#ifdef BUILD_DOUBLE domatcopy_k_cnTS, domatcopy_k_ctTS, domatcopy_k_rnTS, domatcopy_k_rtTS, +#endif +#ifdef BUILD_COMPLEX comatcopy_k_cnTS, comatcopy_k_ctTS, comatcopy_k_rnTS, comatcopy_k_rtTS, comatcopy_k_cncTS, comatcopy_k_ctcTS, comatcopy_k_rncTS, comatcopy_k_rtcTS, +#endif +#ifdef BUILD_COMPLEX16 zomatcopy_k_cnTS, zomatcopy_k_ctTS, zomatcopy_k_rnTS, zomatcopy_k_rtTS, zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS, +#endif +#ifdef BUILD_SINGLE simatcopy_k_cnTS, simatcopy_k_ctTS, simatcopy_k_rnTS, simatcopy_k_rtTS, +#endif +#ifdef BUILD_DOUBLE dimatcopy_k_cnTS, dimatcopy_k_ctTS, dimatcopy_k_rnTS, dimatcopy_k_rtTS, +#endif +#ifdef BUILD_COMPLEX cimatcopy_k_cnTS, cimatcopy_k_ctTS, cimatcopy_k_rnTS, cimatcopy_k_rtTS, cimatcopy_k_cncTS, cimatcopy_k_ctcTS, cimatcopy_k_rncTS, cimatcopy_k_rtcTS, +#endif +#ifdef BUILD_COMPLEX16 zimatcopy_k_cnTS, zimatcopy_k_ctTS, zimatcopy_k_rnTS, zimatcopy_k_rtTS, zimatcopy_k_cncTS, zimatcopy_k_ctcTS, zimatcopy_k_rncTS, zimatcopy_k_rtcTS, +#endif - sgeadd_kTS, dgeadd_kTS, cgeadd_kTS, zgeadd_kTS - +#ifdef BUILD_SINGLE + sgeadd_kTS, +#endif +#ifdef BUILD_DOUBLE + dgeadd_kTS, +#endif +#ifdef BUILD_COMPLEX + cgeadd_kTS, +#endif +#ifdef BUILD_COMPLEX16 + zgeadd_kTS +#endif }; #if defined(ARCH_ARM64) @@ -717,26 +808,50 @@ static void init_parameter(void) { #if defined(BUILD_HALF) TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; #endif +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 
CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #if defined(BUILD_HALF) TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; #endif +#ifdef BUILD_SINGLE TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; +#endif #if defined(BUILD_HALF) TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; #endif +#ifdef BUILD_SINGLE TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; @@ -989,22 +1104,34 @@ static void init_parameter(void) { TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; #endif +#ifdef BUILD_SINGLE TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; +#endif +#ifdef BUILD_COMPLEX #ifdef CGEMM3M_DEFAULT_Q TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; #else TABLE_NAME.cgemm3m_q = SGEMM_DEFAULT_Q; #endif +#endif +#ifdef BUILD_COMPLEX16 #ifdef ZGEMM3M_DEFAULT_Q TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; #else TABLE_NAME.zgemm3m_q = DGEMM_DEFAULT_Q; #endif +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; @@ -1018,10 +1145,18 @@ static void init_parameter(void) { fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = 64 * (l2 >> 7); +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = 32 * (l2 >> 7); +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 32 * (l2 >> 7); +#endif +#ifdef BUILD_COMPLEX16 
TABLE_NAME.zgemm_p = 16 * (l2 >> 7); +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 16 * (l2 >> 7); TABLE_NAME.xgemm_p = 8 * (l2 >> 7); @@ -1034,10 +1169,18 @@ static void init_parameter(void) { fprintf(stderr, "Northwood\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = 96 * (l2 >> 7); +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = 48 * (l2 >> 7); +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 48 * (l2 >> 7); +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 24 * (l2 >> 7); +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 24 * (l2 >> 7); TABLE_NAME.xgemm_p = 12 * (l2 >> 7); @@ -1050,10 +1193,18 @@ static void init_parameter(void) { fprintf(stderr, "Atom\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = 256; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = 128; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 128; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 64; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 64; TABLE_NAME.xgemm_p = 32; @@ -1066,10 +1217,18 @@ static void init_parameter(void) { fprintf(stderr, "Prescott\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = 56 * (l2 >> 7); +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = 28 * (l2 >> 7); +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 28 * (l2 >> 7); +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 14 * (l2 >> 7); +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 14 * (l2 >> 7); TABLE_NAME.xgemm_p = 7 * (l2 >> 7); @@ -1082,10 +1241,18 @@ static void init_parameter(void) { fprintf(stderr, "Core2\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = 92 * (l2 >> 9) + 8; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = 46 * (l2 >> 9) + 8; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 46 * (l2 >> 9) + 4; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 23 * (l2 >> 9) + 4; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 92 * (l2 >> 
9) + 8; TABLE_NAME.xgemm_p = 46 * (l2 >> 9) + 4; @@ -1098,10 +1265,18 @@ static void init_parameter(void) { fprintf(stderr, "Penryn\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 42 * (l2 >> 9) + 8; TABLE_NAME.xgemm_p = 21 * (l2 >> 9) + 4; @@ -1114,10 +1289,18 @@ static void init_parameter(void) { fprintf(stderr, "Dunnington\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 42 * (l2 >> 9) + 8; TABLE_NAME.xgemm_p = 21 * (l2 >> 9) + 4; @@ -1131,10 +1314,18 @@ static void init_parameter(void) { fprintf(stderr, "Nehalem\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1147,10 +1338,18 @@ static void init_parameter(void) { fprintf(stderr, "Sandybridge\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION 
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1163,10 +1362,18 @@ static void init_parameter(void) { fprintf(stderr, "Haswell\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1179,10 +1386,18 @@ static void init_parameter(void) { fprintf(stderr, "SkylakeX\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1196,10 +1411,18 @@ static void init_parameter(void) { fprintf(stderr, "Opteron\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = 224 + 56 * (l2 >> 7); +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = 112 + 28 * (l2 >> 7); +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = 112 + 28 * (l2 >> 7); +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 56 + 14 * (l2 >> 7); +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = 56 + 14 * (l2 >> 7); TABLE_NAME.xgemm_p = 28 + 7 * (l2 >> 7); @@ -1212,10 +1435,18 @@ static void init_parameter(void) { fprintf(stderr, "Barcelona\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif 
#ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1228,10 +1459,18 @@ static void init_parameter(void) { fprintf(stderr, "Bobcate\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1244,10 +1483,18 @@ static void init_parameter(void) { fprintf(stderr, "Bulldozer\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1260,10 +1507,18 @@ static void init_parameter(void) { fprintf(stderr, "Excavator\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1277,10 +1532,18 @@ static void init_parameter(void) { fprintf(stderr, "Piledriver\n"); #endif +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef 
EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1293,10 +1556,18 @@ static void init_parameter(void) { fprintf(stderr, "Steamroller\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1309,10 +1580,18 @@ static void init_parameter(void) { fprintf(stderr, "Zen\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; @@ -1326,11 +1605,18 @@ static void init_parameter(void) { fprintf(stderr, "NANO\n"); #endif +#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; - +#endif #ifdef EXPRECISION @@ -1340,41 +1626,55 @@ static void init_parameter(void) { #endif - +#ifdef BUILD_COMPLEX #ifdef CGEMM3M_DEFAULT_P TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; #else TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; #endif +#endif +#ifdef BUILD_COMPLEX16 #ifdef ZGEMM3M_DEFAULT_P TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; #else TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; #endif +#endif #ifdef EXPRECISION TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; #endif - +#ifdef BUILD_SINGLE TABLE_NAME.sgemm_p = 
((TABLE_NAME.sgemm_p + SGEMM_DEFAULT_UNROLL_M - 1)/SGEMM_DEFAULT_UNROLL_M) * SGEMM_DEFAULT_UNROLL_M; +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_p = ((TABLE_NAME.dgemm_p + DGEMM_DEFAULT_UNROLL_M - 1)/DGEMM_DEFAULT_UNROLL_M) * DGEMM_DEFAULT_UNROLL_M; +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_p = ((TABLE_NAME.cgemm_p + CGEMM_DEFAULT_UNROLL_M - 1)/CGEMM_DEFAULT_UNROLL_M) * CGEMM_DEFAULT_UNROLL_M; +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ((TABLE_NAME.zgemm_p + ZGEMM_DEFAULT_UNROLL_M - 1)/ZGEMM_DEFAULT_UNROLL_M) * ZGEMM_DEFAULT_UNROLL_M; +#endif +#ifdef BUILD_COMPLEX #ifdef CGEMM3M_DEFAULT_UNROLL_M TABLE_NAME.cgemm3m_p = ((TABLE_NAME.cgemm3m_p + CGEMM3M_DEFAULT_UNROLL_M - 1)/CGEMM3M_DEFAULT_UNROLL_M) * CGEMM3M_DEFAULT_UNROLL_M; #else TABLE_NAME.cgemm3m_p = ((TABLE_NAME.cgemm3m_p + SGEMM_DEFAULT_UNROLL_M - 1)/SGEMM_DEFAULT_UNROLL_M) * SGEMM_DEFAULT_UNROLL_M; #endif +#endif +#ifdef BUILD_COMPLEX16 #ifdef ZGEMM3M_DEFAULT_UNROLL_M TABLE_NAME.zgemm3m_p = ((TABLE_NAME.zgemm3m_p + ZGEMM3M_DEFAULT_UNROLL_M - 1)/ZGEMM3M_DEFAULT_UNROLL_M) * ZGEMM3M_DEFAULT_UNROLL_M; #else TABLE_NAME.zgemm3m_p = ((TABLE_NAME.zgemm3m_p + DGEMM_DEFAULT_UNROLL_M - 1)/DGEMM_DEFAULT_UNROLL_M) * DGEMM_DEFAULT_UNROLL_M; #endif +#endif #ifdef QUAD_PRECISION TABLE_NAME.qgemm_p = ((TABLE_NAME.qgemm_p + QGEMM_DEFAULT_UNROLL_M - 1)/QGEMM_DEFAULT_UNROLL_M) * QGEMM_DEFAULT_UNROLL_M; @@ -1386,15 +1686,19 @@ static void init_parameter(void) { fprintf(stderr, "L2 = %8d DGEMM_P .. 
%d\n", l2, TABLE_NAME.dgemm_p); #endif +#ifdef BUILD_SINGLE TABLE_NAME.sgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.sgemm_q * 4) - 15) & ~15); +#endif +#ifdef BUILD_DOUBLE TABLE_NAME.dgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.dgemm_q * 8) - 15) & ~15); +#endif #ifdef EXPRECISION TABLE_NAME.qgemm_r = (((BUFFER_SIZE - @@ -1403,26 +1707,33 @@ static void init_parameter(void) { ) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15); #endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.cgemm_q * 8) - 15) & ~15); +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15); +#endif +#ifdef BUILD_COMPLEX TABLE_NAME.cgemm3m_r = (((BUFFER_SIZE - ((TABLE_NAME.cgemm3m_p * TABLE_NAME.cgemm3m_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.cgemm3m_q * 8) - 15) & ~15); +#endif +#ifdef BUILD_COMPLEX16 TABLE_NAME.zgemm3m_r = (((BUFFER_SIZE - ((TABLE_NAME.zgemm3m_p * TABLE_NAME.zgemm3m_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.zgemm3m_q * 16) - 15) & ~15); - +#endif From 0eaae30e8c0b9f80426a0557de774680b0e4ab5f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:28:03 +0200 Subject: [PATCH 219/349] Adapt tests to having only a subset of types in the build --- test/CMakeLists.txt | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f1f773cbaf..360ff2151b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ 
-4,7 +4,7 @@ include_directories(${PROJECT_BINARY_DIR}) enable_language(Fortran) if (BUILD_SINGLE) - list( APPEND OpenBLAS_Tests sblat1 sblat2 sblat3) + list( APPEND OpenBLAS_Tests sblat1 sblat2 sblat3) endif() if (BUILD_DOUBLE) list (APPEND OpenBLAS_Tests dblat1 dblat2 dblat3) @@ -17,7 +17,7 @@ if (BUILD_COMPLEX16) endif() foreach(test_bin ${OpenBLAS_Tests}) -add_executable(${test_bin} ${test_bin}.f) + add_executable(${test_bin} ${test_bin}.f) target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME}) endforeach() @@ -34,7 +34,19 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh "fi\n" ) -set(float_types s d c z) +#set(float_types s d c z) +if (BUILD_SINGLE) + list (APPEND float_types s) +endif() +if (BUILD_DOUBLE) + list (APPEND float_types d) +endif() +if (BUILD_COMPLEX) + list (APPEND float_types c) +endif() +if (BUILD_COMPLEX16) + list (APPEND float_types z) +endif() foreach(float_type ${float_types}) string(TOUPPER ${float_type} float_type_upper) add_test(NAME "${float_type}blas1" From 98153875e94c4c33d9cc4583711130cf8e23b8d0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Sep 2020 23:28:57 +0200 Subject: [PATCH 220/349] Adapt tests to having only a subset of types in the library --- utest/test_potrs.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/utest/test_potrs.c b/utest/test_potrs.c index 2681615f46..f39287d6f3 100644 --- a/utest/test_potrs.c +++ b/utest/test_potrs.c @@ -529,16 +529,20 @@ CTEST(potrf, smoketest_trivial){ for (j = 0; j < n; ++j) { double err; +#ifdef BUILD_SINGLE err = fabs(A1s[i+n*j] - Bs[i+n*j]); if (err > 1e-5) { CTEST_ERR("%s:%d %c s(%d,%d) difference: %g", __FILE__, __LINE__, uplo, i, j, err); } - +#endif +#ifdef BUILD_DOUBLE err = fabs(A1d[i+n*j] - Bd[i+n*j]); if (err > 1e-12) { CTEST_ERR("%s:%d %c d(%d,%d) difference: %g", __FILE__, __LINE__, uplo, i, j, err); } +#endif +#ifdef BUILD_COMPLEX #ifdef OPENBLAS_COMPLEX_C99 err = cabsf(A1c[i+n*j] - Bc[i+n*j]); #else @@ -548,7 +552,9 @@ 
CTEST(potrf, smoketest_trivial){ if (err > 1e-5) { CTEST_ERR("%s:%d %c c(%d,%d) difference: %g", __FILE__, __LINE__, uplo, i, j, err); } +#endif +#ifdef BUILD_COMPLEX16 #ifdef OPENBLAS_COMPLEX_C99 err = cabs(A1z[i+n*j] - Bz[i+n*j]); #else @@ -558,6 +564,7 @@ CTEST(potrf, smoketest_trivial){ if (err > 1e-12) { CTEST_ERR("%s:%d %c z(%d,%d) difference: %g", __FILE__, __LINE__, uplo, i, j, err); } +#endif } } } From f2e9a24e1a6da1eb3c297e979ac23f47d3685b07 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Sep 2020 19:02:20 +0200 Subject: [PATCH 221/349] Add AWS Graviton2 build --- .travis.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/.travis.yml b/.travis.yml index 482b4f6481..4bfdf485c3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -233,6 +233,21 @@ matrix: - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" + + - &test-graviton2 + os: linux + arch: arm64-graviton2 + dist: focal + group: edge + virt: lxd + compiler: gcc + addons: + apt: + packages: + - gfortran + script: + - travis_wait 45 make && make lapack-test + # whitelist branches: only: From c5a32288c6058223ada420a9e25a4533cf9475bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Sep 2020 23:24:37 +0200 Subject: [PATCH 222/349] Work around sgemm_r/dgemm_r not being properly defined with BUILD_COMPLEX/BUILD_COMPLEX16 --- common_param.h | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/common_param.h b/common_param.h index a689ddf7d9..b6abc4e744 100644 --- a/common_param.h +++ b/common_param.h @@ -189,14 +189,14 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, 
float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#endif +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) #ifdef ARCH_X86_64 void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); #endif -#endif -#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -1085,7 +1085,7 @@ extern gotoblas_t *gotoblas; #define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if defined (BUILD_SINGLE) #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_R gotoblas -> sgemm_r @@ -1094,7 +1094,7 @@ extern gotoblas_t *gotoblas; #define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn #endif -#if defined (BUILD_DOUBLE) || defined (BUILD_COMPLEX16) +#if defined (BUILD_DOUBLE) #define DGEMM_P gotoblas -> dgemm_p #define DGEMM_Q gotoblas -> dgemm_q #define DGEMM_R gotoblas -> dgemm_r @@ -1117,6 +1117,14 @@ extern gotoblas_t *gotoblas; #define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m #define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n #define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn +#ifndef BUILD_SINGLE +#define SGEMM_P gotoblas -> sgemm_p +#define SGEMM_Q gotoblas -> sgemm_q +#define SGEMM_R 1024 +#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m +#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n +#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn +#endif #endif #ifdef BUILD_COMPLEX16 @@ -1126,6 +1134,14 @@ extern gotoblas_t *gotoblas; #define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m #define ZGEMM_UNROLL_N gotoblas -> zgemm_unroll_n #define ZGEMM_UNROLL_MN 
gotoblas -> zgemm_unroll_mn +#ifndef BUILD_DOUBLE +#define DGEMM_P gotoblas -> dgemm_p +#define DGEMM_Q gotoblas -> dgemm_q +#define DGEMM_R 1024 +#define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m +#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n +#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn +#endif #endif #define XGEMM_P gotoblas -> xgemm_p From 896bbd55e19aa628fb1438333d1376b27c0bcd65 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Sep 2020 23:25:55 +0200 Subject: [PATCH 223/349] Add support for building only selected variable types --- driver/others/blas_server_omp.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index d126955e4d..bdb5ebfd24 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -300,12 +300,15 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ } else #endif if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#if defined ( BUILD_DOUBLE) || defined (BUILD_COMPLEX16) sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); - +#endif } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE){ +#if defined (BUILD_SINGLE) || defined (BUILD_COMPLEX) sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#endif } else { /* Other types in future */ } @@ -317,15 +320,24 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ } else #endif if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ +#ifdef BUILD_COMPLEX16 sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); +#else +fprintf(stderr,"UNHANDLED COMPLEX16\n"); +#endif } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { +#ifdef BUILD_COMPLEX sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) + GEMM_ALIGN) & ~GEMM_ALIGN)) + 
GEMM_OFFSET_B); +#else +fprintf(stderr,"UNHANDLED COMPLEX\n"); +#endif } else { /* Other types in future */ } } +if (!sb) fprintf(stderr,"SB not declared!!!\n"); queue->sb=sb; } } From 881c15179f93c96d9567ef74dceef1dfdbd5ccfa Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Sun, 27 Sep 2020 09:35:50 +0800 Subject: [PATCH 224/349] remove default support for FMA4 on zen architect --- getarch.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/getarch.c b/getarch.c index 83043bdf22..e2c22d3a07 100644 --- a/getarch.c +++ b/getarch.c @@ -492,7 +492,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \ - "-DHAVE_AVX -DHAVE_FMA4" + "-DHAVE_AVX" #define LIBNAME "bulldozer" #define CORENAME "BULLDOZER" #endif @@ -508,7 +508,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ - "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" + "-DHAVE_AVX -DHAVE_FMA3" #define LIBNAME "piledriver" #define CORENAME "PILEDRIVER" #endif @@ -524,7 +524,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ - "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" + "-DHAVE_AVX -DHAVE_FMA3" #define LIBNAME "steamroller" #define CORENAME "STEAMROLLER" #endif @@ -540,7 +540,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ - "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" + "-DHAVE_AVX -DHAVE_FMA3" #define LIBNAME "excavator" #define CORENAME "EXCAVATOR" #endif From 7f539fb850a89b216c2d95aa48c9c36236c56767 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 27 Sep 2020 22:48:41 +0200 Subject: [PATCH 225/349] Update cpu list, outline cmake build, clarify scope of set_num_threads extension --- README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f8226f5cb1..6d44129c25 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,10 @@ Building OpenBLAS requires the following to be installed: Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically. To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`. -The full target list is in the file `TargetList.txt`. +The full target list is in the file `TargetList.txt`. For building with `cmake`, the +usual conventions apply, i.e. create a build directory either underneath the toplevel +OpenBLAS source directory or separate from it, and invoke `cmake` there with the path +to the source tree and any build options you plan to set. 
### Cross compile @@ -152,13 +155,17 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **Falkor**: same as A57 (different cpu specifications) - **ThunderX**: Optimized some Level-1 functions - **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2 +- **ThunderX3T110** - **TSV110**: Optimized some Level-3 helper functions - **EMAG 8180**: preliminary support based on A57 +- **Neoverse N1**: (AWS Graviton2) preliminary support +- **Apple Vortex**: preliminary support based on ARMV8 #### PPC/PPC64 - **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1` - **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. +- **POWER10**: #### IBM zEnterprise System @@ -226,7 +233,8 @@ We provide the following functions to control the number of threads at runtime: void goto_set_num_threads(int num_threads); void openblas_set_num_threads(int num_threads); ``` - +Note that these are only used once at library initialization, and are not available for +fine-tuning thread numbers in individual BLAS calls. If you compile this library with `USE_OPENMP=1`, you should use the above functions too. ## Reporting bugs From 7ed25e9e1010faa94a04d694080f982ed9e60b53 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 27 Sep 2020 22:59:20 +0200 Subject: [PATCH 226/349] FIx underflow/rounding errors in LAPACK (S,D)LANV2 Reference-LAPACK PR 445, fixing their issue 263 --- lapack-netlib/SRC/dlanv2.f | 28 ++++++++++++++++++++++++++-- lapack-netlib/SRC/slanv2.f | 28 ++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/lapack-netlib/SRC/dlanv2.f b/lapack-netlib/SRC/dlanv2.f index d68481f7e7..61b016f168 100644 --- a/lapack-netlib/SRC/dlanv2.f +++ b/lapack-netlib/SRC/dlanv2.f @@ -140,13 +140,16 @@ SUBROUTINE DLANV2( A, B, C, D, RT1R, RT1I, RT2R, RT2I, CS, SN ) * * .. Parameters .. 
DOUBLE PRECISION ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0 ) + PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0, + $ TWO = 2.0D0 ) DOUBLE PRECISION MULTPL PARAMETER ( MULTPL = 4.0D+0 ) * .. * .. Local Scalars .. DOUBLE PRECISION AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB, - $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z + $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, + $ SAFMN2, SAFMX2 + INTEGER COUNT * .. * .. External Functions .. DOUBLE PRECISION DLAMCH, DLAPY2 @@ -157,7 +160,11 @@ SUBROUTINE DLANV2( A, B, C, D, RT1R, RT1I, RT2R, RT2I, CS, SN ) * .. * .. Executable Statements .. * + SAFMIN = DLAMCH( 'S' ) EPS = DLAMCH( 'P' ) + SAFMN2 = DLAMCH( 'B' )**INT( LOG( SAFMIN / EPS ) / + $ LOG( DLAMCH( 'B' ) ) / TWO ) + SAFMX2 = ONE / SAFMN2 IF( C.EQ.ZERO ) THEN CS = ONE SN = ZERO @@ -212,7 +219,24 @@ SUBROUTINE DLANV2( A, B, C, D, RT1R, RT1I, RT2R, RT2I, CS, SN ) * Complex eigenvalues, or real (almost) equal eigenvalues. * Make diagonal elements equal. * + COUNT = 0 SIGMA = B + C + 10 CONTINUE + COUNT = COUNT + 1 + SCALE = MAX( ABS(TEMP), ABS(SIGMA) ) + IF( SCALE.GE.SAFMX2 ) THEN + SIGMA = SIGMA * SAFMN2 + TEMP = TEMP * SAFMN2 + IF (COUNT .LE. 20) + $ GOTO 10 + END IF + IF( SCALE.LE.SAFMN2 ) THEN + SIGMA = SIGMA * SAFMX2 + TEMP = TEMP * SAFMX2 + IF (COUNT .LE. 20) + $ GOTO 10 + END IF + P = HALF*TEMP TAU = DLAPY2( SIGMA, TEMP ) CS = SQRT( HALF*( ONE+ABS( SIGMA ) / TAU ) ) SN = -( P / ( TAU*CS ) )*SIGN( ONE, SIGMA ) diff --git a/lapack-netlib/SRC/slanv2.f b/lapack-netlib/SRC/slanv2.f index 1163446fae..e678305f20 100644 --- a/lapack-netlib/SRC/slanv2.f +++ b/lapack-netlib/SRC/slanv2.f @@ -140,13 +140,16 @@ SUBROUTINE SLANV2( A, B, C, D, RT1R, RT1I, RT2R, RT2I, CS, SN ) * * .. Parameters .. REAL ZERO, HALF, ONE - PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0 ) + PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0, + $ TWO = 2.0E+0 ) REAL MULTPL PARAMETER ( MULTPL = 4.0E+0 ) * .. * .. Local Scalars .. 
REAL AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB, - $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z + $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, + $ SAFMN2, SAFMX2 + INTEGER COUNT * .. * .. External Functions .. REAL SLAMCH, SLAPY2 @@ -157,7 +160,11 @@ SUBROUTINE SLANV2( A, B, C, D, RT1R, RT1I, RT2R, RT2I, CS, SN ) * .. * .. Executable Statements .. * + SAFMIN = SLAMCH( 'S' ) EPS = SLAMCH( 'P' ) + SAFMN2 = SLAMCH( 'B' )**INT( LOG( SAFMIN / EPS ) / + $ LOG( SLAMCH( 'B' ) ) / TWO ) + SAFMX2 = ONE / SAFMN2 IF( C.EQ.ZERO ) THEN CS = ONE SN = ZERO @@ -212,7 +219,24 @@ SUBROUTINE SLANV2( A, B, C, D, RT1R, RT1I, RT2R, RT2I, CS, SN ) * Complex eigenvalues, or real (almost) equal eigenvalues. * Make diagonal elements equal. * + COUNT = 0 SIGMA = B + C + 10 CONTINUE + COUNT = COUNT + 1 + SCALE = MAX( ABS(TEMP), ABS(SIGMA) ) + IF( SCALE.GE.SAFMX2 ) THEN + SIGMA = SIGMA * SAFMN2 + TEMP = TEMP * SAFMN2 + IF (COUNT .LE. 20) + $ GOTO 10 + END IF + IF( SCALE.LE.SAFMN2 ) THEN + SIGMA = SIGMA * SAFMX2 + TEMP = TEMP * SAFMX2 + IF (COUNT .LE. 20) + $ GOTO 10 + END IF + P = HALF*TEMP TAU = SLAPY2( SIGMA, TEMP ) CS = SQRT( HALF*( ONE+ABS( SIGMA ) / TAU ) ) SN = -( P / ( TAU*CS ) )*SIGN( ONE, SIGMA ) From fe8cd5ae7e0958cced30e7086509d286a8442be0 Mon Sep 17 00:00:00 2001 From: Thomas Hisch Date: Mon, 28 Sep 2020 00:42:17 +0200 Subject: [PATCH 227/349] Consolidate usage of backticks for build options There were some build options in the README that were not highlighted. Now all are highlighted. 
--- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 6d44129c25..ca034e7473 100644 --- a/README.md +++ b/README.md @@ -174,18 +174,18 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th ### Support for multiple targets in a single library -OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake. +OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. -For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option DYNAMIC_LIST that allows to specify an individual list of targets to include instead of the default. +For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default. 
-DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias, +`DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias, Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano. On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14. -The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the +The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the common code in the library, usually you will want to set this to the oldest model you expect to encounter. Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library. From 2df4235e00a73ad61b7997c74497fd86eb278ebf Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Sun, 27 Sep 2020 21:42:32 -0500 Subject: [PATCH 228/349] Optimize dcopy/zcopy for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. Tested in simulator and no new failures. 
--- kernel/power/KERNEL.POWER10 | 4 +- kernel/power/dcopy_microk_power10.c | 134 ++++++++++++++++++++++++++++ kernel/power/dcopy_power10.c | 123 +++++++++++++++++++++++++ kernel/power/zcopy_microk_power10.c | 134 ++++++++++++++++++++++++++++ kernel/power/zcopy_power10.c | 132 +++++++++++++++++++++++++++ 5 files changed, 525 insertions(+), 2 deletions(-) create mode 100644 kernel/power/dcopy_microk_power10.c create mode 100644 kernel/power/dcopy_power10.c create mode 100644 kernel/power/zcopy_microk_power10.c create mode 100644 kernel/power/zcopy_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index ec02e09adb..d0cda7fb66 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -151,9 +151,9 @@ endif ZAXPYKERNEL = zaxpy_power10.c # SCOPYKERNEL = scopy.c -DCOPYKERNEL = dcopy.c +DCOPYKERNEL = dcopy_power10.c CCOPYKERNEL = ccopy.c -ZCOPYKERNEL = zcopy.c +ZCOPYKERNEL = zcopy_power10.c # SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c diff --git a/kernel/power/dcopy_microk_power10.c b/kernel/power/dcopy_microk_power10.c new file mode 100644 index 0000000000..8940e0db9c --- /dev/null +++ b/kernel/power/dcopy_microk_power10.c @@ -0,0 +1,134 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_64 1 + +static void dcopy_kernel_64 (long n, double *x, double *y) +{ + __asm__ + ( + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + "lxvp 48, 256(%2) \n\t" + "lxvp 50, 288(%2) \n\t" + "lxvp 52, 320(%2) \n\t" + "lxvp 54, 352(%2) \n\t" + "lxvp 56, 384(%2) \n\t" + "lxvp 58, 416(%2) \n\t" + "lxvp 60, 448(%2) \n\t" + "lxvp 62, 480(%2) \n\t" + "addi %2, %2, 512 \n\t" + + "addic. 
%1, %1, -64 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxvp 32, 0(%3) \n\t" + "lxvp 32, 0(%2) \n\t" + "stxvp 34, 32(%3) \n\t" + "lxvp 34, 32(%2) \n\t" + "stxvp 36, 64(%3) \n\t" + "lxvp 36, 64(%2) \n\t" + "stxvp 38, 96(%3) \n\t" + "lxvp 38, 96(%2) \n\t" + + "stxvp 40, 128(%3) \n\t" + "lxvp 40, 128(%2) \n\t" + "stxvp 42, 160(%3) \n\t" + "lxvp 42, 160(%2) \n\t" + "stxvp 44, 192(%3) \n\t" + "lxvp 44, 192(%2) \n\t" + "stxvp 46, 224(%3) \n\t" + "lxvp 46, 224(%2) \n\t" + + "stxvp 48, 256(%3) \n\t" + "lxvp 48, 256(%2) \n\t" + "stxvp 50, 288(%3) \n\t" + "lxvp 50, 288(%2) \n\t" + "stxvp 52, 320(%3) \n\t" + "lxvp 52, 320(%2) \n\t" + "stxvp 54, 352(%3) \n\t" + "lxvp 54, 352(%2) \n\t" + "stxvp 56, 384(%3) \n\t" + "lxvp 56, 384(%2) \n\t" + "stxvp 58, 416(%3) \n\t" + "lxvp 58, 416(%2) \n\t" + "stxvp 60, 448(%3) \n\t" + "lxvp 60, 448(%2) \n\t" + "stxvp 62, 480(%3) \n\t" + "lxvp 62, 480(%2) \n\t" + + "addi %3, %3, 512 \n\t" + "addi %2, %2, 512 \n\t" + + "addic. %1, %1, -64 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "stxvp 32, 0(%3) \n\t" + "stxvp 34, 32(%3) \n\t" + "stxvp 36, 64(%3) \n\t" + "stxvp 38, 96(%3) \n\t" + "stxvp 40, 128(%3) \n\t" + "stxvp 42, 160(%3) \n\t" + "stxvp 44, 192(%3) \n\t" + "stxvp 46, 224(%3) \n\t" + "stxvp 48, 256(%3) \n\t" + "stxvp 50, 288(%3) \n\t" + "stxvp 52, 320(%3) \n\t" + "stxvp 54, 352(%3) \n\t" + "stxvp 56, 384(%3) \n\t" + "stxvp 58, 416(%3) \n\t" + "stxvp 60, 448(%3) \n\t" + "stxvp 62, 480(%3) \n\t" + + "#n=%1 x=%4=%2 y=%0=%3" + : + "=m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/dcopy_power10.c b/kernel/power/dcopy_power10.c new file mode 100644 index 0000000000..32530d570b --- /dev/null +++ 
b/kernel/power/dcopy_power10.c @@ -0,0 +1,123 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "dcopy_microk_power10.c" +#endif + +#ifndef HAVE_KERNEL_64 + +static void dcopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + dcopy_kernel_64(n1, x, y); + i=n1; + } + + while(i < n) + { + y[i] = x[i] ; + i++ ; + + } + + + } + else + { + + while(i < n) + { + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/zcopy_microk_power10.c b/kernel/power/zcopy_microk_power10.c new file mode 100644 index 0000000000..f2f2119a30 --- /dev/null +++ b/kernel/power/zcopy_microk_power10.c @@ -0,0 +1,134 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void zcopy_kernel_32 (long n, double *x, double *y) +{ + __asm__ + ( + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 36, 64(%2) \n\t" + "lxvp 38, 96(%2) \n\t" + "lxvp 40, 128(%2) \n\t" + "lxvp 42, 160(%2) \n\t" + "lxvp 44, 192(%2) \n\t" + "lxvp 46, 224(%2) \n\t" + + "lxvp 48, 256(%2) \n\t" + "lxvp 50, 288(%2) \n\t" + "lxvp 52, 320(%2) \n\t" + "lxvp 54, 352(%2) \n\t" + "lxvp 56, 384(%2) \n\t" + "lxvp 58, 416(%2) \n\t" + "lxvp 60, 448(%2) \n\t" + "lxvp 62, 480(%2) \n\t" + "addi %2, %2, 512 \n\t" + + "addic. 
%1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "stxvp 32, 0(%3) \n\t" + "lxvp 32, 0(%2) \n\t" + "stxvp 34, 32(%3) \n\t" + "lxvp 34, 32(%2) \n\t" + "stxvp 36, 64(%3) \n\t" + "lxvp 36, 64(%2) \n\t" + "stxvp 38, 96(%3) \n\t" + "lxvp 38, 96(%2) \n\t" + + "stxvp 40, 128(%3) \n\t" + "lxvp 40, 128(%2) \n\t" + "stxvp 42, 160(%3) \n\t" + "lxvp 42, 160(%2) \n\t" + "stxvp 44, 192(%3) \n\t" + "lxvp 44, 192(%2) \n\t" + "stxvp 46, 224(%3) \n\t" + "lxvp 46, 224(%2) \n\t" + + "stxvp 48, 256(%3) \n\t" + "lxvp 48, 256(%2) \n\t" + "stxvp 50, 288(%3) \n\t" + "lxvp 50, 288(%2) \n\t" + "stxvp 52, 320(%3) \n\t" + "lxvp 52, 320(%2) \n\t" + "stxvp 54, 352(%3) \n\t" + "lxvp 54, 352(%2) \n\t" + "stxvp 56, 384(%3) \n\t" + "lxvp 56, 384(%2) \n\t" + "stxvp 58, 416(%3) \n\t" + "lxvp 58, 416(%2) \n\t" + "stxvp 60, 448(%3) \n\t" + "lxvp 60, 448(%2) \n\t" + "stxvp 62, 480(%3) \n\t" + "lxvp 62, 480(%2) \n\t" + + "addi %3, %3, 512 \n\t" + "addi %2, %2, 512 \n\t" + + "addic. %1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "stxvp 32, 0(%3) \n\t" + "stxvp 34, 32(%3) \n\t" + "stxvp 36, 64(%3) \n\t" + "stxvp 38, 96(%3) \n\t" + "stxvp 40, 128(%3) \n\t" + "stxvp 42, 160(%3) \n\t" + "stxvp 44, 192(%3) \n\t" + "stxvp 46, 224(%3) \n\t" + "stxvp 48, 256(%3) \n\t" + "stxvp 50, 288(%3) \n\t" + "stxvp 52, 320(%3) \n\t" + "stxvp 54, 352(%3) \n\t" + "stxvp 56, 384(%3) \n\t" + "stxvp 58, 416(%3) \n\t" + "stxvp 60, 448(%3) \n\t" + "stxvp 62, 480(%3) \n\t" + + "#n=%1 x=%4=%2 y=%0=%3" + : + "=m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); +} diff --git a/kernel/power/zcopy_power10.c b/kernel/power/zcopy_power10.c new file mode 100644 index 0000000000..99d463b024 --- /dev/null +++ 
b/kernel/power/zcopy_power10.c @@ -0,0 +1,132 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "zcopy_microk_power10.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static void zcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + zcopy_kernel_32(n1, x, y); + i=n1; + ix=n1*2; + iy=n1*2; + } + + while(i < n) + { + y[iy] = x[iy] ; + y[iy+1] = x[ix+1] ; + ix+=2; + iy+=2; + i++ ; + + } + + + } + else + { + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + From 1b1a757f5f389b9496f016defaecccb63c415fa6 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 28 Sep 2020 20:36:53 +0800 Subject: [PATCH 229/349] Optimize the performance of dot by using universal intrinsics in X86/ARM --- kernel/generic/dot.c | 50 ++++++++++++++++++++++++++++++------- kernel/simd/intrin.h | 9 +++++++ kernel/simd/intrin_avx.h | 32 ++++++++++++++++-------- kernel/simd/intrin_avx512.h | 32 +++++++++++++++++------- kernel/simd/intrin_neon.h | 42 +++++++++++++++++++++++++++++++ kernel/simd/intrin_sse.h | 36 ++++++++++++++++++-------- utest/test_dsdot.c | 14 +++++++++++ 7 files changed, 177 insertions(+), 38 deletions(-) create mode 100644 kernel/simd/intrin_neon.h diff --git a/kernel/generic/dot.c b/kernel/generic/dot.c index bc07bc78f4..f1ea6b264a 100644 --- a/kernel/generic/dot.c +++ b/kernel/generic/dot.c @@ -47,27 +47,59 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -4; - - while(i < n1) +#if V_SIMD && !defined(DSDOT) + const int vstep = v_nlanes_f32; + const int unrollx4 = n & (-vstep * 4); + const int unrollx = n & -vstep; + v_f32 vsum0 = v_zero_f32(); + v_f32 vsum1 = v_zero_f32(); + v_f32 vsum2 = 
v_zero_f32(); + v_f32 vsum3 = v_zero_f32(); + while(i < unrollx4) + { + vsum0 = v_muladd_f32( + v_loadu_f32(x + i), v_loadu_f32(y + i), vsum0 + ); + vsum1 = v_muladd_f32( + v_loadu_f32(x + i + vstep), v_loadu_f32(y + i + vstep), vsum1 + ); + vsum2 = v_muladd_f32( + v_loadu_f32(x + i + vstep*2), v_loadu_f32(y + i + vstep*2), vsum2 + ); + vsum3 = v_muladd_f32( + v_loadu_f32(x + i + vstep*3), v_loadu_f32(y + i + vstep*3), vsum3 + ); + i += vstep*4; + } + vsum0 = v_add_f32( + v_add_f32(vsum0, vsum1), v_add_f32(vsum2 , vsum3) + ); + while(i < unrollx) + { + vsum0 = v_muladd_f32( + v_loadu_f32(x + i), v_loadu_f32(y + i), vsum0 + ); + i += vstep; + } + dot = v_sum_f32(vsum0); +#elif defined(DSDOT) + for (; i < n1; i += 4) { - -#if defined(DSDOT) dot += (double) y[i] * (double) x[i] + (double) y[i+1] * (double) x[i+1] + (double) y[i+2] * (double) x[i+2] + (double) y[i+3] * (double) x[i+3] ; + } #else + for (; i < n1; i += 4) + { dot += y[i] * x[i] + y[i+1] * x[i+1] + y[i+2] * x[i+2] + y[i+3] * x[i+3] ; -#endif - i+=4 ; - } - +#endif while(i < n) { diff --git a/kernel/simd/intrin.h b/kernel/simd/intrin.h index 5997bb6ac3..ef8fcb8657 100644 --- a/kernel/simd/intrin.h +++ b/kernel/simd/intrin.h @@ -51,6 +51,11 @@ extern "C" { #include #endif +/** NEON **/ +#ifdef HAVE_NEON +#include +#endif + // distribute #if defined(HAVE_AVX512VL) || defined(HAVE_AVX512BF16) #include "intrin_avx512.h" @@ -60,6 +65,10 @@ extern "C" { #include "intrin_sse.h" #endif +#ifdef HAVE_NEON +#include "intrin_neon.h" +#endif + #ifndef V_SIMD #define V_SIMD 0 #define V_SIMD_F64 0 diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h index f6257ae987..f36a3dbf0f 100644 --- a/kernel/simd/intrin_avx.h +++ b/kernel/simd/intrin_avx.h @@ -1,13 +1,13 @@ #define V_SIMD 256 #define V_SIMD_F64 1 -/* -Data Type -*/ +/*************************** + * Data Type + ***************************/ typedef __m256 v_f32; #define v_nlanes_f32 8 -/* -arithmetic -*/ +/*************************** + * Arithmetic + 
***************************/ #define v_add_f32 _mm256_add_ps #define v_mul_f32 _mm256_mul_ps @@ -20,10 +20,22 @@ arithmetic { return v_add_f32(v_mul_f32(a, b), c); } #endif // !HAVE_FMA3 -/* -memory -*/ +// Horizontal add: Calculates the sum of all vector elements. +BLAS_FINLINE float v_sum_f32(__m256 a) +{ + __m256 sum_halves = _mm256_hadd_ps(a, a); + sum_halves = _mm256_hadd_ps(sum_halves, sum_halves); + __m128 lo = _mm256_castps256_ps128(sum_halves); + __m128 hi = _mm256_extractf128_ps(sum_halves, 1); + __m128 sum = _mm_add_ps(lo, hi); + return _mm_cvtss_f32(sum); +} + +/*************************** + * memory + ***************************/ // unaligned load #define v_loadu_f32 _mm256_loadu_ps #define v_storeu_f32 _mm256_storeu_ps -#define v_setall_f32(VAL) _mm256_set1_ps(VAL) \ No newline at end of file +#define v_setall_f32(VAL) _mm256_set1_ps(VAL) +#define v_zero_f32 _mm256_setzero_ps \ No newline at end of file diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h index cb116a9a31..70e5f72e39 100644 --- a/kernel/simd/intrin_avx512.h +++ b/kernel/simd/intrin_avx512.h @@ -1,21 +1,35 @@ #define V_SIMD 512 #define V_SIMD_F64 1 -/* -Data Type -*/ +/*************************** + * Data Type + ***************************/ typedef __m512 v_f32; #define v_nlanes_f32 16 -/* -arithmetic -*/ +/*************************** + * Arithmetic + ***************************/ #define v_add_f32 _mm512_add_ps #define v_mul_f32 _mm512_mul_ps // multiply and add, a*b + c #define v_muladd_f32 _mm512_fmadd_ps -/* -memory -*/ + +BLAS_FINLINE float v_sum_f32(v_f32 a) +{ + __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); + __m512 sum32 = _mm512_add_ps(a, h64); + __m512 h32 = _mm512_shuffle_f32x4(sum32, sum32, _MM_SHUFFLE(1, 0, 3, 2)); + __m512 sum16 = _mm512_add_ps(sum32, h32); + __m512 h16 = _mm512_permute_ps(sum16, _MM_SHUFFLE(1, 0, 3, 2)); + __m512 sum8 = _mm512_add_ps(sum16, h16); + __m512 h4 = _mm512_permute_ps(sum8, _MM_SHUFFLE(2, 3, 0, 1)); + __m512 
sum4 = _mm512_add_ps(sum8, h4); + return _mm_cvtss_f32(_mm512_castps512_ps128(sum4)); +} +/*************************** + * memory + ***************************/ // unaligned load #define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR)) #define v_storeu_f32 _mm512_storeu_ps #define v_setall_f32(VAL) _mm512_set1_ps(VAL) +#define v_zero_f32 _mm512_setzero_ps diff --git a/kernel/simd/intrin_neon.h b/kernel/simd/intrin_neon.h new file mode 100644 index 0000000000..5875c0e4ea --- /dev/null +++ b/kernel/simd/intrin_neon.h @@ -0,0 +1,42 @@ +#define V_SIMD 128 +#ifdef __aarch64__ + #define V_SIMD_F64 1 +#else + #define V_SIMD_F64 0 +#endif +/*************************** + * Data Type + ***************************/ +typedef float32x4_t v_f32; +#define v_nlanes_f32 4 +/*************************** + * Arithmetic + ***************************/ +#define v_add_f32 vaddq_f32 +#define v_mul_f32 vmulq_f32 + +// FUSED F32 +#ifdef HAVE_VFPV4 // FMA + // multiply and add, a*b + c + BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) + { return vfmaq_f32(c, a, b); } +#else + // multiply and add, a*b + c + BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) + { return vmlaq_f32(c, a, b); } +#endif + +// Horizontal add: Calculates the sum of all vector elements. 
+BLAS_FINLINE float v_sum_f32(float32x4_t a) +{ + float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); + return vget_lane_f32(vpadd_f32(r, r), 0); +} +/*************************** + * memory + ***************************/ +// unaligned load +#define v_loadu_f32(a) vld1q_f32((const float*)a) +#define v_storeu_f32 vst1q_f32 +#define v_setall_f32(VAL) vdupq_n_f32(VAL) +#define v_zero_f32() vdupq_n_f32(0.0f) \ No newline at end of file diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 260112028b..9de7e1b278 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -1,13 +1,13 @@ #define V_SIMD 128 #define V_SIMD_F64 1 -/* -Data Type -*/ +/*************************** + * Data Type + ***************************/ typedef __m128 v_f32; #define v_nlanes_f32 4 -/* -arithmetic -*/ +/*************************** + * Arithmetic + ***************************/ #define v_add_f32 _mm_add_ps #define v_mul_f32 _mm_mul_ps #ifdef HAVE_FMA3 @@ -21,10 +21,26 @@ arithmetic BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } #endif // HAVE_FMA3 -/* -memory -*/ + +// Horizontal add: Calculates the sum of all vector elements. 
+BLAS_FINLINE float v_sum_f32(__m128 a) +{ +#ifdef HAVE_SSE3 + __m128 sum_halves = _mm_hadd_ps(a, a); + return _mm_cvtss_f32(_mm_hadd_ps(sum_halves, sum_halves)); +#else + __m128 t1 = _mm_movehl_ps(a, a); + __m128 t2 = _mm_add_ps(a, t1); + __m128 t3 = _mm_shuffle_ps(t2, t2, 1); + __m128 t4 = _mm_add_ss(t2, t3); + return _mm_cvtss_f32(t4); +#endif +} +/*************************** + * memory + ***************************/ // unaligned load #define v_loadu_f32 _mm_loadu_ps #define v_storeu_f32 _mm_storeu_ps -#define v_setall_f32(VAL) _mm_set1_ps(VAL) \ No newline at end of file +#define v_setall_f32(VAL) _mm_set1_ps(VAL) +#define v_zero_f32 _mm_setzero_ps \ No newline at end of file diff --git a/utest/test_dsdot.c b/utest/test_dsdot.c index d58b398a80..57da7101ef 100644 --- a/utest/test_dsdot.c +++ b/utest/test_dsdot.c @@ -47,3 +47,17 @@ CTEST(dsdot,dsdot_n_1) ASSERT_DBL_NEAR_TOL(res2, res1, DOUBLE_EPS); } + +CTEST(dsdot,dsdot_n_2) +{ + float x[] = {0.1F, 0.2F, 0.3F, 0.4F, 0.5F, 0.6F, 0.7F, 0.8F}; + float y[] = {0.1F, 0.2F, 0.3F, 0.4F, 0.5F, 0.6F, 0.7F, 0.8F}; + blasint incx=1; + blasint incy=1; + blasint n=8; + + double res1=0.0f, res2= 2.0400000444054616; + + res1=BLASFUNC(dsdot)(&n, &x, &incx, &y, &incy); + ASSERT_DBL_NEAR_TOL(res2, res1, DOUBLE_EPS); +} \ No newline at end of file From 60e6c68e3811ae9b7b3bead134507e10fa31aed9 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 29 Sep 2020 16:36:14 +0800 Subject: [PATCH 230/349] Adapt ARM architect --- kernel/arm64/KERNEL.ARMV8 | 2 +- kernel/arm64/KERNEL.CORTEXA53 | 2 +- kernel/arm64/KERNEL.CORTEXA57 | 2 +- kernel/generic/dot.c | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index fe32d31373..603e47d879 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -97,7 +97,7 @@ CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot.S -SDOTKERNEL = dot.S +SDOTKERNEL = ../generic/dot.c CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S 
DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.CORTEXA53 b/kernel/arm64/KERNEL.CORTEXA53 index eba38a92e8..e23133e526 100644 --- a/kernel/arm64/KERNEL.CORTEXA53 +++ b/kernel/arm64/KERNEL.CORTEXA53 @@ -97,7 +97,7 @@ CNRM2KERNEL = znrm2.S ZNRM2KERNEL = znrm2.S DDOTKERNEL = dot.S -SDOTKERNEL = dot.S +SDOTKERNEL = ../generic/dot.c CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S DSDOTKERNEL = dot.S diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 index 04d6940d7a..dcf2383a9c 100644 --- a/kernel/arm64/KERNEL.CORTEXA57 +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -70,7 +70,7 @@ DCOPYKERNEL = copy.S CCOPYKERNEL = copy.S ZCOPYKERNEL = copy.S -SDOTKERNEL = dot.S +SDOTKERNEL = ../generic/dot.c DDOTKERNEL = dot.S CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S diff --git a/kernel/generic/dot.c b/kernel/generic/dot.c index f1ea6b264a..5abbb735ce 100644 --- a/kernel/generic/dot.c +++ b/kernel/generic/dot.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" - +#include "../simd/intrin.h" #if defined(DSDOT) double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) #else @@ -47,9 +47,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - int n1 = n & -4; + int n1 = n & -4; #if V_SIMD && !defined(DSDOT) - const int vstep = v_nlanes_f32; + const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; v_f32 vsum0 = v_zero_f32(); From 2bf70c8e3b72f560ab35320ed12df9ac92f9b46c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 30 Sep 2020 22:43:25 +0200 Subject: [PATCH 231/349] Change ifdef linux to __linux for C11 compatibility --- cpuid_arm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpuid_arm.c b/cpuid_arm.c index 19aa907187..a3b1dfd332 100644 --- a/cpuid_arm.c +++ b/cpuid_arm.c @@ -54,7 +54,7 @@ static char *cpuname_lower[] = { int get_feature(char *search) { -#ifdef linux +#ifdef __linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; @@ -90,7 +90,7 @@ int get_feature(char *search) int detect(void) { -#ifdef linux +#ifdef __linux FILE *infile; char buffer[512], *p; @@ -289,7 +289,7 @@ void get_libname(void) void get_features(void) { -#ifdef linux +#ifdef __linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; From be40440ec59e8ac16b7c63d62ab743845073d2ae Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 30 Sep 2020 22:45:18 +0200 Subject: [PATCH 232/349] Change ifdef linux to __linux for C11 compatibility --- cpuid_arm64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index a0d3e15b99..ae150ef1be 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -90,7 +90,7 @@ static char *cpuname_lower[] = { int get_feature(char *search) { -#ifdef linux +#ifdef __linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; @@ -126,7 +126,7 @@ int get_feature(char 
*search) int detect(void) { -#ifdef linux +#ifdef __linux FILE *infile; char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; @@ -242,7 +242,7 @@ void get_cpucount(void) { int n=0; -#ifdef linux +#ifdef __linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; @@ -441,7 +441,7 @@ void get_libname(void) void get_features(void) { -#ifdef linux +#ifdef __linux FILE *infile; char buffer[2048], *p,*t; p = (char *) NULL ; From a7d5d0078dd3d5a0c5d1aff9f3723d6799bd0410 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 30 Sep 2020 22:46:25 +0200 Subject: [PATCH 233/349] Change ifdef linux to __linux for C11 compatibility --- cpuid_mips.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_mips.c b/cpuid_mips.c index 3a2e123935..e6e837f732 100644 --- a/cpuid_mips.c +++ b/cpuid_mips.c @@ -84,7 +84,7 @@ static char *cpuname[] = { int detect(void){ -#ifdef linux +#ifdef __linux FILE *infile; char buffer[512], *p; From 0b2bb5696af3c7abb0b0d5038124eb4a5f883fbc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 30 Sep 2020 22:47:25 +0200 Subject: [PATCH 234/349] Change ifdef linux to __linux for C11 compatibility --- cpuid_mips64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 0e32bfc0b0..0c19ac1e7a 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -90,7 +90,7 @@ static char *cpuname[] = { int detect(void){ -#ifdef linux +#ifdef __linux FILE *infile; char buffer[512], *p; From e1574cbc83a691f2f0ff898c9976e1f5861d9686 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 30 Sep 2020 22:50:21 +0200 Subject: [PATCH 235/349] Change ifdef linux to __linux for C11 compatibility and add a fallback for unsupported operating systems in detect() --- cpuid_power.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpuid_power.c b/cpuid_power.c index b17493bc8f..2526e8d0e6 100644 --- a/cpuid_power.c +++ b/cpuid_power.c @@ -104,7 +104,7 @@ char *corename[] = { 
int detect(void){ -#ifdef linux +#ifdef __linux FILE *infile; char buffer[512], *p; @@ -214,6 +214,8 @@ switch ( id >> 16 ) { return CPUTYPE_UNKNOWN; } #endif + + return CPUTYPE_UNKNOWN; } void get_architecture(void){ From 5464eb13ea362012047d98dd7c6ecd33ca58b27b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 30 Sep 2020 22:59:41 +0200 Subject: [PATCH 236/349] Change ifdef linux to __linux for C11 compatibility --- benchmark/amax.c | 2 +- benchmark/amin.c | 2 +- benchmark/asum.c | 2 +- benchmark/axpby.c | 2 +- benchmark/axpy.c | 2 +- benchmark/copy.c | 2 +- benchmark/dot.c | 2 +- benchmark/geev.c | 2 +- benchmark/gemm.c | 2 +- benchmark/gemm3m.c | 2 +- benchmark/gemv.c | 2 +- benchmark/ger.c | 2 +- benchmark/gesv.c | 2 +- benchmark/getri.c | 2 +- benchmark/hbmv.c | 2 +- benchmark/hemm.c | 2 +- benchmark/hemv.c | 2 +- benchmark/her.c | 2 +- benchmark/her2.c | 2 +- benchmark/her2k.c | 2 +- benchmark/herk.c | 2 +- benchmark/hpmv.c | 2 +- benchmark/iamax.c | 2 +- benchmark/iamin.c | 2 +- benchmark/imax.c | 2 +- benchmark/imin.c | 2 +- benchmark/linpack.c | 2 +- benchmark/max.c | 2 +- benchmark/min.c | 2 +- benchmark/nrm2.c | 2 +- benchmark/rot.c | 2 +- benchmark/rotm.c | 2 +- benchmark/scal.c | 2 +- benchmark/spmv.c | 2 +- benchmark/spr.c | 2 +- benchmark/spr2.c | 2 +- benchmark/swap.c | 2 +- benchmark/symm.c | 2 +- benchmark/symv.c | 2 +- benchmark/syr.c | 2 +- benchmark/syr2.c | 2 +- benchmark/syr2k.c | 2 +- benchmark/syrk.c | 2 +- benchmark/tpmv.c | 2 +- benchmark/tpsv.c | 2 +- benchmark/trmm.c | 2 +- benchmark/trmv.c | 2 +- benchmark/trsm.c | 2 +- benchmark/trsv.c | 2 +- benchmark/zdot-intel.c | 2 +- benchmark/zdot.c | 2 +- 51 files changed, 51 insertions(+), 51 deletions(-) diff --git a/benchmark/amax.c b/benchmark/amax.c index 32f55ce836..19ae95c8b2 100644 --- a/benchmark/amax.c +++ b/benchmark/amax.c @@ -146,7 +146,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); 
#endif diff --git a/benchmark/amin.c b/benchmark/amin.c index 218f0ea9f2..d0cadbd3ba 100644 --- a/benchmark/amin.c +++ b/benchmark/amin.c @@ -145,7 +145,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/asum.c b/benchmark/asum.c index e3d16acfd2..bcccd9089d 100644 --- a/benchmark/asum.c +++ b/benchmark/asum.c @@ -152,7 +152,7 @@ int main(int argc, char *argv[]){ } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/axpby.c b/benchmark/axpby.c index 3b3dd9979c..793ee7e40f 100644 --- a/benchmark/axpby.c +++ b/benchmark/axpby.c @@ -152,7 +152,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/axpy.c b/benchmark/axpy.c index e40f93c705..760703c1da 100644 --- a/benchmark/axpy.c +++ b/benchmark/axpy.c @@ -151,7 +151,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/copy.c b/benchmark/copy.c index d7f58c94fd..eb5148fffd 100644 --- a/benchmark/copy.c +++ b/benchmark/copy.c @@ -154,7 +154,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/dot.c b/benchmark/dot.c index 50d05e5320..aae3c04b09 100644 --- a/benchmark/dot.c +++ b/benchmark/dot.c @@ -145,7 +145,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/geev.c b/benchmark/geev.c index ef92712201..4fd2c8d6fd 100644 --- a/benchmark/geev.c +++ b/benchmark/geev.c @@ -214,7 +214,7 @@ int main(int argc, char *argv[]){ } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 
d2235330b0..84dd292c5e 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -197,7 +197,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/gemm3m.c b/benchmark/gemm3m.c index f4048c4361..98c13e1be7 100644 --- a/benchmark/gemm3m.c +++ b/benchmark/gemm3m.c @@ -163,7 +163,7 @@ int main(int argc, char *argv[]){ loops = atoi(p); -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/gemv.c b/benchmark/gemv.c index a9dee67d26..fb1f541d36 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -181,7 +181,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/ger.c b/benchmark/ger.c index ca7e94e155..d53d328f0d 100644 --- a/benchmark/ger.c +++ b/benchmark/ger.c @@ -165,7 +165,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/gesv.c b/benchmark/gesv.c index 80f644e698..057cbd2431 100644 --- a/benchmark/gesv.c +++ b/benchmark/gesv.c @@ -165,7 +165,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/getri.c b/benchmark/getri.c index e8b82a758e..a070147687 100644 --- a/benchmark/getri.c +++ b/benchmark/getri.c @@ -188,7 +188,7 @@ int main(int argc, char *argv[]){ } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/hbmv.c b/benchmark/hbmv.c index b9dcc03bbe..60ba9fb890 100644 --- a/benchmark/hbmv.c +++ b/benchmark/hbmv.c @@ -158,7 +158,7 @@ int main(int argc, char *argv[]){ exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/hemm.c b/benchmark/hemm.c index 2fe0f5c5f6..2bc165458b 100644 --- a/benchmark/hemm.c +++ b/benchmark/hemm.c @@ -151,7 
+151,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/hemv.c b/benchmark/hemv.c index b6ff512ce0..98618a04e8 100644 --- a/benchmark/hemv.c +++ b/benchmark/hemv.c @@ -152,7 +152,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/her.c b/benchmark/her.c index f4e10b684f..010f8120dc 100644 --- a/benchmark/her.c +++ b/benchmark/her.c @@ -149,7 +149,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/her2.c b/benchmark/her2.c index e10b7e98e5..0f80f3ed92 100644 --- a/benchmark/her2.c +++ b/benchmark/her2.c @@ -151,7 +151,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/her2k.c b/benchmark/her2k.c index a0772fefff..021873beb3 100644 --- a/benchmark/her2k.c +++ b/benchmark/her2k.c @@ -150,7 +150,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/herk.c b/benchmark/herk.c index eed8ed7389..c09d35c1f8 100644 --- a/benchmark/herk.c +++ b/benchmark/herk.c @@ -149,7 +149,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/hpmv.c b/benchmark/hpmv.c index 6e6634fcfb..b0157094ee 100644 --- a/benchmark/hpmv.c +++ b/benchmark/hpmv.c @@ -155,7 +155,7 @@ int main(int argc, char *argv[]){ exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/iamax.c b/benchmark/iamax.c index 736f02b891..c87044ab4f 100644 --- a/benchmark/iamax.c +++ b/benchmark/iamax.c @@ -145,7 +145,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/iamin.c b/benchmark/iamin.c index 
b2c779811a..e7c8e59e4e 100644 --- a/benchmark/iamin.c +++ b/benchmark/iamin.c @@ -145,7 +145,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/imax.c b/benchmark/imax.c index c7060af848..b56ef64ba9 100644 --- a/benchmark/imax.c +++ b/benchmark/imax.c @@ -139,7 +139,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/imin.c b/benchmark/imin.c index f8bdc25374..4a92c8bd07 100644 --- a/benchmark/imin.c +++ b/benchmark/imin.c @@ -139,7 +139,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/linpack.c b/benchmark/linpack.c index e4b20e99d7..661a441755 100644 --- a/benchmark/linpack.c +++ b/benchmark/linpack.c @@ -174,7 +174,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/max.c b/benchmark/max.c index 2fa6e5a14a..a19a386a27 100644 --- a/benchmark/max.c +++ b/benchmark/max.c @@ -139,7 +139,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/min.c b/benchmark/min.c index 9abed8e802..4df8fb0fde 100644 --- a/benchmark/min.c +++ b/benchmark/min.c @@ -139,7 +139,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/nrm2.c b/benchmark/nrm2.c index d3718f9e05..0f416621a8 100644 --- a/benchmark/nrm2.c +++ b/benchmark/nrm2.c @@ -145,7 +145,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/rot.c 
b/benchmark/rot.c index 8ec8b1d973..69698988db 100644 --- a/benchmark/rot.c +++ b/benchmark/rot.c @@ -156,7 +156,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/rotm.c b/benchmark/rotm.c index 8dea2d08ce..17c8d54164 100644 --- a/benchmark/rotm.c +++ b/benchmark/rotm.c @@ -168,7 +168,7 @@ int main(int argc, char *argv[]) exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/scal.c b/benchmark/scal.c index 453c3234db..8bd62c77cc 100644 --- a/benchmark/scal.c +++ b/benchmark/scal.c @@ -150,7 +150,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/spmv.c b/benchmark/spmv.c index 2a26c9416b..cff504d3b3 100644 --- a/benchmark/spmv.c +++ b/benchmark/spmv.c @@ -163,7 +163,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/spr.c b/benchmark/spr.c index c91e587b11..5dcaa4f8b3 100755 --- a/benchmark/spr.c +++ b/benchmark/spr.c @@ -149,7 +149,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/spr2.c b/benchmark/spr2.c index e8ee345d78..a5f2791f75 100755 --- a/benchmark/spr2.c +++ b/benchmark/spr2.c @@ -153,7 +153,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/swap.c b/benchmark/swap.c index 368c59cd43..76d5459955 100644 --- a/benchmark/swap.c +++ b/benchmark/swap.c @@ -151,7 +151,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/symm.c b/benchmark/symm.c index 
b979e8d518..bb9849eb5f 100644 --- a/benchmark/symm.c +++ b/benchmark/symm.c @@ -162,7 +162,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/symv.c b/benchmark/symv.c index 789c3560fb..e4c892b5ad 100644 --- a/benchmark/symv.c +++ b/benchmark/symv.c @@ -162,7 +162,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/syr.c b/benchmark/syr.c index 458bc6edb6..a9dd293e60 100644 --- a/benchmark/syr.c +++ b/benchmark/syr.c @@ -144,7 +144,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/syr2.c b/benchmark/syr2.c index 0129dd09a3..9efbca3154 100644 --- a/benchmark/syr2.c +++ b/benchmark/syr2.c @@ -150,7 +150,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/syr2k.c b/benchmark/syr2k.c index b1fcd8a189..a906559eba 100644 --- a/benchmark/syr2k.c +++ b/benchmark/syr2k.c @@ -162,7 +162,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/syrk.c b/benchmark/syrk.c index 95625a6c4a..0fbb943f67 100644 --- a/benchmark/syrk.c +++ b/benchmark/syrk.c @@ -159,7 +159,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/tpmv.c b/benchmark/tpmv.c index ee5b97f247..fe9d075343 100644 --- a/benchmark/tpmv.c +++ b/benchmark/tpmv.c @@ -132,7 +132,7 @@ int main(int argc, char *argv[]) fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/tpsv.c b/benchmark/tpsv.c index 46d78fd172..8472ac2610 100644 --- a/benchmark/tpsv.c +++ b/benchmark/tpsv.c @@ -132,7 +132,7 @@ int main(int argc, char *argv[]) fprintf(stderr,"Out of 
Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/trmm.c b/benchmark/trmm.c index e095b85ee4..23af122b4b 100644 --- a/benchmark/trmm.c +++ b/benchmark/trmm.c @@ -162,7 +162,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/trmv.c b/benchmark/trmv.c index f5a5fe31a5..46641b3e41 100644 --- a/benchmark/trmv.c +++ b/benchmark/trmv.c @@ -132,7 +132,7 @@ int main(int argc, char *argv[]) fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/trsm.c b/benchmark/trsm.c index 6ce1d532c4..17676946ab 100644 --- a/benchmark/trsm.c +++ b/benchmark/trsm.c @@ -172,7 +172,7 @@ int main(int argc, char *argv[]){ -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/trsv.c b/benchmark/trsv.c index c60890de4a..1734e2adb0 100644 --- a/benchmark/trsv.c +++ b/benchmark/trsv.c @@ -159,7 +159,7 @@ int main(int argc, char *argv[]){ uplo,diag,loops); -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/zdot-intel.c b/benchmark/zdot-intel.c index bb2c40f382..ba15153650 100644 --- a/benchmark/zdot-intel.c +++ b/benchmark/zdot-intel.c @@ -146,7 +146,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif diff --git a/benchmark/zdot.c b/benchmark/zdot.c index 136135c9c3..fa624e859e 100644 --- a/benchmark/zdot.c +++ b/benchmark/zdot.c @@ -145,7 +145,7 @@ int main(int argc, char *argv[]){ fprintf(stderr,"Out of Memory!!\n");exit(1); } -#ifdef linux +#ifdef __linux srandom(getpid()); #endif From 2367726578884f3975d12e276927b1f52acc152c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 30 Sep 2020 23:28:49 +0200 Subject: [PATCH 237/349] Remove redundant status message --- cmake/system_check.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/cmake/system_check.cmake b/cmake/system_check.cmake index d06f4779fd..b0ab926fcf 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -121,7 +121,6 @@ endif() include(CheckIncludeFile) CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) -if (HAVE_C11 EQUAL 1) -message (STATUS found stdatomic.h) +if (HAVE_C11) set (CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_C11") endif() From dee7c49938ef34c18deb3175f6e67ae9a2240f5f Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Thu, 1 Oct 2020 10:43:16 +0200 Subject: [PATCH 238/349] Fix TABs and trailing space --- driver/others/memory.c | 352 ++++++++++++++++++++--------------------- 1 file changed, 176 insertions(+), 176 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 9b6c226a1e..5c9c388ce1 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -80,7 +80,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #undef COMPILE_TLS #endif -#if defined(__GLIBC_PREREQ) +#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2,20) #undef COMPILE_TLS #endif @@ -161,7 +161,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__)) #include #undef printf -#define printf _cprintf +#define printf _cprintf #endif #ifdef OS_LINUX @@ -190,14 +190,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CONSTRUCTOR __cdecl #define DESTRUCTOR __cdecl #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) -#define CONSTRUCTOR __attribute__ ((constructor)) -#define DESTRUCTOR __attribute__ ((destructor)) +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) -#define CONSTRUCTOR __attribute__ ((constructor(101))) -#define DESTRUCTOR __attribute__ ((destructor(101))) +#define CONSTRUCTOR __attribute__ ((constructor(101))) +#define DESTRUCTOR __attribute__ ((destructor(101))) #else -#define CONSTRUCTOR __attribute__ ((constructor)) -#define DESTRUCTOR __attribute__ ((destructor)) +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) #endif #ifdef DYNAMIC_ARCH @@ -272,7 +272,7 @@ int get_num_procs(void) { return nums; } ret = CPU_COUNT_S(size,cpusetp); - if (ret > 0 && ret < nums) nums = ret; + if (ret > 0 && ret < nums) nums = ret; CPU_FREE(cpusetp); return nums; } else { @@ -281,7 +281,7 @@ int get_num_procs(void) { return nums; } ret = CPU_COUNT(&cpuset); - if (ret > 0 && ret < nums) nums = ret; + if (ret > 0 && ret < nums) nums = ret; return nums; } #endif @@ -628,12 +628,12 @@ static void *alloc_mmap(void *address){ if (address){ map_address = mmap(address, - allocation_block_size, - MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + allocation_block_size, + MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); } else { map_address = mmap(address, - allocation_block_size, - MMAP_ACCESS, MMAP_POLICY, -1, 0); + allocation_block_size, + MMAP_ACCESS, MMAP_POLICY, -1, 0); } STORE_RELEASE_FUNC(map_address, alloc_mmap_free); @@ -648,7 +648,7 @@ static void *alloc_mmap(void *address){ #else #define BENCH_ITERATION 4 -#define SCALING 2 +#define SCALING 2 static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { @@ -711,60 +711,60 @@ static void *alloc_mmap(void *address){ 
#endif map_address = mmap(NULL, allocation_block_size * SCALING, - MMAP_ACCESS, MMAP_POLICY, -1, 0); + MMAP_ACCESS, MMAP_POLICY, -1, 0); if (map_address != (void *)-1) { #ifdef OS_LINUX #ifdef DEBUG - int ret=0; - ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); - if(ret==-1){ - int errsv=errno; - perror("OpenBLAS alloc_mmap:"); - printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); - } + int ret=0; + ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); + if(ret==-1){ + int errsv=errno; + perror("OpenBLAS alloc_mmap:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); + } #else - my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0); #endif #endif - allocsize = DGEMM_P * DGEMM_Q * sizeof(double); + allocsize = DGEMM_P * DGEMM_Q * sizeof(double); - start = (BLASULONG)map_address; - current = (SCALING - 1) * allocation_block_size; - original = current; + start = (BLASULONG)map_address; + current = (SCALING - 1) * allocation_block_size; + original = current; - while(current > 0 && current <= original) { - *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; - start += PAGESIZE; - current -= PAGESIZE; - } + while(current > 0 && current <= original) { + *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; + start += PAGESIZE; + current -= PAGESIZE; + } - *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address; + *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address; - start = (BLASULONG)map_address; + start = (BLASULONG)map_address; - best = (BLASULONG)-1; - best_address = map_address; + best = (BLASULONG)-1; + best_address = map_address; - while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) { + while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) { - current = run_bench(start, 
allocsize); + current = run_bench(start, allocsize); - if (best > current) { - best = current; - best_address = (void *)start; - } + if (best > current) { + best = current; + best_address = (void *)start; + } - start += PAGESIZE; + start += PAGESIZE; - } + } if ((BLASULONG)best_address > (BLASULONG)map_address) - munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); + munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address); @@ -854,9 +854,9 @@ static void *alloc_windows(void *address){ void *map_address; map_address = VirtualAlloc(address, - allocation_block_size, - MEM_RESERVE | MEM_COMMIT, - PAGE_READWRITE); + allocation_block_size, + MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); if (map_address == (void *)NULL) map_address = (void *)-1; @@ -897,9 +897,9 @@ static void *alloc_devicedirver(void *address){ } map_address = mmap(address, allocation_block_size, - PROT_READ | PROT_WRITE, - MAP_FILE | MAP_SHARED, - fd, 0); + PROT_READ | PROT_WRITE, + MAP_FILE | MAP_SHARED, + fd, 0); STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd); @@ -974,12 +974,12 @@ static void *alloc_hugetlb(void *address){ shmid = shmget(IPC_PRIVATE, allocation_block_size, #ifdef OS_LINUX - SHM_HUGETLB | + SHM_HUGETLB | #endif #ifdef OS_AIX - SHM_LGPAGE | SHM_PIN | + SHM_LGPAGE | SHM_PIN | #endif - IPC_CREAT | SHM_R | SHM_W); + IPC_CREAT | SHM_R | SHM_W); if (shmid != -1) { map_address = (void *)shmat(shmid, address, SHM_RND); @@ -1026,9 +1026,9 @@ static void *alloc_hugetlb(void *address){ } map_address = (void *)VirtualAlloc(address, - allocation_block_size, - MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, - PAGE_READWRITE); + allocation_block_size, + MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); tp.Privileges[0].Attributes = 0; AdjustTokenPrivileges(hToken, FALSE, &tp, 0, 
NULL, NULL); @@ -1078,9 +1078,9 @@ static void *alloc_hugetlbfile(void *address){ unlink(filename); map_address = mmap(address, allocation_block_size, - PROT_READ | PROT_WRITE, - MAP_SHARED, - fd, 0); + PROT_READ | PROT_WRITE, + MAP_SHARED, + fd, 0); STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd); @@ -1107,7 +1107,7 @@ static volatile int memory_initialized = 0; /* 1 : Level 2 functions */ /* 2 : Thread */ - static void blas_memory_cleanup(void* ptr){ +static void blas_memory_cleanup(void* ptr){ if (ptr) { struct alloc_t ** table = (struct alloc_t **)ptr; int pos; @@ -1243,27 +1243,27 @@ UNLOCK_COMMAND(&alloc_lock); while ((func != NULL) && (map_address == (void *) -1)) { - map_address = (*func)((void *)base_address); + map_address = (*func)((void *)base_address); #ifdef ALLOC_DEVICEDRIVER - if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n"); - } + if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n"); + } #endif #ifdef ALLOC_HUGETLBFILE - if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { + if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { #ifndef OS_WINDOWS - fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... 
HugeTLB(File) allocation failed.\n"); #endif - } + } #endif #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) - if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; + if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; #endif - func ++; + func ++; } #ifdef DEBUG @@ -1377,7 +1377,7 @@ static BLASULONG init_lock = 0UL; #endif static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, - void *sa, void *sb, BLASLONG pos) { + void *sa, void *sb, BLASLONG pos) { #if !defined(ARCH_POWER) && !defined(ARCH_SPARC) @@ -1507,11 +1507,11 @@ void CONSTRUCTOR gotoblas_init(void) { struct rlimit curlimit; if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 ) { - if ( curlimit.rlim_cur != curlimit.rlim_max ) - { - curlimit.rlim_cur = curlimit.rlim_max; - setrlimit(RLIMIT_STACK, &curlimit); - } + if ( curlimit.rlim_cur != curlimit.rlim_max ) + { + curlimit.rlim_cur = curlimit.rlim_max; + setrlimit(RLIMIT_STACK, &curlimit); + } } #endif @@ -1545,7 +1545,7 @@ void DESTRUCTOR gotoblas_quit(void) { TlsFree(local_storage_key); #else pthread_key_delete(local_storage_key); -#endif +#endif #endif #ifdef PROFILE @@ -1605,8 +1605,8 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser */ static int on_process_term(void) { - gotoblas_quit(); - return 0; + gotoblas_quit(); + return 0; } #ifdef _WIN64 #pragma comment(linker, "/INCLUDE:_tls_used") @@ -1705,7 +1705,7 @@ void gotoblas_dummy_for_PGI(void) { #if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__)) #include #undef printf -#define printf _cprintf +#define printf _cprintf #endif #ifdef OS_LINUX @@ -1734,14 +1734,14 @@ void gotoblas_dummy_for_PGI(void) { #define CONSTRUCTOR __cdecl #define DESTRUCTOR __cdecl #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) -#define CONSTRUCTOR __attribute__ ((constructor)) -#define DESTRUCTOR __attribute__ 
((destructor)) +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) -#define CONSTRUCTOR __attribute__ ((constructor(101))) -#define DESTRUCTOR __attribute__ ((destructor(101))) +#define CONSTRUCTOR __attribute__ ((constructor(101))) +#define DESTRUCTOR __attribute__ ((destructor(101))) #else -#define CONSTRUCTOR __attribute__ ((constructor)) -#define DESTRUCTOR __attribute__ ((destructor)) +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) #endif #ifdef DYNAMIC_ARCH @@ -1817,7 +1817,7 @@ int get_num_procs(void) { return nums; } ret = CPU_COUNT_S(size,cpusetp); - if (ret > 0 && ret < nums) nums = ret; + if (ret > 0 && ret < nums) nums = ret; CPU_FREE(cpusetp); return nums; } else { @@ -1826,7 +1826,7 @@ int get_num_procs(void) { return nums; } ret = CPU_COUNT(&cpuset); - if (ret > 0 && ret < nums) nums = ret; + if (ret > 0 && ret < nums) nums = ret; return nums; } #endif @@ -2083,26 +2083,26 @@ static void *alloc_mmap(void *address){ if (address){ map_address = mmap(address, - BUFFER_SIZE, - MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); + BUFFER_SIZE, + MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0); } else { map_address = mmap(address, - BUFFER_SIZE, - MMAP_ACCESS, MMAP_POLICY, -1, 0); + BUFFER_SIZE, + MMAP_ACCESS, MMAP_POLICY, -1, 0); } if (map_address != (void *)-1) { #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); -#endif +#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); -#endif +#endif } else { -#ifdef DEBUG +#ifdef DEBUG int errsv=errno; perror("OpenBLAS : mmap failed:"); printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); @@ -2119,7 +2119,7 @@ static void 
*alloc_mmap(void *address){ #else #define BENCH_ITERATION 4 -#define SCALING 2 +#define SCALING 2 static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) { @@ -2182,59 +2182,59 @@ static void *alloc_mmap(void *address){ #endif map_address = mmap(NULL, BUFFER_SIZE * SCALING, - MMAP_ACCESS, MMAP_POLICY, -1, 0); + MMAP_ACCESS, MMAP_POLICY, -1, 0); if (map_address != (void *)-1) { #ifdef OS_LINUX #ifdef DEBUG - int ret=0; - ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); - if(ret==-1){ - int errsv=errno; - perror("OpenBLAS alloc_mmap:"); - printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); - } + int ret=0; + ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + if(ret==-1){ + int errsv=errno; + perror("OpenBLAS alloc_mmap:"); + printf("error code=%d,\tmap_address=%lx\n",errsv,map_address); + } #else - my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); + my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0); #endif #endif - allocsize = DGEMM_P * DGEMM_Q * sizeof(double); + allocsize = DGEMM_P * DGEMM_Q * sizeof(double); - start = (BLASULONG)map_address; - current = (SCALING - 1) * BUFFER_SIZE; + start = (BLASULONG)map_address; + current = (SCALING - 1) * BUFFER_SIZE; - while(current > 0) { - *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; - start += PAGESIZE; - current -= PAGESIZE; - } + while(current > 0) { + *(BLASLONG *)start = (BLASLONG)start + PAGESIZE; + start += PAGESIZE; + current -= PAGESIZE; + } - *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address; + *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address; - start = (BLASULONG)map_address; + start = (BLASULONG)map_address; - best = (BLASULONG)-1; - best_address = map_address; + best = (BLASULONG)-1; + best_address = map_address; - while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) { + while ((start + allocsize < (BLASULONG)map_address + (SCALING - 1) 
* BUFFER_SIZE)) { - current = run_bench(start, allocsize); + current = run_bench(start, allocsize); - if (best > current) { - best = current; - best_address = (void *)start; - } + if (best > current) { + best = current; + best_address = (void *)start; + } - start += PAGESIZE; + start += PAGESIZE; - } + } if ((BLASULONG)best_address > (BLASULONG)map_address) - munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); + munmap(map_address, (BLASULONG)best_address - (BLASULONG)map_address); munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address); @@ -2342,9 +2342,9 @@ static void *alloc_windows(void *address){ void *map_address; map_address = VirtualAlloc(address, - BUFFER_SIZE, - MEM_RESERVE | MEM_COMMIT, - PAGE_READWRITE); + BUFFER_SIZE, + MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); if (map_address == (void *)NULL) map_address = (void *)-1; @@ -2388,9 +2388,9 @@ static void *alloc_devicedirver(void *address){ } map_address = mmap(address, BUFFER_SIZE, - PROT_READ | PROT_WRITE, - MAP_FILE | MAP_SHARED, - fd, 0); + PROT_READ | PROT_WRITE, + MAP_FILE | MAP_SHARED, + fd, 0); if (map_address != (void *)-1) { release_info[release_pos].address = map_address; @@ -2471,12 +2471,12 @@ static void *alloc_hugetlb(void *address){ shmid = shmget(IPC_PRIVATE, BUFFER_SIZE, #ifdef OS_LINUX - SHM_HUGETLB | + SHM_HUGETLB | #endif #ifdef OS_AIX - SHM_LGPAGE | SHM_PIN | + SHM_LGPAGE | SHM_PIN | #endif - IPC_CREAT | SHM_R | SHM_W); + IPC_CREAT | SHM_R | SHM_W); if (shmid != -1) { map_address = (void *)shmat(shmid, address, SHM_RND); @@ -2511,7 +2511,7 @@ static void *alloc_hugetlb(void *address){ tp.PrivilegeCount = 1; tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - + if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) { CloseHandle(hToken); return (void*)-1; @@ -2523,9 +2523,9 @@ static void *alloc_hugetlb(void *address){ } map_address = (void 
*)VirtualAlloc(address, - BUFFER_SIZE, - MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, - PAGE_READWRITE); + BUFFER_SIZE, + MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); tp.Privileges[0].Attributes = 0; AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL); @@ -2578,9 +2578,9 @@ static void *alloc_hugetlbfile(void *address){ unlink(filename); map_address = mmap(address, BUFFER_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED, - fd, 0); + PROT_READ | PROT_WRITE, + MAP_SHARED, + fd, 0); if (map_address != (void *)-1) { release_info[release_pos].address = map_address; @@ -2717,7 +2717,7 @@ void *blas_memory_alloc(int procpos){ if (!memory[position].used && (memory[position].pos == mypos)) { #if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); -#else +#else blas_lock(&memory[position].lock); #endif if (!memory[position].used) goto allocation; @@ -2725,7 +2725,7 @@ void *blas_memory_alloc(int procpos){ UNLOCK_COMMAND(&alloc_lock); #else blas_unlock(&memory[position].lock); -#endif +#endif } position ++; @@ -2741,22 +2741,22 @@ void *blas_memory_alloc(int procpos){ LOCK_COMMAND(&alloc_lock); #endif do { - RMB; -#if defined(USE_OPENMP) - if (!memory[position].used) { + RMB; +#if defined(USE_OPENMP) + if (!memory[position].used) { blas_lock(&memory[position].lock); #endif if (!memory[position].used) goto allocation; - + #if defined(USE_OPENMP) - blas_unlock(&memory[position].lock); + blas_unlock(&memory[position].lock); } #endif position ++; } while (position < NUM_BUFFERS); #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); + UNLOCK_COMMAND(&alloc_lock); #endif goto error; @@ -2770,7 +2770,7 @@ void *blas_memory_alloc(int procpos){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); #else - blas_unlock(&memory[position].lock); + blas_unlock(&memory[position].lock); #endif if (!memory[position].addr) { do { @@ -2784,27 +2784,27 @@ void 
*blas_memory_alloc(int procpos){ while ((func != NULL) && (map_address == (void *) -1)) { - map_address = (*func)((void *)base_address); + map_address = (*func)((void *)base_address); #ifdef ALLOC_DEVICEDRIVER - if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); - } + if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { + fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); + } #endif #ifdef ALLOC_HUGETLBFILE - if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { + if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { #ifndef OS_WINDOWS - fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); #endif - } + } #endif #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) - if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; + if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; #endif - func ++; + func ++; } #ifdef DEBUG @@ -2818,7 +2818,7 @@ void *blas_memory_alloc(int procpos){ #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); -#endif +#endif memory[position].addr = map_address; #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); @@ -2856,7 +2856,7 @@ void *blas_memory_alloc(int procpos){ #ifdef DEBUG printf("Mapped : %p %3d\n\n", - (void *)memory[position].addr, position); + (void *)memory[position].addr, position); #endif return (void *)memory[position].addr; @@ -2972,7 +2972,7 @@ static BLASULONG init_lock = 0UL; #endif static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, - void *sa, void *sb, BLASLONG pos) { + void *sa, void *sb, BLASLONG pos) { #if 
!defined(ARCH_POWER) && !defined(ARCH_SPARC) @@ -3099,15 +3099,15 @@ void CONSTRUCTOR gotoblas_init(void) { //#if defined(OS_LINUX) #if 0 - struct rlimit curlimit; - if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 ) - { - if ( curlimit.rlim_cur != curlimit.rlim_max ) - { - curlimit.rlim_cur = curlimit.rlim_max; - setrlimit(RLIMIT_STACK, &curlimit); - } - } + struct rlimit curlimit; + if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 ) + { + if ( curlimit.rlim_cur != curlimit.rlim_max ) + { + curlimit.rlim_cur = curlimit.rlim_max; + setrlimit(RLIMIT_STACK, &curlimit); + } + } #endif #ifdef SMP @@ -3189,8 +3189,8 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser */ static int on_process_term(void) { - gotoblas_quit(); - return 0; + gotoblas_quit(); + return 0; } #ifdef _WIN64 #pragma comment(linker, "/INCLUDE:_tls_used") @@ -3237,7 +3237,7 @@ void gotoblas_dummy_for_PGI(void) { asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); #endif -#endif +#endif } #endif From 3c05f54df8de5df17507e80697d651e147e0bf69 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Thu, 1 Oct 2020 10:48:45 +0200 Subject: [PATCH 239/349] Avoid out of bounds access on invalid memory free --- driver/others/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 5c9c388ce1..91cfefbd71 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2882,9 +2882,10 @@ void blas_memory_free(void *free_area){ while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) position++; - if (memory[position].addr != free_area) goto error; + if (position >= NUM_BUFFERS) goto error; #ifdef DEBUG + if (memory[position].addr != free_area) goto error; printf(" Position : %d\n", position); #endif From 3094fc6c83c7a623f9a7e7846eb711a8a99ddfff Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Thu, 1 Oct 
2020 15:41:42 +0200 Subject: [PATCH 240/349] Lazily reinit threads after a fork in OMP mode This initializes the per-thread memory buffers which get cleared/released on a fork via pthread_atfork. Not doing so leads to each thread calling blas_memory_alloc on almost every execution which slows down the code significantly as the threads race for the memory allocation using locks to serialize that. --- driver/others/blas_server_omp.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index d126955e4d..da0a5674a7 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -48,6 +48,21 @@ #else +#ifndef likely +#ifdef __GNUC__ +#define likely(x) __builtin_expect(!!(x), 1) +#else +#define likely(x) (x) +#endif +#endif +#ifndef unlikely +#ifdef __GNUC__ +#define unlikely(x) __builtin_expect(!!(x), 0) +#else +#define unlikely(x) (x) +#endif +#endif + #ifndef OMP_SCHED #define OMP_SCHED static #endif @@ -350,6 +365,9 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ int exec_blas(BLASLONG num, blas_queue_t *queue){ + // Handle lazy re-init of the thread-pool after a POSIX fork + if (unlikely(blas_server_avail == 0)) blas_thread_init(); + BLASLONG i, buf_index; if ((num <= 0) || (queue == NULL)) return 0; From d2333e784224ba19f01659210d2aaab04b43d45c Mon Sep 17 00:00:00 2001 From: User User-User Date: Sat, 3 Oct 2020 18:00:34 +0300 Subject: [PATCH 241/349] aarch64 fix std=c18 compilation --- common.h | 2 +- driver/others/dynamic_arm64.c | 2 +- kernel/arm64/daxpy_thunderx.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/common.h b/common.h index adc162536b..ac12dd6d8a 100644 --- a/common.h +++ b/common.h @@ -352,7 +352,7 @@ typedef int blasint; #endif #if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) || defined(ARMV5) -#define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); +#define YIELDING __asm__
__volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n"); #endif #ifdef BULLDOZER diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 157b03365b..be22b247c2 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -68,7 +68,7 @@ extern void openblas_warning(int verbose, const char * msg); #endif #define get_cpu_ftr(id, var) ({ \ - asm("mrs %0, "#id : "=r" (var)); \ + __asm__("mrs %0, "#id : "=r" (var)); \ }) static char *corename[] = { diff --git a/kernel/arm64/daxpy_thunderx.c b/kernel/arm64/daxpy_thunderx.c index 37aae93914..f44f9d4e51 100644 --- a/kernel/arm64/daxpy_thunderx.c +++ b/kernel/arm64/daxpy_thunderx.c @@ -62,7 +62,7 @@ static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) y5 = a * x[5] + y[5]; y6 = a * x[6] + y[6]; y7 = a * x[7] + y[7]; - asm("":"+w"(y0),"+w"(y1),"+w"(y2),"+w"(y3),"+w"(y4),"+w"(y5),"+w"(y6),"+w"(y7)); + __asm__("":"+w"(y0),"+w"(y1),"+w"(y2),"+w"(y3),"+w"(y4),"+w"(y5),"+w"(y6),"+w"(y7)); y[0] = y0; y[1] = y1; y[2] = y2; @@ -74,7 +74,7 @@ static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) xx = (x + 4*128/sizeof(*x)); yy = (y + 4*128/sizeof(*y)); - asm("":"+r"(yy)::"memory"); + __asm__("":"+r"(yy)::"memory"); prefetch(xx); prefetch(yy); From dc8e4e1959855ca24af7e2d675f2be33087ff96c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Oct 2020 22:59:24 +0200 Subject: [PATCH 242/349] Reduce the BLAS3 heap allocation threshold to 32 and mark it as configurable --- Makefile.rule | 17 ++++++++++++++++- common.h | 2 +- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 4d6f2d313f..635e02c024 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -279,7 +279,22 @@ COMMON_PROF = -pg # If you want to enable the experimental BFLOAT16 support # BUILD_HALF = 1 -# + + +# Set the thread number threshold beyond which the job array for the threaded level3 BLAS +# will be allocated on the heap rather than the stack. 
(This array alone requires +# NUM_THREADS*NUM_THREADS*128 bytes of memory so should not pose a problem at low cpu +# counts, but obviously it is not the only item that ends up on the stack.) +# The default value of 32 ensures that the overall requirement is compatible +# with the default 1MB stacksize imposed by having the Java VM loaded without use +# of its -Xss parameter. +# The value of 160 formerly used from about version 0.2.7 until 0.3.10 is easily compatible +# with the common Linux stacksize of 8MB but will cause crashes with unwary use of the java +# VM e.g. in Octave or with the java-based libhdfs in numpy or scipy code +# BLAS3_MEM_ALLOC_THRESHOLD = 160 + + + # the below is not yet configurable, use cmake if you need to build only select types BUILD_SINGLE = 1 BUILD_DOUBLE = 1 diff --git a/common.h b/common.h index ac12dd6d8a..ab287262c1 100644 --- a/common.h +++ b/common.h @@ -402,7 +402,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #endif #ifndef BLAS3_MEM_ALLOC_THRESHOLD -#define BLAS3_MEM_ALLOC_THRESHOLD 160 +#define BLAS3_MEM_ALLOC_THRESHOLD 32 #endif #ifdef QUAD_PRECISION From a5feea6611f49e875de83282c061843e18050af6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 4 Oct 2020 23:01:06 +0200 Subject: [PATCH 243/349] make BLAS3_MEM_ALLOC_THRESHOLD configurable on non-Windows --- cmake/system.cmake | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 8908a18908..0734065df2 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -323,7 +323,13 @@ else () set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048") endif () endif () - +if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") +if (DEFINED BLAS3_MEM_ALLOC_THRESHOLD) +if (NOT ${BLAS3_MEM_ALLOC_THRESHOLD} EQUAL 32) +set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_THRESHOLD}") +endif() +endif() +endif() if (DEFINED LIBNAMESUFFIX) set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") else ()
From a5b164946ccc9dec037d4e0a1cd2f2202b1c918a Mon Sep 17 00:00:00 2001 From: Matti Picus Date: Mon, 5 Oct 2020 22:13:25 +0300 Subject: [PATCH 244/349] add fninit to reset fpu registers before assembler routines --- kernel/x86_64/amax.S | 2 ++ kernel/x86_64/asum.S | 3 ++- kernel/x86_64/dot.S | 1 + kernel/x86_64/iamax.S | 1 + kernel/x86_64/izamax.S | 1 + kernel/x86_64/nrm2.S | 1 + kernel/x86_64/qconjg.S | 1 + kernel/x86_64/qdot.S | 2 ++ kernel/x86_64/qgemm_kernel_2x2.S | 2 ++ kernel/x86_64/qgemv_n.S | 2 ++ kernel/x86_64/qgemv_t.S | 1 + kernel/x86_64/qtrsm_kernel_LN_2x2.S | 2 ++ kernel/x86_64/qtrsm_kernel_LT_2x2.S | 2 ++ kernel/x86_64/qtrsm_kernel_RT_2x2.S | 3 +++ kernel/x86_64/sum.S | 2 ++ kernel/x86_64/xdot.S | 3 +++ kernel/x86_64/xgemm3m_kernel_2x2.S | 2 ++ kernel/x86_64/xgemm_kernel_1x1.S | 2 ++ kernel/x86_64/xgemv_n.S | 2 ++ kernel/x86_64/xgemv_t.S | 2 ++ kernel/x86_64/xtrsm_kernel_LT_1x1.S | 2 ++ kernel/x86_64/zamax.S | 2 ++ kernel/x86_64/zasum.S | 2 ++ kernel/x86_64/zdot.S | 2 ++ kernel/x86_64/znrm2.S | 2 ++ kernel/x86_64/zscal.S | 2 ++ kernel/x86_64/zsum.S | 2 ++ 27 files changed, 50 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/amax.S b/kernel/x86_64/amax.S index 0e9bf4db44..257147dfb8 100644 --- a/kernel/x86_64/amax.S +++ b/kernel/x86_64/amax.S @@ -54,6 +54,8 @@ PROLOGUE PROFCODE + + fninit salq $BASE_SHIFT, INCX diff --git a/kernel/x86_64/asum.S b/kernel/x86_64/asum.S index 31f973894b..24f57dd111 100644 --- a/kernel/x86_64/asum.S +++ b/kernel/x86_64/asum.S @@ -49,7 +49,8 @@ PROLOGUE PROFCODE - + + fninit fldz testq M, M jle .L999 diff --git a/kernel/x86_64/dot.S b/kernel/x86_64/dot.S index e63d9cd893..2319885f19 100644 --- a/kernel/x86_64/dot.S +++ b/kernel/x86_64/dot.S @@ -49,6 +49,7 @@ PROLOGUE PROFCODE + fninit salq $BASE_SHIFT, INCX salq $BASE_SHIFT, INCY diff --git a/kernel/x86_64/iamax.S b/kernel/x86_64/iamax.S index 79e1bae1d0..0c666d623b 100644 --- a/kernel/x86_64/iamax.S +++ b/kernel/x86_64/iamax.S @@ -59,6 +59,7 @@ PROLOGUE PROFCODE + 
fninit salq $BASE_SHIFT, INCX diff --git a/kernel/x86_64/izamax.S b/kernel/x86_64/izamax.S index c066acd624..e450c2cd23 100644 --- a/kernel/x86_64/izamax.S +++ b/kernel/x86_64/izamax.S @@ -59,6 +59,7 @@ PROLOGUE PROFCODE + fninit salq $ZBASE_SHIFT, INCX diff --git a/kernel/x86_64/nrm2.S b/kernel/x86_64/nrm2.S index e9be1262ac..548e3b7447 100644 --- a/kernel/x86_64/nrm2.S +++ b/kernel/x86_64/nrm2.S @@ -50,6 +50,7 @@ PROLOGUE PROFCODE + fninit fldz testq M, M jle .L999 diff --git a/kernel/x86_64/qconjg.S b/kernel/x86_64/qconjg.S index 49ca766491..bab5418311 100644 --- a/kernel/x86_64/qconjg.S +++ b/kernel/x86_64/qconjg.S @@ -41,6 +41,7 @@ PROLOGUE PROFCODE + fninit fldz FLD 1 * SIZE(ARG1) diff --git a/kernel/x86_64/qdot.S b/kernel/x86_64/qdot.S index a48a04fdd0..e7d31360b0 100644 --- a/kernel/x86_64/qdot.S +++ b/kernel/x86_64/qdot.S @@ -58,6 +58,8 @@ PROLOGUE + fninit + pushl %edi pushl %esi pushl %ebx diff --git a/kernel/x86_64/qgemm_kernel_2x2.S b/kernel/x86_64/qgemm_kernel_2x2.S index 99db3961fa..7b5e7707d5 100644 --- a/kernel/x86_64/qgemm_kernel_2x2.S +++ b/kernel/x86_64/qgemm_kernel_2x2.S @@ -74,6 +74,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/qgemv_n.S b/kernel/x86_64/qgemv_n.S index 630d03ffb1..1b65b03f0e 100644 --- a/kernel/x86_64/qgemv_n.S +++ b/kernel/x86_64/qgemv_n.S @@ -76,6 +76,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/qgemv_t.S b/kernel/x86_64/qgemv_t.S index d7c9cd2a59..00188c2578 100644 --- a/kernel/x86_64/qgemv_t.S +++ b/kernel/x86_64/qgemv_t.S @@ -75,6 +75,7 @@ PROLOGUE PROFCODE + fninit subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/qtrsm_kernel_LN_2x2.S b/kernel/x86_64/qtrsm_kernel_LN_2x2.S index 536042e65e..030eff8934 100644 --- a/kernel/x86_64/qtrsm_kernel_LN_2x2.S +++ b/kernel/x86_64/qtrsm_kernel_LN_2x2.S @@ -74,6 +74,8 @@ PROLOGUE PROFCODE + fninit + 
subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/qtrsm_kernel_LT_2x2.S b/kernel/x86_64/qtrsm_kernel_LT_2x2.S index 6e94976c5b..d86972c72c 100644 --- a/kernel/x86_64/qtrsm_kernel_LT_2x2.S +++ b/kernel/x86_64/qtrsm_kernel_LT_2x2.S @@ -74,6 +74,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/qtrsm_kernel_RT_2x2.S b/kernel/x86_64/qtrsm_kernel_RT_2x2.S index caa7de14a6..2826a62c93 100644 --- a/kernel/x86_64/qtrsm_kernel_RT_2x2.S +++ b/kernel/x86_64/qtrsm_kernel_RT_2x2.S @@ -74,6 +74,9 @@ PROLOGUE PROFCODE + fninit + + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/sum.S b/kernel/x86_64/sum.S index d075eaa042..3d5fa7cc29 100644 --- a/kernel/x86_64/sum.S +++ b/kernel/x86_64/sum.S @@ -50,6 +50,8 @@ PROLOGUE PROFCODE + fninit + fldz testq M, M jle .L999 diff --git a/kernel/x86_64/xdot.S b/kernel/x86_64/xdot.S index ea97164b24..ec89b799c2 100644 --- a/kernel/x86_64/xdot.S +++ b/kernel/x86_64/xdot.S @@ -59,6 +59,9 @@ PROFCODE + fninit + + #define N %ebx #define X %esi #define INCX %ecx diff --git a/kernel/x86_64/xgemm3m_kernel_2x2.S b/kernel/x86_64/xgemm3m_kernel_2x2.S index 843fc243aa..e8da78d82a 100644 --- a/kernel/x86_64/xgemm3m_kernel_2x2.S +++ b/kernel/x86_64/xgemm3m_kernel_2x2.S @@ -78,6 +78,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/xgemm_kernel_1x1.S b/kernel/x86_64/xgemm_kernel_1x1.S index e0cd1f1dfa..f04ab07f59 100644 --- a/kernel/x86_64/xgemm_kernel_1x1.S +++ b/kernel/x86_64/xgemm_kernel_1x1.S @@ -97,6 +97,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/xgemv_n.S b/kernel/x86_64/xgemv_n.S index cbde6402dc..7d28c118ac 100644 --- a/kernel/x86_64/xgemv_n.S +++ b/kernel/x86_64/xgemv_n.S @@ -76,6 +76,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) 
movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/xgemv_t.S b/kernel/x86_64/xgemv_t.S index 31320f6514..e796760883 100644 --- a/kernel/x86_64/xgemv_t.S +++ b/kernel/x86_64/xgemv_t.S @@ -75,6 +75,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/xtrsm_kernel_LT_1x1.S b/kernel/x86_64/xtrsm_kernel_LT_1x1.S index a61a240fdb..54d41932f8 100644 --- a/kernel/x86_64/xtrsm_kernel_LT_1x1.S +++ b/kernel/x86_64/xtrsm_kernel_LT_1x1.S @@ -90,6 +90,8 @@ PROLOGUE PROFCODE + fninit + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/zamax.S b/kernel/x86_64/zamax.S index 74e127e6c0..bfd836193d 100644 --- a/kernel/x86_64/zamax.S +++ b/kernel/x86_64/zamax.S @@ -55,6 +55,8 @@ PROLOGUE PROFCODE + fninit + salq $ZBASE_SHIFT, INCX fldz diff --git a/kernel/x86_64/zasum.S b/kernel/x86_64/zasum.S index c372fc5dd1..9ea2aadc05 100644 --- a/kernel/x86_64/zasum.S +++ b/kernel/x86_64/zasum.S @@ -50,6 +50,8 @@ PROLOGUE PROFCODE + fninit + fldz testq M, M jle .L999 diff --git a/kernel/x86_64/zdot.S b/kernel/x86_64/zdot.S index 94d1008ff1..f7df919b7c 100644 --- a/kernel/x86_64/zdot.S +++ b/kernel/x86_64/zdot.S @@ -54,6 +54,8 @@ PROLOGUE PROFCODE + fninit + #ifdef WINDOWS_ABI movq 40(%rsp), INCY #endif diff --git a/kernel/x86_64/znrm2.S b/kernel/x86_64/znrm2.S index 4115eab1db..cb02a5a9fe 100644 --- a/kernel/x86_64/znrm2.S +++ b/kernel/x86_64/znrm2.S @@ -50,6 +50,8 @@ PROLOGUE PROFCODE + fninit + fldz testq M, M jle .L999 diff --git a/kernel/x86_64/zscal.S b/kernel/x86_64/zscal.S index 5282e0f725..08c0831a44 100644 --- a/kernel/x86_64/zscal.S +++ b/kernel/x86_64/zscal.S @@ -50,6 +50,8 @@ PROLOGUE PROFCODE + fninit + salq $ZBASE_SHIFT, INCX FLD 8(%rsp) diff --git a/kernel/x86_64/zsum.S b/kernel/x86_64/zsum.S index 45e0ddff55..1c39048396 100644 --- a/kernel/x86_64/zsum.S +++ b/kernel/x86_64/zsum.S @@ -50,6 +50,8 @@ PROLOGUE PROFCODE + fninit + fldz testq M, M jle .L999 From 
78124860911ae2b4e226d1cd76486120c3187c72 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 6 Oct 2020 21:33:16 +0200 Subject: [PATCH 245/349] Use generic C for D/Z nrm2 kernels on Windows to work around fpu exception bug --- kernel/x86_64/KERNEL | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 4a2e13bedb..d75196974e 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -259,8 +259,12 @@ SNRM2KERNEL = nrm2_sse.S endif ifndef DNRM2KERNEL +ifeq ($(OSNAME),WINNT) +DNRM2KERNEL = ../arm/nrm2.c +else DNRM2KERNEL = nrm2.S endif +endif ifndef QNRM2KERNEL QNRM2KERNEL = nrm2.S @@ -271,8 +275,12 @@ CNRM2KERNEL = znrm2_sse.S endif ifndef ZNRM2KERNEL +ifeq ($(OSNAME),WINNT) +ZNRM2KERNEL = ../arm/znrm2.c +else ZNRM2KERNEL = znrm2.S endif +endif ifndef XNRM2KERNEL XNRM2KERNEL = znrm2.S From f32d34a01528a0b9f2df5229c17789333d41125a Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Sat, 10 Oct 2020 10:36:15 +0800 Subject: [PATCH 246/349] add sse3 compiler flag --- Makefile.x86_64 | 5 +++++ cmake/system.cmake | 3 +++ kernel/Makefile | 3 +++ 3 files changed, 11 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 00975b25af..65b67bba1f 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -8,6 +8,11 @@ endif endif endif +ifdef HAVE_SSE3 +CCOMMON_OPT += -msse3 +FCOMMON_OPT += -msse3 +endif + ifeq ($(CORE), SKYLAKEX) ifndef DYNAMIC_ARCH ifndef NO_AVX512 diff --git a/cmake/system.cmake b/cmake/system.cmake index 8908a18908..1e6a292c85 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -70,6 +70,9 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() + if (DEFINED HAVE_SSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() endif() if (DEFINED TARGET) diff --git a/kernel/Makefile b/kernel/Makefile index 16211218f4..0f0fa5a5e7 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,6 +5,9 @@ endif TOPDIR = .. 
include $(TOPDIR)/Makefile.system +ifdef HAVE_SSE3 +CFLAGS += -msse3 +endif ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) From de27e4f5fb54a792ea35720b67f0a395ad3e1026 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 00:40:22 +0200 Subject: [PATCH 247/349] Stop DYNAMIC_ARCH build if the toplevel source contains a stray config_kernel.h from a gmake build This is unlikely to happen in practice, but if it does, the rogue file would get included instead of the dynamically generated version for each target_core, leading to very confusing errors like "invalid operands (undefined UND and ABS sections)" in compilation of the assembly kernels as macros like PREFETCH would remain undefined --- cmake/arch.cmake | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index c00f8fe71d..e851dd0887 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -1,4 +1,3 @@ -## ## Author: Hank Anderson ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets various variables based on architecture. @@ -80,10 +79,15 @@ if (DYNAMIC_ARCH) string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") endif () if (DYNAMIC_LIST) - set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) + set(DYNAMIC_CORE ${DYNAMIC_LIST}) endif () endif () + CHECK_INCLUDE_FILE ("${PROJECT_SOURCE_DIR}/config_kernel.h" TRAP) + if (TRAP) + message (FATAL_ERROR "Your build directory contains a file config_kernel.h, probably from a previous compilation with make. 
This will conflict with the cmake compilation and cause strange compiler errors - please remove the file before trying again") + endif () + if (NOT DYNAMIC_CORE) message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options") unset(DYNAMIC_ARCH CACHE) From 82a497ec5d4c759acc9994b6d1eba54ea90e3b9b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 00:43:09 +0200 Subject: [PATCH 248/349] restore PRESCOTT default for DYNAMIC_LIST --- cmake/arch.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index e851dd0887..c048f13d18 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -79,7 +79,7 @@ if (DYNAMIC_ARCH) string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") endif () if (DYNAMIC_LIST) - set(DYNAMIC_CORE ${DYNAMIC_LIST}) + set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) endif () endif () From 0c773b8205d5108a765db44eaca6427b2b3af608 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 01:04:57 +0200 Subject: [PATCH 249/349] Do not rely on HAVE_SSE3 in DYNAMIC_ARCH builds --- Makefile.x86_64 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 65b67bba1f..e793a1c2f9 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -9,9 +9,11 @@ endif endif ifdef HAVE_SSE3 +ifndef DYNAMIC_ARCH CCOMMON_OPT += -msse3 FCOMMON_OPT += -msse3 endif +endif ifeq ($(CORE), SKYLAKEX) ifndef DYNAMIC_ARCH From 7a531284817d411e8d89deb3a0a912d1b1e4aca8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 01:06:46 +0200 Subject: [PATCH 250/349] Add whitelist of DYNAMIC_ARCH kernels for which -msse3 needs to be enabled --- kernel/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/Makefile b/kernel/Makefile index 0f0fa5a5e7..290fb2afe4 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -41,6 +41,9 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE + ifeq ($(TARGET_CORE), $(filter 
$(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO NEHALEM BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) + override CFLAGS += -msse3 +endif ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) ifeq ($(GCCVERSIONGTEQ10), 1) From 9d43140d61d93a6b96844c19b760b64ba49d451f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 12:58:17 +0200 Subject: [PATCH 251/349] Improve check for conflicting config_kernel.h --- cmake/arch.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index c048f13d18..99e685d048 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -83,8 +83,7 @@ if (DYNAMIC_ARCH) endif () endif () - CHECK_INCLUDE_FILE ("${PROJECT_SOURCE_DIR}/config_kernel.h" TRAP) - if (TRAP) + if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h) message (FATAL_ERROR "Your build directory contains a file config_kernel.h, probably from a previous compilation with make. This will conflict with the cmake compilation and cause strange compiler errors - please remove the file before trying again") endif () From 190b74dd2454d1c7db535b8feaf36db008c29d70 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 13:26:05 +0200 Subject: [PATCH 252/349] Add files via upload From 63d7dad04cd23c71cc96495cc61adb20475a17c2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:15:35 +0200 Subject: [PATCH 253/349] Adapt utests for builds supporting only some variable types --- utest/test_dsdot.c | 17 ++--------------- utest/test_fork.c | 6 ++++++ 2 files changed, 8 insertions(+), 15 deletions(-) diff --git a/utest/test_dsdot.c b/utest/test_dsdot.c index 57da7101ef..adef4e91c0 100644 --- a/utest/test_dsdot.c +++ b/utest/test_dsdot.c @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/ #include "openblas_utest.h" - +#if defined(BUILD_SINGLE) && defined(BUILD_DOUBLE) CTEST(dsdot,dsdot_n_1) { float x= 0.172555164F; @@ -47,17 +47,4 @@ CTEST(dsdot,dsdot_n_1) ASSERT_DBL_NEAR_TOL(res2, res1, DOUBLE_EPS); } - -CTEST(dsdot,dsdot_n_2) -{ - float x[] = {0.1F, 0.2F, 0.3F, 0.4F, 0.5F, 0.6F, 0.7F, 0.8F}; - float y[] = {0.1F, 0.2F, 0.3F, 0.4F, 0.5F, 0.6F, 0.7F, 0.8F}; - blasint incx=1; - blasint incy=1; - blasint n=8; - - double res1=0.0f, res2= 2.0400000444054616; - - res1=BLASFUNC(dsdot)(&n, &x, &incx, &y, &incy); - ASSERT_DBL_NEAR_TOL(res2, res1, DOUBLE_EPS); -} \ No newline at end of file +#endif diff --git a/utest/test_fork.c b/utest/test_fork.c index 0b90407b11..5c976f9207 100644 --- a/utest/test_fork.c +++ b/utest/test_fork.c @@ -48,6 +48,7 @@ void* xmalloc(size_t n) } } +#ifdef BUILD_DOUBLE void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) { char trans1 = 'T'; @@ -59,9 +60,13 @@ void check_dgemm(double *a, double *b, double *result, double *expected, blasint ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS); } } +#endif CTEST(fork, safety) { +#ifndef BUILD_DOUBLE +exit(0); +#else blasint n = 1000; int i; @@ -124,4 +129,5 @@ CTEST(fork, safety) ASSERT_EQUAL(wait_pid, fork_pid); ASSERT_EQUAL(0, WEXITSTATUS (child_status)); } +#endif } From 08f4749eb483f16618e553db54e8ae9d537795e4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:25:24 +0200 Subject: [PATCH 254/349] Adapt tests to having only a subset of types in the build --- test/Makefile | 245 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 218 insertions(+), 27 deletions(-) diff --git a/test/Makefile b/test/Makefile index 45f9821ec5..a3966756d5 100644 --- a/test/Makefile +++ b/test/Makefile @@ -7,82 +7,242 @@ all :: else all :: level1 level2 level3 endif +$(info buildvars 
[$(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16)]) +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1x1x1) +level1: sblat1 dblat1 cblat1 zblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1x1x1) +level1: dblat1 cblat1 zblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xx1x1) +level1: sblat1 cblat1 zblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x1) +level1: cblat1 zblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x) +level1: cblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xxx1) +level1: zblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx1) +level1: sblat1 zblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx1) +level1: sblat1 dblat1 zblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx) +level1: sblat1 dblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx) +level1: sblat1 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1xx) +level1: dblat1 +endif -level1 : sblat1 dblat1 cblat1 zblat1 ifndef CROSS +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat1 +endif ifdef SMP ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./sblat1 +endif +ifeq ($(BUILD_DOUBLE),1) OMP_NUM_THREADS=2 ./dblat1 +endif +ifeq ($(BUILD_COMPLEX),1) OMP_NUM_THREADS=2 ./cblat1 +endif +ifeq ($(BUILD_COMPLEX16),1) OMP_NUM_THREADS=2 ./zblat1 +endif else +ifeq 
($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=2 ./sblat1 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=2 ./dblat1 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=2 ./cblat1 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./zblat1 endif endif endif +endif + +#level2: sblat2 dblat2 cblat2 zblat2 +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1x1x1) +level2: sblat2 dblat2 cblat2 zblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1x1x1) +level2: dblat2 cblat2 zblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xx1x1) +level2: sblat2 cblat2 zblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x1) +level2: cblat2 zblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x) +level2: cblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xxx1) +level2: zblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx1) +level2: sblat2 zblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx1) +level2: sblat2 dblat2 zblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx) +level2: sblat2 dblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx) +level2: sblat2 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1xx) +level2: dblat2 +endif -level2 : sblat2 dblat2 cblat2 zblat2 ifndef CROSS rm -f ?BLAT2.SUMM +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat2 < ./dblat2.dat @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=1 
OMP_NUM_THREADS=1 ./cblat2 < ./cblat2.dat @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat2 < ./zblat2.dat @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 +endif ifdef SMP rm -f ?BLAT2.SUMM ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./sblat2 < ./sblat2.dat @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_DOUBLE),1) OMP_NUM_THREADS=2 ./dblat2 < ./dblat2.dat @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX),1) OMP_NUM_THREADS=2 ./cblat2 < ./cblat2.dat @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX16),1) OMP_NUM_THREADS=2 ./zblat2 < ./zblat2.dat @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 +endif else +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=2 ./sblat2 < ./sblat2.dat @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=2 ./dblat2 < ./dblat2.dat @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=2 ./cblat2 < ./cblat2.dat @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./zblat2 < ./zblat2.dat @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 endif endif endif +endif -ifeq ($(BUILD_HALF),1) -level3 : test_shgemm sblat3 dblat3 cblat3 zblat3 -else -level3 : sblat3 dblat3 cblat3 zblat3 +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1x1x1) +level3: sblat3 dblat3 cblat3 zblat3 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1x1x1) +level3: dblat3 cblat3 zblat3 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xx1x1) +level3: sblat3 cblat3 zblat3 +endif +ifeq 
($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x1) +level3: cblat3 zblat3 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x) +level3: cblat3 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xxx1) +level3: zblat3 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx1) +level3: sblat3 zblat3 endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx1) +level3: sblat3 dblat3 zblat3 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx) +level3: sblat3 dblat3 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx) +level3: sblat3 +endif +ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1xx) +level3: dblat3 +endif + + + +#ifeq ($(BUILD_HALF),1) +#level3 : test_shgemm sblat3 dblat3 cblat3 zblat3 +#else +#level3 : sblat3 dblat3 cblat3 zblat3 +#endif + ifndef CROSS rm -f ?BLAT3.SUMM ifeq ($(BUILD_HALF),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_shgemm > SHBLAT3.SUMM @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 endif +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat3 < ./dblat3.dat @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3 < ./cblat3.dat @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat3 < ./zblat3.dat @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 +endif ifdef SMP rm -f ?BLAT3.SUMM ifeq ($(USE_OPENMP), 1) @@ -90,30 +250,46 @@ ifeq ($(BUILD_HALF),1) OMP_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM @$(GREP) -q FATAL SHBLAT3.SUMM 
&& cat SHBLAT3.SUMM || exit 0 endif +ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_DOUBLE),1) OMP_NUM_THREADS=2 ./dblat3 < ./dblat3.dat @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX),1) OMP_NUM_THREADS=2 ./cblat3 < ./cblat3.dat @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX16),1) OMP_NUM_THREADS=2 ./zblat3 < ./zblat3.dat @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 +endif else ifeq ($(BUILD_HALF),1) OPENBLAS_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 endif +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=2 ./dblat3 < ./dblat3.dat @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=2 ./cblat3 < ./cblat3.dat @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./zblat3 < ./zblat3.dat @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 endif endif endif +endif level3_3m : zblat3_3m cblat3_3m @@ -151,56 +327,71 @@ endif endif endif +ifeq ($(BUILD_SINGLE),1) sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +sblat2 : sblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o sblat2 sblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif + +ifeq ($(BUILD_DOUBLE),1) dblat1 : dblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o dblat1 dblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +dblat2 : dblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) 
-o dblat2 dblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +dblat3 : dblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o dblat3 dblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +else +dblat2: +dblat3: +endif + + qblat1 : qblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o qblat1 qblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(BUILD_COMPLEX),1) cblat1 : cblat1.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o cblat1 cblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -zblat1 : zblat1.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o zblat1 zblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -sblat2 : sblat2.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o sblat2 sblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -dblat2 : dblat2.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o dblat2 dblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - cblat2 : cblat2.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o cblat2 cblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +cblat3 : cblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat3 cblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif + +ifeq ($(BUILD_COMPLEX16),1) +zblat1 : zblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat1 zblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + zblat2 : zblat2.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o zblat2 zblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif ifeq ($(BUILD_HALF),1) test_shgemm : compare_sgemm_shgemm.c ../$(LIBNAME) $(FC) $(FLDFLAGS) -o test_shgemm compare_sgemm_shgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif -dblat3 : dblat3.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o dblat3 dblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - 
-cblat3 : cblat3.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o cblat3 cblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - +ifeq ($(BUILD_COMPLEX),1) cblat3_3m : cblat3_3m.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o cblat3_3m cblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif +ifeq ($(BUILD_COMPLEX16),1) zblat3_3m : zblat3_3m.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o zblat3_3m zblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - +endif From f6d2827d0ca2d773ee1295a674b096119cff3f44 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:32:00 +0200 Subject: [PATCH 255/349] Adapt ctests to having only a subset of types in the build --- ctest/Makefile | 119 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 115 insertions(+), 4 deletions(-) diff --git a/ctest/Makefile b/ctest/Makefile index 6f5b65142a..cba904f75e 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -46,56 +46,155 @@ else all :: all1 all2 all3 endif -all1: xscblat1 xdcblat1 xccblat1 xzcblat1 +ifeq ($(BUILD_SINGLE),1) +all1targets += xscblat1 +endif +ifeq ($(BUILD_DOUBLE),1) +all1targets += xdcblat1 +endif +ifeq ($(BUILD_COMPLEX),1) +all1targets += xccblat1 +endif +ifeq ($(BUILD_COMPLEX16),1) +all1targets += xzcblat1 +endif + +all1: $(all1targets) + ifndef CROSS ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./xscblat1 +endif +ifeq ($(BUILD_DOUBLE),1) OMP_NUM_THREADS=2 ./xdcblat1 +endif +ifeq ($(BUILD_COMPLEX),1) OMP_NUM_THREADS=2 ./xccblat1 +endif +ifeq ($(BUILD_COMPLEX16),1) OMP_NUM_THREADS=2 ./xzcblat1 +endif else +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=2 ./xscblat1 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=2 ./xdcblat1 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=2 ./xccblat1 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./xzcblat1 endif endif 
+endif + +ifeq ($(BUILD_SINGLE),1) +all2targets += xscblat2 +endif +ifeq ($(BUILD_DOUBLE),1) +all2targets += xdcblat2 +endif +ifeq ($(BUILD_COMPLEX),1) +all2targets += xccblat2 +endif +ifeq ($(BUILD_COMPLEX16),1) +all2targets += xzcblat2 +endif + +all2: $(all2targets) -all2: xscblat2 xdcblat2 xccblat2 xzcblat2 ifndef CROSS ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./xscblat2 < sin2 +endif +ifeq ($(BUILD_DOUBLE),1) OMP_NUM_THREADS=2 ./xdcblat2 < din2 +endif +ifeq ($(BUILD_COMPLEX),1) OMP_NUM_THREADS=2 ./xccblat2 < cin2 +endif +ifeq ($(BUILD_COMPLEX16),1) OMP_NUM_THREADS=2 ./xzcblat2 < zin2 +endif else +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=2 ./xscblat2 < sin2 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=2 ./xdcblat2 < din2 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2 endif endif +endif + + +ifeq ($(BUILD_SINGLE),1) +all3targets += xscblat3 +endif +ifeq ($(BUILD_DOUBLE),1) +all3targets += xdcblat3 +endif +ifeq ($(BUILD_COMPLEX),1) +all3targets += xccblat3 +endif +ifeq ($(BUILD_COMPLEX16),1) +all3targets += xzcblat3 +endif + +all3: $(all3targets) -all3: xscblat3 xdcblat3 xccblat3 xzcblat3 ifndef CROSS ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./xscblat3 < sin3 +endif +ifeq ($(BUILD_DOUBLE),1) OMP_NUM_THREADS=2 ./xdcblat3 < din3 +endif +ifeq ($(BUILD_COMPLEX),1) OMP_NUM_THREADS=2 ./xccblat3 < cin3 +endif +ifeq ($(BUILD_COMPLEX16),1) OMP_NUM_THREADS=2 ./xzcblat3 < zin3 +endif else +ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=2 ./xscblat3 < sin3 +endif +ifeq ($(BUILD_DOUBLE),1) OPENBLAS_NUM_THREADS=2 ./xdcblat3 < din3 +endif +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=2 ./xccblat3 < cin3 +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./xzcblat3 < zin3 endif +endif +endif all3_3m: xzcblat3_3m xccblat3_3m ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 
./xccblat3_3m < cin3_3m +endif +ifeq ($(BUILD_COMPLEX16),1) OMP_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m +endif else +ifeq ($(BUILD_COMPLEX),1) OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m +endif +ifeq ($(BUILD_COMPLEX16),1) OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m endif endif @@ -115,13 +214,19 @@ endif endif endif +ifeq ($(BUILD_SINGLE),1) # Single real xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xscblat1 c_sblat1.o $(stestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) + xscblat2: $(stestl2o) c_sblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xscblat2 c_sblat2.o $(stestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) + xscblat3: $(stestl3o) c_sblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xscblat3 c_sblat3.o $(stestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif + +ifeq ($(BUILD_DOUBLE),1) # Double real xdcblat1: $(dtestl1o) c_dblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xdcblat1 c_dblat1.o $(dtestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) @@ -129,7 +234,10 @@ xdcblat2: $(dtestl2o) c_dblat2.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xdcblat2 c_dblat2.o $(dtestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) xdcblat3: $(dtestl3o) c_dblat3.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xdcblat3 c_dblat3.o $(dtestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif + +ifeq ($(BUILD_COMPLEX),1) # Single complex xccblat1: $(ctestl1o) c_cblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) @@ -140,7 +248,10 @@ xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif + +ifeq ($(BUILD_COMPLEX16),1) # Double complex xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) @@ -152,6 +263,6 @@ xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) xzcblat3_3m: $(ztestl3o_3m) 
c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) - +endif include $(TOPDIR)/Makefile.tail From 6a83c591d65ebf1ccb7a7be69d5744d9ce522d24 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:34:12 +0200 Subject: [PATCH 256/349] Adapt for having only a subset of variable types --- exports/Makefile | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/exports/Makefile b/exports/Makefile index 75901586c6..960150c864 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -33,6 +33,18 @@ endif ifndef BUILD_HALF BUILD_HALF = 0 endif +ifndef BUILD_SINGLE +BUILD_SINGLE = 0 +endif +ifndef BUILD_DOUBLE +BUILD_DOUBLE = 0 +endif +ifndef BUILD_COMPLEX +BUILD_COMPLEX = 0 +endif +ifndef BUILD_COMPLEX16 +BUILD_COMPLEX16 = 0 +endif ifeq ($(OSNAME), WINNT) ifeq ($(F_COMPILER), GFORTRAN) @@ -108,10 +120,10 @@ dll : ../$(LIBDLLNAME) -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) $(LIBPREFIX).def : gensymbol - perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) libgoto_hpl.def : gensymbol - perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) 
$(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) ifeq ($(OSNAME), Darwin) INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib @@ -246,23 +258,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) osx.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) aix.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) objcopy.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) + perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) objconv.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) 
$(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) + perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. rm -f linktest linktest.c : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > linktest.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed From d33de97d60d27b753f217d0a8d6a7ef1a6df12d9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:36:45 +0200 Subject: [PATCH 257/349] Adapt to having only a subset of variable types supported --- exports/gensymbol | 1646 +++++++++++++++++++++++++-------------------- 1 file changed, 907 insertions(+), 739 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index ce4d9bb64a..736fdc2cdf 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -16,74 +16,84 @@ # 2017/08/01 Saar # removed blas_thread_shutdown_ # -@blasobjs = ( - caxpy,ccopy,cdotc,cdotu,cgbmv,cgemm,cgemv,cgerc,cgeru, - chbmv,chemm,chemv,cher2,cher2k,cher,cherk, - chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap, +@blasobjsc = ( + caxpy,caxpby,ccopy,cdotc,cdotu,cgbmv,cgemm,cgemv,cgerc,cgeru, + chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax, + 
chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2, csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm, - ctrsv, - damax,damin,dasum,daxpy,dcabs1,dcopy,ddot,dgbmv,dgemm, + ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum); + +@blasobjsd = ( + damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm, dgemv,dger,dmax,dmin,dnrm2,drot,drotg,drotm,drotmg,dsbmv, - dscal,dsdot,dspmv,dspr2, + dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy, dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv, - dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv,dzamax,dzamin,dzasum,dznrm2, - icamax,icamin,idamax,idamin,idmax,idmin,isamax,isamin,ismax,ismin, - izamax,izamin,lsame,samax,samin,sasum,saxpy,scabs1,scamax, - scamin,scasum,scnrm2,scopy,sdot,sdsdot,sgbmv,sgemm,sgemv,sger, - smax,smin,snrm2, + dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv, + idamax,idamin,idmax,idmin,dgeadd,dsum); + +@blasobjss = ( + isamax,isamin,ismax,ismin, + samax,samin,sasum,saxpy, saxpby, + scopy,sdot,sdsdot,sgbmv,sgemm,sgemv,sger, + smax,smin,snrm2,simatcopy,somatcopy, srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, - strmm,strmv,strsm,strsv,zaxpy,zcopy,zdotc,zdotu,zdrot, + strmm,strmv,strsm,strsv, sgeadd,ssum); + +@blasobjsz = ( + izamax,izamin,, + zaxpy,zaxpby,zcopy,zdotc,zdotu,zdrot, zdscal,zgbmv,zgemm,zgemv,zgerc,zgeru, zhbmv,zhemm,zhemv,zher2,zher2k,zher,zherk,zhpmv,zhpr2, zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, - xerbla, - saxpby,daxpby,caxpby,zaxpby, - somatcopy, domatcopy, comatcopy, zomatcopy, - simatcopy, dimatcopy, cimatcopy, zimatcopy, - sgeadd,dgeadd,cgeadd,zgeadd, - ssum, dsum, scsum, dzsum -); + zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2, + zgeadd, dzsum); +@cblasobjs = (lsame, xerbla); @halfblasobjs = (shgemm, shdot, shstobf16, shdtobf16, sbf16tos, dbf16tod); -@cblasobjs = ( +@cblasobjsc = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, 
cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, - cblas_cher, cblas_cherk, cblas_chpmv, cblas_chpr2, cblas_chpr, cblas_cscal, - cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, - cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, + cblas_cher, cblas_cherk, cblas_chpmv, cblas_chpr2, cblas_chpr, cblas_cscal, cblas_caxpby, + cblas_csscal, cblas_cswap, cblas_csymm, cblas_csyr2k, cblas_csyrk, cblas_ctbmv, cblas_cgeadd, + cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, + cblas_scnrm2, cblas_scasum, + cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy + ); +@cblasobjsd = ( cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot, cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2, cblas_drot, cblas_drotg, cblas_drotm, cblas_drotmg, cblas_dsbmv, cblas_dscal, cblas_dsdot, cblas_dspmv, cblas_dspr2, cblas_dspr, cblas_dswap, cblas_dsymm, cblas_dsymv, cblas_dsyr2, cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv, - cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_dzasum, - cblas_dznrm2, cblas_icamax, cblas_idamax, - cblas_isamax, cblas_izamax, + cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd, + cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy + ); + +@cblasobjss = ( cblas_sasum, cblas_saxpy, - cblas_scasum, cblas_scnrm2, cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm, + cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm, cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg, cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, cblas_stbmv, 
cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, - cblas_strsv, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, + cblas_strsv, cblas_sgeadd, + cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy + ); +@cblasobjsz = ( + cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal, cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm, cblas_zhemv, cblas_zher2, cblas_zher2k, cblas_zher, cblas_zherk, cblas_zhpmv, cblas_zhpr2, cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk, cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm, cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub, - cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby, - cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy, - cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy, - cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd, - cblas_isamin, cblas_idamin, cblas_icamin, cblas_izamin, - cblas_ismin, cblas_idmin, cblas_icmin, cblas_izmin, - cblas_ismax, cblas_idmax, cblas_icmax, cblas_izmax, - cblas_ssum, cblas_dsum, cblas_scsum, cblas_dzsum, - cblas_xerbla + cblas_zaxpby, cblas_zgeadd, + cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy ); +@cblasobjs = ( cblas_xerbla ); + @halfcblasobjs = (cblas_shgemm, cblas_shdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod); @exblasobjs = ( @@ -103,12 +113,22 @@ # xdrot,xrotg, ); -@gemm3mobjs = ( - cgemm3m,zgemm3m + @gemm3mobjs=(); + + @cblasgemm3mobjs=(); + +@gemm3mobjsc = ( + cgemm3m, +); +@gemm3mobjsz = ( + zgemm3m ); -@cblasgemm3mobjs = ( - cblas_cgemm3m,cblas_zgemm3m +@cblasgemm3mobjsc = ( + cblas_cgemm3m +); +@cblasgemm3mobjsz = ( + cblas_zgemm3m ); @@ -131,22 +151,68 @@ @misc_underscore_objs = ( ); 
-@lapackobjs = ( +@lapackobjss = ( # These routines are provided by OpenBLAS. - sgesv, dgesv, cgesv, zgesv, - sgetf2, dgetf2, cgetf2, zgetf2, - sgetrf, dgetrf, cgetrf, zgetrf, - slaswp, dlaswp, claswp, zlaswp, - sgetrs, dgetrs, cgetrs, zgetrs, - slauu2, dlauu2, clauu2, zlauu2, - slauum, dlauum, clauum, zlauum, - spotf2, dpotf2, cpotf2, zpotf2, - spotrf, dpotrf, cpotrf, zpotrf, - strti2, dtrti2, ctrti2, ztrti2, - strtri, dtrtri, ctrtri, ztrtri, - spotri, dpotri, cpotri, zpotri, + sgesv, + sgetf2, + sgetrf, + slaswp, + sgetrs, + slauu2, + slauum, + spotf2, + spotrf, + strti2, + strtri, + spotri, +); + +@lapackobjsd = ( + dgesv, + dgetf2, + dgetrf, + dlaswp, + dgetrs, + dlauu2, + dlauum, + dpotf2, + dpotrf, + dtrti2, + dtrtri, + dpotri, +); + +@lapackobjsc = ( +cgesv, +cgetf2, +cgetrf, +claswp, +cgetrs, +clauu2, +clauum, +cpotf2, +cpotrf, +ctrti2, +ctrtri, +cpotri, +); + +@lapackobjsz = ( +zgesv, +zgetf2, +zgetrf, +zlaswp, +zgetrs, +zlauu2, +zlauum, +zpotf2, +zpotrf, +ztrti2, +ztrtri, +zpotri, ); + @lapackobjs2 = ( # These routines are provided by LAPACK (reference implementation). # @@ -162,7 +228,9 @@ ilaenv, ieeeck, lsamen, iparmq, ilaprec, ilatrans, ilauplo, iladiag, ilaver, slamch, slamc3, - +); + +@lapackobjs2sc = ( # SCLAUX -- Auxiliary routines called from both REAL and COMPLEX. # excluded: second_$(TIMER) sbdsdc, @@ -180,7 +248,9 @@ slasr, slasrt, slassq, slasv2, spttrf, sstebz, sstedc, ssteqr, ssterf, slaisnan, sisnan, slartgp, slartgs, +); +@lapackobjs2dz = ( # DZLAUX -- Auxiliary routines called from both DOUBLE and COMPLEX*16. 
# excluded: dsecnd_$(TIMER) dbdsdc, @@ -199,7 +269,9 @@ dsteqr, dsterf, dlaisnan, disnan, dlartgp, dlartgs, dlamch, dlamc3, +); +@lapackobjs2s = ( # SLASRC -- Single precision real LAPACK routines # already provided by @lapackobjs: # sgesv, sgetf2, slaswp, slauu2, slauum, spotf2, spotri, strti2, strtri @@ -262,7 +334,9 @@ sorbdb5, sorbdb6, sorcsd, sorcsd2by1, sgeqrt, sgeqrt2, sgeqrt3, sgemqrt, stpqrt, stpqrt2, stpmqrt, stprfb, +); +@lapackobjs2ds = ( # DSLASRC -- Double-single mixed precision real routines called from # single, single-extra and double precision real LAPACK # routines (i.e. from SLASRC, SXLASRC, DLASRC). @@ -270,7 +344,9 @@ # already provided by @lapackobjs: # sgetrs, spotrf, sgetrf spotrs, +); +@lapackobjs2c = ( # CLASRC -- Single precision complex LAPACK routines # already provided by @blasobjs: # already provided by @lapackobjs: @@ -338,7 +414,8 @@ cunbdb5, cunbdb6, cuncsd, cuncsd2by1, cgeqrt, cgeqrt2, cgeqrt3, cgemqrt, ctpqrt, ctpqrt2, ctpmqrt, ctprfb, - +); +@lapack2objszc = ( # ZCLASRC -- Double-single mixed precision complex routines called from # single, single-extra and double precision complex LAPACK # routines (i.e. from CLASRC, CXLASRC, ZLASRC). 
@@ -346,7 +423,9 @@ # already provided by @lapackobjs: # cgetrs, cpotrf, cgetrf cpotrs, +); +@lapack2objsd = ( # DLASRC -- Double precision real LAPACK routines # already provided by @lapackobjs: # dgesv, dgetf2, dgetrs, dlaswp, dlauu2, dlauum, dpotf2, dpotrf, dpotri, @@ -411,7 +490,8 @@ dorbdb5, dorbdb6, dorcsd, dorcsd2by1, dgeqrt, dgeqrt2, dgeqrt3, dgemqrt, dtpqrt, dtpqrt2, dtpmqrt, dtprfb, - +); +@lapackobjs2z = ( # ZLASRC -- Double precision complex LAPACK routines # already provided by @blasobjs: # already provided by @lapackobjs: @@ -485,8 +565,10 @@ zunbdb5, zunbdb6, zuncsd, zuncsd2by1, zgeqrt, zgeqrt2, zgeqrt3, zgemqrt, ztpqrt, ztpqrt2, ztpmqrt, ztprfb, +); # functions added for lapack-3.6.0 +@lapack2objsc = ( @lapack2objsc, cgejsv, cgesvdx, cgesvj, @@ -521,6 +603,8 @@ cspr2, csyr2, cunm22, +); +@lapackobjs2d = (@lapack2objsd, dbdsvdx, dgesvdx, dgetrf2, @@ -552,6 +636,8 @@ dorm22, dpotrf2, dsecnd, + ); + @lapack2objss = (@lapack2objss, sbdsvdx, second, sgesvdx, @@ -583,6 +669,8 @@ slatmt, sorm22, spotrf2, + ); + @lapack2objsz = (@lapack2objsz, zgejsv, zgesvdx, zgesvj, @@ -617,9 +705,9 @@ zspr2, zsyr2, zunm22, - +); # functions added for lapack-3.7.0 - +@lapack2objss = (@lapack2objss, slarfy, strevc3, sgelqt, @@ -637,6 +725,8 @@ stplqt, stplqt2, stpmlqt, + ); + @lapack2objsd = (@lapack2objsd, dlarfy, dsyconvf, dtrevc3, @@ -655,6 +745,8 @@ dtplqt, dtplqt2, dtpmlqt, + ); + @lapack2objsc = (@lapack2objsc, clarfy, csyconvf, ctrevc3, @@ -673,6 +765,8 @@ ctplqt, ctplqt2, ctpmlqt, + ); + @lapack2objsz = (@lapack2objsz, zlarfy, zsyconvf, ztrevc3, @@ -691,6 +785,8 @@ zlaswlq, zlamswlq, zgemlq, + ); + @lapack2objs = (@lapack2objs, sladiv1, dladiv1, iparam2stage, @@ -698,16 +794,23 @@ # functions added for lapack-3.8.0 ilaenv2stage, - + ); # functions added for lapack-3.9.0 +@lapack2objsc = (@lapack2objsc, cgesvdq, cungtsqr, dcombssq, + ); +@lapack2objsd = (@lapack2objsd, dgesvdq, dorgtsqr, + ); +@lapack2objss = (@lapack2objss, scombssq, sgesvdq, sorgtsqr, + ); 
+@lapack2objsz = (@lapack2objsz, zgesvdq, zungtsqr ); @@ -717,36 +820,54 @@ dlagsy, dsysvxx, sporfsx, slatms, zlatms, zherfsx, csysvxx, ); -@lapack_deprecated_objs = ( - cgegs, cggsvd, ctzrqf, dgeqpf, dlatzm, sgelsx, slahrd, zgegv, zggsvp, - cgegv, cggsvp, dgegs, dggsvd, dtzrqf, sgeqpf, slatzm, zgelsx, zlahrd, - cgelsx, clahrd, dgegv, dggsvp, sgegs, sggsvd, stzrqf, zgeqpf, zlatzm, - cgeqpf, clatzm, dgelsx, dlahrd, sgegv, sggsvp, zgegs, zggsvd, ztzrqf, -); - -@lapacke_deprecated_objs = ( +@lapack_deprecated_objsc = ( + cgegs, cggsvd, + cgegv, cggsvp, + cgelsx, clahrd, + cgeqpf, clatzm, + ctzrqf, + ); +@lapack_deprecated_objsd = ( + dgegs, dgeqpf, + dgegv, dggsvd, + dgelsx, dggsvp, + dlahrd, + dlatzm, dtzrqf); + +@lapack_deprecated_objss = ( + sgegs, + sgegv, + ); + +@lapacke_deprecated_objsc = ( LAPACKE_cggsvp, LAPACKE_cggsvp_work, - LAPACKE_dggsvp, - LAPACKE_dggsvp_work, - LAPACKE_sggsvp, - LAPACKE_sggsvp_work, - LAPACKE_zggsvp, - LAPACKE_zggsvp_work, LAPACKE_cggsvd, LAPACKE_cggsvd_work, - LAPACKE_dggsvd, - LAPACKE_dggsvd_work, - LAPACKE_sggsvd, - LAPACKE_sggsvd_work, - LAPACKE_zggsvd, - LAPACKE_zggsvd_work, LAPACKE_cgeqpf, LAPACKE_cgeqpf_work, +); +@lapacke_deprecated_objsd = ( + LAPACKE_dggsvp, + LAPACKE_dggsvp_work, + LAPACKE_dggsvd, + LAPACKE_dggsvd_work, LAPACKE_dgeqpf, LAPACKE_dgeqpf_work, +); +@lapacke_deprecated_objss = ( + LAPACKE_sggsvp, + LAPACKE_sggsvp_work, + LAPACKE_sggsvd, + LAPACKE_sggsvd_work, LAPACKE_sgeqpf, LAPACKE_sgeqpf_work, +); +@lapacke_deprecated_objsz = ( + LAPACKE_zggsvp, + LAPACKE_zggsvp_work, + LAPACKE_zggsvd, + LAPACKE_zggsvd_work, LAPACKE_zgeqpf, LAPACKE_zgeqpf_work, ); @@ -763,6 +884,15 @@ # exported since the respective LAPACK routines are not built by default. 
# @(OBJ) from `lapack-3.4.1/lapacke/utils/Makefile` + LAPACKE_lsame, + LAPACKE_ilaver, + LAPACKE_xerbla, + lapack_make_complex_float, + lapack_make_complex_double, + LAPACKE_get_nancheck, + LAPACKE_set_nancheck, +); +@lapackeobjsc = ( LAPACKE_cgb_nancheck, LAPACKE_cgb_trans, LAPACKE_cge_nancheck, @@ -801,118 +931,6 @@ LAPACKE_ctp_trans, LAPACKE_ctr_nancheck, LAPACKE_ctr_trans, - LAPACKE_dgb_nancheck, - LAPACKE_dgb_trans, - LAPACKE_dge_nancheck, - LAPACKE_dge_trans, - LAPACKE_dgg_nancheck, - LAPACKE_dgg_trans, - LAPACKE_dgt_nancheck, - LAPACKE_dhs_nancheck, - LAPACKE_dhs_trans, - LAPACKE_d_nancheck, - LAPACKE_dpb_nancheck, - LAPACKE_dpb_trans, - LAPACKE_dpf_nancheck, - LAPACKE_dpf_trans, - LAPACKE_dpo_nancheck, - LAPACKE_dpo_trans, - LAPACKE_dpp_nancheck, - LAPACKE_dpp_trans, - LAPACKE_dpt_nancheck, - LAPACKE_dsb_nancheck, - LAPACKE_dsb_trans, - LAPACKE_dsp_nancheck, - LAPACKE_dsp_trans, - LAPACKE_dst_nancheck, - LAPACKE_dsy_nancheck, - LAPACKE_dsy_trans, - LAPACKE_dtb_nancheck, - LAPACKE_dtb_trans, - LAPACKE_dtf_nancheck, - LAPACKE_dtf_trans, - LAPACKE_dtp_nancheck, - LAPACKE_dtp_trans, - LAPACKE_dtr_nancheck, - LAPACKE_dtr_trans, - LAPACKE_lsame, - LAPACKE_sgb_nancheck, - LAPACKE_sgb_trans, - LAPACKE_sge_nancheck, - LAPACKE_sge_trans, - LAPACKE_sgg_nancheck, - LAPACKE_sgg_trans, - LAPACKE_sgt_nancheck, - LAPACKE_shs_nancheck, - LAPACKE_shs_trans, - LAPACKE_s_nancheck, - LAPACKE_spb_nancheck, - LAPACKE_spb_trans, - LAPACKE_spf_nancheck, - LAPACKE_spf_trans, - LAPACKE_spo_nancheck, - LAPACKE_spo_trans, - LAPACKE_spp_nancheck, - LAPACKE_spp_trans, - LAPACKE_spt_nancheck, - LAPACKE_ssb_nancheck, - LAPACKE_ssb_trans, - LAPACKE_ssp_nancheck, - LAPACKE_ssp_trans, - LAPACKE_sst_nancheck, - LAPACKE_ssy_nancheck, - LAPACKE_ssy_trans, - LAPACKE_stb_nancheck, - LAPACKE_stb_trans, - LAPACKE_stf_nancheck, - LAPACKE_stf_trans, - LAPACKE_stp_nancheck, - LAPACKE_stp_trans, - LAPACKE_str_nancheck, - LAPACKE_str_trans, - LAPACKE_xerbla, - LAPACKE_zgb_nancheck, - LAPACKE_zgb_trans, - 
LAPACKE_zge_nancheck, - LAPACKE_zge_trans, - LAPACKE_zgg_nancheck, - LAPACKE_zgg_trans, - LAPACKE_zgt_nancheck, - LAPACKE_zhb_nancheck, - LAPACKE_zhb_trans, - LAPACKE_zhe_nancheck, - LAPACKE_zhe_trans, - LAPACKE_zhp_nancheck, - LAPACKE_zhp_trans, - LAPACKE_zhs_nancheck, - LAPACKE_zhs_trans, - LAPACKE_z_nancheck, - LAPACKE_zpb_nancheck, - LAPACKE_zpb_trans, - LAPACKE_zpf_nancheck, - LAPACKE_zpf_trans, - LAPACKE_zpo_nancheck, - LAPACKE_zpo_trans, - LAPACKE_zpp_nancheck, - LAPACKE_zpp_trans, - LAPACKE_zpt_nancheck, - LAPACKE_zsp_nancheck, - LAPACKE_zsp_trans, - LAPACKE_zst_nancheck, - LAPACKE_zsy_nancheck, - LAPACKE_zsy_trans, - LAPACKE_ztb_nancheck, - LAPACKE_ztb_trans, - LAPACKE_ztf_nancheck, - LAPACKE_ztf_trans, - LAPACKE_ztp_nancheck, - LAPACKE_ztp_trans, - LAPACKE_ztr_nancheck, - LAPACKE_ztr_trans, - lapack_make_complex_float, - lapack_make_complex_double, - - # @(SRC_OBJ) from `lapack-3.5.0/lapacke/src/Makefile` LAPACKE_cbbcsd, LAPACKE_cbbcsd_work, LAPACKE_cbdsqr, @@ -1405,82 +1423,238 @@ LAPACKE_cupgtr_work, LAPACKE_cupmtr, LAPACKE_cupmtr_work, - LAPACKE_dbbcsd, - LAPACKE_dbbcsd_work, - LAPACKE_dbdsdc, - LAPACKE_dbdsdc_work, - LAPACKE_dbdsqr, - LAPACKE_dbdsqr_work, - LAPACKE_ddisna, - LAPACKE_ddisna_work, - LAPACKE_dgbbrd, - LAPACKE_dgbbrd_work, - LAPACKE_dgbcon, - LAPACKE_dgbcon_work, - LAPACKE_dgbequ, - LAPACKE_dgbequ_work, - LAPACKE_dgbequb, - LAPACKE_dgbequb_work, - LAPACKE_dgbrfs, - LAPACKE_dgbrfs_work, - LAPACKE_dgbsv, - LAPACKE_dgbsv_work, - LAPACKE_dgbsvx, - LAPACKE_dgbsvx_work, - LAPACKE_dgbtrf, - LAPACKE_dgbtrf_work, - LAPACKE_dgbtrs, - LAPACKE_dgbtrs_work, - LAPACKE_dgebak, - LAPACKE_dgebak_work, - LAPACKE_dgebal, - LAPACKE_dgebal_work, - LAPACKE_dgebrd, - LAPACKE_dgebrd_work, - LAPACKE_dgecon, - LAPACKE_dgecon_work, - LAPACKE_dgeequ, - LAPACKE_dgeequ_work, - LAPACKE_dgeequb, - LAPACKE_dgeequb_work, - LAPACKE_dgees, - LAPACKE_dgees_work, - LAPACKE_dgeesx, - LAPACKE_dgeesx_work, - LAPACKE_dgeev, - LAPACKE_dgeev_work, - LAPACKE_dgeevx, - 
LAPACKE_dgeevx_work, - LAPACKE_dgehrd, - LAPACKE_dgehrd_work, - LAPACKE_dgejsv, - LAPACKE_dgejsv_work, - LAPACKE_dgelq2, - LAPACKE_dgelq2_work, - LAPACKE_dgelqf, - LAPACKE_dgelqf_work, - LAPACKE_dgels, - LAPACKE_dgels_work, - LAPACKE_dgelsd, - LAPACKE_dgelsd_work, - LAPACKE_dgelss, - LAPACKE_dgelss_work, - LAPACKE_dgelsy, - LAPACKE_dgelsy_work, - LAPACKE_dgemqrt, - LAPACKE_dgemqrt_work, - LAPACKE_dgeqlf, - LAPACKE_dgeqlf_work, - LAPACKE_dgeqp3, - LAPACKE_dgeqp3_work, - LAPACKE_dgeqr2, - LAPACKE_dgeqr2_work, - LAPACKE_dgeqrf, - LAPACKE_dgeqrf_work, - LAPACKE_dgeqrfp, - LAPACKE_dgeqrfp_work, - LAPACKE_dgeqrt, - LAPACKE_dgeqrt2, + LAPACKE_csyr, + LAPACKE_csyr_work, + LAPACKE_clatms, + LAPACKE_clatms_work, + LAPACKE_clagge, + LAPACKE_clagge_work, + LAPACKE_claghe, + LAPACKE_claghe_work, + LAPACKE_clagsy, + LAPACKE_clagsy_work, + LAPACKE_cgejsv, + LAPACKE_cgejsv_work, + LAPACKE_cgesvdx, + LAPACKE_cgesvdx_work, + LAPACKE_cgesvj, + LAPACKE_cgesvj_work, + LAPACKE_cgetrf2, + LAPACKE_cgetrf2_work, + LAPACKE_cgges3, + LAPACKE_cgges3_work, + LAPACKE_cggev3, + LAPACKE_cggev3_work, + LAPACKE_cgghd3, + LAPACKE_cgghd3_work, + LAPACKE_cggsvd3, + LAPACKE_cggsvd3_work, + LAPACKE_cggsvp3, + LAPACKE_cggsvp3_work, + LAPACKE_chetrf_rook, + LAPACKE_chetrf_rook_work, + LAPACKE_chetrs_rook, + LAPACKE_chetrs_rook_work, + LAPACKE_clapmt, + LAPACKE_clapmt_work, + LAPACKE_clascl, + LAPACKE_clascl_work, + LAPACKE_cpotrf2, + LAPACKE_cpotrf2_work, + LAPACKE_csytrf_rook, + LAPACKE_csytrf_rook_work, + LAPACKE_csytrs_rook, + LAPACKE_csytrs_rook_work, + LAPACKE_cuncsd2by1, + LAPACKE_cuncsd2by1_work, + LAPACKE_cgelq, + LAPACKE_cgelq_work, + LAPACKE_cgemlq, + LAPACKE_cgemlq_work, + LAPACKE_cgemqr, + LAPACKE_cgemqr_work, + LAPACKE_cgeqr, + LAPACKE_cgeqr_work, + LAPACKE_cgetsls, + LAPACKE_cgetsls_work, + LAPACKE_chbev_2stage, + LAPACKE_chbev_2stage_work, + LAPACKE_chbevd_2stage, + LAPACKE_chbevd_2stage_work, + LAPACKE_chbevx_2stage, + LAPACKE_chbevx_2stage_work, + LAPACKE_checon_3, + 
LAPACKE_checon_3_work, + LAPACKE_cheev_2stage, + LAPACKE_cheev_2stage_work, + LAPACKE_cheevd_2stage, + LAPACKE_cheevd_2stage_work, + LAPACKE_cheevr_2stage, + LAPACKE_cheevr_2stage_work, + LAPACKE_cheevx_2stage, + LAPACKE_cheevx_2stage_work, + LAPACKE_chegv_2stage, + LAPACKE_chegv_2stage_work, + LAPACKE_chesv_aa, + LAPACKE_chesv_aa_work, + LAPACKE_chesv_rk, + LAPACKE_chesv_rk_work, + LAPACKE_chetrf_aa, + LAPACKE_chetrf_aa_work, + LAPACKE_chetrf_rk, + LAPACKE_chetrf_rk_work, + LAPACKE_chetri_3, + LAPACKE_chetri_3_work, + LAPACKE_chetrs_aa, + LAPACKE_chetrs_aa_work, + LAPACKE_chetrs_3, + LAPACKE_chetrs_3_work, + LAPACKE_csycon_3, + LAPACKE_csycon_3_work, + LAPACKE_csysv_aa, + LAPACKE_csysv_aa_work, + LAPACKE_csysv_rk, + LAPACKE_csysv_rk_work, + LAPACKE_csytrf_aa, + LAPACKE_csytrf_aa_work, + LAPACKE_csytrf_rk, + LAPACKE_csytrf_rk_work, + LAPACKE_csytri_3, + LAPACKE_csytri_3_work, + LAPACKE_csytrs_aa, + LAPACKE_csytrs_aa_work, + LAPACKE_csytrs_3, + LAPACKE_csytrs_3_work, + LAPACKE_chesv_aa_2stage, + LAPACKE_chesv_aa_2stage_work, + LAPACKE_chetrf_aa_2stage, + LAPACKE_chetrf_aa_2stage_work, + LAPACKE_chetrs_aa_2stage, + LAPACKE_chetrs_aa_2stage_work, + LAPACKE_clacrm, + LAPACKE_clacrm_work, + LAPACKE_clarcm, + LAPACKE_clarcm_work, + LAPACKE_classq, + LAPACKE_classq_work, + LAPACKE_csysv_aa_2stage, + LAPACKE_csysv_aa_2stage_work, + LAPACKE_csytrf_aa_2stage, + LAPACKE_csytrf_aa_2stage_work, + LAPACKE_csytrs_aa_2stage, + LAPACKE_csytrs_aa_2stage_work, +); +@lapackeobjsd = ( + LAPACKE_dgb_nancheck, + LAPACKE_dgb_trans, + LAPACKE_dge_nancheck, + LAPACKE_dge_trans, + LAPACKE_dgg_nancheck, + LAPACKE_dgg_trans, + LAPACKE_dgt_nancheck, + LAPACKE_dhs_nancheck, + LAPACKE_dhs_trans, + LAPACKE_d_nancheck, + LAPACKE_dpb_nancheck, + LAPACKE_dpb_trans, + LAPACKE_dpf_nancheck, + LAPACKE_dpf_trans, + LAPACKE_dpo_nancheck, + LAPACKE_dpo_trans, + LAPACKE_dpp_nancheck, + LAPACKE_dpp_trans, + LAPACKE_dpt_nancheck, + LAPACKE_dsb_nancheck, + LAPACKE_dsb_trans, + LAPACKE_dsp_nancheck, + 
LAPACKE_dsp_trans, + LAPACKE_dst_nancheck, + LAPACKE_dsy_nancheck, + LAPACKE_dsy_trans, + LAPACKE_dtb_nancheck, + LAPACKE_dtb_trans, + LAPACKE_dtf_nancheck, + LAPACKE_dtf_trans, + LAPACKE_dtp_nancheck, + LAPACKE_dtp_trans, + LAPACKE_dtr_nancheck, + LAPACKE_dtr_trans, + LAPACKE_dbbcsd, + LAPACKE_dbbcsd_work, + LAPACKE_dbdsdc, + LAPACKE_dbdsdc_work, + LAPACKE_dbdsqr, + LAPACKE_dbdsqr_work, + LAPACKE_ddisna, + LAPACKE_ddisna_work, + LAPACKE_dgbbrd, + LAPACKE_dgbbrd_work, + LAPACKE_dgbcon, + LAPACKE_dgbcon_work, + LAPACKE_dgbequ, + LAPACKE_dgbequ_work, + LAPACKE_dgbequb, + LAPACKE_dgbequb_work, + LAPACKE_dgbrfs, + LAPACKE_dgbrfs_work, + LAPACKE_dgbsv, + LAPACKE_dgbsv_work, + LAPACKE_dgbsvx, + LAPACKE_dgbsvx_work, + LAPACKE_dgbtrf, + LAPACKE_dgbtrf_work, + LAPACKE_dgbtrs, + LAPACKE_dgbtrs_work, + LAPACKE_dgebak, + LAPACKE_dgebak_work, + LAPACKE_dgebal, + LAPACKE_dgebal_work, + LAPACKE_dgebrd, + LAPACKE_dgebrd_work, + LAPACKE_dgecon, + LAPACKE_dgecon_work, + LAPACKE_dgeequ, + LAPACKE_dgeequ_work, + LAPACKE_dgeequb, + LAPACKE_dgeequb_work, + LAPACKE_dgees, + LAPACKE_dgees_work, + LAPACKE_dgeesx, + LAPACKE_dgeesx_work, + LAPACKE_dgeev, + LAPACKE_dgeev_work, + LAPACKE_dgeevx, + LAPACKE_dgeevx_work, + LAPACKE_dgehrd, + LAPACKE_dgehrd_work, + LAPACKE_dgejsv, + LAPACKE_dgejsv_work, + LAPACKE_dgelq2, + LAPACKE_dgelq2_work, + LAPACKE_dgelqf, + LAPACKE_dgelqf_work, + LAPACKE_dgels, + LAPACKE_dgels_work, + LAPACKE_dgelsd, + LAPACKE_dgelsd_work, + LAPACKE_dgelss, + LAPACKE_dgelss_work, + LAPACKE_dgelsy, + LAPACKE_dgelsy_work, + LAPACKE_dgemqrt, + LAPACKE_dgemqrt_work, + LAPACKE_dgeqlf, + LAPACKE_dgeqlf_work, + LAPACKE_dgeqp3, + LAPACKE_dgeqp3_work, + LAPACKE_dgeqr2, + LAPACKE_dgeqr2_work, + LAPACKE_dgeqrf, + LAPACKE_dgeqrf_work, + LAPACKE_dgeqrfp, + LAPACKE_dgeqrfp_work, + LAPACKE_dgeqrt, + LAPACKE_dgeqrt2, LAPACKE_dgeqrt2_work, LAPACKE_dgeqrt3, LAPACKE_dgeqrt3_work, @@ -1889,31 +2063,155 @@ LAPACKE_dtrttp_work, LAPACKE_dtzrzf, LAPACKE_dtzrzf_work, - LAPACKE_sbbcsd, - 
LAPACKE_sbbcsd_work, - LAPACKE_sbdsdc, - LAPACKE_sbdsdc_work, - LAPACKE_sbdsqr, - LAPACKE_sbdsqr_work, - LAPACKE_sdisna, - LAPACKE_sdisna_work, - LAPACKE_sgbbrd, - LAPACKE_sgbbrd_work, - LAPACKE_sgbcon, - LAPACKE_sgbcon_work, - LAPACKE_sgbequ, - LAPACKE_sgbequ_work, - LAPACKE_sgbequb, - LAPACKE_sgbequb_work, - LAPACKE_sgbrfs, - LAPACKE_sgbrfs_work, - LAPACKE_sgbsv, - LAPACKE_sgbsv_work, - LAPACKE_sgbsvx, - LAPACKE_sgbsvx_work, - LAPACKE_sgbtrf, - LAPACKE_sgbtrf_work, - LAPACKE_sgbtrs, + LAPACKE_dlatms, + LAPACKE_dlatms_work, + LAPACKE_dlagge, + LAPACKE_dlagge_work, + LAPACKE_dlagsy, + LAPACKE_dlagsy_work, + LAPACKE_dbdsvdx, + LAPACKE_dbdsvdx_work, + LAPACKE_dgesvdx, + LAPACKE_dgesvdx_work, + LAPACKE_dgetrf2, + LAPACKE_dgetrf2_work, + LAPACKE_dgges3, + LAPACKE_dgges3_work, + LAPACKE_dggev3, + LAPACKE_dggev3_work, + LAPACKE_dgghd3, + LAPACKE_dgghd3_work, + LAPACKE_dggsvd3, + LAPACKE_dggsvd3_work, + LAPACKE_dggsvp3, + LAPACKE_dggsvp3_work, + LAPACKE_dlapmt, + LAPACKE_dlapmt_work, + LAPACKE_dlascl, + LAPACKE_dlascl_work, + LAPACKE_dorcsd2by1, + LAPACKE_dorcsd2by1_work, + LAPACKE_dpotrf2, + LAPACKE_dpotrf2_work, + LAPACKE_dsytrf_rook, + LAPACKE_dsytrf_rook_work, + LAPACKE_dsytrs_rook, + LAPACKE_dsytrs_rook_work, + LAPACKE_dgelq, + LAPACKE_dgelq_work, + LAPACKE_dgemlq, + LAPACKE_dgemlq_work, + LAPACKE_dgemqr, + LAPACKE_dgemqr_work, + LAPACKE_dgeqr, + LAPACKE_dgeqr_work, + LAPACKE_dgetsls, + LAPACKE_dgetsls_work, + LAPACKE_dsbev_2stage, + LAPACKE_dsbev_2stage_work, + LAPACKE_dsbevd_2stage, + LAPACKE_dsbevd_2stage_work, + LAPACKE_dsbevx_2stage, + LAPACKE_dsbevx_2stage_work, + LAPACKE_dsycon_3, + LAPACKE_dsycon_3_work, + LAPACKE_dsyev_2stage, + LAPACKE_dsyev_2stage_work, + LAPACKE_dsyevd_2stage, + LAPACKE_dsyevd_2stage_work, + LAPACKE_dsyevr_2stage, + LAPACKE_dsyevr_2stage_work, + LAPACKE_dsyevx_2stage, + LAPACKE_dsyevx_2stage_work, + LAPACKE_dsygv_2stage, + LAPACKE_dsygv_2stage_work, + LAPACKE_dsysv_aa, + LAPACKE_dsysv_aa_work, + LAPACKE_dsysv_rk, + LAPACKE_dsysv_rk_work, 
+ LAPACKE_dsytrf_aa, + LAPACKE_dsytrf_aa_work, + LAPACKE_dsytrf_rk, + LAPACKE_dsytrf_rk_work, + LAPACKE_dsytri_3, + LAPACKE_dsytri_3_work, + LAPACKE_dsytrs_aa, + LAPACKE_dsytrs_aa_work, + LAPACKE_dsytrs_3, + LAPACKE_dsytrs_3_work, + LAPACKE_dlassq, + LAPACKE_dlassq_work, + LAPACKE_dsysv_aa_2stage, + LAPACKE_dsysv_aa_2stage_work, + LAPACKE_dsytrf_aa_2stage, + LAPACKE_dsytrf_aa_2stage_work, + LAPACKE_dsytrs_aa_2stage, + LAPACKE_dsytrs_aa_2stage_work, + LAPACKE_dgesvdq, + LAPACKE_dgesvdq_work, + LAPACKE_slag2d, + LAPACKE_slag2d_work, +); +@lapackeobjss = ( + LAPACKE_sgb_nancheck, + LAPACKE_sgb_trans, + LAPACKE_sge_nancheck, + LAPACKE_sge_trans, + LAPACKE_sgg_nancheck, + LAPACKE_sgg_trans, + LAPACKE_sgt_nancheck, + LAPACKE_shs_nancheck, + LAPACKE_shs_trans, + LAPACKE_s_nancheck, + LAPACKE_spb_nancheck, + LAPACKE_spb_trans, + LAPACKE_spf_nancheck, + LAPACKE_spf_trans, + LAPACKE_spo_nancheck, + LAPACKE_spo_trans, + LAPACKE_spp_nancheck, + LAPACKE_spp_trans, + LAPACKE_spt_nancheck, + LAPACKE_ssb_nancheck, + LAPACKE_ssb_trans, + LAPACKE_ssp_nancheck, + LAPACKE_ssp_trans, + LAPACKE_sst_nancheck, + LAPACKE_ssy_nancheck, + LAPACKE_ssy_trans, + LAPACKE_stb_nancheck, + LAPACKE_stb_trans, + LAPACKE_stf_nancheck, + LAPACKE_stf_trans, + LAPACKE_stp_nancheck, + LAPACKE_stp_trans, + LAPACKE_str_nancheck, + LAPACKE_str_trans, + LAPACKE_sbbcsd, + LAPACKE_sbbcsd_work, + LAPACKE_sbdsdc, + LAPACKE_sbdsdc_work, + LAPACKE_sbdsqr, + LAPACKE_sbdsqr_work, + LAPACKE_sdisna, + LAPACKE_sdisna_work, + LAPACKE_sgbbrd, + LAPACKE_sgbbrd_work, + LAPACKE_sgbcon, + LAPACKE_sgbcon_work, + LAPACKE_sgbequ, + LAPACKE_sgbequ_work, + LAPACKE_sgbequb, + LAPACKE_sgbequb_work, + LAPACKE_sgbrfs, + LAPACKE_sgbrfs_work, + LAPACKE_sgbsv, + LAPACKE_sgbsv_work, + LAPACKE_sgbsvx, + LAPACKE_sgbsvx_work, + LAPACKE_sgbtrf, + LAPACKE_sgbtrf_work, + LAPACKE_sgbtrs, LAPACKE_sgbtrs_work, LAPACKE_sgebak, LAPACKE_sgebak_work, @@ -2035,8 +2333,6 @@ LAPACKE_slacn2_work, LAPACKE_slacpy, LAPACKE_slacpy_work, - LAPACKE_slag2d, - 
LAPACKE_slag2d_work, LAPACKE_slamch, LAPACKE_slamch_work, LAPACKE_slange, @@ -2367,112 +2663,240 @@ LAPACKE_strttp_work, LAPACKE_stzrzf, LAPACKE_stzrzf_work, - LAPACKE_zbbcsd, - LAPACKE_zbbcsd_work, - LAPACKE_zbdsqr, - LAPACKE_zbdsqr_work, - LAPACKE_zcgesv, - LAPACKE_zcgesv_work, - LAPACKE_zcposv, - LAPACKE_zcposv_work, - LAPACKE_zgbbrd, - LAPACKE_zgbbrd_work, - LAPACKE_zgbcon, - LAPACKE_zgbcon_work, - LAPACKE_zgbequ, - LAPACKE_zgbequ_work, - LAPACKE_zgbequb, - LAPACKE_zgbequb_work, - LAPACKE_zgbrfs, - LAPACKE_zgbrfs_work, - LAPACKE_zgbsv, - LAPACKE_zgbsv_work, - LAPACKE_zgbsvx, - LAPACKE_zgbsvx_work, - LAPACKE_zgbtrf, - LAPACKE_zgbtrf_work, - LAPACKE_zgbtrs, - LAPACKE_zgbtrs_work, - LAPACKE_zgebak, - LAPACKE_zgebak_work, - LAPACKE_zgebal, - LAPACKE_zgebal_work, - LAPACKE_zgebrd, - LAPACKE_zgebrd_work, - LAPACKE_zgecon, - LAPACKE_zgecon_work, - LAPACKE_zgeequ, - LAPACKE_zgeequ_work, - LAPACKE_zgeequb, - LAPACKE_zgeequb_work, - LAPACKE_zgees, - LAPACKE_zgees_work, - LAPACKE_zgeesx, - LAPACKE_zgeesx_work, - LAPACKE_zgeev, - LAPACKE_zgeev_work, - LAPACKE_zgeevx, - LAPACKE_zgeevx_work, - LAPACKE_zgehrd, - LAPACKE_zgehrd_work, - LAPACKE_zgelq2, - LAPACKE_zgelq2_work, - LAPACKE_zgelqf, - LAPACKE_zgelqf_work, - LAPACKE_zgels, - LAPACKE_zgels_work, - LAPACKE_zgelsd, - LAPACKE_zgelsd_work, - LAPACKE_zgelss, - LAPACKE_zgelss_work, - LAPACKE_zgelsy, - LAPACKE_zgelsy_work, - LAPACKE_zgemqrt, - LAPACKE_zgemqrt_work, - LAPACKE_zgeqlf, - LAPACKE_zgeqlf_work, - LAPACKE_zgeqp3, - LAPACKE_zgeqp3_work, - LAPACKE_zgeqr2, - LAPACKE_zgeqr2_work, - LAPACKE_zgeqrf, - LAPACKE_zgeqrf_work, - LAPACKE_zgeqrfp, - LAPACKE_zgeqrfp_work, - LAPACKE_zgeqrt, - LAPACKE_zgeqrt2, - LAPACKE_zgeqrt2_work, - LAPACKE_zgeqrt3, - LAPACKE_zgeqrt3_work, - LAPACKE_zgeqrt_work, - LAPACKE_zgerfs, - LAPACKE_zgerfs_work, - LAPACKE_zgerqf, - LAPACKE_zgerqf_work, - LAPACKE_zgesdd, - LAPACKE_zgesdd_work, - LAPACKE_zgesv, - LAPACKE_zgesv_work, - LAPACKE_zgesvd, - LAPACKE_zgesvd_work, - LAPACKE_zgesvx, - 
LAPACKE_zgesvx_work, - LAPACKE_zgetf2, - LAPACKE_zgetf2_work, - LAPACKE_zgetrf, - LAPACKE_zgetrf_work, - LAPACKE_zgetri, - LAPACKE_zgetri_work, - LAPACKE_zgetrs, - LAPACKE_zgetrs_work, - LAPACKE_zggbak, - LAPACKE_zggbak_work, - LAPACKE_zggbal, - LAPACKE_zggbal_work, - LAPACKE_zgges, - LAPACKE_zgges_work, - LAPACKE_zggesx, - LAPACKE_zggesx_work, + LAPACKE_slatms, + LAPACKE_slatms_work, + LAPACKE_slagge, + LAPACKE_slagge_work, + LAPACKE_slagsy, + LAPACKE_slagsy_work, + LAPACKE_sbdsvdx, + LAPACKE_sbdsvdx_work, + LAPACKE_sgesvdx, + LAPACKE_sgesvdx_work, + LAPACKE_sgetrf2, + LAPACKE_sgetrf2_work, + LAPACKE_sgges3, + LAPACKE_sgges3_work, + LAPACKE_sggev3, + LAPACKE_sggev3_work, + LAPACKE_sgghd3, + LAPACKE_sgghd3_work, + LAPACKE_sggsvd3, + LAPACKE_sggsvd3_work, + LAPACKE_sggsvp3, + LAPACKE_sggsvp3_work, + LAPACKE_slapmt, + LAPACKE_slapmt_work, + LAPACKE_slascl, + LAPACKE_slascl_work, + LAPACKE_sorcsd2by1, + LAPACKE_sorcsd2by1_work, + LAPACKE_spotrf2, + LAPACKE_spotrf2_work, + LAPACKE_ssytrf_rook, + LAPACKE_ssytrf_rook_work, + LAPACKE_ssytrs_rook, + LAPACKE_ssytrs_rook_work, + LAPACKE_stpqrt, + LAPACKE_stpqrt_work, + LAPACKE_sgelq, + LAPACKE_sgelq_work, + LAPACKE_sgemlq, + LAPACKE_sgemlq_work, + LAPACKE_sgemqr, + LAPACKE_sgemqr_work, + LAPACKE_sgeqr, + LAPACKE_sgeqr_work, + LAPACKE_sgetsls, + LAPACKE_sgetsls_work, + LAPACKE_ssbev_2stage, + LAPACKE_ssbev_2stage_work, + LAPACKE_ssbevd_2stage, + LAPACKE_ssbevd_2stage_work, + LAPACKE_ssbevx_2stage, + LAPACKE_ssbevx_2stage_work, + LAPACKE_ssycon_3, + LAPACKE_ssycon_3_work, + LAPACKE_ssyev_2stage, + LAPACKE_ssyev_2stage_work, + LAPACKE_ssyevd_2stage, + LAPACKE_ssyevd_2stage_work, + LAPACKE_ssyevr_2stage, + LAPACKE_ssyevr_2stage_work, + LAPACKE_ssyevx_2stage, + LAPACKE_ssyevx_2stage_work, + LAPACKE_ssygv_2stage, + LAPACKE_ssygv_2stage_work, + LAPACKE_ssysv_aa, + LAPACKE_ssysv_aa_work, + LAPACKE_ssysv_rk, + LAPACKE_ssysv_rk_work, + LAPACKE_ssytrf_aa, + LAPACKE_ssytrf_aa_work, + LAPACKE_ssytrf_rk, + LAPACKE_ssytrf_rk_work, + 
LAPACKE_ssytri_3, + LAPACKE_ssytri_3_work, + LAPACKE_ssytrs_aa, + LAPACKE_ssytrs_aa_work, + LAPACKE_ssytrs_3, + LAPACKE_ssytrs_3_work, + LAPACKE_slassq, + LAPACKE_slassq_work, + LAPACKE_ssysv_aa_2stage, + LAPACKE_ssysv_aa_2stage_work, + LAPACKE_ssytrf_aa_2stage, + LAPACKE_ssytrf_aa_2stage_work, + LAPACKE_ssytrs_aa_2stage, + LAPACKE_ssytrs_aa_2stage_work, + LAPACKE_sgesvdq, + LAPACKE_sgesvdq_work, +); +@lapackeobjsz = ( + LAPACKE_zgb_nancheck, + LAPACKE_zgb_trans, + LAPACKE_zge_nancheck, + LAPACKE_zge_trans, + LAPACKE_zgg_nancheck, + LAPACKE_zgg_trans, + LAPACKE_zgt_nancheck, + LAPACKE_zhb_nancheck, + LAPACKE_zhb_trans, + LAPACKE_zhe_nancheck, + LAPACKE_zhe_trans, + LAPACKE_zhp_nancheck, + LAPACKE_zhp_trans, + LAPACKE_zhs_nancheck, + LAPACKE_zhs_trans, + LAPACKE_z_nancheck, + LAPACKE_zpb_nancheck, + LAPACKE_zpb_trans, + LAPACKE_zpf_nancheck, + LAPACKE_zpf_trans, + LAPACKE_zpo_nancheck, + LAPACKE_zpo_trans, + LAPACKE_zpp_nancheck, + LAPACKE_zpp_trans, + LAPACKE_zpt_nancheck, + LAPACKE_zsp_nancheck, + LAPACKE_zsp_trans, + LAPACKE_zst_nancheck, + LAPACKE_zsy_nancheck, + LAPACKE_zsy_trans, + LAPACKE_ztb_nancheck, + LAPACKE_ztb_trans, + LAPACKE_ztf_nancheck, + LAPACKE_ztf_trans, + LAPACKE_ztp_nancheck, + LAPACKE_ztp_trans, + LAPACKE_ztr_nancheck, + LAPACKE_ztr_trans, + LAPACKE_zbbcsd, + LAPACKE_zbbcsd_work, + LAPACKE_zbdsqr, + LAPACKE_zbdsqr_work, + LAPACKE_zcgesv, + LAPACKE_zcgesv_work, + LAPACKE_zcposv, + LAPACKE_zcposv_work, + LAPACKE_zgbbrd, + LAPACKE_zgbbrd_work, + LAPACKE_zgbcon, + LAPACKE_zgbcon_work, + LAPACKE_zgbequ, + LAPACKE_zgbequ_work, + LAPACKE_zgbequb, + LAPACKE_zgbequb_work, + LAPACKE_zgbrfs, + LAPACKE_zgbrfs_work, + LAPACKE_zgbsv, + LAPACKE_zgbsv_work, + LAPACKE_zgbsvx, + LAPACKE_zgbsvx_work, + LAPACKE_zgbtrf, + LAPACKE_zgbtrf_work, + LAPACKE_zgbtrs, + LAPACKE_zgbtrs_work, + LAPACKE_zgebak, + LAPACKE_zgebak_work, + LAPACKE_zgebal, + LAPACKE_zgebal_work, + LAPACKE_zgebrd, + LAPACKE_zgebrd_work, + LAPACKE_zgecon, + LAPACKE_zgecon_work, + LAPACKE_zgeequ, + 
LAPACKE_zgeequ_work, + LAPACKE_zgeequb, + LAPACKE_zgeequb_work, + LAPACKE_zgees, + LAPACKE_zgees_work, + LAPACKE_zgeesx, + LAPACKE_zgeesx_work, + LAPACKE_zgeev, + LAPACKE_zgeev_work, + LAPACKE_zgeevx, + LAPACKE_zgeevx_work, + LAPACKE_zgehrd, + LAPACKE_zgehrd_work, + LAPACKE_zgelq2, + LAPACKE_zgelq2_work, + LAPACKE_zgelqf, + LAPACKE_zgelqf_work, + LAPACKE_zgels, + LAPACKE_zgels_work, + LAPACKE_zgelsd, + LAPACKE_zgelsd_work, + LAPACKE_zgelss, + LAPACKE_zgelss_work, + LAPACKE_zgelsy, + LAPACKE_zgelsy_work, + LAPACKE_zgemqrt, + LAPACKE_zgemqrt_work, + LAPACKE_zgeqlf, + LAPACKE_zgeqlf_work, + LAPACKE_zgeqp3, + LAPACKE_zgeqp3_work, + LAPACKE_zgeqr2, + LAPACKE_zgeqr2_work, + LAPACKE_zgeqrf, + LAPACKE_zgeqrf_work, + LAPACKE_zgeqrfp, + LAPACKE_zgeqrfp_work, + LAPACKE_zgeqrt, + LAPACKE_zgeqrt2, + LAPACKE_zgeqrt2_work, + LAPACKE_zgeqrt3, + LAPACKE_zgeqrt3_work, + LAPACKE_zgeqrt_work, + LAPACKE_zgerfs, + LAPACKE_zgerfs_work, + LAPACKE_zgerqf, + LAPACKE_zgerqf_work, + LAPACKE_zgesdd, + LAPACKE_zgesdd_work, + LAPACKE_zgesv, + LAPACKE_zgesv_work, + LAPACKE_zgesvd, + LAPACKE_zgesvd_work, + LAPACKE_zgesvx, + LAPACKE_zgesvx_work, + LAPACKE_zgetf2, + LAPACKE_zgetf2_work, + LAPACKE_zgetrf, + LAPACKE_zgetrf_work, + LAPACKE_zgetri, + LAPACKE_zgetri_work, + LAPACKE_zgetrs, + LAPACKE_zgetrs_work, + LAPACKE_zggbak, + LAPACKE_zggbak_work, + LAPACKE_zggbal, + LAPACKE_zggbal_work, + LAPACKE_zgges, + LAPACKE_zgges_work, + LAPACKE_zggesx, + LAPACKE_zggesx_work, LAPACKE_zggev, LAPACKE_zggev_work, LAPACKE_zggevx, @@ -2864,11 +3288,7 @@ LAPACKE_zupmtr, LAPACKE_zupmtr_work, LAPACKE_zsyr, - LAPACKE_csyr, LAPACKE_zsyr_work, - LAPACKE_csyr_work, - LAPACKE_ilaver, - ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the ## corresponding LAPACK extended precision routines. 
@@ -2948,128 +3368,15 @@ ## @(MATGEN_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` ## Not exported: requires LAPACKE_TESTING to be set and depends on libtmg ## (see `lapack-3.4.1/TESTING/MATGEN`). - LAPACKE_clatms, - LAPACKE_clatms_work, - LAPACKE_dlatms, - LAPACKE_dlatms_work, - LAPACKE_slatms, - LAPACKE_slatms_work, LAPACKE_zlatms, LAPACKE_zlatms_work, - LAPACKE_clagge, - LAPACKE_clagge_work, - LAPACKE_dlagge, - LAPACKE_dlagge_work, - LAPACKE_slagge, - LAPACKE_slagge_work, LAPACKE_zlagge, LAPACKE_zlagge_work, - LAPACKE_claghe, - LAPACKE_claghe_work, LAPACKE_zlaghe, LAPACKE_zlaghe_work, - LAPACKE_clagsy, - LAPACKE_clagsy_work, - LAPACKE_dlagsy, - LAPACKE_dlagsy_work, - LAPACKE_slagsy, - LAPACKE_slagsy_work, LAPACKE_zlagsy, LAPACKE_zlagsy_work, ## new function from lapack-3.6.0 - - LAPACKE_cgejsv, - LAPACKE_cgejsv_work, - LAPACKE_cgesvdx, - LAPACKE_cgesvdx_work, - LAPACKE_cgesvj, - LAPACKE_cgesvj_work, - LAPACKE_cgetrf2, - LAPACKE_cgetrf2_work, - LAPACKE_cgges3, - LAPACKE_cgges3_work, - LAPACKE_cggev3, - LAPACKE_cggev3_work, - LAPACKE_cgghd3, - LAPACKE_cgghd3_work, - LAPACKE_cggsvd3, - LAPACKE_cggsvd3_work, - LAPACKE_cggsvp3, - LAPACKE_cggsvp3_work, - LAPACKE_chetrf_rook, - LAPACKE_chetrf_rook_work, - LAPACKE_chetrs_rook, - LAPACKE_chetrs_rook_work, - LAPACKE_clapmt, - LAPACKE_clapmt_work, - LAPACKE_clascl, - LAPACKE_clascl_work, - LAPACKE_cpotrf2, - LAPACKE_cpotrf2_work, - LAPACKE_csytrf_rook, - LAPACKE_csytrf_rook_work, - LAPACKE_csytrs_rook, - LAPACKE_csytrs_rook_work, - LAPACKE_cuncsd2by1, - LAPACKE_cuncsd2by1_work, - LAPACKE_dbdsvdx, - LAPACKE_dbdsvdx_work, - LAPACKE_dgesvdx, - LAPACKE_dgesvdx_work, - LAPACKE_dgetrf2, - LAPACKE_dgetrf2_work, - LAPACKE_dgges3, - LAPACKE_dgges3_work, - LAPACKE_dggev3, - LAPACKE_dggev3_work, - LAPACKE_dgghd3, - LAPACKE_dgghd3_work, - LAPACKE_dggsvd3, - LAPACKE_dggsvd3_work, - LAPACKE_dggsvp3, - LAPACKE_dggsvp3_work, - LAPACKE_dlapmt, - LAPACKE_dlapmt_work, - LAPACKE_dlascl, - LAPACKE_dlascl_work, - LAPACKE_dorcsd2by1, - 
LAPACKE_dorcsd2by1_work, - LAPACKE_dpotrf2, - LAPACKE_dpotrf2_work, - LAPACKE_dsytrf_rook, - LAPACKE_dsytrf_rook_work, - LAPACKE_dsytrs_rook, - LAPACKE_dsytrs_rook_work, - LAPACKE_sbdsvdx, - LAPACKE_sbdsvdx_work, - LAPACKE_sgesvdx, - LAPACKE_sgesvdx_work, - LAPACKE_sgetrf2, - LAPACKE_sgetrf2_work, - LAPACKE_sgges3, - LAPACKE_sgges3_work, - LAPACKE_sggev3, - LAPACKE_sggev3_work, - LAPACKE_sgghd3, - LAPACKE_sgghd3_work, - LAPACKE_sggsvd3, - LAPACKE_sggsvd3_work, - LAPACKE_sggsvp3, - LAPACKE_sggsvp3_work, - LAPACKE_slapmt, - LAPACKE_slapmt_work, - LAPACKE_slascl, - LAPACKE_slascl_work, - LAPACKE_sorcsd2by1, - LAPACKE_sorcsd2by1_work, - LAPACKE_spotrf2, - LAPACKE_spotrf2_work, - LAPACKE_ssytrf_rook, - LAPACKE_ssytrf_rook_work, - LAPACKE_ssytrs_rook, - LAPACKE_ssytrs_rook_work, - LAPACKE_stpqrt, - LAPACKE_stpqrt_work, LAPACKE_zgejsv, LAPACKE_zgejsv_work, LAPACKE_zgesvdx, @@ -3106,148 +3413,6 @@ LAPACKE_zuncsd2by1_work, ## new function from lapack-3.7.0 - LAPACKE_cgelq, - LAPACKE_cgelq_work, - LAPACKE_cgemlq, - LAPACKE_cgemlq_work, - LAPACKE_cgemqr, - LAPACKE_cgemqr_work, - LAPACKE_cgeqr, - LAPACKE_cgeqr_work, - LAPACKE_cgetsls, - LAPACKE_cgetsls_work, - LAPACKE_chbev_2stage, - LAPACKE_chbev_2stage_work, - LAPACKE_chbevd_2stage, - LAPACKE_chbevd_2stage_work, - LAPACKE_chbevx_2stage, - LAPACKE_chbevx_2stage_work, - LAPACKE_checon_3, - LAPACKE_checon_3_work, - LAPACKE_cheev_2stage, - LAPACKE_cheev_2stage_work, - LAPACKE_cheevd_2stage, - LAPACKE_cheevd_2stage_work, - LAPACKE_cheevr_2stage, - LAPACKE_cheevr_2stage_work, - LAPACKE_cheevx_2stage, - LAPACKE_cheevx_2stage_work, - LAPACKE_chegv_2stage, - LAPACKE_chegv_2stage_work, - LAPACKE_chesv_aa, - LAPACKE_chesv_aa_work, - LAPACKE_chesv_rk, - LAPACKE_chesv_rk_work, - LAPACKE_chetrf_aa, - LAPACKE_chetrf_aa_work, - LAPACKE_chetrf_rk, - LAPACKE_chetrf_rk_work, - LAPACKE_chetri_3, - LAPACKE_chetri_3_work, - LAPACKE_chetrs_aa, - LAPACKE_chetrs_aa_work, - LAPACKE_chetrs_3, - LAPACKE_chetrs_3_work, - LAPACKE_csycon_3, - 
LAPACKE_csycon_3_work, - LAPACKE_csysv_aa, - LAPACKE_csysv_aa_work, - LAPACKE_csysv_rk, - LAPACKE_csysv_rk_work, - LAPACKE_csytrf_aa, - LAPACKE_csytrf_aa_work, - LAPACKE_csytrf_rk, - LAPACKE_csytrf_rk_work, - LAPACKE_csytri_3, - LAPACKE_csytri_3_work, - LAPACKE_csytrs_aa, - LAPACKE_csytrs_aa_work, - LAPACKE_csytrs_3, - LAPACKE_csytrs_3_work, - LAPACKE_dgelq, - LAPACKE_dgelq_work, - LAPACKE_dgemlq, - LAPACKE_dgemlq_work, - LAPACKE_dgemqr, - LAPACKE_dgemqr_work, - LAPACKE_dgeqr, - LAPACKE_dgeqr_work, - LAPACKE_dgetsls, - LAPACKE_dgetsls_work, - LAPACKE_dsbev_2stage, - LAPACKE_dsbev_2stage_work, - LAPACKE_dsbevd_2stage, - LAPACKE_dsbevd_2stage_work, - LAPACKE_dsbevx_2stage, - LAPACKE_dsbevx_2stage_work, - LAPACKE_dsycon_3, - LAPACKE_dsycon_3_work, - LAPACKE_dsyev_2stage, - LAPACKE_dsyev_2stage_work, - LAPACKE_dsyevd_2stage, - LAPACKE_dsyevd_2stage_work, - LAPACKE_dsyevr_2stage, - LAPACKE_dsyevr_2stage_work, - LAPACKE_dsyevx_2stage, - LAPACKE_dsyevx_2stage_work, - LAPACKE_dsygv_2stage, - LAPACKE_dsygv_2stage_work, - LAPACKE_dsysv_aa, - LAPACKE_dsysv_aa_work, - LAPACKE_dsysv_rk, - LAPACKE_dsysv_rk_work, - LAPACKE_dsytrf_aa, - LAPACKE_dsytrf_aa_work, - LAPACKE_dsytrf_rk, - LAPACKE_dsytrf_rk_work, - LAPACKE_dsytri_3, - LAPACKE_dsytri_3_work, - LAPACKE_dsytrs_aa, - LAPACKE_dsytrs_aa_work, - LAPACKE_dsytrs_3, - LAPACKE_dsytrs_3_work, - LAPACKE_sgelq, - LAPACKE_sgelq_work, - LAPACKE_sgemlq, - LAPACKE_sgemlq_work, - LAPACKE_sgemqr, - LAPACKE_sgemqr_work, - LAPACKE_sgeqr, - LAPACKE_sgeqr_work, - LAPACKE_sgetsls, - LAPACKE_sgetsls_work, - LAPACKE_ssbev_2stage, - LAPACKE_ssbev_2stage_work, - LAPACKE_ssbevd_2stage, - LAPACKE_ssbevd_2stage_work, - LAPACKE_ssbevx_2stage, - LAPACKE_ssbevx_2stage_work, - LAPACKE_ssycon_3, - LAPACKE_ssycon_3_work, - LAPACKE_ssyev_2stage, - LAPACKE_ssyev_2stage_work, - LAPACKE_ssyevd_2stage, - LAPACKE_ssyevd_2stage_work, - LAPACKE_ssyevr_2stage, - LAPACKE_ssyevr_2stage_work, - LAPACKE_ssyevx_2stage, - LAPACKE_ssyevx_2stage_work, - LAPACKE_ssygv_2stage, 
- LAPACKE_ssygv_2stage_work, - LAPACKE_ssysv_aa, - LAPACKE_ssysv_aa_work, - LAPACKE_ssysv_rk, - LAPACKE_ssysv_rk_work, - LAPACKE_ssytrf_aa, - LAPACKE_ssytrf_aa_work, - LAPACKE_ssytrf_rk, - LAPACKE_ssytrf_rk_work, - LAPACKE_ssytri_3, - LAPACKE_ssytri_3_work, - LAPACKE_ssytrs_aa, - LAPACKE_ssytrs_aa_work, - LAPACKE_ssytrs_3, - LAPACKE_ssytrs_3_work, LAPACKE_zgelq, LAPACKE_zgelq_work, LAPACKE_zgemlq, @@ -3308,42 +3473,6 @@ LAPACKE_zsytrs_3_work, ## new function from lapack-3.8.0 - LAPACKE_chesv_aa_2stage, - LAPACKE_chesv_aa_2stage_work, - LAPACKE_chetrf_aa_2stage, - LAPACKE_chetrf_aa_2stage_work, - LAPACKE_chetrs_aa_2stage, - LAPACKE_chetrs_aa_2stage_work, - LAPACKE_clacrm, - LAPACKE_clacrm_work, - LAPACKE_clarcm, - LAPACKE_clarcm_work, - LAPACKE_classq, - LAPACKE_classq_work, - LAPACKE_csysv_aa_2stage, - LAPACKE_csysv_aa_2stage_work, - LAPACKE_csytrf_aa_2stage, - LAPACKE_csytrf_aa_2stage_work, - LAPACKE_csytrs_aa_2stage, - LAPACKE_csytrs_aa_2stage_work, - LAPACKE_dlassq, - LAPACKE_dlassq_work, - LAPACKE_dsysv_aa_2stage, - LAPACKE_dsysv_aa_2stage_work, - LAPACKE_dsytrf_aa_2stage, - LAPACKE_dsytrf_aa_2stage_work, - LAPACKE_dsytrs_aa_2stage, - LAPACKE_dsytrs_aa_2stage_work, - LAPACKE_get_nancheck, - LAPACKE_set_nancheck, - LAPACKE_slassq, - LAPACKE_slassq_work, - LAPACKE_ssysv_aa_2stage, - LAPACKE_ssysv_aa_2stage_work, - LAPACKE_ssytrf_aa_2stage, - LAPACKE_ssytrf_aa_2stage_work, - LAPACKE_ssytrs_aa_2stage, - LAPACKE_ssytrs_aa_2stage_work, LAPACKE_zhesv_aa_2stage, LAPACKE_zhesv_aa_2stage_work, LAPACKE_zhetrf_aa_2stage, @@ -3362,36 +3491,19 @@ LAPACKE_zsytrf_aa_2stage_work, LAPACKE_zsytrs_aa_2stage, LAPACKE_zsytrs_aa_2stage_work, - # new functions from 3.9.0 - LAPACKE_dgesvdq, - LAPACKE_dgesvdq_work, - LAPACKE_sgesvdq, - LAPACKE_sgesvdq_work, LAPACKE_zgesvdq, LAPACKE_zgesvdq_work - ); #These function may need 2 underscores. 
@lapack_embeded_underscore_objs=( - xerbla_array, chla_transtype, slasyf_rook, + xerbla_array, chla_transtype, + ); +@lapack_embeded_underscore_objs_s=( + slasyf_rook, ssytf2_rook, ssytrf_rook, ssytrs_rook, ssytri_rook, ssycon_rook, ssysv_rook, - chetf2_rook, chetrf_rook, chetri_rook, - chetrs_rook, checon_rook, chesv_rook, - clahef_rook, clasyf_rook, - csytf2_rook, csytrf_rook, csytrs_rook, - csytri_rook, csycon_rook, csysv_rook, - dlasyf_rook, - dsytf2_rook, dsytrf_rook, dsytrs_rook, - dsytri_rook, dsycon_rook, dsysv_rook, - zhetf2_rook, zhetrf_rook, zhetri_rook, - zhetrs_rook, zhecon_rook, zhesv_rook, - zlahef_rook, zlasyf_rook, - zsytf2_rook, zsytrf_rook, zsytrs_rook, - zsytri_rook, zsycon_rook, zsysv_rook, -# 3.7.0 slasyf_rk, ssyconvf_rook, ssytf2_rk, ssytrf_rk, ssytrs_3, ssytri_3, ssytri_3x, ssycon_3, ssysv_rk, @@ -3400,15 +3512,18 @@ ssytrd_sb2st, ssb2st_kernels, ssyevd_2stage, ssyev_2stage, ssyevx_2stage, ssyevr_2stage, ssbev_2stage, ssbevx_2stage, ssbevd_2stage, - ssygv_2stage, dlasyf_rk, dsyconvf_rook, - dsytf2_rk, dsytrf_rk, dsytrs_3, - dsytri_3, dsytri_3x, dsycon_3, - dsysv_rk, dlasyf_aa, dsysv_aa, - dsytrf_aa, dsytrs_aa, dsytrd_2stage, - dsytrd_sy2sb, dsytrd_sb2st, dsb2st_kernels, - dsyevd_2stage, dsyev_2stage, dsyevx_2stage, - dsyevr_2stage, dsbev_2stage, dsbevx_2stage, - dsbevd_2stage, dsygv_2stage, chetf2_rk, + ssygv_2stage, + ssysv_aa_2stage, ssytrf_aa_2stage, + ssytrs_aa_2stage, + slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, +); +@lapack_embeded_underscore_objs_c=( + chetf2_rook, chetrf_rook, chetri_rook, + chetrs_rook, checon_rook, chesv_rook, + clahef_rook, clasyf_rook, + csytf2_rook, csytrf_rook, csytrs_rook, + csytri_rook, csycon_rook, csysv_rook, + chetf2_rk, chetrf_rk, chetri_3, chetri_3x, chetrs_3, checon_3, chesv_rk, chesv_aa, chetrf_aa, chetrs_aa, @@ -3421,6 +3536,35 @@ chb2st_kernels, cheevd_2stage, cheev_2stage, cheevx_2stage, cheevr_2stage, chbev_2stage, chbevx_2stage, chbevd_2stage, chegv_2stage, + chesv_aa_2stage, + 
chetrf_aa_2stage, chetrs_aa_2stage, + csysv_aa_2stage, csytrf_aa_2stage, + csytrs_aa_2stage, + claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, +); +@lapack_embeded_underscore_objs_d=( + dlasyf_rook, + dsytf2_rook, dsytrf_rook, dsytrs_rook, + dsytri_rook, dsycon_rook, dsysv_rook, + dlasyf_rk, dsyconvf_rook, + dsytf2_rk, dsytrf_rk, dsytrs_3, + dsytri_3, dsytri_3x, dsycon_3, + dsysv_rk, dlasyf_aa, dsysv_aa, + dsytrf_aa, dsytrs_aa, dsytrd_2stage, + dsytrd_sy2sb, dsytrd_sb2st, dsb2st_kernels, + dsyevd_2stage, dsyev_2stage, dsyevx_2stage, + dsyevr_2stage, dsbev_2stage, dsbevx_2stage, + dsbevd_2stage, dsygv_2stage, + dsysv_aa_2stage, + dsytrf_aa_2stage, dsytrs_aa_2stage, + dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, +); +@lapack_embeded_underscore_objs_z=( + zhetf2_rook, zhetrf_rook, zhetri_rook, + zhetrs_rook, zhecon_rook, zhesv_rook, + zlahef_rook, zlasyf_rook, + zsytf2_rook, zsytrf_rook, zsytrs_rook, + zsytri_rook, zsycon_rook, zsysv_rook, zhetf2_rk, zhetrf_rk, zhetri_3, zhetri_3x, zhetrs_3, zhecon_3, zhesv_rk, zhesv_aa, zhetrf_aa, @@ -3434,22 +3578,10 @@ zheev_2stage, zheevx_2stage, zheevr_2stage, zhbev_2stage, zhbevx_2stage, zhbevd_2stage, zhegv_2stage, -# 3.8.0 - ssysv_aa_2stage, ssytrf_aa_2stage, - ssytrs_aa_2stage, chesv_aa_2stage, - chetrf_aa_2stage, chetrs_aa_2stage, - csysv_aa_2stage, csytrf_aa_2stage, - csytrs_aa_2stage, dsysv_aa_2stage, - dsytrf_aa_2stage, dsytrs_aa_2stage, zhesv_aa_2stage, zhetrf_aa_2stage, zhetrs_aa_2stage, zsysv_aa_2stage, zsytrf_aa_2stage, zsytrs_aa_2stage, -# 3.9.0 - claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col, - dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col, - slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col, zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col - ); @@ -3461,6 +3593,42 @@ if ($ARGV[12] == 1) { @blasobjs = (@blasobjs, @halfblasobjs); @cblasobjs = (@cblasobjs, @halfcblasobjs); } +if ($ARGV[13] == 1) { + @blasobjs = (@blasobjs, @blasobjss); + @cblasobjs = (@cblasobjs, @cblasobjss); + 
@lapackobjs = (@lapackobjs, @lapackobjss); + @lapack2objs = (@lapack2objs, @lapack2objss); + @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_s); + @lapackeobjs = (@lapackeobjs, @lapackeobjss); +} +if ($ARGV[14] == 1) { + @blasobjs = (@blasobjs, @blasobjsd); + @cblasobjs = (@cblasobjs, @cblasobjsd); + @lapackobjs = (@lapackobjs, @lapackobjsd); + @lapack2objs = (@lapack2objs, @lapack2objsd); + @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_d); + @lapackeobjs = (@lapackeobjs, @lapackeobjsd); +} +if ($ARGV[15] == 1) { + @blasobjs = (@blasobjs, @blasobjsc); + @cblasobjs = (@cblasobjs, @cblasobjsc); + @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsc); + @cblasgemm3mobjs = (@cblasgemm3mobjs, @sblasgemm3mobjsc); + @lapackobjs = (@lapackobjs, @lapackobjsc); + @lapack2objs = (@lapack2objs, @lapack2objsc, @lapac2objszc); + @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_c); + @lapackeobjs = (@lapackeobjs, @lapackeobjsc); +} +if ($ARGV[16] == 1) { + @blasobjs = (@blasobjs, @blasobjsz); + @cblasobjs = (@cblasobjs, @cblasobjsz); + @gemm3mobjs = (@gemm3mobjs, @gemm3mobjsz); + @cblasgemm3mobjs = (@cblasgemm3mobjs, @sblasgemm3mobjsz); + @lapackobjs = (@lapackobjs, @lapackobjsz); + @lapack2objs = (@lapack2objs, @lapack2objsz, @lapack2objszc); + @lapack_embeded_underscore_objs = (@lapack_embeded_underscore_objs, @lapack_embeded_underscore_objs_z); + @lapackeobjs = (@lapackeobjs, @lapackeobjsz); +} if ($ARGV[8] == 1) { #ONLY_CBLAS=1 @underscore_objs = (@misc_underscore_objs); From b8f95354c7edb67bfdeb317ef3e735a0b0e3c8ab Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:38:25 +0200 Subject: [PATCH 258/349] Adapt to having only a subset of variable types supported --- lapack/trtrs/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lapack/trtrs/Makefile b/lapack/trtrs/Makefile index 
a3b8f43227..8ba63c21ad 100644 --- a/lapack/trtrs/Makefile +++ b/lapack/trtrs/Makefile @@ -17,6 +17,19 @@ ZBLASOBJS += ztrtrs_UNU_parallel.$(SUFFIX) ztrtrs_UNN_parallel.$(SUFFIX) ztrtrs_ XBLASOBJS += xtrtrs_UNU_parallel.$(SUFFIX) xtrtrs_UNN_parallel.$(SUFFIX) xtrtrs_UTU_parallel.$(SUFFIX) xtrtrs_UTN_parallel.$(SUFFIX) xtrtrs_URU_parallel.$(SUFFIX) xtrtrs_URN_parallel.$(SUFFIX) xtrtrs_UCU_parallel.$(SUFFIX) xtrtrs_UCN_parallel.$(SUFFIX) xtrtrs_LNU_parallel.$(SUFFIX) xtrtrs_LNN_parallel.$(SUFFIX) xtrtrs_LTU_parallel.$(SUFFIX) xtrtrs_LTN_parallel.$(SUFFIX) xtrtrs_LRU_parallel.$(SUFFIX) xtrtrs_LRN_parallel.$(SUFFIX) xtrtrs_LCU_parallel.$(SUFFIX) xtrtrs_LCN_parallel.$(SUFFIX) endif +ifneq ($(BUILD_SINGLE),1) +SBLASOBJS= +endif +ifneq ($(BUILD_DOUBLE),1) +DBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX),1) +CBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX16),1) +ZBLASOBJS= +endif + strtrs_UNU_single.$(SUFFIX) : trtrs_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F) From dcd51d5c72e5f05e43e327e3c4d9d954d5f80b8f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:39:19 +0200 Subject: [PATCH 259/349] Adapt to having only a subset of variable types supported --- lapack/trtri/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lapack/trtri/Makefile b/lapack/trtri/Makefile index 626c47bbf2..72167ff566 100644 --- a/lapack/trtri/Makefile +++ b/lapack/trtri/Makefile @@ -23,6 +23,19 @@ ZBLASOBJS += ztrtri_UU_parallel.$(SUFFIX) ztrtri_UN_parallel.$(SUFFIX) ztrtri_LU XBLASOBJS += xtrtri_UU_parallel.$(SUFFIX) xtrtri_UN_parallel.$(SUFFIX) xtrtri_LU_parallel.$(SUFFIX) xtrtri_LN_parallel.$(SUFFIX) endif +ifneq ($(BUILD_SINGLE),1) +SBLASOBJS= +endif +ifneq ($(BUILD_DOUBLE),1) +DBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX),1) +CBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX16),1) +ZBLASOBJS= +endif + strtri_UU_single.$(SUFFIX) : trtri_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUNIT $< -o $(@F) From 
cf53970bcb34c17bd1f83c3b521e372f6e57f043 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:40:06 +0200 Subject: [PATCH 260/349] Adapt to having only a subset of variable types supported --- lapack/trti2/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lapack/trti2/Makefile b/lapack/trti2/Makefile index 45251fb1e7..005e80d733 100644 --- a/lapack/trti2/Makefile +++ b/lapack/trti2/Makefile @@ -1,11 +1,19 @@ TOPDIR = ../.. include ../../Makefile.system +ifeq ($(BUILD_SINGLE),1) SBLASOBJS = strti2_UU.$(SUFFIX) strti2_UN.$(SUFFIX) strti2_LU.$(SUFFIX) strti2_LN.$(SUFFIX) +endif +ifeq ($(BUILD_DOUBLE),1) DBLASOBJS = dtrti2_UU.$(SUFFIX) dtrti2_UN.$(SUFFIX) dtrti2_LU.$(SUFFIX) dtrti2_LN.$(SUFFIX) +endif QBLASOBJS = qtrti2_UU.$(SUFFIX) qtrti2_UN.$(SUFFIX) qtrti2_LU.$(SUFFIX) qtrti2_LN.$(SUFFIX) +ifeq ($(BUILD_COMPLEX),1) CBLASOBJS = ctrti2_UU.$(SUFFIX) ctrti2_UN.$(SUFFIX) ctrti2_LU.$(SUFFIX) ctrti2_LN.$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX16),1) ZBLASOBJS = ztrti2_UU.$(SUFFIX) ztrti2_UN.$(SUFFIX) ztrti2_LU.$(SUFFIX) ztrti2_LN.$(SUFFIX) +endif XBLASOBJS = xtrti2_UU.$(SUFFIX) xtrti2_UN.$(SUFFIX) xtrti2_LU.$(SUFFIX) xtrti2_LN.$(SUFFIX) strti2_UU.$(SUFFIX) : trti2_U.c From 9df12eb08fde4d2f5ee49da1a48b0bd15a1bdbd4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:40:51 +0200 Subject: [PATCH 261/349] Adapt to having only a subset of variable types supported --- lapack/potrf/Makefile | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/lapack/potrf/Makefile b/lapack/potrf/Makefile index 21efa55403..feefd04834 100644 --- a/lapack/potrf/Makefile +++ b/lapack/potrf/Makefile @@ -17,6 +17,20 @@ ZBLASOBJS += zpotrf_U_parallel.$(SUFFIX) zpotrf_L_parallel.$(SUFFIX) XBLASOBJS += xpotrf_U_parallel.$(SUFFIX) xpotrf_L_parallel.$(SUFFIX) endif +ifeq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" +SBLASOBJS= +endif +ifneq ($(BUILD_DOUBLE),1) +DBLASOBJS= +endif +ifeq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +CBLASOBJS= 
+endif +ifneq ($(BUILD_COMPLEX16),1) +ZBLASOBJS= +endif + + spotrf_U_single.$(SUFFIX) : potrf_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) From e5966f860671381963acc2f7cfa95a3b1e24510e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:41:43 +0200 Subject: [PATCH 262/349] Adapt to having only a subset of variable types supported --- lapack/potf2/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lapack/potf2/Makefile b/lapack/potf2/Makefile index 5946ad9c8a..f48570064b 100644 --- a/lapack/potf2/Makefile +++ b/lapack/potf2/Makefile @@ -8,6 +8,19 @@ CBLASOBJS = cpotf2_U.$(SUFFIX) cpotf2_L.$(SUFFIX) ZBLASOBJS = zpotf2_U.$(SUFFIX) zpotf2_L.$(SUFFIX) XBLASOBJS = xpotf2_U.$(SUFFIX) xpotf2_L.$(SUFFIX) +ifeq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" + SBLASOBJS= +endif +ifneq ($(BUILD_DOUBLE),1) + DBLASOBJS= +endif +ifeq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" + CBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX16),1) + ZBLASOBJS= +endif + spotf2_U.$(SUFFIX) : potf2_U.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) From bc319cee826cf2cc7d750bc83895aef9504d18db Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:42:26 +0200 Subject: [PATCH 263/349] Adapt to having only a subset of variable types supported --- lapack/lauum/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lapack/lauum/Makefile b/lapack/lauum/Makefile index f163479ef9..c57f179378 100644 --- a/lapack/lauum/Makefile +++ b/lapack/lauum/Makefile @@ -17,6 +17,19 @@ ZBLASOBJS += zlauum_U_parallel.$(SUFFIX) zlauum_L_parallel.$(SUFFIX) XBLASOBJS += xlauum_U_parallel.$(SUFFIX) xlauum_L_parallel.$(SUFFIX) endif +ifneq ($(BUILD_SINGLE),1) +SBLASOBJS= +endif +ifneq ($(BUILD_DOUBLE),1) +DBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX),1) +CBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX16),1) +ZBLASOBJS= +endif + slauum_U_single.$(SUFFIX) : lauum_U_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) From 
b2620580594d2d6d0b06ea814dca34b37f79f84d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:43:13 +0200 Subject: [PATCH 264/349] Adapt to having only a subset of variable types supported --- lapack/lauu2/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lapack/lauu2/Makefile b/lapack/lauu2/Makefile index dc6a640b4d..60d2db4dbc 100644 --- a/lapack/lauu2/Makefile +++ b/lapack/lauu2/Makefile @@ -1,11 +1,19 @@ TOPDIR = ../.. include ../../Makefile.system +ifeq ($(BUILD_SINGLE),1) SBLASOBJS = slauu2_U.$(SUFFIX) slauu2_L.$(SUFFIX) +endif +ifeq ($(BUILD_DOUBLE),1) DBLASOBJS = dlauu2_U.$(SUFFIX) dlauu2_L.$(SUFFIX) +endif QBLASOBJS = qlauu2_U.$(SUFFIX) qlauu2_L.$(SUFFIX) +ifeq ($(BUILD_COMPLEX),1) CBLASOBJS = clauu2_U.$(SUFFIX) clauu2_L.$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX16),1) ZBLASOBJS = zlauu2_U.$(SUFFIX) zlauu2_L.$(SUFFIX) +endif XBLASOBJS = xlauu2_U.$(SUFFIX) xlauu2_L.$(SUFFIX) slauu2_U.$(SUFFIX) : lauu2_U.c From 5c657fffad5bc4246964b6c4204685b5cd036d32 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:44:13 +0200 Subject: [PATCH 265/349] Adapt to having only a subset of variable types supported --- lapack/laswp/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lapack/laswp/Makefile b/lapack/laswp/Makefile index 389800692d..2028d994e3 100644 --- a/lapack/laswp/Makefile +++ b/lapack/laswp/Makefile @@ -1,11 +1,19 @@ TOPDIR = ../.. 
include ../../Makefile.system +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" SBLASOBJS = slaswp_plus.$(SUFFIX) slaswp_minus.$(SUFFIX) +endif +ifeq ($(BUILD_DOUBLE),1) DBLASOBJS = dlaswp_plus.$(SUFFIX) dlaswp_minus.$(SUFFIX) +endif QBLASOBJS = qlaswp_plus.$(SUFFIX) qlaswp_minus.$(SUFFIX) +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" CBLASOBJS = claswp_plus.$(SUFFIX) claswp_minus.$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX16),1) ZBLASOBJS = zlaswp_plus.$(SUFFIX) zlaswp_minus.$(SUFFIX) +endif XBLASOBJS = xlaswp_plus.$(SUFFIX) xlaswp_minus.$(SUFFIX) slaswp_plus.$(SUFFIX) slaswp_minus.$(SUFFIX) dlaswp_plus.$(SUFFIX) dlaswp_minus.$(SUFFIX) \ From 20cf1d773f34a54946994b2b219545049f9b9fb0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:44:56 +0200 Subject: [PATCH 266/349] Adapt to having only a subset of variable types supported --- lapack/getrs/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lapack/getrs/Makefile b/lapack/getrs/Makefile index 2640ef0975..f325693673 100644 --- a/lapack/getrs/Makefile +++ b/lapack/getrs/Makefile @@ -17,6 +17,19 @@ ZBLASOBJS += zgetrs_N_parallel.$(SUFFIX) zgetrs_T_parallel.$(SUFFIX) zgetrs_R_pa XBLASOBJS += xgetrs_N_parallel.$(SUFFIX) xgetrs_T_parallel.$(SUFFIX) xgetrs_R_parallel.$(SUFFIX) xgetrs_C_parallel.$(SUFFIX) endif +ifeq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" +SBLASOBJS= +endif +ifneq ($(BUILD_DOUBLE),1) +DBLASOBJS= +endif +ifeq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +CBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX16),1) +ZBLASOBJS= +endif + sgetrs_N_single.$(SUFFIX) : getrs_single.c $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANS $< -o $(@F) From 93454022a9c3580fedbe06204234542448a62081 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:45:40 +0200 Subject: [PATCH 267/349] Adapt to having only a subset of variable types supported --- lapack/getrf/Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lapack/getrf/Makefile 
b/lapack/getrf/Makefile index a559dfb0d6..976ca3c0bb 100644 --- a/lapack/getrf/Makefile +++ b/lapack/getrf/Makefile @@ -17,6 +17,19 @@ ZBLASOBJS += zgetrf_parallel.$(SUFFIX) XBLASOBJS += xgetrf_parallel.$(SUFFIX) endif +ifeq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" +SBLASOBJS= +endif +ifneq ($(BUILD_DOUBLE),1) +DBLASOBJS= +endif +ifeq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +CBLASOBJS= +endif +ifneq ($(BUILD_COMPLEX16),1) +ZBLASOBJS= +endif + ifeq ($(USE_OPENMP), 1) GETRF_SRC = getrf_parallel_omp.c else From b27ca78a2105d676b446bf49231faf76455f8dfc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:46:24 +0200 Subject: [PATCH 268/349] Adapt to having only a subset of variable types supported --- lapack/getf2/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lapack/getf2/Makefile b/lapack/getf2/Makefile index 612c6f9ccd..a524a32350 100644 --- a/lapack/getf2/Makefile +++ b/lapack/getf2/Makefile @@ -1,11 +1,19 @@ TOPDIR = ../.. include ../../Makefile.system +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" SBLASOBJS = sgetf2_k.$(SUFFIX) +endif +ifeq ($(BUILD_DOUBLE),1) DBLASOBJS = dgetf2_k.$(SUFFIX) +endif QBLASOBJS = qgetf2_k.$(SUFFIX) +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" CBLASOBJS = cgetf2_k.$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX16),1) ZBLASOBJS = zgetf2_k.$(SUFFIX) +endif XBLASOBJS = xgetf2_k.$(SUFFIX) sgetf2_k.$(SUFFIX) : getf2_k.c From efe1ad4700bb55a06d9fc8e8291934a51c55c501 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:48:23 +0200 Subject: [PATCH 269/349] Add Makefile support for enabling only some variable types --- lapack-netlib/TESTING/MATGEN/Makefile | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index 87432fd04f..e21ebd6c3c 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ b/lapack-netlib/TESTING/MATGEN/Makefile @@ -33,25 +33,37 @@ TOPSRCDIR = ../.. 
include $(TOPSRCDIR)/make.inc +ifneq "$(or $(BUILD_SINGLE),$(BUILD_COMPLEX))" "" SCATGEN = slatm1.o slatm7.o slaran.o slarnd.o +endif +ifeq ($(BUILD_SINGLE),1) SMATGEN = slatms.o slatme.o slatmr.o slatmt.o \ slagge.o slagsy.o slakf2.o slarge.o slaror.o slarot.o slatm2.o \ slatm3.o slatm5.o slatm6.o slahilb.o +endif +ifeq ($(BUILD_COMPLEX),1) CMATGEN = clatms.o clatme.o clatmr.o clatmt.o \ clagge.o claghe.o clagsy.o clakf2.o clarge.o claror.o clarot.o \ clatm1.o clarnd.o clatm2.o clatm3.o clatm5.o clatm6.o clahilb.o +endif +ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" DZATGEN = dlatm1.o dlatm7.o dlaran.o dlarnd.o +endif +ifeq ($(BUILD_DOUBLE),1) DMATGEN = dlatms.o dlatme.o dlatmr.o dlatmt.o \ dlagge.o dlagsy.o dlakf2.o dlarge.o dlaror.o dlarot.o dlatm2.o \ dlatm3.o dlatm5.o dlatm6.o dlahilb.o +endif +ifeq ($(BUILD_COMPLEX16),1) ZMATGEN = zlatms.o zlatme.o zlatmr.o zlatmt.o \ zlagge.o zlaghe.o zlagsy.o zlakf2.o zlarge.o zlaror.o zlarot.o \ zlatm1.o zlarnd.o zlatm2.o zlatm3.o zlatm5.o zlatm6.o zlahilb.o +endif .PHONY: all all: $(TMGLIB) @@ -97,5 +109,9 @@ cleanobj: cleanlib: rm -f $(TMGLIB) +ifeq ($(filter $(BUILD_SINGLE) $(BUILD_COMPLEX),1),) slaran.o: slaran.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $< +endif +ifeq ($(filter $(BUILD_DOUBLE) $(BUILD_COMPLEX16),1),) dlaran.o: dlaran.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $< +endif From ef552bc578274d257985e5ce76b3999920540daa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:49:06 +0200 Subject: [PATCH 270/349] Add Makefile support for enabling only some variable types --- lapack-netlib/SRC/Makefile | 62 ++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 16 deletions(-) diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 9f79e20e9d..83baac8753 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -66,7 +66,9 @@ ALLAUX_O = ilaenv.o ilaenv2stage.o ieeeck.o lsamen.o xerbla.o xerbla_array.o \ ilaprec.o ilatrans.o ilauplo.o iladiag.o 
chla_transtype.o \ ../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o +ifneq "$(or $(BUILD_SINGLE),$(BUILD_COMPLEX))" "" SCLAUX = \ + sbdsvdx.o sstevx.o sstein.o \ sbdsdc.o \ sbdsqr.o sdisna.o slabad.o slacpy.o sladiv.o slae2.o slaebz.o \ slaed0.o slaed1.o slaed2.o slaed3.o slaed4.o slaed5.o slaed6.o \ @@ -81,10 +83,14 @@ SCLAUX = \ slaset.o slasq1.o slasq2.o slasq3.o slasq4.o slasq5.o slasq6.o \ slasr.o slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \ ssteqr.o ssterf.o slaisnan.o sisnan.o \ - slartgp.o slartgs.o \ + slartgp.o slartgs.o scombssq.o \ ../INSTALL/second_$(TIMER).o +endif +ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" DZLAUX = \ + dcombssq.o \ + dbdsvdx.o dstevx.o dstein.o \ dbdsdc.o \ dbdsqr.o ddisna.o dlabad.o dlacpy.o dladiv.o dlae2.o dlaebz.o \ dlaed0.o dlaed1.o dlaed2.o dlaed3.o dlaed4.o dlaed5.o dlaed6.o \ @@ -101,9 +107,12 @@ DZLAUX = \ dsteqr.o dsterf.o dlaisnan.o disnan.o \ dlartgp.o dlartgs.o \ ../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o +endif +#ifeq ($(BUILD_SINGLE),1) +ifdef BUILD_SINGLE SLASRC_O = \ - sbdsvdx.o spotrf2.o sgetrf2.o \ + spotrf2.o sgetrf2.o \ sgbbrd.o sgbcon.o sgbequ.o sgbrfs.o sgbsv.o \ sgbsvx.o sgbtf2.o sgbtrf.o sgbtrs.o sgebak.o sgebal.o sgebd2.o \ sgebrd.o sgecon.o sgeequ.o sgees.o sgeesx.o sgeev.o sgeevx.o \ @@ -145,8 +154,7 @@ SLASRC_O = \ ssbev.o ssbevd.o ssbevx.o ssbgst.o ssbgv.o ssbgvd.o ssbgvx.o \ ssbtrd.o sspcon.o sspev.o sspevd.o sspevx.o sspgst.o \ sspgv.o sspgvd.o sspgvx.o ssprfs.o sspsv.o sspsvx.o ssptrd.o \ - ssptrf.o ssptri.o ssptrs.o sstegr.o sstein.o sstev.o sstevd.o sstevr.o \ - sstevx.o \ + ssptrf.o ssptri.o ssptrs.o sstegr.o sstev.o sstevd.o sstevr.o \ ssycon.o ssyev.o ssyevd.o ssyevr.o ssyevx.o ssygs2.o \ ssygst.o ssygv.o ssygvd.o ssygvx.o ssyrfs.o ssysv.o ssysvx.o \ ssytd2.o ssytf2.o ssytrd.o ssytrf.o ssytri.o ssytri2.o ssytri2x.o \ @@ -180,9 +188,13 @@ SLASRC_O = \ ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \ ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o 
ssyevr_2stage.o \ ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \ - sgesvdq.o scombssq.o + sgesvdq.o + +endif +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" DSLASRC_O = spotrs.o sgetrs.o spotrf.o sgetrf.o +endif ifdef USEXBLAS SXLASRC = sgesvxx.o sgerfsx.o sla_gerfsx_extended.o sla_geamv.o \ @@ -194,6 +206,7 @@ SXLASRC = sgesvxx.o sgerfsx.o sla_gerfsx_extended.o sla_geamv.o \ slascl2.o sla_wwaddw.o endif +ifeq ($(BUILD_COMPLEX),1) CLASRC_O = \ cpotrf2.o cgetrf2.o \ cbdsqr.o cgbbrd.o cgbcon.o cgbequ.o cgbrfs.o cgbsv.o cgbsvx.o \ @@ -284,6 +297,7 @@ CLASRC_O = \ cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \ chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \ cgesvdq.o +endif ifdef USEXBLAS CXLASRC = cgesvxx.o cgerfsx.o cla_gerfsx_extended.o cla_geamv.o \ @@ -299,11 +313,13 @@ CXLASRC = cgesvxx.o cgerfsx.o cla_gerfsx_extended.o cla_geamv.o \ cla_lin_berr.o clarscl2.o clascl2.o cla_wwaddw.o endif -ZCLASRC_O = cpotrs.o cgetrs.o cpotrf.o cgetrf.o +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +ZCLASRC_O = cpotrs.o cgetrs.o cpotrf.o cgetrf.o clag2z.o +endif +ifeq ($(BUILD_DOUBLE),1) DLASRC_O = \ dpotrf2.o dgetrf2.o \ - dbdsvdx.o \ dgbbrd.o dgbcon.o dgbequ.o dgbrfs.o dgbsv.o \ dgbsvx.o dgbtf2.o dgbtrf.o dgbtrs.o dgebak.o dgebal.o dgebd2.o \ dgebrd.o dgecon.o dgeequ.o dgees.o dgeesx.o dgeev.o dgeevx.o \ @@ -345,8 +361,7 @@ DLASRC_O = \ dsbev.o dsbevd.o dsbevx.o dsbgst.o dsbgv.o dsbgvd.o dsbgvx.o \ dsbtrd.o dspcon.o dspev.o dspevd.o dspevx.o dspgst.o \ dspgv.o dspgvd.o dspgvx.o dsprfs.o dspsv.o dspsvx.o dsptrd.o \ - dsptrf.o dsptri.o dsptrs.o dstegr.o dstein.o dstev.o dstevd.o dstevr.o \ - dstevx.o \ + dsptrf.o dsptri.o dsptrs.o dstegr.o dstev.o dstevd.o dstevr.o \ dsycon.o dsyev.o dsyevd.o dsyevr.o \ dsyevx.o dsygs2.o dsygst.o dsygv.o dsygvd.o dsygvx.o dsyrfs.o \ dsysv.o dsysvx.o \ @@ -381,7 +396,8 @@ DLASRC_O = \ dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \ dsyevd_2stage.o dsyev_2stage.o 
dsyevx_2stage.o dsyevr_2stage.o \ dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o \ - dgesvdq.o dcombssq.o + dgesvdq.o +endif ifdef USEXBLAS DXLASRC = dgesvxx.o dgerfsx.o dla_gerfsx_extended.o dla_geamv.o \ @@ -393,6 +409,7 @@ DXLASRC = dgesvxx.o dgerfsx.o dla_gerfsx_extended.o dla_geamv.o \ dlascl2.o dla_wwaddw.o endif +ifeq ($(BUILD_COMPLEX16),1) ZLASRC_O = \ zpotrf2.o zgetrf2.o \ zbdsqr.o zgbbrd.o zgbcon.o zgbequ.o zgbrfs.o zgbsv.o zgbsvx.o \ @@ -471,7 +488,7 @@ ZLASRC_O = \ zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ zunmtr.o zupgtr.o \ zupmtr.o izmax1.o dzsum1.o zstemr.o \ - zcgesv.o zcposv.o zlag2c.o clag2z.o zlat2c.o \ + zcgesv.o zcposv.o zlag2c.o zlat2c.o \ zhfrk.o ztfttp.o zlanhf.o zpftrf.o zpftri.o zpftrs.o ztfsm.o ztftri.o \ ztfttr.o ztpttf.o ztpttr.o ztrttf.o ztrttp.o \ zgeequb.o zgbequb.o zsyequb.o zpoequb.o zheequb.o \ @@ -488,6 +505,7 @@ ZLASRC_O = \ zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \ zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o \ zgesvdq.o +endif ifdef USEXBLAS ZXLASRC = zgesvxx.o zgerfsx.o zla_gerfsx_extended.o zla_geamv.o \ @@ -501,18 +519,30 @@ ZXLASRC = zgesvxx.o zgerfsx.o zla_gerfsx_extended.o zla_geamv.o \ zla_lin_berr.o zlarscl2.o zlascl2.o zla_wwaddw.o endif -DEPRECSRC = DEPRECATED/cgegs.o DEPRECATED/cgegv.o DEPRECATED/cgelsx.o \ +ifeq ($(BUILD_COMPLEX),1) +CDEPRECSRC = DEPRECATED/cgegs.o DEPRECATED/cgegv.o DEPRECATED/cgelsx.o \ DEPRECATED/cgeqpf.o DEPRECATED/cggsvd.o DEPRECATED/cggsvp.o \ - DEPRECATED/clahrd.o DEPRECATED/clatzm.o DEPRECATED/ctzrqf.o \ + DEPRECATED/clahrd.o DEPRECATED/clatzm.o DEPRECATED/ctzrqf.o +endif + +ifeq ($(BUILD_DOUBLE),1) +DDEPRECSRC = \ DEPRECATED/dgegs.o DEPRECATED/dgegv.o DEPRECATED/dgelsx.o \ DEPRECATED/dgeqpf.o DEPRECATED/dggsvd.o DEPRECATED/dggsvp.o \ - DEPRECATED/dlahrd.o DEPRECATED/dlatzm.o DEPRECATED/dtzrqf.o \ + DEPRECATED/dlahrd.o DEPRECATED/dlatzm.o DEPRECATED/dtzrqf.o +endif +ifeq ($(BUILD_SINGLE),1) +SDEPRECSRC = \ 
DEPRECATED/sgegs.o DEPRECATED/sgegv.o DEPRECATED/sgelsx.o \ DEPRECATED/sgeqpf.o DEPRECATED/sggsvd.o DEPRECATED/sggsvp.o \ - DEPRECATED/slahrd.o DEPRECATED/slatzm.o DEPRECATED/stzrqf.o \ + DEPRECATED/slahrd.o DEPRECATED/slatzm.o DEPRECATED/stzrqf.o +endif +ifeq ($(BUILD_COMPLEX16),1) +ZDEPRECSRC = \ DEPRECATED/zgegs.o DEPRECATED/zgegv.o DEPRECATED/zgelsx.o \ DEPRECATED/zgeqpf.o DEPRECATED/zggsvd.o DEPRECATED/zggsvp.o \ DEPRECATED/zlahrd.o DEPRECATED/zlatzm.o DEPRECATED/ztzrqf.o +endif # filter out optimized codes from OpenBLAS ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o @@ -560,7 +590,7 @@ ALLXOBJ = $(SXLASRC) $(DXLASRC) $(CXLASRC) $(ZXLASRC) endif ifdef BUILD_DEPRECATED -DEPRECATED = $(DEPRECSRC) +DEPRECATED = $(SDEPRECSRC) $(DDEPRECSRC) $(CDEPRECSRC) $(ZDEPRECSRC) endif .PHONY: all From a6570108c570848f0f036b296ad0c35e826a7bc2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:49:58 +0200 Subject: [PATCH 271/349] Add Makefile support for enabling only some variable types --- lapack-netlib/LAPACKE/src/Makefile | 99 ++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 31 deletions(-) diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 8060151ae8..a602dd7a06 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -46,6 +46,7 @@ OBJ = \ lapacke_ilaver.o \ lapacke_nancheck.o +ifeq ($(BUILD_COMPLEX),1) OBJ_C = \ lapacke_cbbcsd.o \ lapacke_cbbcsd_work.o \ @@ -653,7 +654,9 @@ lapacke_cupgtr.o \ lapacke_cupgtr_work.o \ lapacke_cupmtr.o \ lapacke_cupmtr_work.o +endif +ifeq ($(BUILD_DOUBLE),1) OBJ_D = \ lapacke_dbbcsd.o \ lapacke_dbbcsd_work.o \ @@ -1218,8 +1221,12 @@ lapacke_dtrttf_work.o \ lapacke_dtrttp.o \ lapacke_dtrttp_work.o \ lapacke_dtzrzf.o \ -lapacke_dtzrzf_work.o +lapacke_dtzrzf_work.o \ +lapacke_slag2d.o \ +lapacke_slag2d_work.o +endif +ifeq ($(BUILD_SINGLE),1) OBJ_S = \ lapacke_sbbcsd.o \ lapacke_sbbcsd_work.o \ @@ -1395,8 +1402,6 @@ lapacke_slacn2.o \ 
lapacke_slacn2_work.o \ lapacke_slacpy.o \ lapacke_slacpy_work.o \ -lapacke_slag2d.o \ -lapacke_slag2d_work.o \ lapacke_slamch.o \ lapacke_slamch_work.o \ lapacke_slange.o \ @@ -1781,7 +1786,9 @@ lapacke_strttp.o \ lapacke_strttp_work.o \ lapacke_stzrzf.o \ lapacke_stzrzf_work.o +endif +ifeq ($(BUILD_COMPLEX16),1) OBJ_Z = \ lapacke_zbbcsd.o \ lapacke_zbbcsd_work.o \ @@ -2393,35 +2400,52 @@ lapacke_zupgtr.o \ lapacke_zupgtr_work.o \ lapacke_zupmtr.o \ lapacke_zupmtr_work.o +endif ifdef BUILD_DEPRECATED -DEPRECATED = \ +ifeq ($(BUILD_COMPLEX),1) +DEPRECATEDC = \ lapacke_cggsvp.o \ lapacke_cggsvp_work.o \ -lapacke_dggsvp.o \ -lapacke_dggsvp_work.o \ -lapacke_sggsvp.o \ -lapacke_sggsvp_work.o \ -lapacke_zggsvp.o \ -lapacke_zggsvp_work.o \ lapacke_cggsvd.o \ lapacke_cggsvd_work.o \ +lapacke_cgeqpf.o \ +lapacke_cgeqpf_work.o +endif + +ifeq ($(BUILD_DOUBLE),1) +DEPRECATEDD = \ +lapacke_dggsvp.o \ +lapacke_dggsvp_work.o \ lapacke_dggsvd.o \ lapacke_dggsvd_work.o \ +lapacke_dgeqpf.o \ +lapacke_dgeqpf_work.o +endif + +ifeq ($(BUILD_SINGLE),1) +DEPRECATEDS = \ +lapacke_sggsvp.o \ +lapacke_sggsvp_work.o \ lapacke_sggsvd.o \ lapacke_sggsvd_work.o \ +lapacke_sgeqpf.o \ +lapacke_sgeqpf_work.o +endif + +ifeq ($(BUILD_COMPLEX16),1) +DEPRECATEDZ = \ +lapacke_zggsvp.o \ +lapacke_zggsvp_work.o \ lapacke_zggsvd.o \ lapacke_zggsvd_work.o \ -lapacke_cgeqpf.o \ -lapacke_cgeqpf_work.o \ -lapacke_dgeqpf.o \ -lapacke_dgeqpf_work.o \ -lapacke_sgeqpf.o \ -lapacke_sgeqpf_work.o \ lapacke_zgeqpf.o \ lapacke_zgeqpf_work.o endif +DEPRECATED = $(DEPRECATEDS) $(DEPRECATEDD) $(DEPRECATEDC) $(DEPRECATEDZ) +endif + ifdef USEXBLAS EXTENDED = \ lapacke_cgbrfsx.o lapacke_cporfsx.o lapacke_dgerfsx.o lapacke_sgbrfsx.o lapacke_ssyrfsx.o lapacke_zherfsx.o \ @@ -2440,37 +2464,50 @@ endif ifdef LAPACKE_WITH_TMG # FILE PARTS OF TMGLIB -MATGEN = \ +ifeq ($(BUILD_COMPLEX),1) +MATGENC = \ lapacke_clatms.o \ lapacke_clatms_work.o \ -lapacke_dlatms.o \ -lapacke_dlatms_work.o \ -lapacke_slatms.o \ 
-lapacke_slatms_work.o \ -lapacke_zlatms.o \ -lapacke_zlatms_work.o \ lapacke_clagge.o \ lapacke_clagge_work.o \ +lapacke_claghe.o \ +lapacke_claghe_work.o \ +lapacke_clagsy.o \ +lapacke_clagsy_work.o +endif +ifeq ($(BUILD_DOUBLE),1) +MATGEND = \ +lapacke_dlatms.o \ +lapacke_dlatms_work.o \ lapacke_dlagge.o \ lapacke_dlagge_work.o \ +lapacke_dlagsy.o \ +lapacke_dlagsy_work.o +endif +ifeq ($(BUILD_SINGLE),1) +MATGENS = \ +lapacke_slatms.o \ +lapacke_slatms_work.o \ lapacke_slagge.o \ lapacke_slagge_work.o \ +lapacke_slagsy.o \ +lapacke_slagsy_work.o +endif +ifeq ($(BUILD_COMPLEX16),1) +MATGENZ = \ +lapacke_zlatms.o \ +lapacke_zlatms_work.o \ lapacke_zlagge.o \ lapacke_zlagge_work.o \ -lapacke_claghe.o \ -lapacke_claghe_work.o \ lapacke_zlaghe.o \ lapacke_zlaghe_work.o \ -lapacke_clagsy.o \ -lapacke_clagsy_work.o \ -lapacke_dlagsy.o \ -lapacke_dlagsy_work.o \ -lapacke_slagsy.o \ -lapacke_slagsy_work.o \ lapacke_zlagsy.o \ lapacke_zlagsy_work.o endif +MATGEN = $(MATGENS) $(MATGEND) $(MATGENC) $(MATGENZ) +endif + .PHONY: all all: $(LAPACKELIB) From 6b6adf8a4a563f4afbcab2a9d39b5eaa55da13b1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:52:09 +0200 Subject: [PATCH 272/349] Allow compiling only a subset of kernels for specific variable types --- kernel/CMakeLists.txt | 215 +++++++++++++++++++++++-- kernel/Makefile.L2 | 70 +++++++- kernel/Makefile.L3 | 84 ++++++++-- kernel/setparam-ref.c | 367 ++++++++++++++++++++++-------------------- 4 files changed, 527 insertions(+), 209 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index c81f2bf255..988b83338a 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -91,7 +91,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE") - if (DEFINED BUILD_COMPLEX AND NOT 
DEFINED BUILD_SINGLE) + if ((BUILD_COMPLEX OR BUILD_DOUBLE) AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SASUMKERNEL}" "" "asum_k" false "" "" false "SINGLE") @@ -110,14 +110,14 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${ISAMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${ISAMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SSCALKERNEL}" "" "scal_k" false "" "" false "SINGLE") - GenerateNamedObjects("${KERNELDIR}/${SCOPYKERNEL}" "" "copy_k" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SSWAPKERNEL}" "" "swap_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SAXPYKERNEL}" "" "axpy_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE") endif () - if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DASUMKERNEL}" "" "asum_k" false "" "" false "DOUBLE") @@ -177,11 +177,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMVTKERNEL}" "TRANS" "gemv_t" false "" 
"" false ${float_type}) endif () endforeach () - if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("${KERNELDIR}/${DGEMVNKERNEL}" "" "gemv_n" false "" "" false "DOUBLE") GenerateNamedObjects("${KERNELDIR}/${DGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "DOUBLE") endif () - if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) + if (BUILD_COMPLEX AND NOT BUILD_SINGLE) GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") endif () @@ -219,7 +219,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) endforeach() - if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "" "gemm_kernel" false "" "" false "DOUBLE") if (DGEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${DGEMMINCOPY}" "DOUBLE" "${DGEMMINCOPYOBJ}" false "" "" true "DOUBLE") @@ -235,19 +235,19 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) endif () GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}" "" "gemm_beta" false "" "" false "DOUBLE") endif () - if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) - GenerateNamedObjects("${KERNELDIR}/${SGEMMKERNEL}" "" "gemm_kernel" false "" "" false "DOUBLE") + if ((BUILD_DOUBLE OR BUILD_COMPLEX) AND NOT BUILD_SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMMKERNEL}" "" "gemm_kernel" false "" "" false "SINGLE") if (SGEMMINCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "DOUBLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") endif () if (SGEMMITCOPY) - 
GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "DOUBLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") endif () if (SGEMMONCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "DOUBLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") endif () if (SGEMMOTCOPY) - GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "DOUBLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") endif () GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE") endif () @@ -591,7 +591,31 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) #geadd GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) endforeach () - + if (BUILD_DOUBLE AND NOT BUILD_SINGLE) + GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" 
false "" "" false "SINGLE") + + GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false "SINGLE") + + GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false "SINGLE") + + GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false "SINGLE") + GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false "SINGLE") + endif () # Makefile.LA if(NOT NO_LAPACK) @@ -618,6 +642,28 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}NEG_TCOPY}_${${float_char}GEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}LASWP_NCOPY}_${${float_char}GEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false ${float_type}) endforeach() + if (BUILD_COMPLEX AND NOT BUILD_SINGLE) + if (NOT DEFINED SNEG_TCOPY) + set(SNEG_TCOPY 
../generic/neg_tcopy.c) + endif () + + if (NOT DEFINED SLASWP_NCOPY) + set(SLASWP_NCOPY ../generic/laswp_ncopy.c) + endif () + GenerateNamedObjects("${KERNELDIR}/${SNEG_TCOPY}_${SGEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SLASWP_NCOPY}_${SGEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false "SINGLE") + endif() + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) + if (NOT DEFINED DNEG_TCOPY) + set(DNEG_TCOPY ../generic/neg_tcopy.c) + endif () + + if (NOT DEFINED DLASWP_NCOPY) + set(DLASWP_NCOPY ../generic/laswp_ncopy.c) + endif () + GenerateNamedObjects("${KERNELDIR}/${DNEG_TCOPY}_${DGEMM_UNROLL_M}" "" "neg_tcopy" false "" "" false "DOUBLE") + GenerateNamedObjects("${KERNELDIR}/${DLASWP_NCOPY}_${DGEMM_UNROLL_N}" "" "laswp_ncopy" false "" "" false "DOUBLE") + endif() endif() if (${DYNAMIC_ARCH}) @@ -649,8 +695,147 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false ${float_type}) endforeach () - + if (BUILD_COMPLEX AND NOT BUILD_SINGLE) + GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") + GenerateNamedObjects("generic/neg_tcopy_${SGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/laswp_ncopy_${SGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "SINGLE") + endif () + if (BUILD_DOUBLE AND NOT BUILD_SINGLE) + GenerateNamedObjects("generic/neg_tcopy_${SGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/laswp_ncopy_${SGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" ${TSUFFIX} false "SINGLE") + 
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_uncopy_${SGEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" ${TSUFFIX} false "SINGLE") + + GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_lncopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" ${TSUFFIX} false "SINGLE") + + GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_utcopy_${SGEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" ${TSUFFIX} false "SINGLE") + + GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" 
false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" ${TSUFFIX} false "SINGLE") + GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" ${TSUFFIX} false "SINGLE") + + if (SGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") + endif () + if (SGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") + endif () + GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") + GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") + endif () + + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) + GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE") + GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE") + endif () + if (BUILD_COMPLEX16 AND NOT BUILD_COMPLEX) + GenerateNamedObjects("${KERNELDIR}/${CAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "COMPLEX") + if (DEFINED CMAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${CMAXKERNEL}" "" "max_k" false "" "" false "COMPLEX") + endif () + if (DEFINED CMINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${CMINKERNEL}" "USE_MIN" "min_k" false "" "" false "COMPLEX") + endif () 
+ GenerateNamedObjects("${KERNELDIR}/${ICAMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${ICAMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false "COMPLEX") + if (DEFINED ICMAXKERNEL) + GenerateNamedObjects("${KERNELDIR}/${ICMAXKERNEL}" "" "i*max_k" false "" "" false "COMPLEX") + endif () + if (DEFINED ICMINKERNEL) + GenerateNamedObjects("${KERNELDIR}/${ICMINKERNEL}" "USE_MIN" "i*min_k" false "" "" false "COMPLEX") + endif () + GenerateNamedObjects("${KERNELDIR}/${CASUMKERNEL}" "" "asum_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CAXPYKERNEL}" "" "axpy_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CNRM2KERNEL}" "" "nrm2_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CROTKERNEL}" "" "rot_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CSCALKERNEL}" "" "scal_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CSWAPKERNEL}" "" "swap_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CAXPBYKERNEL}" "" "axpby_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CSUMKERNEL}" "" "sum_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CAXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CDOTKERNEL}" "" "dotu_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CDOTKERNEL}" "CONJ" "dotc_k" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMVNKERNEL}" "" "gemv_n" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMVTKERNEL}" "TRANSA" "gemv_t" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMVNKERNEL}" "CONJ" "gemv_r" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMVTKERNEL}" "CONJ;TRANSA" 
"gemv_c" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMVNKERNEL}" "XCONJ" "gemv_o" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMVTKERNEL}" "XCONJ;TRANSA" "gemv_u" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMVNKERNEL}" "XCONJ;CONJ" "gemv_s" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMVTKERNEL}" "XCONJ;CONJ;TRANSA" "gemv_d" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL;CONJ" "trsm_kernel_LR" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_LT}" "LT;TRSMKERNEL;CONJ" "trsm_kernel_LC" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL;CONJ" "trsm_kernel_RR" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_RT}" "RT;TRSMKERNEL;CONJ" "trsm_kernel_RC" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CTRSMKERNEL_RT}" "RT;TRSMKERNEL" "trsm_kernel_RT" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMMKERNEL}" "NN" "gemm_kernel_n" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMMKERNEL}" "CN" "gemm_kernel_l" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMMKERNEL}" "NC" "gemm_kernel_r" false "" "" false "COMPLEX") + GenerateNamedObjects("${KERNELDIR}/${CGEMMKERNEL}" "CC" "gemm_kernel_b" false "" "" false "COMPLEX") + if (CGEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${CGEMMINCOPY}" "COMPLEX" "${CGEMMINCOPYOBJ}" false "" "" true "COMPLEX") + endif 
() + + if (CGEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${CGEMMITCOPY}" "COMPLEX" "${CGEMMITCOPYOBJ}" false "" "" true "COMPLEX") + endif () + + if (CGEMMONCOPY) + GenerateNamedObjects("${KERNELDIR}/${CGEMMONCOPY}" "COMPLEX" "${CGEMMONCOPYOBJ}" false "" "" true "COMPLEX") + endif () + + if (CGEMMOTCOPY) + GenerateNamedObjects("${KERNELDIR}/${CGEMMOTCOPY}" "COMPLEX" "${CGEMMOTCOPYOBJ}" false "" "" true "COMPLEX") + endif () + GenerateNamedObjects("${KERNELDIR}/${CGEMM_BETA}" "" "gemm_beta" false "" "" false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" ${TSUFFIX} false "COMPLEX") + + GenerateNamedObjects("generic/ztrsm_lncopy_${CGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_lncopy_${CGEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_lncopy_${CGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_lncopy_${CGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" ${TSUFFIX} false "COMPLEX") + + GenerateNamedObjects("generic/ztrsm_utcopy_${CGEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_utcopy_${CGEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_utcopy_${CGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" ${TSUFFIX} false "COMPLEX") + 
GenerateNamedObjects("generic/ztrsm_utcopy_${CGEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" ${TSUFFIX} false "COMPLEX") + + GenerateNamedObjects("generic/ztrsm_ltcopy_${CGEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_ltcopy_${CGEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_ltcopy_${CGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/ztrsm_ltcopy_${CGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "COMPLEX") + GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "COMPLEX") endif () + endif () add_library(kernel${TSUFFIX} OBJECT ${OPENBLAS_SRC}) set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}") @@ -665,7 +850,7 @@ if (${DYNAMIC_ARCH}) set(BUILD_KERNEL 1) set(KDIR "") set(TSUFFIX "_${TARGET_CORE}") - set(KERNEL_DEFINITIONS "-DBUILD_KERNEL -DTABLE_NAME=gotoblas_${TARGET_CORE} -DTS=${TSUFFIX}") + set(KERNEL_DEFINITIONS "-DBUILD_KERNEL -DTABLE_NAME=gotoblas_${TARGET_CORE} -DTS=${TSUFFIX}") build_core("${TARGET_CORE}" "${KDIR}" "${TSUFFIX}" "${KERNEL_DEFINITIONS}") set(ADD_COMMONOBJS 0) endforeach() diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 index 2aeb8f0415..79399c3423 100644 --- a/kernel/Makefile.L2 +++ b/kernel/Makefile.L2 @@ -186,31 +186,46 @@ ifndef XHEMV_M_KERNEL XHEMV_M_KERNEL = ../generic/zhemv_k.c endif +ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" "" SBLASOBJS += \ - sgemv_n$(TSUFFIX).$(SUFFIX) sgemv_t$(TSUFFIX).$(SUFFIX) ssymv_U$(TSUFFIX).$(SUFFIX) ssymv_L$(TSUFFIX).$(SUFFIX) \ + sgemv_n$(TSUFFIX).$(SUFFIX) sgemv_t$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_SINGLE),1) +SBLASOBJS += \ + 
ssymv_U$(TSUFFIX).$(SUFFIX) ssymv_L$(TSUFFIX).$(SUFFIX) \ sger_k$(TSUFFIX).$(SUFFIX) - +endif +ifeq ($(BUILD_DOUBLE),1) DBLASOBJS += \ dgemv_n$(TSUFFIX).$(SUFFIX) dgemv_t$(TSUFFIX).$(SUFFIX) dsymv_U$(TSUFFIX).$(SUFFIX) dsymv_L$(TSUFFIX).$(SUFFIX) \ dger_k$(TSUFFIX).$(SUFFIX) - +endif QBLASOBJS += \ qgemv_n$(TSUFFIX).$(SUFFIX) qgemv_t$(TSUFFIX).$(SUFFIX) qsymv_U$(TSUFFIX).$(SUFFIX) qsymv_L$(TSUFFIX).$(SUFFIX) \ qger_k$(TSUFFIX).$(SUFFIX) - +ifeq ($(BUILD_COMPLEX),1) +SBLASOBJS += \ + sgemv_n$(TSUFFIX).$(SUFFIX) sgemv_t$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ cgemv_n$(TSUFFIX).$(SUFFIX) cgemv_t$(TSUFFIX).$(SUFFIX) cgemv_r$(TSUFFIX).$(SUFFIX) cgemv_c$(TSUFFIX).$(SUFFIX) \ cgemv_o$(TSUFFIX).$(SUFFIX) cgemv_u$(TSUFFIX).$(SUFFIX) cgemv_s$(TSUFFIX).$(SUFFIX) cgemv_d$(TSUFFIX).$(SUFFIX) \ csymv_U$(TSUFFIX).$(SUFFIX) csymv_L$(TSUFFIX).$(SUFFIX) \ chemv_U$(TSUFFIX).$(SUFFIX) chemv_L$(TSUFFIX).$(SUFFIX) chemv_V$(TSUFFIX).$(SUFFIX) chemv_M$(TSUFFIX).$(SUFFIX) \ cgeru_k$(TSUFFIX).$(SUFFIX) cgerc_k$(TSUFFIX).$(SUFFIX) cgerv_k$(TSUFFIX).$(SUFFIX) cgerd_k$(TSUFFIX).$(SUFFIX) - +endif +ifeq ($(BUILD_COMPLEX16),1) +CBLASOBJS += \ + cgemv_n$(TSUFFIX).$(SUFFIX) cgemv_t$(TSUFFIX).$(SUFFIX) cgemv_r$(TSUFFIX).$(SUFFIX) cgemv_c$(TSUFFIX).$(SUFFIX) \ + cgemv_o$(TSUFFIX).$(SUFFIX) cgemv_u$(TSUFFIX).$(SUFFIX) cgemv_s$(TSUFFIX).$(SUFFIX) cgemv_d$(TSUFFIX).$(SUFFIX) +DBLASOBJS += \ + dgemv_n$(TSUFFIX).$(SUFFIX) dgemv_t$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ zgemv_n$(TSUFFIX).$(SUFFIX) zgemv_t$(TSUFFIX).$(SUFFIX) zgemv_r$(TSUFFIX).$(SUFFIX) zgemv_c$(TSUFFIX).$(SUFFIX) \ zgemv_o$(TSUFFIX).$(SUFFIX) zgemv_u$(TSUFFIX).$(SUFFIX) zgemv_s$(TSUFFIX).$(SUFFIX) zgemv_d$(TSUFFIX).$(SUFFIX) \ zsymv_U$(TSUFFIX).$(SUFFIX) zsymv_L$(TSUFFIX).$(SUFFIX) \ zhemv_U$(TSUFFIX).$(SUFFIX) zhemv_L$(TSUFFIX).$(SUFFIX) zhemv_V$(TSUFFIX).$(SUFFIX) zhemv_M$(TSUFFIX).$(SUFFIX) \ zgeru_k$(TSUFFIX).$(SUFFIX) zgerc_k$(TSUFFIX).$(SUFFIX) zgerv_k$(TSUFFIX).$(SUFFIX) zgerd_k$(TSUFFIX).$(SUFFIX) +endif XBLASOBJS += \ 
xgemv_n$(TSUFFIX).$(SUFFIX) xgemv_t$(TSUFFIX).$(SUFFIX) xgemv_r$(TSUFFIX).$(SUFFIX) xgemv_c$(TSUFFIX).$(SUFFIX) \ @@ -219,17 +234,21 @@ XBLASOBJS += \ xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \ xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX) +ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" "" $(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@ $(KDIR)sgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -DTRANS $< -o $@ +endif +ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" $(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@ $(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@ +endif $(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -UTRANS $< -o $@ @@ -237,6 +256,8 @@ $(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)qgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVTKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -UCOMPLEX -DTRANS $< -o $@ + +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" $(KDIR)cgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ @@ -260,6 +281,10 @@ $(KDIR)cgemv_s$(TSUFFIX).$(SUFFIX) 
$(KDIR)cgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNE $(KDIR)cgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)cgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ +endif + + +ifeq ($(BUILD_COMPLEX16),1) $(KDIR)zgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ @@ -284,6 +309,7 @@ $(KDIR)zgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNE $(KDIR)zgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)zgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -DDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ +endif $(KDIR)xgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVNKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -UTRANS -UCONJ -UXCONJ $< -o $@ @@ -309,17 +335,25 @@ $(KDIR)xgemv_s$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_s$(TSUFFIX).$(PSUFFIX) : $(KERNE $(KDIR)xgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMVTKERNEL) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@ + +ifeq ($(BUILD_SINGLE),1) + $(KDIR)ssymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_U_KERNEL) $(SSYMV_U_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $@ $(KDIR)ssymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_L_KERNEL) $(SSYMV_L_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DLOWER $< -o $@ +endif + + +ifeq ($(BUILD_DOUBLE),1) $(KDIR)dsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_U_KERNEL) $(DSYMV_U_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -ULOWER $< -o $@ $(KDIR)dsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)dsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSYMV_L_KERNEL) $(DSYMV_L_PARAM) $(CC) -c $(CFLAGS) -UCOMPLEX 
-DDOUBLE -DLOWER $< -o $@ +endif $(KDIR)qsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_U_KERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -ULOWER $< -o $@ @@ -327,17 +361,23 @@ $(KDIR)qsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_U$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)qsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)qsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSYMV_L_KERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DLOWER $< -o $@ +ifeq ($(BUILD_COMPLEX),1) + $(KDIR)csymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_U_KERNEL) $(CSYMV_U_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER $< -o $@ $(KDIR)csymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)csymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSYMV_L_KERNEL) $(CSYMV_L_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DLOWER $< -o $@ +endif + +ifeq ($(BUILD_COMPLEX16),1) $(KDIR)zsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_U_KERNEL) $(ZSYMV_U_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER $< -o $@ $(KDIR)zsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)zsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSYMV_L_KERNEL) $(ZSYMV_L_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER $< -o $@ +endif $(KDIR)xsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_U_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER $< -o $@ @@ -345,15 +385,23 @@ $(KDIR)xsymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_U$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)xsymv_L$(TSUFFIX).$(SUFFIX) $(KDIR)xsymv_L$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSYMV_L_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER $< -o $@ +ifeq ($(BUILD_SINGLE),1) + $(KDIR)sger_k$(TSUFFIX).$(SUFFIX) $(KDIR)sger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGERKERNEL) $(SGERPARAM) $(CC) -c $(CFLAGS) -UDOUBLE $< -o $@ +endif + +ifeq ($(BUILD_DOUBLE),1) $(KDIR)dger_k$(TSUFFIX).$(SUFFIX) $(KDIR)dger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGERKERNEL) $(DGERPARAM) $(CC) -c $(CFLAGS) 
-DDOUBLE $< -o $@ +endif $(KDIR)qger_k$(TSUFFIX).$(SUFFIX) $(KDIR)qger_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGERKERNEL) $(QGERPARAM) $(CC) -c $(CFLAGS) -DXDOUBLE $< -o $@ +ifeq ($(BUILD_COMPLEX),1) + $(KDIR)cgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERUKERNEL) $(CGERPARAM) $(CC) -c $(CFLAGS) -UDOUBLE -UCONJ $< -o $@ @@ -365,6 +413,9 @@ $(KDIR)cgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerv_k$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)cgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)cgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGERCKERNEL) $(CGERPARAM) $(CC) -c $(CFLAGS) -UDOUBLE -DCONJ -DXCONJ $< -o $@ +endif + +ifeq ($(BUILD_COMPLEX16),1) $(KDIR)zgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERUKERNEL) $(ZGERPARAM) $(CC) -c $(CFLAGS) -DDOUBLE -UCONJ $< -o $@ @@ -377,6 +428,7 @@ $(KDIR)zgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerv_k$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)zgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)zgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGERCKERNEL) $(ZGERPARAM) $(CC) -c $(CFLAGS) -DDOUBLE -DCONJ -DXCONJ $< -o $@ +endif $(KDIR)xgeru_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgeru_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERUKERNEL) $(XGERPARAM) $(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ $< -o $@ @@ -390,6 +442,8 @@ $(KDIR)xgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerv_k$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM) $(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ -DXCONJ $< -o $@ +ifeq ($(BUILD_COMPLEX),1) + $(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $@ @@ -401,6 +455,9 @@ $(KDIR)chemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_V$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)chemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_M_KERNEL) $(CHEMV_L_PARAM) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX 
-UDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ +endif + +ifeq ($(BUILD_COMPLEX16),1) $(KDIR)zhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_U_KERNEL) $(ZHEMV_U_PARAM) $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -ULOWER -DHEMV $< -o $@ @@ -413,7 +470,7 @@ $(KDIR)zhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_V$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)zhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)zhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZHEMV_M_KERNEL) $(ZHEMV_L_PARAM) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ - +endif $(KDIR)xhemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_U_KERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -ULOWER -DHEMV $< -o $@ @@ -426,3 +483,4 @@ $(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ + diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 24e17d9b4b..e03ed0fad2 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -100,8 +100,10 @@ SHKERNELOBJS += \ $(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) endif +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ + sgemm_beta$(TSUFFIX).$(SUFFIX) \ $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) @@ -110,28 +112,36 @@ SKERNELOBJS += \ sgemm_direct$(TSUFFIX).$(SUFFIX) \ sgemm_direct_performant$(TSUFFIX).$(SUFFIX) endif +endif +ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" DKERNELOBJS += \ + dgemm_beta$(TSUFFIX).$(SUFFIX) \ dgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ $(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ) +endif QKERNELOBJS += \ qgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(QGEMMINCOPYOBJ) $(QGEMMITCOPYOBJ) \ $(QGEMMONCOPYOBJ) $(QGEMMOTCOPYOBJ) +ifneq "$(or 
$(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" CKERNELOBJS += \ cgemm_kernel_n$(TSUFFIX).$(SUFFIX) cgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ cgemm_kernel_l$(TSUFFIX).$(SUFFIX) cgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ $(CGEMMINCOPYOBJ) $(CGEMMITCOPYOBJ) \ $(CGEMMONCOPYOBJ) $(CGEMMOTCOPYOBJ) +endif +ifeq ($(BUILD_COMPLEX16),1) ZKERNELOBJS += \ zgemm_kernel_n$(TSUFFIX).$(SUFFIX) zgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ zgemm_kernel_l$(TSUFFIX).$(SUFFIX) zgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ $(ZGEMMINCOPYOBJ) $(ZGEMMITCOPYOBJ) \ $(ZGEMMONCOPYOBJ) $(ZGEMMOTCOPYOBJ) +endif XKERNELOBJS += \ xgemm_kernel_n$(TSUFFIX).$(SUFFIX) xgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ @@ -153,38 +163,48 @@ ifeq ($(BUILD_HALF),1) SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) endif +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" SBLASOBJS += \ sgemm_beta$(TSUFFIX).$(SUFFIX) \ strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ strmm_kernel_RN$(TSUFFIX).$(SUFFIX) strmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ strsm_kernel_LN$(TSUFFIX).$(SUFFIX) strsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_DOUBLE),1) DBLASOBJS += \ dgemm_beta$(TSUFFIX).$(SUFFIX) \ dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) +endif QBLASOBJS += \ qgemm_beta$(TSUFFIX).$(SUFFIX) \ qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ - qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) 
qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ + qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_COMPLEX),1) CBLASOBJS += \ - cgemm_beta$(TSUFFIX).$(SUFFIX) \ ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \ ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) +endif +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +CBLASOBJS += \ + cgemm_beta$(TSUFFIX).$(SUFFIX) \ ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX16),1) ZBLASOBJS += \ zgemm_beta$(TSUFFIX).$(SUFFIX) \ ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ @@ -194,7 +214,8 @@ ZBLASOBJS += \ ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) +endif XBLASOBJS += \ xgemm_beta$(TSUFFIX).$(SUFFIX) \ @@ -205,7 +226,7 @@ XBLASOBJS += \ xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \ xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \ xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \ - xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) \ + 
xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) ifeq ($(USE_GEMM3M), 1) @@ -215,6 +236,7 @@ XBLASOBJS += xgemm3m_kernel$(TSUFFIX).$(SUFFIX) endif +ifeq ($(BUILD_SINGLE),1) SBLASOBJS += \ strmm_iunucopy$(TSUFFIX).$(SUFFIX) strmm_iunncopy$(TSUFFIX).$(SUFFIX) \ strmm_ilnucopy$(TSUFFIX).$(SUFFIX) strmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ @@ -223,7 +245,10 @@ SBLASOBJS += \ strmm_ounucopy$(TSUFFIX).$(SUFFIX) strmm_ounncopy$(TSUFFIX).$(SUFFIX) \ strmm_olnucopy$(TSUFFIX).$(SUFFIX) strmm_olnncopy$(TSUFFIX).$(SUFFIX) \ strmm_outucopy$(TSUFFIX).$(SUFFIX) strmm_outncopy$(TSUFFIX).$(SUFFIX) \ - strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) +endif +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" +SBLASOBJS += \ strsm_iunucopy$(TSUFFIX).$(SUFFIX) strsm_iunncopy$(TSUFFIX).$(SUFFIX) \ strsm_ilnucopy$(TSUFFIX).$(SUFFIX) strsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ strsm_iutucopy$(TSUFFIX).$(SUFFIX) strsm_iutncopy$(TSUFFIX).$(SUFFIX) \ @@ -231,10 +256,15 @@ SBLASOBJS += \ strsm_ounucopy$(TSUFFIX).$(SUFFIX) strsm_ounncopy$(TSUFFIX).$(SUFFIX) \ strsm_olnucopy$(TSUFFIX).$(SUFFIX) strsm_olnncopy$(TSUFFIX).$(SUFFIX) \ strsm_outucopy$(TSUFFIX).$(SUFFIX) strsm_outncopy$(TSUFFIX).$(SUFFIX) \ - strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) \ + strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_SINGLE),1) +SBLASOBJS += \ ssymm_iutcopy$(TSUFFIX).$(SUFFIX) ssymm_iltcopy$(TSUFFIX).$(SUFFIX) \ ssymm_outcopy$(TSUFFIX).$(SUFFIX) ssymm_oltcopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_DOUBLE),1) DBLASOBJS += \ dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ @@ -254,6 +284,7 @@ DBLASOBJS += \ dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ dsymm_iutcopy$(TSUFFIX).$(SUFFIX) dsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ 
dsymm_outcopy$(TSUFFIX).$(SUFFIX) dsymm_oltcopy$(TSUFFIX).$(SUFFIX) +endif QBLASOBJS += \ qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ @@ -273,8 +304,9 @@ QBLASOBJS += \ qtrsm_outucopy$(TSUFFIX).$(SUFFIX) qtrsm_outncopy$(TSUFFIX).$(SUFFIX) \ qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ qsymm_iutcopy$(TSUFFIX).$(SUFFIX) qsymm_iltcopy$(TSUFFIX).$(SUFFIX) \ - qsymm_outcopy$(TSUFFIX).$(SUFFIX) qsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ + qsymm_outcopy$(TSUFFIX).$(SUFFIX) qsymm_oltcopy$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_COMPLEX),1) CBLASOBJS += \ ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ @@ -284,6 +316,13 @@ CBLASOBJS += \ ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_outucopy$(TSUFFIX).$(SUFFIX) ctrmm_outncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) \ + csymm_iutcopy$(TSUFFIX).$(SUFFIX) csymm_iltcopy$(TSUFFIX).$(SUFFIX) \ + csymm_outcopy$(TSUFFIX).$(SUFFIX) csymm_oltcopy$(TSUFFIX).$(SUFFIX) \ + chemm_iutcopy$(TSUFFIX).$(SUFFIX) chemm_iltcopy$(TSUFFIX).$(SUFFIX) \ + chemm_outcopy$(TSUFFIX).$(SUFFIX) chemm_oltcopy$(TSUFFIX).$(SUFFIX) +endif +ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" +CBLASOBJS += \ ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) \ ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \ ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) \ @@ -291,12 +330,10 @@ CBLASOBJS += \ ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) \ ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) \ ctrsm_outucopy$(TSUFFIX).$(SUFFIX) ctrsm_outncopy$(TSUFFIX).$(SUFFIX) \ - ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) \ - csymm_iutcopy$(TSUFFIX).$(SUFFIX) csymm_iltcopy$(TSUFFIX).$(SUFFIX) \ - 
csymm_outcopy$(TSUFFIX).$(SUFFIX) csymm_oltcopy$(TSUFFIX).$(SUFFIX) \ - chemm_iutcopy$(TSUFFIX).$(SUFFIX) chemm_iltcopy$(TSUFFIX).$(SUFFIX) \ - chemm_outcopy$(TSUFFIX).$(SUFFIX) chemm_oltcopy$(TSUFFIX).$(SUFFIX) + ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX16),1) ZBLASOBJS += \ ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ @@ -318,6 +355,7 @@ ZBLASOBJS += \ zsymm_outcopy$(TSUFFIX).$(SUFFIX) zsymm_oltcopy$(TSUFFIX).$(SUFFIX) \ zhemm_iutcopy$(TSUFFIX).$(SUFFIX) zhemm_iltcopy$(TSUFFIX).$(SUFFIX) \ zhemm_outcopy$(TSUFFIX).$(SUFFIX) zhemm_oltcopy$(TSUFFIX).$(SUFFIX) +endif XBLASOBJS += \ xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \ @@ -343,6 +381,7 @@ XBLASOBJS += \ ifeq ($(USE_GEMM3M), 1) +ifeq ($(BUILD_COMPLEX),1) CBLASOBJS += \ cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ @@ -362,7 +401,9 @@ CBLASOBJS += \ chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX16),1) ZBLASOBJS += \ zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \ @@ -382,6 +423,7 @@ ZBLASOBJS += \ zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \ zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \ zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) +endif XBLASOBJS += \ xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \ @@ -406,20 +448,25 @@ XBLASOBJS += \ endif ###### BLAS extensions ##### + +ifeq ($(BUILD_SINGLE),1) SBLASOBJS += \ 
somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ simatcopy_k_cn$(TSUFFIX).$(SUFFIX) simatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ simatcopy_k_ct$(TSUFFIX).$(SUFFIX) simatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ sgeadd_k$(TSUFFIX).$(SUFFIX) - +endif +ifeq ($(BUILD_DOUBLE),1) DBLASOBJS += \ domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ dgeadd_k$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX),1) CBLASOBJS += \ comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ @@ -430,7 +477,9 @@ CBLASOBJS += \ cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ cgeadd_k$(TSUFFIX).$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX16),1) ZBLASOBJS += \ zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ @@ -441,6 +490,7 @@ ZBLASOBJS += \ zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ zgeadd_k$(TSUFFIX).$(SUFFIX) +endif ifeq ($(BUILD_HALF), 1) SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 550af86a6c..dd49d8e4ec 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -114,7 +114,7 @@ gotoblas_t TABLE_NAME = { #endif #endif -#if defined( BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if ( BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) 0, 0, 0, SGEMM_DEFAULT_UNROLL_M, 
SGEMM_DEFAULT_UNROLL_N, #ifdef SGEMM_DEFAULT_UNROLL_MN @@ -130,34 +130,38 @@ gotoblas_t TABLE_NAME = { 0, #endif -#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1 ) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) samax_kTS, samin_kTS, smax_kTS, smin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, snrm2_kTS, sasum_kTS, #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 ssum_kTS, #endif -#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) scopy_kTS, sdot_kTS, // dsdot_kTS, srot_kTS, saxpy_kTS, - sscal_kTS, +#endif +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) + sscal_kTS, +#endif +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) sswap_kTS, sgemv_nTS, sgemv_tTS, #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 sger_kTS, ssymv_LTS, ssymv_UTS, +#endif +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) #ifdef ARCH_X86_64 sgemm_directTS, sgemm_direct_performantTS, #endif -#endif -#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) sgemm_kernelTS, sgemm_betaTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N sgemm_incopyTS, sgemm_itcopyTS, @@ -167,7 +171,7 @@ gotoblas_t TABLE_NAME = { sgemm_oncopyTS, sgemm_otcopyTS, #endif -#ifdef BUILD_SINGLE +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N strsm_iunucopyTS, strsm_iunncopyTS, strsm_iutucopyTS, strsm_iutncopyTS, @@ -178,6 +182,8 @@ gotoblas_t TABLE_NAME = { #endif strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS, strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS, +#endif +#if BUILD_SINGLE == 1 strmm_kernel_RNTS, strmm_kernel_RTTS, strmm_kernel_LNTS, strmm_kernel_LTTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N strmm_iunucopyTS, strmm_iunncopyTS, strmm_iutucopyTS, strmm_iutncopyTS, 
@@ -194,16 +200,16 @@ gotoblas_t TABLE_NAME = { ssymm_outcopyTS, ssymm_oltcopyTS, #endif ssymm_outcopyTS, ssymm_oltcopyTS, - +#endif +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) #ifndef NO_LAPACK sneg_tcopyTS, slaswp_ncopyTS, #else NULL,NULL, #endif - #endif -#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) 0, 0, 0, DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, #ifdef DGEMM_DEFAULT_UNROLL_MN @@ -214,33 +220,33 @@ gotoblas_t TABLE_NAME = { #endif -#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS, dnrm2_kTS, dasum_kTS, #endif -#if defined (BUILD_DOUBLE) +#if (BUILD_DOUBLE==1) dsum_kTS, #endif -#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) dcopy_kTS, ddot_kTS, #endif -#if defined (BUILD_SINGLE) || defined(BUILD_DOUBLE) +#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) dsdot_kTS, #endif -#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS, dgemv_nTS, dgemv_tTS, #endif -#if defined (BUILD_DOUBLE) +#if (BUILD_DOUBLE==1) dger_kTS, dsymv_LTS, dsymv_UTS, #endif -#if defined (BUILD_DOUBLE) || defined(BUILD_COMPLEX16) +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) dgemm_kernelTS, dgemm_betaTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dgemm_incopyTS, dgemm_itcopyTS, @@ -250,7 +256,7 @@ gotoblas_t TABLE_NAME = { dgemm_oncopyTS, dgemm_otcopyTS, #endif -#if defined (BUILD_DOUBLE) +#if (BUILD_DOUBLE==1) dtrsm_kernel_LNTS, dtrsm_kernel_LTTS, dtrsm_kernel_RNTS, dtrsm_kernel_RTTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dtrsm_iunucopyTS, dtrsm_iunncopyTS, dtrsm_iutucopyTS, dtrsm_iutncopyTS, @@ -340,7 +346,7 @@ gotoblas_t TABLE_NAME = { #endif -#ifdef BUILD_COMPLEX +#if (BUILD_COMPLEX || BUILD_COMPLEX16) 0, 0, 0, CGEMM_DEFAULT_UNROLL_M, 
CGEMM_DEFAULT_UNROLL_N, #ifdef CGEMM_DEFAULT_UNROLL_MN @@ -348,21 +354,34 @@ gotoblas_t TABLE_NAME = { #else MAX(CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N), #endif - camax_kTS, camin_kTS, icamax_kTS, icamin_kTS, - cnrm2_kTS, casum_kTS, csum_kTS, ccopy_kTS, - cdotu_kTS, cdotc_kTS, csrot_kTS, - caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS, +#endif +#if (BUILD_COMPLEX) + cnrm2_kTS, casum_kTS, csum_kTS, +#endif +#if (BUILD_COMPLEX || BUILD_COMPLEX16) + ccopy_kTS, cdotu_kTS, cdotc_kTS, +#endif +#if (BUILD_COMPLEX) + csrot_kTS, +#endif +#if (BUILD_COMPLEX || BUILD_COMPLEX16) + caxpy_kTS, + caxpyc_kTS, + cscal_kTS, + cswap_kTS, cgemv_nTS, cgemv_tTS, cgemv_rTS, cgemv_cTS, cgemv_oTS, cgemv_uTS, cgemv_sTS, cgemv_dTS, +#endif +#if (BUILD_COMPLEX) cgeru_kTS, cgerc_kTS, cgerv_kTS, cgerd_kTS, csymv_LTS, csymv_UTS, chemv_LTS, chemv_UTS, chemv_MTS, chemv_VTS, - +#endif +#if (BUILD_COMPLEX || BUILD_COMPLEX16) cgemm_kernel_nTS, cgemm_kernel_lTS, cgemm_kernel_rTS, cgemm_kernel_bTS, cgemm_betaTS, - #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N cgemm_incopyTS, cgemm_itcopyTS, #else @@ -382,6 +401,8 @@ gotoblas_t TABLE_NAME = { #endif ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS, ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS, +#endif +#if (BUILD_COMPLEX) ctrmm_kernel_RNTS, ctrmm_kernel_RTTS, ctrmm_kernel_RRTS, ctrmm_kernel_RCTS, ctrmm_kernel_LNTS, ctrmm_kernel_LTTS, ctrmm_kernel_LRTS, ctrmm_kernel_LCTS, @@ -411,7 +432,7 @@ gotoblas_t TABLE_NAME = { 0, 0, 0, -#if defined(USE_GEMM3M) +#if (USE_GEMM3M) #ifdef CGEMM3M_DEFAULT_UNROLL_M CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N, MAX(CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N), #else @@ -469,16 +490,20 @@ gotoblas_t TABLE_NAME = { NULL, NULL, NULL, NULL, #endif +#endif +#if (BUILD_COMPLEX || BUILD_COMPLEX16) #ifndef NO_LAPACK - cneg_tcopyTS, claswp_ncopyTS, + cneg_tcopyTS, + + claswp_ncopyTS, #else NULL, NULL, #endif #endif -#ifdef BUILD_COMPLEX16 +#if 
BUILD_COMPLEX16 == 1 0, 0, 0, ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, #ifdef ZGEMM_DEFAULT_UNROLL_MN @@ -548,7 +573,7 @@ gotoblas_t TABLE_NAME = { zhemm_outcopyTS, zhemm_oltcopyTS, 0, 0, 0, -#if defined(USE_GEMM3M) +#if (USE_GEMM3M) #ifdef ZGEMM3M_DEFAULT_UNROLL_M ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N, MAX(ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N), #else @@ -681,7 +706,7 @@ gotoblas_t TABLE_NAME = { xhemm_outcopyTS, xhemm_oltcopyTS, 0, 0, 0, -#if defined(USE_GEMM3M) +#if (USE_GEMM3M) QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N), xgemm3m_kernelTS, @@ -746,110 +771,110 @@ gotoblas_t TABLE_NAME = { init_parameter, SNUMOPT, DNUMOPT, QNUMOPT, -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 saxpby_kTS, #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE == 1 daxpby_kTS, #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX == 1 caxpby_kTS, #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16== 1 zaxpby_kTS, #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 somatcopy_k_cnTS, somatcopy_k_ctTS, somatcopy_k_rnTS, somatcopy_k_rtTS, #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE== 1 domatcopy_k_cnTS, domatcopy_k_ctTS, domatcopy_k_rnTS, domatcopy_k_rtTS, #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX == 1 comatcopy_k_cnTS, comatcopy_k_ctTS, comatcopy_k_rnTS, comatcopy_k_rtTS, comatcopy_k_cncTS, comatcopy_k_ctcTS, comatcopy_k_rncTS, comatcopy_k_rtcTS, #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 == 1 zomatcopy_k_cnTS, zomatcopy_k_ctTS, zomatcopy_k_rnTS, zomatcopy_k_rtTS, zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS, #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 simatcopy_k_cnTS, simatcopy_k_ctTS, simatcopy_k_rnTS, simatcopy_k_rtTS, #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE== 1 dimatcopy_k_cnTS, dimatcopy_k_ctTS, dimatcopy_k_rnTS, dimatcopy_k_rtTS, #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX== 1 cimatcopy_k_cnTS, cimatcopy_k_ctTS, cimatcopy_k_rnTS, 
cimatcopy_k_rtTS, cimatcopy_k_cncTS, cimatcopy_k_ctcTS, cimatcopy_k_rncTS, cimatcopy_k_rtcTS, #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 zimatcopy_k_cnTS, zimatcopy_k_ctTS, zimatcopy_k_rnTS, zimatcopy_k_rtTS, zimatcopy_k_cncTS, zimatcopy_k_ctcTS, zimatcopy_k_rncTS, zimatcopy_k_rtcTS, #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 sgeadd_kTS, #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE==1 dgeadd_kTS, #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 cgeadd_kTS, #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 zgeadd_kTS #endif }; -#if defined(ARCH_ARM64) +#if (ARCH_ARM64) static void init_parameter(void) { -#if defined(BUILD_HALF) +#if (BUILD_HALF) TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; #endif -#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE == 1 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif -#if defined(BUILD_HALF) +#if (BUILD_HALF) TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE== 1 TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX== 1 TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; #endif -#if defined(BUILD_HALF) +#if (BUILD_HALF) TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE==1 TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 
TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; #endif @@ -862,7 +887,7 @@ static void init_parameter(void) { TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R; #endif -#if defined(USE_GEMM3M) +#if (USE_GEMM3M) #ifdef CGEMM3M_DEFAULT_P TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; #else @@ -907,8 +932,8 @@ static void init_parameter(void) { #endif } -#else // defined(ARCH_ARM64) -#if defined(ARCH_POWER) +#else // (ARCH_ARM64) +#if (ARCH_POWER) static void init_parameter(void) { #ifdef BUILD_HALF @@ -938,7 +963,7 @@ static void init_parameter(void) { } #else //POWER -#if defined(ARCH_ZARCH) +#if (ARCH_ZARCH) static void init_parameter(void) { #ifdef BUILD_HALF TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; @@ -1104,20 +1129,20 @@ static void init_parameter(void) { TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; #endif -#ifdef BUILD_SINGLE +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; #endif -#ifdef BUILD_DOUBLE +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX == 1 TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX == 1 #ifdef CGEMM3M_DEFAULT_Q TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; #else @@ -1125,7 +1150,7 @@ static void init_parameter(void) { #endif #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 == 1 #ifdef ZGEMM3M_DEFAULT_Q TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; #else @@ -1139,22 +1164,22 @@ static void init_parameter(void) { TABLE_NAME.xgemm3m_q = QGEMM_DEFAULT_Q; #endif -#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON) +#if (CORE_KATMAI) || (CORE_COPPERMINE) || (CORE_BANIAS) || (CORE_YONAH) || (CORE_ATHLON) #ifdef DEBUG fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) 
+#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 64 * (l2 >> 7); #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE == 1 TABLE_NAME.dgemm_p = 32 * (l2 >> 7); #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = 32 * (l2 >> 7); #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_p = 16 * (l2 >> 7); #endif #ifdef EXPRECISION @@ -1169,16 +1194,16 @@ static void init_parameter(void) { fprintf(stderr, "Northwood\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 96 * (l2 >> 7); #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE == 1 TABLE_NAME.dgemm_p = 48 * (l2 >> 7); #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = 48 * (l2 >> 7); #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_p = 24 * (l2 >> 7); #endif #ifdef EXPRECISION @@ -1193,16 +1218,16 @@ static void init_parameter(void) { fprintf(stderr, "Atom\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 256; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE ==1 TABLE_NAME.dgemm_p = 128; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = 128; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_p = 64; #endif #ifdef EXPRECISION @@ -1217,16 +1242,16 @@ static void init_parameter(void) { fprintf(stderr, "Prescott\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 56 * (l2 >> 7); #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE ==1 TABLE_NAME.dgemm_p = 28 * (l2 >> 7); #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = 28 * (l2 >> 7); #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 == 1 TABLE_NAME.zgemm_p = 14 * (l2 >> 7); #endif #ifdef EXPRECISION @@ -1241,16 +1266,16 @@ static void init_parameter(void) { fprintf(stderr, "Core2\n"); 
#endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 92 * (l2 >> 9) + 8; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE==1 TABLE_NAME.dgemm_p = 46 * (l2 >> 9) + 8; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = 46 * (l2 >> 9) + 4; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_p = 23 * (l2 >> 9) + 4; #endif #ifdef EXPRECISION @@ -1265,16 +1290,16 @@ static void init_parameter(void) { fprintf(stderr, "Penryn\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE == 1 TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4; #endif #ifdef EXPRECISION @@ -1289,16 +1314,16 @@ static void init_parameter(void) { fprintf(stderr, "Dunnington\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE ==1 TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4; #endif #ifdef EXPRECISION @@ -1314,16 +1339,16 @@ static void init_parameter(void) { fprintf(stderr, "Nehalem\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1338,16 +1363,16 @@ static void init_parameter(void) { fprintf(stderr, "Sandybridge\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1362,16 +1387,16 @@ static void init_parameter(void) { fprintf(stderr, "Haswell\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16) TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1380,22 +1405,22 @@ static void init_parameter(void) { #endif #endif -#if defined (SKYLAKEX) || defined (COOPERLAKE) +#if defined(SKYLAKEX) || defined(COOPERLAKE) #ifdef DEBUG fprintf(stderr, "SkylakeX\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1411,16 +1436,16 @@ static void init_parameter(void) { fprintf(stderr, "Opteron\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = 224 + 56 * 
(l2 >> 7); #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = 112 + 28 * (l2 >> 7); #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = 112 + 28 * (l2 >> 7); #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = 56 + 14 * (l2 >> 7); #endif #ifdef EXPRECISION @@ -1435,16 +1460,16 @@ static void init_parameter(void) { fprintf(stderr, "Barcelona\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1459,16 +1484,16 @@ static void init_parameter(void) { fprintf(stderr, "Bobcate\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1483,16 +1508,16 @@ static void init_parameter(void) { fprintf(stderr, "Bulldozer\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1507,16 +1532,16 @@ static void init_parameter(void) { fprintf(stderr, "Excavator\n"); #endif -#if defined (BUILD_SINGLE) 
|| defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1532,16 +1557,16 @@ static void init_parameter(void) { fprintf(stderr, "Piledriver\n"); #endif -#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1556,16 +1581,16 @@ static void init_parameter(void) { fprintf(stderr, "Steamroller\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1580,16 +1605,16 @@ static void init_parameter(void) { fprintf(stderr, "Zen\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif #ifdef EXPRECISION @@ -1605,16 +1630,16 @@ static void 
init_parameter(void) { fprintf(stderr, "NANO\n"); #endif -#if defined (BUILD_SINGLE) || defined(BUILD_COMPLEX) +#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; #endif -#ifdef BUILD_DOUBLE +#if (BUILD_DOUBLE==1) TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX +#if (BUILD_COMPLEX==1) TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; #endif -#ifdef BUILD_COMPLEX16 +#if (BUILD_COMPLEX16==1) TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif @@ -1626,7 +1651,7 @@ static void init_parameter(void) { #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 #ifdef CGEMM3M_DEFAULT_P TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; #else @@ -1634,7 +1659,7 @@ static void init_parameter(void) { #endif #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 #ifdef ZGEMM3M_DEFAULT_P TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; #else @@ -1647,20 +1672,20 @@ static void init_parameter(void) { #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE == 1 TABLE_NAME.sgemm_p = ((TABLE_NAME.sgemm_p + SGEMM_DEFAULT_UNROLL_M - 1)/SGEMM_DEFAULT_UNROLL_M) * SGEMM_DEFAULT_UNROLL_M; #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE== 1 TABLE_NAME.dgemm_p = ((TABLE_NAME.dgemm_p + DGEMM_DEFAULT_UNROLL_M - 1)/DGEMM_DEFAULT_UNROLL_M) * DGEMM_DEFAULT_UNROLL_M; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 TABLE_NAME.cgemm_p = ((TABLE_NAME.cgemm_p + CGEMM_DEFAULT_UNROLL_M - 1)/CGEMM_DEFAULT_UNROLL_M) * CGEMM_DEFAULT_UNROLL_M; #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 TABLE_NAME.zgemm_p = ((TABLE_NAME.zgemm_p + ZGEMM_DEFAULT_UNROLL_M - 1)/ZGEMM_DEFAULT_UNROLL_M) * ZGEMM_DEFAULT_UNROLL_M; #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX==1 #ifdef CGEMM3M_DEFAULT_UNROLL_M TABLE_NAME.cgemm3m_p = ((TABLE_NAME.cgemm3m_p + CGEMM3M_DEFAULT_UNROLL_M - 1)/CGEMM3M_DEFAULT_UNROLL_M) * CGEMM3M_DEFAULT_UNROLL_M; #else @@ -1668,7 +1693,7 @@ static void init_parameter(void) { #endif #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16==1 #ifdef ZGEMM3M_DEFAULT_UNROLL_M TABLE_NAME.zgemm3m_p = 
((TABLE_NAME.zgemm3m_p + ZGEMM3M_DEFAULT_UNROLL_M - 1)/ZGEMM3M_DEFAULT_UNROLL_M) * ZGEMM3M_DEFAULT_UNROLL_M; #else @@ -1686,14 +1711,14 @@ static void init_parameter(void) { fprintf(stderr, "L2 = %8d DGEMM_P .. %d\n", l2, TABLE_NAME.dgemm_p); #endif -#ifdef BUILD_SINGLE +#if BUILD_SINGLE==1 TABLE_NAME.sgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.sgemm_q * 4) - 15) & ~15); #endif -#ifdef BUILD_DOUBLE +#if BUILD_DOUBLE==1 TABLE_NAME.dgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) @@ -1707,28 +1732,28 @@ static void init_parameter(void) { ) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15); #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX ==1 TABLE_NAME.cgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.cgemm_q * 8) - 15) & ~15); #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 ==1 TABLE_NAME.zgemm_r = (((BUFFER_SIZE - ((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15); #endif -#ifdef BUILD_COMPLEX +#if BUILD_COMPLEX == 1 TABLE_NAME.cgemm3m_r = (((BUFFER_SIZE - ((TABLE_NAME.cgemm3m_p * TABLE_NAME.cgemm3m_q * 8 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) ) / (TABLE_NAME.cgemm3m_q * 8) - 15) & ~15); #endif -#ifdef BUILD_COMPLEX16 +#if BUILD_COMPLEX16 == 1 TABLE_NAME.zgemm3m_r = (((BUFFER_SIZE - ((TABLE_NAME.zgemm3m_p * TABLE_NAME.zgemm3m_q * 16 + TABLE_NAME.offsetA + TABLE_NAME.align) & ~TABLE_NAME.align) @@ -1755,4 +1780,4 @@ static void init_parameter(void) { } #endif //POWER #endif //ZARCH -#endif //defined(ARCH_ARM64) +#endif //(ARCH_ARM64) From 0f7d73ff6d66e651e4d96b26056932746e885f1c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:53:26 +0200 
Subject: [PATCH 273/349] Allow supporting only a subset of variable types --- interface/CMakeLists.txt | 4 +-- interface/Makefile | 54 +++++++++++++++++++++++++++++++++++----- 2 files changed, 50 insertions(+), 8 deletions(-) diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index ad56c6dbaa..5346ecadde 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -171,7 +171,7 @@ if (NOT DEFINED NO_LAPACK) GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" 0 3) endif () -if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) +if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) GenerateNamedObjects("scal.c" "" "scal" 0 "" "" false "SINGLE") GenerateNamedObjects("copy.c" "" "copy" 0 "" "" false "SINGLE") GenerateNamedObjects("dot.c" "" "dot" 0 "" "" false "SINGLE") @@ -184,7 +184,7 @@ if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) GenerateNamedObjects("axpy.c" "" "axpy" 0 "" "" false "SINGLE") GenerateNamedObjects("imax.c" "USE_ABS" "i*amax" 0 "" "" false "SINGLE") endif () -if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) +if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("scal.c" "" "scal" 0 "" "" false "DOUBLE") GenerateNamedObjects("copy.c" "" "copy" 0 "" "" false "DOUBLE") GenerateNamedObjects("dot.c" "" "dot" 0 "" "" false "DOUBLE") diff --git a/interface/Makefile b/interface/Makefile index fde6227bc4..71393aaba9 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -329,7 +329,10 @@ CCBLAS3OBJS = \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ - cblas_cgeadd.$(SUFFIX) cblas_xerbla.$(SUFFIX) + cblas_cgeadd.$(SUFFIX) + +CXERBLAOBJ = \ + cblas_xerbla.$(SUFFIX) @@ -391,6 +394,8 @@ ZBLAS2OBJS += $(CZBLAS2OBJS) ZBLAS3OBJS += $(CZBLAS3OBJS) SHEXTOBJS += $(CSHEXTOBJS) + +CBAUXOBJS += $(CXERBLAOBJ) endif SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) @@ -434,13 +439,11 @@ 
QLAPACKOBJS = \ # cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ # clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) cpotri.$(SUFFIX) - CLAPACKOBJS = \ cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \ cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \ clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX) - #ZLAPACKOBJS = \ # zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \ # zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \ @@ -469,8 +472,42 @@ ZBLASOBJS += $(ZLAPACKOBJS) endif -FUNCOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +ifneq ($(BUILD_SINGLE),1) + SBLASOBJS= +ifeq ($(BUILD_DOUBLE),1) + SBLASOBJS = dsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) strsm.$(SUFFIX) \ + sgetrs.$(SUFFIX) sgetrf.$(SUFFIX) spotf2.$(SUFFIX) spotrf.$(SUFFIX) \ + ssyrk.$(SUFFIX) sgemv.$(SUFFIX) +endif +ifeq ($(BUILD_COMPLEX),1) + SBLASOBJS = \ + sdot.$(SUFFIX) srot.$(SUFFIX) snrm2.$(SUFFIX) sswap.$(SUFFIX) \ + isamax.$(SUFFIX) saxpy.$(SUFFIX) sscal.$(SUFFIX) scopy.$(SUFFIX) \ + sgemv.$(SUFFIX) sgemm.$(SUFFIX) +endif +endif +ifneq ($(BUILD_DOUBLE),1) + DBLASOBJS= +ifeq ($(BUILD_COMPLEX16),1) + DBLASOBJS = \ + ddot.$(SUFFIX) drot.$(SUFFIX) dnrm2.$(SUFFIX) dswap.$(SUFFIX) \ + idamax.$(SUFFIX) daxpy.$(SUFFIX) dscal.$(SUFFIX) dcopy.$(SUFFIX) \ + dgemv.$(SUFFIX) dgemm.$(SUFFIX) +endif +endif +ifneq ($(BUILD_COMPLEX),1) + CBLASOBJS= +ifeq ($(BUILD_COMPLEX16),1) + CBLASOBJS = cgetrs.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) cgetrf.$(SUFFIX) \ + cpotrf.$(SUFFIX) ctrsm.$(SUFFIX) cblas_cdotc_sub.$(SUFFIX) +endif +endif +ifneq ($(BUILD_COMPLEX16),1) + ZBLASOBJS= +endif +FUNCOBJS = $(SHEXTOBJS) $(CXERBLAOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +$(info FUNCOBJS = {[$(FUNCOBJS)]} ) ifdef EXPRECISION FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) endif @@ -481,6 +518,7 @@ endif FUNCALLFILES = $(FUNCOBJS:.$(SUFFIX)=) + include 
$(TOPDIR)/Makefile.tail all :: libs @@ -503,11 +541,14 @@ level1 : $(BEXTOBJS) $(SHBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $( level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -level3 : $(SHBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) +level3 : $(SHBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) + $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ + +aux : $(CBAUXOBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ $(CSHBLASOBJS) $(CSHBLASOBJS_P) $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ -$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : override CFLAGS += -DCBLAS +$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) $(CBAUXOBJS_P) : override CFLAGS += -DCBLAS srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c $(CC) $(CFLAGS) -c $< -o $(@F) @@ -2268,3 +2309,4 @@ cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c cblas_xerbla.$(SUFFIX) cblas_xerbla.$(PSUFFIX) : xerbla.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + From 886a8e319048ff92a923f989ca1a01b594b60808 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:57:32 +0200 Subject: [PATCH 274/349] Adapt for supporting only a subset of variable types --- driver/level3/CMakeLists.txt | 8 +++--- driver/level3/Makefile | 54 ++++++++++++++++++++++++++++++++++++ driver/level3/syrk_thread.c | 4 +-- 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/driver/level3/CMakeLists.txt b/driver/level3/CMakeLists.txt index 46cbb0d6d1..077862abc8 100644 --- a/driver/level3/CMakeLists.txt +++ b/driver/level3/CMakeLists.txt @@ -14,7 +14,7 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) endif () endforeach () -if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) +if ( BUILD_COMPLEX16 AND NOT 
BUILD_DOUBLE) foreach (GEMM_DEFINE ${GEMM_DEFINES}) string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "DOUBLE") @@ -23,7 +23,7 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) endif() endforeach() endif() -if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) +if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) foreach (GEMM_DEFINE ${GEMM_DEFINES}) string(TOLOWER ${GEMM_DEFINE} GEMM_DEFINE_LC) GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "SINGLE") @@ -119,7 +119,7 @@ foreach (float_type ${FLOAT_TYPES}) endif () endforeach () - if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) + if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) foreach (gemm_define ${GEMM_COMPLEX_DEFINES}) string(TOLOWER ${gemm_define} gemm_define_LC) if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) @@ -127,7 +127,7 @@ foreach (float_type ${FLOAT_TYPES}) endif() endforeach() endif () - if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) + if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) foreach (gemm_define ${GEMM_COMPLEX_DEFINES}) string(TOLOWER ${gemm_define} gemm_define_LC) if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) diff --git a/driver/level3/Makefile b/driver/level3/Makefile index 09a62d9bf5..e3aa30256c 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -287,6 +287,60 @@ HPLOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) \ dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) endif +ifneq ($(BUILD_SINGLE),1) + SBLASOBJS= +ifeq ($(BUILD_DOUBLE),1) + SBLASOBJS= \ + strsm_LNUU.$(SUFFIX) strsm_LNUN.$(SUFFIX) strsm_LNLU.$(SUFFIX) strsm_LNLN.$(SUFFIX) \ + strsm_LTUU.$(SUFFIX) strsm_LTUN.$(SUFFIX) strsm_LTLU.$(SUFFIX) strsm_LTLN.$(SUFFIX) \ + strsm_RNUU.$(SUFFIX) strsm_RNUN.$(SUFFIX) strsm_RNLU.$(SUFFIX) strsm_RNLN.$(SUFFIX) \ + strsm_RTUU.$(SUFFIX) strsm_RTUN.$(SUFFIX) strsm_RTLU.$(SUFFIX) strsm_RTLN.$(SUFFIX) \ + ssyrk_UN.$(SUFFIX) ssyrk_UT.$(SUFFIX) 
ssyrk_LN.$(SUFFIX) ssyrk_LT.$(SUFFIX) \ + ssyrk_kernel_U.$(SUFFIX) ssyrk_kernel_L.$(SUFFIX) +ifndef USE_SIMPLE_THREADED_LEVEL3 +SBLASOBJS += ssyrk_thread_UN.$(SUFFIX) ssyrk_thread_UT.$(SUFFIX) ssyrk_thread_LN.$(SUFFIX) ssyrk_thread_LT.$(SUFFIX) +endif +endif +ifeq ($(BUILD_COMPLEX),1) + SBLASOBJS = sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) +ifndef USE_SIMPLE_THREADED_LEVEL3 +SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) +endif +endif +endif +ifneq ($(BUILD_DOUBLE),1) + DBLASOBJS= +ifeq ($(BUILD_COMPLEX16),1) + DBLASOBJS = dgemm_nn.$(SUFFIX) dgemm_nt.$(SUFFIX) dgemm_tn.$(SUFFIX) dgemm_tt.$(SUFFIX) +ifndef USE_SIMPLE_THREADED_LEVEL3 +DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) +endif +endif +endif +ifneq ($(BUILD_COMPLEX),1) + CBLASOBJS= +ifeq ($(BUILD_COMPLEX16),1) + CBLASOBJS= \ + cherk_UN.$(SUFFIX) cherk_UC.$(SUFFIX) cherk_LN.$(SUFFIX) cherk_LC.$(SUFFIX) \ + cherk_kernel_UN.$(SUFFIX) cherk_kernel_UC.$(SUFFIX) \ + cherk_kernel_LN.$(SUFFIX) cherk_kernel_LC.$(SUFFIX) \ + ctrsm_LNUU.$(SUFFIX) ctrsm_LNUN.$(SUFFIX) ctrsm_LNLU.$(SUFFIX) ctrsm_LNLN.$(SUFFIX) \ + ctrsm_LTUU.$(SUFFIX) ctrsm_LTUN.$(SUFFIX) ctrsm_LTLU.$(SUFFIX) ctrsm_LTLN.$(SUFFIX) \ + ctrsm_LRUU.$(SUFFIX) ctrsm_LRUN.$(SUFFIX) ctrsm_LRLU.$(SUFFIX) ctrsm_LRLN.$(SUFFIX) \ + ctrsm_LCUU.$(SUFFIX) ctrsm_LCUN.$(SUFFIX) ctrsm_LCLU.$(SUFFIX) ctrsm_LCLN.$(SUFFIX) \ + ctrsm_RNUU.$(SUFFIX) ctrsm_RNUN.$(SUFFIX) ctrsm_RNLU.$(SUFFIX) ctrsm_RNLN.$(SUFFIX) \ + ctrsm_RTUU.$(SUFFIX) ctrsm_RTUN.$(SUFFIX) ctrsm_RTLU.$(SUFFIX) ctrsm_RTLN.$(SUFFIX) \ + ctrsm_RRUU.$(SUFFIX) ctrsm_RRUN.$(SUFFIX) ctrsm_RRLU.$(SUFFIX) ctrsm_RRLN.$(SUFFIX) \ + ctrsm_RCUU.$(SUFFIX) ctrsm_RCUN.$(SUFFIX) ctrsm_RCLU.$(SUFFIX) ctrsm_RCLN.$(SUFFIX) +ifndef USE_SIMPLE_THREADED_LEVEL3 +CBLASOBJS += cherk_thread_UN.$(SUFFIX) cherk_thread_UC.$(SUFFIX) cherk_thread_LN.$(SUFFIX) 
cherk_thread_LC.$(SUFFIX) +endif +endif +endif +ifneq ($(BUILD_COMPLEX16),1) + ZBLASOBJS= +endif + all :: shgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c index 753cdb5ca7..12808afd5e 100644 --- a/driver/level3/syrk_thread.c +++ b/driver/level3/syrk_thread.c @@ -56,12 +56,12 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( if (!(mode & BLAS_COMPLEX)) { switch (mode & BLAS_PREC) { -#ifdef BUILD_SINGLE +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) case BLAS_SINGLE: mask = SGEMM_UNROLL_MN - 1; break; #endif -#ifdef BUILD_DOUBLE +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) case BLAS_DOUBLE: mask = DGEMM_UNROLL_MN - 1; break; From 887e00fd7fc328fb647bdc9aa2feb18898092a73 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 14:58:57 +0200 Subject: [PATCH 275/349] Adapt for supporting only a subset of variable types --- driver/level2/CMakeLists.txt | 4 +- driver/level2/Makefile | 82 +++++++++++++++++++++++++++++++++++- 2 files changed, 82 insertions(+), 4 deletions(-) diff --git a/driver/level2/CMakeLists.txt b/driver/level2/CMakeLists.txt index f72e707e18..61367e5960 100644 --- a/driver/level2/CMakeLists.txt +++ b/driver/level2/CMakeLists.txt @@ -197,13 +197,13 @@ foreach (float_type ${FLOAT_TYPES}) endif () endforeach () -if (DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_SINGLE) +if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) if (USE_THREAD) GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE") GenerateNamedObjects("gemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "SINGLE") endif () endif () -if (DEFINED BUILD_COMPLEX16 AND NOT DEFINED BUILD_DOUBLE) +if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) if (USE_THREAD) GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "DOUBLE") GenerateNamedObjects("gemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "DOUBLE") diff --git 
a/driver/level2/Makefile b/driver/level2/Makefile index 79c4ca153d..7212d66622 100644 --- a/driver/level2/Makefile +++ b/driver/level2/Makefile @@ -417,19 +417,63 @@ XBLASOBJS += \ endif +ifneq ($(BUILD_SINGLE),1) + SBLASOBJS= +ifeq ($(BUILD_DOUBLE),1) +ifdef SMP +SBLASOBJS += \ + sgemv_thread_n.$(SUFFIX) sgemv_thread_t.$(SUFFIX) \ + strsv_NUU.$(SUFFIX) strsv_NUN.$(SUFFIX) strsv_NLU.$(SUFFIX) strsv_NLN.$(SUFFIX) \ + strsv_TUU.$(SUFFIX) strsv_TUN.$(SUFFIX) strsv_TLU.$(SUFFIX) strsv_TLN.$(SUFFIX) +endif +endif +ifeq ($(BUILD_COMPLEX),1) +ifdef SMP + SBLASOBJS = sgemv_thread_n.$(SUFFIX) sgemv_thread_t.$(SUFFIX) +endif +endif +endif +ifneq ($(BUILD_DOUBLE),1) + DBLASOBJS= +ifeq ($(BUILD_COMPLEX16),1) +ifdef SMP + DBLASOBJS = dgemv_thread_n.$(SUFFIX) dgemv_thread_t.$(SUFFIX) +endif +endif +endif +ifneq ($(BUILD_COMPLEX),1) + CBLASOBJS= +ifeq ($(BUILD_COMPLEX16),1) + CBLASOBJS= \ + ctrsv_NUU.$(SUFFIX) ctrsv_NUN.$(SUFFIX) ctrsv_NLU.$(SUFFIX) ctrsv_NLN.$(SUFFIX) \ + ctrsv_TUU.$(SUFFIX) ctrsv_TUN.$(SUFFIX) ctrsv_TLU.$(SUFFIX) ctrsv_TLN.$(SUFFIX) \ + ctrsv_RUU.$(SUFFIX) ctrsv_RUN.$(SUFFIX) ctrsv_RLU.$(SUFFIX) ctrsv_RLN.$(SUFFIX) \ + ctrsv_CUU.$(SUFFIX) ctrsv_CUN.$(SUFFIX) ctrsv_CLU.$(SUFFIX) ctrsv_CLN.$(SUFFIX) +endif +endif +ifneq ($(BUILD_COMPLEX16),1) + ZBLASOBJS= +endif + all :: +ifeq ($(BUILD_SINGLE),1) + sgbmv_n.$(SUFFIX) sgbmv_n.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -UDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< sgbmv_t.$(SUFFIX) sgbmv_t.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -UDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< +endif + +ifeq ($(BUILD_DOUBLE),1) dgbmv_n.$(SUFFIX) dgbmv_n.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -DDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< dgbmv_t.$(SUFFIX) dgbmv_t.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -DDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< +endif qgbmv_n.$(SUFFIX) qgbmv_n.$(PSUFFIX) : gbmv_k.c $(CC) -c -UCOMPLEX -DXDOUBLE -UTRANS $(CFLAGS) -o $(@F) $< @@ -437,6 +481,8 @@ qgbmv_n.$(SUFFIX) qgbmv_n.$(PSUFFIX) : gbmv_k.c qgbmv_t.$(SUFFIX) qgbmv_t.$(PSUFFIX) 
: gbmv_k.c $(CC) -c -UCOMPLEX -DXDOUBLE -DTRANS $(CFLAGS) -o $(@F) $< +ifeq ($(BUILD_COMPLEX),1) + cgbmv_n.$(SUFFIX) cgbmv_n.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< @@ -460,6 +506,9 @@ cgbmv_s.$(SUFFIX) cgbmv_s.$(PSUFFIX) : zgbmv_k.c cgbmv_d.$(SUFFIX) cgbmv_d.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< +endif + +ifeq ($(BUILD_COMPLEX16),1) zgbmv_n.$(SUFFIX) zgbmv_n.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< @@ -484,6 +533,7 @@ zgbmv_s.$(SUFFIX) zgbmv_s.$(PSUFFIX) : zgbmv_k.c zgbmv_d.$(SUFFIX) zgbmv_d.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< +endif xgbmv_n.$(SUFFIX) xgbmv_n.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANS -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< @@ -509,24 +559,34 @@ xgbmv_s.$(SUFFIX) xgbmv_s.$(PSUFFIX) : zgbmv_k.c xgbmv_d.$(SUFFIX) xgbmv_d.$(PSUFFIX) : zgbmv_k.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANS -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +ifeq ($(BUILD_SINGLE),1) + sgbmv_thread_n.$(SUFFIX) sgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -UDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< sgbmv_thread_t.$(SUFFIX) sgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -UDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< +endif + + +ifeq ($(BUILD_DOUBLE),1) dgbmv_thread_n.$(SUFFIX) dgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -DDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< dgbmv_thread_t.$(SUFFIX) dgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -DDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< - +endif qgbmv_thread_n.$(SUFFIX) qgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -DXDOUBLE -UTRANSA $(CFLAGS) -o $(@F) $< qgbmv_thread_t.$(SUFFIX) qgbmv_thread_t.$(PSUFFIX) : gbmv_thread.c $(CC) -c -UCOMPLEX -DXDOUBLE -DTRANSA $(CFLAGS) -o $(@F) $< + +ifeq ($(BUILD_COMPLEX),1) + cgbmv_thread_n.$(SUFFIX) 
cgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< @@ -550,6 +610,10 @@ cgbmv_thread_s.$(SUFFIX) cgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c cgbmv_thread_d.$(SUFFIX) cgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< +endif + + +ifeq ($(BUILD_COMPLEX16),1) zgbmv_thread_n.$(SUFFIX) zgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< @@ -574,6 +638,7 @@ zgbmv_thread_s.$(SUFFIX) zgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c zgbmv_thread_d.$(SUFFIX) zgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< +endif xgbmv_thread_n.$(SUFFIX) xgbmv_thread_n.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $(CFLAGS) -o $(@F) $< @@ -599,24 +664,32 @@ xgbmv_thread_s.$(SUFFIX) xgbmv_thread_s.$(PSUFFIX) : gbmv_thread.c xgbmv_thread_d.$(SUFFIX) xgbmv_thread_d.$(PSUFFIX) : gbmv_thread.c $(CC) -c -DCOMPLEX -DXDOUBLE -DTRANSA -DCONJ -DXCONJ $(CFLAGS) -o $(@F) $< + +ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" sgemv_thread_n.$(SUFFIX) sgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) sgemv_thread_t.$(SUFFIX) sgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) +endif + +ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" dgemv_thread_n.$(SUFFIX) dgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) dgemv_thread_t.$(SUFFIX) dgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) - +endif qgemv_thread_n.$(SUFFIX) qgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) 
-UCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) qgemv_thread_t.$(SUFFIX) qgemv_thread_t.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) + +ifeq ($(BUILD_COMPLEX),1) + cgemv_thread_n.$(SUFFIX) cgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) @@ -640,6 +713,10 @@ cgemv_thread_s.$(SUFFIX) cgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common. cgemv_thread_d.$(SUFFIX) cgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F) +endif + + +ifeq ($(BUILD_COMPLEX16),1) zgemv_thread_n.$(SUFFIX) zgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) @@ -664,6 +741,7 @@ zgemv_thread_s.$(SUFFIX) zgemv_thread_s.$(PSUFFIX) : gemv_thread.c ../../common. zgemv_thread_d.$(SUFFIX) zgemv_thread_d.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DTRANSA -DCONJ -DXCONJ $< -o $(@F) +endif xgemv_thread_n.$(SUFFIX) xgemv_thread_n.$(PSUFFIX) : gemv_thread.c ../../common.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) From 68e6823d36a2e727c6db7bf850ba2b05b204a04a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 15:01:32 +0200 Subject: [PATCH 276/349] Adapt for supporting only a subset of variable types --- cmake/arch.cmake | 3 +-- cmake/lapack.cmake | 28 +++++++++++++++++----------- cmake/system.cmake | 23 +++++++++-------------- cmake/system_check.cmake | 2 +- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index c048f13d18..99e685d048 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -83,8 +83,7 @@ if (DYNAMIC_ARCH) endif () endif () - CHECK_INCLUDE_FILE ("${PROJECT_SOURCE_DIR}/config_kernel.h" TRAP) - if (TRAP) + if (EXISTS 
${PROJECT_SOURCE_DIR}/config_kernel.h) message (FATAL_ERROR "Your build directory contains a file config_kernel.h, probably from a previous compilation with make. This will conflict with the cmake compilation and cause strange compiler errors - please remove the file before trying again") endif () diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 18a74d18ef..73f2592ef0 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -1,11 +1,12 @@ # Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files. set(ALLAUX ilaenv.f ilaenv2stage.f ieeeck.f lsamen.f iparmq.f iparam2stage.F - ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f + ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f ../INSTALL/ilaver.f xerbla_array.f ../INSTALL/slamch.f) set(SCLAUX + scombssq.f sbdsvdx.f sstevx.f sstein.f sbdsdc.f sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f @@ -25,6 +26,7 @@ set(SCLAUX set(DZLAUX dbdsdc.f + dbdsvdx.f dstevx.f dstein.f dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f @@ -35,14 +37,14 @@ set(DZLAUX dlartg.f dlaruv.f dlas2.f dlascl.f dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f - dlaset.f dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f + dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f dlasr.f dlasrt.f dlassq.f dlasv2.f dpttrf.f dstebz.f dstedc.f dsteqr.f dsterf.f dlaisnan.f disnan.f dlartgp.f dlartgs.f ../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f) set(SLASRC - sbdsvdx.f sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f + sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f sgehd2.f 
sgehrd.f sgelq2.f sgelqf.f @@ -83,8 +85,8 @@ set(SLASRC ssbev.f ssbevd.f ssbevx.f ssbgst.f ssbgv.f ssbgvd.f ssbgvx.f ssbtrd.f sspcon.f sspev.f sspevd.f sspevx.f sspgst.f sspgv.f sspgvd.f sspgvx.f ssprfs.f sspsv.f sspsvx.f ssptrd.f - ssptrf.f ssptri.f ssptrs.f sstegr.f sstein.f sstev.f sstevd.f sstevr.f - sstevx.f ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f + ssptrf.f ssptri.f ssptrs.f sstegr.f sstev.f sstevd.f sstevr.f + ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f ssygst.f ssygv.f ssygvd.f ssygvx.f ssyrfs.f ssysv.f ssysvx.f ssytd2.f ssytf2.f ssytrd.f ssytrf.f ssytri.f ssytri2.f ssytri2x.f ssyswapr.f ssytrs.f ssytrs2.f @@ -116,7 +118,7 @@ set(SLASRC ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f - scombssq.f sgesvdq.f slaorhr_col_getrfnp.f + sgesvdq.f slaorhr_col_getrfnp.f slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f ) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f @@ -229,7 +231,7 @@ set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_lin_berr.f clarscl2.f clascl2.f cla_wwaddw.f) set(DLASRC - dbdsvdx.f dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f + dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f @@ -270,8 +272,8 @@ set(DLASRC dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f - dsptrf.f dsptri.f dsptrs.f dstegr.f dstein.f dstev.f dstevd.f dstevr.f - dstevx.f dsycon.f dsyev.f dsyevd.f dsyevr.f + dsptrf.f dsptri.f dsptrs.f dstegr.f dstev.f dstevd.f dstevr.f + dsycon.f dsyev.f dsyevd.f dsyevr.f dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f dsysv.f dsysvx.f dsytd2.f dsytf2.f dsytrd.f 
dsytrf.f dsytri.f dsytrs.f dsytrs2.f @@ -474,12 +476,16 @@ endif() if(BUILD_COMPLEX) set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX}) SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN}) - message(STATUS "Building Complex Precision") + message(STATUS "Building Single Precision Complex") endif() if(BUILD_COMPLEX16) set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX}) SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN}) - message(STATUS "Building Double Complex Precision") +# for zlange/zlanhe + if (NOT BUILD_DOUBLE) + set (LA_REL_SRC ${LA_REL_SRC} dcombssq.f) + endif () + message(STATUS "Building Double Precision Complex") endif() # add lapack-netlib folder to the sources diff --git a/cmake/system.cmake b/cmake/system.cmake index 3729f6c62e..a504530fb4 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -70,9 +70,6 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() - if (DEFINED HAVE_SSE3) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() endif() if (DEFINED TARGET) @@ -326,13 +323,7 @@ else () set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048") endif () endif () -if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") -if (DEFINED BLAS3_MEM_ALLOC_THRESHOLD) -if (NOT ${BLAS3_MEM_ALLOC_THRESHOLD} EQUAL 32) -set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_THRESHOLD}") -endif() -endif() -endif() + if (DEFINED LIBNAMESUFFIX) set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") else () @@ -410,16 +401,20 @@ if (NOT BUILD_SINGLE AND NOT BUILD_DOUBLE AND NOT BUILD_COMPLEX AND NOT BUILD_CO set (BUILD_COMPLEX16 ON) endif() if (BUILD_SINGLE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE=1") + set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") endif() if (BUILD_DOUBLE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1") + 
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") endif() if (BUILD_COMPLEX) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1") + set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_COMPLEX=1") endif() if (BUILD_COMPLEX16) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1") + set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_COMPLEX16=1") endif() if(NOT MSVC) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index b0ab926fcf..fdc79c8cee 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -121,6 +121,6 @@ endif() include(CheckIncludeFile) CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) -if (HAVE_C11) +if (HAVE_C11 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_C11") endif() From e396ec8b56511d84930e849b08d825af62b821a7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 15:11:15 +0200 Subject: [PATCH 277/349] Allow building support for only a subset of variable types --- CMakeLists.txt | 28 +++++---- Makefile | 15 ++++- Makefile.rule | 32 +++------- Makefile.tail | 4 +- common_param.h | 166 ++++++++++++++++++++++++++++++++++++++++++------- 5 files changed, 182 insertions(+), 63 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 954c053e46..f43e0e0fc0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,8 +29,10 @@ option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding proc else() set(NO_AFFINITY 1) endif() -option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) -option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) +option(BUILD_SINGLE "Single precision" OFF) +option(BUILD_DOUBLE "Double precision" OFF) +option(BUILD_COMPLEX "Single precision" 
OFF) +option(BUILD_COMPLEX16 "Single precision" OFF) # Add a prefix or suffix to all exported symbol names in the shared library. # Avoids conflicts with other BLAS libraries, especially when using @@ -108,28 +110,33 @@ endif() set(FLOAT_TYPES "") if (BUILD_SINGLE) - message(STATUS "Building Single Precision") - list(APPEND FLOAT_TYPES "SINGLE") # defines nothing + message(STATUS "Building Songle Precision") + list(APPEND FLOAT_TYPES "SINGLE") + # set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") endif () if (BUILD_DOUBLE) message(STATUS "Building Double Precision") - list(APPEND FLOAT_TYPES "DOUBLE") # defines DOUBLE + list(APPEND FLOAT_TYPES "DOUBLE") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1") endif () if (BUILD_COMPLEX) message(STATUS "Building Complex Precision") - list(APPEND FLOAT_TYPES "COMPLEX") # defines COMPLEX -endif () + list(APPEND FLOAT_TYPES "COMPLEX") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1") +endif () if (BUILD_COMPLEX16) message(STATUS "Building Double Complex Precision") - list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE + list(APPEND FLOAT_TYPES "ZCOMPLEX") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1") endif () if (BUILD_HALF) message(STATUS "Building Half Precision") - list(APPEND FLOAT_TYPES "HALF") # defines nothing + list(APPEND FLOAT_TYPES "HALF") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_HALF") endif () if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") @@ -236,9 +243,6 @@ if (NOT MSVC AND NOT NOFORTRAN) add_subdirectory(ctest) endif() add_subdirectory(lapack-netlib/TESTING) - if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) - add_subdirectory(cpp_thread_test) - endif() endif() set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES diff --git a/Makefile b/Makefile index 93e8af2eb4..a9af62a22f 100644 --- a/Makefile +++ b/Makefile @@ -146,9 +146,6 @@ ifneq ($(NO_CBLAS), 1) ifeq ($(CPP_THREAD_SAFETY_TEST), 1) $(MAKE) -C cpp_thread_test all endif -ifeq 
($(CPP_THREAD_SAFETY_GEMV), 1) - $(MAKE) -C cpp_thread_test dgemv_tester -endif endif endif @@ -304,6 +301,18 @@ else endif ifeq ($(BUILD_LAPACK_DEPRECATED), 1) -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_SINGLE), 1) + -@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_DOUBLE), 1) + -@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_COMPLEX), 1) + -@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_COMPLEX16), 1) + -@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif -@echo "LAPACKE_WITH_TMG = 1" >> $(NETLIB_LAPACK_DIR)/make.inc -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/Makefile.rule b/Makefile.rule index 635e02c024..09dfb08813 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -272,33 +272,17 @@ COMMON_PROF = -pg # work at all. # # CPP_THREAD_SAFETY_TEST = 1 -# -# use this to run only the less memory-hungry GEMV test -# CPP_THREAD_SAFETY_GEMV = 1 # If you want to enable the experimental BFLOAT16 support # BUILD_HALF = 1 - - -# Set the thread number threshold beyond which the job array for the threaded level3 BLAS -# will be allocated on the heap rather than the stack. (This array alone requires -# NUM_THREADS*NUM_THREADS*128 bytes of memory so should not pose a problem at low cpu -# counts, but obviously it is not the only item that ends up on the stack. -# The default value of 32 ensures that the overall requirement is compatible -# with the default 1MB stacksize imposed by having the Java VM loaded without use -# of its -Xss parameter. -# The value of 160 formerly used from about version 0.2.7 until 0.3.10 is easily compatible -# with the common Linux stacksize of 8MB but will cause crashes with unwary use of the java -# VM e.g. 
in Octave or with the java-based libhdfs in numpy or scipy code -# BLAS3_MEM_ALLOC_THRESHOLD = 160 - - - -# the below is not yet configurable, use cmake if you need to build only select types -BUILD_SINGLE = 1 -BUILD_DOUBLE = 1 -BUILD_COMPLEX = 1 -BUILD_COMPLEX16 = 1 +# +# Select if you need to build only select types +# BUILD_SINGLE = 1 +# BUILD_DOUBLE = 1 +# BUILD_COMPLEX = 1 +# BUILD_COMPLEX16 = 1 +# +# # End of user configuration # diff --git a/Makefile.tail b/Makefile.tail index cfc4a36fca..6410824503 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -11,8 +11,8 @@ COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) -BLASOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -BLASOBJS_P = $(SHEXTOBJS_P) $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) +BLASOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(CBAUXOBJS) +BLASOBJS_P = $(SHEXTOBJS_P) $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P) ifdef EXPRECISION BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) diff --git a/common_param.h b/common_param.h index a52de98ab8..81b479e537 100644 --- a/common_param.h +++ b/common_param.h @@ -146,40 +146,56 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); #endif + +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16) int sgemm_p, sgemm_q, sgemm_r; int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; +#endif int exclusive_cache; +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) float (*samax_k) (BLASLONG, float *, BLASLONG); float (*samin_k) (BLASLONG, float *, BLASLONG); float (*smax_k) (BLASLONG, float *, BLASLONG); float (*smin_k) (BLASLONG, float *, BLASLONG); + BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); BLASLONG 
(*ismax_k) (BLASLONG, float *, BLASLONG); BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); - float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG); +#endif +#if BUILD_SINGLE float (*ssum_k) (BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16) int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#endif +#if BUILD_SINGLE int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) #ifdef ARCH_X86_64 void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); int (*sgemm_direct_performant) 
(BLASLONG M, BLASLONG N, BLASLONG K); @@ -193,7 +209,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -215,7 +232,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - +#endif +#if BUILD_SINGLE int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -242,13 +260,18 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) int dgemm_p, dgemm_q, dgemm_r; int 
dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) double (*damax_k) (BLASLONG, double *, BLASLONG); double (*damin_k) (BLASLONG, double *, BLASLONG); double (*dmax_k) (BLASLONG, double *, BLASLONG); @@ -257,25 +280,37 @@ BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG); BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG); +#endif +#if BUILD_DOUBLE double (*dsum_k) (BLASLONG, double *, BLASLONG); +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) + double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); - int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +#endif +#if BUILD_DOUBLE int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); 
int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -283,7 +318,8 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - +#endif +#if BUILD_DOUBLE int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -335,7 +371,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); - +#endif #ifdef EXPRECISION int qgemm_p, qgemm_q, qgemm_r; @@ -430,22 +466,29 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); #endif +#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) int cgemm_p, cgemm_q, cgemm_r; int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; - float (*camax_k) (BLASLONG, float *, BLASLONG); float (*camin_k) (BLASLONG, float *, BLASLONG); BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); +#endif +#if BUILD_COMPLEX float (*cnrm2_k) (BLASLONG, float *, BLASLONG); float (*casum_k) (BLASLONG, float *, BLASLONG); float (*csum_k) (BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_COMPLEX)|| (BUILD_COMPLEX16) int (*ccopy_k) (BLASLONG, float *, 
BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); +#endif +#if BUILD_COMPLEX int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - +#endif +#if (BUILD_COMPLEX)|| (BUILD_COMPLEX16) int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -459,6 +502,8 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#endif +#if (BUILD_COMPLEX) int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); @@ -470,13 +515,14 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); 
+#endif +#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); @@ -507,6 +553,8 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); +#endif +#if (BUILD_COMPLEX) int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -590,10 +638,13 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - +#endif +#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +#endif +#if 
BUILD_COMPLEX16 int zgemm_p, zgemm_q, zgemm_r; int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; @@ -757,6 +808,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*zneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); +#endif #ifdef EXPRECISION @@ -930,22 +982,34 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); void (*init)(void); int snum_opt, dnum_opt, qnum_opt; - +#if BUILD_SINGLE int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); +#endif +#if BUILD_DOUBLE int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); +#endif +#if BUILD_COMPLEX int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG); +#endif +#if BUILD_COMPLEX16 int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG); +#endif +#if BUILD_SINGLE int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); +#endif +#if BUILD_DOUBLE int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); +#endif +#if BUILD_COMPLEX int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, 
BLASLONG); @@ -955,7 +1019,9 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); +#endif +#if BUILD_COMPLEX16 int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); @@ -965,17 +1031,23 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); +#endif +#if BUILD_SINGLE int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); +#endif +#if BUILD_DOUBLE int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); +#endif +#if BUILD_COMPLEX int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); @@ -985,7 +1057,9 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, 
BLASLONG); int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); +#endif +#if BUILD_COMPLEX16 int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); @@ -995,12 +1069,20 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); +#endif +#if BUILD_SINGLE int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); +#endif +#if BUILD_DOUBLE int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); +#endif +#if BUILD_COMPLEX int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); +#endif +#if BUILD_COMPLEX16 int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); - +#endif } gotoblas_t; extern gotoblas_t *gotoblas; @@ -1021,19 +1103,31 @@ extern gotoblas_t *gotoblas; #define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn #endif +#if (BUILD_SINGLE) #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_R gotoblas -> sgemm_r #define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m #define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n #define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn +#endif +#if (BUILD_DOUBLE) #define DGEMM_P gotoblas -> dgemm_p #define DGEMM_Q gotoblas -> dgemm_q #define DGEMM_R gotoblas -> dgemm_r #define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m #define DGEMM_UNROLL_N gotoblas -> 
dgemm_unroll_n #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn +#if ! (BUILD_SINGLE) +#define SGEMM_P gotoblas -> sgemm_p +#define SGEMM_Q gotoblas -> sgemm_q +#define SGEMM_R gotoblas -> sgemm_r +#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m +#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n +#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn +#endif +#endif #define QGEMM_P gotoblas -> qgemm_p #define QGEMM_Q gotoblas -> qgemm_q @@ -1042,19 +1136,47 @@ extern gotoblas_t *gotoblas; #define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n #define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn +#if BUILD_COMPLEX #define CGEMM_P gotoblas -> cgemm_p #define CGEMM_Q gotoblas -> cgemm_q #define CGEMM_R gotoblas -> cgemm_r #define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m #define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n #define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn +#ifndef BUILD_SINGLE +#define SGEMM_P gotoblas -> sgemm_p +#define SGEMM_Q gotoblas -> sgemm_q +#define SGEMM_R 1024 +#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m +#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n +#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn +#endif +#endif +#if BUILD_COMPLEX16 #define ZGEMM_P gotoblas -> zgemm_p #define ZGEMM_Q gotoblas -> zgemm_q #define ZGEMM_R gotoblas -> zgemm_r #define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m #define ZGEMM_UNROLL_N gotoblas -> zgemm_unroll_n #define ZGEMM_UNROLL_MN gotoblas -> zgemm_unroll_mn +#ifndef BUILD_DOUBLE +#define DGEMM_P gotoblas -> dgemm_p +#define DGEMM_Q gotoblas -> dgemm_q +#define DGEMM_R 1024 +#define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m +#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n +#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn +#endif +#ifndef BUILD_COMPLEX +#define CGEMM_P gotoblas -> cgemm_p +#define CGEMM_Q gotoblas -> cgemm_q +#define CGEMM_R gotoblas -> cgemm_r +#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m +#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n +#define CGEMM_UNROLL_MN gotoblas 
-> cgemm_unroll_mn +#endif +#endif #define XGEMM_P gotoblas -> xgemm_p #define XGEMM_Q gotoblas -> xgemm_q @@ -1222,7 +1344,7 @@ extern gotoblas_t *gotoblas; #endif #ifndef COMPLEX -#if defined(XDOUBLE) +#if (XDOUBLE) #define GEMM_P QGEMM_P #define GEMM_Q QGEMM_Q #define GEMM_R QGEMM_R @@ -1246,7 +1368,7 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_R DGEMM_DEFAULT_R #define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#elif defined(HALF) +#elif (HALF) #define GEMM_P SHGEMM_P #define GEMM_Q SHGEMM_Q #define GEMM_R SHGEMM_R @@ -1272,7 +1394,7 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N #endif #else -#if defined(XDOUBLE) +#if (XDOUBLE) #define GEMM_P XGEMM_P #define GEMM_Q XGEMM_Q #define GEMM_R XGEMM_R @@ -1386,7 +1508,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_P #ifdef XDOUBLE #define GEMM3M_P XGEMM3M_P -#elif defined(DOUBLE) +#elif defined (DOUBLE) #define GEMM3M_P ZGEMM3M_P #else #define GEMM3M_P CGEMM3M_P @@ -1396,7 +1518,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_Q #ifdef XDOUBLE #define GEMM3M_Q XGEMM3M_Q -#elif defined(DOUBLE) +#elif defined (DOUBLE) #define GEMM3M_Q ZGEMM3M_Q #else #define GEMM3M_Q CGEMM3M_Q @@ -1406,7 +1528,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_R #ifdef XDOUBLE #define GEMM3M_R XGEMM3M_R -#elif defined(DOUBLE) +#elif defined (DOUBLE) #define GEMM3M_R ZGEMM3M_R #else #define GEMM3M_R CGEMM3M_R From 5f23bdf437e6605f2fd36e3500026501f79eb134 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 17:23:08 +0200 Subject: [PATCH 278/349] remove debug output --- test/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/test/Makefile b/test/Makefile index a3966756d5..069d7880ae 100644 --- a/test/Makefile +++ b/test/Makefile @@ -7,7 +7,6 @@ all :: else all :: level1 level2 level3 endif -$(info buildvars [$(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16)]) ifeq 
($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1x1x1) level1: sblat1 dblat1 cblat1 zblat1 endif From 8c5e08076ea8779d19f072254bcaadd15b495acc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 17:33:51 +0200 Subject: [PATCH 279/349] If none of the BUILD_ options is set, enable them all --- Makefile.system | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index c46c88581f..501b161aeb 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,7 +9,7 @@ ifndef TOPDIR TOPDIR = . endif -# If ARCH is not set, we use the host system's architecture for getarch compile options. + # If ARCH is not set, we use the host system's architecture for getarch compile options. ifndef ARCH HOSTARCH := $(shell uname -m) else @@ -73,6 +73,18 @@ endif # # Beginning of system configuration # +ifneq ($(BUILD_SINGLE),1) +ifneq ($(BUILD_DOUBLE),1) +ifneq ($(BUILD_COMPLEX),1) +ifneq ($(BUILD_COMPLEX16),1) +override BUILD_SINGLE=1 +override BUILD_DOUBLE=1 +override BUILD_COMPLEX=1 +override BUILD_COMPLEX16=1 +endif +endif +endif +endif ifndef HOSTCC HOSTCC = $(CC) From 1da32cc1fc3b7602619f56e6243aaa7e225b504d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 17:45:41 +0200 Subject: [PATCH 280/349] Add cblas_xerbla interface --- Makefile.tail | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.tail b/Makefile.tail index cfc4a36fca..6410824503 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -11,8 +11,8 @@ COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) -BLASOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -BLASOBJS_P = $(SHEXTOBJS_P) $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) +BLASOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(CBAUXOBJS) +BLASOBJS_P = $(SHEXTOBJS_P) $(SHBLASOBJS_P) 
$(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P) ifdef EXPRECISION BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) From ae8b0d257a134b5630248f97b803a090bc51e31a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 18:08:21 +0200 Subject: [PATCH 281/349] Set BUILD_ options to 1 instead of just defining them --- Makefile.system | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile.system b/Makefile.system index 501b161aeb..eb6e14a982 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1236,16 +1236,16 @@ ifeq ($(BUILD_HALF), 1) CCOMMON_OPT += -DBUILD_HALF endif ifeq ($(BUILD_SINGLE), 1) -CCOMMON_OPT += -DBUILD_SINGLE +CCOMMON_OPT += -DBUILD_SINGLE=1 endif ifeq ($(BUILD_DOUBLE), 1) -CCOMMON_OPT += -DBUILD_DOUBLE +CCOMMON_OPT += -DBUILD_DOUBLE=1 endif ifeq ($(BUILD_COMPLEX), 1) -CCOMMON_OPT += -DBUILD_COMPLEX +CCOMMON_OPT += -DBUILD_COMPLEX=1 endif ifeq ($(BUILD_COMPLEX16), 1) -CCOMMON_OPT += -DBUILD_COMPLEX16 +CCOMMON_OPT += -DBUILD_COMPLEX16=1 endif CCOMMON_OPT += -DVERSION=\"$(VERSION)\" From 6154f72d6dc241260a53f9a9e424f18dd3f0f943 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 18:25:16 +0200 Subject: [PATCH 282/349] Copy BUILD_ settings to the LAPACK make.inc --- Makefile | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Makefile b/Makefile index 93e8af2eb4..6e7b31b1a6 100644 --- a/Makefile +++ b/Makefile @@ -304,6 +304,18 @@ else endif ifeq ($(BUILD_LAPACK_DEPRECATED), 1) -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_SINGLE), 1) + -@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_DOUBLE), 1) + -@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_COMPLEX), 1) + -@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_COMPLEX16), 1) + -@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif -@echo "LAPACKE_WITH_TMG = 1" >> 
$(NETLIB_LAPACK_DIR)/make.inc -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc From caa0d757cac13c59fa9ff763f4ccc91d73ffc5c0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 18:29:34 +0200 Subject: [PATCH 283/349] repair TABs --- Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 6e7b31b1a6..22f7314d9f 100644 --- a/Makefile +++ b/Makefile @@ -306,16 +306,16 @@ ifeq ($(BUILD_LAPACK_DEPRECATED), 1) -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif ifeq ($(BUILD_SINGLE), 1) - -@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif ifeq ($(BUILD_DOUBLE), 1) - -@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif ifeq ($(BUILD_COMPLEX), 1) - -@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif ifeq ($(BUILD_COMPLEX16), 1) - -@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif -@echo "LAPACKE_WITH_TMG = 1" >> $(NETLIB_LAPACK_DIR)/make.inc -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc From d314d1f49f1cde993b1daa53a748303d853b4503 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:37:38 +0200 Subject: [PATCH 284/349] Rename shgemm_kernel_power10.c to sbgemm_kernel_power10.c --- kernel/power/{shgemm_kernel_power10.c => sbgemm_kernel_power10.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kernel/power/{shgemm_kernel_power10.c => sbgemm_kernel_power10.c} (100%) diff --git a/kernel/power/shgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c similarity index 100% rename from kernel/power/shgemm_kernel_power10.c rename to kernel/power/sbgemm_kernel_power10.c From 9ae80490e050c2526ce426b9557ff1a981142218 Mon Sep 17 00:00:00 2001 From: Martin Kroeker 
Date: Sun, 11 Oct 2020 23:39:42 +0200 Subject: [PATCH 285/349] rename "HALF" and "sh" to "BFLOAT16" and "sb" --- kernel/power/sbgemm_kernel_power10.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/power/sbgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c index 1ae9e04bf0..46d82598af 100644 --- a/kernel/power/sbgemm_kernel_power10.c +++ b/kernel/power/sbgemm_kernel_power10.c @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "common.h" #include -#if defined(HALF) && defined(HALFCONVERSION) +#if defined(BFLOAT16) && defined(BFLOAT16CONVERSION) static float bfloat16tof32 (bfloat16 f16) { @@ -131,7 +131,7 @@ vector char mask = #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); /************************************************************************************* -* SHGEMM Kernel +* SBGEMM Kernel *************************************************************************************/ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, From d7dd9b396c3385e7eeb63cafb38778e74e31f16f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:40:43 +0200 Subject: [PATCH 286/349] Rename shdot.c to sbdot.c --- kernel/x86_64/{shdot.c => sbdot.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename kernel/x86_64/{shdot.c => sbdot.c} (100%) diff --git a/kernel/x86_64/shdot.c b/kernel/x86_64/sbdot.c similarity index 100% rename from kernel/x86_64/shdot.c rename to kernel/x86_64/sbdot.c From 68ce719faca5b17a3fd91ace87f474e6b255d358 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:41:13 +0200 Subject: [PATCH 287/349] Rename shdot_microk_cooperlake.c to sbdot_microk_cooperlake.c --- .../{shdot_microk_cooperlake.c => sbdot_microk_cooperlake.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename 
kernel/x86_64/{shdot_microk_cooperlake.c => sbdot_microk_cooperlake.c} (100%) diff --git a/kernel/x86_64/shdot_microk_cooperlake.c b/kernel/x86_64/sbdot_microk_cooperlake.c similarity index 100% rename from kernel/x86_64/shdot_microk_cooperlake.c rename to kernel/x86_64/sbdot_microk_cooperlake.c From fd942360421e0cfb82220042a2396479e9ec3383 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:42:07 +0200 Subject: [PATCH 288/349] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- kernel/x86_64/sbdot.c | 18 +++++++++--------- kernel/x86_64/sbdot_microk_cooperlake.c | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/x86_64/sbdot.c b/kernel/x86_64/sbdot.c index 5073fda2a6..ef14fd6186 100644 --- a/kernel/x86_64/sbdot.c +++ b/kernel/x86_64/sbdot.c @@ -28,16 +28,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #if defined(COOPERLAKE) -#include "shdot_microk_cooperlake.c" +#include "sbdot_microk_cooperlake.c" #endif -static float shdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y) +static float sbdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y) { float d = 0.0; -#ifdef HAVE_SHDOT_ACCL_KERNEL +#ifdef HAVE_SBDOT_ACCL_KERNEL if ((inc_x == 1) && (inc_y == 1)) { - return shdot_accl_kernel(n, x, y); + return sbdot_accl_kernel(n, x, y); } #endif @@ -56,11 +56,11 @@ static float shdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, } #if defined(SMP) -static int shdot_thread_func(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, bfloat16 dummy2, +static int sbdot_thread_func(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, bfloat16 dummy2, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y, float *result, BLASLONG dummy3) { - *(float *)result = shdot_compute(n, x, inc_x, y, inc_y); + *(float *)result = sbdot_compute(n, x, inc_x, y, inc_y); return 0; } @@ -94,13 +94,13 @@ float CNAME(BLASLONG n, bfloat16 *x, 
BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y } if (nthreads <= 1) { - dot_result = shdot_compute(n, x, inc_x, y, inc_y); + dot_result = sbdot_compute(n, x, inc_x, y, inc_y); } else { char thread_result[MAX_CPU_NUMBER * sizeof(double) * 2]; int mode = BLAS_BFLOAT16 | BLAS_REAL; blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, y, inc_y, thread_result, 0, - (void *)shdot_thread_func, nthreads); + (void *)sbdot_thread_func, nthreads); float * ptr = (float *)thread_result; for (int i = 0; i < nthreads; i++) { dot_result += (*ptr); @@ -108,7 +108,7 @@ float CNAME(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y } } #else - dot_result = shdot_compute(n, x, inc_x, y, inc_y); + dot_result = sbdot_compute(n, x, inc_x, y, inc_y); #endif return dot_result; diff --git a/kernel/x86_64/sbdot_microk_cooperlake.c b/kernel/x86_64/sbdot_microk_cooperlake.c index e645296f18..067726cb1c 100644 --- a/kernel/x86_64/sbdot_microk_cooperlake.c +++ b/kernel/x86_64/sbdot_microk_cooperlake.c @@ -28,11 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* need a new enough GCC for avx512 support */ #if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) -#define HAVE_SHDOT_ACCL_KERNEL 1 +#define HAVE_SBDOT_ACCL_KERNEL 1 #include "common.h" #include -static float shdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) +static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) { __m128 accum128 = _mm_setzero_ps(); if (n> 127) { /* n range from 128 to inf. 
*/ From 4db09c6cec22711f8ec1588bc9d01d7db9e91478 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:42:45 +0200 Subject: [PATCH 289/349] Rename compare_sgemm_shgemm.c to compare_sgemm_sbgemm.c --- test/{compare_sgemm_shgemm.c => compare_sgemm_sbgemm.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/{compare_sgemm_shgemm.c => compare_sgemm_sbgemm.c} (100%) diff --git a/test/compare_sgemm_shgemm.c b/test/compare_sgemm_sbgemm.c similarity index 100% rename from test/compare_sgemm_shgemm.c rename to test/compare_sgemm_sbgemm.c From 924fd806d0737ab6faabab8fab26a102073ebbfd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:43:36 +0200 Subject: [PATCH 290/349] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- test/compare_sgemm_sbgemm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c index 57aee7b8f1..3d4eb2515e 100644 --- a/test/compare_sgemm_sbgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include "../common.h" #define SGEMM BLASFUNC(sgemm) -#define SHGEMM BLASFUNC(shgemm) +#define SBGEMM BLASFUNC(sbgemm) typedef union { unsigned short v; @@ -102,7 +102,7 @@ main (int argc, char *argv[]) } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m); - SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, + SBGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, &m, BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) for (j = 0; j < m; j++) @@ -126,6 +126,6 @@ main (int argc, char *argv[]) } } if (ret != 0) - fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); + fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret); return ret; } From 5800758b43e26d873a89ace25bafff947980a5c9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:44:38 +0200 Subject: [PATCH 291/349] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- test/Makefile | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/test/Makefile b/test/Makefile index 069d7880ae..06fb7fe86f 100644 --- a/test/Makefile +++ b/test/Makefile @@ -214,16 +214,16 @@ endif -#ifeq ($(BUILD_HALF),1) -#level3 : test_shgemm sblat3 dblat3 cblat3 zblat3 +#ifeq ($(BUILD_BFLOAT16),1) +#level3 : test_sbgemm sblat3 dblat3 cblat3 zblat3 #else #level3 : sblat3 dblat3 cblat3 zblat3 #endif ifndef CROSS rm -f ?BLAT3.SUMM -ifeq ($(BUILD_HALF),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_shgemm > SHBLAT3.SUMM +ifeq ($(BUILD_BFLOAT16),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SHBLAT3.SUMM @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_SINGLE),1) @@ -245,8 +245,8 @@ endif ifdef SMP rm -f ?BLAT3.SUMM ifeq ($(USE_OPENMP), 1) -ifeq ($(BUILD_HALF),1) - OMP_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM +ifeq ($(BUILD_BFLOAT16),1) + OMP_NUM_THREADS=2 ./test_sbgemm > SHBLAT3.SUMM @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_SINGLE),1) @@ -266,8 +266,8 @@ 
ifeq ($(BUILD_COMPLEX16),1) @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 endif else -ifeq ($(BUILD_HALF),1) - OPENBLAS_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM +ifeq ($(BUILD_BFLOAT16),1) + OPENBLAS_NUM_THREADS=2 ./test_sbgemm > SHBLAT3.SUMM @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_SINGLE),1) @@ -377,9 +377,9 @@ zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif -ifeq ($(BUILD_HALF),1) -test_shgemm : compare_sgemm_shgemm.c ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o test_shgemm compare_sgemm_shgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(BUILD_BFLOAT16),1) +test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif ifeq ($(BUILD_COMPLEX),1) @@ -398,7 +398,7 @@ clean: @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ sblat1 dblat1 cblat1 zblat1 \ sblat2 dblat2 cblat2 zblat2 \ - test_shgemm sblat3 dblat3 cblat3 zblat3 \ + test_sbgemm sblat3 dblat3 cblat3 zblat3 \ sblat1p dblat1p cblat1p zblat1p \ sblat2p dblat2p cblat2p zblat2p \ sblat3p dblat3p cblat3p zblat3p \ From ca31c32693bbb70cd8eeee5f2be09a7e9d1b363c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:49:22 +0200 Subject: [PATCH 292/349] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- cblas.h | 2 +- common.h | 4 +- common_interface.h | 4 +- common_level1.h | 2 +- common_level3.h | 28 +++--- common_macro.h | 94 +++++++++--------- common_param.h | 230 ++++++++++++++++++--------------------------- getarch_2nd.c | 4 +- param.h | 32 +++---- 9 files changed, 178 insertions(+), 222 deletions(-) diff --git a/cblas.h b/cblas.h index 21f3958f24..4fc6f86812 100644 --- a/cblas.h +++ b/cblas.h @@ -392,7 +392,7 @@ void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE /* convert BFLOAT16 array to double array */ void 
cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout); /* dot production of BFLOAT16 input arrays, and output as float */ -float cblas_shdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); +float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); #ifdef __cplusplus } diff --git a/common.h b/common.h index ab287262c1..89eeb197db 100644 --- a/common.h +++ b/common.h @@ -260,7 +260,7 @@ typedef unsigned long BLASULONG; #ifndef BFLOAT16 #include typedef uint16_t bfloat16; -#define HALFCONVERSION 1 +#define BFLOAT16CONVERSION 1 #endif #ifdef USE64BITINT @@ -303,7 +303,7 @@ typedef int blasint; #define SIZE 8 #define BASE_SHIFT 3 #define ZBASE_SHIFT 4 -#elif defined(HALF) +#elif defined(BFLOAT16) #define IFLOAT bfloat16 #define XFLOAT IFLOAT #define FLOAT float diff --git a/common_interface.h b/common_interface.h index 35a957aa17..bee09e8941 100644 --- a/common_interface.h +++ b/common_interface.h @@ -54,7 +54,7 @@ double BLASFUNC(dsdot) (blasint *, float *, blasint *, float *, blasint *); double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *); xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *); -float BLASFUNC(shdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *); +float BLASFUNC(sbdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *); void BLASFUNC(shstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *); void BLASFUNC(shdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *); void BLASFUNC(sbf16tos) (blasint *, bfloat16 *, blasint *, float *, blasint *); @@ -474,7 +474,7 @@ void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint /* Level 3 routines */ -void BLASFUNC(shgemm)(char 
*, char *, blasint *, blasint *, blasint *, float *, +void BLASFUNC(sbgemm)(char *, char *, blasint *, blasint *, blasint *, float *, bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *); void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); diff --git a/common_level1.h b/common_level1.h index 88aa275a58..7b17962c48 100644 --- a/common_level1.h +++ b/common_level1.h @@ -46,7 +46,7 @@ float sdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); -float shdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); +float sbdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); void shstobf16_k(BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); void shdtobf16_k(BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); diff --git a/common_level3.h b/common_level3.h index 671a7a0866..c4f9435a98 100644 --- a/common_level3.h +++ b/common_level3.h @@ -55,7 +55,7 @@ void sgemm_direct(BLASLONG M, BLASLONG N, BLASLONG K, int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); -int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, +int sbgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -78,10 +78,10 @@ int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); #endif -int shgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); -int shgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); -int shgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); -int 
shgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int sbgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int sbgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int sbgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int sbgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); int sgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); @@ -505,7 +505,7 @@ int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdoubl int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); -int shgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); +int sbgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); @@ -534,10 +534,10 @@ int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); -int shgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_tn(blas_arg_t *, 
BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); @@ -631,10 +631,10 @@ int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLON int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); #endif -int shgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 3d6bcd9e84..605d74adae 100644 --- a/common_macro.h +++ b/common_macro.h @@ -644,7 +644,7 @@ #define GEADD_K DGEADD_K -#elif defined(HALF) +#elif 
defined(BFLOAT16) #define D_TO_BF16_K SHDTOBF16_K #define D_BF16_TO_K DBF16TOD_K @@ -662,7 +662,7 @@ #define ASUM_K SASUM_K #define DOTU_K SDOTU_K #define DOTC_K SDOTC_K -#define BF16_DOT_K SHDOT_K +#define BF16_DOT_K SBDOT_K #define AXPYU_K SAXPYU_K #define AXPYC_K SAXPYC_K #define AXPBY_K SAXPBY_K @@ -682,32 +682,32 @@ #define NRM2_K SNRM2_K #define SYMV_THREAD_U SSYMV_THREAD_U #define SYMV_THREAD_L SSYMV_THREAD_L -#define GEMM_BETA SHGEMM_BETA -#define GEMM_KERNEL_N SHGEMM_KERNEL -#define GEMM_KERNEL_L SHGEMM_KERNEL -#define GEMM_KERNEL_R SHGEMM_KERNEL -#define GEMM_KERNEL_B SHGEMM_KERNEL - -#define GEMM_NN SHGEMM_NN -#define GEMM_CN SHGEMM_TN -#define GEMM_TN SHGEMM_TN -#define GEMM_NC SHGEMM_NT -#define GEMM_NT SHGEMM_NT -#define GEMM_CC SHGEMM_TT -#define GEMM_CT SHGEMM_TT -#define GEMM_TC SHGEMM_TT -#define GEMM_TT SHGEMM_TT -#define GEMM_NR SHGEMM_NN -#define GEMM_TR SHGEMM_TN -#define GEMM_CR SHGEMM_TN -#define GEMM_RN SHGEMM_NN -#define GEMM_RT SHGEMM_NT -#define GEMM_RC SHGEMM_NT -#define GEMM_RR SHGEMM_NN -#define GEMM_ONCOPY SHGEMM_ONCOPY -#define GEMM_OTCOPY SHGEMM_OTCOPY -#define GEMM_INCOPY SHGEMM_INCOPY -#define GEMM_ITCOPY SHGEMM_ITCOPY +#define GEMM_BETA SBGEMM_BETA +#define GEMM_KERNEL_N SBGEMM_KERNEL +#define GEMM_KERNEL_L SBGEMM_KERNEL +#define GEMM_KERNEL_R SBGEMM_KERNEL +#define GEMM_KERNEL_B SBGEMM_KERNEL + +#define GEMM_NN SBGEMM_NN +#define GEMM_CN SBGEMM_TN +#define GEMM_TN SBGEMM_TN +#define GEMM_NC SBGEMM_NT +#define GEMM_NT SBGEMM_NT +#define GEMM_CC SBGEMM_TT +#define GEMM_CT SBGEMM_TT +#define GEMM_TC SBGEMM_TT +#define GEMM_TT SBGEMM_TT +#define GEMM_NR SBGEMM_NN +#define GEMM_TR SBGEMM_TN +#define GEMM_CR SBGEMM_TN +#define GEMM_RN SBGEMM_NN +#define GEMM_RT SBGEMM_NT +#define GEMM_RC SBGEMM_NT +#define GEMM_RR SBGEMM_NN +#define GEMM_ONCOPY SBGEMM_ONCOPY +#define GEMM_OTCOPY SBGEMM_OTCOPY +#define GEMM_INCOPY SBGEMM_INCOPY +#define GEMM_ITCOPY SBGEMM_ITCOPY #define SYMM_THREAD_LU SSYMM_THREAD_LU #define SYMM_THREAD_LL 
SSYMM_THREAD_LL #define SYMM_THREAD_RU SSYMM_THREAD_RU @@ -723,22 +723,22 @@ #define HEMM_THREAD_RU SHEMM_THREAD_RU #define HEMM_THREAD_RL SHEMM_THREAD_RL -#define GEMM_THREAD_NN SHGEMM_THREAD_NN -#define GEMM_THREAD_CN SHGEMM_THREAD_TN -#define GEMM_THREAD_TN SHGEMM_THREAD_TN -#define GEMM_THREAD_NC SHGEMM_THREAD_NT -#define GEMM_THREAD_NT SHGEMM_THREAD_NT -#define GEMM_THREAD_CC SHGEMM_THREAD_TT -#define GEMM_THREAD_CT SHGEMM_THREAD_TT -#define GEMM_THREAD_TC SHGEMM_THREAD_TT -#define GEMM_THREAD_TT SHGEMM_THREAD_TT -#define GEMM_THREAD_NR SHGEMM_THREAD_NN -#define GEMM_THREAD_TR SHGEMM_THREAD_TN -#define GEMM_THREAD_CR SHGEMM_THREAD_TN -#define GEMM_THREAD_RN SHGEMM_THREAD_NN -#define GEMM_THREAD_RT SHGEMM_THREAD_NT -#define GEMM_THREAD_RC SHGEMM_THREAD_NT -#define GEMM_THREAD_RR SHGEMM_THREAD_NN +#define GEMM_THREAD_NN SBGEMM_THREAD_NN +#define GEMM_THREAD_CN SBGEMM_THREAD_TN +#define GEMM_THREAD_TN SBGEMM_THREAD_TN +#define GEMM_THREAD_NC SBGEMM_THREAD_NT +#define GEMM_THREAD_NT SBGEMM_THREAD_NT +#define GEMM_THREAD_CC SBGEMM_THREAD_TT +#define GEMM_THREAD_CT SBGEMM_THREAD_TT +#define GEMM_THREAD_TC SBGEMM_THREAD_TT +#define GEMM_THREAD_TT SBGEMM_THREAD_TT +#define GEMM_THREAD_NR SBGEMM_THREAD_NN +#define GEMM_THREAD_TR SBGEMM_THREAD_TN +#define GEMM_THREAD_CR SBGEMM_THREAD_TN +#define GEMM_THREAD_RN SBGEMM_THREAD_NN +#define GEMM_THREAD_RT SBGEMM_THREAD_NT +#define GEMM_THREAD_RC SBGEMM_THREAD_NT +#define GEMM_THREAD_RR SBGEMM_THREAD_NN #ifdef UNIT @@ -2491,9 +2491,9 @@ #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; -extern BLASLONG shgemm_p; -extern BLASLONG shgemm_q; -extern BLASLONG shgemm_r; +extern BLASLONG sbgemm_p; +extern BLASLONG sbgemm_q; +extern BLASLONG sbgemm_r; extern BLASLONG sgemm_p; extern BLASLONG sgemm_q; extern BLASLONG sgemm_r; diff --git a/common_param.h b/common_param.h index 0fe5e6c1dc..3615230810 100644 --- 
a/common_param.h +++ b/common_param.h @@ -47,9 +47,9 @@ typedef struct { int dtb_entries; int offsetA, offsetB, align; -#ifdef BUILD_HALF - int shgemm_p, shgemm_q, shgemm_r; - int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; +#ifdef BUILD_BFLOAT16 + int sbgemm_p, sbgemm_q, sbgemm_r; + int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; void (*shstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); void (*shdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); @@ -69,8 +69,8 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); float (*shasum_k) (BLASLONG, float *, BLASLONG); float (*shsum_k) (BLASLONG, float *, BLASLONG); int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - float (*shdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); - double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float (*sbdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); + double (*dsbdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); @@ -78,20 +78,20 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, 
float *); int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); - int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); + int (*sbgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); + int (*sbgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); - int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -147,14 +147,14 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); #endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16) +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) int sgemm_p, sgemm_q, sgemm_r; int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; #endif int exclusive_cache; -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) float (*samax_k) (BLASLONG, 
float *, BLASLONG); float (*samin_k) (BLASLONG, float *, BLASLONG); float (*smax_k) (BLASLONG, float *, BLASLONG); @@ -167,11 +167,10 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG); #endif - -#if BUILD_SINGLE +#ifdef BUILD_SINGLE float (*ssum_k) (BLASLONG, float *, BLASLONG); #endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -179,26 +178,20 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); -#endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16) int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); -#endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); #endif - -#if BUILD_SINGLE +#ifdef BUILD_SINGLE int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); #endif - -#if (BUILD_SINGLE) || 
(BUILD_DOUBLE) || (BUILD_COMPLEX) +#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) #ifdef ARCH_X86_64 void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); @@ -213,8 +206,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); #endif - -#if (BUILD_SINGLE) || (BUILD_DOUBLE) +#ifdef BUILD_SINGLE int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -236,8 +228,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); -#endif -#if BUILD_SINGLE + int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -264,18 +255,17 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); -#endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) + int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, 
BLASLONG, float *); int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); #endif -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int dgemm_p, dgemm_q, dgemm_r; int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; #endif -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) double (*damax_k) (BLASLONG, double *, BLASLONG); double (*damin_k) (BLASLONG, double *, BLASLONG); double (*dmax_k) (BLASLONG, double *, BLASLONG); @@ -286,21 +276,21 @@ BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); #endif -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG); #endif -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE double (*dsum_k) (BLASLONG, double *, BLASLONG); #endif -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); #endif -#if (BUILD_SINGLE) || (BUILD_DOUBLE) +#if defined (BUILD_SINGLE) || defined(BUILD_DOUBLE) double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); #endif -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -308,15 +298,13 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int 
(*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); #endif - -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); #endif - -#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) +#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16) int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -325,8 +313,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); #endif - -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -473,30 +460,23 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); #endif - -#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) +#ifdef BUILD_COMPLEX int cgemm_p, cgemm_q, cgemm_r; int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; + float (*camax_k) (BLASLONG, float *, BLASLONG); float (*camin_k) (BLASLONG, float *, BLASLONG); BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); -#endif -#if BUILD_COMPLEX float (*cnrm2_k) (BLASLONG, float *, BLASLONG); float (*casum_k) (BLASLONG, float *, 
BLASLONG); float (*csum_k) (BLASLONG, float *, BLASLONG); -#endif -#if (BUILD_COMPLEX)|| (BUILD_COMPLEX16) int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); -#endif -#if BUILD_COMPLEX int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); -#endif -#if (BUILD_COMPLEX)|| (BUILD_COMPLEX16) + int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -510,8 +490,6 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); -#endif -#if (BUILD_COMPLEX) int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); @@ -523,14 +501,13 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, 
BLASLONG, float *); int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); -#endif -#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); @@ -561,8 +538,6 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); -#endif -#if (BUILD_COMPLEX) int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -646,14 +621,12 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); -#endif -#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) + int (*cneg_tcopy) (BLASLONG, BLASLONG, 
float *, BLASLONG, float *); int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); #endif - -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 int zgemm_p, zgemm_q, zgemm_r; int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; @@ -991,35 +964,34 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); void (*init)(void); int snum_opt, dnum_opt, qnum_opt; - -#if BUILD_SINGLE +#ifdef BUILD_SINGLE int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); #endif -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); #endif -#if BUILD_COMPLEX +#ifdef BUILD_COMPLEX int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG); #endif -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG); #endif -#if BUILD_SINGLE +#ifdef BUILD_SINGLE int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); #endif -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); #endif -#if BUILD_COMPLEX +#ifdef BUILD_COMPLEX int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rn) (BLASLONG, BLASLONG, 
float, float, float*, BLASLONG, float*, BLASLONG); @@ -1031,7 +1003,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); #endif -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); @@ -1043,21 +1015,21 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); #endif -#if BUILD_SINGLE +#ifdef BUILD_SINGLE int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); #endif -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); #endif -#if BUILD_COMPLEX +#ifdef BUILD_COMPLEX int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); @@ -1069,7 +1041,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); #endif -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, 
double, double, double*, BLASLONG); int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); @@ -1081,16 +1053,16 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); #endif -#if BUILD_SINGLE +#ifdef BUILD_SINGLE int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); #endif -#if BUILD_DOUBLE +#ifdef BUILD_DOUBLE int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); #endif -#if BUILD_COMPLEX +#ifdef BUILD_COMPLEX int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); #endif -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); #endif } gotoblas_t; @@ -1104,16 +1076,16 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 gotoblas -> exclusive_cache -#ifdef BUILD_HALF -#define SHGEMM_P gotoblas -> shgemm_p -#define SHGEMM_Q gotoblas -> shgemm_q -#define SHGEMM_R gotoblas -> shgemm_r -#define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m -#define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n -#define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn +#ifdef BUILD_BFLOAT16 +#define SBGEMM_P gotoblas -> sbgemm_p +#define SBGEMM_Q gotoblas -> sbgemm_q +#define SBGEMM_R gotoblas -> sbgemm_r +#define SBGEMM_UNROLL_M gotoblas -> sbgemm_unroll_m +#define SBGEMM_UNROLL_N gotoblas -> sbgemm_unroll_n +#define SBGEMM_UNROLL_MN gotoblas -> sbgemm_unroll_mn #endif -#if (BUILD_SINGLE) +#if defined (BUILD_SINGLE) #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_R gotoblas -> sgemm_r @@ -1122,21 +1094,13 @@ extern gotoblas_t *gotoblas; #define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn #endif -#if (BUILD_DOUBLE) +#if defined (BUILD_DOUBLE) #define DGEMM_P gotoblas -> dgemm_p #define DGEMM_Q gotoblas -> dgemm_q #define DGEMM_R 
gotoblas -> dgemm_r #define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m #define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn -#if ! (BUILD_SINGLE) -#define SGEMM_P gotoblas -> sgemm_p -#define SGEMM_Q gotoblas -> sgemm_q -#define SGEMM_R gotoblas -> sgemm_r -#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m -#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n -#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn -#endif #endif #define QGEMM_P gotoblas -> qgemm_p @@ -1146,7 +1110,7 @@ extern gotoblas_t *gotoblas; #define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n #define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn -#if BUILD_COMPLEX +#ifdef BUILD_COMPLEX #define CGEMM_P gotoblas -> cgemm_p #define CGEMM_Q gotoblas -> cgemm_q #define CGEMM_R gotoblas -> cgemm_r @@ -1163,7 +1127,7 @@ extern gotoblas_t *gotoblas; #endif #endif -#if BUILD_COMPLEX16 +#ifdef BUILD_COMPLEX16 #define ZGEMM_P gotoblas -> zgemm_p #define ZGEMM_Q gotoblas -> zgemm_q #define ZGEMM_R gotoblas -> zgemm_r @@ -1178,14 +1142,6 @@ extern gotoblas_t *gotoblas; #define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn #endif -#ifndef BUILD_COMPLEX -#define CGEMM_P gotoblas -> cgemm_p -#define CGEMM_Q gotoblas -> cgemm_q -#define CGEMM_R gotoblas -> cgemm_r -#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m -#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n -#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn -#endif #endif #define XGEMM_P gotoblas -> xgemm_p @@ -1230,16 +1186,16 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 0 #endif -#ifdef BUILD_HALF -#define SHGEMM_P SHGEMM_DEFAULT_P -#define SHGEMM_Q SHGEMM_DEFAULT_Q -#define SHGEMM_R SHGEMM_DEFAULT_R -#define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M -#define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N -#ifdef SHGEMM_DEFAULT_UNROLL_MN -#define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN +#ifdef BUILD_BFLOAT16 +#define SBGEMM_P SBGEMM_DEFAULT_P +#define SBGEMM_Q 
SBGEMM_DEFAULT_Q +#define SBGEMM_R SBGEMM_DEFAULT_R +#define SBGEMM_UNROLL_M SBGEMM_DEFAULT_UNROLL_M +#define SBGEMM_UNROLL_N SBGEMM_DEFAULT_UNROLL_N +#ifdef SBGEMM_DEFAULT_UNROLL_MN +#define SBGEMM_UNROLL_MN SBGEMM_DEFAULT_UNROLL_MN #else -#define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) +#define SBGEMM_UNROLL_MN MAX((SBGEMM_UNROLL_M), (SBGEMM_UNROLL_N)) #endif #endif @@ -1354,7 +1310,7 @@ extern gotoblas_t *gotoblas; #endif #ifndef COMPLEX -#if (XDOUBLE) +#if defined(XDOUBLE) #define GEMM_P QGEMM_P #define GEMM_Q QGEMM_Q #define GEMM_R QGEMM_R @@ -1378,18 +1334,18 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_R DGEMM_DEFAULT_R #define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#elif (HALF) -#define GEMM_P SHGEMM_P -#define GEMM_Q SHGEMM_Q -#define GEMM_R SHGEMM_R -#define GEMM_UNROLL_M SHGEMM_UNROLL_M -#define GEMM_UNROLL_N SHGEMM_UNROLL_N -#define GEMM_UNROLL_MN SHGEMM_UNROLL_MN -#define GEMM_DEFAULT_P SHGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R SHGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N +#elif defined(BFLOAT16) +#define GEMM_P SBGEMM_P +#define GEMM_Q SBGEMM_Q +#define GEMM_R SBGEMM_R +#define GEMM_UNROLL_M SBGEMM_UNROLL_M +#define GEMM_UNROLL_N SBGEMM_UNROLL_N +#define GEMM_UNROLL_MN SBGEMM_UNROLL_MN +#define GEMM_DEFAULT_P SBGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q SBGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R SBGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M SBGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N SBGEMM_DEFAULT_UNROLL_N #else #define GEMM_P SGEMM_P #define GEMM_Q SGEMM_Q @@ -1404,7 +1360,7 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N #endif #else -#if (XDOUBLE) +#if defined(XDOUBLE) #define GEMM_P XGEMM_P #define GEMM_Q XGEMM_Q #define GEMM_R XGEMM_R @@ -1475,8 +1431,8 @@ extern gotoblas_t *gotoblas; #define 
GEMM_THREAD gemm_thread_n #endif -#ifndef SHGEMM_DEFAULT_R -#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) +#ifndef SBGEMM_DEFAULT_R +#define SBGEMM_DEFAULT_R (((BUFFER_SIZE - ((SBGEMM_DEFAULT_P * SBGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SBGEMM_DEFAULT_Q * 4) - 15) & ~15UL) #endif #ifndef SGEMM_DEFAULT_R @@ -1518,7 +1474,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_P #ifdef XDOUBLE #define GEMM3M_P XGEMM3M_P -#elif defined (DOUBLE) +#elif defined(DOUBLE) #define GEMM3M_P ZGEMM3M_P #else #define GEMM3M_P CGEMM3M_P @@ -1528,7 +1484,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_Q #ifdef XDOUBLE #define GEMM3M_Q XGEMM3M_Q -#elif defined (DOUBLE) +#elif defined(DOUBLE) #define GEMM3M_Q ZGEMM3M_Q #else #define GEMM3M_Q CGEMM3M_Q @@ -1538,7 +1494,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_R #ifdef XDOUBLE #define GEMM3M_R XGEMM3M_R -#elif defined (DOUBLE) +#elif defined(DOUBLE) #define GEMM3M_R ZGEMM3M_R #else #define GEMM3M_R CGEMM3M_R diff --git a/getarch_2nd.c b/getarch_2nd.c index a1d0ccac8d..c390ef52c6 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -9,8 +9,8 @@ int main(int argc, char **argv) { if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) { - printf("SHGEMM_UNROLL_M=%d\n", SHGEMM_DEFAULT_UNROLL_M); - printf("SHGEMM_UNROLL_N=%d\n", SHGEMM_DEFAULT_UNROLL_N); + printf("SBGEMM_UNROLL_M=%d\n", SBGEMM_DEFAULT_UNROLL_M); + printf("SBGEMM_UNROLL_N=%d\n", SBGEMM_DEFAULT_UNROLL_N); printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); diff --git a/param.h b/param.h index 1ab982dc5e..f3ddde6a19 100644 --- a/param.h +++ b/param.h @@ -72,12 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifndef PARAM_H #define PARAM_H -#define SHGEMM_DEFAULT_UNROLL_N 4 -#define SHGEMM_DEFAULT_UNROLL_M 8 -#define SHGEMM_DEFAULT_UNROLL_MN 32 -#define SHGEMM_DEFAULT_P 256 -#define SHGEMM_DEFAULT_R 256 -#define SHGEMM_DEFAULT_Q 256 +#define SBGEMM_DEFAULT_UNROLL_N 4 +#define SBGEMM_DEFAULT_UNROLL_M 8 +#define SBGEMM_DEFAULT_UNROLL_MN 32 +#define SBGEMM_DEFAULT_P 256 +#define SBGEMM_DEFAULT_R 256 +#define SBGEMM_DEFAULT_Q 256 #ifdef OPTERON #define SNUMOPT 4 @@ -2426,16 +2426,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER10) -#undef SHGEMM_DEFAULT_UNROLL_N -#undef SHGEMM_DEFAULT_UNROLL_M -#undef SHGEMM_DEFAULT_P -#undef SHGEMM_DEFAULT_R -#undef SHGEMM_DEFAULT_Q -#define SHGEMM_DEFAULT_UNROLL_M 16 -#define SHGEMM_DEFAULT_UNROLL_N 8 -#define SHGEMM_DEFAULT_P 832 -#define SHGEMM_DEFAULT_Q 1026 -#define SHGEMM_DEFAULT_R 4096 +#undef SBGEMM_DEFAULT_UNROLL_N +#undef SBGEMM_DEFAULT_UNROLL_M +#undef SBGEMM_DEFAULT_P +#undef SBGEMM_DEFAULT_R +#undef SBGEMM_DEFAULT_Q +#define SBGEMM_DEFAULT_UNROLL_M 16 +#define SBGEMM_DEFAULT_UNROLL_N 8 +#define SBGEMM_DEFAULT_P 832 +#define SBGEMM_DEFAULT_Q 1026 +#define SBGEMM_DEFAULT_R 4096 #endif #if defined(SPARC) && defined(V7) From 573508f0ee04d890dcaf2307728063d2d23371de Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:50:54 +0200 Subject: [PATCH 293/349] Rename common_sh.h to common_sb.h --- common_sh.h => common_sb.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename common_sh.h => common_sb.h (100%) diff --git a/common_sh.h b/common_sb.h similarity index 100% rename from common_sh.h rename to common_sb.h From 3bc8e8c33404d4d3b8f5bd35c662f53fb1c6285c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:51:34 +0200 Subject: [PATCH 294/349] Rename "HALF" and "sh" to "BFLOAT16"and "sb" --- common_sb.h | 110 ++++++++++++++++++++++++++-------------------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/common_sb.h 
b/common_sb.h index 5dc99b3bde..66968ab00f 100644 --- a/common_sb.h +++ b/common_sb.h @@ -1,77 +1,77 @@ -#ifndef COMMON_SH_H -#define COMMON_SH_H +#ifndef COMMON_SB_H +#define COMMON_SB_H #ifndef DYNAMIC_ARCH -#define SHDOT_K shdot_k -#define SHSTOBF16_K shstobf16_k -#define SHDTOBF16_K shdtobf16_k +#define SBDOT_K sbdot_k +#define SBSTOBF16_K sbstobf16_k +#define SBDTOBF16_K sbdtobf16_k #define SBF16TOS_K sbf16tos_k #define DBF16TOD_K dbf16tod_k -#define SHGEMM_ONCOPY shgemm_oncopy -#define SHGEMM_OTCOPY shgemm_otcopy +#define SBGEMM_ONCOPY sbgemm_oncopy +#define SBGEMM_OTCOPY sbgemm_otcopy -#if SHGEMM_DEFAULT_UNROLL_M == SHGEMM_DEFAULT_UNROLL_N -#define SHGEMM_INCOPY shgemm_oncopy -#define SHGEMM_ITCOPY shgemm_otcopy +#if SBGEMM_DEFAULT_UNROLL_M == SBGEMM_DEFAULT_UNROLL_N +#define SBGEMM_INCOPY sbgemm_oncopy +#define SBGEMM_ITCOPY sbgemm_otcopy #else -#define SHGEMM_INCOPY shgemm_incopy -#define SHGEMM_ITCOPY shgemm_itcopy +#define SBGEMM_INCOPY sbgemm_incopy +#define SBGEMM_ITCOPY sbgemm_itcopy #endif -#define SHGEMM_BETA shgemm_beta -#define SHGEMM_KERNEL shgemm_kernel +#define SBGEMM_BETA sbgemm_beta +#define SBGEMM_KERNEL sbgemm_kernel #else -#define SHDOT_K gotoblas -> shdot_k -#define SHSTOBF16_K gotoblas -> shstobf16_k -#define SHDTOBF16_K gotoblas -> shdtobf16_k +#define SBDOT_K gotoblas -> sbdot_k +#define SBSTOBF16_K gotoblas -> sbstobf16_k +#define SBDTOBF16_K gotoblas -> sbdtobf16_k #define SBF16TOS_K gotoblas -> sbf16tos_k #define DBF16TOD_K gotoblas -> dbf16tod_k -#define SHGEMM_ONCOPY gotoblas -> shgemm_oncopy -#define SHGEMM_OTCOPY gotoblas -> shgemm_otcopy -#define SHGEMM_INCOPY gotoblas -> shgemm_incopy -#define SHGEMM_ITCOPY gotoblas -> shgemm_itcopy -#define SHGEMM_BETA gotoblas -> shgemm_beta -#define SHGEMM_KERNEL gotoblas -> shgemm_kernel +#define SBGEMM_ONCOPY gotoblas -> sbgemm_oncopy +#define SBGEMM_OTCOPY gotoblas -> sbgemm_otcopy +#define SBGEMM_INCOPY gotoblas -> sbgemm_incopy +#define SBGEMM_ITCOPY gotoblas -> sbgemm_itcopy +#define 
SBGEMM_BETA gotoblas -> sbgemm_beta +#define SBGEMM_KERNEL gotoblas -> sbgemm_kernel #endif -#define SHGEMM_NN shgemm_nn -#define SHGEMM_CN shgemm_tn -#define SHGEMM_TN shgemm_tn -#define SHGEMM_NC shgemm_nt -#define SHGEMM_NT shgemm_nt -#define SHGEMM_CC shgemm_tt -#define SHGEMM_CT shgemm_tt -#define SHGEMM_TC shgemm_tt -#define SHGEMM_TT shgemm_tt -#define SHGEMM_NR shgemm_nn -#define SHGEMM_TR shgemm_tn -#define SHGEMM_CR shgemm_tn -#define SHGEMM_RN shgemm_nn -#define SHGEMM_RT shgemm_nt -#define SHGEMM_RC shgemm_nt -#define SHGEMM_RR shgemm_nn +#define SBGEMM_NN sbgemm_nn +#define SBGEMM_CN sbgemm_tn +#define SBGEMM_TN sbgemm_tn +#define SBGEMM_NC sbgemm_nt +#define SBGEMM_NT sbgemm_nt +#define SBGEMM_CC sbgemm_tt +#define SBGEMM_CT sbgemm_tt +#define SBGEMM_TC sbgemm_tt +#define SBGEMM_TT sbgemm_tt +#define SBGEMM_NR sbgemm_nn +#define SBGEMM_TR sbgemm_tn +#define SBGEMM_CR sbgemm_tn +#define SBGEMM_RN sbgemm_nn +#define SBGEMM_RT sbgemm_nt +#define SBGEMM_RC sbgemm_nt +#define SBGEMM_RR sbgemm_nn -#define SHGEMM_THREAD_NN shgemm_thread_nn -#define SHGEMM_THREAD_CN shgemm_thread_tn -#define SHGEMM_THREAD_TN shgemm_thread_tn -#define SHGEMM_THREAD_NC shgemm_thread_nt -#define SHGEMM_THREAD_NT shgemm_thread_nt -#define SHGEMM_THREAD_CC shgemm_thread_tt -#define SHGEMM_THREAD_CT shgemm_thread_tt -#define SHGEMM_THREAD_TC shgemm_thread_tt -#define SHGEMM_THREAD_TT shgemm_thread_tt -#define SHGEMM_THREAD_NR shgemm_thread_nn -#define SHGEMM_THREAD_TR shgemm_thread_tn -#define SHGEMM_THREAD_CR shgemm_thread_tn -#define SHGEMM_THREAD_RN shgemm_thread_nn -#define SHGEMM_THREAD_RT shgemm_thread_nt -#define SHGEMM_THREAD_RC shgemm_thread_nt -#define SHGEMM_THREAD_RR shgemm_thread_nn +#define SBGEMM_THREAD_NN sbgemm_thread_nn +#define SBGEMM_THREAD_CN sbgemm_thread_tn +#define SBGEMM_THREAD_TN sbgemm_thread_tn +#define SBGEMM_THREAD_NC sbgemm_thread_nt +#define SBGEMM_THREAD_NT sbgemm_thread_nt +#define SBGEMM_THREAD_CC sbgemm_thread_tt +#define SBGEMM_THREAD_CT 
sbgemm_thread_tt +#define SBGEMM_THREAD_TC sbgemm_thread_tt +#define SBGEMM_THREAD_TT sbgemm_thread_tt +#define SBGEMM_THREAD_NR sbgemm_thread_nn +#define SBGEMM_THREAD_TR sbgemm_thread_tn +#define SBGEMM_THREAD_CR sbgemm_thread_tn +#define SBGEMM_THREAD_RN sbgemm_thread_nn +#define SBGEMM_THREAD_RT sbgemm_thread_nt +#define SBGEMM_THREAD_RC sbgemm_thread_nt +#define SBGEMM_THREAD_RR sbgemm_thread_nn #endif From 32733ded0460841708cde93d50fed735fd35ed5e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:52:45 +0200 Subject: [PATCH 295/349] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- lapack/potrf/potrf_parallel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c index 008fcb8cc0..29364cc051 100644 --- a/lapack/potrf/potrf_parallel.c +++ b/lapack/potrf/potrf_parallel.c @@ -382,7 +382,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; #elif defined(HALF) mode = BLAS_HALF | BLAS_REAL; - mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; + mask = MAX(SBGEMM_UNROLL_M, SBGEMM_UNROLL_N) - 1; #else mode = BLAS_SINGLE | BLAS_REAL; mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; From dc8a1afa6357662736fdf7d4eb73cf65bc7ccde1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:53:50 +0200 Subject: [PATCH 296/349] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- kernel/x86_64/KERNEL | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index d75196974e..4f110f0bf8 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -146,8 +146,8 @@ ifndef XDOTKERNEL XDOTKERNEL = zdot.S endif -ifndef SHDOTKERNEL -SHDOTKERNEL = shdot.c +ifndef SBDOTKERNEL +SBDOTKERNEL = sbdot.c endif ifndef TOBF16KERNEL From 2061f7fdff640635467fcc790500c0a2028955db Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:54:53 +0200 
Subject: [PATCH 297/349] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- kernel/power/KERNEL.POWER10 | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index d0cda7fb66..5cf1660a25 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -7,16 +7,16 @@ else #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -SHGEMM_BETA = ../generic/gemm_beta.c -SHGEMMKERNEL = shgemm_kernel_power10.c -SHGEMMINCOPY = ../generic/gemm_ncopy_16.c -SHGEMMITCOPY = ../generic/gemm_tcopy_16.c -SHGEMMONCOPY = ../generic/gemm_ncopy_8.c -SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c -SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) -SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) -SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) -SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) +SBGEMM_BETA = ../generic/gemm_beta.c +SBGEMMKERNEL = sbgemm_kernel_power10.c +SBGEMMINCOPY = ../generic/gemm_ncopy_16.c +SBGEMMITCOPY = ../generic/gemm_tcopy_16.c +SBGEMMONCOPY = ../generic/gemm_ncopy_8.c +SBGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) +SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) +SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) +SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c From 756062afa5f3de899e6b8dea397c95c8bae848af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 23:56:17 +0200 Subject: [PATCH 298/349] Rename "HALF" and "sh" to "BFLOAT16" and "sb" --- kernel/generic/gemmkernel_2x2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/generic/gemmkernel_2x2.c b/kernel/generic/gemmkernel_2x2.c index cc7bb8e487..bf1c3ae381 100644 --- a/kernel/generic/gemmkernel_2x2.c +++ b/kernel/generic/gemmkernel_2x2.c @@ -1,5 +1,5 @@ #include "common.h" -#if defined(HALF) && 
defined(HALFCONVERSION) +#if defined(BFLOAT16) && defined(BFLOAT16CONVERSION) static float bfloat16tof32 (bfloat16 f16) { From 3aecafad801b05d2606ba2cafa5deb6f6731e8c4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:00:55 +0200 Subject: [PATCH 299/349] Change "HALF" and "sh" to "BFLOAT16" and "sb" --- kernel/CMakeLists.txt | 28 ++++----- kernel/Makefile.L1 | 20 +++---- kernel/Makefile.L3 | 134 +++++++++++++++++++++--------------------- kernel/setparam-ref.c | 68 ++++++++++----------- 4 files changed, 125 insertions(+), 125 deletions(-) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 988b83338a..6d8d759ad8 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -41,8 +41,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) foreach (float_type ${FLOAT_TYPES}) # a bit of metaprogramming here to pull out the appropriate KERNEL var string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "SB") endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type}) @@ -149,8 +149,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "SB") endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) @@ -208,13 +208,13 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) 
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) endif() - foreach (float_type SINGLE DOUBLE HALF) + foreach (float_type SINGLE DOUBLE BFLOAT16) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "HALF") - if (NOT ${BUILD_HALF}) + if (${float_type} STREQUAL "BFLOAT16") + if (NOT ${BUILD_BFLOAT16}) continue () else () - set (float_char "SH") + set (float_char "SB") endif () endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) @@ -254,8 +254,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "SB") endif () if (${float_char}GEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) @@ -620,8 +620,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.LA if(NOT NO_LAPACK) foreach (float_type ${FLOAT_TYPES}) - if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "SB") endif () if (NOT DEFINED ${float_char}NEG_TCOPY) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X") @@ -688,8 +688,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) foreach (float_type ${FLOAT_TYPES}) # a bit of metaprogramming here to pull out the appropriate KERNEL var string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "SB") endif () GenerateNamedObjects("generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false ${float_type}) 
GenerateNamedObjects("generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false ${float_type}) diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index c6576ee07b..6fe6778d0a 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -262,9 +262,9 @@ ifndef XDOTKERNEL XDOTKERNEL = zdot.S endif -ifeq ($(BUILD_HALF),1) -ifndef SHDOTKERNEL -SHDOTKERNEL = ../x86_64/shdot.c +ifeq ($(BUILD_BFLOAT16),1) +ifndef SBDOTKERNEL +SBDOTKERNEL = ../x86_64/sbdot.c endif ifndef TOBF16KERNEL @@ -530,11 +530,11 @@ XBLASOBJS += \ xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) -ifeq ($(BUILD_HALF),1) +ifeq ($(BUILD_BFLOAT16),1) SHBLASOBJS += \ - shdot_k$(TSUFFIX).$(SUFFIX) + sbdot_k$(TSUFFIX).$(SUFFIX) SHEXTOBJS += \ - shstobf16_k$(TSUFFIX).$(SUFFIX) shdtobf16_k$(TSUFFIX).$(SUFFIX) + sbstobf16_k$(TSUFFIX).$(SUFFIX) sbdtobf16_k$(TSUFFIX).$(SUFFIX) SHEXTOBJS += \ sbf16tos_k$(TSUFFIX).$(SUFFIX) dbf16tod_k$(TSUFFIX).$(SUFFIX) endif @@ -757,12 +757,12 @@ $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL $(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ -ifeq ($(BUILD_HALF),1) -$(KDIR)shdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)shdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHDOTKERNEL) +ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)sbdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sbdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ -$(KDIR)shstobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) +$(KDIR)sbstobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) $(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@ -$(KDIR)shdtobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) +$(KDIR)sbdtobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) $(CC) -c 
$(CFLAGS) -DDOUBLE -USINGLE $< -o $@ $(KDIR)sbf16tos_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BF16TOKERNEL) $(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@ diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index e03ed0fad2..65d4290128 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -80,24 +80,24 @@ SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c endif endif -ifeq ($(BUILD_HALF), 1) -ifndef SHGEMMKERNEL -SHGEMM_BETA = ../generic/gemm_beta.c -SHGEMMKERNEL = ../generic/gemmkernel_2x2.c -SHGEMMINCOPY = ../generic/gemm_ncopy_2.c -SHGEMMITCOPY = ../generic/gemm_tcopy_2.c -SHGEMMONCOPY = ../generic/gemm_ncopy_2.c -SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) -SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) -SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) -SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_BFLOAT16), 1) +ifndef SBGEMMKERNEL +SBGEMM_BETA = ../generic/gemm_beta.c +SBGEMMKERNEL = ../generic/gemmkernel_2x2.c +SBGEMMINCOPY = ../generic/gemm_ncopy_2.c +SBGEMMITCOPY = ../generic/gemm_tcopy_2.c +SBGEMMONCOPY = ../generic/gemm_ncopy_2.c +SBGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) +SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) +SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) +SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) endif SHKERNELOBJS += \ - shgemm_kernel$(TSUFFIX).$(SUFFIX) \ - $(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \ - $(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) + sbgemm_kernel$(TSUFFIX).$(SUFFIX) \ + $(SBGEMMINCOPYOBJ) $(SBGEMMITCOPYOBJ) \ + $(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ) endif ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" @@ -149,7 +149,7 @@ XKERNELOBJS += \ $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) -ifeq ($(BUILD_HALF),1) +ifeq ($(BUILD_BFLOAT16),1) SHBLASOBJS += $(SHKERNELOBJS) endif SBLASOBJS += $(SKERNELOBJS) @@ -159,8 +159,8 @@ CBLASOBJS += 
$(CKERNELOBJS) ZBLASOBJS += $(ZKERNELOBJS) XBLASOBJS += $(XKERNELOBJS) -ifeq ($(BUILD_HALF),1) -SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SHBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX) endif ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" @@ -492,11 +492,11 @@ ZBLASOBJS += \ zgeadd_k$(TSUFFIX).$(SUFFIX) endif -ifeq ($(BUILD_HALF), 1) -SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ifeq ($(BUILD_BFLOAT16), 1) +SBGEMMINCOPYOBJ_P = $(SBGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SBGEMMITCOPYOBJ_P = $(SBGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) endif SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) @@ -524,9 +524,9 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -ifeq ($(BUILD_HALF),1) -$(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) @@ -548,35 +548,35 @@ $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ -ifeq ($(BUILD_HALF), 1) +ifeq ($(BUILD_BFLOAT16), 1) -$(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +$(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)$(SHGEMMOTCOPYOBJ) 
: $(KERNELDIR)/$(SHGEMMOTCOPY) +$(KDIR)$(SBGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SBGEMMOTCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmotcopy.s - m4 shgemmotcopy.s > shgemmotcopy_nomacros.s - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ - rm shgemmotcopy.s shgemmotcopy_nomacros.s + $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmotcopy.s + m4 sbgemmotcopy.s > sbgemmotcopy_nomacros.s + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmotcopy_nomacros.s -o $@ + rm sbgemmotcopy.s sbgemmotcopy_nomacros.s else - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif -ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) +ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) -$(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +$(KDIR)$(SBGEMMINCOPYOBJ) : $(KERNELDIR)/$(SBGEMMINCOPY) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) +$(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmitcopy.s - m4 shgemmitcopy.s > shgemmitcopy_nomacros.s - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ - rm shgemmitcopy.s shgemmitcopy_nomacros.s + $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmitcopy.s + m4 sbgemmitcopy.s > sbgemmitcopy_nomacros.s + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmitcopy_nomacros.s -o $@ + rm sbgemmitcopy.s sbgemmitcopy_nomacros.s else - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif endif @@ -746,16 +746,16 @@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif -ifeq ($(BUILD_HALF), 1) +ifeq 
($(BUILD_BFLOAT16), 1) -$(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) +$(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemm_kernel$(TSUFFIX).s - m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ - rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemm_kernel$(TSUFFIX).s + m4 sbgemm_kernel$(TSUFFIX).s > sbgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemm_kernel$(TSUFFIX)_nomacros.s -o $@ + rm sbgemm_kernel$(TSUFFIX).s sbgemm_kernel$(TSUFFIX)_nomacros.s else - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif endif @@ -2375,9 +2375,9 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_ $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -ifeq ($(BUILD_HALF),1) -$(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) @@ -2396,19 +2396,19 @@ $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ -ifeq ($(BUILD_HALF), 1) -$(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_BFLOAT16), 1) +$(SBGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMONCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ 
-$(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +$(SBGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) -$(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) +$(SBGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMINCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -$(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +$(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif endif @@ -2518,9 +2518,9 @@ endif endif -ifeq ($(BUILD_HALF), 1) -$(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_BFLOAT16), 1) +$(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index dd49d8e4ec..72fbf32bf7 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -53,32 +53,32 @@ gotoblas_t TABLE_NAME = { GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, -#ifdef BUILD_HALF +#ifdef BUILD_BFLOAT16 0, 0, 0, - SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N, -#ifdef SHGEMM_DEFAULT_UNROLL_MN - SHGEMM_DEFAULT_UNROLL_MN, + SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N, +#ifdef SBGEMM_DEFAULT_UNROLL_MN + SBGEMM_DEFAULT_UNROLL_MN, #else - MAX(SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N), + MAX(SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N), #endif - shstobf16_kTS, shdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, + sbstobf16_kTS, 
sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, samax_kTS, samin_kTS, smax_kTS, smin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, - snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, shdot_kTS, + snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS, dsdot_kTS, srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, sgemv_nTS, sgemv_tTS, sger_kTS, ssymv_LTS, ssymv_UTS, - shgemm_kernelTS, shgemm_betaTS, -#if SHGEMM_DEFAULT_UNROLL_M != SHGEMM_DEFAULT_UNROLL_N - shgemm_incopyTS, shgemm_itcopyTS, + sbgemm_kernelTS, sbgemm_betaTS, +#if SBGEMM_DEFAULT_UNROLL_M != SBGEMM_DEFAULT_UNROLL_N + sbgemm_incopyTS, sbgemm_itcopyTS, #else - shgemm_oncopyTS, shgemm_otcopyTS, + sbgemm_oncopyTS, sbgemm_otcopyTS, #endif - shgemm_oncopyTS, shgemm_otcopyTS, + sbgemm_oncopyTS, sbgemm_otcopyTS, strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N @@ -830,8 +830,8 @@ gotoblas_t TABLE_NAME = { #if (ARCH_ARM64) static void init_parameter(void) { -#if (BUILD_HALF) - TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#if (BUILD_BFLOAT16) + TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; #endif #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; @@ -846,8 +846,8 @@ static void init_parameter(void) { TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; #endif -#if (BUILD_HALF) - TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#if (BUILD_BFLOAT16) + TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; #endif #if BUILD_SINGLE == 1 TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; @@ -862,8 +862,8 @@ static void init_parameter(void) { TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; #endif -#if (BUILD_HALF) - TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#if (BUILD_BFLOAT16) + TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; #endif #if BUILD_SINGLE == 1 TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; @@ -936,16 +936,16 @@ static void init_parameter(void) { #if (ARCH_POWER) static void init_parameter(void) { -#ifdef BUILD_HALF - TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_p = 
SBGEMM_DEFAULT_P; #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; -#ifdef BUILD_HALF - TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; #endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; @@ -953,8 +953,8 @@ static void init_parameter(void) { TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; -#ifdef BUILD_HALF - TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; #endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; @@ -965,16 +965,16 @@ static void init_parameter(void) { #if (ARCH_ZARCH) static void init_parameter(void) { -#ifdef BUILD_HALF - TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; -#ifdef BUILD_HALF - TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; #endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; @@ -982,8 +982,8 @@ static void init_parameter(void) { TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; -#ifdef BUILD_HALF - TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; #endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; @@ -1124,10 +1124,10 @@ static void init_parameter(void) { (void) l2; /* dirty trick to suppress unused variable warning for targets */ /* where the GEMM unrolling parameters do not depend on l2 */ -#ifdef BUILD_HALF - TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; - TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; - TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_p = 
SBGEMM_DEFAULT_P; + TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; + TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; #endif #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; From 052f31bc3c72abbe8b166d6a6aca1096769d6e16 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:02:16 +0200 Subject: [PATCH 300/349] Change "HALF" and "sh" to "BFLOAT16" and "sb" --- interface/Makefile | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 71393aaba9..a35d532705 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -46,10 +46,10 @@ SBLAS3OBJS = \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ sgeadd.$(SUFFIX) -ifeq ($(BUILD_HALF),1) -SHBLAS1OBJS = shdot.$(SUFFIX) -SHBLAS3OBJS = shgemm.$(SUFFIX) -SHEXTOBJS = shstobf16.$(SUFFIX) shdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SBBLAS1OBJS = sbdot.$(SUFFIX) +SBBLAS3OBJS = sbgemm.$(SUFFIX) +SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif DBLAS1OBJS = \ @@ -282,10 +282,10 @@ CSBLAS3OBJS = \ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ cblas_sgeadd.$(SUFFIX) -ifeq ($(BUILD_HALF),1) -CSHBLAS1OBJS = cblas_shdot.$(SUFFIX) -CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) -CSHEXTOBJS = cblas_shstobf16.$(SUFFIX) cblas_shdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +CBHBLAS1OBJS = cblas_sbdot.$(SUFFIX) +CBHBLAS3OBJS = cblas_sbgemm.$(SUFFIX) +CBHEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif CDBLAS1OBJS = \ @@ -381,8 +381,8 @@ override CFLAGS += -I. 
SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS3OBJS += $(CSBLAS3OBJS) -SHBLAS1OBJS += $(CSHBLAS1OBJS) -SHBLAS3OBJS += $(CSHBLAS3OBJS) +SBBLAS1OBJS += $(CSBBLAS1OBJS) +SBBLAS3OBJS += $(CSBBLAS3OBJS) DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS2OBJS += $(CDBLAS2OBJS) DBLAS3OBJS += $(CDBLAS3OBJS) @@ -393,13 +393,13 @@ ZBLAS1OBJS += $(CZBLAS1OBJS) ZBLAS2OBJS += $(CZBLAS2OBJS) ZBLAS3OBJS += $(CZBLAS3OBJS) -SHEXTOBJS += $(CSHEXTOBJS) +SBEXTOBJS += $(CSBEXTOBJS) CBAUXOBJS += $(CXERBLAOBJ) endif SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) -SHBLASOBJS = $(SHBLAS1OBJS) $(SHBLAS3OBJS) +SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) @@ -506,7 +506,7 @@ ifneq ($(BUILD_COMPLEX16),1) ZBLASOBJS= endif -FUNCOBJS = $(SHEXTOBJS) $(CXERBLAOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(info FUNCOBJS = {[$(FUNCOBJS)]} ) ifdef EXPRECISION FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) @@ -772,8 +772,8 @@ sdsdot.$(SUFFIX) sdsdot.$(PSUFFIX) : sdsdot.c dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c $(CC) $(CFLAGS) -c $< -o $(@F) -ifeq ($(BUILD_HALF),1) -shdot.$(SUFFIX) shdot.$(PSUFFIX) : bf16dot.c +ifeq ($(BUILD_BFLOAT16),1) +sbdot.$(SUFFIX) sbdot.$(PSUFFIX) : bf16dot.c $(CC) $(CFLAGS) -c $< -o $(@F) shstobf16.$(SUFFIX) shstobf16.$(PSUFFIX) : tobf16.c $(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) @@ -1278,8 +1278,8 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c $(CC) -c $(CFLAGS) $< -o $(@F) -ifeq ($(BUILD_HALF),1) -shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) endif @@ -1523,8 +1523,8 @@ cblas_sdsdot.$(SUFFIX) 
cblas_sdsdot.$(PSUFFIX) : sdsdot.c cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) -ifeq ($(BUILD_HALF),1) -cblas_shdot.$(SUFFIX) cblas_shdot.$(PSUFFIX) : bf16dot.c +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbdot.$(SUFFIX) cblas_sbdot.$(PSUFFIX) : bf16dot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_shstobf16.$(SUFFIX) cblas_shstobf16.$(PSUFFIX) : tobf16.c $(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) @@ -1857,8 +1857,8 @@ cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) -ifeq ($(BUILD_HALF),1) -cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbgemm.$(SUFFIX) cblas_sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) endif From ae1ab5bfdf866add26f25cce5c261705178e428e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:03:21 +0200 Subject: [PATCH 301/349] Change "HALF" and "sh" to "BFLOAT16" and "sb" --- exports/Makefile | 18 +++++++++--------- exports/gensymbol | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/exports/Makefile b/exports/Makefile index 960150c864..3f1ffba11e 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -30,8 +30,8 @@ ifndef BUILD_LAPACK_DEPRECATED BUILD_LAPACK_DEPRECATED = 0 endif -ifndef BUILD_HALF -BUILD_HALF = 0 +ifndef BUILD_BFLOAT16 +BUILD_BFLOAT16 = 0 endif ifndef BUILD_SINGLE BUILD_SINGLE = 0 @@ -120,10 +120,10 @@ dll : ../$(LIBDLLNAME) -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) $(LIBPREFIX).def : gensymbol - perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol win2k 
$(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) libgoto_hpl.def : gensymbol - perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) ifeq ($(OSNAME), Darwin) INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib @@ -258,23 +258,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) osx.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) aix.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) 
$(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) objcopy.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) objconv.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) + perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. 
rm -f linktest linktest.c : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed diff --git a/exports/gensymbol b/exports/gensymbol index 736fdc2cdf..9ff8e10b16 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -51,7 +51,7 @@ zgeadd, dzsum); @cblasobjs = (lsame, xerbla); -@halfblasobjs = (shgemm, shdot, shstobf16, shdtobf16, sbf16tos, dbf16tod); +@halfblasobjs = (sbgemm, sbdot, shstobf16, shdtobf16, sbf16tos, dbf16tod); @cblasobjsc = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -94,7 +94,7 @@ @cblasobjs = ( cblas_xerbla ); -@halfcblasobjs = (cblas_shgemm, cblas_shdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod); +@halfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, From 85154c2e18fbdcb8b45457dc2d8d51b8b69e71ae Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:05:05 +0200 Subject: [PATCH 302/349] Change "HALF" and "sh" to "BFLOAT16" and "sb" --- driver/others/blas_server.c | 2 +- driver/others/blas_server_omp.c | 2 +- driver/others/blas_server_win32.c | 2 +- driver/others/parameter.c | 22 +++++++++++----------- 4 files changed, 14 
insertions(+), 14 deletions(-) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index acfaed75d2..30e0cc6c23 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -227,7 +227,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); -#ifdef BUILD_HALF +#ifdef BUILD_BFLOAT16 } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ /* REAL / BFLOAT16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index bfbe3a647d..d546553c1f 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -192,7 +192,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); -#ifdef BUILD_HALF +#ifdef BUILD_BFLOAT16 } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ /* REAL / BFLOAT16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index d2cc917570..4624085d54 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -112,7 +112,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ args -> a, args -> lda, args -> b, args -> ldb, args -> c, args -> ldc, sb); -#ifdef BUILD_HALF +#ifdef BUILD_BFLOAT16 } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ /* REAL / BFLOAT16 */ void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 5d312fa87d..35fc0a253a 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -62,10 +62,10 @@ BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B; BLASLONG gemm_offset_b = GEMM_OFFSET_B; #endif -#if SHGEMM_P == shgemm_p -BLASLONG shgemm_p = DEFAULT_GEMM_P; +#if SBGEMM_P == sbgemm_p +BLASLONG 
sbgemm_p = DEFAULT_GEMM_P; #else -BLASLONG shgemm_p = SHGEMM_P; +BLASLONG sbgemm_p = SBGEMM_P; #endif #if SGEMM_P == sgemm_p BLASLONG sgemm_p = DEFAULT_GEMM_P; @@ -88,10 +88,10 @@ BLASLONG zgemm_p = DEFAULT_GEMM_P; BLASLONG zgemm_p = ZGEMM_P; #endif -#if SHGEMM_Q == shgemm_q -BLASLONG shgemm_q = DEFAULT_GEMM_Q; +#if SBGEMM_Q == sbgemm_q +BLASLONG sbgemm_q = DEFAULT_GEMM_Q; #else -BLASLONG shgemm_q = SHGEMM_Q; +BLASLONG sbgemm_q = SBGEMM_Q; #endif #if SGEMM_Q == sgemm_q BLASLONG sgemm_q = DEFAULT_GEMM_Q; @@ -114,10 +114,10 @@ BLASLONG zgemm_q = DEFAULT_GEMM_Q; BLASLONG zgemm_q = ZGEMM_Q; #endif -#if SHGEMM_R == shgemm_r -BLASLONG shgemm_r = DEFAULT_GEMM_R; +#if SBGEMM_R == sbgemm_r +BLASLONG sbgemm_r = DEFAULT_GEMM_R; #else -BLASLONG shgemm_r = SHGEMM_R; +BLASLONG sbgemm_r = SBGEMM_R; #endif #if SGEMM_R == sgemm_r BLASLONG sgemm_r = DEFAULT_GEMM_R; @@ -615,7 +615,7 @@ void blas_set_parameter(void){ size = BITMASK(cpuid3, 16, 0xff); - shgemm_p = 192 * (size + 1); + sbgemm_p = 192 * (size + 1); sgemm_p = 192 * (size + 1); dgemm_p = 96 * (size + 1); cgemm_p = 96 * (size + 1); @@ -629,7 +629,7 @@ void blas_set_parameter(void){ xgemm_p = 16 * (size + 1); #endif - shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15; + sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; From 006c7f6671895d36153e8a93cd6fd8c084aadfe0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:06:06 +0200 Subject: [PATCH 303/349] Change "HALF" and "sh" 
to "BFLOAT16" and "sb" --- driver/level3/Makefile | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/driver/level3/Makefile b/driver/level3/Makefile index e3aa30256c..b4f1e2b264 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -19,8 +19,8 @@ ifeq ($(ARCH), MIPS) USE_GEMM3M = 1 endif -ifeq ($(BUILD_HALF),1) -SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SHBLASOBJS += sbgemm_nn.$(SUFFIX) sbgemm_nt.$(SUFFIX) sbgemm_tn.$(SUFFIX) sbgemm_tt.$(SUFFIX) endif SBLASOBJS += \ @@ -207,8 +207,8 @@ COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$( COMMONOBJS += syrk_thread.$(SUFFIX) ifndef USE_SIMPLE_THREADED_LEVEL3 -ifeq ($(BUILD_HALF),1) -SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SHBLASOBJS += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX) endif SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) @@ -343,16 +343,16 @@ endif all :: -shgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h +sbgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) -shgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h +sbgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) -shgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h +sbgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) -shgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h +sbgemm_tt.$(SUFFIX) : gemm.c level3.c 
../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h @@ -550,16 +550,16 @@ gemm_thread_variable.$(SUFFIX) : gemm_thread_variable.c ../../common.h beta_thread.$(SUFFIX) : beta_thread.c ../../common.h $(CC) -c $(CFLAGS) $< -o $(@F) -shgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h +sbgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) -shgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h +sbgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) -shgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h +sbgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) -shgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h +sbgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h @@ -2735,16 +2735,16 @@ xtrsm_RCLU.$(SUFFIX) : trsm_R.c xtrsm_RCLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) -shgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h +sbgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) -shgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h +sbgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) -shgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h +sbgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) 
-shgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h +sbgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h @@ -2943,16 +2943,16 @@ beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h $(CC) -c $(PFLAGS) $< -o $(@F) -shgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h +sbgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) -shgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h +sbgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) -shgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h +sbgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) -shgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h +sbgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h From e3a29f6b58ffdf656ff9b05438f235646b59586a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:07:37 +0200 Subject: [PATCH 304/349] Change "HALF" and "sh" to "BFLOAT16" and "sb" --- cmake/kernel.cmake | 32 ++++++++++++++++---------------- cmake/prebuild.cmake | 8 ++++---- cmake/system.cmake | 27 ++++++++++++++++----------- cmake/utils.cmake | 10 +++++----- 4 files changed, 41 insertions(+), 36 deletions(-) diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 79eeaae6fe..7d7f5ffdae 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -113,7 +113,7 @@ macro(SetDefaultL1) set(ZSUMKERNEL zsum.S) set(QSUMKERNEL sum.S) set(XSUMKERNEL zsum.S) -if 
(BUILD_HALF) +if (BUILD_BFLOAT16) set(SHAMINKERNEL ../arm/amin.c) set(SHAMAXKERNEL ../arm/amax.c) set(SHMAXKERNEL ../arm/max.c) @@ -126,7 +126,7 @@ if (BUILD_HALF) set(SHAXPYKERNEL ../arm/axpy.c) set(SHAXPBYKERNEL ../arm/axpby.c) set(SHCOPYKERNEL ../arm/copy.c) - set(SHDOTKERNEL ../x86_64/shdot.c) + set(SBDOTKERNEL ../x86_64/sbdot.c) set(SHROTKERNEL ../arm/rot.c) set(SHSCALKERNEL ../arm/scal.c) set(SHNRM2KERNEL ../arm/nrm2.c) @@ -183,9 +183,9 @@ macro(SetDefaultL2) set(XHEMV_L_KERNEL ../generic/zhemv_k.c) set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c) -if (BUILD_HALF) - set(SHGEMVNKERNEL ../arm/gemv_n.c) - set(SHGEMVTKERNEL ../arm/gemv_t.c) +if (BUILD_BFLOAT16) + set(SBGEMVNKERNEL ../arm/gemv_n.c) + set(SBGEMVTKERNEL ../arm/gemv_t.c) set(SHGERKERNEL ../generic/ger.c) endif () endmacro () @@ -195,18 +195,18 @@ macro(SetDefaultL3) set(DGEADD_KERNEL ../generic/geadd.c) set(CGEADD_KERNEL ../generic/zgeadd.c) set(ZGEADD_KERNEL ../generic/zgeadd.c) -if (BUILD_HALF) +if (BUILD_BFLOAT16) set(SHGEADD_KERNEL ../generic/geadd.c) - set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c) - set(SHGEMM_BETA ../generic/gemm_beta.c) - set(SHGEMMINCOPY ../generic/gemm_ncopy_2.c) - set(SHGEMMITCOPY ../generic/gemm_tcopy_2.c) - set(SHGEMMONCOPY ../generic/gemm_ncopy_2.c) - set(SHGEMMOTCOPY ../generic/gemm_tcopy_2.c) - set(SHGEMMINCOPYOBJ shgemm_incopy.o) - set(SHGEMMITCOPYOBJ shgemm_itcopy.o) - set(SHGEMMONCOPYOBJ shgemm_oncopy.o) - set(SHGEMMOTCOPYOBJ shgemm_otcopy.o) + set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) + set(SBGEMM_BETA ../generic/gemm_beta.c) + set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) + set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) + set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) + set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) + set(SBGEMMINCOPYOBJ sbgemm_incopy.o) + set(SBGEMMITCOPYOBJ sbgemm_itcopy.o) + set(SBGEMMONCOPYOBJ sbgemm_oncopy.o) + set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) endif () endmacro () diff --git a/cmake/prebuild.cmake 
b/cmake/prebuild.cmake index 3b2a9d6a28..f40304c097 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -16,8 +16,8 @@ # HAVE_SSE2 # HAVE_SSE3 # MAKE -# SHGEMM_UNROLL_M -# SHGEMM_UNROLL_N +# SBGEMM_UNROLL_M +# SBGEMM_UNROLL_N # SGEMM_UNROLL_M # SGEMM_UNROLL_N # DGEMM_UNROLL_M @@ -471,8 +471,8 @@ endif () set(ZGEMM_UNROLL_N 2) set(SYMV_P 8) endif() - set(SHGEMM_UNROLL_M 8) - set(SHGEMM_UNROLL_N 4) + set(SBGEMM_UNROLL_M 8) + set(SBGEMM_UNROLL_N 4) # Or should this actually be NUM_CORES? if (${NUM_THREADS} GREATER 0) diff --git a/cmake/system.cmake b/cmake/system.cmake index a504530fb4..b34d4a9a56 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -70,6 +70,9 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() + if (DEFINED HAVE_SSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() endif() if (DEFINED TARGET) @@ -323,7 +326,13 @@ else () set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048") endif () endif () - +if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") +if (DEFINED BLAS3_MEM_ALLOC_THRESHOLD) +if (NOT ${BLAS3_MEM_ALLOC_THRESHOLD} EQUAL 32) +set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_THRESHOLD}") +endif() +endif() +endif() if (DEFINED LIBNAMESUFFIX) set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") else () @@ -401,20 +410,16 @@ if (NOT BUILD_SINGLE AND NOT BUILD_DOUBLE AND NOT BUILD_COMPLEX AND NOT BUILD_CO set (BUILD_COMPLEX16 ON) endif() if (BUILD_SINGLE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE=1") - set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE") endif() if (BUILD_DOUBLE) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1") - set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE") endif() if (BUILD_COMPLEX) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1") - set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_COMPLEX=1") + 
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX") endif() if (BUILD_COMPLEX16) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1") - set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_COMPLEX16=1") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") endif() if(NOT MSVC) set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") @@ -588,8 +593,8 @@ endif () #export FUNCTION_PROFILE #export TARGET_CORE # -#export SHGEMM_UNROLL_M -#export SHGEMM_UNROLL_N +#export SBGEMM_UNROLL_M +#export SBGEMM_UNROLL_N #export SGEMM_UNROLL_M #export SGEMM_UNROLL_N #export DGEMM_UNROLL_M diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 1c21e776ea..8f25c1b274 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -211,7 +211,7 @@ function(GenerateNamedObjects sources_in) if (complex_only) list(REMOVE_ITEM float_list "SINGLE") list(REMOVE_ITEM float_list "DOUBLE") - list(REMOVE_ITEM float_list "HALF") + list(REMOVE_ITEM float_list "BFLOAT16") elseif (real_only) list(REMOVE_ITEM float_list "COMPLEX") list(REMOVE_ITEM float_list "ZCOMPLEX") @@ -225,8 +225,8 @@ function(GenerateNamedObjects sources_in) if (NOT no_float_type) string(SUBSTRING ${float_type} 0 1 float_char) string(TOLOWER ${float_char} float_char) - if (${float_type} STREQUAL "HALF") - set (float_char "sh") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "sb") endif () endif () @@ -262,8 +262,8 @@ function(GenerateNamedObjects sources_in) if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX") list(APPEND obj_defines "DOUBLE") endif () - if (${float_type} STREQUAL "HALF") - list(APPEND obj_defines "HALF") + if (${float_type} STREQUAL "BFLOAT16") + list(APPEND obj_defines "BFLOAT16") endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") list(APPEND obj_defines "COMPLEX") From 7ae9e8960e85a1b0c0d163a1c5980b9e8cacb71e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:08:29 +0200 Subject: [PATCH 305/349] Change "HALF" and "sh" to 
"BFLOAT16" and "sb" --- benchmark/Makefile | 12 ++++++------ benchmark/gemm.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 2f70ceaf37..f2f3b354a4 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -49,8 +49,8 @@ else GOTO_LAPACK_TARGETS= endif -ifeq ($(BUILD_HALF),1) -GOTO_HALF_TARGETS=shgemm.goto +ifeq ($(BUILD_BFLOAT16),1) +GOTO_HALF_TARGETS=sbgemm.goto else GOTO_HALF_TARGETS= endif @@ -620,8 +620,8 @@ zcholesky.essl : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgemm #################################################### -ifeq ($(BUILD_HALF),1) -shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME) +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.goto : sbgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm endif @@ -2927,8 +2927,8 @@ ccholesky.$(SUFFIX) : cholesky.c zcholesky.$(SUFFIX) : cholesky.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ -ifeq ($(BUILD_HALF),1) -shgemm.$(SUFFIX) : gemm.c +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.$(SUFFIX) : gemm.c $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ endif diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 84dd292c5e..8cd14bbedb 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -40,7 +40,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef DOUBLE #define GEMM BLASFUNC(dgemm) #elif defined(HALF) -#define GEMM BLASFUNC(shgemm) +#define GEMM BLASFUNC(sbgemm) #else #define GEMM BLASFUNC(sgemm) #endif From 2c552f1074743f968bbd53ac0d7353e15064ddbf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:11:31 +0200 Subject: [PATCH 306/349] Change "HALF" and "sh" to "BFLOAT16" and "sb" --- CMakeLists.txt | 36 ++++++++++++++++-------------------- Makefile.rule | 34 +++++++++++++++++++++++++--------- Makefile.system | 10 +++++----- Makefile.tail | 4 ++-- 4 files changed, 48 insertions(+), 36 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f43e0e0fc0..a6cf2ef834 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,10 +29,8 @@ option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding proc else() set(NO_AFFINITY 1) endif() -option(BUILD_SINGLE "Single precision" OFF) -option(BUILD_DOUBLE "Double precision" OFF) -option(BUILD_COMPLEX "Single precision" OFF) -option(BUILD_COMPLEX16 "Single precision" OFF) +option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) +option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) # Add a prefix or suffix to all exported symbol names in the shared library. 
# Avoids conflicts with other BLAS libraries, especially when using @@ -91,13 +89,13 @@ if (NOT NO_LAPACK) list(APPEND SUBDIRS lapack) endif () -if (NOT DEFINED BUILD_HALF) - set (BUILD_HALF false) +if (NOT DEFINED BUILD_BFLOAT16) + set (BUILD_BFLOAT16 false) endif () # set which float types we want to build for if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) # if none are defined, build for all -# set(BUILD_HALF true) +# set(BUILD_BFLOAT16 true) set(BUILD_SINGLE true) set(BUILD_DOUBLE true) set(BUILD_COMPLEX true) @@ -110,33 +108,28 @@ endif() set(FLOAT_TYPES "") if (BUILD_SINGLE) - message(STATUS "Building Songle Precision") - list(APPEND FLOAT_TYPES "SINGLE") - # set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") + message(STATUS "Building Single Precision") + list(APPEND FLOAT_TYPES "SINGLE") # defines nothing endif () if (BUILD_DOUBLE) message(STATUS "Building Double Precision") - list(APPEND FLOAT_TYPES "DOUBLE") - #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1") + list(APPEND FLOAT_TYPES "DOUBLE") # defines DOUBLE endif () if (BUILD_COMPLEX) message(STATUS "Building Complex Precision") - list(APPEND FLOAT_TYPES "COMPLEX") - #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1") -endif () + list(APPEND FLOAT_TYPES "COMPLEX") # defines COMPLEX +endif () if (BUILD_COMPLEX16) message(STATUS "Building Double Complex Precision") - list(APPEND FLOAT_TYPES "ZCOMPLEX") - #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1") + list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE endif () -if (BUILD_HALF) +if (BUILD_BFLOAT16) message(STATUS "Building Half Precision") - list(APPEND FLOAT_TYPES "HALF") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_HALF") + list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing endif () if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") @@ -243,6 +236,9 @@ if (NOT MSVC AND NOT NOFORTRAN) add_subdirectory(ctest) endif() 
add_subdirectory(lapack-netlib/TESTING) + if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) + add_subdirectory(cpp_thread_test) + endif() endif() set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES diff --git a/Makefile.rule b/Makefile.rule index 09dfb08813..67d1839363 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -272,17 +272,33 @@ COMMON_PROF = -pg # work at all. # # CPP_THREAD_SAFETY_TEST = 1 +# +# use this to run only the less memory-hungry GEMV test +# CPP_THREAD_SAFETY_GEMV = 1 # If you want to enable the experimental BFLOAT16 support -# BUILD_HALF = 1 -# -# Select if you need to build only select types -# BUILD_SINGLE = 1 -# BUILD_DOUBLE = 1 -# BUILD_COMPLEX = 1 -# BUILD_COMPLEX16 = 1 -# -# +# BUILD_BFLOAT16 = 1 + + +# Set the thread number threshold beyond which the job array for the threaded level3 BLAS +# will be allocated on the heap rather than the stack. (This array alone requires +# NUM_THREADS*NUM_THREADS*128 bytes of memory so should not pose a problem at low cpu +# counts, but obviously it is not the only item that ends up on the stack. +# The default value of 32 ensures that the overall requirement is compatible +# with the default 1MB stacksize imposed by having the Java VM loaded without use +# of its -Xss parameter. +# The value of 160 formerly used from about version 0.2.7 until 0.3.10 is easily compatible +# with the common Linux stacksize of 8MB but will cause crashes with unwary use of the java +# VM e.g. 
in Octave or with the java-based libhdfs in numpy or scipy code +# BLAS3_MEM_ALLOC_THRESHOLD = 160 + + + +# the below is not yet configurable, use cmake if you need to build only select types +BUILD_SINGLE = 1 +BUILD_DOUBLE = 1 +BUILD_COMPLEX = 1 +BUILD_COMPLEX16 = 1 # End of user configuration # diff --git a/Makefile.system b/Makefile.system index eb6e14a982..461f7370bf 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1232,8 +1232,8 @@ ifeq ($(USE_TLS), 1) CCOMMON_OPT += -DUSE_TLS endif -ifeq ($(BUILD_HALF), 1) -CCOMMON_OPT += -DBUILD_HALF +ifeq ($(BUILD_BFLOAT16), 1) +CCOMMON_OPT += -DBUILD_BFLOAT16 endif ifeq ($(BUILD_SINGLE), 1) CCOMMON_OPT += -DBUILD_SINGLE=1 @@ -1521,10 +1521,10 @@ export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE export NO_AVX512 -export BUILD_HALF +export BUILD_BFLOAT16 -export SHGEMM_UNROLL_M -export SHGEMM_UNROLL_N +export SBGEMM_UNROLL_M +export SBGEMM_UNROLL_N export SGEMM_UNROLL_M export SGEMM_UNROLL_N export DGEMM_UNROLL_M diff --git a/Makefile.tail b/Makefile.tail index 6410824503..b14689fc7c 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -24,14 +24,14 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) endif -$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX +$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX -$(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX +$(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX $(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) 
$(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) From 629c497b6c34d63c5df133cb1ca74d1189a28652 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:27:11 +0200 Subject: [PATCH 307/349] common_sh.h renamed to common_sb.h --- common_macro.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_macro.h b/common_macro.h index 605d74adae..510813b0f3 100644 --- a/common_macro.h +++ b/common_macro.h @@ -39,7 +39,7 @@ #ifndef COMMON_MACRO #define COMMON_MACRO -#include "common_sh.h" +#include "common_sb.h" #include "common_s.h" #include "common_d.h" #include "common_q.h" From bb74dd29db44b9d57770e8f27c7815aecc675611 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 00:42:05 +0200 Subject: [PATCH 308/349] Restore -msse3 --- cmake/system.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index a504530fb4..78544f661e 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -70,6 +70,9 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() + if (DEFINED HAVE_SSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() endif() if (DEFINED TARGET) From 0ed1f07660b1836e530d5d9b0a140a36a8bca39d Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 12 Oct 2020 19:48:53 +0800 Subject: [PATCH 309/349] Optimize the performance of sum by using universal intrinsics --- kernel/arm/sum.c | 48 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/kernel/arm/sum.c b/kernel/arm/sum.c index 7b78ec61a4..d4b3fbc839 100644 --- a/kernel/arm/sum.c +++ b/kernel/arm/sum.c @@ -29,23 +29,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* trivial copy of asum.c with the ABS() removed * **************************************************************************************/ - #include "common.h" +#include "../simd/intrin.h" #include FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; + BLASLONG i = 0; FLOAT sumf = 0.0; - if (n <= 0 || inc_x <= 0) return(sumf); - + if (n <= 0 || inc_x <= 0) + return (sumf); n *= inc_x; - while(i < n) + if (inc_x == 1) + { +#if V_SIMD + const int vstep = v_nlanes_f32; + const int unrollx4 = n & (-vstep * 4); + const int unrollx = n & -vstep; + v_f32 vsum0 = v_zero_f32(); + v_f32 vsum1 = v_zero_f32(); + v_f32 vsum2 = v_zero_f32(); + v_f32 vsum3 = v_zero_f32(); + while (i < unrollx4) + { + vsum0 = v_add_f32(vsum0, v_loadu_f32(x)); + vsum1 = v_add_f32(vsum1, v_loadu_f32(x + vstep)); + vsum2 = v_add_f32(vsum2, v_loadu_f32(x + vstep * 2)); + vsum3 = v_add_f32(vsum3, v_loadu_f32(x + vstep * 3)); + i += vstep * 4; + } + vsum0 = v_add_f32( + v_add_f32(vsum0, vsum1), v_add_f32(vsum2, vsum3)); + while (i < unrollx) + { + vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i)); + i += vstep; + } + sumf = v_sum_f32(vsum0); +#else + int n1 = n & -4; + for (; i < n1; i += 4) + { + sumf += x[i] + x[i + 1] + x[i + 2] + x[i + 3]; + } +#endif + } + while (i < n) { sumf += x[i]; i += inc_x; } - return(sumf); + return (sumf); } - - From cb839575ed71b959f1dbd32d82c8789ea0f54bce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 12 Oct 2020 14:44:33 +0200 Subject: [PATCH 310/349] Convert the prototypes of the unimplemented BFLOAT16 functions to the new naming scheme --- common_param.h | 146 ++++++++++++++++++++++++------------------------- 1 file changed, 73 insertions(+), 73 deletions(-) diff --git a/common_param.h b/common_param.h index 3615230810..b50e4ff803 100644 --- a/common_param.h +++ b/common_param.h @@ -51,39 +51,39 @@ typedef struct { int sbgemm_p, sbgemm_q, sbgemm_r; int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; - void (*shstobf16_k) (BLASLONG, float *, 
BLASLONG, bfloat16 *, BLASLONG); - void (*shdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); + void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); + void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); void (*sbf16tos_k) (BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); void (*dbf16tod_k) (BLASLONG, bfloat16 *, BLASLONG, double *, BLASLONG); - float (*shamax_k) (BLASLONG, float *, BLASLONG); - float (*shamin_k) (BLASLONG, float *, BLASLONG); - float (*shmax_k) (BLASLONG, float *, BLASLONG); - float (*shmin_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ishamax_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ishamin_k)(BLASLONG, float *, BLASLONG); -BLASLONG (*ishmax_k) (BLASLONG, float *, BLASLONG); -BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); - - float (*shnrm2_k) (BLASLONG, float *, BLASLONG); - float (*shasum_k) (BLASLONG, float *, BLASLONG); - float (*shsum_k) (BLASLONG, float *, BLASLONG); - int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + float (*sbamax_k) (BLASLONG, float *, BLASLONG); + float (*sbamin_k) (BLASLONG, float *, BLASLONG); + float (*sbmax_k) (BLASLONG, float *, BLASLONG); + float (*sbmin_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*isbamax_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*isbamin_k)(BLASLONG, float *, BLASLONG); +BLASLONG (*isbmax_k) (BLASLONG, float *, BLASLONG); +BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); + + float (*sbnrm2_k) (BLASLONG, float *, BLASLONG); + float (*sbasum_k) (BLASLONG, float *, BLASLONG); + float (*sbsum_k) (BLASLONG, float *, BLASLONG); + int (*sbcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sbdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); double (*dsbdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); + int (*sbrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - int (*shaxpy_k) 
(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*sbaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*sbswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); int (*sbgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); @@ -93,57 +93,57 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); int (*sbgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); int (*sbgemm_otcopy 
)(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*shtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - int (*shtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - - int (*shtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_RT)(BLASLONG, 
BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - int (*shtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); - - int (*shtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*shsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int 
(*shsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - int (*shsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - - int (*shneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); - int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); + int (*sbtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*sbtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*sbtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*sbtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*sbtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int 
(*sbtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + int (*sbtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); + + int (*sbtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*sbtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*sbtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + int (*sbtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); + + int (*sbtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, 
BLASLONG, float *); + int (*sbtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*sbsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*sbsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + + int (*sbneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*sblaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); #endif From 403eb513a0616020e7238b531bad739f6baef43a Mon Sep 17 00:00:00 2001 From: Matti Picus Date: Mon, 12 Oct 2020 18:15:01 +0300 Subject: [PATCH 311/349] use emms instead, add WIN guards --- kernel/x86_64/amax.S | 4 +++- kernel/x86_64/asum.S | 5 ++++- kernel/x86_64/dot.S | 5 ++++- kernel/x86_64/iamax.S | 5 ++++- kernel/x86_64/izamax.S | 5 ++++- kernel/x86_64/nrm2.S | 5 ++++- kernel/x86_64/qconjg.S | 5 ++++- kernel/x86_64/qdot.S | 4 +++- kernel/x86_64/qgemm_kernel_2x2.S | 4 +++- kernel/x86_64/qgemv_n.S | 4 +++- kernel/x86_64/qgemv_t.S | 5 ++++- kernel/x86_64/qtrsm_kernel_LN_2x2.S | 4 +++- kernel/x86_64/qtrsm_kernel_LT_2x2.S | 4 +++- kernel/x86_64/qtrsm_kernel_RT_2x2.S | 5 +++-- kernel/x86_64/sum.S | 4 +++- kernel/x86_64/xdot.S | 4 +++- kernel/x86_64/xgemm3m_kernel_2x2.S | 4 +++- kernel/x86_64/xgemm_kernel_1x1.S | 4 +++- kernel/x86_64/xgemv_n.S | 4 +++- kernel/x86_64/xgemv_t.S | 4 +++- kernel/x86_64/xtrsm_kernel_LT_1x1.S | 4 +++- kernel/x86_64/zamax.S | 4 +++- kernel/x86_64/zasum.S | 4 +++- kernel/x86_64/zdot.S | 4 ++-- kernel/x86_64/znrm2.S | 4 +++- kernel/x86_64/zscal.S | 4 +++- kernel/x86_64/zsum.S | 4 +++- 27 files changed, 87 insertions(+), 29 deletions(-) diff --git a/kernel/x86_64/amax.S b/kernel/x86_64/amax.S index 257147dfb8..1498bb226c 100644 --- a/kernel/x86_64/amax.S +++ b/kernel/x86_64/amax.S @@ -55,7 +55,9 
@@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif salq $BASE_SHIFT, INCX diff --git a/kernel/x86_64/asum.S b/kernel/x86_64/asum.S index 24f57dd111..a2cbfd4804 100644 --- a/kernel/x86_64/asum.S +++ b/kernel/x86_64/asum.S @@ -50,7 +50,10 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif + fldz testq M, M jle .L999 diff --git a/kernel/x86_64/dot.S b/kernel/x86_64/dot.S index 2319885f19..a11d25e5d1 100644 --- a/kernel/x86_64/dot.S +++ b/kernel/x86_64/dot.S @@ -49,7 +49,10 @@ PROLOGUE PROFCODE - fninit + +#ifdef WINDOWS_ABI + emms +#endif salq $BASE_SHIFT, INCX salq $BASE_SHIFT, INCY diff --git a/kernel/x86_64/iamax.S b/kernel/x86_64/iamax.S index 0c666d623b..00999e25f2 100644 --- a/kernel/x86_64/iamax.S +++ b/kernel/x86_64/iamax.S @@ -59,7 +59,10 @@ PROLOGUE PROFCODE - fninit + +#ifdef WINDOWS_ABI + emms +#endif salq $BASE_SHIFT, INCX diff --git a/kernel/x86_64/izamax.S b/kernel/x86_64/izamax.S index e450c2cd23..b24b2e6925 100644 --- a/kernel/x86_64/izamax.S +++ b/kernel/x86_64/izamax.S @@ -59,7 +59,10 @@ PROLOGUE PROFCODE - fninit + +#ifdef WINDOWS_ABI + emms +#endif salq $ZBASE_SHIFT, INCX diff --git a/kernel/x86_64/nrm2.S b/kernel/x86_64/nrm2.S index 548e3b7447..b79ac2adb0 100644 --- a/kernel/x86_64/nrm2.S +++ b/kernel/x86_64/nrm2.S @@ -50,7 +50,10 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif + fldz testq M, M jle .L999 diff --git a/kernel/x86_64/qconjg.S b/kernel/x86_64/qconjg.S index bab5418311..823a15a845 100644 --- a/kernel/x86_64/qconjg.S +++ b/kernel/x86_64/qconjg.S @@ -41,7 +41,10 @@ PROLOGUE PROFCODE - fninit + +#ifdef WINDOWS_ABI + emms +#endif fldz FLD 1 * SIZE(ARG1) diff --git a/kernel/x86_64/qdot.S b/kernel/x86_64/qdot.S index e7d31360b0..2243b6b6d8 100644 --- a/kernel/x86_64/qdot.S +++ b/kernel/x86_64/qdot.S @@ -58,7 +58,9 @@ PROLOGUE - fninit +#ifdef WINDOWS_ABI + emms +#endif pushl %edi pushl %esi diff --git a/kernel/x86_64/qgemm_kernel_2x2.S b/kernel/x86_64/qgemm_kernel_2x2.S index 
7b5e7707d5..c11f3a91d6 100644 --- a/kernel/x86_64/qgemm_kernel_2x2.S +++ b/kernel/x86_64/qgemm_kernel_2x2.S @@ -74,7 +74,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/qgemv_n.S b/kernel/x86_64/qgemv_n.S index 1b65b03f0e..c9d345cb18 100644 --- a/kernel/x86_64/qgemv_n.S +++ b/kernel/x86_64/qgemv_n.S @@ -76,7 +76,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/qgemv_t.S b/kernel/x86_64/qgemv_t.S index 00188c2578..32372ff15d 100644 --- a/kernel/x86_64/qgemv_t.S +++ b/kernel/x86_64/qgemv_t.S @@ -75,7 +75,10 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif + subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) movq %rbp, 8(%rsp) diff --git a/kernel/x86_64/qtrsm_kernel_LN_2x2.S b/kernel/x86_64/qtrsm_kernel_LN_2x2.S index 030eff8934..0a545faf87 100644 --- a/kernel/x86_64/qtrsm_kernel_LN_2x2.S +++ b/kernel/x86_64/qtrsm_kernel_LN_2x2.S @@ -74,7 +74,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/qtrsm_kernel_LT_2x2.S b/kernel/x86_64/qtrsm_kernel_LT_2x2.S index d86972c72c..16063fbcdd 100644 --- a/kernel/x86_64/qtrsm_kernel_LT_2x2.S +++ b/kernel/x86_64/qtrsm_kernel_LT_2x2.S @@ -74,7 +74,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/qtrsm_kernel_RT_2x2.S b/kernel/x86_64/qtrsm_kernel_RT_2x2.S index 2826a62c93..4c94ac02cf 100644 --- a/kernel/x86_64/qtrsm_kernel_RT_2x2.S +++ b/kernel/x86_64/qtrsm_kernel_RT_2x2.S @@ -74,8 +74,9 @@ PROLOGUE PROFCODE - fninit - +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/sum.S b/kernel/x86_64/sum.S index 3d5fa7cc29..9f2cdc1ecd 100644 --- a/kernel/x86_64/sum.S +++ b/kernel/x86_64/sum.S @@ -50,7 +50,9 @@ PROLOGUE PROFCODE - fninit +#ifdef 
WINDOWS_ABI + emms +#endif fldz testq M, M diff --git a/kernel/x86_64/xdot.S b/kernel/x86_64/xdot.S index ec89b799c2..c4b4734947 100644 --- a/kernel/x86_64/xdot.S +++ b/kernel/x86_64/xdot.S @@ -59,7 +59,9 @@ PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif #define N %ebx diff --git a/kernel/x86_64/xgemm3m_kernel_2x2.S b/kernel/x86_64/xgemm3m_kernel_2x2.S index e8da78d82a..1d0b23c402 100644 --- a/kernel/x86_64/xgemm3m_kernel_2x2.S +++ b/kernel/x86_64/xgemm3m_kernel_2x2.S @@ -78,7 +78,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/xgemm_kernel_1x1.S b/kernel/x86_64/xgemm_kernel_1x1.S index f04ab07f59..ee67d8d430 100644 --- a/kernel/x86_64/xgemm_kernel_1x1.S +++ b/kernel/x86_64/xgemm_kernel_1x1.S @@ -97,7 +97,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/xgemv_n.S b/kernel/x86_64/xgemv_n.S index 7d28c118ac..b66f28d586 100644 --- a/kernel/x86_64/xgemv_n.S +++ b/kernel/x86_64/xgemv_n.S @@ -76,7 +76,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/xgemv_t.S b/kernel/x86_64/xgemv_t.S index e796760883..d6d37010d1 100644 --- a/kernel/x86_64/xgemv_t.S +++ b/kernel/x86_64/xgemv_t.S @@ -75,7 +75,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/xtrsm_kernel_LT_1x1.S b/kernel/x86_64/xtrsm_kernel_LT_1x1.S index 54d41932f8..875206363f 100644 --- a/kernel/x86_64/xtrsm_kernel_LT_1x1.S +++ b/kernel/x86_64/xtrsm_kernel_LT_1x1.S @@ -90,7 +90,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif subq $STACKSIZE, %rsp movq %rbx, 0(%rsp) diff --git a/kernel/x86_64/zamax.S b/kernel/x86_64/zamax.S index bfd836193d..5cb2f60198 100644 --- a/kernel/x86_64/zamax.S +++ b/kernel/x86_64/zamax.S @@ -55,7 +55,9 @@ PROLOGUE PROFCODE - fninit 
+#ifdef WINDOWS_ABI + emms +#endif salq $ZBASE_SHIFT, INCX diff --git a/kernel/x86_64/zasum.S b/kernel/x86_64/zasum.S index 9ea2aadc05..3460fcea30 100644 --- a/kernel/x86_64/zasum.S +++ b/kernel/x86_64/zasum.S @@ -50,7 +50,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif fldz testq M, M diff --git a/kernel/x86_64/zdot.S b/kernel/x86_64/zdot.S index f7df919b7c..87c08d7c80 100644 --- a/kernel/x86_64/zdot.S +++ b/kernel/x86_64/zdot.S @@ -54,9 +54,9 @@ PROLOGUE PROFCODE - fninit - #ifdef WINDOWS_ABI + emms + movq 40(%rsp), INCY #endif diff --git a/kernel/x86_64/znrm2.S b/kernel/x86_64/znrm2.S index cb02a5a9fe..0d2aa3480b 100644 --- a/kernel/x86_64/znrm2.S +++ b/kernel/x86_64/znrm2.S @@ -50,7 +50,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif fldz testq M, M diff --git a/kernel/x86_64/zscal.S b/kernel/x86_64/zscal.S index 08c0831a44..5ed4c4576b 100644 --- a/kernel/x86_64/zscal.S +++ b/kernel/x86_64/zscal.S @@ -50,7 +50,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif salq $ZBASE_SHIFT, INCX diff --git a/kernel/x86_64/zsum.S b/kernel/x86_64/zsum.S index 1c39048396..aa02637e47 100644 --- a/kernel/x86_64/zsum.S +++ b/kernel/x86_64/zsum.S @@ -50,7 +50,9 @@ PROLOGUE PROFCODE - fninit +#ifdef WINDOWS_ABI + emms +#endif fldz testq M, M From 8d2df7d066dbe6988502b352a4594cc78f9d89c7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 00:14:29 +0200 Subject: [PATCH 312/349] Revert special handling of Windows xNRM2 and enable C+intrinsics kernel for SSUM/DSUM --- kernel/x86_64/KERNEL | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index d75196974e..cb98fd89a3 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -259,12 +259,8 @@ SNRM2KERNEL = nrm2_sse.S endif ifndef DNRM2KERNEL -ifeq ($(OSNAME),WINNT) -DNRM2KERNEL = ../arm/nrm2.c -else DNRM2KERNEL = nrm2.S endif -endif ifndef QNRM2KERNEL QNRM2KERNEL = nrm2.S @@ -275,12 +271,8 
@@ CNRM2KERNEL = znrm2_sse.S endif ifndef ZNRM2KERNEL -ifeq ($(OSNAME),WINNT) -ZNRM2KERNEL = ../arm/znrm2.c -else ZNRM2KERNEL = znrm2.S endif -endif ifndef XNRM2KERNEL XNRM2KERNEL = znrm2.S @@ -486,3 +478,6 @@ XTRSMKERNEL_RN = xtrsm_kernel_LT_1x1.S XTRSMKERNEL_RT = xtrsm_kernel_LT_1x1.S XGEMM3MKERNEL = xgemm3m_kernel_2x2.S + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c From e05af6575ee9fa12f2afea8c2c20e80b1529ba84 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 09:05:04 +0200 Subject: [PATCH 313/349] Fix some overlooked "SHBLAS" entries --- Makefile.tail | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Makefile.tail b/Makefile.tail index b14689fc7c..54ba649dbf 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -1,18 +1,18 @@ -SHBLASOBJS_P = $(SHBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) +SBBLASOBJS_P = $(SBBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) SBLASOBJS_P = $(SBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) DBLASOBJS_P = $(DBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) CBLASOBJS_P = $(CBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) ZBLASOBJS_P = $(ZBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) XBLASOBJS_P = $(XBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) -SHEXTOBJS_P = $(SHEXTOBJS:.$(SUFFIX)=.$(PSUFFIX)) +SBEXTOBJS_P = $(SBEXTOBJS:.$(SUFFIX)=.$(PSUFFIX)) COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) -BLASOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(CBAUXOBJS) -BLASOBJS_P = $(SHEXTOBJS_P) $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P) +BLASOBJS = $(SBEXTOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(CBAUXOBJS) +BLASOBJS_P = $(SBEXTOBJS_P) $(SBBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P) ifdef EXPRECISION BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) @@ -24,23 +24,23 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) BLASOBJS_P += $(QBLASOBJS_P) 
$(XBLASOBJS_P) endif -$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX +$(SBBLASOBJS) $(SBBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX -$(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX +$(SBEXTOBJS) $(SBEXTOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX -$(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(SBBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(CBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(ZBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(XBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) -$(SHEXTOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(SBEXTOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) libs :: $(BLASOBJS) $(COMMONOBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ From 2ae87856039e78cf736fb22efb9bc8020697cbe3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 09:07:50 +0200 Subject: [PATCH 314/349] Add a POWER9 build with BFLOAT16 enabled --- .travis.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.travis.yml b/.travis.yml index 4bfdf485c3..3f917ce72c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -104,6 +104,23 @@ matrix: # for matrix annotation only - TARGET_BOX=PPC64LE_LINUX_P9 + - os: linux + arch: ppc64le + dist: bionic + compiler: gcc + before_script: + - 
sudo add-apt-repository 'ppa:ubuntu-toolchain-r/test' -y + - sudo apt-get update + - sudo apt-get install gcc-9 gfortran-9 -y + script: + - make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 + - make -C test $COMMON_FLAGS $BTYPE + - make -C ctest $COMMON_FLAGS $BTYPE + - make -C utest $COMMON_FLAGS $BTYPE + env: + # for matrix annotation only + - TARGET_BOX=PPC64LE_LINUX_P9 + - os: linux compiler: gcc addons: From 84949754a0d62fe70beb8d36285328eb446a5dcd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 09:11:36 +0200 Subject: [PATCH 315/349] Fix bfloat16 conditional --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 89eeb197db..a3ef99b598 100644 --- a/common.h +++ b/common.h @@ -257,7 +257,7 @@ typedef long BLASLONG; typedef unsigned long BLASULONG; #endif -#ifndef BFLOAT16 +#ifndef bfloat16 #include <stdint.h> typedef uint16_t bfloat16; #define BFLOAT16CONVERSION 1 From 1e7eb7b7a91838ccba39b9183fb0a5a814c09b7b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 09:17:15 +0200 Subject: [PATCH 316/349] Fix typos in currently unused sections --- interface/Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index a35d532705..1905827f97 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -283,9 +283,9 @@ CSBLAS3OBJS = \ cblas_sgeadd.$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) -CBHBLAS1OBJS = cblas_sbdot.$(SUFFIX) -CBHBLAS3OBJS = cblas_sbgemm.$(SUFFIX) -CBHEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) +CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) +CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) +CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif CDBLAS1OBJS = \ @@ -535,19 +535,19 @@ endif clean :: @rm -f functable.h -level1 : $(BEXTOBJS) $(SHBLAS1OBJS) $(SBLAS1OBJS)
$(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) +level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -level3 : $(SHBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) +level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ aux : $(CBAUXOBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -$(CSHBLASOBJS) $(CSHBLASOBJS_P) $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ +$(CSBBLASOBJS) $(CSBBLASOBJS_P) $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ $(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) $(CBAUXOBJS_P) : override CFLAGS += -DCBLAS srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c From 9dca578c79aec1e736f9fbb233489de85703928d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 10:14:08 +0200 Subject: [PATCH 317/349] Cleanup From 6999086a2bc4be5796a5d091f491af3b32970a71 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 10:32:19 +0200 Subject: [PATCH 318/349] whitelist SANDYBRIDGE for SSE3 --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 290fb2afe4..6745a79dde 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -41,7 +41,7 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE - ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO NEHALEM BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) + ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO 
SANDYBRIDGE NEHALEM BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) override CFLAGS += -msse3 endif ifeq ($(TARGET_CORE), COOPERLAKE) From 0eacbca85fa30657f749f7818e081952b9fb49f4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 11:42:39 +0200 Subject: [PATCH 319/349] Add Haswell and Zen to temporary sse3 whitelist --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 6745a79dde..e567485a63 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -41,7 +41,7 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE - ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE NEHALEM BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) + ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) override CFLAGS += -msse3 endif ifeq ($(TARGET_CORE), COOPERLAKE) From fecedc9c699527dfdb208bde4634374eca1ebbce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 11:55:41 +0200 Subject: [PATCH 320/349] Add -mssse3 --- kernel/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index e567485a63..c95c15f56b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,6 +8,9 @@ include $(TOPDIR)/Makefile.system ifdef HAVE_SSE3 CFLAGS += -msse3 endif +ifdef HAVE_SSSE3 +CFLAGS += -mssse3 +endif ifeq ($(C_COMPILER), GCC) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) @@ -42,7 +45,7 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) - override CFLAGS += -msse3 + override CFLAGS += -msse3 -mssse3 endif ifeq 
($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) From 5f60a32cacc4e168202c7f8729d97b11e861e0c3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 11:57:04 +0200 Subject: [PATCH 321/349] Add -mssse3 if supported by the hardware --- Makefile.x86_64 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index e793a1c2f9..f055828a90 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -12,6 +12,10 @@ ifdef HAVE_SSE3 ifndef DYNAMIC_ARCH CCOMMON_OPT += -msse3 FCOMMON_OPT += -msse3 +ifdef HAVE_SSSE3 +CCOMMON_OPT += -mssse3 +FCOMMON_OPT += -mssse3 +endif endif endif From 9e3cff5cf2cf841e9a7a73b70b4465c87ac45643 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 14:41:25 +0200 Subject: [PATCH 322/349] Expressly enable -mavx2 on Zen, SkylakeX and Cooperlake as well --- Makefile.x86_64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index f055828a90..9e75dc91c4 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -64,7 +64,7 @@ endif endif endif -ifeq ($(CORE), HASWELL) +ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE) ifndef DYNAMIC_ARCH ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) From 137ae618dba8ddf2ee899cb2a7854b34f1100ed3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 15:02:17 +0200 Subject: [PATCH 323/349] Fix typo --- Makefile.x86_64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 9e75dc91c4..8a3fc4eaea 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -64,7 +64,7 @@ endif endif endif -ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE) +ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE)) ifndef DYNAMIC_ARCH ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) From b5d30b390dd8d6aed4617c94e5b4fd94425c96d1 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 13 Oct 
2020 11:00:22 -0500 Subject: [PATCH 324/349] Fix build issues with bfloat16 This patch fixes compilation errors due to recent renaming from SH to SB with BUILD_BFLOAT16. --- cblas.h | 4 ++-- common_interface.h | 4 ++-- common_level1.h | 4 ++-- common_macro.h | 4 ++-- driver/level3/Makefile | 4 ++-- exports/gensymbol | 4 ++-- interface/Makefile | 8 ++++---- kernel/Makefile.L1 | 6 +++--- kernel/Makefile.L3 | 6 +++--- test/Makefile | 8 +++----- 10 files changed, 25 insertions(+), 27 deletions(-) diff --git a/cblas.h b/cblas.h index 4fc6f86812..bf310bed2e 100644 --- a/cblas.h +++ b/cblas.h @@ -384,9 +384,9 @@ void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint /*** BFLOAT16 and INT8 extensions ***/ /* convert float array to BFLOAT16 array by rounding */ -void cblas_shstobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout); +void cblas_sbstobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout); /* convert double array to BFLOAT16 array by rounding */ -void cblas_shdtobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout); +void cblas_sbdtobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout); /* convert BFLOAT16 array to float array */ void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, float *out, OPENBLAS_CONST blasint incout); /* convert BFLOAT16 array to double array */ diff --git a/common_interface.h b/common_interface.h index bee09e8941..032877fe1d 100644 --- a/common_interface.h +++ b/common_interface.h @@ -55,8 +55,8 @@ double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *); xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint 
*); float BLASFUNC(sbdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *); -void BLASFUNC(shstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *); -void BLASFUNC(shdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *); +void BLASFUNC(sbstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *); +void BLASFUNC(sbdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *); void BLASFUNC(sbf16tos) (blasint *, bfloat16 *, blasint *, float *, blasint *); void BLASFUNC(dbf16tod) (blasint *, bfloat16 *, blasint *, double *, blasint *); diff --git a/common_level1.h b/common_level1.h index 7b17962c48..d2ed47e567 100644 --- a/common_level1.h +++ b/common_level1.h @@ -48,8 +48,8 @@ double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); float sbdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); -void shstobf16_k(BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); -void shdtobf16_k(BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); +void sbstobf16_k(BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); +void sbdtobf16_k(BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); void sbf16tos_k (BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); void dbf16tod_k (BLASLONG, bfloat16 *, BLASLONG, double *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 510813b0f3..54deed57cb 100644 --- a/common_macro.h +++ b/common_macro.h @@ -646,9 +646,9 @@ #elif defined(BFLOAT16) -#define D_TO_BF16_K SHDTOBF16_K +#define D_TO_BF16_K SBDTOBF16_K #define D_BF16_TO_K DBF16TOD_K -#define S_TO_BF16_K SHSTOBF16_K +#define S_TO_BF16_K SBSTOBF16_K #define S_BF16_TO_K SBF16TOS_K #define AMAX_K SAMAX_K diff --git a/driver/level3/Makefile b/driver/level3/Makefile index b4f1e2b264..b528dfa2de 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -20,7 +20,7 @@ USE_GEMM3M = 1 endif ifeq ($(BUILD_BFLOAT16),1) -SHBLASOBJS += sbgemm_nn.$(SUFFIX) 
sbgemm_nt.$(SUFFIX) sbgemm_tn.$(SUFFIX) sbgemm_tt.$(SUFFIX) +SBBLASOBJS += sbgemm_nn.$(SUFFIX) sbgemm_nt.$(SUFFIX) sbgemm_tn.$(SUFFIX) sbgemm_tt.$(SUFFIX) endif SBLASOBJS += \ @@ -208,7 +208,7 @@ COMMONOBJS += syrk_thread.$(SUFFIX) ifndef USE_SIMPLE_THREADED_LEVEL3 ifeq ($(BUILD_BFLOAT16),1) -SHBLASOBJS += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX) +SBBLASOBJS += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX) endif SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) diff --git a/exports/gensymbol b/exports/gensymbol index 9ff8e10b16..8482ecb7eb 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -51,7 +51,7 @@ zgeadd, dzsum); @cblasobjs = (lsame, xerbla); -@halfblasobjs = (sbgemm, sbdot, shstobf16, shdtobf16, sbf16tos, dbf16tod); +@halfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); @cblasobjsc = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -94,7 +94,7 @@ @cblasobjs = ( cblas_xerbla ); -@halfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod); +@halfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, diff --git a/interface/Makefile b/interface/Makefile index 1905827f97..6b247b49f9 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -775,9 +775,9 @@ dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c ifeq ($(BUILD_BFLOAT16),1) sbdot.$(SUFFIX) sbdot.$(PSUFFIX) : bf16dot.c $(CC) $(CFLAGS) -c $< -o $(@F) 
-shstobf16.$(SUFFIX) shstobf16.$(PSUFFIX) : tobf16.c +sbstobf16.$(SUFFIX) sbstobf16.$(PSUFFIX) : tobf16.c $(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) -shdtobf16.$(SUFFIX) shdtobf16.$(PSUFFIX) : tobf16.c +sbdtobf16.$(SUFFIX) sbdtobf16.$(PSUFFIX) : tobf16.c $(CC) $(CFLAGS) -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) sbf16tos.$(SUFFIX) sbf16tos.$(PSUFFIX) : bf16to.c $(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) @@ -1526,9 +1526,9 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c ifeq ($(BUILD_BFLOAT16),1) cblas_sbdot.$(SUFFIX) cblas_sbdot.$(PSUFFIX) : bf16dot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) -cblas_shstobf16.$(SUFFIX) cblas_shstobf16.$(PSUFFIX) : tobf16.c +cblas_sbstobf16.$(SUFFIX) cblas_sbstobf16.$(PSUFFIX) : tobf16.c $(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) -cblas_shdtobf16.$(SUFFIX) cblas_shdtobf16.$(PSUFFIX) : tobf16.c +cblas_sbdtobf16.$(SUFFIX) cblas_sbdtobf16.$(PSUFFIX) : tobf16.c $(CC) $(CFLAGS) -DCBLAS -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F) cblas_sbf16tos.$(SUFFIX) cblas_sbf16tos.$(PSUFFIX) : bf16to.c $(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 6fe6778d0a..7ad94118a0 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -531,11 +531,11 @@ XBLASOBJS += \ xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) ifeq ($(BUILD_BFLOAT16),1) -SHBLASOBJS += \ +SBBLASOBJS += \ sbdot_k$(TSUFFIX).$(SUFFIX) -SHEXTOBJS += \ +SBEXTOBJS += \ sbstobf16_k$(TSUFFIX).$(SUFFIX) sbdtobf16_k$(TSUFFIX).$(SUFFIX) -SHEXTOBJS += \ +SBEXTOBJS += \ sbf16tos_k$(TSUFFIX).$(SUFFIX) dbf16tod_k$(TSUFFIX).$(SUFFIX) endif diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 65d4290128..2ba593c2ee 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -94,7 +94,7 @@ SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) endif -SHKERNELOBJS += \ 
+SBKERNELOBJS += \ sbgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(SBGEMMINCOPYOBJ) $(SBGEMMITCOPYOBJ) \ $(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ) @@ -150,7 +150,7 @@ XKERNELOBJS += \ $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) ifeq ($(BUILD_BFLOAT16),1) -SHBLASOBJS += $(SHKERNELOBJS) +SBBLASOBJS += $(SBKERNELOBJS) endif SBLASOBJS += $(SKERNELOBJS) DBLASOBJS += $(DKERNELOBJS) @@ -160,7 +160,7 @@ ZBLASOBJS += $(ZKERNELOBJS) XBLASOBJS += $(XKERNELOBJS) ifeq ($(BUILD_BFLOAT16),1) -SHBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX) +SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX) endif ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" diff --git a/test/Makefile b/test/Makefile index 06fb7fe86f..2123433895 100644 --- a/test/Makefile +++ b/test/Makefile @@ -214,11 +214,9 @@ endif -#ifeq ($(BUILD_BFLOAT16),1) -#level3 : test_sbgemm sblat3 dblat3 cblat3 zblat3 -#else -#level3 : sblat3 dblat3 cblat3 zblat3 -#endif +ifeq ($(BUILD_BFLOAT16),1) +level3 : test_sbgemm +endif ifndef CROSS rm -f ?BLAT3.SUMM From 437b7fe261f7026f0fcc517e0e3015cad29bb579 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 19:55:14 +0200 Subject: [PATCH 325/349] sh prefix renamed to sb --- ctest/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt index 8d301c239f..8aed9eb855 100644 --- a/ctest/CMakeLists.txt +++ b/ctest/CMakeLists.txt @@ -12,7 +12,7 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh foreach(float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char_upper) string(TOLOWER ${float_char_upper} float_char) - if (${float_char} STREQUAL "h") + if (${float_char} STREQUAL "b") continue() endif() #level1 From bc5c7f95781adcea95b60e553ad785d8e25cead8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 19:56:09 +0200 Subject: [PATCH 326/349] Cleanup --- test/Makefile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/Makefile b/test/Makefile index 
2123433895..06fb7fe86f 100644 --- a/test/Makefile +++ b/test/Makefile @@ -214,9 +214,11 @@ endif -ifeq ($(BUILD_BFLOAT16),1) -level3 : test_sbgemm -endif +#ifeq ($(BUILD_BFLOAT16),1) +#level3 : test_sbgemm sblat3 dblat3 cblat3 zblat3 +#else +#level3 : sblat3 dblat3 cblat3 zblat3 +#endif ifndef CROSS rm -f ?BLAT3.SUMM From 4bb73c01713c43f28a3ab464399fb716516ffc70 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 20:07:19 +0200 Subject: [PATCH 327/349] Rename "HALF" type to "BFLOAT16" --- lapack/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 778e6f8fae..fd4e570484 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -2,7 +2,7 @@ include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_BINARY_DIR}) -list (REMOVE_ITEM FLOAT_TYPES "HALF") +list (REMOVE_ITEM FLOAT_TYPES "BFLOAT16") set(LAPACK_SOURCES potrf/potrf_U_single.c From 0826d68f93ef1fed021c426911c464728d60ccb3 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Tue, 13 Oct 2020 16:05:10 -0500 Subject: [PATCH 328/349] POWER10: Change the packing format for bfloat16 As the new MMA instructions need the inputs in 4x2 order for bfloat16, changing the format in copy/packing code. This avoids permute instructions in the gemm kernel inner loop. 
--- kernel/power/KERNEL.POWER10 | 8 +- kernel/power/sbgemm_kernel_power10.c | 477 ++++++++---------- kernel/power/sbgemm_ncopy_16_power10.c | 437 ++++++++++++++++ kernel/power/sbgemm_ncopy_8_power10.c | 383 ++++++++++++++ kernel/power/sbgemm_tcopy_16_power10.c | 244 +++++++++ kernel/power/sbgemm_tcopy_8_power10.c | 659 +++++++++++++++++++++++++ 6 files changed, 1923 insertions(+), 285 deletions(-) create mode 100644 kernel/power/sbgemm_ncopy_16_power10.c create mode 100644 kernel/power/sbgemm_ncopy_8_power10.c create mode 100644 kernel/power/sbgemm_tcopy_16_power10.c create mode 100644 kernel/power/sbgemm_tcopy_8_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 5cf1660a25..031d96581c 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -9,10 +9,10 @@ else SBGEMM_BETA = ../generic/gemm_beta.c SBGEMMKERNEL = sbgemm_kernel_power10.c -SBGEMMINCOPY = ../generic/gemm_ncopy_16.c -SBGEMMITCOPY = ../generic/gemm_tcopy_16.c -SBGEMMONCOPY = ../generic/gemm_ncopy_8.c -SBGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SBGEMMINCOPY = sbgemm_ncopy_16_power10.c +SBGEMMITCOPY = sbgemm_tcopy_16_power10.c +SBGEMMONCOPY = sbgemm_ncopy_8_power10.c +SBGEMMOTCOPY = sbgemm_tcopy_8_power10.c SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/power/sbgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c index 46d82598af..d155867037 100644 --- a/kernel/power/sbgemm_kernel_power10.c +++ b/kernel/power/sbgemm_kernel_power10.c @@ -137,15 +137,13 @@ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) { - BLASLONG N = n; BLASLONG i1; v4sf_t valpha = { alpha, alpha, alpha, alpha }; vector short vzero = { 0, 0, 0, 0, 0, 0, 0, 0 }; - N = n >> 3; /* Loop for n >= 8. 
*/ - for (i1 = 0; i1 < N; i1++) + for (i1 = 0; i1 < (n >> 3); i1++) { - BLASLONG i, j; + BLASLONG j; FLOAT *CO; IFLOAT *AO; CO = C; @@ -153,9 +151,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO = A; PREFETCH1 (A, 128); PREFETCH1 (A, 256); - i = m >> 4; /* Loop for m >= 16. */ - for (j = 0; j < i; j++) + for (j = 0; j < (m >> 4); j++) { IFLOAT *BO = B; v4sf_t *rowC; @@ -167,20 +164,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { vec_t *rowA = (vec_t *) & (AO[l << 5]); vec_t *rowB = (vec_t *) & (BO[l << 4]); - vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); - vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); - vec_t rowA_h = MERGE_HIGH (rowA[0], rowA[2]); - vec_t rowA_l = MERGE_LOW (rowA[0], rowA[2]); - vec_t rowA2_h = MERGE_HIGH (rowA[1], rowA[3]); - vec_t rowA2_l = MERGE_LOW (rowA[1], rowA[3]); - MMA (&acc0, rowB_h, rowA_h); - MMA (&acc1, rowB_l, rowA_h); - MMA (&acc2, rowB_h, rowA_l); - MMA (&acc3, rowB_l, rowA_l); - MMA (&acc4, rowB_h, rowA2_h); - MMA (&acc5, rowB_l, rowA2_h); - MMA (&acc6, rowB_h, rowA2_l); - MMA (&acc7, rowB_l, rowA2_l); + MMA (&acc0, rowB[0], rowA[0]); + MMA (&acc1, rowB[1], rowA[0]); + MMA (&acc2, rowB[0], rowA[1]); + MMA (&acc3, rowB[1], rowA[1]); + MMA (&acc4, rowB[0], rowA[2]); + MMA (&acc5, rowB[1], rowA[2]); + MMA (&acc6, rowB[0], rowA[3]); + MMA (&acc7, rowB[1], rowA[3]); } if (k % 2 == 1) { @@ -216,9 +207,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += (k << 4); BO += (k << 3); } - i = (m & 15) >> 3; - /* Loop for m >= 8. 
*/ - for (j = 0; j < i; j++) + if (m & 8) { IFLOAT *BO = B; v4sf_t *rowC; @@ -230,14 +219,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { vec_t *rowA = (vec_t *) & (AO[l << 4]); vec_t *rowB = (vec_t *) & (BO[l << 4]); - vec_t rowB_h = MERGE_HIGH (rowB[0], rowB[1]); - vec_t rowB_l = MERGE_LOW (rowB[0], rowB[1]); - vec_t rowA_h = MERGE_HIGH (rowA[0], rowA[1]); - vec_t rowA_l = MERGE_LOW (rowA[0], rowA[1]); - MMA (&acc0, rowB_h, rowA_h); - MMA (&acc1, rowB_l, rowA_h); - MMA (&acc2, rowB_h, rowA_l); - MMA (&acc3, rowB_l, rowA_l); + + MMA (&acc0, rowB[0], rowA[0]); + MMA (&acc1, rowB[1], rowA[0]); + MMA (&acc2, rowB[0], rowA[1]); + MMA (&acc3, rowB[1], rowA[1]); } if (k % 2 == 1) { @@ -262,9 +248,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += (k << 3); BO += (k << 3); } - i = (m & 7) >> 2; - /* Loop for m >= 4. */ - for (j = 0; j < i; j++) + if (m & 4) { IFLOAT *BO = B; v4sf_t *rowC; @@ -277,9 +261,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { vec_t *rowA = (vec_t *) & (AO[l << 3]); vec_t *rowB = (vec_t *) & (BO[l << 4]); - vec_t rowA_mrg = MERGE_ROW (rowA[0]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), rowA_mrg); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), rowA_mrg); + MMA (&acc0, rowB[0], rowA[0]); + MMA (&acc1, rowB[1], rowA[0]); } if (k % 2 == 1) { @@ -297,9 +280,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += (k << 2); BO += (k << 3); } - i = (m & 3) >> 1; - /* Loop for m >= 2. 
*/ - for (j = 0; j < i; j++) + if (m & 2) { IFLOAT *BO = B; v2sf_t *rowC; @@ -316,8 +297,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, 0, 0, 0, 0 }; vec_t *rowB = (vec_t *) & (BO[l << 4]); - MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); - MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + MMA (&acc0, rowB[0], (vec_t) rowA); + MMA (&acc1, rowB[1], (vec_t) rowA); } if (k % 2 == 1) { @@ -334,64 +315,50 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += (k << 1); BO += (k << 3); } - i = (m & 1) >> 0; - /* Loop for m = 1. */ - for (j = 0; j < i; j++) + if (m & 1) { IFLOAT *BO = B; + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - v4sf_t t = { 0, 0, 0, 0 } - , t1 = - { - 0, 0, 0, 0}; - for (l = 0; l < k; l++) + for (l = 0; l < k / 2; l++) { - v4sf_t rowA = - { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), - BF16TOF32 (AO[l]) - }; - v4sf_t rowB = - { BF16TOF32 (BO[l << 3]), BF16TOF32 (BO[(l << 3) + 1]), - BF16TOF32 (BO[(l << 3) + 2]), - BF16TOF32 (BO[(l << 3) + 3]) - }; - v4sf_t rowB1 = - { BF16TOF32 (BO[(l << 3) + 4]), BF16TOF32 (BO[(l << 3) + 5]), - BF16TOF32 (BO[(l << 3) + 6]), - BF16TOF32 (BO[(l << 3) + 7]) - }; - t += rowA * rowB; - t1 += rowA * rowB1; + vector short rowA = + { AO[(l << 1) + 0], AO[(l << 1) + 1], 0, 0, 0, 0, 0, 0}; + vec_t *rowB = (vec_t *) & (BO[l << 4]); + MMA (&acc0, rowB[0], (vec_t) rowA); + MMA (&acc1, rowB[1], (vec_t) rowA); } - t = t * valpha; - t1 = t1 * valpha; - CO[0 * ldc] += t[0]; - CO[1 * ldc] += t[1]; - CO[2 * ldc] += t[2]; - CO[3 * ldc] += t[3]; - CO[4 * ldc] += t1[0]; - CO[5 * ldc] += t1[1]; - CO[6 * ldc] += t1[2]; - CO[7 * ldc] += t1[3]; + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 1; + vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 }; + vec_t *rowB = (vec_t *) & (BO[(l << 3)]); + MMA (&acc0, MERGE_HIGH (rowB[0], rowB[1]), (vec_t) rowA); + 
MMA (&acc1, MERGE_LOW (rowB[0], rowB[1]), (vec_t) rowA); + } + SAVE4x2_ACC (&acc0, 0); + SAVE4x2_ACC1 (&acc1, 0); CO += 1; AO += k; BO += (k << 3); } B += k << 3; } - N = (n & 7) >> 2; - /* Loop for n >= 4. */ - for (i1 = 0; i1 < N; i1++) + if (n & 4) { - BLASLONG i, j; + BLASLONG j; FLOAT *CO; IFLOAT *AO; CO = C; C += ldc << 2; AO = A; - i = m >> 5; /* Loop for m >= 32. */ - for (j = 0; j < i; j++) + for (j = 0; j < (m >> 5); j++) { IFLOAT *BO = B; IFLOAT *A1 = AO + (16 * k); @@ -405,15 +372,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, vec_t *rowA = (vec_t *) & (AO[l << 5]); vec_t *rowA1 = (vec_t *) & (A1[l << 5]); vec_t *rowB = (vec_t *) & (BO[l << 3]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], rowA[3])); - MMA (&acc4, rowB_mrg, MERGE_HIGH (rowA1[0], rowA1[2])); - MMA (&acc5, rowB_mrg, MERGE_LOW (rowA1[0], rowA1[2])); - MMA (&acc6, rowB_mrg, MERGE_HIGH (rowA1[1], rowA1[3])); - MMA (&acc7, rowB_mrg, MERGE_LOW (rowA1[1], rowA1[3])); + MMA (&acc0, rowB[0], rowA[0]); + MMA (&acc1, rowB[0], rowA[1]); + MMA (&acc2, rowB[0], rowA[2]); + MMA (&acc3, rowB[0], rowA[3]); + MMA (&acc4, rowB[0], rowA1[0]); + MMA (&acc5, rowB[0], rowA1[1]); + MMA (&acc6, rowB[0], rowA1[2]); + MMA (&acc7, rowB[0], rowA1[3]); } if (k % 2 == 1) { @@ -448,9 +414,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 5; BO += k << 2; } - i = (m & 31) >> 4; - /* Loop for m >= 16. 
*/ - for (j = 0; j < i; j++) + if (m & 16) { IFLOAT *BO = B; v4sf_t *rowC; @@ -462,11 +426,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { vec_t *rowA = (vec_t *) & (AO[l << 5]); vec_t *rowB = (vec_t *) & (BO[l << 3]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, rowB_mrg, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, rowB_mrg, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc0, rowB[0], rowA[0]); + MMA (&acc1, rowB[0], rowA[1]); + MMA (&acc2, rowB[0], rowA[2]); + MMA (&acc3, rowB[0], rowA[3]); } if (k % 2 == 1) { @@ -490,9 +453,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 4; BO += k << 2; } - i = (m & 15) >> 3; - /* Loop for m >= 8. */ - for (j = 0; j < i; j++) + if (m & 8) { IFLOAT *BO = B; v4sf_t *rowC; @@ -505,9 +466,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { vec_t *rowA = (vec_t *) & (AO[l << 4]); vec_t *rowB = (vec_t *) & (BO[l << 3]); - vec_t rowB_mrg = MERGE_ROW (rowB[0]); - MMA (&acc0, rowB_mrg, MERGE_HIGH (rowA[0], rowA[1])); - MMA (&acc1, rowB_mrg, MERGE_LOW (rowA[0], rowA[1])); + MMA (&acc0, rowB[0], rowA[0]); + MMA (&acc1, rowB[0], rowA[1]); } if (k % 2 == 1) { @@ -525,9 +485,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 3; BO += k << 2; } - i = (m & 7) >> 2; - /* Loop for m >= 4. */ - for (j = 0; j < i; j++) + if (m & 4) { IFLOAT *BO = B; v4sf_t *rowC; @@ -539,7 +497,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, { vec_t *rowA = (vec_t *) & (AO[l << 3]); vec_t *rowB = (vec_t *) & (BO[l << 3]); - MMA (&acc0, MERGE_ROW (rowB[0]), MERGE_ROW (rowA[0])); + MMA (&acc0, rowB[0], rowA[0]); } if (k % 2 == 1) { @@ -555,9 +513,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 2; BO += k << 2; } - i = (m & 3) >> 1; - /* Loop for m >= 2. 
*/ - for (j = 0; j < i; j++) + if (m & 2) { IFLOAT *BO = B; v2sf_t *rowC; @@ -573,7 +529,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, 0, 0, 0, 0 }; vec_t *rowB = (vec_t *) & (BO[l << 3]); - MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + MMA (&acc0, rowB[0], (vec_t) rowA); } if (k % 2 == 1) { @@ -588,31 +544,32 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 1; BO += k << 2; } - i = (m & 1) >> 0; - /* Loop for m = 1. */ - for (j = 0; j < i; j++) + if (m & 1) { IFLOAT *BO = B; + v2sf_t *rowC; + v2sf_t result[8]; + __vector_quad acc0; BLASLONG l = 0; - v4sf_t t = { 0, 0, 0, 0 }; - for (l = 0; l < k; l++) + __builtin_mma_xxsetaccz (&acc0); + for (l = 0; l < k / 2; l++) { - v4sf_t rowA = - { BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), BF16TOF32 (AO[l]), - BF16TOF32 (AO[l]) - }; - v4sf_t rowB = - { BF16TOF32 (BO[l << 2]), BF16TOF32 (BO[(l << 2) + 1]), - BF16TOF32 (BO[(l << 2) + 2]), - BF16TOF32 (BO[(l << 2) + 3]) + vector short rowA = + { AO[(l << 1) + 0], AO[(l << 1) + 1], 0, + 0, 0, 0, 0 }; - t += rowA * rowB; + vec_t *rowB = (vec_t *) & (BO[l << 3]); + MMA (&acc0, rowB[0], (vec_t) rowA); } - t = t * valpha; - CO[0 * ldc] += t[0]; - CO[1 * ldc] += t[1]; - CO[2 * ldc] += t[2]; - CO[3 * ldc] += t[3]; + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 1; + vector short rowA = { AO[l], 0, 0, 0, 0, 0, 0, 0 }; + vec_t *rowB = (vec_t *) & (BO[l << 2]); + MMA (&acc0, MERGE_ROW (rowB[0]), (vec_t) rowA); + } + SAVE4x2_ACC (&acc0, 0); AO += k; BO += (k << 2); CO += 1; @@ -620,19 +577,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, B += k << 2; } - N = (n & 3) >> 1; - /* Loop for n >= 2. */ - for (i1 = 0; i1 < N; i1++) + if (n & 2) { - BLASLONG i, j; + BLASLONG j; FLOAT *CO; IFLOAT *AO; CO = C; C += ldc << 1; AO = A; - i = m >> 5; /* Loop for m >= 32. 
*/ - for (j = 0; j < i; j++) + for (j = 0; j < (m >> 5); j++) { IFLOAT *BO = B; v4sf_t *rowC; @@ -650,14 +604,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, }; vec_t *rowA = (vec_t *) & (AO[l << 5]); vec_t *rowA1 = (vec_t *) & (A1[l << 5]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); - MMA (&acc4, (vec_t) rowB, MERGE_HIGH (rowA1[0], rowA1[2])); - MMA (&acc5, (vec_t) rowB, MERGE_LOW (rowA1[0], rowA1[2])); - MMA (&acc6, (vec_t) rowB, MERGE_HIGH (rowA1[1], rowA1[3])); - MMA (&acc7, (vec_t) rowB, MERGE_LOW (rowA1[1], rowA1[3])); + MMA (&acc0, (vec_t) rowB, rowA[0]); + MMA (&acc1, (vec_t) rowB, rowA[1]); + MMA (&acc2, (vec_t) rowB, rowA[2]); + MMA (&acc3, (vec_t) rowB, rowA[3]); + MMA (&acc4, (vec_t) rowB, rowA1[0]); + MMA (&acc5, (vec_t) rowB, rowA1[1]); + MMA (&acc6, (vec_t) rowB, rowA1[2]); + MMA (&acc7, (vec_t) rowB, rowA1[3]); } if (k % 2 == 1) { @@ -688,9 +642,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 5; BO += k << 1; } - i = (m & 31) >> 4; - /* Loop for m >= 16. 
*/ - for (j = 0; j < i; j++) + if (m & 16) { IFLOAT *BO = B; v4sf_t *rowC; @@ -706,10 +658,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[l << 5]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); - MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); - MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + MMA (&acc0, (vec_t) rowB, rowA[0]); + MMA (&acc1, (vec_t) rowB, rowA[1]); + MMA (&acc2, (vec_t) rowB, rowA[2]); + MMA (&acc3, (vec_t) rowB, rowA[3]); } if (k % 2 == 1) { @@ -730,9 +682,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 4; BO += k << 1; } - i = (m & 15) >> 3; - /* Loop for m >= 8. */ - for (j = 0; j < i; j++) + if (m & 8) { IFLOAT *BO = B; v4sf_t *rowC; @@ -749,8 +699,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[l << 4]); - MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); - MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + MMA (&acc0, (vec_t) rowB, rowA[0]); + MMA (&acc1, (vec_t) rowB, rowA[1]); } if (k % 2 == 1) { @@ -767,9 +717,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 3; BO += k << 1; } - i = (m & 7) >> 2; - /* Loop for m >= 4. */ - for (j = 0; j < i; j++) + if (m & 4) { IFLOAT *BO = B; v4sf_t *rowC; @@ -785,7 +733,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, 0, 0, 0, 0 }; vec_t *rowA = (vec_t *) & (AO[l << 3]); - MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + MMA (&acc0, (vec_t) rowB, rowA[0]); } if (k % 2 == 1) { @@ -800,9 +748,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 2; BO += k << 1; } - i = (m & 3) >> 1; - /* Loop for m >= 2. 
*/ - for (j = 0; j < i; j++) + if (m & 2) { IFLOAT *BO = B; BLASLONG l = 0; @@ -828,9 +774,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 1; BO += k << 1; } - i = (m & 1) >> 0; - /* Loop for m = 1. */ - for (j = 0; j < i; j++) + if (m & 1) { IFLOAT *BO = B; BLASLONG l = 0; @@ -852,153 +796,126 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, } B += k << 1; } - N = (n & 1) >> 0; - /* Loop for n = 1. */ - for (i1 = 0; i1 < N; i1++) + if (n & 1) { - BLASLONG i; + BLASLONG j; FLOAT *CO; IFLOAT *AO; CO = C; C += ldc; AO = A; - i = m; /* Loop for m >= 16. */ - while (i >= 16) + for (j = 0; j < (m >> 4); j++) { IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + SET_ACC_ZERO4 (); BLASLONG l = 0; - v4sf_t t = { 0, 0, 0, 0 }; - v4sf_t t1 = { 0, 0, 0, 0 }; - v4sf_t t2 = { 0, 0, 0, 0 }; - v4sf_t t3 = { 0, 0, 0, 0 }; - for (l = 0; l < k; l++) + for (l = 0; l < k / 2; l++) { - v4sf_t rowB = - { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), - BF16TOF32 (BO[l]) - }; - v4sf_t rowA = - { BF16TOF32 (AO[l << 4]), BF16TOF32 (AO[(l << 4) + 1]), - BF16TOF32 (AO[(l << 4) + 2]), - BF16TOF32 (AO[(l << 4) + 3]) - }; - v4sf_t rowA1 = - { BF16TOF32 (AO[(l << 4) + 4]), BF16TOF32 (AO[(l << 4) + 5]), - BF16TOF32 (AO[(l << 4) + 6]), - BF16TOF32 (AO[(l << 4) + 7]) - }; - v4sf_t rowA2 = - { BF16TOF32 (AO[(l << 4) + 8]), BF16TOF32 (AO[(l << 4) + 9]), - BF16TOF32 (AO[(l << 4) + 10]), - BF16TOF32 (AO[(l << 4) + 11]) - }; - v4sf_t rowA3 = { BF16TOF32 (AO[(l << 4) + 12]), - BF16TOF32 (AO[(l << 4) + 13]), BF16TOF32 (AO[(l << 4) + 14]), - BF16TOF32 (AO[(l << 4) + 15]) - }; - t += rowA * rowB; - t1 += rowA1 * rowB; - t2 += rowA2 * rowB; - t3 += rowA3 * rowB; + vector short rowB = + { BO[l << 1], BO[(l << 1) + 1], 0, 0, 0, 0, 0, 0}; + vec_t *rowA = (vec_t *) & (AO[l << 5]); + MMA (&acc0, (vec_t) rowB, rowA[0]); + MMA (&acc1, (vec_t) rowB, rowA[1]); + MMA (&acc2, (vec_t) rowB, rowA[2]); + MMA 
(&acc3, (vec_t) rowB, rowA[3]); } - t = t * valpha; - t1 = t1 * valpha; - t2 = t2 * valpha; - t3 = t3 * valpha; - CO[0] += t[0]; - CO[1] += t[1]; - CO[2] += t[2]; - CO[3] += t[3]; - CO[4] += t1[0]; - CO[5] += t1[1]; - CO[6] += t1[2]; - CO[7] += t1[3]; - CO[8] += t2[0]; - CO[9] += t2[1]; - CO[10] += t2[2]; - CO[11] += t2[3]; - CO[12] += t3[0]; - CO[13] += t3[1]; - CO[14] += t3[2]; - CO[15] += t3[3]; + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 1; + vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[(l << 4)]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[2])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[2])); + MMA (&acc2, (vec_t) rowB, MERGE_HIGH (rowA[1], rowA[3])); + MMA (&acc3, (vec_t) rowB, MERGE_LOW (rowA[1], rowA[3])); + } + rowC = (v4sf_t *) &CO[0]; + __builtin_mma_disassemble_acc ((void *)result, &acc0); + rowC[0] += result[0] * alpha; + __builtin_mma_disassemble_acc ((void *)result, &acc1); + rowC[1] += result[0] * alpha; + __builtin_mma_disassemble_acc ((void *)result, &acc2); + rowC[2] += result[0] * alpha; + __builtin_mma_disassemble_acc ((void *)result, &acc3); + rowC[3] += result[0] * alpha; AO += k << 4; BO += k; CO += 16; - i -= 16; } /* Loop for m >= 8. 
*/ - while (i >= 8) + if (m & 8) { IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + __builtin_mma_xxsetaccz (&acc0); + __builtin_mma_xxsetaccz (&acc1); BLASLONG l = 0; - v4sf_t t = { 0, 0, 0, 0 }; - v4sf_t t1 = { 0, 0, 0, 0 }; - for (l = 0; l < k; l++) + for (l = 0; l < k / 2; l++) { - v4sf_t rowB = - { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), - BF16TOF32 (BO[l]) - }; - v4sf_t rowA = - { BF16TOF32 (AO[l << 3]), BF16TOF32 (AO[(l << 3) + 1]), - BF16TOF32 (AO[(l << 3) + 2]), - BF16TOF32 (AO[(l << 3) + 3]) - }; - v4sf_t rowA1 = - { BF16TOF32 (AO[(l << 3) + 4]), BF16TOF32 (AO[(l << 3) + 5]), - BF16TOF32 (AO[(l << 3) + 6]), - BF16TOF32 (AO[(l << 3) + 7]) - }; - t += rowA * rowB; - t1 += rowA1 * rowB; + vector short rowB = + { BO[l << 1], BO[(l << 1) + 1], 0, 0, 0, 0, 0, 0}; + vec_t *rowA = (vec_t *) & (AO[l << 4]); + MMA (&acc0, (vec_t) rowB, rowA[0]); + MMA (&acc1, (vec_t) rowB, rowA[1]); } - t = t * valpha; - t1 = t1 * valpha; - CO[0] += t[0]; - CO[1] += t[1]; - CO[2] += t[2]; - CO[3] += t[3]; - CO[4] += t1[0]; - CO[5] += t1[1]; - CO[6] += t1[2]; - CO[7] += t1[3]; + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 1; + vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[(l << 3)]); + MMA (&acc0, (vec_t) rowB, MERGE_HIGH (rowA[0], rowA[1])); + MMA (&acc1, (vec_t) rowB, MERGE_LOW (rowA[0], rowA[1])); + } + rowC = (v4sf_t *) &CO[0]; + __builtin_mma_disassemble_acc ((void *)result, &acc0); + rowC[0] += result[0] * alpha; + __builtin_mma_disassemble_acc ((void *)result, &acc1); + rowC[1] += result[0] * alpha; AO += k << 3; BO += k; CO += 8; - i -= 8; } /* Loop for m >= 4. 
*/ - while (i >= 4) + if (m & 4) { IFLOAT *BO = B; + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + __builtin_mma_xxsetaccz (&acc0); BLASLONG l = 0; - v4sf_t t = { 0, 0, 0, 0 }; - for (l = 0; l < k; l++) + for (l = 0; l < k / 2; l++) { - v4sf_t rowB = - { BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), BF16TOF32 (BO[l]), - BF16TOF32 (BO[l]) - }; - v4sf_t rowA = - { BF16TOF32 (AO[l << 2]), BF16TOF32 (AO[(l << 2) + 1]), - BF16TOF32 (AO[(l << 2) + 2]), - BF16TOF32 (AO[(l << 2) + 3]) - }; - t += rowA * rowB; + vector short rowB = + { BO[l << 1], BO[(l << 1) + 1], 0, 0, 0, 0, 0, 0}; + vec_t *rowA = (vec_t *) & (AO[l << 3]); + MMA (&acc0, (vec_t) rowB, rowA[0]); } - t = t * valpha; - CO[0] += t[0]; - CO[1] += t[1]; - CO[2] += t[2]; - CO[3] += t[3]; + if (k % 2 == 1) + { + if (k > 1) + l = (k / 2) << 1; + vector short rowB = { BO[l], 0, 0, 0, 0, 0, 0, 0 }; + vec_t *rowA = (vec_t *) & (AO[(l << 2)]); + MMA (&acc0, (vec_t) rowB, MERGE_ROW (rowA[0])); + } + rowC = (v4sf_t *) &CO[0]; + __builtin_mma_disassemble_acc ((void *)result, &acc0); + rowC[0] += result[0] * alpha; AO += k << 2; BO += k; CO += 4; - i -= 4; } /* Loop for m >= 2. */ - while (i >= 2) + if (m & 2) { IFLOAT *BO = B; BLASLONG l = 0; @@ -1018,10 +935,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, AO += k << 1; BO += k; CO += 2; - i -= 2; } /* Loop for m = 1. */ - while (i >= 1) + if (m & 1) { IFLOAT *BO = B; BLASLONG l = 0; @@ -1034,7 +950,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, BO += k; CO[0] += t * alpha; CO += 1; - i -= 1; } B += k; diff --git a/kernel/power/sbgemm_ncopy_16_power10.c b/kernel/power/sbgemm_ncopy_16_power10.c new file mode 100644 index 0000000000..c6b633011c --- /dev/null +++ b/kernel/power/sbgemm_ncopy_16_power10.c @@ -0,0 +1,437 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +/* Pack n vectors of a (each m elements long, lda apart) into b, in panels of 16, 8, 4, 2 and 1 vectors; always returns 0. */ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; + IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; + + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + /* Full panels of 16 vectors. */ + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset9 = aoffset8 + lda; + aoffset10 = aoffset9 + lda; + aoffset11 = aoffset10 + lda; + aoffset12 = aoffset11 + lda; + aoffset13 = aoffset12 + lda; + aoffset14 = aoffset13 + lda; + aoffset15 = aoffset14 + lda; + aoffset16 = aoffset15 + lda; + aoffset += 16 * lda; + /* Two consecutive elements of every vector per pass. */ + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + + ctemp17 = *(aoffset9 + 0); + ctemp18 = *(aoffset9 + 1); + ctemp19 = *(aoffset10 + 0); + ctemp20 =
*(aoffset10 + 1); + + ctemp21 = *(aoffset11 + 0); + ctemp22 = *(aoffset11 + 1); + ctemp23 = *(aoffset12 + 0); + ctemp24 = *(aoffset12 + 1); + + ctemp25 = *(aoffset13 + 0); + ctemp26 = *(aoffset13 + 1); + ctemp27 = *(aoffset14 + 0); + ctemp28 = *(aoffset14 + 1); + + ctemp29 = *(aoffset15 + 0); + ctemp30 = *(aoffset15 + 1); + ctemp31 = *(aoffset16 + 0); + ctemp32 = *(aoffset16 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + *(boffset + 16) = ctemp17; + *(boffset + 17) = ctemp18; + *(boffset + 18) = ctemp19; + *(boffset + 19) = ctemp20; + *(boffset + 20) = ctemp21; + *(boffset + 21) = ctemp22; + *(boffset + 22) = ctemp23; + *(boffset + 23) = ctemp24; + + *(boffset + 24) = ctemp25; + *(boffset + 25) = ctemp26; + *(boffset + 26) = ctemp27; + *(boffset + 27) = ctemp28; + *(boffset + 28) = ctemp29; + *(boffset + 29) = ctemp30; + *(boffset + 30) = ctemp31; + *(boffset + 31) = ctemp32; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + aoffset5 += 2; + aoffset6 += 2; + aoffset7 += 2; + aoffset8 += 2; + + aoffset9 += 2; + aoffset10 += 2; + aoffset11 += 2; + aoffset12 += 2; + aoffset13 += 2; + aoffset14 += 2; + aoffset15 += 2; + aoffset16 += 2; + boffset += 32; + + i --; + }while(i > 0); + } + /* Odd m: one last element from each of the 16 vectors. */ + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + ctemp17 = *(aoffset9 + 0); + ctemp19 = *(aoffset10 + 0); + ctemp21 = *(aoffset11 + 0); + ctemp23 =
*(aoffset12 + 0); + ctemp25 = *(aoffset13 + 0); + ctemp27 = *(aoffset14 + 0); + ctemp29 = *(aoffset15 + 0); + ctemp31 = *(aoffset16 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + *(boffset + 8) = ctemp17; + *(boffset + 9) = ctemp19; + *(boffset + 10) = ctemp21; + *(boffset + 11) = ctemp23; + *(boffset + 12) = ctemp25; + *(boffset + 13) = ctemp27; + *(boffset + 14) = ctemp29; + *(boffset + 15) = ctemp31; + + boffset += 16; + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + /* One panel of 8 vectors, same layout as above. */ + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + *(boffset + 8) = ctemp09; + *(boffset + 9) = ctemp10; + *(boffset + 10) = ctemp11; + *(boffset + 11) = ctemp12; + *(boffset + 12) = ctemp13; + *(boffset + 13) = ctemp14; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + aoffset5 += 2; + aoffset6 +=
2; + aoffset7 += 2; + aoffset8 += 2; + + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + ctemp09 = *(aoffset5 + 0); + ctemp11 = *(aoffset6 + 0); + ctemp13 = *(aoffset7 + 0); + ctemp15 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp11; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp15; + + boffset += 8; + } + } + /* One panel of 4 vectors, same layout as above. */ + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + *(boffset + 4) = ctemp05; + *(boffset + 5) = ctemp06; + *(boffset + 6) = ctemp07; + *(boffset + 7) = ctemp08; + + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + ctemp05 = *(aoffset3 + 0); + ctemp07 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp07; + boffset += 4; + } + } + /* One pair of vectors; here b receives aoffset1[0], aoffset2[0], aoffset1[1], aoffset2[1]. */ + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 += 2; +
aoffset2 += 2; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp03 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + boffset += 2; + } + } + /* Last single vector: its m elements are copied in order. */ + if (n & 1){ + aoffset1 = aoffset; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + // boffset += 1; + } + } + + return 0; +} diff --git a/kernel/power/sbgemm_ncopy_8_power10.c b/kernel/power/sbgemm_ncopy_8_power10.c new file mode 100644 index 0000000000..0e4a680fbc --- /dev/null +++ b/kernel/power/sbgemm_ncopy_8_power10.c @@ -0,0 +1,383 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED.
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +typedef IFLOAT vec_bf16 __attribute__ ((vector_size (16))); +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + /* Pack n vectors of a (each m elements long, lda apart) into b, in panels of 8, 4, 2 and 1 vectors; always returns 0. */ + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + IFLOAT *boffset; + vec_bf16 vtemp01, vtemp02, vtemp03, vtemp04; + vec_bf16 vtemp05, vtemp06, vtemp07, vtemp08; + vec_bf16 vtemp09, vtemp10, vtemp11, vtemp12; + vector char mask = + { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 }; + vector char mask1 = + { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 }; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17; + IFLOAT ctemp25; + IFLOAT ctemp33; + IFLOAT ctemp41; + IFLOAT ctemp49; + IFLOAT ctemp57; + + + aoffset = a; + boffset = b; + /* Full panels of 8 vectors. */ + j = (n >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; +
aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + /* Vector path: one 16-byte load from each of the 8 vectors per pass. */ + i = (m >> 3); + if (i > 0){ + do{ + vtemp01 = *(vec_bf16 *)(aoffset1); + vtemp02 = *(vec_bf16 *)(aoffset2); + vtemp03 = *(vec_bf16 *)(aoffset3); + vtemp04 = *(vec_bf16 *)(aoffset4); + vtemp05 = *(vec_bf16 *)(aoffset5); + vtemp06 = *(vec_bf16 *)(aoffset6); + vtemp07 = *(vec_bf16 *)(aoffset7); + vtemp08 = *(vec_bf16 *)(aoffset8); + /* mask selects bytes 0-7 of both inputs, alternating in 4-byte groups. */ + vtemp09 = vec_perm(vtemp01, vtemp02, mask); + vtemp10 = vec_perm(vtemp03, vtemp04, mask); + vtemp11 = vec_perm(vtemp05, vtemp06, mask); + vtemp12 = vec_perm(vtemp07, vtemp08, mask); + + *(vec_bf16 *)(boffset + 0) = vec_xxpermdi(vtemp09, vtemp10, 0); + *(vec_bf16 *)(boffset + 8) = vec_xxpermdi(vtemp11, vtemp12, 0); + *(vec_bf16 *)(boffset + 16) = vec_xxpermdi(vtemp09, vtemp10, 3); + *(vec_bf16 *)(boffset + 24) = vec_xxpermdi(vtemp11, vtemp12, 3); + /* mask1 is the same shuffle applied to bytes 8-15 of both inputs. */ + vtemp09 = vec_perm(vtemp01, vtemp02, mask1); + vtemp10 = vec_perm(vtemp03, vtemp04, mask1); + vtemp11 = vec_perm(vtemp05, vtemp06, mask1); + vtemp12 = vec_perm(vtemp07, vtemp08, mask1); + + *(vec_bf16 *)(boffset + 32) = vec_xxpermdi(vtemp09, vtemp10, 0); + *(vec_bf16 *)(boffset + 40) = vec_xxpermdi(vtemp11, vtemp12, 0); + *(vec_bf16 *)(boffset + 48) = vec_xxpermdi(vtemp09, vtemp10, 3); + *(vec_bf16 *)(boffset + 56) = vec_xxpermdi(vtemp11, vtemp12, 3); + + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + aoffset5 += 8; + aoffset6 += 8; + aoffset7 += 8; + aoffset8 += 8; + boffset += 64; + i --; + }while(i > 0); + } + /* Leftover elements (m & 7), two per vector at a time. */ + i = (m & 7); + if (i >= 2){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp09 = *(aoffset1 + 1); + ctemp17 = *(aoffset2 + 0); + ctemp25 = *(aoffset2 + 1); + ctemp33 = *(aoffset3 + 0); + ctemp41 = *(aoffset3 + 1); + ctemp49 = *(aoffset4 + 0); + ctemp57 = *(aoffset4 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3)
= ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + + ctemp01 = *(aoffset5 + 0); + ctemp09 = *(aoffset5 + 1); + ctemp17 = *(aoffset6 + 0); + ctemp25 = *(aoffset6 + 1); + ctemp33 = *(aoffset7 + 0); + ctemp41 = *(aoffset7 + 1); + ctemp49 = *(aoffset8 + 0); + ctemp57 = *(aoffset8 + 1); + *(boffset + 8) = ctemp01; + *(boffset + 9) = ctemp09; + *(boffset + 10) = ctemp17; + *(boffset + 11) = ctemp25; + *(boffset + 12) = ctemp33; + *(boffset + 13) = ctemp41; + *(boffset + 14) = ctemp49; + *(boffset + 15) = ctemp57; + + aoffset5 += 2; + aoffset6 += 2; + aoffset7 += 2; + aoffset8 += 2; + + boffset += 16; + i -= 2; + }while(i > 1); + } + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp09 = *(aoffset2 + 0); + ctemp17 = *(aoffset3 + 0); + ctemp25 = *(aoffset4 + 0); + ctemp33 = *(aoffset5 + 0); + ctemp41 = *(aoffset6 + 0); + ctemp49 = *(aoffset7 + 0); + ctemp57 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) = ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + aoffset5 ++; + aoffset6 ++; + aoffset7 ++; + aoffset8 ++; + + boffset += 8; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + /* One panel of 4 vectors (scalar copies only). */ + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + + ctemp13 =
*(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp05; + *(boffset + 3) = ctemp06; + + *(boffset + 4) = ctemp09; + *(boffset + 5) = ctemp10; + *(boffset + 6) = ctemp13; + *(boffset + 7) = ctemp14; + + *(boffset + 8) = ctemp03; + *(boffset + 9) = ctemp04; + *(boffset + 10) = ctemp07; + *(boffset + 11) = ctemp08; + + *(boffset + 12) = ctemp11; + *(boffset + 13) = ctemp12; + *(boffset + 14) = ctemp15; + *(boffset + 15) = ctemp16; + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + boffset += 16; + i --; + }while(i > 0); + } + /* Leftover elements (m & 3), two per vector at a time. */ + i = (m & 3); + if (i >= 2){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp09 = *(aoffset1 + 1); + ctemp17 = *(aoffset2 + 0); + ctemp25 = *(aoffset2 + 1); + ctemp33 = *(aoffset3 + 0); + ctemp41 = *(aoffset3 + 1); + ctemp49 = *(aoffset4 + 0); + ctemp57 = *(aoffset4 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) = ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + aoffset1 += 2; + aoffset2 += 2; + aoffset3 += 2; + aoffset4 += 2; + + boffset += 8; + i -= 2; + }while(i > 1); + } + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + ctemp03 = *(aoffset3 + 0); + ctemp04 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + + boffset += 4; + } + } + /* One pair of vectors; here b receives aoffset1[0], aoffset2[0], aoffset1[1], aoffset2[1]. */ + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp03; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp04; + + aoffset1 +=
2; + aoffset2 += 2; + boffset += 4; + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 ++; + aoffset2 ++; + boffset += 2; + } + } /* end of if (n & 2) */ + /* Last single vector: its m elements are copied one by one. */ + if (n & 1){ + aoffset1 = aoffset; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + + aoffset1 ++; + boffset ++; + i --; + }while(i > 0); + } + + } /* end of if (n & 1) */ + + return 0; +} diff --git a/kernel/power/sbgemm_tcopy_16_power10.c b/kernel/power/sbgemm_tcopy_16_power10.c new file mode 100644 index 0000000000..120c5ab7cb --- /dev/null +++ b/kernel/power/sbgemm_tcopy_16_power10.c @@ -0,0 +1,244 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED.
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" +typedef IFLOAT vec_bf16 __attribute__ ((vector_size (16))); +/* Pack m rows of a (lda apart, n elements each) into b, in panels of 16, 8, 4, 2 and 1 elements per row; always returns 0. */ +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2; + IFLOAT *boffset; + + vec_bf16 vtemp01, vtemp02, vtemp03, vtemp04; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + + aoffset = a; + boffset = b; + +#if 0 + fprintf(stderr, "m = %d n = %d\n", m, n); +#endif + /* Full panels of 16 elements per row. */ + j = (n >> 4); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 16; + /* Two rows per pass, combined with vec_mergeh/vec_mergel. */ + i = (m >> 1); + if (i > 0){ + do{ + vtemp01 = *(vec_bf16 *)(aoffset1); + vtemp02 = *(vec_bf16 *)(aoffset1+8); + vtemp03 = *(vec_bf16 *)(aoffset2); + vtemp04 = *(vec_bf16 *)(aoffset2+8); + *(vec_bf16 *)(boffset + 0) = vec_mergeh(vtemp01, vtemp03); + *(vec_bf16 *)(boffset + 8) = vec_mergel(vtemp01, vtemp03); + *(vec_bf16 *)(boffset + 16) = vec_mergeh(vtemp02, vtemp04); + *(vec_bf16 *)(boffset + 24) = vec_mergel(vtemp02, vtemp04); + aoffset1 += 2 * lda; +
aoffset2 += 2 * lda; + boffset += 32; + + i --; + }while(i > 0); + } + /* Odd m: the last row's 16 elements are copied unchanged. */ + if (m & 1){ + vtemp01 = *(vec_bf16 *)(aoffset1); + vtemp02 = *(vec_bf16 *)(aoffset1+8); + *(vec_bf16 *)(boffset + 0) = vtemp01; + *(vec_bf16 *)(boffset + 8) = vtemp02; + boffset += 16; + } + + j--; + }while(j > 0); + } /* end of if(j > 0) */ + /* One panel of 8 elements per row. */ + if (n & 8){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 8; + + i = (m >> 1); + if (i > 0){ + do{ + vtemp01 = *(vec_bf16 *)(aoffset1); + vtemp03 = *(vec_bf16 *)(aoffset2); + *(vec_bf16 *)(boffset + 0) = vec_mergeh(vtemp01, vtemp03); + *(vec_bf16 *)(boffset + 8) = vec_mergel(vtemp01, vtemp03); + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 16; + + i --; + }while(i > 0); + } + + if (m & 1){ + vtemp01 = *(vec_bf16 *)(aoffset1); + *(vec_bf16 *)(boffset + 0) = vtemp01; + boffset += 8; + } + } + /* One panel of 4 elements per row; the two rows are interleaved element by element. */ + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 4; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp05; + *(boffset + 2) = ctemp02; + *(boffset + 3) = ctemp06; + *(boffset + 4) = ctemp03; + *(boffset + 5) = ctemp07; + *(boffset + 6) = ctemp04; + *(boffset + 7) = ctemp08; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 8; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + boffset += 4; + } + } + /* One panel of 2 elements per row; here the two rows are stored one after the other, not interleaved. */ + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + aoffset += 2; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset2 + 0); +
ctemp04 = *(aoffset2 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 4; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + boffset += 2; + } + } + /* Last single element of each row, two rows per pass. */ + if (n & 1){ + aoffset1 = aoffset; + aoffset2 = aoffset + lda; + + i = (m >> 1); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 += 2 * lda; + aoffset2 += 2 * lda; + boffset += 2; + + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + *(boffset + 0) = ctemp01; + // boffset += 1; + } + } + + return 0; +} diff --git a/kernel/power/sbgemm_tcopy_8_power10.c b/kernel/power/sbgemm_tcopy_8_power10.c new file mode 100644 index 0000000000..aceb0c9d80 --- /dev/null +++ b/kernel/power/sbgemm_tcopy_8_power10.c @@ -0,0 +1,659 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution.
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include + +typedef IFLOAT vec_bf16 __attribute__ ((vector_size (16))); + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; + vec_bf16 vtemp01, vtemp02, vtemp03, vtemp04; + vec_bf16 vtemp05, vtemp06, vtemp07, vtemp08; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; + IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; + IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; + IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; + IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; + IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; + IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; + + aoffset = a; + boffset = b; + +#if 0 + fprintf(stderr, "M = %d N = %d\n", m, n); +#endif + + boffset2 = b + m * (n & ~7); + boffset3 = b + m * (n & ~3); + boffset4 = b + m * (n & ~1); + + j = (m >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + boffset1 = boffset; + boffset += 64; + + i = (n >> 3); + if (i > 0){ + do{ + vtemp01 = *(vec_bf16 *)(aoffset1); + vtemp02 = *(vec_bf16 *)(aoffset2); + vtemp03 = *(vec_bf16 *)(aoffset3); + vtemp04 = *(vec_bf16 *)(aoffset4); + vtemp05 = *(vec_bf16 *)(aoffset5); + vtemp06 = *(vec_bf16 *)(aoffset6); + vtemp07 = *(vec_bf16 *)(aoffset7); + vtemp08 = *(vec_bf16 *)(aoffset8); + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + aoffset5 += 8; + aoffset6 += 8; + aoffset7 += 8; + aoffset8 += 8; + + *(vec_bf16 *)(boffset1 + 0) = vec_mergeh(vtemp01, vtemp02); + *(vec_bf16 *)(boffset1 + 8) 
= vec_mergel(vtemp01, vtemp02); + *(vec_bf16 *)(boffset1 + 16) = vec_mergeh(vtemp03, vtemp04); + *(vec_bf16 *)(boffset1 + 24) = vec_mergel(vtemp03, vtemp04); + *(vec_bf16 *)(boffset1 + 32) = vec_mergeh(vtemp05, vtemp06); + *(vec_bf16 *)(boffset1 + 40) = vec_mergel(vtemp05, vtemp06); + *(vec_bf16 *)(boffset1 + 48) = vec_mergeh(vtemp07, vtemp08); + *(vec_bf16 *)(boffset1 + 56) = vec_mergel(vtemp07, vtemp08); + + boffset1 += m * 8; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 += 4; + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + aoffset3 += 4; + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + aoffset4 += 4; + + ctemp17 = *(aoffset5 + 0); + ctemp18 = *(aoffset5 + 1); + ctemp19 = *(aoffset5 + 2); + ctemp20 = *(aoffset5 + 3); + aoffset5 += 4; + + ctemp21 = *(aoffset6 + 0); + ctemp22 = *(aoffset6 + 1); + ctemp23 = *(aoffset6 + 2); + ctemp24 = *(aoffset6 + 3); + aoffset6 += 4; + + ctemp25 = *(aoffset7 + 0); + ctemp26 = *(aoffset7 + 1); + ctemp27 = *(aoffset7 + 2); + ctemp28 = *(aoffset7 + 3); + aoffset7 += 4; + + ctemp29 = *(aoffset8 + 0); + ctemp30 = *(aoffset8 + 1); + ctemp31 = *(aoffset8 + 2); + ctemp32 = *(aoffset8 + 3); + aoffset8 += 4; + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp05; + *(boffset2 + 2) = ctemp02; + *(boffset2 + 3) = ctemp06; + *(boffset2 + 4) = ctemp03; + *(boffset2 + 5) = ctemp07; + *(boffset2 + 6) = ctemp04; + *(boffset2 + 7) = ctemp08; + + *(boffset2 + 8) = ctemp09; + *(boffset2 + 9) = ctemp13; + *(boffset2 + 10) = ctemp10; + *(boffset2 + 11) = ctemp14; + *(boffset2 + 12) = ctemp11; + *(boffset2 + 13) = ctemp15; + *(boffset2 + 14) = ctemp12; 
+ *(boffset2 + 15) = ctemp16; + + *(boffset2 + 16) = ctemp17; + *(boffset2 + 17) = ctemp21; + *(boffset2 + 18) = ctemp18; + *(boffset2 + 19) = ctemp22; + *(boffset2 + 20) = ctemp19; + *(boffset2 + 21) = ctemp23; + *(boffset2 + 22) = ctemp20; + *(boffset2 + 23) = ctemp24; + + *(boffset2 + 24) = ctemp25; + *(boffset2 + 25) = ctemp29; + *(boffset2 + 26) = ctemp26; + *(boffset2 + 27) = ctemp30; + *(boffset2 + 28) = ctemp27; + *(boffset2 + 29) = ctemp31; + *(boffset2 + 30) = ctemp28; + *(boffset2 + 31) = ctemp32; + + boffset2 += 32; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + aoffset3 += 2; + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + aoffset4 += 2; + + ctemp09 = *(aoffset5 + 0); + ctemp10 = *(aoffset5 + 1); + aoffset5 += 2; + + ctemp11 = *(aoffset6 + 0); + ctemp12 = *(aoffset6 + 1); + aoffset6 += 2; + + ctemp13 = *(aoffset7 + 0); + ctemp14 = *(aoffset7 + 1); + aoffset7 += 2; + + ctemp15 = *(aoffset8 + 0); + ctemp16 = *(aoffset8 + 1); + aoffset8 += 2; + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + *(boffset3 + 2) = ctemp03; + *(boffset3 + 3) = ctemp04; + *(boffset3 + 4) = ctemp05; + *(boffset3 + 5) = ctemp06; + *(boffset3 + 6) = ctemp07; + *(boffset3 + 7) = ctemp08; + *(boffset3 + 8) = ctemp09; + *(boffset3 + 9) = ctemp10; + *(boffset3 + 10) = ctemp11; + *(boffset3 + 11) = ctemp12; + *(boffset3 + 12) = ctemp13; + *(boffset3 + 13) = ctemp14; + *(boffset3 + 14) = ctemp15; + *(boffset3 + 15) = ctemp16; + boffset3 += 16; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + ctemp05 = *(aoffset5 + 0); + aoffset5 ++; + ctemp06 = *(aoffset6 + 0); + aoffset6 ++; + ctemp07 = *(aoffset7 + 0); + aoffset7 ++; + ctemp08 = 
*(aoffset8 + 0); + aoffset8 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + *(boffset4 + 2) = ctemp03; + *(boffset4 + 3) = ctemp04; + *(boffset4 + 4) = ctemp05; + *(boffset4 + 5) = ctemp06; + *(boffset4 + 6) = ctemp07; + *(boffset4 + 7) = ctemp08; + boffset4 += 8; + } + + j--; + }while(j > 0); + } + + if (m & 4){ + + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + boffset1 = boffset; + boffset += 32; + + i = (n >> 3); + if (i > 0){ + + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + aoffset2 += 8; + + ctemp17 = *(aoffset3 + 0); + ctemp18 = *(aoffset3 + 1); + ctemp19 = *(aoffset3 + 2); + ctemp20 = *(aoffset3 + 3); + ctemp21 = *(aoffset3 + 4); + ctemp22 = *(aoffset3 + 5); + ctemp23 = *(aoffset3 + 6); + ctemp24 = *(aoffset3 + 7); + aoffset3 += 8; + + ctemp25 = *(aoffset4 + 0); + ctemp26 = *(aoffset4 + 1); + ctemp27 = *(aoffset4 + 2); + ctemp28 = *(aoffset4 + 3); + ctemp29 = *(aoffset4 + 4); + ctemp30 = *(aoffset4 + 5); + ctemp31 = *(aoffset4 + 6); + ctemp32 = *(aoffset4 + 7); + aoffset4 += 8; + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp09; + *(boffset1 + 2) = ctemp02; + *(boffset1 + 3) = ctemp10; + *(boffset1 + 4) = ctemp03; + *(boffset1 + 5) = ctemp11; + *(boffset1 + 6) = ctemp04; + *(boffset1 + 7) = ctemp12; + + *(boffset1 + 8) = ctemp05; + *(boffset1 + 9) = ctemp13; + *(boffset1 + 10) = ctemp06; + *(boffset1 + 11) = ctemp14; + *(boffset1 + 12) = ctemp07; + *(boffset1 + 13) = ctemp15; + *(boffset1 + 14) = ctemp08; + 
*(boffset1 + 15) = ctemp16; + + *(boffset1 + 16) = ctemp17; + *(boffset1 + 17) = ctemp25; + *(boffset1 + 18) = ctemp18; + *(boffset1 + 19) = ctemp26; + *(boffset1 + 20) = ctemp19; + *(boffset1 + 21) = ctemp27; + *(boffset1 + 22) = ctemp20; + *(boffset1 + 23) = ctemp28; + + *(boffset1 + 24) = ctemp21; + *(boffset1 + 25) = ctemp29; + *(boffset1 + 26) = ctemp22; + *(boffset1 + 27) = ctemp30; + *(boffset1 + 28) = ctemp23; + *(boffset1 + 29) = ctemp31; + *(boffset1 + 30) = ctemp24; + *(boffset1 + 31) = ctemp32; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4) { + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 += 4; + + ctemp09 = *(aoffset3 + 0); + ctemp10 = *(aoffset3 + 1); + ctemp11 = *(aoffset3 + 2); + ctemp12 = *(aoffset3 + 3); + aoffset3 += 4; + + ctemp13 = *(aoffset4 + 0); + ctemp14 = *(aoffset4 + 1); + ctemp15 = *(aoffset4 + 2); + ctemp16 = *(aoffset4 + 3); + aoffset4 += 4; + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp05; + *(boffset2 + 2) = ctemp02; + *(boffset2 + 3) = ctemp06; + *(boffset2 + 4) = ctemp03; + *(boffset2 + 5) = ctemp07; + *(boffset2 + 6) = ctemp04; + *(boffset2 + 7) = ctemp08; + + *(boffset2 + 8) = ctemp09; + *(boffset2 + 9) = ctemp13; + *(boffset2 + 10) = ctemp10; + *(boffset2 + 11) = ctemp14; + *(boffset2 + 12) = ctemp11; + *(boffset2 + 13) = ctemp15; + *(boffset2 + 14) = ctemp12; + *(boffset2 + 15) = ctemp16; + boffset2 += 16; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + ctemp05 = *(aoffset3 + 0); + ctemp06 = *(aoffset3 + 1); + aoffset3 += 2; + + ctemp07 = *(aoffset4 + 0); + ctemp08 = *(aoffset4 + 1); + aoffset4 += 2; + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + 
*(boffset3 + 2) = ctemp03; + *(boffset3 + 3) = ctemp04; + *(boffset3 + 4) = ctemp05; + *(boffset3 + 5) = ctemp06; + *(boffset3 + 6) = ctemp07; + *(boffset3 + 7) = ctemp08; + boffset3 += 8; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + ctemp03 = *(aoffset3 + 0); + aoffset3 ++; + ctemp04 = *(aoffset4 + 0); + aoffset4 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + *(boffset4 + 2) = ctemp03; + *(boffset4 + 3) = ctemp04; + boffset4 += 4; + } + } + + if (m & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + boffset1 = boffset; + boffset += 16; + + i = (n >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + ctemp09 = *(aoffset2 + 0); + ctemp10 = *(aoffset2 + 1); + ctemp11 = *(aoffset2 + 2); + ctemp12 = *(aoffset2 + 3); + ctemp13 = *(aoffset2 + 4); + ctemp14 = *(aoffset2 + 5); + ctemp15 = *(aoffset2 + 6); + ctemp16 = *(aoffset2 + 7); + aoffset2 += 8; + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp09; + *(boffset1 + 2) = ctemp02; + *(boffset1 + 3) = ctemp10; + *(boffset1 + 4) = ctemp03; + *(boffset1 + 5) = ctemp11; + *(boffset1 + 6) = ctemp04; + *(boffset1 + 7) = ctemp12; + + *(boffset1 + 8) = ctemp05; + *(boffset1 + 9) = ctemp13; + *(boffset1 + 10) = ctemp06; + *(boffset1 + 11) = ctemp14; + *(boffset1 + 12) = ctemp07; + *(boffset1 + 13) = ctemp15; + *(boffset1 + 14) = ctemp08; + *(boffset1 + 15) = ctemp16; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + ctemp05 = *(aoffset2 + 0); + ctemp06 = *(aoffset2 + 1); + ctemp07 = *(aoffset2 + 2); + ctemp08 = *(aoffset2 + 3); + aoffset2 
+= 4; + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp05; + *(boffset2 + 2) = ctemp02; + *(boffset2 + 3) = ctemp06; + *(boffset2 + 4) = ctemp03; + *(boffset2 + 5) = ctemp07; + *(boffset2 + 6) = ctemp04; + *(boffset2 + 7) = ctemp08; + boffset2 += 8; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + ctemp03 = *(aoffset2 + 0); + ctemp04 = *(aoffset2 + 1); + aoffset2 += 2; + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + *(boffset3 + 2) = ctemp03; + *(boffset3 + 3) = ctemp04; + boffset3 += 4; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + ctemp02 = *(aoffset2 + 0); + aoffset2 ++; + + *(boffset4 + 0) = ctemp01; + *(boffset4 + 1) = ctemp02; + boffset4 += 2; + } + } + + if (m & 1){ + aoffset1 = aoffset; + // aoffset += lda; + + boffset1 = boffset; + // boffset += 8; + + i = (n >> 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + ctemp05 = *(aoffset1 + 4); + ctemp06 = *(aoffset1 + 5); + ctemp07 = *(aoffset1 + 6); + ctemp08 = *(aoffset1 + 7); + aoffset1 += 8; + + *(boffset1 + 0) = ctemp01; + *(boffset1 + 1) = ctemp02; + *(boffset1 + 2) = ctemp03; + *(boffset1 + 3) = ctemp04; + *(boffset1 + 4) = ctemp05; + *(boffset1 + 5) = ctemp06; + *(boffset1 + 6) = ctemp07; + *(boffset1 + 7) = ctemp08; + + boffset1 += 8 * m; + i --; + }while(i > 0); + } + + if (n & 4){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + ctemp03 = *(aoffset1 + 2); + ctemp04 = *(aoffset1 + 3); + aoffset1 += 4; + + *(boffset2 + 0) = ctemp01; + *(boffset2 + 1) = ctemp02; + *(boffset2 + 2) = ctemp03; + *(boffset2 + 3) = ctemp04; + // boffset2 += 4; + } + + if (n & 2){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset1 + 1); + aoffset1 += 2; + + *(boffset3 + 0) = ctemp01; + *(boffset3 + 1) = ctemp02; + // boffset3 += 2; + } + + if (n & 1){ + ctemp01 = *(aoffset1 + 0); + aoffset1 ++; + *(boffset4 + 0) = ctemp01; + boffset4 
++; + } + } + + return 0; +} From 2a329baa81d60912328962d1cea98bfcd3eb37d7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 13 Oct 2020 23:21:38 +0200 Subject: [PATCH 329/349] Add the BFLOAT16 functions to cmake builds From 75e3a92df6b4100c05d034c85a6076678b5cc6af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Oct 2020 01:01:58 +0200 Subject: [PATCH 330/349] Add express -mavx and -msse options (and fix a stray = for cooperlake) --- cmake/cc.cmake | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index c490dd9abf..9f5cc1bf74 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -109,10 +109,25 @@ if (${CORE} STREQUAL "COOPERLAKE") if (NOT NO_AVX512) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) - set (CCOMMON_OPT = "${CCOMMON_OPT} -march=cooperlake") + set (CCOMMON_OPT "${CCOMMON_OPT} -march=cooperlake") else () set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") endif() endif () endif () endif () + +if (NOT DYNAMIC_ARCH) + if (HAVE_AVX2) + set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2") + endif () + if (HAVE_AVX) + set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") + endif () + if (HAVE_SSE3) + set (CCOMMON_OPT "${CCOMMON_OPT} -msse3") + endif () + if (HAVE_SSSE3) + set (CCOMMON_OPT "${CCOMMON_OPT} -mssse3") + endif () +endif() From c1f4f5d4e790ec92effe8f0984e85706553f4b3f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Oct 2020 01:08:50 +0200 Subject: [PATCH 331/349] Replace Makefile with simplified version again --- test/Makefile | 138 ++++++++++++++------------------------------------ 1 file changed, 39 insertions(+), 99 deletions(-) diff --git a/test/Makefile b/test/Makefile index 06fb7fe86f..eb3bc34471 100644 --- a/test/Makefile +++ b/test/Makefile @@ -7,40 +7,22 @@ all :: else all :: level1 level2 level3 endif -ifeq 
($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1x1x1) -level1: sblat1 dblat1 cblat1 zblat1 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1x1x1) -level1: dblat1 cblat1 zblat1 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xx1x1) -level1: sblat1 cblat1 zblat1 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x1) -level1: cblat1 zblat1 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x) -level1: cblat1 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xxx1) -level1: zblat1 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx1) -level1: sblat1 zblat1 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx1) -level1: sblat1 dblat1 zblat1 + +ifeq ($(BUILD_SINGLE),1) +S1=sblat1 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx) -level1: sblat1 dblat1 +ifeq ($(BUILD_DOUBLE),1) +D1=dblat1 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx) -level1: sblat1 +ifeq ($(BUILD_COMPLEX),1) +C1=cblat1 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1xx) -level1: dblat1 +ifeq ($(BUILD_COMPLEX16),1) +Z1=zblat1 endif +level1: $(S1) $(D1) $(C1) $(Z1) + ifndef CROSS ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1 @@ -85,41 +67,22 @@ endif endif endif -#level2: sblat2 dblat2 cblat2 zblat2 -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1x1x1) -level2: sblat2 dblat2 cblat2 zblat2 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1x1x1) -level2: dblat2 cblat2 zblat2 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xx1x1) -level2: sblat2 cblat2 zblat2 -endif -ifeq 
($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x1) -level2: cblat2 zblat2 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x) -level2: cblat2 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xxx1) -level2: zblat2 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx1) -level2: sblat2 zblat2 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx1) -level2: sblat2 dblat2 zblat2 +ifeq ($(BUILD_SINGLE),1) +S2=sblat2 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx) -level2: sblat2 dblat2 +ifeq ($(BUILD_DOUBLE),1) +D2=dblat2 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx) -level2: sblat2 +ifeq ($(BUILD_COMPLEX),1) +C2=cblat2 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1xx) -level2: dblat2 +ifeq ($(BUILD_COMPLEX16),1) +Z2=zblat2 endif +level2: $(S2) $(D2) $(C2) $(Z2) + + ifndef CROSS rm -f ?BLAT2.SUMM ifeq ($(BUILD_SINGLE),1) @@ -178,53 +141,30 @@ endif endif endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1x1x1) -level3: sblat3 dblat3 cblat3 zblat3 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1x1x1) -level3: dblat3 cblat3 zblat3 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xx1x1) -level3: sblat3 cblat3 zblat3 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x1) -level3: cblat3 zblat3 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xx1x) -level3: cblat3 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),xxx1) -level3: zblat3 -endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx1) -level3: sblat3 zblat3 +ifeq ($(BUILD_BFLOAT16),1) +B3= test_sbgemm endif -ifeq 
($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx1) -level3: sblat3 dblat3 zblat3 +ifeq ($(BUILD_SINGLE),1) +S3=sblat3 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1x1xx) -level3: sblat3 dblat3 +ifeq ($(BUILD_DOUBLE),1) +D3=dblat3 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),1xxx) -level3: sblat3 +ifeq ($(BUILD_COMPLEX),1) +C3=cblat3 endif -ifeq ($(BUILD_SINGLE)x$(BUILD_DOUBLE)x$(BUILD_COMPLEX)x$(BUILD_COMPLEX16),x1xx) -level3: dblat3 +ifeq ($(BUILD_COMPLEX16),1) +Z3=zblat3 endif +level3: $(B3) $(S3) $(D3) $(C3) $(Z3) -#ifeq ($(BUILD_BFLOAT16),1) -#level3 : test_sbgemm sblat3 dblat3 cblat3 zblat3 -#else -#level3 : sblat3 dblat3 cblat3 zblat3 -#endif - ifndef CROSS rm -f ?BLAT3.SUMM ifeq ($(BUILD_BFLOAT16),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SHBLAT3.SUMM - @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SBBLAT3.SUMM + @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat @@ -246,8 +186,8 @@ ifdef SMP rm -f ?BLAT3.SUMM ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_BFLOAT16),1) - OMP_NUM_THREADS=2 ./test_sbgemm > SHBLAT3.SUMM - @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 + OMP_NUM_THREADS=2 ./test_sbgemm > SBBLAT3.SUMM + @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @@ -267,8 +207,8 @@ ifeq ($(BUILD_COMPLEX16),1) endif else ifeq ($(BUILD_BFLOAT16),1) - OPENBLAS_NUM_THREADS=2 ./test_sbgemm > SHBLAT3.SUMM - @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./test_sbgemm > SBBLAT3.SUMM + @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_SINGLE),1) OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat From 
c9c3ae07afaf7833f14025164360da1efe3eb4df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Oct 2020 18:10:45 +0200 Subject: [PATCH 332/349] Add double precision operations --- kernel/simd/intrin_sse.h | 48 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 9de7e1b278..7449a5a0b7 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -3,25 +3,59 @@ /*************************** * Data Type ***************************/ +#ifdef DOUBLE +typedef __m128d v_f32; +#else typedef __m128 v_f32; +#endif + #define v_nlanes_f32 4 /*************************** * Arithmetic ***************************/ +#ifdef DOUBLE +#define v_add_f32 _mm_add_pd +#define v_mul_f32 _mm_mul_pd +#else #define v_add_f32 _mm_add_ps #define v_mul_f32 _mm_mul_ps +#endif #ifdef HAVE_FMA3 // multiply and add, a*b + c - #define v_muladd_f32 _mm_fmadd_ps +#ifdef DOUBLE + #define v_muladd_f32 _mm_fmadd_pd +#else + #define v_muladd_f32 _mm_fmadd_ps +#endif #elif defined(HAVE_FMA4) // multiply and add, a*b + c - #define v_muladd_f32 _mm_macc_ps + #ifdef DOUBLE + #define v_muladd_f32 _mm_macc_pd + #else + #define v_muladd_f32 _mm_macc_ps + #endif #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } #endif // HAVE_FMA3 +// Horizontal add: Calculates the sum of all vector elements. +#ifdef DOUBLE +BLAS_FINLINE double v_sum_f32(__m128d a) +{ +#ifdef HAVE_SSE3 + __m128d sum_halves = _mm_hadd_pd(a, a); + return _mm_cvtsd_f64(_mm_hadd_pd(sum_halves, sum_halves)); +#else + __m128d t1 = _mm_movehl_pd(a, a); + __m128d t2 = _mm_add_pd(a, t1); + __m128d t3 = _mm_shuffle_pd(t2, t2, 1); + __m128d t4 = _mm_add_ss(t2, t3); + return _mm_cvtsd_f64(t4); +#endif +} +#else // Horizontal add: Calculates the sum of all vector elements. 
BLAS_FINLINE float v_sum_f32(__m128 a) { @@ -36,11 +70,19 @@ BLAS_FINLINE float v_sum_f32(__m128 a) return _mm_cvtss_f32(t4); #endif } +#endif /*************************** * memory ***************************/ // unaligned load +#ifdef DOUBLE +#define v_loadu_f32 _mm_loadu_pd +#define v_storeu_f32 _mm_storeu_pd +#define v_setall_f32(VAL) _mm_set1_pd(VAL) +#define v_zero_f32 _mm_setzero_pd +#else #define v_loadu_f32 _mm_loadu_ps #define v_storeu_f32 _mm_storeu_ps #define v_setall_f32(VAL) _mm_set1_ps(VAL) -#define v_zero_f32 _mm_setzero_ps \ No newline at end of file +#define v_zero_f32 _mm_setzero_ps +#endif From ca160bb4400a298f10ac358dce328eabb8c49a70 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Oct 2020 19:18:07 +0200 Subject: [PATCH 333/349] Add -msse4.1 when SSE4.1 is supported --- Makefile.x86_64 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 8a3fc4eaea..27eb571eea 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -16,6 +16,10 @@ ifdef HAVE_SSSE3 CCOMMON_OPT += -mssse3 FCOMMON_OPT += -mssse3 endif +ifdef HAVE_SSE4_1 +CCOMMON_OPT += -msse4.1 +FCOMMON_OPT += -msse4.1 +endif endif endif From ebf0470fc25fd902a923d743977804ae672d4d20 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 14 Oct 2020 20:34:33 +0200 Subject: [PATCH 334/349] add sse4.1 for DYNAMIC_ARCH kernels --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index c95c15f56b..abe2e08d67 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -45,7 +45,7 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) - override CFLAGS += -msse3 -mssse3 + override CFLAGS += -msse3 -mssse3 -msse4.1 endif ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) From 
bfdf4b56dac690cdb03ea06b362cc178f4228d1a Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 15 Oct 2020 10:29:42 +0800 Subject: [PATCH 335/349] Add double precision universal intrinsics for X86/ARM --- kernel/arm/sum.c | 21 +++++++++++++++++++++ kernel/simd/intrin_avx.h | 21 ++++++++++++++++++++- kernel/simd/intrin_avx512.h | 21 ++++++++++++++++++++- kernel/simd/intrin_neon.h | 28 +++++++++++++++++++++++++++- kernel/simd/intrin_sse.h | 23 ++++++++++++++++++++++- kernel/x86_64/daxpy.c | 10 ++++++++++ 6 files changed, 120 insertions(+), 4 deletions(-) diff --git a/kernel/arm/sum.c b/kernel/arm/sum.c index d4b3fbc839..63584b95c1 100644 --- a/kernel/arm/sum.c +++ b/kernel/arm/sum.c @@ -43,6 +43,26 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (inc_x == 1) { #if V_SIMD +#ifdef DOUBLE + const int vstep = v_nlanes_f64; + const int unrollx2 = n & (-vstep * 2); + const int unrollx = n & -vstep; + v_f64 vsum0 = v_zero_f64(); + v_f64 vsum1 = v_zero_f64(); + while (i < unrollx2) + { + vsum0 = v_add_f64(vsum0, v_loadu_f64(x)); + vsum1 = v_add_f64(vsum1, v_loadu_f64(x + vstep)); + i += vstep * 2; + } + vsum0 = v_add_f64(vsum0, vsum1); + while (i < unrollx) + { + vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i)); + i += vstep; + } + sumf = v_sum_f64(vsum0); +#else const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; @@ -66,6 +86,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i += vstep; } sumf = v_sum_f32(vsum0); +#endif #else int n1 = n & -4; for (; i < n1; i += 4) diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h index f36a3dbf0f..3f79646e05 100644 --- a/kernel/simd/intrin_avx.h +++ b/kernel/simd/intrin_avx.h @@ -4,20 +4,27 @@ * Data Type ***************************/ typedef __m256 v_f32; +typedef __m256d v_f64; #define v_nlanes_f32 8 +#define v_nlanes_f64 4 /*************************** * Arithmetic ***************************/ #define v_add_f32 _mm256_add_ps +#define v_add_f64 _mm256_add_pd #define 
v_mul_f32 _mm256_mul_ps +#define v_mul_f64 _mm256_mul_pd #ifdef HAVE_FMA3 // multiply and add, a*b + c #define v_muladd_f32 _mm256_fmadd_ps + #define v_muladd_f64 _mm256_fmadd_pd #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } + BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) + { return v_add_f64(v_mul_f64(a, b), c); } #endif // !HAVE_FMA3 // Horizontal add: Calculates the sum of all vector elements. @@ -31,11 +38,23 @@ BLAS_FINLINE float v_sum_f32(__m256 a) return _mm_cvtss_f32(sum); } +BLAS_FINLINE double v_sum_f64(__m256d a) +{ + __m256d sum_halves = _mm256_hadd_pd(a, a); + __m128d lo = _mm256_castpd256_pd128(sum_halves); + __m128d hi = _mm256_extractf128_pd(sum_halves, 1); + __m128d sum = _mm_add_pd(lo, hi); + return _mm_cvtsd_f64(sum); +} /*************************** * memory ***************************/ // unaligned load #define v_loadu_f32 _mm256_loadu_ps +#define v_loadu_f64 _mm256_loadu_pd #define v_storeu_f32 _mm256_storeu_ps +#define v_storeu_f64 _mm256_storeu_pd #define v_setall_f32(VAL) _mm256_set1_ps(VAL) -#define v_zero_f32 _mm256_setzero_ps \ No newline at end of file +#define v_setall_f64(VAL) _mm256_set1_pd(VAL) +#define v_zero_f32 _mm256_setzero_ps +#define v_zero_f64 _mm256_setzero_pd \ No newline at end of file diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h index 70e5f72e39..f00af53e94 100644 --- a/kernel/simd/intrin_avx512.h +++ b/kernel/simd/intrin_avx512.h @@ -4,15 +4,19 @@ * Data Type ***************************/ typedef __m512 v_f32; +typedef __m512d v_f64; #define v_nlanes_f32 16 +#define v_nlanes_f64 8 /*************************** * Arithmetic ***************************/ #define v_add_f32 _mm512_add_ps +#define v_add_f64 _mm512_add_pd #define v_mul_f32 _mm512_mul_ps +#define v_mul_f64 _mm512_mul_pd // multiply and add, a*b + c #define v_muladd_f32 _mm512_fmadd_ps - +#define v_muladd_f64 _mm512_fmadd_pd BLAS_FINLINE 
float v_sum_f32(v_f32 a) { __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); @@ -25,11 +29,26 @@ BLAS_FINLINE float v_sum_f32(v_f32 a) __m512 sum4 = _mm512_add_ps(sum8, h4); return _mm_cvtss_f32(_mm512_castps512_ps128(sum4)); } + +BLAS_FINLINE double v_sum_f64(v_f64 a) +{ + __m512d h64 = _mm512_shuffle_f64x2(a, a, _MM_SHUFFLE(3, 2, 3, 2)); + __m512d sum32 = _mm512_add_pd(a, h64); + __m512d h32 = _mm512_permutex_pd(sum32, _MM_SHUFFLE(1, 0, 3, 2)); + __m512d sum16 = _mm512_add_pd(sum32, h32); + __m512d h16 = _mm512_permute_pd(sum16, _MM_SHUFFLE(2, 3, 0, 1)); + __m512d sum8 = _mm512_add_pd(sum16, h16); + return _mm_cvtsd_f64(_mm512_castpd512_pd128(sum8)); +} /*************************** * memory ***************************/ // unaligned load #define v_loadu_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR)) +#define v_loadu_f64(PTR) _mm512_loadu_pd((const __m512*)(PTR)) #define v_storeu_f32 _mm512_storeu_ps +#define v_storeu_f64 _mm512_storeu_pd #define v_setall_f32(VAL) _mm512_set1_ps(VAL) +#define v_setall_f64(VAL) _mm512_set1_pd(VAL) #define v_zero_f32 _mm512_setzero_ps +#define v_zero_f64 _mm512_setzero_pd diff --git a/kernel/simd/intrin_neon.h b/kernel/simd/intrin_neon.h index 5875c0e4ea..6df41cdd0f 100644 --- a/kernel/simd/intrin_neon.h +++ b/kernel/simd/intrin_neon.h @@ -8,12 +8,18 @@ * Data Type ***************************/ typedef float32x4_t v_f32; +#if NPY_SIMD_F64 + typedef float64x2_t v_f64; +#endif #define v_nlanes_f32 4 +#define v_nlanes_f64 2 /*************************** * Arithmetic ***************************/ #define v_add_f32 vaddq_f32 +#define v_add_f64 vaddq_f64 #define v_mul_f32 vmulq_f32 +#define v_mul_f64 vmulq_f64 // FUSED F32 #ifdef HAVE_VFPV4 // FMA @@ -26,12 +32,26 @@ typedef float32x4_t v_f32; { return vmlaq_f32(c, a, b); } #endif +// FUSED F64 +#if NPY_SIMD_F64 + BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) + { return vfmaq_f64(c, a, b); } +#endif + // Horizontal add: Calculates the sum of all vector elements. 
BLAS_FINLINE float v_sum_f32(float32x4_t a) { float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); return vget_lane_f32(vpadd_f32(r, r), 0); } + +#if NPY_SIMD_F64 + BLAS_FINLINE double v_sum_f64(float64x2_t a) + { + return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); + } +#endif + /*************************** * memory ***************************/ @@ -39,4 +59,10 @@ BLAS_FINLINE float v_sum_f32(float32x4_t a) #define v_loadu_f32(a) vld1q_f32((const float*)a) #define v_storeu_f32 vst1q_f32 #define v_setall_f32(VAL) vdupq_n_f32(VAL) -#define v_zero_f32() vdupq_n_f32(0.0f) \ No newline at end of file +#define v_zero_f32() vdupq_n_f32(0.0f) +#if NPY_SIMD_F64 + #define v_loadu_f64(a) vld1q_f64((const double*)a) + #define v_storeu_f64 vst1q_f64 + #define v_setall_f64 vdupq_n_f64 + #define v_zero_f64() vdupq_n_f64(0.0) +#endif \ No newline at end of file diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 9de7e1b278..06a3fe78b6 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -4,22 +4,30 @@ * Data Type ***************************/ typedef __m128 v_f32; +typedef __m128d v_f64; #define v_nlanes_f32 4 +#define v_nlanes_f64 2 /*************************** * Arithmetic ***************************/ #define v_add_f32 _mm_add_ps +#define v_add_f64 _mm_add_pd #define v_mul_f32 _mm_mul_ps +#define v_mul_f64 _mm_mul_pd #ifdef HAVE_FMA3 // multiply and add, a*b + c #define v_muladd_f32 _mm_fmadd_ps + #define v_muladd_f64 _mm_fmadd_pd #elif defined(HAVE_FMA4) // multiply and add, a*b + c #define v_muladd_f32 _mm_macc_ps + #define v_muladd_f64 _mm_macc_pd #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } + BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) + { return v_add_f64(v_mul_f64(a, b), c); } #endif // HAVE_FMA3 // Horizontal add: Calculates the sum of all vector elements. 
@@ -36,11 +44,24 @@ BLAS_FINLINE float v_sum_f32(__m128 a) return _mm_cvtss_f32(t4); #endif } + +BLAS_FINLINE double v_sum_f64(__m128d a) +{ +#ifdef HAVE_SSE3 + return _mm_cvtsd_f64(_mm_hadd_pd(a, a)); +#else + return _mm_cvtsd_f64(_mm_add_pd(a, _mm_unpackhi_pd(a, a))); +#endif +} /*************************** * memory ***************************/ // unaligned load #define v_loadu_f32 _mm_loadu_ps +#define v_loadu_f64 _mm_loadu_pd #define v_storeu_f32 _mm_storeu_ps +#define v_storeu_f64 _mm_storeu_pd #define v_setall_f32(VAL) _mm_set1_ps(VAL) -#define v_zero_f32 _mm_setzero_ps \ No newline at end of file +#define v_setall_f64(VAL) _mm_set1_pd(VAL) +#define v_zero_f32 _mm_setzero_ps +#define v_zero_f64 _mm_setzero_pd \ No newline at end of file diff --git a/kernel/x86_64/daxpy.c b/kernel/x86_64/daxpy.c index b62e3dcb3d..26437012c1 100644 --- a/kernel/x86_64/daxpy.c +++ b/kernel/x86_64/daxpy.c @@ -53,6 +53,15 @@ static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) BLASLONG register i = 0; FLOAT a = *alpha; #if V_SIMD +#ifdef DOUBLE + v_f64 __alpha, tmp; + __alpha = v_setall_f64(*alpha); + const int vstep = v_nlanes_f64; + for (; i < n; i += vstep) { + tmp = v_muladd_f64(__alpha, v_loadu_f64( x + i ), v_loadu_f64(y + i)); + v_storeu_f64(y + i, tmp); + } +#else v_f32 __alpha, tmp; __alpha = v_setall_f32(*alpha); const int vstep = v_nlanes_f32; @@ -60,6 +69,7 @@ static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) tmp = v_muladd_f32(__alpha, v_loadu_f32( x + i ), v_loadu_f32(y + i)); v_storeu_f32(y + i, tmp); } +#endif #else while(i < n) { From 4fac91ef37b37dc8979ac47d888320de3845acc3 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 15 Oct 2020 11:08:10 +0800 Subject: [PATCH 336/349] adapt arm platform --- kernel/simd/intrin_neon.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/simd/intrin_neon.h b/kernel/simd/intrin_neon.h index 6df41cdd0f..22cef10ca6 100644 --- a/kernel/simd/intrin_neon.h +++ 
b/kernel/simd/intrin_neon.h @@ -8,7 +8,7 @@ * Data Type ***************************/ typedef float32x4_t v_f32; -#if NPY_SIMD_F64 +#if V_SIMD_F64 typedef float64x2_t v_f64; #endif #define v_nlanes_f32 4 @@ -33,7 +33,7 @@ typedef float32x4_t v_f32; #endif // FUSED F64 -#if NPY_SIMD_F64 +#if V_SIMD_F64 BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return vfmaq_f64(c, a, b); } #endif @@ -45,7 +45,7 @@ BLAS_FINLINE float v_sum_f32(float32x4_t a) return vget_lane_f32(vpadd_f32(r, r), 0); } -#if NPY_SIMD_F64 +#if V_SIMD_F64 BLAS_FINLINE double v_sum_f64(float64x2_t a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); @@ -60,7 +60,7 @@ BLAS_FINLINE float v_sum_f32(float32x4_t a) #define v_storeu_f32 vst1q_f32 #define v_setall_f32(VAL) vdupq_n_f32(VAL) #define v_zero_f32() vdupq_n_f32(0.0f) -#if NPY_SIMD_F64 +#if V_SIMD_F64 #define v_loadu_f64(a) vld1q_f64((const double*)a) #define v_storeu_f64 vst1q_f64 #define v_setall_f64 vdupq_n_f64 From ae6ac83991539d688095bcfc66bfb22f054860be Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Oct 2020 08:37:02 +0200 Subject: [PATCH 337/349] Revert "add double precision SSE" --- kernel/simd/intrin_sse.h | 48 +++------------------------------------- 1 file changed, 3 insertions(+), 45 deletions(-) diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 7449a5a0b7..9de7e1b278 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -3,59 +3,25 @@ /*************************** * Data Type ***************************/ -#ifdef DOUBLE -typedef __m128d v_f32; -#else typedef __m128 v_f32; -#endif - #define v_nlanes_f32 4 /*************************** * Arithmetic ***************************/ -#ifdef DOUBLE -#define v_add_f32 _mm_add_pd -#define v_mul_f32 _mm_mul_pd -#else #define v_add_f32 _mm_add_ps #define v_mul_f32 _mm_mul_ps -#endif #ifdef HAVE_FMA3 // multiply and add, a*b + c -#ifdef DOUBLE - #define v_muladd_f32 _mm_fmadd_pd -#else - #define v_muladd_f32 _mm_fmadd_ps 
-#endif + #define v_muladd_f32 _mm_fmadd_ps #elif defined(HAVE_FMA4) // multiply and add, a*b + c - #ifdef DOUBLE - #define v_muladd_f32 _mm_macc_pd - #else - #define v_muladd_f32 _mm_macc_ps - #endif + #define v_muladd_f32 _mm_macc_ps #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } #endif // HAVE_FMA3 -// Horizontal add: Calculates the sum of all vector elements. -#ifdef DOUBLE -BLAS_FINLINE double v_sum_f32(__m128d a) -{ -#ifdef HAVE_SSE3 - __m128d sum_halves = _mm_hadd_pd(a, a); - return _mm_cvtsd_f64(_mm_hadd_pd(sum_halves, sum_halves)); -#else - __m128d t1 = _mm_movehl_pd(a, a); - __m128d t2 = _mm_add_pd(a, t1); - __m128d t3 = _mm_shuffle_pd(t2, t2, 1); - __m128d t4 = _mm_add_ss(t2, t3); - return _mm_cvtsd_f64(t4); -#endif -} -#else // Horizontal add: Calculates the sum of all vector elements. BLAS_FINLINE float v_sum_f32(__m128 a) { @@ -70,19 +36,11 @@ BLAS_FINLINE float v_sum_f32(__m128 a) return _mm_cvtss_f32(t4); #endif } -#endif /*************************** * memory ***************************/ // unaligned load -#ifdef DOUBLE -#define v_loadu_f32 _mm_loadu_pd -#define v_storeu_f32 _mm_storeu_pd -#define v_setall_f32(VAL) _mm_set1_pd(VAL) -#define v_zero_f32 _mm_setzero_pd -#else #define v_loadu_f32 _mm_loadu_ps #define v_storeu_f32 _mm_storeu_ps #define v_setall_f32(VAL) _mm_set1_ps(VAL) -#define v_zero_f32 _mm_setzero_ps -#endif +#define v_zero_f32 _mm_setzero_ps \ No newline at end of file From 10379fc83baced749a2e4f881daa923d9361df26 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Oct 2020 19:05:37 +0200 Subject: [PATCH 338/349] Use ifdef instead of if --- kernel/setparam-ref.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 72fbf32bf7..849a4194a7 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -1164,7 +1164,7 @@ static void init_parameter(void) { 
TABLE_NAME.xgemm3m_q = QGEMM_DEFAULT_Q; #endif -#if (CORE_KATMAI) || (CORE_COPPERMINE) || (CORE_BANIAS) || (CORE_YONAH) || (CORE_ATHLON) +#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON) #ifdef DEBUG fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n"); From ac8af9cec6e9c391f9047992c15454db8ada1821 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Oct 2020 19:06:45 +0200 Subject: [PATCH 339/349] Add -msse where supported, apparently required for older gcc --- Makefile.x86 | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Makefile.x86 b/Makefile.x86 index a6196d3650..330690935d 100644 --- a/Makefile.x86 +++ b/Makefile.x86 @@ -54,3 +54,19 @@ LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm else LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm endif + +ifdef HAVE_SSE3 +ifndef DYNAMIC_ARCH +CCOMMON_OPT += -msse3 +FCOMMON_OPT += -msse3 +ifdef HAVE_SSSE3 +CCOMMON_OPT += -mssse3 +FCOMMON_OPT += -mssse3 +endif +ifdef HAVE_SSE4_1 +CCOMMON_OPT += -msse4.1 +FCOMMON_OPT += -msse4.1 +endif +endif +endif + From c339c40c01c11046bd9886a00f16deb9a6d675a2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Oct 2020 19:08:12 +0200 Subject: [PATCH 340/349] Silence a redefinition warning --- kernel/x86_64/iamax_sse.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S index 4f62b9be29..14c7f43ec7 100644 --- a/kernel/x86_64/iamax_sse.S +++ b/kernel/x86_64/iamax_sse.S @@ -51,6 +51,8 @@ #define MAXPS maxps #define MAXSS maxss #ifdef USE_MIN +#undef MAXPS +#undef MAXSS #define MAXPS minps #define MAXSS minss #endif From dc6cefd2f588c27847f2c4b5a8ad42cbf6331299 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Oct 2020 20:16:15 +0200 Subject: [PATCH 341/349] Expressly enable -msse for 32bit DYNAMIC_ARCH kernels --- kernel/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/kernel/Makefile b/kernel/Makefile index abe2e08d67..65e2a0ad66 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -46,6 +46,9 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) override CFLAGS += -msse3 -mssse3 -msse4.1 +endif + ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE NEHALEM BARCELONA CORE2 PRESCOTT NORTHWOOD ATHLON)) + override CFLAGS += -msse endif ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) From f071d1207ab2d25247bf6ba02a2f16bf02273a5b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 15 Oct 2020 22:10:32 +0200 Subject: [PATCH 342/349] add sse2 --- kernel/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index 65e2a0ad66..495f3609f3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -48,7 +48,7 @@ ifdef TARGET_CORE override CFLAGS += -msse3 -mssse3 -msse4.1 endif ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE NEHALEM BARCELONA CORE2 PRESCOTT NORTHWOOD ATHLON)) - override CFLAGS += -msse + override CFLAGS += -msse -msse2 endif ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) From df706670430ef39aeb0a423e367560e452909139 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Oct 2020 09:55:48 +0200 Subject: [PATCH 343/349] fix core list for sse/sse2 --- kernel/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index 495f3609f3..43318d4753 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -45,9 +45,9 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR 
STEAMROLLER OPTERON_SSE3)) - override CFLAGS += -msse3 -mssse3 -msse4.1 + override CFLAGS += -msse -msse2 -msse3 -mssse3 -msse4.1 endif - ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE NEHALEM BARCELONA CORE2 PRESCOTT NORTHWOOD ATHLON)) + ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE BANIAS NORTHWOOD ATHLON OPTERON)) override CFLAGS += -msse -msse2 endif ifeq ($(TARGET_CORE), COOPERLAKE) From 786c0a3ce80b4a3598d7a534470aa5f6b7e6b01c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Oct 2020 10:41:53 +0200 Subject: [PATCH 344/349] Add sse options for use of intrinics with older compilers --- cmake/cc.cmake | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index 9f5cc1bf74..2f4d1c6d72 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -124,10 +124,19 @@ if (NOT DYNAMIC_ARCH) if (HAVE_AVX) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") endif () + if (HAVE_SSE) + set (CCOMMON_OPT "${CCOMMON_OPT} -msse") + endif () + if (HAVE_SSE2) + set (CCOMMON_OPT "${CCOMMON_OPT} -msse2") + endif () if (HAVE_SSE3) set (CCOMMON_OPT "${CCOMMON_OPT} -msse3") endif () if (HAVE_SSSE3) set (CCOMMON_OPT "${CCOMMON_OPT} -mssse3") endif () + if (HAVE_SSE4_1) + set (CCOMMON_OPT "${CCOMMON_OPT} -msse4.1") + endif () endif() From f64243ff57d79c6bd23d39c49648adfddbe018a4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Oct 2020 10:47:06 +0200 Subject: [PATCH 345/349] Add compiler options for sse/sse2/ssse3/sse4.1 --- cmake/system.cmake | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index b34d4a9a56..4cc46236dd 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -70,9 +70,21 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") endif() endif() + if (DEFINED HAVE_SSE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") + endif() + if (DEFINED HAVE_SSE2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2") + 
endif() if (DEFINED HAVE_SSE3) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") endif() + if (DEFINED HAVE_SSSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3") + endif() + if (DEFINED HAVE_SSE4_1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") + endif() endif() if (DEFINED TARGET) From f1bb85d378ef4ebcfd4f4c7bbb14b074bfdc945f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 16 Oct 2020 20:52:15 +0200 Subject: [PATCH 346/349] Add AVX flags for clang/aocc as well --- Makefile.x86_64 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 27eb571eea..3a42e19e4c 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -78,6 +78,10 @@ GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) CCOMMON_OPT += -mavx2 endif +else +ifeq ($(C_COMPILER), CLANG) +CCOMMON_OPT += -mavx2 +endif endif ifeq ($(F_COMPILER), GFORTRAN) # AVX2 support was added in 4.7.0 From 5381a18056c1ad6fe171eef275f4b0095e22ee57 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 17 Oct 2020 22:05:36 +0200 Subject: [PATCH 347/349] Update Changelog.txt with the 0.3.11 changes --- Changelog.txt | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index cbf0b50f51..bd0e60992c 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,76 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.11 + 17-Oct-2020 + + common: + * API change: + the newly added BFLOAT16 functions were renamed to use the + letter "B" instead of "H" to avoid potential confusion with + the IEEE "half precision float" type, i.e. the 0.3.10 + SHGEMM is now SBGEMM and the corresponding build option + was changed from "BUILD_HALF" to "BUILD_BFLOAT16". 
+ * Reduced the default BLAS3_MEM_ALLOC_THRESHOLD (used as an upper + limit for placing temporary arrays on the stack) to be compatible + with a stack size of 1mb (as imposed by the JAVA runtime library) + * Added mixed-precision dot function SBDOT and utility functions + shstobf16, shdtobf16, sbf16tos and dbf16tod to convert between + single or double precision float arrays and bfloat16 arrays + * Fixed prototypes of LAPACK_?ggsvp and LAPACK_?ggsvd functions + in lapack.h + * Fixed underflow and rounding errors in LAPACK SLANV2 and DLANV2 + (causing miscalculations in e.g. SHSEQR/DHSEQR, LAPACK issue #263) + * Fixed workspace calculation in LAPACK ?GELQ (LAPACK issue #415) + * Fixed several bugs in the LAPACK testsuite + * Improved performance of TRMM and TRSM for certain problem sizes + * Fixed infinite recursions and workspace miscalculations in ReLAPACK + * CMAKE builds no longer require pkg-config for creating the .pc file + * Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as + enabling these options + * Fixed detection of gfortran when invoked through an mpi wrapper + * Improve thread reinitialization performance with OpenMP after a fork + * Added support for building only the subset of the library required + for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE + * Optional function name prefixes and suffixes are now correctly + reflected in the generated cblas.h + * Added CMAKE build support for the LAPACK and multithreading tests + +POWER: + * Added optimized support for POWER10 + * Added support for compiling for POWER8 in 32bit mode + * Added support for compilation with LLVM/clang + * Added support for compilation with NVIDIA/PGI compilers + * Fixed building on big-endian POWER8 + * Fixed miscompilation of ZDOTC by gcc10 + * Fixed alignment errors in the POWER8 SAXPY kernel + * Improved CPU detection on AIX + * Supported building with older compilers on POWER9 + +x86_64: + * Added support for Intel Cooperlake + * Added
autodetection of AMD Renoir/Matisse/Zen3 cpus + * Added autodetection of Intel Comet Lake cpus + * Reimplemented ?sum, ?dot and daxpy using universal intrinsics + * Reset the fpu state before using the fpu on Windows as a workaround + for a problem introduced in Windows 10 build 19041 (a.k.a. SDK 2004) + * Fixed potentially undefined behaviour in the dot and gemv_t kernels + * Fixed a potential segmentation fault in DYNAMIC_ARCH builds + * Fixed building for ZEN with PGI/NVIDIA and AMD AOCC compilers + +ARMV7: + * Fixed cpu detection on BSD-like systems + +ARMV8: + * Added preliminary support for Apple Vortex cpus + * Added support for the Cavium ThunderX3T110 cpu + * Fixed cpu detection on BSD-like systems + * Fixed compilation in -std=C18 mode + + +IBM Z: + * Added support for compiling with the clang compiler + * Improved GEMM performance on Z14 + ==================================================================== Version 0.3.10 14-Jun-2020 From fe9015b619037fdbd04b8ffe4d58ab4f22ea21fd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 17 Oct 2020 22:10:50 +0200 Subject: [PATCH 348/349] Update version for 0.3.11 release --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a6cf2ef834..e77aec0305 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 10.dev) +set(OpenBLAS_PATCH_VERSION 11) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From b8f689200eccb3802aaa1188a98d3b5578fce295 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 17 Oct 2020 22:11:34 +0200 Subject: [PATCH 349/349] Update version number to 0.3.11 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule 
index 67d1839363..acfe568d63 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.10.dev +VERSION = 0.3.11 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library