Skip to content

Commit

Permalink
Merge pull request #4 from xianyi/develop
Browse files Browse the repository at this point in the history
Update branch
  • Loading branch information
martin-frbg authored Sep 16, 2018
2 parents 61659f8 + fd081a9 commit ec0cac1
Show file tree
Hide file tree
Showing 71 changed files with 2,839 additions and 219 deletions.
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 1.dev)
set(OpenBLAS_PATCH_VERSION 4.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

# Adhere to GNU filesystem layout conventions
Expand Down Expand Up @@ -150,6 +150,7 @@ endif()

# add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)

# Android needs to explicitly link against libm
if(ANDROID)
Expand All @@ -169,6 +170,7 @@ endif()
# Set output for libopenblas
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS")

foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
Expand Down
138 changes: 138 additions & 0 deletions Changelog.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,142 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.3
31-Aug-2018

common:
* thread memory allocation has been switched back to the method
used before version 0.3.1 due to unexpected problems caused by
the new code under some circumstances. A new compile-time option
USE_TLS has been added to enable the new code, and it is hoped
that this can become the default again in the next version.
* LAPAck PR272 has been integrated, which fixes spurious errors
in DSYEVR and related functions caused by missing conversion
from ILAENV to ILAENV_2STAGE in several _2stage routines.
* the cmake-generated OpenBLASConfig.cmake now uses correct case
for the name of the library
* added support for Haiku OS

x86_64:
* added AVX512 implementations of SDOT, DDOT, SAXPY, DAXPY,
DSCAL, DGEMVN and DSYMVL
* added a workaround for a cygwin issue that prevented compilation
of AVX512 code

IBM Z:
* added autodetection of Z14
* fixed TRMM errors in the generic target

====================================================================
Version 0.3.2
30-Jul-2018

common:
* fixes for regressions caused by the rewrite of the thread
initialization code in 0.3.1

POWER:
* fixed cpu autodetection for the BSDs

MIPS64:
* fixed utest errors in AXPY, DSDOT, ROT and SWAP

x86_64:
* added autodetection of AMD Ryzen 2
* fixed build with older versions of MSVC

====================================================================
Version 0.3.1
01-Jul-2018

common:
* rewritten thread initialization code with significantly reduced overhead
* added CBLAS interfaces to the IxAMIN BLAS extension functions
* fixed the lapack-test target
* CMAKE builds now create an OpenBLASConfig.cmake file
* ZAXPY now uses a single thread for small input sizes
* the LAPACK code was updated from Reference-LAPACK/lapack#253
(fixing LAPACKE interfaces to Aasen's functions)

POWER:
* corrected CROT and ZROT behaviour with zero INC_X

ARMV7:
* corrected xDOT behaviour with zero INC_X or INC_Y

x86_64:
* retired some older targets of DYNAMIC_ARCH builds to a new option DYNAMIC_OLDER,
this affects PENRYN,DUNNINGTON,OPTERON,OPTERON_SSE3,BOBCAT,ATOM and NANO
(which will still be supported via the slower PRESCOTT kernels when this option is not set)
* added an option DYNAMIC_LIST that (used in conjunction with DYNAMIC_ARCH) allows to
specify the list of x86_64 targets to include. Any target not on the list will be supported
by the Sandybridge or Nehalem kernels if available, or by Prescott.
* improved SWITCH_RATIO on Haswell for increased GEMM throughput
* added initial support for Intel Skylake X, including an AVX512 SGEMM kernel
* added autodetection of Intel Cannon Lake series as Skylake X
* added a default L2 cache size for hypervisors that return zero here (Chromebook)
* fixed a name clash with recent Windows10 headers that broke the build with (at least)
recent mingw from MSYS2
* fixed a link error in mixed clang/gfortran builds with OpenMP
* updated the OSX deployment target to 10.8
* switched on parallel make for builds on MS Windows by default

x86:
* fixed SSWAP and DSWAP behaviour with zero INC_X and INC_Y

====================================================================
Version 0.3.0
23-May-2108

common:
* fixed some more thread race and locking bugs
* added preliminary support for calling an OpenMP build of the library from multiple threads
* removed performance impact of thread locks added in 0.2.20 on OpenMP code
* general code cleanup
* optimized DSDOT implementation
* improved thread distribution for GEMM
* corrected IMATCOPY/OMATCOPY implementation
* fixed out-of-bounds accesses in the multithreaded xBMV/xPMV and SYMV implementations
* cmake build improvements
* pkgconfig file now contains build options
* openblas_get_config() now reports USE_OPENMP and NUM_THREADS settings used for the build
* corrections and improvements for systems with more than 64 cpus
* LAPACK code updated to 3.8.0 including later fixes
* added ReLAPACK, a recursive implementation of several LAPACK functions
* Rewrote ROTMG to handle cases that the netlib code failed to address
* Disabled (broken) multithreading code for xTRMV
* corrected prototypes of complex CBLAS functions to make our cblas.h match the generally accepted standard
* shared memory access failures on startup are now handled more gracefully
* restored utests from earlier releases (and made them pass on all affected systems)

SPARC:
* several fixes for cpu autodetection

POWER:
* corrected vector register overwriting in several Power8 kernels
* optimized additional BLAS functions

ARM:
* added support for CortexA53 and A72
* added autodetection for ThunderX2T99
* made most optimized kernels the default for generic ARMv8 targets

x86_64:
* parallelized DDOT kernel for Haswell
* changed alignment directives in assembly kernels to boost performance on OSX
* fixed register handling in the GEMV microkernels (bug exposed by gcc7)
* added support for building on OpenBSD and Dragonfly
* updated compiler options to work with Intel release 2018
* support fully optimized build with clang/flang on Microsoft Windows
* fixed building on AIX

IBM Z:
* added optimized BLAS 1/2 functions

MIPS:
* fixed cpu autodetection helper code
* added mips32 1004K cpu (Mediatek MT7621 and similar SoC)
* added mips64 I6500 cpu

====================================================================
Version 0.2.20
24-Jul-2017
Expand Down
4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ endif

shared :
ifndef NO_SHARED
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
Expand Down Expand Up @@ -267,6 +267,8 @@ ifeq ($(F_COMPILER), GFORTRAN)
ifdef SMP
ifeq ($(OSNAME), WINNT)
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
else ifeq ($(OSNAME), Haiku)
-@echo "LOADER = $(FC)" >> $(NETLIB_LAPACK_DIR)/make.inc
else
-@echo "LOADER = $(FC) -pthread" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
Expand Down
2 changes: 1 addition & 1 deletion Makefile.install
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ endif
#for install shared library
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
Expand Down
10 changes: 8 additions & 2 deletions Makefile.rule
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#

# This library's version
VERSION = 0.3.1.dev
VERSION = 0.3.4.dev

# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
Expand Down Expand Up @@ -107,7 +107,13 @@ BUILD_LAPACK_DEPRECATED = 1
# BUILD_RELAPACK = 1

# If you want to use legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1
USE_SIMPLE_THREADED_LEVEL3 = 1

# If you want to use the new, still somewhat experimental code that uses
# thread-local storage instead of a central memory buffer in memory.c
# Note that if your system uses GLIBC, it needs to have at least glibc 2.21
# for this to work.
USE_TLS = 1

# If you want to drive whole 64bit region by BLAS. Not all Fortran
# compiler supports this. It's safe to keep comment it out if you
Expand Down
4 changes: 4 additions & 0 deletions Makefile.system
Original file line number Diff line number Diff line change
Expand Up @@ -1018,6 +1018,10 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif

ifdef USE_TLS
CCOMMON_OPT += -DUSE_TLS
endif

ifndef SYMBOLPREFIX
SYMBOLPREFIX =
endif
Expand Down
3 changes: 3 additions & 0 deletions Makefile.x86_64
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ ifeq ($(CORE), SKYLAKEX)
ifndef NO_AVX512
CCOMMON_OPT += -march=skylake-avx512
FCOMMON_OPT += -march=skylake-avx512
ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
endif
endif

Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ Please read `GotoBLAS_01Readme.txt`.
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
Expand Down Expand Up @@ -200,6 +201,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
Clang 3.0 will generate the wrong AVX binary code.
* Please use GCC version 6 or LLVM version 6 and above to compile Skyalke AVX512 kernels.
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
the library with `BIGNUMA=1`.
Expand Down
2 changes: 1 addition & 1 deletion benchmark/gemv.c
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ int main(int argc, char *argv[]){

FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
FLOAT beta [] = {1.0, 0.0};
char trans='N';
blasint m, i, j;
blasint inc_x=1,inc_y=1;
Expand Down
4 changes: 3 additions & 1 deletion c_check
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ $os = WINNT if ($data =~ /OS_WINNT/);
$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/);
$os = Interix if ($data =~ /OS_INTERIX/);
$os = Android if ($data =~ /OS_ANDROID/);
$os = Haiku if ($data =~ /OS_HAIKU/);

$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
Expand Down Expand Up @@ -223,14 +224,15 @@ $data =~ /globl\s([_\.]*)(.*)/;
$need_fu = $1;

$cross = 0;
$cross = 1 if ($os ne $hostos);

if ($architecture ne $hostarch) {
$cross = 1;
$cross = 0 if (($hostarch eq "x86_64") && ($architecture eq "x86"));
$cross = 0 if (($hostarch eq "mips64") && ($architecture eq "mips"));
}

$cross = 1 if ($os ne $hostos);

$openmp = "" if $ENV{USE_OPENMP} != 1;

$linker_L = "";
Expand Down
3 changes: 2 additions & 1 deletion cblas.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=1
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;

typedef CBLAS_ORDER CBLAS_LAYOUT;

float cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
float cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
Expand Down
2 changes: 1 addition & 1 deletion cmake/prebuild.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ if (NOT NOFORTRAN)
endif ()

# Cannot run getarch on target if we are cross-compiling
if (DEFINED CORE AND CMAKE_CROSSCOMPILING)
if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE"))
# Write to config as getarch would

# TODO: Set up defines that getarch sets up based on every other target
Expand Down
4 changes: 4 additions & 0 deletions cmake/system.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,10 @@ if (CONSISTENT_FPCSR)
set(CCOMMON_OPT "${CCOMMON_OPT} -DCONSISTENT_FPCSR")
endif ()

if (USE_TLS)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_TLS")
endif ()

# Only for development
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPARAMTEST")
# set(CCOMMON_OPT "${CCOMMON_OPT} -DPREFETCHTEST")
Expand Down
2 changes: 1 addition & 1 deletion cmake/system_check.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ endif()

if (X86_64 OR X86)
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
if (NO_AVX512 EQUAL 1)
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
endif()
Expand Down
10 changes: 10 additions & 0 deletions common.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ extern "C" {
#endif
#endif

#ifdef OS_HAIKU
#define NO_SYSV_IPC
#endif

#ifdef OS_WINDOWS
#ifdef ATOM
#define GOTO_ATOM ATOM
Expand Down Expand Up @@ -253,8 +257,14 @@ typedef unsigned long BLASULONG;

#ifdef USE64BITINT
typedef BLASLONG blasint;
#if defined(OS_WINDOWS) && defined(__64BIT__)
#define blasabs(x) llabs(x)
#else
#define blasabs(x) labs(x)
#endif
#else
typedef int blasint;
#define blasabs(x) abs(x)
#endif
#else
#ifdef USE64BITINT
Expand Down
Loading

0 comments on commit ec0cac1

Please sign in to comment.