Skip to content

Commit

Permalink
Merge pull request #2 from martin-frbg/develop
Browse files Browse the repository at this point in the history
merge develop
  • Loading branch information
martin-frbg authored Jun 25, 2018
2 parents 1833a67 + c38c65e commit e6d93f2
Show file tree
Hide file tree
Showing 17 changed files with 419 additions and 362 deletions.
28 changes: 21 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,17 @@ ifeq ($(BUILD_RELAPACK), 1)
RELA = re_lapack
endif

ifeq ($(NO_FORTRAN), 1)
define NOFORTRAN
1
endef
define NO_LAPACK
1
endef
export NOFORTRAN
export NO_LAPACK
endif

LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))

SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
Expand All @@ -47,7 +58,7 @@ endif
endif

@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
endif
ifneq ($(OSNAME), AIX)
Expand Down Expand Up @@ -108,7 +119,7 @@ endif
endif

tests :
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
Expand Down Expand Up @@ -210,7 +221,7 @@ netlib :

else
netlib : lapack_prebuild
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
endif
Expand All @@ -231,7 +242,10 @@ prof_lapack : lapack_prebuild
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof

lapack_prebuild :
ifndef NOFORTRAN
$(info filter value of NOFORTRAN is:)
$(info x$(filter-out $(NOFORTRAN), 1 2)x)

ifneq ($(NOFORTRAN), $(filter-out $(NOFORTRAN), 1 2))
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
Expand Down Expand Up @@ -274,21 +288,21 @@ endif
endif

large.tgz :
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/large.tgz;
fi
endif

timing.tgz :
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/timing.tgz;
fi
endif

lapack-timing : large.tgz timing.tgz
ifndef NOFORTRAN
ifneq ($(NOFORTRAN), $(filter $(NOFORTRAN), 1 2))
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING
Expand Down
15 changes: 13 additions & 2 deletions Makefile.rule
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,14 @@ VERSION = 0.3.1.dev
# This flag is always set for POWER8. Don't modify the flag
# USE_OPENMP = 1

# The OpenMP scheduler to use - by default this is "static" and you
# will normally not want to change this unless you know that your main
# workload will involve tasks that have highly unbalanced running times
# for individual threads. Changing away from "static" may also adversely
# affect memory access locality in NUMA systems. Setting to "runtime" will
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
# CCOMMON_OPT += -DOMP_SCHED=dynamic

# You can define maximum number of threads. Basically it should be
# less than actual number of cores. If you don't specify one, it's
# automatically detected by the the script.
Expand Down Expand Up @@ -156,8 +164,11 @@ NO_AFFINITY = 1
# CONSISTENT_FPCSR = 1

# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
# with single thread. You can use this flag to avoid the overhead of multi-threading
# in small matrix sizes. The default value is 4.
# with single thread. (Actually in recent versions this is a factor proportional to the
# number of floating point operations necessary for the given problem size, no longer
# an individual dimension). You can use this setting to avoid the overhead of multi-
# threading in small matrix sizes. The default value is 4, but values as high as 50 have
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
# GEMM_MULTITHREAD_THRESHOLD = 4

# If you need santy check by comparing reference BLAS. It'll be very
Expand Down
7 changes: 7 additions & 0 deletions Makefile.x86_64
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,13 @@ endif
endif
endif

ifeq ($(CORE), SKYLAKEX)
ifndef NO_AVX512
CCOMMON_OPT += -march=skylake-avx512
FCOMMON_OPT += -march=skylake-avx512
endif
endif

ifeq ($(OSNAME), Interix)
ARFLAGS = -m x64
endif
Expand Down
5 changes: 5 additions & 0 deletions cblas.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@ CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
Expand Down
17 changes: 9 additions & 8 deletions common_stackalloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* - large enough to support all architectures and kernel
* Chosing a too small SIZE will lead to a stack smashing.
*/
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \
/* do not restore all register */ \
volatile int stack_alloc_size = SIZE; \
if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) \
stack_alloc_size = 0; \
STACK_ALLOC_PROTECT_SET \
TYPE stack_buffer[stack_alloc_size] __attribute__((aligned(0x20))); \
#define STACK_ALLOC(SIZE, TYPE, BUFFER) \
/* make it volatile because some function (ex: dgemv_n.S) */ \
/* do not restore all register */ \
volatile int stack_alloc_size = SIZE; \
if (stack_alloc_size > MAX_STACK_ALLOC / sizeof(TYPE)) stack_alloc_size = 0; \
STACK_ALLOC_PROTECT_SET \
/* Avoid declaring an array of length 0 */ \
TYPE stack_buffer[stack_alloc_size ? stack_alloc_size : 1] \
__attribute__((aligned(0x20))); \
BUFFER = stack_alloc_size ? stack_buffer : (TYPE *)blas_memory_alloc(1);
#else
//Original OpenBLAS/GotoBLAS codes.
Expand Down
9 changes: 7 additions & 2 deletions common_x86_64.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,13 @@
#endif
*/

#define MB
#define WMB
#ifdef __GNUC__
#define MB do { __asm__ __volatile__("": : :"memory"); } while (0)
#define WMB do { __asm__ __volatile__("": : :"memory"); } while (0)
#else
#define MB do {} while (0)
#define WMB do {} while (0)
#endif

static void __inline blas_lock(volatile BLASULONG *address){

Expand Down
17 changes: 17 additions & 0 deletions cpuid_x86.c
Original file line number Diff line number Diff line change
Expand Up @@ -1339,6 +1339,23 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 6:
switch (model) {
case 6: // Cannon Lake
#ifndef NO_AVX512
return CPUTYPE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
#endif
}
break;
case 9:
case 8:
switch (model) {
Expand Down
8 changes: 7 additions & 1 deletion ctest/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,13 @@ clean ::
rm -f x*

FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
CEXTRALIB =
ifeq ($(USE_OPENMP), 1)
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), CLANG)
CEXTRALIB = -lomp
endif
endif
endif

# Single real
xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME)
Expand Down
40 changes: 18 additions & 22 deletions driver/level3/level3_thread.c
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,7 @@
#endif

typedef struct {
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;

Expand Down Expand Up @@ -348,12 +344,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
div_n = (n_to - n_from + DIVIDE_RATE - 1) / DIVIDE_RATE;
for (js = n_from, bufferside = 0; js < n_to; js += div_n, bufferside ++) {

/* Make sure if no one is using workspace */
START_RPCC();
for (i = 0; i < args -> nthreads; i++)
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
STOP_RPCC(waiting1);

#if defined(FUSED_GEMM) && !defined(TIMING)

/* Fused operation to copy region of B into workspace and apply kernel */
Expand Down Expand Up @@ -391,10 +381,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
}
#endif

/* Set flag so other threads can access local region of B */
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++)
for (i = mypos_n * nthreads_m; i < (mypos_n + 1) * nthreads_m; i++) {
/* Make sure if no one is using workspace */
START_RPCC();
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
STOP_RPCC(waiting1);
/* Set flag so other threads can access local region of B */
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
WMB;
WMB;
}
}

/* Get regions of B from other threads and apply kernel */
Expand All @@ -413,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,

/* Wait until other region of B is initialized */
START_RPCC();
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
STOP_RPCC(waiting2);

/* Apply kernel with local region of A and part of other region of B */
Expand All @@ -430,12 +425,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,

/* Clear synchronization flag if this thread is done with other region of B */
if (m_to - m_from == min_i) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
}
}
} while (current != mypos);

/* Iterate through steps of m
/* Iterate through steps of m
* Note: First step has already been finished */
for(is = m_from + min_i; is < m_to; is += min_i){
min_i = m_to - is;
Expand Down Expand Up @@ -465,14 +461,14 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside],
c, ldc, is, js);
STOP_RPCC(kernel);

#ifdef TIMING
ops += 2 * min_i * MIN(range_n[current + 1] - js, div_n) * min_l;
#endif

/* Clear synchronization flag if this thread is done with region of B */
if (is + min_i >= m_to) {
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
WMB;
}
}
Expand All @@ -492,7 +488,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC();
for (i = 0; i < args -> nthreads; i++) {
for (js = 0; js < DIVIDE_RATE; js++) {
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;};
while (job[mypos].working[i][CACHE_LINE_SIZE * js] ) {YIELDING;MB;};
}
}
STOP_RPCC(waiting3);
Expand Down Expand Up @@ -658,8 +654,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
}

/* Clear synchronization flags */
for (i = 0; i < MAX_CPU_NUMBER; i++) {
for (j = 0; j < MAX_CPU_NUMBER; j++) {
for (i = 0; i < nthreads; i++) {
for (j = 0; j < nthreads; j++) {
for (k = 0; k < DIVIDE_RATE; k++) {
job[i].working[j][CACHE_LINE_SIZE * k] = 0;
}
Expand Down
6 changes: 5 additions & 1 deletion driver/others/blas_server_omp.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@

#else

#ifndef OMP_SCHED
#define OMP_SCHED static
#endif

int blas_server_avail = 0;

static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
Expand Down Expand Up @@ -331,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
break;
}

#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(OMP_SCHED)
for (i = 0; i < num; i ++) {

#ifndef USE_SIMPLE_THREADED_LEVEL3
Expand Down
Loading

0 comments on commit e6d93f2

Please sign in to comment.