From de8a10dc543f69e2111d250c79f1414b70a08686 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Mon, 9 Dec 2019 20:47:28 +0200 Subject: [PATCH] BUG: core: use blas_ilp64 also for *_matmul, *_dot, and *_vdot Changing these to support ILP64 blas was missed in gh-15012 --- numpy/core/src/multiarray/arraytypes.c.src | 16 +++---- numpy/core/src/multiarray/common.h | 10 ++++- numpy/core/src/multiarray/vdot.c | 16 +++---- numpy/core/src/umath/matmul.c.src | 49 ++++++++++++---------- 4 files changed, 53 insertions(+), 38 deletions(-) diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src index e36b95c00918..9e108e3e140a 100644 --- a/numpy/core/src/multiarray/arraytypes.c.src +++ b/numpy/core/src/multiarray/arraytypes.c.src @@ -3535,17 +3535,17 @@ NPY_NO_EXPORT void npy_intp n, void *NPY_UNUSED(ignore)) { #if defined(HAVE_CBLAS) - int is1b = blas_stride(is1, sizeof(@type@)); - int is2b = blas_stride(is2, sizeof(@type@)); + CBLAS_INT is1b = blas_stride(is1, sizeof(@type@)); + CBLAS_INT is2b = blas_stride(is2, sizeof(@type@)); if (is1b && is2b) { double sum = 0.; /* double for stability */ while (n > 0) { - int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK; + CBLAS_INT chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK; - sum += cblas_@prefix@dot(chunk, + sum += CBLAS_FUNC(cblas_@prefix@dot)(chunk, (@type@ *) ip1, is1b, (@type@ *) ip2, is2b); /* use char strides here */ @@ -3584,17 +3584,17 @@ NPY_NO_EXPORT void char *op, npy_intp n, void *NPY_UNUSED(ignore)) { #if defined(HAVE_CBLAS) - int is1b = blas_stride(is1, sizeof(@ctype@)); - int is2b = blas_stride(is2, sizeof(@ctype@)); + CBLAS_INT is1b = blas_stride(is1, sizeof(@ctype@)); + CBLAS_INT is2b = blas_stride(is2, sizeof(@ctype@)); if (is1b && is2b) { double sum[2] = {0., 0.}; /* double for stability */ while (n > 0) { - int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK; + CBLAS_INT chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK; @type@ tmp[2]; - cblas_@prefix@dotu_sub((int)n, ip1, is1b, ip2, is2b, tmp); + CBLAS_FUNC(cblas_@prefix@dotu_sub)((CBLAS_INT)n, ip1, is1b, ip2, is2b, tmp); sum[0] += (double)tmp[0]; sum[1] += (double)tmp[1]; /* use char strides here */ diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h index 487d530a148b..7eee9ddc5db6 100644 --- a/numpy/core/src/multiarray/common.h +++ b/numpy/core/src/multiarray/common.h @@ -303,7 +303,11 @@ blas_stride(npy_intp stride, unsigned itemsize) */ if (stride > 0 && npy_is_aligned((void *)stride, itemsize)) { stride /= itemsize; +#ifndef HAVE_BLAS_ILP64 if (stride <= INT_MAX) { +#else + if (stride <= NPY_MAX_INT64) { +#endif return stride; } } @@ -314,7 +318,11 @@ blas_stride(npy_intp stride, unsigned itemsize) * Define a chunksize for CBLAS. CBLAS counts in integers. */ #if NPY_MAX_INTP > INT_MAX -# define NPY_CBLAS_CHUNK (INT_MAX / 2 + 1) +# ifndef HAVE_BLAS_ILP64 +# define NPY_CBLAS_CHUNK (INT_MAX / 2 + 1) +# else +# define NPY_CBLAS_CHUNK (NPY_MAX_INT64 / 2 + 1) +# endif #else # define NPY_CBLAS_CHUNK NPY_MAX_INTP #endif diff --git a/numpy/core/src/multiarray/vdot.c b/numpy/core/src/multiarray/vdot.c index 424a21710f25..9b5d19522029 100644 --- a/numpy/core/src/multiarray/vdot.c +++ b/numpy/core/src/multiarray/vdot.c @@ -15,17 +15,17 @@ CFLOAT_vdot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n, void *NPY_UNUSED(ignore)) { #if defined(HAVE_CBLAS) - int is1b = blas_stride(is1, sizeof(npy_cfloat)); - int is2b = blas_stride(is2, sizeof(npy_cfloat)); + CBLAS_INT is1b = blas_stride(is1, sizeof(npy_cfloat)); + CBLAS_INT is2b = blas_stride(is2, sizeof(npy_cfloat)); if (is1b && is2b) { double sum[2] = {0., 0.}; /* double for stability */ while (n > 0) { - int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK; + CBLAS_INT chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK; float tmp[2]; - cblas_cdotc_sub((int)n, ip1, is1b, ip2, is2b, tmp); + CBLAS_FUNC(cblas_cdotc_sub)((CBLAS_INT)n, ip1, is1b, ip2, is2b, tmp); sum[0] += (double)tmp[0]; sum[1] += (double)tmp[1]; /* use char strides here */ @@ -66,17 +66,17 @@ CDOUBLE_vdot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n, void *NPY_UNUSED(ignore)) { #if defined(HAVE_CBLAS) - int is1b = blas_stride(is1, sizeof(npy_cdouble)); - int is2b = blas_stride(is2, sizeof(npy_cdouble)); + CBLAS_INT is1b = blas_stride(is1, sizeof(npy_cdouble)); + CBLAS_INT is2b = blas_stride(is2, sizeof(npy_cdouble)); if (is1b && is2b) { double sum[2] = {0., 0.}; /* double for stability */ while (n > 0) { - int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK; + CBLAS_INT chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK; double tmp[2]; - cblas_zdotc_sub((int)n, ip1, is1b, ip2, is2b, tmp); + CBLAS_FUNC(cblas_zdotc_sub)((CBLAS_INT)n, ip1, is1b, ip2, is2b, tmp); sum[0] += (double)tmp[0]; sum[1] += (double)tmp[1]; /* use char strides here */ diff --git a/numpy/core/src/umath/matmul.c.src b/numpy/core/src/umath/matmul.c.src index b5204eca5747..c8f7c654dc17 100644 --- a/numpy/core/src/umath/matmul.c.src +++ b/numpy/core/src/umath/matmul.c.src @@ -31,7 +31,11 @@ * -1 to be conservative, in case blas internally uses a for loop with an * inclusive upper bound */ +#ifndef HAVE_BLAS_ILP64 #define BLAS_MAXSIZE (NPY_MAX_INT - 1) +#else +#define BLAS_MAXSIZE (NPY_MAX_INT64 - 1) +#endif /* * Determine if a 2d matrix can be used by BLAS @@ -84,25 +88,25 @@ NPY_NO_EXPORT void * op: data in c order, m shape */ enum CBLAS_ORDER order; - int M, N, lda; + CBLAS_INT M, N, lda; assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE); assert (is_blasable2d(is2_n, sizeof(@typ@), n, 1, sizeof(@typ@))); - M = (int)m; - N = (int)n; + M = (CBLAS_INT)m; + N = (CBLAS_INT)n; if (is_blasable2d(is1_m, is1_n, m, n, sizeof(@typ@))) { order = CblasColMajor; - lda = (int)(is1_m / sizeof(@typ@)); + lda = (CBLAS_INT)(is1_m / sizeof(@typ@)); } else { /* If not ColMajor, caller should have ensured we are RowMajor */ /* will not assert in release mode */ order = CblasRowMajor; assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(@typ@))); - lda = (int)(is1_n / sizeof(@typ@)); + lda = (CBLAS_INT)(is1_n / sizeof(@typ@)); } - cblas_@prefix@gemv(order, CblasTrans, N, M, @step1@, ip1, lda, ip2, + CBLAS_FUNC(cblas_@prefix@gemv)(order, CblasTrans, N, M, @step1@, ip1, lda, ip2, is2_n / sizeof(@typ@), @step0@, op, op_m / sizeof(@typ@)); } @@ -117,37 +121,37 @@ NPY_NO_EXPORT void */ enum CBLAS_ORDER order = CblasRowMajor; enum CBLAS_TRANSPOSE trans1, trans2; - int M, N, P, lda, ldb, ldc; + CBLAS_INT M, N, P, lda, ldb, ldc; assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE && p <= BLAS_MAXSIZE); - M = (int)m; - N = (int)n; - P = (int)p; + M = (CBLAS_INT)m; + N = (CBLAS_INT)n; + P = (CBLAS_INT)p; assert(is_blasable2d(os_m, os_p, m, p, sizeof(@typ@))); - ldc = (int)(os_m / sizeof(@typ@)); + ldc = (CBLAS_INT)(os_m / sizeof(@typ@)); if (is_blasable2d(is1_m, is1_n, m, n, sizeof(@typ@))) { trans1 = CblasNoTrans; - lda = (int)(is1_m / sizeof(@typ@)); + lda = (CBLAS_INT)(is1_m / sizeof(@typ@)); } else { /* If not ColMajor, caller should have ensured we are RowMajor */ /* will not assert in release mode */ assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(@typ@))); trans1 = CblasTrans; - lda = (int)(is1_n / sizeof(@typ@)); + lda = (CBLAS_INT)(is1_n / sizeof(@typ@)); } if (is_blasable2d(is2_n, is2_p, n, p, sizeof(@typ@))) { trans2 = CblasNoTrans; - ldb = (int)(is2_n / sizeof(@typ@)); + ldb = (CBLAS_INT)(is2_n / sizeof(@typ@)); } else { /* If not ColMajor, caller should have ensured we are RowMajor */ /* will not assert in release mode */ assert(is_blasable2d(is2_p, is2_n, p, n, sizeof(@typ@))); trans2 = CblasTrans; - ldb = (int)(is2_p / sizeof(@typ@)); + ldb = (CBLAS_INT)(is2_p / sizeof(@typ@)); } /* * Use syrk if we have a case of a matrix times its transpose. @@ -162,12 +166,14 @@ NPY_NO_EXPORT void ) { npy_intp i,j; if (trans1 == CblasNoTrans) { - cblas_@prefix@syrk(order, CblasUpper, trans1, P, N, @step1@, - ip1, lda, @step0@, op, ldc); + CBLAS_FUNC(cblas_@prefix@syrk)( + order, CblasUpper, trans1, P, N, @step1@, + ip1, lda, @step0@, op, ldc); } else { - cblas_@prefix@syrk(order, CblasUpper, trans1, P, N, @step1@, - ip1, ldb, @step0@, op, ldc); + CBLAS_FUNC(cblas_@prefix@syrk)( + order, CblasUpper, trans1, P, N, @step1@, + ip1, ldb, @step0@, op, ldc); } /* Copy the triangle */ for (i = 0; i < P; i++) { @@ -178,8 +184,9 @@ NPY_NO_EXPORT void } else { - cblas_@prefix@gemm(order, trans1, trans2, M, P, N, @step1@, ip1, lda, - ip2, ldb, @step0@, op, ldc); + CBLAS_FUNC(cblas_@prefix@gemm)( + order, trans1, trans2, M, P, N, @step1@, ip1, lda, + ip2, ldb, @step0@, op, ldc); } }