From de8a10dc543f69e2111d250c79f1414b70a08686 Mon Sep 17 00:00:00 2001
From: Pauli Virtanen <pav@iki.fi>
Date: Mon, 9 Dec 2019 20:47:28 +0200
Subject: [PATCH] BUG: core: use blas_ilp64 also for *_matmul, *_dot, and
 *_vdot

Changing these to support ILP64 blas was missed in gh-15012
---
 numpy/core/src/multiarray/arraytypes.c.src | 16 +++----
 numpy/core/src/multiarray/common.h         | 10 ++++-
 numpy/core/src/multiarray/vdot.c           | 16 +++----
 numpy/core/src/umath/matmul.c.src          | 49 ++++++++++++----------
 4 files changed, 53 insertions(+), 38 deletions(-)

diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
index e36b95c00918..9e108e3e140a 100644
--- a/numpy/core/src/multiarray/arraytypes.c.src
+++ b/numpy/core/src/multiarray/arraytypes.c.src
@@ -3535,17 +3535,17 @@ NPY_NO_EXPORT void
            npy_intp n, void *NPY_UNUSED(ignore))
 {
 #if defined(HAVE_CBLAS)
-    int is1b = blas_stride(is1, sizeof(@type@));
-    int is2b = blas_stride(is2, sizeof(@type@));
+    CBLAS_INT is1b = blas_stride(is1, sizeof(@type@));
+    CBLAS_INT is2b = blas_stride(is2, sizeof(@type@));
 
     if (is1b && is2b)
     {
         double sum = 0.;  /* double for stability */
 
         while (n > 0) {
-            int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
+            CBLAS_INT chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
 
-            sum += cblas_@prefix@dot(chunk,
+            sum += CBLAS_FUNC(cblas_@prefix@dot)(chunk,
                                      (@type@ *) ip1, is1b,
                                      (@type@ *) ip2, is2b);
             /* use char strides here */
@@ -3584,17 +3584,17 @@ NPY_NO_EXPORT void
            char *op, npy_intp n, void *NPY_UNUSED(ignore))
 {
 #if defined(HAVE_CBLAS)
-    int is1b = blas_stride(is1, sizeof(@ctype@));
-    int is2b = blas_stride(is2, sizeof(@ctype@));
+    CBLAS_INT is1b = blas_stride(is1, sizeof(@ctype@));
+    CBLAS_INT is2b = blas_stride(is2, sizeof(@ctype@));
 
     if (is1b && is2b) {
         double sum[2] = {0., 0.};  /* double for stability */
 
         while (n > 0) {
-            int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
+            CBLAS_INT chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
             @type@ tmp[2];
 
-            cblas_@prefix@dotu_sub((int)n, ip1, is1b, ip2, is2b, tmp);
+            CBLAS_FUNC(cblas_@prefix@dotu_sub)((CBLAS_INT)n, ip1, is1b, ip2, is2b, tmp);
             sum[0] += (double)tmp[0];
             sum[1] += (double)tmp[1];
             /* use char strides here */
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 487d530a148b..7eee9ddc5db6 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -303,7 +303,11 @@ blas_stride(npy_intp stride, unsigned itemsize)
      */
     if (stride > 0 && npy_is_aligned((void *)stride, itemsize)) {
         stride /= itemsize;
+#ifndef HAVE_BLAS_ILP64
         if (stride <= INT_MAX) {
+#else
+        if (stride <= NPY_MAX_INT64) {
+#endif
             return stride;
         }
     }
@@ -314,7 +318,11 @@ blas_stride(npy_intp stride, unsigned itemsize)
  * Define a chunksize for CBLAS. CBLAS counts in integers.
  */
 #if NPY_MAX_INTP > INT_MAX
-# define NPY_CBLAS_CHUNK  (INT_MAX / 2 + 1)
+# ifndef HAVE_BLAS_ILP64
+#  define NPY_CBLAS_CHUNK  (INT_MAX / 2 + 1)
+# else
+#  define NPY_CBLAS_CHUNK  (NPY_MAX_INT64 / 2 + 1)
+# endif
 #else
 # define NPY_CBLAS_CHUNK  NPY_MAX_INTP
 #endif
diff --git a/numpy/core/src/multiarray/vdot.c b/numpy/core/src/multiarray/vdot.c
index 424a21710f25..9b5d19522029 100644
--- a/numpy/core/src/multiarray/vdot.c
+++ b/numpy/core/src/multiarray/vdot.c
@@ -15,17 +15,17 @@ CFLOAT_vdot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
             char *op, npy_intp n, void *NPY_UNUSED(ignore))
 {
 #if defined(HAVE_CBLAS)
-    int is1b = blas_stride(is1, sizeof(npy_cfloat));
-    int is2b = blas_stride(is2, sizeof(npy_cfloat));
+    CBLAS_INT is1b = blas_stride(is1, sizeof(npy_cfloat));
+    CBLAS_INT is2b = blas_stride(is2, sizeof(npy_cfloat));
 
     if (is1b && is2b) {
         double sum[2] = {0., 0.};  /* double for stability */
 
         while (n > 0) {
-            int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
+            CBLAS_INT chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
             float tmp[2];
 
-            cblas_cdotc_sub((int)n, ip1, is1b, ip2, is2b, tmp);
+            CBLAS_FUNC(cblas_cdotc_sub)((CBLAS_INT)n, ip1, is1b, ip2, is2b, tmp);
             sum[0] += (double)tmp[0];
             sum[1] += (double)tmp[1];
             /* use char strides here */
@@ -66,17 +66,17 @@ CDOUBLE_vdot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
              char *op, npy_intp n, void *NPY_UNUSED(ignore))
 {
 #if defined(HAVE_CBLAS)
-    int is1b = blas_stride(is1, sizeof(npy_cdouble));
-    int is2b = blas_stride(is2, sizeof(npy_cdouble));
+    CBLAS_INT is1b = blas_stride(is1, sizeof(npy_cdouble));
+    CBLAS_INT is2b = blas_stride(is2, sizeof(npy_cdouble));
 
     if (is1b && is2b) {
         double sum[2] = {0., 0.};  /* double for stability */
 
         while (n > 0) {
-            int chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
+            CBLAS_INT chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
             double tmp[2];
 
-            cblas_zdotc_sub((int)n, ip1, is1b, ip2, is2b, tmp);
+            CBLAS_FUNC(cblas_zdotc_sub)((CBLAS_INT)n, ip1, is1b, ip2, is2b, tmp);
             sum[0] += (double)tmp[0];
             sum[1] += (double)tmp[1];
             /* use char strides here */
diff --git a/numpy/core/src/umath/matmul.c.src b/numpy/core/src/umath/matmul.c.src
index b5204eca5747..c8f7c654dc17 100644
--- a/numpy/core/src/umath/matmul.c.src
+++ b/numpy/core/src/umath/matmul.c.src
@@ -31,7 +31,11 @@
  * -1 to be conservative, in case blas internally uses a for loop with an
  * inclusive upper bound
  */
+#ifndef HAVE_BLAS_ILP64
 #define BLAS_MAXSIZE (NPY_MAX_INT - 1)
+#else
+#define BLAS_MAXSIZE (NPY_MAX_INT64 - 1)
+#endif
 
 /*
  * Determine if a 2d matrix can be used by BLAS
@@ -84,25 +88,25 @@ NPY_NO_EXPORT void
      * op:  data in c order, m shape
      */
     enum CBLAS_ORDER order;
-    int M, N, lda;
+    CBLAS_INT M, N, lda;
 
     assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE);
     assert (is_blasable2d(is2_n, sizeof(@typ@), n, 1, sizeof(@typ@)));
-    M = (int)m;
-    N = (int)n;
+    M = (CBLAS_INT)m;
+    N = (CBLAS_INT)n;
 
     if (is_blasable2d(is1_m, is1_n, m, n, sizeof(@typ@))) {
         order = CblasColMajor;
-        lda = (int)(is1_m / sizeof(@typ@));
+        lda = (CBLAS_INT)(is1_m / sizeof(@typ@));
     }
     else {
         /* If not ColMajor, caller should have ensured we are RowMajor */
         /* will not assert in release mode */
         order = CblasRowMajor;
         assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(@typ@)));
-        lda = (int)(is1_n / sizeof(@typ@));
+        lda = (CBLAS_INT)(is1_n / sizeof(@typ@));
     }
-    cblas_@prefix@gemv(order, CblasTrans, N, M, @step1@, ip1, lda, ip2,
+    CBLAS_FUNC(cblas_@prefix@gemv)(order, CblasTrans, N, M, @step1@, ip1, lda, ip2,
                                      is2_n / sizeof(@typ@), @step0@, op, op_m / sizeof(@typ@));
 }
 
@@ -117,37 +121,37 @@ NPY_NO_EXPORT void
      */
     enum CBLAS_ORDER order = CblasRowMajor;
     enum CBLAS_TRANSPOSE trans1, trans2;
-    int M, N, P, lda, ldb, ldc;
+    CBLAS_INT M, N, P, lda, ldb, ldc;
     assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE && p <= BLAS_MAXSIZE);
-    M = (int)m;
-    N = (int)n;
-    P = (int)p;
+    M = (CBLAS_INT)m;
+    N = (CBLAS_INT)n;
+    P = (CBLAS_INT)p;
 
     assert(is_blasable2d(os_m, os_p, m, p, sizeof(@typ@)));
-    ldc = (int)(os_m / sizeof(@typ@));
+    ldc = (CBLAS_INT)(os_m / sizeof(@typ@));
 
     if (is_blasable2d(is1_m, is1_n, m, n, sizeof(@typ@))) {
         trans1 = CblasNoTrans;
-        lda = (int)(is1_m / sizeof(@typ@));
+        lda = (CBLAS_INT)(is1_m / sizeof(@typ@));
     }
     else {
         /* If not ColMajor, caller should have ensured we are RowMajor */
         /* will not assert in release mode */
         assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(@typ@)));
         trans1 = CblasTrans;
-        lda = (int)(is1_n / sizeof(@typ@));
+        lda = (CBLAS_INT)(is1_n / sizeof(@typ@));
     }
 
     if (is_blasable2d(is2_n, is2_p, n, p, sizeof(@typ@))) {
         trans2 = CblasNoTrans;
-        ldb = (int)(is2_n / sizeof(@typ@));
+        ldb = (CBLAS_INT)(is2_n / sizeof(@typ@));
     }
     else {
         /* If not ColMajor, caller should have ensured we are RowMajor */
         /* will not assert in release mode */
         assert(is_blasable2d(is2_p, is2_n, p, n, sizeof(@typ@)));
         trans2 = CblasTrans;
-        ldb = (int)(is2_p / sizeof(@typ@));
+        ldb = (CBLAS_INT)(is2_p / sizeof(@typ@));
     }
     /*
      * Use syrk if we have a case of a matrix times its transpose.
@@ -162,12 +166,14 @@ NPY_NO_EXPORT void
     ) {
         npy_intp i,j;
         if (trans1 == CblasNoTrans) {
-            cblas_@prefix@syrk(order, CblasUpper, trans1, P, N, @step1@,
-                               ip1, lda, @step0@, op, ldc);
+            CBLAS_FUNC(cblas_@prefix@syrk)(
+                order, CblasUpper, trans1, P, N, @step1@,
+                ip1, lda, @step0@, op, ldc);
         }
         else {
-            cblas_@prefix@syrk(order, CblasUpper, trans1, P, N, @step1@,
-                               ip1, ldb, @step0@, op, ldc);
+            CBLAS_FUNC(cblas_@prefix@syrk)(
+                order, CblasUpper, trans1, P, N, @step1@,
+                ip1, ldb, @step0@, op, ldc);
         }
         /* Copy the triangle */
         for (i = 0; i < P; i++) {
@@ -178,8 +184,9 @@ NPY_NO_EXPORT void
 
     }
     else {
-        cblas_@prefix@gemm(order, trans1, trans2, M, P, N, @step1@, ip1, lda,
-                           ip2, ldb, @step0@, op, ldc);
+        CBLAS_FUNC(cblas_@prefix@gemm)(
+            order, trans1, trans2, M, P, N, @step1@, ip1, lda,
+            ip2, ldb, @step0@, op, ldc);
     }
 }