From ec49fa861c6c4a1073ea12bb88391ec47a106753 Mon Sep 17 00:00:00 2001 From: "Victor A. P. Magri" Date: Tue, 3 Dec 2024 22:43:13 -0500 Subject: [PATCH 1/4] Add hypre_NumOptimalThreads + fix threading issue in ext+i prolongation --- src/parcsr_ls/par_lr_interp.c | 28 ++++++-------- src/utilities/_hypre_utilities.h | 6 ++- src/utilities/threading.c | 65 +++++++++++++++++++++++++++----- src/utilities/threading.h | 6 ++- 4 files changed, 76 insertions(+), 29 deletions(-) diff --git a/src/parcsr_ls/par_lr_interp.c b/src/parcsr_ls/par_lr_interp.c index 5311f81bbf..914ca05692 100644 --- a/src/parcsr_ls/par_lr_interp.c +++ b/src/parcsr_ls/par_lr_interp.c @@ -1120,17 +1120,16 @@ hypre_BoomerAMGBuildExtPIInterpHost(hypre_ParCSRMatrix *A, HYPRE_Int strong_f_marker; /* Loop variables */ - /*HYPRE_Int index;*/ HYPRE_Int start_indexing = 0; HYPRE_Int i, i1, i2, jj, kk, k1, jj1; HYPRE_BigInt big_k1; /* Threading variables */ - HYPRE_Int my_thread_num, num_threads, start, stop; - HYPRE_Int * max_num_threads = hypre_CTAlloc(HYPRE_Int, 1, HYPRE_MEMORY_HOST); - HYPRE_Int * diag_offset; - HYPRE_Int * fine_to_coarse_offset; - HYPRE_Int * offd_offset; + HYPRE_Int num_threads = hypre_NumOptimalThreads(n_fine); + HYPRE_Int my_thread_num, start, stop; + HYPRE_Int *diag_offset; + HYPRE_Int *fine_to_coarse_offset; + HYPRE_Int *offd_offset; /* Definitions */ HYPRE_Real zero = 0.0; @@ -1209,11 +1208,10 @@ hypre_BoomerAMGBuildExtPIInterpHost(hypre_ParCSRMatrix *A, /*----------------------------------------------------------------------- * Initialize threading variables *-----------------------------------------------------------------------*/ - max_num_threads[0] = hypre_NumThreads(); - diag_offset = hypre_CTAlloc(HYPRE_Int, max_num_threads[0], HYPRE_MEMORY_HOST); - fine_to_coarse_offset = hypre_CTAlloc(HYPRE_Int, max_num_threads[0], HYPRE_MEMORY_HOST); - offd_offset = hypre_CTAlloc(HYPRE_Int, max_num_threads[0], HYPRE_MEMORY_HOST); - for (i = 0; i < max_num_threads[0]; i++) + diag_offset = hypre_CTAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST); + fine_to_coarse_offset = hypre_CTAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST); + offd_offset = hypre_CTAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST); + for (i = 0; i < num_threads; i++) { diag_offset[i] = 0; fine_to_coarse_offset[i] = 0; @@ -1224,7 +1222,7 @@ hypre_BoomerAMGBuildExtPIInterpHost(hypre_ParCSRMatrix *A, * Loop over fine grid. *-----------------------------------------------------------------------*/ #ifdef HYPRE_USING_OPENMP - #pragma omp parallel private(i,my_thread_num,num_threads,start,stop,coarse_counter,jj_counter,jj_counter_offd, P_marker, P_marker_offd,jj,kk,i1,k1,loc_col,jj_begin_row,jj_begin_row_offd,jj_end_row,jj_end_row_offd,diagonal,sum,sgn,jj1,i2,distribute,strong_f_marker, big_k1) + #pragma omp parallel num_threads(num_threads) private(i,my_thread_num,start,stop,coarse_counter,jj_counter,jj_counter_offd, P_marker, P_marker_offd,jj,kk,i1,k1,loc_col,jj_begin_row,jj_begin_row_offd,jj_end_row,jj_end_row_offd,diagonal,sum,sgn,jj1,i2,distribute,strong_f_marker, big_k1) #endif { @@ -1253,20 +1251,19 @@ hypre_BoomerAMGBuildExtPIInterpHost(hypre_ParCSRMatrix *A, jj_counter_offd = start_indexing; if (n_fine) { - P_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST); + P_marker = hypre_CTAlloc(HYPRE_Int, n_fine, HYPRE_MEMORY_HOST); for (i = 0; i < n_fine; i++) { P_marker[i] = -1; } } if (full_off_procNodes) { - P_marker_offd = hypre_CTAlloc(HYPRE_Int, full_off_procNodes, HYPRE_MEMORY_HOST); + P_marker_offd = hypre_CTAlloc(HYPRE_Int, full_off_procNodes, HYPRE_MEMORY_HOST); for (i = 0; i < full_off_procNodes; i++) { P_marker_offd[i] = -1;} } /* this thread's row range */ my_thread_num = hypre_GetThreadNum(); - num_threads = hypre_NumActiveThreads(); start = (n_fine / num_threads) * my_thread_num; if (my_thread_num == num_threads - 1) { stop = n_fine; } @@ -1895,7 +1892,6 @@ hypre_BoomerAMGBuildExtPIInterpHost(hypre_ParCSRMatrix *A, *P_ptr = P; /* Deallocate memory */ - hypre_TFree(max_num_threads, HYPRE_MEMORY_HOST); hypre_TFree(fine_to_coarse, HYPRE_MEMORY_HOST); hypre_TFree(diag_offset, HYPRE_MEMORY_HOST); hypre_TFree(offd_offset, HYPRE_MEMORY_HOST); diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h index 1cc4e7dcd0..abfe876a19 100644 --- a/src/utilities/_hypre_utilities.h +++ b/src/utilities/_hypre_utilities.h @@ -1593,16 +1593,18 @@ HYPRE_Int HYPRE_OMPOffloadStatPrint(void); #ifdef HYPRE_USING_OPENMP HYPRE_Int hypre_NumThreads( void ); +HYPRE_Int hypre_NumOptimalThreads( HYPRE_Int size ); +void hypre_SetNumThreads(HYPRE_Int nt); HYPRE_Int hypre_NumActiveThreads( void ); HYPRE_Int hypre_GetThreadNum( void ); -void hypre_SetNumThreads(HYPRE_Int nt); #else #define hypre_NumThreads() 1 +#define hypre_NumOptimalThreads(x) 1 +#define hypre_SetNumThreads(x) #define hypre_NumActiveThreads() 1 #define hypre_GetThreadNum() 0 -#define hypre_SetNumThreads(x) #endif diff --git a/src/utilities/threading.c b/src/utilities/threading.c index fbd12ffc5b..bd10850664 100644 --- a/src/utilities/threading.c +++ b/src/utilities/threading.c @@ -11,6 +11,12 @@ #ifdef HYPRE_USING_OPENMP +/*-------------------------------------------------------------------------- + * hypre_NumThreads + * + * Returns the maximum number of threads that can be used. + *--------------------------------------------------------------------------*/ + HYPRE_Int hypre_NumThreads( void ) { @@ -21,7 +27,44 @@ hypre_NumThreads( void ) return num_threads; } -/* This next function must be called from within a parallel region! */ +/*-------------------------------------------------------------------------- + * hypre_NumOptimalThreads + * + * Returns the optimal number of threads for the given problem size. Considers + * the minimum work per thread and the maximum number of threads to avoid + * thread creation overhead. Must be called from outside of a parallel region. + *--------------------------------------------------------------------------*/ + +HYPRE_Int +hypre_NumOptimalThreads(HYPRE_Int size) +{ + /* Minimum work per thread */ + const HYPRE_Int min_rows_per_thread = 500; + + HYPRE_Int optimal_threads = size / min_rows_per_thread; + + return hypre_max(1, hypre_min(optimal_threads, omp_get_max_threads())); +} + +/*-------------------------------------------------------------------------- + * hypre_SetNumThreads + * + * Sets the number of threads to use. Must be called from outside of a + * parallel region. + *--------------------------------------------------------------------------*/ + +void +hypre_SetNumThreads( HYPRE_Int nt ) +{ + omp_set_num_threads(nt); +} + +/*-------------------------------------------------------------------------- + * hypre_NumActiveThreads + * + * Returns the number of threads currently active. Must be called from within + * a parallel region. + *--------------------------------------------------------------------------*/ HYPRE_Int hypre_NumActiveThreads( void ) @@ -33,7 +76,12 @@ hypre_NumActiveThreads( void ) return num_threads; } -/* This next function must be called from within a parallel region! */ +/*-------------------------------------------------------------------------- + * hypre_GetThreadNum + * + * Returns the thread ID of the calling thread. Must be called from within a + * parallel region. + *--------------------------------------------------------------------------*/ HYPRE_Int hypre_GetThreadNum( void ) @@ -45,15 +93,14 @@ hypre_GetThreadNum( void ) return my_thread_num; } -void -hypre_SetNumThreads( HYPRE_Int nt ) -{ - omp_set_num_threads(nt); -} - #endif -/* This next function must be called from within a parallel region! */ +/*-------------------------------------------------------------------------- + * hypre_GetSimpleThreadPartition + * + * Partitions the rows of a matrix into a simple thread partition. Must be + * called from within a parallel region. + *--------------------------------------------------------------------------*/ void hypre_GetSimpleThreadPartition( HYPRE_Int *begin, HYPRE_Int *end, HYPRE_Int n ) diff --git a/src/utilities/threading.h b/src/utilities/threading.h index d5621645fd..f1f70ed569 100644 --- a/src/utilities/threading.h +++ b/src/utilities/threading.h @@ -11,16 +11,18 @@ #ifdef HYPRE_USING_OPENMP HYPRE_Int hypre_NumThreads( void ); +HYPRE_Int hypre_NumOptimalThreads( HYPRE_Int size ); +void hypre_SetNumThreads(HYPRE_Int nt); HYPRE_Int hypre_NumActiveThreads( void ); HYPRE_Int hypre_GetThreadNum( void ); -void hypre_SetNumThreads(HYPRE_Int nt); #else #define hypre_NumThreads() 1 +#define hypre_NumOptimalThreads(x) 1 +#define hypre_SetNumThreads(x) #define hypre_NumActiveThreads() 1 #define hypre_GetThreadNum() 0 -#define hypre_SetNumThreads(x) #endif From 3dbc8e726a4fa5a9a920c2ffeb8c251eb079ac39 Mon Sep 17 00:00:00 2001 From: "Victor A. P. Magri" Date: Tue, 3 Dec 2024 23:06:38 -0500 Subject: [PATCH 2/4] Improve opt threads computation --- src/utilities/threading.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/utilities/threading.c b/src/utilities/threading.c index bd10850664..4a72ef8ca6 100644 --- a/src/utilities/threading.c +++ b/src/utilities/threading.c @@ -30,9 +30,9 @@ hypre_NumThreads( void ) /*-------------------------------------------------------------------------- * hypre_NumOptimalThreads * - * Returns the optimal number of threads for the given problem size. Considers - * the minimum work per thread and the maximum number of threads to avoid - * thread creation overhead. Must be called from outside of a parallel region. + * Returns the optimal number of threads for the given problem size. Ensures + * each thread has at least min_rows_per_thread amount of work. + * Must be called from outside of a parallel region. *--------------------------------------------------------------------------*/ HYPRE_Int @@ -41,9 +41,13 @@ hypre_NumOptimalThreads(HYPRE_Int size) /* Minimum work per thread */ const HYPRE_Int min_rows_per_thread = 500; - HYPRE_Int optimal_threads = size / min_rows_per_thread; + /* Calculate threads needed to maintain minimum workload per thread */ + HYPRE_Int max_available_threads = omp_get_max_threads(); + HYPRE_Int desired_threads = (size + min_rows_per_thread - 1) / + min_rows_per_thread; - return hypre_max(1, hypre_min(optimal_threads, omp_get_max_threads())); + /* Return minimum of desired and available threads, but at least 1 */ + return hypre_max(1, hypre_min(desired_threads, max_available_threads)); } /*-------------------------------------------------------------------------- From de6500c6b24adb011204df3aee8f27ebaedc6e2a Mon Sep 17 00:00:00 2001 From: "Victor A. P. Magri" Date: Fri, 6 Dec 2024 00:46:16 -0500 Subject: [PATCH 3/4] Add HYPRE_SetNumThreads --- src/utilities/HYPRE_utilities.h | 9 +++++++++ src/utilities/threading.c | 15 +++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/src/utilities/HYPRE_utilities.h b/src/utilities/HYPRE_utilities.h index 8e928ff4e9..561f883c77 100644 --- a/src/utilities/HYPRE_utilities.h +++ b/src/utilities/HYPRE_utilities.h @@ -489,6 +489,15 @@ HYPRE_Int HYPRE_SetUseGpuRand( HYPRE_Int use_curand ); **/ HYPRE_Int HYPRE_SetGpuAwareMPI( HYPRE_Int use_gpu_aware_mpi ); +/** + * Sets the number of threads to use in parallel regions. + * Must be called from outside of a parallel region. + * + * @param num_threads The number of threads to use + * @return Returns hypre's global error code + */ +HYPRE_Int HYPRE_SetNumThreads( HYPRE_Int num_threads ); + /*-------------------------------------------------------------------------- * Base objects *--------------------------------------------------------------------------*/ diff --git a/src/utilities/threading.c b/src/utilities/threading.c index 4a72ef8ca6..e71054f286 100644 --- a/src/utilities/threading.c +++ b/src/utilities/threading.c @@ -117,3 +117,18 @@ hypre_GetSimpleThreadPartition( HYPRE_Int *begin, HYPRE_Int *end, HYPRE_Int n ) *begin = hypre_min(n_per_thread * my_thread_num, n); *end = hypre_min(*begin + n_per_thread, n); } + +/*-------------------------------------------------------------------------- + * HYPRE_SetNumThreads + * + * Sets the number of threads to use. Must be called from outside of a + * parallel region. + *--------------------------------------------------------------------------*/ + +HYPRE_Int +HYPRE_SetNumThreads( HYPRE_Int num_threads ) +{ + hypre_SetNumThreads(num_threads); + + return hypre_error_flag; +} \ No newline at end of file From 237def8db2c17bc5db69f80ed3bab9803487c588 Mon Sep 17 00:00:00 2001 From: "Victor A. P. Magri" Date: Tue, 7 Jan 2025 11:47:06 -0500 Subject: [PATCH 4/4] Fix num_threads usage in CSRMatrix routines --- src/seq_mv/csr_matop.c | 125 +++++++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 55 deletions(-) diff --git a/src/seq_mv/csr_matop.c b/src/seq_mv/csr_matop.c index cbedfb8c2c..6bcb2e2600 100644 --- a/src/seq_mv/csr_matop.c +++ b/src/seq_mv/csr_matop.c @@ -333,13 +333,7 @@ hypre_CSRMatrixAddSecondPass( HYPRE_Int firstrow, } /*-------------------------------------------------------------------------- - * hypre_CSRMatrixAdd: - * - * Adds two CSR Matrices A and B and returns a CSR Matrix C = alpha*A + beta*B; - * - * Note: The routine does not check for 0-elements which might be generated - * through cancellation of elements in A and B or already contained - * in A and B. To remove those, use hypre_CSRMatrixDeleteZeros + * hypre_CSRMatrixAddHost *--------------------------------------------------------------------------*/ hypre_CSRMatrix* @@ -366,6 +360,7 @@ hypre_CSRMatrixAddHost ( HYPRE_Complex alpha, HYPRE_Int *rownnz_C; HYPRE_Int nnzrows_C; + HYPRE_Int num_threads; HYPRE_Int *twspace; HYPRE_MemoryLocation memory_location_A = hypre_CSRMatrixMemoryLocation(A); @@ -387,12 +382,7 @@ hypre_CSRMatrixAddHost ( HYPRE_Complex alpha, return NULL; } - /* Allocate memory */ - twspace = hypre_TAlloc(HYPRE_Int, hypre_NumThreads(), HYPRE_MEMORY_HOST); - C_i = hypre_CTAlloc(HYPRE_Int, nrows_A + 1, memory_location_C); - /* Set nonzero rows data of diag_C */ - nnzrows_C = nrows_A; if ((nnzrows_A < nrows_A) && (nnzrows_B < nrows_B)) { hypre_IntArray arr_A; @@ -412,17 +402,23 @@ hypre_CSRMatrixAddHost ( HYPRE_Complex alpha, } else { - rownnz_C = NULL; + nnzrows_C = nrows_A; + rownnz_C = NULL; } + /* Allocate memory */ + num_threads = hypre_NumOptimalThreads(nnzrows_C); + twspace = hypre_TAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST); + C_i = hypre_CTAlloc(HYPRE_Int, nrows_A + 1, memory_location_C); + #ifdef HYPRE_USING_OPENMP - #pragma omp parallel + #pragma omp parallel num_threads(num_threads) #endif { HYPRE_Int ns, ne; HYPRE_Int *marker = NULL; - hypre_partition1D(nnzrows_C, hypre_NumActiveThreads(), hypre_GetThreadNum(), &ns, &ne); + hypre_partition1D(nnzrows_C, num_threads, hypre_GetThreadNum(), &ns, &ne); marker = hypre_CTAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_HOST); @@ -442,6 +438,16 @@ hypre_CSRMatrixAddHost ( HYPRE_Complex alpha, return C; } +/*-------------------------------------------------------------------------- + * hypre_CSRMatrixAdd + * + * Adds two CSR Matrices A and B and returns a CSR Matrix C = alpha*A + beta*B; + * + * Note: The routine does not check for 0-elements which might be generated + * through cancellation of elements in A and B or already contained + * in A and B. To remove those, use hypre_CSRMatrixDeleteZeros + *--------------------------------------------------------------------------*/ + hypre_CSRMatrix* hypre_CSRMatrixAdd( HYPRE_Complex alpha, hypre_CSRMatrix *A, @@ -503,6 +509,7 @@ hypre_CSRMatrixBigAdd( hypre_CSRMatrix *A, HYPRE_Int *C_i; HYPRE_BigInt *C_j; HYPRE_Int *twspace; + HYPRE_Int num_threads = hypre_NumOptimalThreads(nrows_A); HYPRE_MemoryLocation memory_location_A = hypre_CSRMatrixMemoryLocation(A); HYPRE_MemoryLocation memory_location_B = hypre_CSRMatrixMemoryLocation(B); @@ -524,22 +531,21 @@ hypre_CSRMatrixBigAdd( hypre_CSRMatrix *A, } /* Allocate memory */ - twspace = hypre_TAlloc(HYPRE_Int, hypre_NumThreads(), HYPRE_MEMORY_HOST); + twspace = hypre_TAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST); C_i = hypre_CTAlloc(HYPRE_Int, nrows_A + 1, memory_location_C); #ifdef HYPRE_USING_OPENMP - #pragma omp parallel + #pragma omp parallel num_threads(num_threads) #endif { HYPRE_Int ia, ib, ic, num_nonzeros; HYPRE_Int ns, ne, pos; HYPRE_BigInt jcol; - HYPRE_Int ii, num_threads; + HYPRE_Int ii; HYPRE_Int jj; HYPRE_Int *marker = NULL; ii = hypre_GetThreadNum(); - num_threads = hypre_NumActiveThreads(); hypre_partition1D(nrows_A, num_threads, ii, &ns, &ne); marker = hypre_CTAlloc(HYPRE_Int, ncols_A, HYPRE_MEMORY_HOST); @@ -694,6 +700,7 @@ hypre_CSRMatrixMultiplyHost( hypre_CSRMatrix *A, HYPRE_Complex a_entry, b_entry; HYPRE_Int allsquare = 0; HYPRE_Int *twspace; + HYPRE_Int num_threads = hypre_NumOptimalThreads(nnzrows_A); /* RL: TODO cannot guarantee, maybe should never assert hypre_assert(memory_location_A == memory_location_B); @@ -726,20 +733,18 @@ hypre_CSRMatrixMultiplyHost( hypre_CSRMatrix *A, } /* Allocate memory */ - twspace = hypre_TAlloc(HYPRE_Int, hypre_NumThreads(), HYPRE_MEMORY_HOST); + twspace = hypre_TAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST); C_i = hypre_CTAlloc(HYPRE_Int, nrows_A + 1, memory_location_C); #ifdef HYPRE_USING_OPENMP - #pragma omp parallel private(ia, ib, ic, ja, jb, num_nonzeros, counter, a_entry, b_entry) + #pragma omp parallel private(ia, ib, ic, ja, jb, num_nonzeros, counter, a_entry, b_entry) num_threads(num_threads) #endif { HYPRE_Int *B_marker = NULL; HYPRE_Int ns, ne, ii, jj; - HYPRE_Int num_threads; HYPRE_Int i1, iic; ii = hypre_GetThreadNum(); - num_threads = hypre_NumActiveThreads(); hypre_partition1D(nnzrows_A, num_threads, ii, &ns, &ne); B_marker = hypre_CTAlloc(HYPRE_Int, ncols_B, HYPRE_MEMORY_HOST); @@ -926,6 +931,16 @@ hypre_CSRMatrixMultiplyHost( hypre_CSRMatrix *A, return C; } +/*-------------------------------------------------------------------------- + * hypre_CSRMatrixMultiply + * + * Multiplies two CSR Matrices A and B and returns a CSR Matrix C; + * + * Note: The routine does not check for 0-elements which might be generated + * through cancellation of elements in A and B or already contained + * in A and B. To remove those, use hypre_CSRMatrixDeleteZeros + *--------------------------------------------------------------------------*/ + hypre_CSRMatrix* hypre_CSRMatrixMultiply( hypre_CSRMatrix *A, hypre_CSRMatrix *B) @@ -1061,6 +1076,8 @@ hypre_CSRMatrixTransposeHost(hypre_CSRMatrix *A, HYPRE_Int num_cols_AT; HYPRE_Int num_nnzs_AT; + HYPRE_Int *bucket; + HYPRE_Int num_threads; HYPRE_Int max_col; HYPRE_Int i, j; @@ -1119,14 +1136,14 @@ hypre_CSRMatrixTransposeHost(hypre_CSRMatrix *A, /*----------------------------------------------------------------- * Parallel count sort *-----------------------------------------------------------------*/ - HYPRE_Int *bucket = hypre_CTAlloc(HYPRE_Int, (num_cols_A + 1) * hypre_NumThreads(), - HYPRE_MEMORY_HOST); + num_threads = hypre_NumOptimalThreads(nnzrows_A); + bucket = hypre_CTAlloc(HYPRE_Int, (num_cols_A + 1) * num_threads, HYPRE_MEMORY_HOST); #ifdef HYPRE_USING_OPENMP - #pragma omp parallel + #pragma omp parallel num_threads(num_threads) #endif { - HYPRE_Int ii, num_threads, ns, ne; + HYPRE_Int ii, ns, ne; HYPRE_Int i, j, j0, j1, ir; HYPRE_Int idx, offset; HYPRE_Int transpose_i; @@ -1136,7 +1153,6 @@ hypre_CSRMatrixTransposeHost(hypre_CSRMatrix *A, HYPRE_Int transpose_j1; ii = hypre_GetThreadNum(); - num_threads = hypre_NumActiveThreads(); hypre_partition1D(nnzrows_A, num_threads, ii, &ns, &ne); /*----------------------------------------------------------------- @@ -1252,8 +1268,8 @@ hypre_CSRMatrixTransposeHost(hypre_CSRMatrix *A, } /* end parallel region */ hypre_CSRMatrixI(*AT) = hypre_TAlloc(HYPRE_Int, num_cols_A + 1, memory_location); - hypre_TMemcpy(hypre_CSRMatrixI(*AT), bucket, HYPRE_Int, num_cols_A + 1, memory_location, - HYPRE_MEMORY_HOST); + hypre_TMemcpy(hypre_CSRMatrixI(*AT), bucket, HYPRE_Int, num_cols_A + 1, + memory_location, HYPRE_MEMORY_HOST); hypre_CSRMatrixI(*AT)[num_cols_A] = num_nnzs_A; hypre_TFree(bucket, HYPRE_MEMORY_HOST); @@ -1313,47 +1329,46 @@ hypre_CSRMatrixSplit(hypre_CSRMatrix *Bs_ext, hypre_CSRMatrix **Bext_diag_ptr, hypre_CSRMatrix **Bext_offd_ptr) { - HYPRE_Complex *Bs_ext_data = hypre_CSRMatrixData(Bs_ext); - HYPRE_Int *Bs_ext_i = hypre_CSRMatrixI(Bs_ext); - HYPRE_BigInt *Bs_ext_j = hypre_CSRMatrixBigJ(Bs_ext); - HYPRE_Int num_rows_Bext = hypre_CSRMatrixNumRows(Bs_ext); + HYPRE_Complex *Bs_ext_data = hypre_CSRMatrixData(Bs_ext); + HYPRE_Int *Bs_ext_i = hypre_CSRMatrixI(Bs_ext); + HYPRE_BigInt *Bs_ext_j = hypre_CSRMatrixBigJ(Bs_ext); + HYPRE_Int num_rows_Bext = hypre_CSRMatrixNumRows(Bs_ext); HYPRE_Int B_ext_diag_size = 0; HYPRE_Int B_ext_offd_size = 0; - HYPRE_Int *B_ext_diag_i = NULL; - HYPRE_Int *B_ext_diag_j = NULL; + HYPRE_Int *B_ext_diag_i = NULL; + HYPRE_Int *B_ext_diag_j = NULL; HYPRE_Complex *B_ext_diag_data = NULL; - HYPRE_Int *B_ext_offd_i = NULL; - HYPRE_Int *B_ext_offd_j = NULL; + HYPRE_Int *B_ext_offd_i = NULL; + HYPRE_Int *B_ext_offd_j = NULL; HYPRE_BigInt *B_ext_offd_bigj = NULL; HYPRE_Complex *B_ext_offd_data = NULL; + HYPRE_Int *my_diag_array; HYPRE_Int *my_offd_array; HYPRE_BigInt *temp = NULL; - HYPRE_Int max_num_threads; + HYPRE_Int num_threads = hypre_NumOptimalThreads(num_rows_Bext); + HYPRE_Int cnt = 0; hypre_CSRMatrix *Bext_diag = NULL; hypre_CSRMatrix *Bext_offd = NULL; HYPRE_BigInt *col_map_offd_C = NULL; HYPRE_Int num_cols_offd_C = 0; - B_ext_diag_i = hypre_CTAlloc(HYPRE_Int, num_rows_Bext + 1, HYPRE_MEMORY_HOST); - B_ext_offd_i = hypre_CTAlloc(HYPRE_Int, num_rows_Bext + 1, HYPRE_MEMORY_HOST); - - max_num_threads = hypre_NumThreads(); - my_diag_array = hypre_CTAlloc(HYPRE_Int, max_num_threads, HYPRE_MEMORY_HOST); - my_offd_array = hypre_CTAlloc(HYPRE_Int, max_num_threads, HYPRE_MEMORY_HOST); + B_ext_diag_i = hypre_CTAlloc(HYPRE_Int, num_rows_Bext + 1, HYPRE_MEMORY_HOST); + B_ext_offd_i = hypre_CTAlloc(HYPRE_Int, num_rows_Bext + 1, HYPRE_MEMORY_HOST); + my_diag_array = hypre_CTAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST); + my_offd_array = hypre_CTAlloc(HYPRE_Int, num_threads, HYPRE_MEMORY_HOST); #ifdef HYPRE_USING_OPENMP - #pragma omp parallel + #pragma omp parallel num_threads(num_threads) #endif { - HYPRE_Int ns, ne, ii, num_threads; + HYPRE_Int ns, ne, ii; HYPRE_Int i1, i, j; HYPRE_Int my_offd_size, my_diag_size; HYPRE_Int cnt_offd, cnt_diag; ii = hypre_GetThreadNum(); - num_threads = hypre_NumActiveThreads(); hypre_partition1D(num_rows_Bext, num_threads, ii, &ns, &ne); my_diag_size = 0; @@ -1536,13 +1551,13 @@ hypre_CSRMatrixSplit(hypre_CSRMatrix *Bs_ext, HYPRE_Int hypre_CSRMatrixReorderHost(hypre_CSRMatrix *A) { - HYPRE_Complex *A_data = hypre_CSRMatrixData(A); - HYPRE_Int *A_i = hypre_CSRMatrixI(A); - HYPRE_Int *A_j = hypre_CSRMatrixJ(A); - HYPRE_Int *rownnz_A = hypre_CSRMatrixRownnz(A); - HYPRE_Int nnzrows_A = hypre_CSRMatrixNumRownnz(A); - HYPRE_Int num_rows_A = hypre_CSRMatrixNumRows(A); - HYPRE_Int num_cols_A = hypre_CSRMatrixNumCols(A); + HYPRE_Complex *A_data = hypre_CSRMatrixData(A); + HYPRE_Int *A_i = hypre_CSRMatrixI(A); + HYPRE_Int *A_j = hypre_CSRMatrixJ(A); + HYPRE_Int *rownnz_A = hypre_CSRMatrixRownnz(A); + HYPRE_Int nnzrows_A = hypre_CSRMatrixNumRownnz(A); + HYPRE_Int num_rows_A = hypre_CSRMatrixNumRows(A); + HYPRE_Int num_cols_A = hypre_CSRMatrixNumCols(A); HYPRE_Int i, ii, j;