Merge pull request #1131 from KineticTheory/tstomp-limit-threads
Limit number of threads spawned for tstOMP and tstatomics
alexrlongne authored Sep 9, 2021
2 parents 916ec72 + 93a82f0 commit 3499862
Showing 2 changed files with 44 additions and 39 deletions.
55 changes: 32 additions & 23 deletions src/c4/test/tstOMP.cc
@@ -4,8 +4,7 @@
* \author Kelly Thompson
* \date Tue Jun 6 15:03:08 2006
* \brief Demonstrate basic OMP threads under MPI.
* \note Copyright (C) 2016-2020 Triad National Security, LLC.
* All rights reserved. */
* \note Copyright (C) 2011-2021 Triad National Security, LLC., All rights reserved. */
//------------------------------------------------------------------------------------------------//

#include "c4/ParallelUnitTest.hh"
@@ -79,17 +78,14 @@ bool topology_report() {
void topo_report(rtt_dsxx::UnitTest &ut, bool &one_mpi_rank_per_node) {
// Determine if MPI ranks are on unique machine nodes:
//
// If there are multiple MPI ranks per machine node, then don't use OMP
// because OMP can't restrict its threads to running only on an MPI rank's
// cores. The OMP threads will be distributed over the whole machine node.
// For example, we might choose to use 4 MPI ranks on a machine node with 16
// cores. Ideally, we could allow each MPI rank to use 4 OMP threads for a
// maximum of 4x4=16 OMP threads on the 16 core node. However, because OMP
// doesn't know about the MPI ranks sharing the 16 cores, the even
// distribution of OMP threads is not guaranteed.
// If there are multiple MPI ranks per machine node, then don't use OMP because OMP can't restrict
// its threads to running only on an MPI rank's cores. The OMP threads will be distributed over
// the whole machine node. For example, we might choose to use 4 MPI ranks on a machine node with
// 16 cores. Ideally, we could allow each MPI rank to use 4 OMP threads for a maximum of 4x4=16
// OMP threads on the 16 core node. However, because OMP doesn't know about the MPI ranks sharing
// the 16 cores, the even distribution of OMP threads is not guaranteed.
//
// So - if we have more than one MPI rank per machine node, then turn off OMP
// threads.
// So - if we have more than one MPI rank per machine node, then turn off OMP threads.
one_mpi_rank_per_node = topology_report();

std::string procname = rtt_c4::get_processor_name();
@@ -101,9 +97,13 @@ void topo_report(rtt_dsxx::UnitTest &ut, bool &one_mpi_rank_per_node) {
int num_dynamic_threads = omp_get_dynamic();

int tid(-1);
int nthreads(-1), maxthreads(-1);
int nthreads(-1);
int maxthreads(-1);

maxthreads = omp_get_max_threads();
// This is just a unit test. Limit the parallelism.
if (maxthreads > 16)
omp_set_num_threads(16);

#pragma omp parallel private(tid)
{
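
For reference, a minimal standalone sketch of the thread-capping pattern these hunks apply: query omp_get_max_threads(), clamp the thread count before opening the parallel region. The 16-thread limit mirrors the test's choice; the file name and printf output here are illustrative only, not part of the commit.

// cap_threads.cc -- build with OpenMP enabled, e.g. g++ -fopenmp cap_threads.cc
#include <cstdio>
#include <omp.h>

int main() {
  int const maxthreads = omp_get_max_threads();
  // This is just a demonstration: never ask for more than 16 threads,
  // regardless of how many cores (or OMP_NUM_THREADS) the machine offers.
  if (maxthreads > 16)
    omp_set_num_threads(16);

#pragma omp parallel
  {
    if (omp_get_thread_num() == 0)
      std::printf("running with %d threads (hardware offers %d)\n",
                  omp_get_num_threads(), maxthreads);
  }
  return 0;
}
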
@@ -172,6 +172,12 @@ void sample_sum(rtt_dsxx::UnitTest &ut, bool const omrpn) {

#ifdef OPENMP_FOUND
{
// This is just a unit test. Limit the parallelism.
int maxthreads(-1);
maxthreads = omp_get_max_threads();
if (maxthreads > 16)
omp_set_num_threads(16);

// More than 1 MPI rank per node --> turn off OMP.
if (!omrpn)
omp_set_num_threads(1);
@@ -231,9 +237,8 @@ void sample_sum(rtt_dsxx::UnitTest &ut, bool const omrpn) {
<< std::endl;
}

// [2015-11-17 KT] The accumulate test no longer provides enough work
// to offset the overhead of OpenMP, especially for the optimized
// build. Turn this test off...
// [2015-11-17 KT] The accumulate test no longer provides enough work to offset the overhead of
// OpenMP, especially for the optimized build. Turn this test off...

// if( omrpn && nthreads > 4 )
// {
@@ -251,12 +256,9 @@
}

//------------------------------------------------------------------------------------------------//
// This is a simple demonstration problem for OMP. Nothing really to check
// for PASS/FAIL.
// This is a simple demonstration problem for OMP. Nothing really to check for PASS/FAIL.
int MandelbrotCalculate(std::complex<double> c, int maxiter) {
// iterates z = z*z + c until |z| >= 2 or maxiter is reached, returns the
// number of iterations

// iterates z = z*z + c until |z| >= 2 or maxiter is reached, returns the number of iterations
std::complex<double> z = c;
int n = 0;
for (; n < maxiter; ++n) {
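
The body of MandelbrotCalculate is collapsed in this view. A hedged sketch of the escape-time iteration the comment describes (the repository's version may differ in details) would be:

#include <complex>

// Iterate z = z*z + c until |z| >= 2 or maxiter is reached; return the
// number of iterations taken.
int mandelbrot_iterations(std::complex<double> c, int maxiter) {
  std::complex<double> z = c;
  int n = 0;
  for (; n < maxiter; ++n) {
    if (std::abs(z) >= 2.0)
      break;        // the orbit has escaped; c is outside the set
    z = z * z + c;  // otherwise keep iterating
  }
  return n;
}
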
@@ -277,16 +279,23 @@ void MandelbrotDriver(rtt_dsxx::UnitTest &ut) {
const complex<double> center(-0.7, 0.0);
const complex<double> span(2.7, -(4 / 3.0) * 2.7 * height / width);
const complex<double> begin = center - span / 2.0;
// const complex<double> end = center+span/2.0;
const int maxiter = 100000;

// Use OMP threads
Timer t;
ostringstream image1, image2;
ostringstream image1;
ostringstream image2;
t.start();

int nthreads(-1);
#ifdef OPENMP_FOUND

// This is just a unit test. Limit the parallelism.
int maxthreads(-1);
maxthreads = omp_get_max_threads();
if (maxthreads > 16)
omp_set_num_threads(16);

#pragma omp parallel
{
if (node() == 0 && omp_get_thread_num() == 0) {
28 changes: 12 additions & 16 deletions src/ds++/test/tstatomics.cc
@@ -3,8 +3,7 @@
* \file ds++/test/tstatomics.cc
* \author Tim Kelley
* \date Thursday, Sept. 6, 2018, 10:51 am
* \note Copyright (C) 2018-2020 Triad National Security, LLC.
* All rights reserved. */
* \note Copyright (C) 2018-2021 Triad National Security, LLC., All rights reserved. */
//------------------------------------------------------------------------------------------------//

#include "ds++/Release.hh"
@@ -17,10 +16,9 @@
using rtt_dsxx::UnitTest;

//------------------------------------------------------------------------------------------------//
/* Hammer an atomic from each thread. Each iteration, the thread adds
* (tid * iteration) to the counter. The atomic ensures that everyone sees
* a consistent view of the counter: no thread overwrites the contribution
* from any other thread.
/* Hammer an atomic from each thread. Each iteration, the thread adds (tid * iteration) to the
* counter. The atomic ensures that everyone sees a consistent view of the counter: no thread
* overwrites the contribution from any other thread.
*/
void thread_action(std::atomic<double> &d, size_t N, size_t tid) {
auto const did = static_cast<double>(tid);
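
The rest of thread_action is collapsed here. With T threads each adding tid * i over iterations i = 0..N-1 (assuming zero-based indices), the expected total is (0 + 1 + ... + (T-1)) * (0 + 1 + ... + (N-1)) = [T(T-1)/2] * [N(N-1)/2]. A hedged sketch of an atomic add on a double follows; std::atomic<double>::fetch_add only exists from C++20 onward, so pre-C++20 code typically spins on compare_exchange_weak, and the fetch_add helper this test exercises may be implemented differently.

#include <atomic>
#include <cstddef>

// Add x to an atomic double with a CAS retry loop.
void add_to_atomic(std::atomic<double> &d, double x) {
  double expected = d.load();
  // Retry until no other thread modified d between our load and our store.
  // On failure, expected is refreshed with the currently stored value.
  while (!d.compare_exchange_weak(expected, expected + x)) {
  }
}

// One thread's workload, mirroring the comment above: add tid * i each iteration.
void hammer(std::atomic<double> &d, std::size_t N, std::size_t tid) {
  auto const did = static_cast<double>(tid);
  for (std::size_t i = 0; i < N; ++i)
    add_to_atomic(d, did * static_cast<double>(i));
}
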
@@ -87,14 +85,12 @@ void test_fetch_add_atomic_1e6(UnitTest &ut) {
} // test_fetch_add_atomic

// --------------------- non-atomic version --------------------------
// This should give the wrong answer nearly every time on any respectable
// thread implementation.
// This should give the wrong answer nearly every time on any respectable thread implementation.

//------------------------------------------------------------------------------------------------//
/* Similarly, hammer a POD from each thread. Each iteration, the thread adds
* (tid * iteration) to the counter. Since the threads are contending, we expect
* to have a race condition where two threads read the same value from d and
* one of the thread's write (+=) overwrites the other's.
/* Similarly, hammer a POD from each thread. Each iteration, the thread adds (tid * iteration) to
* the counter. Since the threads are contending, we expect to have a race condition where two
* threads read the same value from d and one of the thread's write (+=) overwrites the other's.
*/
void thread_action_pod(double &d, size_t N, size_t tid) {
auto const did = static_cast<double>(tid);
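
A hedged, self-contained demo of the lost-update race the comment describes: two threads both read the stale value of a plain double, both add to it, and one write clobbers the other's. This is a data race (undefined behavior), which is exactly why the test expects the "wrong" answer; the counts below are illustrative, not the test's parameters.

#include <iostream>
#include <thread>

int main() {
  double d = 0.0;
  auto work = [&d]() {
    for (int i = 0; i < 1000000; ++i)
      d += 1.0;  // unsynchronized read-modify-write
  };
  std::thread t1(work), t2(work);
  t1.join();
  t2.join();
  // Almost always prints something noticeably less than 2000000.
  std::cout << d << '\n';
  return 0;
}
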
@@ -155,8 +151,8 @@ void test_fetch_add_not_atomic(UnitTest & /*ut*/) {

// fetch_sub tests

/* Same as thread_action above, except uses fetch_sub. Total sum is just the
* negative of the preceding test.
/* Same as thread_action above, except uses fetch_sub. Total sum is just the negative of the
* preceding test.
*/
void thread_action_sub(std::atomic<double> &d, size_t N, size_t tid) {
auto const did = static_cast<double>(tid);
Expand Down Expand Up @@ -210,14 +206,14 @@ void fetch_sub_atomic_core(UnitTest &ut, size_t const n_threads, size_t const n_
} // fetch_add_atomic_core

void test_fetch_sub_atomic(UnitTest &ut) {
size_t const n_threads(19);
size_t const n_threads(8);
size_t const n_iterations(10001);
fetch_sub_atomic_core(ut, n_threads, n_iterations);
return;
} // test_fetch_add_atomic

void test_fetch_sub_atomic_1e6(UnitTest &ut) {
size_t const n_threads(19);
size_t const n_threads(8);
size_t const n_iterations(1000001);
fetch_sub_atomic_core(ut, n_threads, n_iterations);
return;