Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add method to query HW size #630

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions frame/include/blis.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ extern "C" {

#include "bli_thread.h"
#include "bli_pthread.h"
#include "bli_affinity.h"


// -- Constant definitions --
Expand Down
106 changes: 106 additions & 0 deletions frame/thread/bli_affinity.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/*

BLIS
An object-based framework for developing high-performance BLAS-like
libraries.

Copyright (C) 2022 NVIDIA

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// _GNU_SOURCE must be defined before any system header is included:
// glibc only exposes cpu_set_t, the CPU_* macros and sched_getaffinity()
// from <sched.h> when this feature-test macro is set.
#define _GNU_SOURCE

#include "bli_affinity.h"

// we need a way to detect oversubscription of the kind where
// hierarchical parallelism is used and the affinity mask within
// which BLIS runs does not have enough hardware threads to support
// the requested software threads.
//
// this is motivated by, or related to:
// https://github.com/flame/blis/issues/588
// https://github.com/flame/blis/pull/607
// https://github.com/flame/blis/issues/604
// https://github.com/flame/blis/issues/603

#ifndef BLIS_OS_LINUX

// define the symbol for platforms like Windows and MacOS that do not support the Linux affinity API

// Fallback used when BLIS_OS_LINUX is not defined. The scope argument is
// accepted for interface compatibility but is not consulted.
dim_t bli_affinity_get_hw_size(bli_affinity_scope_t scope)
{
    // 1024 is the largest value this function can return; reporting it
    // signals that the affinity mask does not constrain the current scope.
    const dim_t unconstrained_hw_size = 1024;

    return unconstrained_hw_size;
}

#else // BLIS_OS_LINUX

#include <sched.h>
#include <unistd.h>

// scope is either the calling process or the calling thread:
// 0 = calling process
// 1 = calling thread

// Return the number of hardware threads (logical CPUs) that are set in the
// affinity mask of the requested scope.
//
// scope:
//   process - query using the pid returned by getpid()
//   thread  - query using pid 0, which sched_getaffinity() interprets as
//             the calling thread
//
// Prints a message and calls bli_abort() if sched_getaffinity() fails.
dim_t bli_affinity_get_hw_size(bli_affinity_scope_t scope)
{
    cpu_set_t mask;

    // A pid of zero tells sched_getaffinity() to use the calling thread.
    const pid_t pid = ( scope == process ) ? getpid() : 0;

    CPU_ZERO(&mask);

    // if the CPU mask is larger than 1024 bits, this needs to change.
    // see https://man7.org/linux/man-pages/man2/sched_getaffinity.2.html for details.
    if ( sched_getaffinity(pid, sizeof(cpu_set_t), &mask) != 0 ) {
        bli_print_msg( "sched_getaffinity failed",
                       __FILE__, __LINE__ );
        bli_abort();
    }

    // Count every bit of the mask. sizeof(cpu_set_t) is the size of the
    // mask in BYTES, so using it as a CPU-index bound would ignore all
    // CPUs numbered at or above that byte count. CPU_COUNT() inspects the
    // full CPU_SETSIZE bits.
    return (dim_t)CPU_COUNT(&mask);
}

#endif // BLIS_OS_LINUX
44 changes: 44 additions & 0 deletions frame/thread/bli_affinity.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/*

BLIS
An object-based framework for developing high-performance BLAS-like
libraries.

Copyright (C) 2022 NVIDIA

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#ifndef BLIS_AFFINITY_H
#define BLIS_AFFINITY_H

#include "blis.h"

// Selects whose CPU affinity mask bli_affinity_get_hw_size() inspects.
typedef enum
{
    process = 0, // the calling process
    thread  = 1  // the calling thread
} bli_affinity_scope_t;

// Returns the number of hardware threads available within the affinity mask
// of the given scope (the calling process or the calling thread).
// On builds where BLIS_OS_LINUX is not defined, the implementation returns
// the constant 1024, meaning the mask does not constrain the scope.
dim_t bli_affinity_get_hw_size(bli_affinity_scope_t scope);

#endif // BLIS_AFFINITY_H
28 changes: 28 additions & 0 deletions frame/thread/bli_l3_decor_openmp.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

Copyright (C) 2014, The University of Texas at Austin
Copyright (C) 2018, Advanced Micro Devices, Inc.
Copyright (C) 2022, NVIDIA

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
Expand Down Expand Up @@ -199,6 +200,12 @@ void bli_l3_thread_decorator_thread_check
)
{
dim_t n_threads_real = omp_get_num_threads();
dim_t n_threads_hwmask;
if ( omp_in_parallel() ) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@devinamatthews here, we look for the number of HW threads available to the thread if in OpenMP already, or the process, if not in OpenMP already, and if the user is trying to use more SW threads than HW threads, we abort.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, but this requires the user to have set the CPU mask (or OMP affinity?) correctly. What if I just run BLIS in OpenMP (with nesting) and set both OMP_NUM_THREADS and BLIS_NUM_THREADS equal to the number of cores? Or is this only meant to guard against a more specific use case?

n_threads_hwmask = bli_affinity_get_hw_size(thread);
} else {
n_threads_hwmask = bli_affinity_get_hw_size(process);
}

// Check if the number of OpenMP threads created within this parallel
// region is different from the number of threads that were requested
Expand Down Expand Up @@ -241,6 +248,27 @@ void bli_l3_thread_decorator_thread_check

// Synchronize all threads and continue.
_Pragma( "omp barrier" )

return;
}

// Check if the number of OpenMP threads created within this parallel
// region is different from the number of threads that are available
// to BLIS in the calling context.
if ( n_threads_hwmask < n_threads || n_threads_hwmask < n_threads_real)
{
bli_print_msg( "The affinity mask on this process does not have "
"enough HW threads for your requested SW threads.",
__FILE__, __LINE__ );
bli_abort();

bli_thrcomm_init( n_threads_hwmask, gl_comm );
bli_rntm_set_num_threads_only( n_threads_hwmask, rntm );
#warning HELP ME HERE
bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm );
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@devinamatthews this is the part where I need help. I can simply serialize, but I was thinking about trying to do slightly better: e.g., if I have 40 cores and the user wants to run on 80 threads, I can set things up to use 40 threads properly.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reducing the number of active threads after thread creation is possible, but not easy. If this logic can go before the OpenMP region then things are much easier.


// Synchronize all threads and continue.
_Pragma( "omp barrier" )
}
}

Expand Down
112 changes: 112 additions & 0 deletions test/other/test_affinity.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
/*

BLIS
An object-based framework for developing high-performance BLAS-like
libraries.

Copyright (C) 2022 NVIDIA

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
- Neither the name(s) of the copyright holder(s) nor the names of its
contributors may be used to endorse or promote products derived
from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

// _GNU_SOURCE must be defined before any system header is included:
// glibc only exposes cpu_set_t, the CPU_* macros and the
// sched_getaffinity()/sched_setaffinity() prototypes when it is set.
#define _GNU_SOURCE

#include <sched.h>
#include <unistd.h>

#include <stdio.h>
#include <stdlib.h>

#include <cblas.h>

// Smoke test for affinity-aware threading: run a small dgemm, shrink the
// process affinity mask to half of its CPUs (keeping at least one), then run
// the same dgemm again under the reduced mask.
//
// Returns 0 on success; calls abort() if the affinity mask cannot be read
// or written.
int main(void)
{
    // Square problem size; each matrix holds DIM * DIM elements.
    enum { DIM = 10 };
    double A[DIM * DIM], B[DIM * DIM], C[DIM * DIM];

    for (int i = 0; i < DIM * DIM; i++) {
        A[i] = B[i] = C[i] = 1.0;
    }

    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                DIM, DIM, DIM, 1.0, A, DIM, B, DIM, 1.0, C, DIM);

    {
        const pid_t pid = getpid();
        cpu_set_t old_mask, new_mask;

        CPU_ZERO(&old_mask);

        if (sched_getaffinity(pid, sizeof(cpu_set_t), &old_mask) != 0) {
            perror("sched_getaffinity");
            abort();
        }

        // CPU_COUNT() examines all CPU_SETSIZE bits of the mask.
        // sizeof(cpu_set_t) is a byte count and must not be used as a
        // bound on CPU indices.
        const int cpus_before = CPU_COUNT(&old_mask);
        printf("active CPUs before = %d\n", cpus_before);

        // Keep half of the CPUs, but never fewer than one: an empty mask
        // is rejected by sched_setaffinity().
        int cpus_to_keep = cpus_before / 2;
        if (cpus_to_keep < 1) {
            cpus_to_keep = 1;
        }

        CPU_ZERO(&new_mask);

        // Copy the lowest-numbered cpus_to_keep CPUs from the old mask.
        for (int cpu = 0, kept = 0; cpu < CPU_SETSIZE && kept < cpus_to_keep; cpu++) {
            if (CPU_ISSET(cpu, &old_mask)) {
                CPU_SET(cpu, &new_mask);
                kept++;
            }
        }

        printf("active CPUs after = %d\n", CPU_COUNT(&new_mask));

        if (sched_setaffinity(pid, sizeof(cpu_set_t), &new_mask) != 0) {
            perror("sched_setaffinity");
            abort();
        }

        cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                    DIM, DIM, DIM, 1.0, A, DIM, B, DIM, 1.0, C, DIM);

        printf("AFTER\n");
    }

    return 0;
}