-
Notifications
You must be signed in to change notification settings - Fork 372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
add method to query HW size #630
base: master
Are you sure you want to change the base?
Changes from all commits
8bd9160
f6e1f56
b3844fd
3193c90
629ef19
3697489
5906e3f
5550527
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
/* | ||
|
||
BLIS | ||
An object-based framework for developing high-performance BLAS-like | ||
libraries. | ||
|
||
Copyright (C) 2022 NVIDIA | ||
|
||
Redistribution and use in source and binary forms, with or without | ||
modification, are permitted provided that the following conditions are | ||
met: | ||
- Redistributions of source code must retain the above copyright | ||
notice, this list of conditions and the following disclaimer. | ||
- Redistributions in binary form must reproduce the above copyright | ||
notice, this list of conditions and the following disclaimer in the | ||
documentation and/or other materials provided with the distribution. | ||
- Neither the name(s) of the copyright holder(s) nor the names of its | ||
contributors may be used to endorse or promote products derived | ||
from this software without specific prior written permission. | ||
|
||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
*/ | ||
|
||
// this macro has to come before any other headers. | ||
// i hate this but cannot figure out any other way to solve it. | ||
#define _GNU_SOURCE | ||
|
||
#include "bli_affinity.h" | ||
|
||
// we need a way to detect oversubscription of the kind where | ||
// hierarchical parallelism is used and the affinity mask within | ||
// which BLIS runs does not have enough hardware threads to support | ||
// the requested software threads. | ||
// | ||
// this is motivated by, or related to: | ||
// https://github.com/flame/blis/issues/588 | ||
// https://github.com/flame/blis/pull/607 | ||
// https://github.com/flame/blis/issues/604 | ||
// https://github.com/flame/blis/issues/603 | ||
|
||
#ifndef BLIS_OS_LINUX | ||
|
||
// define the symbol for platforms like Windows and MacOS that do not support the Linux affinity API | ||
|
||
dim_t bli_affinity_get_hw_size(bli_affinity_scope_t scope)
{
    // This variant is built when BLIS_OS_LINUX is not defined, i.e. the
    // Linux affinity API is unavailable, so the requested scope is ignored.
    ( void )scope;

    // Largest value this function can return; it signals that the
    // affinity mask places no constraint on the current scope.
    const dim_t unconstrained_hw_size = 1024;

    return unconstrained_hw_size;
}
|
||
#else // BLIS_OS_LINUX | ||
|
||
#include <sched.h> | ||
#include <unistd.h> | ||
|
||
// scope is either the calling process or the calling thread: | ||
// 0 = calling process | ||
// 1 = calling thread | ||
|
||
/*
 * Return the number of hardware threads (logical CPUs) enabled in the
 * affinity mask of the requested scope.
 *
 * scope: `process` queries the mask using the PID returned by getpid();
 *        any other value queries the calling thread (pid 0).
 *
 * Returns the number of CPUs set in the mask. If sched_getaffinity()
 * fails, a message is printed and bli_abort() is called.
 */
dim_t bli_affinity_get_hw_size(bli_affinity_scope_t scope)
{
    // sched_getaffinity() interprets pid 0 as "the calling thread".
    const pid_t pid = ( scope == process ) ? getpid() : 0;

    cpu_set_t mask;
    CPU_ZERO( &mask );

    // A statically sized cpu_set_t describes CPU_SETSIZE CPUs. If the
    // kernel's CPU mask is larger than that, this call fails and the
    // dynamically sized CPU_ALLOC() interface is needed instead.
    // See https://man7.org/linux/man-pages/man2/sched_getaffinity.2.html
    const int rc = sched_getaffinity( pid, sizeof( cpu_set_t ), &mask );
    if ( rc != 0 )
    {
        bli_print_msg( "sched_getaffinity failed",
                       __FILE__, __LINE__ );
        bli_abort();
    }

    // Count every bit in the set. Note that sizeof( cpu_set_t ) is the
    // size of the set in bytes, not the number of CPUs it can describe,
    // so it must not be used as the upper bound of a per-CPU loop.
    return ( dim_t )CPU_COUNT( &mask );
}
|
||
#endif // BLIS_OS_LINUX |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
/* | ||
|
||
BLIS | ||
An object-based framework for developing high-performance BLAS-like | ||
libraries. | ||
|
||
Copyright (C) 2022 NVIDIA | ||
|
||
Redistribution and use in source and binary forms, with or without | ||
modification, are permitted provided that the following conditions are | ||
met: | ||
- Redistributions of source code must retain the above copyright | ||
notice, this list of conditions and the following disclaimer. | ||
- Redistributions in binary form must reproduce the above copyright | ||
notice, this list of conditions and the following disclaimer in the | ||
documentation and/or other materials provided with the distribution. | ||
- Neither the name(s) of the copyright holder(s) nor the names of its | ||
contributors may be used to endorse or promote products derived | ||
from this software without specific prior written permission. | ||
|
||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
*/ | ||
|
||
#ifndef BLIS_AFFINITY_H | ||
#define BLIS_AFFINITY_H | ||
|
||
#include "blis.h" | ||
|
||
// Selects whose CPU affinity mask is queried.
typedef enum
{
    process = 0, // the calling process
    thread  = 1  // the calling thread
} bli_affinity_scope_t;
|
||
/*
 * Query how many hardware threads the CPU affinity mask makes available
 * to the given scope (the calling process or the calling thread).
 */
dim_t bli_affinity_get_hw_size( bli_affinity_scope_t scope );
|
||
#endif // BLIS_AFFINITY_H |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,7 @@ | |
|
||
Copyright (C) 2014, The University of Texas at Austin | ||
Copyright (C) 2018, Advanced Micro Devices, Inc. | ||
Copyright (C) 2022, NVIDIA | ||
|
||
Redistribution and use in source and binary forms, with or without | ||
modification, are permitted provided that the following conditions are | ||
|
@@ -199,6 +200,12 @@ void bli_l3_thread_decorator_thread_check | |
) | ||
{ | ||
dim_t n_threads_real = omp_get_num_threads(); | ||
dim_t n_threads_hwmask; | ||
if ( omp_in_parallel() ) { | ||
n_threads_hwmask = bli_affinity_get_hw_size(thread); | ||
} else { | ||
n_threads_hwmask = bli_affinity_get_hw_size(process); | ||
} | ||
|
||
// Check if the number of OpenMP threads created within this parallel | ||
// region is different from the number of threads that were requested | ||
|
@@ -241,6 +248,27 @@ void bli_l3_thread_decorator_thread_check | |
|
||
// Synchronize all threads and continue. | ||
_Pragma( "omp barrier" ) | ||
|
||
return; | ||
} | ||
|
||
// Check if the number of threads requested, or the number of OpenMP | ||
// threads actually created within this parallel region, exceeds the | ||
// number of hardware threads available to BLIS in the calling context. | ||
if ( n_threads_hwmask < n_threads || n_threads_hwmask < n_threads_real) | ||
{ | ||
bli_print_msg( "The affinity mask on this process does not have " | ||
"enough HW threads for your requested SW threads.", | ||
__FILE__, __LINE__ ); | ||
bli_abort(); | ||
|
||
bli_thrcomm_init( n_threads_hwmask, gl_comm ); | ||
bli_rntm_set_num_threads_only( n_threads_hwmask, rntm ); | ||
#warning HELP ME HERE | ||
bli_rntm_set_ways_only( 1, 1, 1, 1, 1, rntm ); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @devinamatthews this is the part where i need help. i can just do serialization, but i was thinking about trying to do slightly better, e.g. if i have 40 cores and the user wants to run on 80 threads, i can set things up to use 40 threads properly. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Reducing the number of active threads after thread creation is possible, but not easy. If this logic can go before the OpenMP region then things are much easier. |
||
|
||
// Synchronize all threads and continue. | ||
_Pragma( "omp barrier" ) | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
/* | ||
|
||
BLIS | ||
An object-based framework for developing high-performance BLAS-like | ||
libraries. | ||
|
||
Copyright (C) 2022 NVIDIA | ||
|
||
Redistribution and use in source and binary forms, with or without | ||
modification, are permitted provided that the following conditions are | ||
met: | ||
- Redistributions of source code must retain the above copyright | ||
notice, this list of conditions and the following disclaimer. | ||
- Redistributions in binary form must reproduce the above copyright | ||
notice, this list of conditions and the following disclaimer in the | ||
documentation and/or other materials provided with the distribution. | ||
- Neither the name(s) of the copyright holder(s) nor the names of its | ||
contributors may be used to endorse or promote products derived | ||
from this software without specific prior written permission. | ||
|
||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | ||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
*/ | ||
|
||
// this macro has to come before any other headers. | ||
// i hate this but cannot figure out any other way to solve it. | ||
#define _GNU_SOURCE | ||
|
||
#include <sched.h> | ||
#include <unistd.h> | ||
|
||
#include <stdio.h> | ||
#include <stdlib.h> | ||
|
||
#include <cblas.h> | ||
|
||
int main(void) | ||
{ | ||
int m=10, n=10, k=10; | ||
double A[100], B[100], C[100]; | ||
|
||
for (int i=0; i<100; i++) { | ||
A[i] = B[i] = C[i] = 1.0; | ||
} | ||
|
||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, | ||
10, 10, 10, 1.0, A, 10, B, 10, 1.0, C, 10); | ||
|
||
{ | ||
int rc; | ||
pid_t pid = getpid(); | ||
cpu_set_t old_mask, new_mask; | ||
int active_cpus; | ||
|
||
CPU_ZERO(&old_mask); | ||
|
||
rc = sched_getaffinity(pid, sizeof(cpu_set_t), &old_mask); | ||
if (rc) { | ||
printf("sched_getaffinity returned %d\n", rc); | ||
abort(); | ||
} | ||
|
||
active_cpus = 0; | ||
for (int i=0; i<sizeof(cpu_set_t); i++) { | ||
const int on = CPU_ISSET(i, &old_mask); | ||
if (on) active_cpus++; | ||
} | ||
printf("active CPUs before = %d\n", active_cpus); | ||
|
||
CPU_ZERO(&new_mask); | ||
|
||
for (int i=0, j=0; i<sizeof(cpu_set_t); i++) { | ||
const int on = CPU_ISSET(i, &old_mask); | ||
if (on) { | ||
if (j < active_cpus / 2) { | ||
CPU_SET(i, &new_mask); | ||
j++; | ||
} | ||
} | ||
} | ||
|
||
active_cpus = 0; | ||
for (int i=0; i<sizeof(cpu_set_t); i++) { | ||
const int on = CPU_ISSET(i, &new_mask); | ||
if (on) active_cpus++; | ||
} | ||
printf("active CPUs after = %d\n", active_cpus); | ||
|
||
rc = sched_setaffinity(pid, sizeof(cpu_set_t), &new_mask); | ||
if (rc) { | ||
printf("sched_getaffinity returned %d\n", rc); | ||
abort(); | ||
} | ||
|
||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, | ||
10, 10, 10, 1.0, A, 10, B, 10, 1.0, C, 10); | ||
|
||
printf("AFTER\n"); | ||
|
||
} | ||
return 0; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@devinamatthews here, we look for the number of HW threads available to the thread if in OpenMP already, or the process, if not in OpenMP already, and if the user is trying to use more SW threads than HW threads, we abort.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, but this requires the user to have set the CPU mask (or OMP affinity?) correctly. What if I just run BLIS in OpenMP (with nesting) and set both OMP_NUM_THREADS and BLIS_NUM_THREADS equal to the number of cores? Or is this only meant to guard against a more specific use case?