GPU FLOPS Report #100

Open

wants to merge 27 commits into marenz.pin_ci_version from gpu_flops

Commits (27)
a4391cb
added private variables that count flops as 2*N³ for gemm
rschoene Sep 19, 2024
20add86
print the flops
rschoene Sep 19, 2024
e920af0
fixed variable name
rschoene Sep 19, 2024
8a98801
forgot to commit
rschoene Sep 19, 2024
ecae284
more fixes
rschoene Sep 19, 2024
463df23
in wrong block
rschoene Sep 19, 2024
f246dba
atomics cannot be returned, so load
rschoene Sep 19, 2024
6f543ac
fixed naming
rschoene Sep 19, 2024
e97bbbe
use ptr
rschoene Sep 19, 2024
2ab434e
try parentheses as fix
rschoene Sep 19, 2024
5a74a5e
remove some of the tests
rschoene Sep 19, 2024
c3b8825
removed a semicolon m(
rschoene Sep 19, 2024
6a206e1
variable name N is not used in CUDA
rschoene Sep 19, 2024
9e620e3
use references to atomics
rschoene Sep 19, 2024
c99acc9
actually point to variable if we expect a pointer
rschoene Sep 19, 2024
83f6a1d
now the variables for flops are static so that old gccs are not as sad
rschoene Sep 19, 2024
6ff988c
hopefully it works now
rschoene Sep 19, 2024
cab63e9
forgot to save before commit
rschoene Sep 19, 2024
b0f2903
removed a line
rschoene Sep 19, 2024
3d2511d
fixed oneapi
rschoene Sep 19, 2024
4676528
had flops, need gflops, so divide by 1E9
rschoene Sep 19, 2024
46b667a
now from GFLOP to GFLOP/s
rschoene Sep 19, 2024
13b5fc5
added remark to gpu flops
rschoene Sep 19, 2024
0d650f9
added remark to GPU Flops
rschoene Sep 20, 2024
286e777
Merge branch 'master' into gpu_flops
marenz2569 Jan 15, 2025
d635191
fix type
marenz2569 Jan 15, 2025
36a6361
Merge branch 'marenz.pin_ci_version' into gpu_flops
marenz2569 Jan 15, 2025
45 changes: 32 additions & 13 deletions include/firestarter/Cuda/Cuda.hpp
@@ -23,21 +23,38 @@

#include "firestarter/Constants.hpp"

#include <atomic>
#include <condition_variable>
#include <cstddef>
#include <thread>

namespace firestarter::cuda {

/// This struct contains the estimated number of flop that have been executed. It is incremented by the flop count of
/// a kernel once its execution is complete.
struct GpuFlop {
/// The number of executed single precision flop
std::atomic<std::size_t> SingleFlop = 0;
/// The number of executed double precision flop
std::atomic<std::size_t> DoubleFlop = 0;
};

/// This class handles the workload on CUDA and HIP compatible GPUs. A gemm routine is used to stress them with a
/// constant high load. This header does not include any CUDA or HIP specific headers to allow us to not guard the
/// include of this header in other parts of the programm.
class Cuda {
private:
/// The thread that is used to initilize the gpus. This thread will wait until each thread that runs the gemm routine
/// joins.
/// The thread that is used to initilize the gpus. This thread will wait until each thread that runs the gemm
/// routine joins.
std::thread InitThread;

/// The estimated number of flop that have been executed. It is incremented by the flop count of a kernel once its
/// execution is complete.
GpuFlop ExecutedFlop;

/// Spawns a thread for each of the selected gpus, initilizes them and starts the execution of the gemm in parallel.
/// \arg ExecutedFlop The variable that contains the estimated number of flop that have been executed. It is
/// incremented by the flop count of a kernel once its execution is complete.
/// \arg WaitForInitCv The condition variables used to signal that all gpus are initialized.
/// \arg LoadVar A reference to the variable that controlls the current load of Firestarter.
/// \arg UseFloat Set to true if we want to stress using single precision floating points.
@@ -46,19 +63,17 @@ class Cuda {
/// \arg MatrixSize Set to a specific matrix size which will be choosen for the gemm operation or set to 0 for
/// automatic selection.
/// \arg Gpus Select the number of gpus to stress or -1 for all.
static void initGpus(std::condition_variable& WaitForInitCv, const volatile firestarter::LoadThreadWorkType& LoadVar,
bool UseFloat, bool UseDouble, unsigned MatrixSize, int Gpus);
static void initGpus(GpuFlop& ExecutedFlop, std::condition_variable& WaitForInitCv,
const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble,
unsigned MatrixSize, int Gpus);

public:
/// Initilize the cuda class. This will start a thread running the Cuda::initGpus function and wait until all gpus are
/// inititialized.
/// \arg LoadVar A reference to the variable that controlls the current load of Firestarter.
/// \arg UseFloat Set to true if we want to stress using single precision floating points.
/// \arg UseDouble Set to true if we want to stress using double precision floating points. If neither UseFloat or
/// UseDouble is set the precision will be choosen automatically.
/// \arg MatrixSize Set to a specific matrix size which will be choosen for the gemm operation or set to 0 for
/// automatic selection.
/// \arg Gpus Select the number of gpus to stress or -1 for all.
/// Initilize the cuda class. This will start a thread running the Cuda::initGpus function and wait until all gpus
/// are inititialized. \arg LoadVar A reference to the variable that controlls the current load of Firestarter. \arg
/// UseFloat Set to true if we want to stress using single precision floating points. \arg UseDouble Set to true if
/// we want to stress using double precision floating points. If neither UseFloat or UseDouble is set the precision
/// will be choosen automatically. \arg MatrixSize Set to a specific matrix size which will be choosen for the gemm
/// operation or set to 0 for automatic selection. \arg Gpus Select the number of gpus to stress or -1 for all.
Cuda(const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble, unsigned MatrixSize,
int Gpus)
#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP)
@@ -78,6 +93,10 @@ class Cuda {
InitThread.join();
}
}

/// Get the estimate of the flop executed by the CUDA/HIP gpu threads.
/// \returns The estimated number of executed flop.
auto executedFlop() -> const GpuFlop& { return ExecutedFlop; };
};

} // namespace firestarter::cuda
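
The counters added above are consumed later in printPerformanceReport (see the LoadWorker.cpp hunk further down). As a minimal sketch of that read path, assuming the header path from this diff and placeholder names for the instance and runtime value:

// Sketch only, not part of this diff. `CudaInstance` and `RuntimeSeconds` are placeholder
// names; the conversion mirrors the PrintGpuFlops lambda added in LoadWorker.cpp.
#include "firestarter/Cuda/Cuda.hpp"

#include <iostream>

void reportCudaGflops(firestarter::cuda::Cuda& CudaInstance, double RuntimeSeconds) {
  const auto& Flop = CudaInstance.executedFlop();
  // The counters hold raw flop; scale by 1e-9 and divide by the runtime to get GFLOP/s.
  const double SingleGflops = static_cast<double>(Flop.SingleFlop.load()) * 1e-9 / RuntimeSeconds;
  const double DoubleGflops = static_cast<double>(Flop.DoubleFlop.load()) * 1e-9 / RuntimeSeconds;
  if (SingleGflops > 0) {
    std::cout << "estimated floating point performance (GPU): " << SingleGflops << " GFLOPS (single)\n";
  }
  if (DoubleGflops > 0) {
    std::cout << "estimated floating point performance (GPU): " << DoubleGflops << " GFLOPS (double)\n";
  }
}

Because both members are std::atomic, the per-gpu worker threads can increment them without additional locking while the reporting thread reads them.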
26 changes: 24 additions & 2 deletions include/firestarter/OneAPI/OneAPI.hpp
@@ -23,11 +23,22 @@

#include "firestarter/Constants.hpp"

#include <atomic>
#include <condition_variable>
#include <cstddef>
#include <thread>

namespace firestarter::oneapi {

/// This struct contains the estimated number of flop that have been executed. It is incremented by the flop count of
/// a kernel once its execution is complete.
struct GpuFlop {
/// The number of executed single precision flop
std::atomic<std::size_t> SingleFlop = 0;
/// The number of executed double precision flop
std::atomic<std::size_t> DoubleFlop = 0;
};

/// This class handles the workload on OneAPI compatible GPUs. A gemm routine is used to stress them with a
/// constant high load. This header does not include any OneAPI specific headers to allow us to not guard the
/// include of this header in other parts of the programm.
@@ -37,7 +48,13 @@ class OneAPI {
/// joins.
std::thread InitThread;

/// The estimated number of flop that have been executed. It is incremented by the flop count of a kernel once its
/// execution is complete.
GpuFlop ExecutedFlop;

/// Spawns a thread for each of the selected gpus, initilizes them and starts the execution of the gemm in parallel.
/// \arg ExecutedFlop The variable that contains the estimated number of flop that have been executed. It is
/// incremented by the flop count of a kernel once its execution is complete.
/// \arg WaitForInitCv The condition variables used to signal that all gpus are initialized.
/// \arg LoadVar A reference to the variable that controlls the current load of Firestarter.
/// \arg UseFloat Set to true if we want to stress using single precision floating points.
@@ -46,8 +63,9 @@ class OneAPI {
/// \arg MatrixSize Set to a specific matrix size which will be choosen for the gemm operation or set to 0 for
/// automatic selection.
/// \arg Gpus Select the number of gpus to stress or -1 for all.
static void initGpus(std::condition_variable& WaitForInitCv, const volatile firestarter::LoadThreadWorkType& LoadVar,
bool UseFloat, bool UseDouble, unsigned MatrixSize, int Gpus);
static void initGpus(GpuFlop& ExecutedFlop, std::condition_variable& WaitForInitCv,
const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble,
unsigned MatrixSize, int Gpus);

public:
/// Initilize the OneAPI class. This will start a thread running the OneAPI::initGpus function and wait until all gpus
@@ -78,6 +96,10 @@ class OneAPI {
InitThread.join();
}
}

/// Get the estimate of the flop executed by the OneAPI gpu threads.
/// \returns The estimated number of executed flop.
auto executedFlop() -> const GpuFlop& { return ExecutedFlop; };
};

} // namespace firestarter::oneapi
37 changes: 23 additions & 14 deletions src/firestarter/Cuda/Cuda.cpp
@@ -51,12 +51,11 @@ template <std::size_t Multiple> auto roundUp(int NumToRound) -> int {
}

/// Convert the UseDouble input (0 -> single precision, 1 -> double precision, 2 -> automatic) to either 0 or 1 for
/// float or double respectively. For CUDART_VERSION at least equal 8000 and automatic selection we check if the card a
/// singleToDoublePrecisionPerfRatio bigger than 3 and select float in this case otherwise double. In all other cases
/// automatic results in double.
/// \arg UseDouble The input that specifies either single precision, double precision or automatic selection.
/// \arg Properties The device properties.
/// \return The selected precision, either 0 or 1 for float or double respectively.
/// float or double respectively. For CUDART_VERSION at least equal 8000 and automatic selection we check if the card
/// has a singleToDoublePrecisionPerfRatio bigger than 3 and select float in this case, otherwise double. In all other
/// cases automatic results in double. \arg UseDouble The input that specifies either single precision, double
/// precision or automatic selection. \arg Properties The device properties. \return The selected precision, either 0
/// or 1 for float or double respectively.
auto getPrecision(int UseDouble, const compat::DeviceProperties& Properties) -> int {
#if (CUDART_VERSION >= 8000)
// read precision ratio (dp/sp) of GPU to choose the right variant for maximum
@@ -122,8 +121,8 @@ auto getPrecision(int DeviceIndex, int UseDouble) -> int {
// GPU index. Used to pin this thread to the GPU.
// Size use is one square matrix dim size
template <typename FloatingPointType>
void createLoad(std::condition_variable& WaitForInitCv, std::mutex& WaitForInitCvMutex, int DeviceIndex,
std::atomic<int>& InitCount, const volatile firestarter::LoadThreadWorkType& LoadVar,
void createLoad(GpuFlop& ExecutedFlop, std::condition_variable& WaitForInitCv, std::mutex& WaitForInitCvMutex,
int DeviceIndex, std::atomic<int>& InitCount, const volatile firestarter::LoadThreadWorkType& LoadVar,
unsigned MatrixSize) {
static_assert(std::is_same_v<FloatingPointType, float> || std::is_same_v<FloatingPointType, double>,
"create_load<FloatingPointType>: Template argument must be either float or double");
@@ -248,6 +247,14 @@ void createLoad(std::condition_variable& WaitForInitCv, std::mutex& WaitForInitC
MatrixSize, Beta, CSectionPtr, MatrixSize),
__FILE__, __LINE__, DeviceIndex);
compat::accellSafeCall(compat::deviceSynchronize(), __FILE__, __LINE__, DeviceIndex);

// The number of executed flop for a gemm with two square 'MatrixSize' sized matrices is 2 *
// ('MatrixSize'^3)
if (std::is_same_v<FloatingPointType, float>) {
ExecutedFlop.SingleFlop += 2 * MatrixSize * MatrixSize * MatrixSize;
} else if (std::is_same_v<FloatingPointType, double>) {
ExecutedFlop.DoubleFlop += 2 * MatrixSize * MatrixSize * MatrixSize;
}
}
}

@@ -267,16 +274,18 @@ Cuda::Cuda(const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloa
std::condition_variable WaitForInitCv;
std::mutex WaitForInitCvMutex;

std::thread T(Cuda::initGpus, std::ref(WaitForInitCv), std::cref(LoadVar), UseFloat, UseDouble, MatrixSize, Gpus);
std::thread T(Cuda::initGpus, std::ref(ExecutedFlop), std::ref(WaitForInitCv), std::cref(LoadVar), UseFloat,
UseDouble, MatrixSize, Gpus);
InitThread = std::move(T);

std::unique_lock<std::mutex> Lk(WaitForInitCvMutex);
// wait for gpus to initialize
WaitForInitCv.wait(Lk);
}

void Cuda::initGpus(std::condition_variable& WaitForInitCv, const volatile firestarter::LoadThreadWorkType& LoadVar,
bool UseFloat, bool UseDouble, unsigned MatrixSize, int Gpus) {
void Cuda::initGpus(GpuFlop& ExecutedFlop, std::condition_variable& WaitForInitCv,
const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble,
unsigned MatrixSize, int Gpus) {
std::condition_variable GpuThreadsWaitForInitCv;
std::mutex GpuThreadsWaitForInitCvMutex;
std::vector<std::thread> GpuThreads;
@@ -327,12 +336,12 @@ void Cuda::initGpus(std::condition_variable& WaitForInitCv, const volatile fires
// if there's a GPU in the system without Double Precision support, we
// have to correct this.
const auto Precision = getPrecision(I, UseDoubleConverted);
void (*LoadFunc)(std::condition_variable&, std::mutex&, int, std::atomic<int>&,
void (*LoadFunc)(GpuFlop&, std::condition_variable&, std::mutex&, int, std::atomic<int>&,
const volatile firestarter::LoadThreadWorkType&, unsigned) =
Precision ? createLoad<double> : createLoad<float>;

std::thread T(LoadFunc, std::ref(GpuThreadsWaitForInitCv), std::ref(GpuThreadsWaitForInitCvMutex), I,
std::ref(InitCount), std::cref(LoadVar), MatrixSize);
std::thread T(LoadFunc, std::ref(ExecutedFlop), std::ref(GpuThreadsWaitForInitCv),
std::ref(GpuThreadsWaitForInitCvMutex), I, std::ref(InitCount), std::cref(LoadVar), MatrixSize);
GpuThreads.emplace_back(std::move(T));
}
}
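For reference, the 2·N³ increment used in the hunk above is the textbook operation count of a square gemm: N³ multiplications plus N³ additions per C = alpha*A*B + beta*C invocation. A minimal sketch of that bookkeeping, with hypothetical names, evaluated in std::size_t:

// Sketch only, not part of this diff; `gemmFlop` is a hypothetical helper.
// (Assumes a 64-bit std::size_t.)
#include <cstddef>

constexpr auto gemmFlop(std::size_t MatrixSize) -> std::size_t {
  // An N x N gemm performs roughly N^3 multiplications and N^3 additions,
  // i.e. 2 * N^3 flop per kernel invocation.
  return 2 * MatrixSize * MatrixSize * MatrixSize;
}

// Example: a 4096 x 4096 gemm is roughly 137 GFLOP per invocation.
static_assert(gemmFlop(4096) == 2ULL * 4096 * 4096 * 4096);

Note that in the hunk above the product 2 * MatrixSize * MatrixSize * MatrixSize is evaluated in unsigned before being added to the std::size_t atomic, so matrix sizes above roughly 1290 could wrap 32-bit arithmetic; widening one operand, as in the sketch, would avoid that.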
38 changes: 35 additions & 3 deletions src/firestarter/LoadWorker.cpp
@@ -224,19 +224,51 @@ void Firestarter::printPerformanceReport() {
return Ss.str();
};

const auto PrintGpuFlops = [&Runtime, &FormatString](auto& GpuPtr) -> void {
if (!GpuPtr) {
return;
}

auto SingleFlops = static_cast<double>(GpuPtr->executedFlop().SingleFlop.load()) * 0.000000001 / Runtime;
auto DoubleFlops = static_cast<double>(GpuPtr->executedFlop().DoubleFlop.load()) * 0.000000001 / Runtime;

if (SingleFlops > 0) {
log::debug() << "\n"
<< "estimated floating point performance (GPU)**: " << FormatString(SingleFlops)
<< " GFLOPS (single)";
}

if (DoubleFlops > 0) {
log::debug() << "\n"
<< "estimated floating point performance (GPU)**: " << FormatString(DoubleFlops)
<< " GFLOPS (double)";
}
};

log::debug() << "\n"
<< "total iterations: " << Iterations << "\n"
<< "runtime: " << FormatString(Runtime) << " seconds (" << StopTimestamp - StartTimestamp << " cycles)\n"
<< "\n"
<< "estimated floating point performance: " << FormatString(GFlops) << " GFLOPS\n"
<< "estimated memory bandwidth*: " << FormatString(Bandwidth) << " GB/s\n"
<< "\n"
<< "estimated floating point performance (CPU): " << FormatString(GFlops) << " GFLOPS\n"
<< "estimated memory bandwidth (CPU)*: " << FormatString(Bandwidth) << " GB/s";

PrintGpuFlops(Cuda);
PrintGpuFlops(Oneapi);

log::debug() << "\n"
<< "* this estimate is highly unreliable if --function is used in order "
"to "
"select\n"
<< " a function that is not optimized for your architecture, or if "
"FIRESTARTER is\n"
<< " executed on an unsupported architecture!";

if (Cuda || Oneapi) {
log::debug()
<< "** this estimate is based on the assumption that no algorithmically optimized version\n"
<< " of the called algorithm has been implemented by the vendor. It also might not be not accurate\n"
<< " for short runs of FIRESTARTER";
}
}

void Firestarter::loadThreadWorker(const std::shared_ptr<LoadWorkerData>& Td) {
28 changes: 19 additions & 9 deletions src/firestarter/OneAPI/OneAPI.cpp
@@ -130,8 +130,8 @@ template <std::size_t Multiple> auto roundUp(int NumToRound) -> int {
// The main difference to the CUDA/HIP version is that we do not run multiple iterations of C=A*B, just one single
// iteration.
template <typename FloatingPointType>
void createLoad(std::condition_variable& WaitForInitCv, std::mutex& WaitForInitCvMutex, int DeviceIndex,
std::atomic<int>& InitCount, const volatile firestarter::LoadThreadWorkType& LoadVar,
void createLoad(GpuFlop& ExecutedFlop, std::condition_variable& WaitForInitCv, std::mutex& WaitForInitCvMutex,
int DeviceIndex, std::atomic<int>& InitCount, const volatile firestarter::LoadThreadWorkType& LoadVar,
unsigned MatrixSize) {
static_assert(std::is_same<FloatingPointType, float>::value || std::is_same<FloatingPointType, double>::value,
"createLoad<T>: Template argument T must be either float or double");
@@ -236,6 +236,14 @@ void createLoad(std::condition_variable& WaitForInitCv, std::mutex& WaitForInitC
MatrixSize, MatrixSize, 1, A, MatrixSize, B, MatrixSize, 0, C, MatrixSize);
firestarter::log::trace() << "wait gemm on device nr. " << DeviceIndex;
DeviceQueue.wait_and_throw();

// The number of executed flop for a gemm with two square 'MatrixSize' sized matrices is 2 *
// ('MatrixSize'^3)
if (std::is_same_v<FloatingPointType, float>) {
ExecutedFlop.SingleFlop += 2 * MatrixSize * MatrixSize * MatrixSize;
} else if (std::is_same_v<FloatingPointType, double>) {
ExecutedFlop.DoubleFlop += 2 * MatrixSize * MatrixSize * MatrixSize;
}
}
}

@@ -246,16 +254,18 @@ OneAPI::OneAPI(const volatile firestarter::LoadThreadWorkType& LoadVar, bool Use
std::condition_variable WaitForInitCv;
std::mutex WaitForInitCvMutex;

std::thread T(OneAPI::initGpus, std::ref(WaitForInitCv), std::cref(LoadVar), UseFloat, UseDouble, MatrixSize, Gpus);
std::thread T(OneAPI::initGpus, std::ref(ExecutedFlop), std::ref(WaitForInitCv), std::cref(LoadVar), UseFloat,
UseDouble, MatrixSize, Gpus);
InitThread = std::move(T);

std::unique_lock<std::mutex> Lk(WaitForInitCvMutex);
// wait for gpus to initialize
WaitForInitCv.wait(Lk);
}

void OneAPI::initGpus(std::condition_variable& WaitForInitCv, const volatile firestarter::LoadThreadWorkType& LoadVar,
bool UseFloat, bool UseDouble, unsigned MatrixSize, int Gpus) {
void OneAPI::initGpus(GpuFlop& ExecutedFlop, std::condition_variable& WaitForInitCv,
const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble,
unsigned MatrixSize, int Gpus) {
std::condition_variable GpuThreadsWaitForInitCv;
std::mutex GpuThreadsWaitForInitCvMutex;
std::vector<std::thread> GpuThreads;
@@ -320,12 +330,12 @@ void OneAPI::initGpus(std::condition_variable& WaitForInitCv, const volatile fir
if (Precision == -1) {
firestarter::log::warn() << "This should not have happened. Could not get precision via SYCL.";
}
void (*LoadFunc)(std::condition_variable&, std::mutex&, int, std::atomic<int>&,
void (*LoadFunc)(GpuFlop&, std::condition_variable&, std::mutex&, int, std::atomic<int>&,
const volatile firestarter::LoadThreadWorkType&, unsigned) =
Precision ? createLoad<double> : createLoad<float>;

std::thread T(LoadFunc, std::ref(GpuThreadsWaitForInitCv), std::ref(GpuThreadsWaitForInitCvMutex), I,
std::ref(InitCount), std::cref(LoadVar), MatrixSize);
std::thread T(LoadFunc, std::ref(ExecutedFlop), std::ref(GpuThreadsWaitForInitCv),
std::ref(GpuThreadsWaitForInitCvMutex), I, std::ref(InitCount), std::cref(LoadVar), MatrixSize);
GpuThreads.emplace_back(std::move(T));
}
}
@@ -354,4 +364,4 @@ void OneAPI::initGpus(std::condition_variable& WaitForInitCv, const volatile fir
}
}

} // namespace firestarter::oneapi
} // namespace firestarter::oneapi