GPU FLOPS Report #100

Open

wants to merge 27 commits into marenz.pin_ci_version from gpu_flops

Commits (27)
a4391cb
added private variables that count flops as 2*N³ for gemm
rschoene Sep 19, 2024
20add86
print the flops
rschoene Sep 19, 2024
e920af0
fixed variable name
rschoene Sep 19, 2024
8a98801
forgot to commit
rschoene Sep 19, 2024
ecae284
more fixes
rschoene Sep 19, 2024
463df23
in wrong block
rschoene Sep 19, 2024
f246dba
atomics cannot be returned, so load
rschoene Sep 19, 2024
6f543ac
fixed naming
rschoene Sep 19, 2024
e97bbbe
use ptr
rschoene Sep 19, 2024
2ab434e
try parentheses as fix
rschoene Sep 19, 2024
5a74a5e
remove some of the tests
rschoene Sep 19, 2024
c3b8825
removed a semicolon m(
rschoene Sep 19, 2024
6a206e1
variable name N is not used in CUDA
rschoene Sep 19, 2024
9e620e3
use references to atomics
rschoene Sep 19, 2024
c99acc9
actually point to variable if we expect a pointer
rschoene Sep 19, 2024
83f6a1d
now the variables for flops are static so that old gccs are not as sad
rschoene Sep 19, 2024
6ff988c
hopefully it works now
rschoene Sep 19, 2024
cab63e9
forgot to save before commit
rschoene Sep 19, 2024
b0f2903
removed a line
rschoene Sep 19, 2024
3d2511d
fixed oneapi
rschoene Sep 19, 2024
4676528
had flops, need gflops, so divide by 1E9
rschoene Sep 19, 2024
46b667a
now from GFLOP to GFLOP/s
rschoene Sep 19, 2024
13b5fc5
added remark to gpu flops
rschoene Sep 19, 2024
0d650f9
added remark to GPU Flops
rschoene Sep 20, 2024
286e777
Merge branch 'master' into gpu_flops
marenz2569 Jan 15, 2025
d635191
fix type
marenz2569 Jan 15, 2025
36a6361
Merge branch 'marenz.pin_ci_version' into gpu_flops
marenz2569 Jan 15, 2025
45 changes: 32 additions & 13 deletions include/firestarter/Cuda/Cuda.hpp
@@ -23,21 +23,38 @@

#include "firestarter/Constants.hpp"

#include <atomic>
#include <condition_variable>
#include <cstddef>
#include <thread>

namespace firestarter::cuda {

/// This struct contains the estimated number of flop that have been executed. It is incremented by the flop count of
/// a kernel once its execution is complete.
struct GpuFlop {
/// The number of executed single precision flop
std::atomic<std::size_t> SingleFlop = 0;
/// The number of executed double precision flop
std::atomic<std::size_t> DoubleFlop = 0;
};

/// This class handles the workload on CUDA and HIP compatible GPUs. A gemm routine is used to stress them with a
/// constant high load. This header does not include any CUDA or HIP specific headers to allow us to not guard the
/// include of this header in other parts of the programm.
class Cuda {
private:
/// The thread that is used to initilize the gpus. This thread will wait until each thread that runs the gemm routine
/// joins.
/// The thread that is used to initilize the gpus. This thread will wait until each thread that runs the gemm
/// routine joins.
std::thread InitThread;

/// The estimated number of flop that have been executed. It is incremented by the flop count of a kernel once its
/// execution is complete.
GpuFlop ExecutedFlop;

/// Spawns a thread for each of the selected gpus, initilizes them and starts the execution of the gemm in parallel.
/// \arg ExecutedFlop The variable that contains the estimated number of flop that have been executed. It is
/// incremented by the flop count of a kernel once its execution is complete.
/// \arg WaitForInitCv The condition variables used to signal that all gpus are initialized.
/// \arg LoadVar A reference to the variable that controlls the current load of Firestarter.
/// \arg UseFloat Set to true if we want to stress using single precision floating points.
@@ -46,19 +63,17 @@ class Cuda {
/// \arg MatrixSize Set to a specific matrix size which will be choosen for the gemm operation or set to 0 for
/// automatic selection.
/// \arg Gpus Select the number of gpus to stress or -1 for all.
static void initGpus(std::condition_variable& WaitForInitCv, const volatile firestarter::LoadThreadWorkType& LoadVar,
bool UseFloat, bool UseDouble, unsigned MatrixSize, int Gpus);
static void initGpus(GpuFlop& ExecutedFlop, std::condition_variable& WaitForInitCv,
const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble,
unsigned MatrixSize, int Gpus);

public:
/// Initilize the cuda class. This will start a thread running the Cuda::initGpus function and wait until all gpus are
/// inititialized.
/// \arg LoadVar A reference to the variable that controlls the current load of Firestarter.
/// \arg UseFloat Set to true if we want to stress using single precision floating points.
/// \arg UseDouble Set to true if we want to stress using double precision floating points. If neither UseFloat or
/// UseDouble is set the precision will be choosen automatically.
/// \arg MatrixSize Set to a specific matrix size which will be choosen for the gemm operation or set to 0 for
/// automatic selection.
/// \arg Gpus Select the number of gpus to stress or -1 for all.
/// Initilize the cuda class. This will start a thread running the Cuda::initGpus function and wait until all gpus
/// are inititialized. \arg LoadVar A reference to the variable that controlls the current load of Firestarter. \arg
/// UseFloat Set to true if we want to stress using single precision floating points. \arg UseDouble Set to true if
/// we want to stress using double precision floating points. If neither UseFloat or UseDouble is set the precision
/// will be choosen automatically. \arg MatrixSize Set to a specific matrix size which will be choosen for the gemm
/// operation or set to 0 for automatic selection. \arg Gpus Select the number of gpus to stress or -1 for all.
Cuda(const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble, unsigned MatrixSize,
int Gpus)
#if defined(FIRESTARTER_BUILD_CUDA) || defined(FIRESTARTER_BUILD_HIP)
@@ -78,6 +93,10 @@ class Cuda {
InitThread.join();
}
}

/// Get the estimate of the flop executed by the CUDA/HIP gpu threads.
/// \returns The estimated number of executed flop.
auto executedFlop() -> const GpuFlop& { return ExecutedFlop; };
};

} // namespace firestarter::cuda
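
The counters added above are consumed later in printPerformanceReport (see the LoadWorker.cpp hunk further down). As a minimal sketch of that read path, assuming the header path from this diff and placeholder names for the instance and runtime value:

// Sketch only, not part of this diff. `CudaInstance` and `RuntimeSeconds` are placeholder
// names; the conversion mirrors the PrintGpuFlops lambda added in LoadWorker.cpp.
#include "firestarter/Cuda/Cuda.hpp"

#include <iostream>

void reportCudaGflops(firestarter::cuda::Cuda& CudaInstance, double RuntimeSeconds) {
  const auto& Flop = CudaInstance.executedFlop();
  // The counters hold raw flop; scale by 1e-9 and divide by the runtime to get GFLOP/s.
  const double SingleGflops = static_cast<double>(Flop.SingleFlop.load()) * 1e-9 / RuntimeSeconds;
  const double DoubleGflops = static_cast<double>(Flop.DoubleFlop.load()) * 1e-9 / RuntimeSeconds;
  if (SingleGflops > 0) {
    std::cout << "estimated floating point performance (GPU): " << SingleGflops << " GFLOPS (single)\n";
  }
  if (DoubleGflops > 0) {
    std::cout << "estimated floating point performance (GPU): " << DoubleGflops << " GFLOPS (double)\n";
  }
}

Because both members are std::atomic, the per-gpu worker threads can increment them without additional locking while the reporting thread reads them.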
26 changes: 24 additions & 2 deletions include/firestarter/OneAPI/OneAPI.hpp
@@ -23,11 +23,22 @@

#include "firestarter/Constants.hpp"

#include <atomic>
#include <condition_variable>
#include <cstddef>
#include <thread>

namespace firestarter::oneapi {

/// This struct contains the estimated number of flop that have been executed. It is incremented by the flop count of
/// a kernel once its execution is complete.
struct GpuFlop {
/// The number of executed single precision flop
std::atomic<std::size_t> SingleFlop = 0;
/// The number of executed double precision flop
std::atomic<std::size_t> DoubleFlop = 0;
};

/// This class handles the workload on OneAPI compatible GPUs. A gemm routine is used to stress them with a
/// constant high load. This header does not include any OneAPI specific headers to allow us to not guard the
/// include of this header in other parts of the programm.
@@ -37,7 +48,13 @@ class OneAPI {
/// joins.
std::thread InitThread;

/// The estimated number of flop that have been executed. It is incremented by the flop count of a kernel once its
/// execution is complete.
GpuFlop ExecutedFlop;

/// Spawns a thread for each of the selected gpus, initilizes them and starts the execution of the gemm in parallel.
/// \arg ExecutedFlop The variable that contains the estimated number of flop that have been executed. It is
/// incremented by the flop count of a kernel once its execution is complete.
/// \arg WaitForInitCv The condition variables used to signal that all gpus are initialized.
/// \arg LoadVar A reference to the variable that controlls the current load of Firestarter.
/// \arg UseFloat Set to true if we want to stress using single precision floating points.
@@ -46,8 +63,9 @@ class OneAPI {
/// \arg MatrixSize Set to a specific matrix size which will be choosen for the gemm operation or set to 0 for
/// automatic selection.
/// \arg Gpus Select the number of gpus to stress or -1 for all.
static void initGpus(std::condition_variable& WaitForInitCv, const volatile firestarter::LoadThreadWorkType& LoadVar,
bool UseFloat, bool UseDouble, unsigned MatrixSize, int Gpus);
static void initGpus(GpuFlop& ExecutedFlop, std::condition_variable& WaitForInitCv,
const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble,
unsigned MatrixSize, int Gpus);

public:
/// Initilize the OneAPI class. This will start a thread running the OneAPI::initGpus function and wait until all gpus
@@ -78,6 +96,10 @@ class OneAPI {
InitThread.join();
}
}

/// Get the estimate of the flop executed by the OneAPI gpu threads.
/// \returns The estimated number of executed flop.
auto executedFlop() -> const GpuFlop& { return ExecutedFlop; };
};

} // namespace firestarter::oneapi
37 changes: 23 additions & 14 deletions src/firestarter/Cuda/Cuda.cpp
@@ -51,12 +51,11 @@ template <std::size_t Multiple> auto roundUp(int NumToRound) -> int {
}

/// Convert the UseDouble input (0 -> single precision, 1 -> double precision, 2 -> automatic) to either 0 or 1 for
/// float or double respectively. For CUDART_VERSION at least equal 8000 and automatic selection we check if the card a
/// singleToDoublePrecisionPerfRatio bigger than 3 and select float in this case otherwise double. In all other cases
/// automatic results in double.
/// \arg UseDouble The input that specifies either single precision, double precision or automatic selection.
/// \arg Properties The device properties.
/// \return The selected precision, either 0 or 1 for float or double respectively.
/// float or double respectively. For CUDART_VERSION at least equal 8000 and automatic selection we check if the card
/// has a singleToDoublePrecisionPerfRatio bigger than 3 and select float in this case, otherwise double. In all other
/// cases automatic results in double. \arg UseDouble The input that specifies either single precision, double
/// precision or automatic selection. \arg Properties The device properties. \return The selected precision, either 0
/// or 1 for float or double respectively.
auto getPrecision(int UseDouble, const compat::DeviceProperties& Properties) -> int {
#if (CUDART_VERSION >= 8000)
// read precision ratio (dp/sp) of GPU to choose the right variant for maximum
@@ -122,8 +121,8 @@ auto getPrecision(int DeviceIndex, int UseDouble) -> int {
// GPU index. Used to pin this thread to the GPU.
// Size use is one square matrix dim size
template <typename FloatingPointType>
void createLoad(std::condition_variable& WaitForInitCv, std::mutex& WaitForInitCvMutex, int DeviceIndex,
std::atomic<int>& InitCount, const volatile firestarter::LoadThreadWorkType& LoadVar,
void createLoad(GpuFlop& ExecutedFlop, std::condition_variable& WaitForInitCv, std::mutex& WaitForInitCvMutex,
int DeviceIndex, std::atomic<int>& InitCount, const volatile firestarter::LoadThreadWorkType& LoadVar,
unsigned MatrixSize) {
static_assert(std::is_same_v<FloatingPointType, float> || std::is_same_v<FloatingPointType, double>,
"create_load<FloatingPointType>: Template argument must be either float or double");
@@ -248,6 +247,14 @@ void createLoad(std::condition_variable& WaitForInitCv, std::mutex& WaitForInitC
MatrixSize, Beta, CSectionPtr, MatrixSize),
__FILE__, __LINE__, DeviceIndex);
compat::accellSafeCall(compat::deviceSynchronize(), __FILE__, __LINE__, DeviceIndex);

// The number of executed flop for a gemm with two square 'MatrixSize' sized matrices is 2 *
// ('MatrixSize'^3)
if (std::is_same_v<FloatingPointType, float>) {
ExecutedFlop.SingleFlop += 2 * MatrixSize * MatrixSize * MatrixSize;
} else if (std::is_same_v<FloatingPointType, double>) {
ExecutedFlop.DoubleFlop += 2 * MatrixSize * MatrixSize * MatrixSize;
}
}
}

@@ -267,16 +274,18 @@ Cuda::Cuda(const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloa
std::condition_variable WaitForInitCv;
std::mutex WaitForInitCvMutex;

std::thread T(Cuda::initGpus, std::ref(WaitForInitCv), std::cref(LoadVar), UseFloat, UseDouble, MatrixSize, Gpus);
std::thread T(Cuda::initGpus, std::ref(ExecutedFlop), std::ref(WaitForInitCv), std::cref(LoadVar), UseFloat,
UseDouble, MatrixSize, Gpus);
InitThread = std::move(T);

std::unique_lock<std::mutex> Lk(WaitForInitCvMutex);
// wait for gpus to initialize
WaitForInitCv.wait(Lk);
}

void Cuda::initGpus(std::condition_variable& WaitForInitCv, const volatile firestarter::LoadThreadWorkType& LoadVar,
bool UseFloat, bool UseDouble, unsigned MatrixSize, int Gpus) {
void Cuda::initGpus(GpuFlop& ExecutedFlop, std::condition_variable& WaitForInitCv,
const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble,
unsigned MatrixSize, int Gpus) {
std::condition_variable GpuThreadsWaitForInitCv;
std::mutex GpuThreadsWaitForInitCvMutex;
std::vector<std::thread> GpuThreads;
@@ -327,12 +336,12 @@ void Cuda::initGpus(std::condition_variable& WaitForInitCv, const volatile fires
// if there's a GPU in the system without Double Precision support, we
// have to correct this.
const auto Precision = getPrecision(I, UseDoubleConverted);
void (*LoadFunc)(std::condition_variable&, std::mutex&, int, std::atomic<int>&,
void (*LoadFunc)(GpuFlop&, std::condition_variable&, std::mutex&, int, std::atomic<int>&,
const volatile firestarter::LoadThreadWorkType&, unsigned) =
Precision ? createLoad<double> : createLoad<float>;

std::thread T(LoadFunc, std::ref(GpuThreadsWaitForInitCv), std::ref(GpuThreadsWaitForInitCvMutex), I,
std::ref(InitCount), std::cref(LoadVar), MatrixSize);
std::thread T(LoadFunc, std::ref(ExecutedFlop), std::ref(GpuThreadsWaitForInitCv),
std::ref(GpuThreadsWaitForInitCvMutex), I, std::ref(InitCount), std::cref(LoadVar), MatrixSize);
GpuThreads.emplace_back(std::move(T));
}
}
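For reference, the 2·N³ increment used in the hunk above is the textbook operation count of a square gemm: N³ multiplications plus N³ additions per C = alpha*A*B + beta*C invocation. A minimal sketch of that bookkeeping, with hypothetical names, evaluated in std::size_t:

// Sketch only, not part of this diff; `gemmFlop` is a hypothetical helper.
// (Assumes a 64-bit std::size_t.)
#include <cstddef>

constexpr auto gemmFlop(std::size_t MatrixSize) -> std::size_t {
  // An N x N gemm performs roughly N^3 multiplications and N^3 additions,
  // i.e. 2 * N^3 flop per kernel invocation.
  return 2 * MatrixSize * MatrixSize * MatrixSize;
}

// Example: a 4096 x 4096 gemm is roughly 137 GFLOP per invocation.
static_assert(gemmFlop(4096) == 2ULL * 4096 * 4096 * 4096);

Note that in the hunk above the product 2 * MatrixSize * MatrixSize * MatrixSize is evaluated in unsigned before being added to the std::size_t atomic, so matrix sizes above roughly 1290 could wrap 32-bit arithmetic; widening one operand, as in the sketch, would avoid that.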
38 changes: 35 additions & 3 deletions src/firestarter/LoadWorker.cpp
@@ -224,19 +224,51 @@ void Firestarter::printPerformanceReport() {
return Ss.str();
};

const auto PrintGpuFlops = [&Runtime, &FormatString](auto& GpuPtr) -> void {
if (!GpuPtr) {
return;
}

auto SingleFlops = static_cast<double>(GpuPtr->executedFlop().SingleFlop.load()) * 0.000000001 / Runtime;
auto DoubleFlops = static_cast<double>(GpuPtr->executedFlop().DoubleFlop.load()) * 0.000000001 / Runtime;

if (SingleFlops > 0) {
log::debug() << "\n"
<< "estimated floating point performance (GPU)**: " << FormatString(SingleFlops)
<< " GFLOPS (single)";
}

if (DoubleFlops > 0) {
log::debug() << "\n"
<< "estimated floating point performance (GPU)**: " << FormatString(DoubleFlops)
<< " GFLOPS (double)";
}
};

log::debug() << "\n"
<< "total iterations: " << Iterations << "\n"
<< "runtime: " << FormatString(Runtime) << " seconds (" << StopTimestamp - StartTimestamp << " cycles)\n"
<< "\n"
<< "estimated floating point performance: " << FormatString(GFlops) << " GFLOPS\n"
<< "estimated memory bandwidth*: " << FormatString(Bandwidth) << " GB/s\n"
<< "\n"
<< "estimated floating point performance (CPU): " << FormatString(GFlops) << " GFLOPS\n"
<< "estimated memory bandwidth (CPU)*: " << FormatString(Bandwidth) << " GB/s";

PrintGpuFlops(Cuda);
PrintGpuFlops(Oneapi);

log::debug() << "\n"
<< "* this estimate is highly unreliable if --function is used in order "
"to "
"select\n"
<< " a function that is not optimized for your architecture, or if "
"FIRESTARTER is\n"
<< " executed on an unsupported architecture!";

if (Cuda || Oneapi) {
log::debug()
<< "** this estimate is based on the assumption that no algorithmically optimized version\n"
<< " of the called algorithm has been implemented by the vendor. It also might not be not accurate\n"
<< " for short runs of FIRESTARTER";
}
}

void Firestarter::loadThreadWorker(const std::shared_ptr<LoadWorkerData>& Td) {
28 changes: 19 additions & 9 deletions src/firestarter/OneAPI/OneAPI.cpp
@@ -130,8 +130,8 @@ template <std::size_t Multiple> auto roundUp(int NumToRound) -> int {
// The main difference to the CUDA/HIP version is that we do not run multiple iterations of C=A*B, just one single
// iteration.
template <typename FloatingPointType>
void createLoad(std::condition_variable& WaitForInitCv, std::mutex& WaitForInitCvMutex, int DeviceIndex,
std::atomic<int>& InitCount, const volatile firestarter::LoadThreadWorkType& LoadVar,
void createLoad(GpuFlop& ExecutedFlop, std::condition_variable& WaitForInitCv, std::mutex& WaitForInitCvMutex,
int DeviceIndex, std::atomic<int>& InitCount, const volatile firestarter::LoadThreadWorkType& LoadVar,
unsigned MatrixSize) {
static_assert(std::is_same<FloatingPointType, float>::value || std::is_same<FloatingPointType, double>::value,
"createLoad<T>: Template argument T must be either float or double");
@@ -236,6 +236,14 @@ void createLoad(std::condition_variable& WaitForInitCv, std::mutex& WaitForInitC
MatrixSize, MatrixSize, 1, A, MatrixSize, B, MatrixSize, 0, C, MatrixSize);
firestarter::log::trace() << "wait gemm on device nr. " << DeviceIndex;
DeviceQueue.wait_and_throw();

// The number of executed flop for a gemm with two square 'MatrixSize' sized matrices is 2 *
// ('MatrixSize'^3)
if (std::is_same_v<FloatingPointType, float>) {
ExecutedFlop.SingleFlop += 2 * MatrixSize * MatrixSize * MatrixSize;
} else if (std::is_same_v<FloatingPointType, double>) {
ExecutedFlop.DoubleFlop += 2 * MatrixSize * MatrixSize * MatrixSize;
}
}
}

@@ -246,16 +254,18 @@ OneAPI::OneAPI(const volatile firestarter::LoadThreadWorkType& LoadVar, bool Use
std::condition_variable WaitForInitCv;
std::mutex WaitForInitCvMutex;

std::thread T(OneAPI::initGpus, std::ref(WaitForInitCv), std::cref(LoadVar), UseFloat, UseDouble, MatrixSize, Gpus);
std::thread T(OneAPI::initGpus, std::ref(ExecutedFlop), std::ref(WaitForInitCv), std::cref(LoadVar), UseFloat,
UseDouble, MatrixSize, Gpus);
InitThread = std::move(T);

std::unique_lock<std::mutex> Lk(WaitForInitCvMutex);
// wait for gpus to initialize
WaitForInitCv.wait(Lk);
}

void OneAPI::initGpus(std::condition_variable& WaitForInitCv, const volatile firestarter::LoadThreadWorkType& LoadVar,
bool UseFloat, bool UseDouble, unsigned MatrixSize, int Gpus) {
void OneAPI::initGpus(GpuFlop& ExecutedFlop, std::condition_variable& WaitForInitCv,
const volatile firestarter::LoadThreadWorkType& LoadVar, bool UseFloat, bool UseDouble,
unsigned MatrixSize, int Gpus) {
std::condition_variable GpuThreadsWaitForInitCv;
std::mutex GpuThreadsWaitForInitCvMutex;
std::vector<std::thread> GpuThreads;
@@ -320,12 +330,12 @@ void OneAPI::initGpus(std::condition_variable& WaitForInitCv, const volatile fir
if (Precision == -1) {
firestarter::log::warn() << "This should not have happened. Could not get precision via SYCL.";
}
void (*LoadFunc)(std::condition_variable&, std::mutex&, int, std::atomic<int>&,
void (*LoadFunc)(GpuFlop&, std::condition_variable&, std::mutex&, int, std::atomic<int>&,
const volatile firestarter::LoadThreadWorkType&, unsigned) =
Precision ? createLoad<double> : createLoad<float>;

std::thread T(LoadFunc, std::ref(GpuThreadsWaitForInitCv), std::ref(GpuThreadsWaitForInitCvMutex), I,
std::ref(InitCount), std::cref(LoadVar), MatrixSize);
std::thread T(LoadFunc, std::ref(ExecutedFlop), std::ref(GpuThreadsWaitForInitCv),
std::ref(GpuThreadsWaitForInitCvMutex), I, std::ref(InitCount), std::cref(LoadVar), MatrixSize);
GpuThreads.emplace_back(std::move(T));
}
}
@@ -354,4 +364,4 @@ void OneAPI::initGpus(std::condition_variable& WaitForInitCv, const volatile fir
}
}

} // namespace firestarter::oneapi
} // namespace firestarter::oneapi