Merge pull request #144 from valassi/fpsingle

[epoch1] Single precision: fix build failures, improve NaN determination
madgraph5 · Mar 31, 2021 · 8fc6978 · 8fc6978
2 parents 44460dd + 69d8d8f
commit 8fc6978
Show file tree

Hide file tree

Showing 4 changed files with 97 additions and 30 deletions.
diff --git a/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/check.cc b/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/check.cc
@@ -34,14 +34,64 @@ bool is_number(const char *s) {
   return (int)strlen(s) == t - s;
 }
 
+// Disabling fast math is essential here, otherwise results are undefined
+// See https://stackoverflow.com/a/40702790 about the __attribute__ syntax
+__attribute__((optimize("-fno-fast-math")))
+bool fp_is_abnormal( const fptype& fp )
+{
+  if ( std::isnan( fp ) ) return true;
+  if ( fp != fp ) return true;
+  return false;
+}
+
+__attribute__((optimize("-fno-fast-math")))
+bool fp_is_zero( const fptype& fp )
+{
+  if ( fp == 0 ) return true;
+  return false;
+}
+
+// See https://en.cppreference.com/w/cpp/numeric/math/FP_categories
+__attribute__((optimize("-fno-fast-math")))
+const char* fp_show_class( const fptype& fp )
+{
+  switch( std::fpclassify( fp ) ) {
+  case FP_INFINITE:  return "Inf";
+  case FP_NAN:       return "NaN";
+  case FP_NORMAL:    return "normal";
+  case FP_SUBNORMAL: return "subnormal";
+  case FP_ZERO:      return "zero";
+  default:           return "unknown";
+  }
+}
+
+__attribute__((optimize("-fno-fast-math")))
+void debug_me_is_abnormal( const fptype& me, int ievtALL )
+{
+  std::cout << "DEBUG[" << ievtALL << "]"
+            << " ME=" << me
+            << " fpisabnormal=" << fp_is_abnormal( me )
+            << " fpclass=" << fp_show_class( me )
+            << " (me==me)=" << ( me == me )
+            << " (me==me+1)=" << ( me == me+1 )
+            << " isnan=" << std::isnan( me )
+            << " isfinite=" << std::isfinite( me )
+            << " isnormal=" << std::isnormal( me )
+            << " is0=" << ( me == 0 )
+            << " is1=" << ( me == 1 )
+            << " abs(ME)=" << std::abs( me )
+            << " isnan=" << std::isnan( std::abs( me ) )
+            << std::endl;
+}
+
 int usage(char* argv0, int ret = 1) {
   std::cout << "Usage: " << argv0
             << " [--verbose|-v] [--debug|-d] [--performance|-p] [--json|-j]"
             << " [#gpuBlocksPerGrid #gpuThreadsPerBlock] #iterations" << std::endl << std::endl;
   std::cout << "The number of events per iteration is #gpuBlocksPerGrid * #gpuThreadsPerBlock" << std::endl;
   std::cout << "(also in CPU/C++ code, where only the product of these two parameters counts)" << std::endl << std::endl;
   std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl;
-  std::cout << "The '-d' flag only enables nan warnings and OMP debugging" << std::endl;
+  std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl;
 #ifndef __CUDACC__
   std::cout << std::endl << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl;
   std::cout << "(OMP multithreading will be disabled if OMP_NUM_THREADS is not set)" << std::endl;
@@ -193,7 +243,7 @@ int main(int argc, char **argv)
   struct CudaTearDown {
     CudaTearDown(bool print) : _print(print) { }
     ~CudaTearDown() {
-      if ( _print ) std::cout << "Calling cudaDeviceReset()." << std::endl;
+      //if ( _print ) std::cout << "Calling cudaDeviceReset()." << std::endl;
       checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full
     }
     bool _print{false};
@@ -530,19 +580,30 @@ int main(int argc, char **argv)
   double meanwtim = sumwtim / niter;
   //double stdwtim = std::sqrt( sqswtim / niter - meanwtim * meanwtim );
 
-  int nnan = 0;
+  int nabn = 0;
+  int nzero = 0;
   double minelem = matrixelementALL[0];
   double maxelem = matrixelementALL[0];
   double minweig = weightALL[0];
   double maxweig = weightALL[0];
   for ( int ievtALL = 0; ievtALL < nevtALL; ++ievtALL )
   {
+    // The following events are abnormal in a run with "-p 2048 256 12 -d"
+    // - check.exe/commonrand: ME[310744,451171,3007871,3163868,4471038,5473927] with fast math
+    // - check.exe/curand: ME[578162,1725762,2163579,5407629,5435532,6014690] with fast math
+    // - gcheck.exe/curand: ME[596016,1446938] with fast math
+    // Debug NaN/abnormal issues
+    //if ( ievtALL == 310744 ) // this ME is abnormal both with and without fast math
+    //  debug_me_is_abnormal( matrixelementALL[ievtALL], ievtALL );
+    //if ( ievtALL == 5473927 ) // this ME is abnormal only with fast math
+    //  debug_me_is_abnormal( matrixelementALL[ievtALL], ievtALL );
     // Compute min/max
-    if ( std::isnan( matrixelementALL[ievtALL] ) )
+    if ( fp_is_zero( matrixelementALL[ievtALL] ) ) nzero++;
+    if ( fp_is_abnormal( matrixelementALL[ievtALL] ) )
     {
       if ( debug ) // only printed out with "-p -d" (matrixelementALL is not filled without -p)
-        std::cout << "WARNING! ME[" << ievtALL << "} is nan" << std::endl;
-      nnan++;
+        std::cout << "WARNING! ME[" << ievtALL << "] is NaN/abnormal" << std::endl;
+      nabn++;
       continue;
     }
     minelem = std::min( minelem, (double)matrixelementALL[ievtALL] );
@@ -555,23 +616,23 @@ int main(int argc, char **argv)
   for ( int ievtALL = 0; ievtALL < nevtALL; ++ievtALL )
   {
     // Compute mean from the sum of diff to min
-    if ( std::isnan( matrixelementALL[ievtALL] ) ) continue;
+    if ( fp_is_abnormal( matrixelementALL[ievtALL] ) ) continue;
     sumelemdiff += ( matrixelementALL[ievtALL] - minelem );
     sumweigdiff += ( weightALL[ievtALL] - minweig );
   }
-  double meanelem = minelem + sumelemdiff / ( nevtALL - nnan );
-  double meanweig = minweig + sumweigdiff / ( nevtALL - nnan );
+  double meanelem = minelem + sumelemdiff / ( nevtALL - nabn );
+  double meanweig = minweig + sumweigdiff / ( nevtALL - nabn );
   double sqselemdiff = 0;
   double sqsweigdiff = 0;
   for ( int ievtALL = 0; ievtALL < nevtALL; ++ievtALL )
   {
     // Compute stddev from the squared sum of diff to mean
-    if ( std::isnan( matrixelementALL[ievtALL] ) ) continue;
+    if ( fp_is_abnormal( matrixelementALL[ievtALL] ) ) continue;
     sqselemdiff += std::pow( matrixelementALL[ievtALL] - meanelem, 2 );
     sqsweigdiff += std::pow( weightALL[ievtALL] - meanweig, 2 );
   }
-  double stdelem = std::sqrt( sqselemdiff / ( nevtALL - nnan ) );
-  double stdweig = std::sqrt( sqsweigdiff / ( nevtALL - nnan ) );
+  double stdelem = std::sqrt( sqselemdiff / ( nevtALL - nabn ) );
+  double stdweig = std::sqrt( sqsweigdiff / ( nevtALL - nabn ) );
 
   // === STEP 9 FINALISE
   // --- 9a. Destroy curand generator
@@ -611,9 +672,9 @@ int main(int argc, char **argv)
               << "NumIterations               = " << niter << std::endl
               << "-----------------------------------------------------------------------" << std::endl
 #if defined MGONGPU_FPTYPE_DOUBLE
-              << "FP precision                = DOUBLE (nan=" << nnan << ")" << std::endl
+              << "FP precision                = DOUBLE (NaN/abnormal=" << nabn << ", zero=" << nzero << " )" << std::endl
 #elif defined MGONGPU_FPTYPE_FLOAT
-              << "FP precision                = FLOAT (nan=" << nnan << ")" << std::endl
+              << "FP precision                = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl
 #endif
 #ifdef __CUDACC__
 #if defined MGONGPU_CXTYPE_CUCOMPLEX
@@ -676,15 +737,15 @@ int main(int argc, char **argv)
               << std::string(16, ' ') << " )  sec^-1" << std::endl
               << std::defaultfloat; // default format: affects all floats
     std::cout << "***********************************************************************" << std::endl
-              << "NumMatrixElements(notNan)   = " << nevtALL - nnan << std::endl
+              << "NumMatrixElems(notAbnormal) = " << nevtALL - nabn << std::endl
               << std::scientific // fixed format: affects all floats (default precision: 6)
               << "MeanMatrixElemValue         = ( " << meanelem
-              << " +- " << stdelem/sqrt(nevtALL - nnan) << " )  GeV^" << meGeVexponent << std::endl // standard error
+              << " +- " << stdelem/sqrt(nevtALL - nabn) << " )  GeV^" << meGeVexponent << std::endl // standard error
               << "[Min,Max]MatrixElemValue    = [ " << minelem
               << " ,  " << maxelem << " ]  GeV^" << meGeVexponent << std::endl
               << "StdDevMatrixElemValue       = ( " << stdelem << std::string(16, ' ') << " )  GeV^" << meGeVexponent << std::endl
               << "MeanWeight                  = ( " << meanweig
-              << " +- " << stdweig/sqrt(nevtALL - nnan) << " )" << std::endl // standard error
+              << " +- " << stdweig/sqrt(nevtALL - nabn) << " )" << std::endl // standard error
               << "[Min,Max]Weight             = [ " << minweig
               << " ,  " << maxweig << " ]" << std::endl
               << "StdDevWeight                = ( " << stdweig << std::string(16, ' ') << " )" << std::endl
@@ -727,10 +788,9 @@ int main(int argc, char **argv)
              << "\"NumThreadsPerBlock\": " << gputhreads << ", " << std::endl
              << "\"NumBlocksPerGrid\": " << gpublocks << ", " << std::endl
 #if defined MGONGPU_FPTYPE_DOUBLE
-             << "\"FP precision\": "
-             << "\"DOUBLE (nan=" << nnan << ")\"," << std::endl
+             << "\"FP precision\": " << "\"DOUBLE (NaN/abnormal=" << nabn << ")\"," << std::endl
 #elif defined MGONGPU_FPTYPE_FLOAT
-             << "\"FP precision\": " << "FLOAT (nan=" << nnan << ")," << std::endl
+             << "\"FP precision\": " << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl
 #endif
              << "\"Complex type\": "
 #ifdef __CUDACC__
@@ -797,7 +857,7 @@ int main(int argc, char **argv)
              << std::endl
              << "\"EvtsPerSec[MatrixElems] (3)\": \""
              << std::to_string(nevtALL/sumwtim) << " sec^-1\"," << std::endl
-             << "\"NumMatrixElements(notNan)\": " << nevtALL - nnan << "," << std::endl
+             << "\"NumMatrixElems(notAbnormal)\": " << nevtALL - nabn << "," << std::endl
              << std::scientific
              << "\"MeanMatrixElemValue\": "
              << "\"" << std::to_string(meanelem) << " GeV^"

diff --git a/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/runTest.cc b/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/runTest.cc
@@ -14,7 +14,7 @@
 
 
 
-struct CUDA_CPU_TestBase : public TestDriverBase<double> {
+struct CUDA_CPU_TestBase : public TestDriverBase<fptype> {
   static_assert( gputhreads%mgOnGpu::neppR == 0, "ERROR! #threads/block should be a multiple of neppR" );
   static_assert( gputhreads%mgOnGpu::neppM == 0, "ERROR! #threads/block should be a multiple of neppM" );
   static_assert( gputhreads <= mgOnGpu::ntpbMAX, "ERROR! #threads/block should be <= ntpbMAX" );
@@ -82,15 +82,15 @@ struct CPUTest : public CUDA_CPU_TestBase {
 
 
 
-  double getMomentum(std::size_t evtNo, unsigned int particle, unsigned int component) const override {
+  fptype getMomentum(std::size_t evtNo, unsigned int particle, unsigned int component) const override {
     assert(component < mgOnGpu::np4);
     assert(particle  < mgOnGpu::npar);
     const auto page  = evtNo / mgOnGpu::neppM; // #eventpage in this iteration
     const auto ieppM = evtNo % mgOnGpu::neppM; // #event in the current eventpage in this iteration
     return hstMomenta[page * mgOnGpu::npar*mgOnGpu::np4*mgOnGpu::neppM + particle * mgOnGpu::neppM*mgOnGpu::np4 + component * mgOnGpu::neppM + ieppM];
   };
 
-  double getMatrixElement(std::size_t evtNo) const override {
+  fptype getMatrixElement(std::size_t evtNo) const override {
     return hstMEs[evtNo];
   }
 };
@@ -187,21 +187,23 @@ struct CUDATest : public CUDA_CPU_TestBase {
   }
 
 
-  double getMomentum(std::size_t evtNo, unsigned int particle, unsigned int component) const override {
+  fptype getMomentum(std::size_t evtNo, unsigned int particle, unsigned int component) const override {
     assert(component < mgOnGpu::np4);
     assert(particle  < mgOnGpu::npar);
     const auto page  = evtNo / mgOnGpu::neppM; // #eventpage in this iteration
     const auto ieppM = evtNo % mgOnGpu::neppM; // #event in the current eventpage in this iteration
     return hstMomenta[page * mgOnGpu::npar*mgOnGpu::np4*mgOnGpu::neppM + particle * mgOnGpu::neppM*mgOnGpu::np4 + component * mgOnGpu::neppM + ieppM];
   };
 
-  double getMatrixElement(std::size_t evtNo) const override {
+  fptype getMatrixElement(std::size_t evtNo) const override {
     return hstMEs[evtNo];
   }
 };
 #endif
 
 
+#if defined MGONGPU_FPTYPE_DOUBLE
+
 #ifdef __CUDACC__
 INSTANTIATE_TEST_SUITE_P(EP1_CUDA_GPU, MadgraphTestDouble,
     testing::Values( [](){ return new CUDATest; } )
@@ -212,4 +214,9 @@ INSTANTIATE_TEST_SUITE_P(EP1_CUDA_CPU, MadgraphTestDouble,
 );
 #endif
 
+#else
+
+#warning runTest.cc has not been ported to single precision yet (issue #143)
+
+#endif
 
diff --git a/epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/check.cc b/epoch2/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/check.cc
@@ -285,10 +285,10 @@ int main(int argc, char **argv)
     const std::string rngnKey = "1b GenRnGen";
     timermap.start( rngnKey );
 #ifdef MGONGPU_COMMONRAND_ONHOST
-    std::vector<double> commonRnd = commonRandomPromises[iiter].get_future().get();
+    std::vector<fptype> commonRnd = commonRandomPromises[iiter].get_future().get();
     assert( nRnarray == static_cast<int>( commonRnd.size() ) );
     // NB (PR #45): memcpy is strictly needed only in CUDA (copy to pinned memory), but keep it also in C++ for consistency
-    memcpy( hstRnarray.get(), commonRnd.data(), nRnarray * sizeof(hstRnarray[0]) );
+    memcpy( hstRnarray.get(), commonRnd.data(), nRnarray * sizeof(fptype) );
 #elif defined __CUDACC__
 #ifdef MGONGPU_CURAND_ONDEVICE
     grambo2toNm0::generateRnarray( rnGen, devRnarray.get(), nevt );

diff --git a/test/include/MadgraphTest.h b/test/include/MadgraphTest.h
@@ -50,8 +50,8 @@ class TestDriverBase {
   // ------------------------------------------------
   // Interface for retrieving info from madgraph
   // ------------------------------------------------
-  virtual double getMomentum(std::size_t evtNo, unsigned int particleNo, unsigned int component) const = 0;
-  virtual double getMatrixElement(std::size_t evtNo) const = 0;
+  virtual fptype getMomentum(std::size_t evtNo, unsigned int particleNo, unsigned int component) const = 0;
+  virtual fptype getMatrixElement(std::size_t evtNo) const = 0;
 
 
   // ------------------------------------------------