Skip to content

Commit

Permalink
IMPROVE GLOBAL so that it can use many more blocks.
Browse files Browse the repository at this point in the history
Much slower than LOCAL or SHARED.
NB 1. Numbers are now on a 1GPU/4CPU system, throughputs are 10% higher
NB 2. There is a functional bug also for LOCAL (mean ME decreases with #iterations)

time ./gcheck.exe -p 16384 32 12
***************************************
NumIterations             = 12
NumThreadsPerBlock        = 32
NumBlocksPerGrid          = 16384
---------------------------------------
FP precision              = DOUBLE (nan=0)
Complex type              = THRUST::COMPLEX
Momenta memory layout     = AOSOA[32]
Wavefunction GPU memory   = GLOBAL
Curand generation         = DEVICE (CUDA code)
---------------------------------------
NumberOfEntries           = 12
TotalTimeInWaveFuncs      = 3.505193e-02 sec
MeanTimeInWaveFuncs       = 2.920994e-03 sec
StdDevTimeInWaveFuncs     = 6.692049e-05 sec
MinTimeInWaveFuncs        = 2.893571e-03 sec
MaxTimeInWaveFuncs        = 2.903941e-03 sec
---------------------------------------
NumMatrixElementsComputed = 6291456
MatrixElementsPerSec      = 1.794896e+08 sec^-1
***************************************
NumMatrixElements(notNan) = 6291456
MeanMatrixElemValue       = 1.393760e-02 GeV^0
StdErrMatrixElemValue     = 3.035624e-06 GeV^0
StdDevMatrixElemValue     = 7.614188e-03 GeV^0
MinMatrixElemValue        = 1.807353e-11 GeV^0
MaxMatrixElemValue        = 3.374925e-02 GeV^0
***************************************
00 CudaFree : 0.160871 sec
0a ProcInit : 0.000533 sec
0b MemAlloc : 0.078608 sec
0c GenCreat : 0.015293 sec
1a GenSeed  : 0.000021 sec
1b GenRnGen : 0.007990 sec
2a RamboIni : 0.000112 sec
2b RamboFin : 0.000056 sec
2c CpDTHwgt : 0.007330 sec
2d CpDTHmom : 0.074818 sec
3a SigmaKin : 0.000111 sec
3b CpDTHmes : 0.034941 sec
4a DumpLoop : 0.022739 sec
9a DumpAll  : 0.046494 sec
9b GenDestr : 0.000237 sec
9c MemFree  : 0.022495 sec
9d CudReset : 0.041520 sec
TOTAL       : 0.514170 sec
TOTAL(n-2)  : 0.311779 sec
***************************************
real    0m0.530s
user    0m0.206s
sys     0m0.313s
  • Loading branch information
valassi committed Aug 13, 2020
1 parent b482fc5 commit 649bdc4
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 75 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -661,48 +661,7 @@ namespace Proc
using mgOnGpu::nwf;
using mgOnGpu::nw6;

#ifdef __CUDACC__
#if defined MGONGPU_WFMEM_GLOBAL

using mgOnGpu::nbpgMAX;
// Allocate global or shared memory for the wavefunctions of all (external and internal) particles
// See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#allocation-persisting-kernel-launches
__device__ cxtype* dwf[nbpgMAX]; // device dwf[#blocks][5 * 6 * #threads_in_block]

__global__
void sigmakin_alloc()
{
// Wavefunctions for this block: bwf[5 * 6 * #threads_in_block]
cxtype*& bwf = dwf[blockIdx.x];

// Only the first thread in the block does the allocation (we need one allocation per block)
if ( threadIdx.x == 0 )
{
bwf = (cxtype*)malloc( nwf * nw6 * blockDim.x * sizeof(cxtype) ); // cxtype bwf[5 * 6 * #threads_in_block]
if ( bwf == NULL )
{
printf( "ERROR in sigmakin_alloc (block #%4d): malloc failed\n", blockIdx.x );
assert( bwf != NULL );
}
//else printf( "INFO in sigmakin_alloc (block #%4d): malloc successful\n", blockIdx.x );
}
__syncthreads();

// All threads in the block should see the allocation by now
assert( bwf != NULL );
}

__global__
void sigmakin_free()
{
// Only free from one thread!
// [NB: if this free is missing, cuda-memcheck fails to detect it]
// [NB: but if free is called twice, cuda-memcheck does detect it]
cxtype* bwf = dwf[blockIdx.x];
if ( threadIdx.x == 0 ) free( bwf );
}

#elif defined MGONGPU_WFMEM_SHARED
#if defined __CUDACC__ && defined MGONGPU_WFMEM_SHARED

int sigmakin_sharedmem_nbytes( const int ntpb ) // input: #threads per block
{
Expand All @@ -711,7 +670,6 @@ namespace Proc
return nbytesBwf;
}

#endif
#endif

//--------------------------------------------------------------------------
Expand All @@ -726,8 +684,12 @@ namespace Proc
// AOS: allmomenta[ndim][npar][np4]
void calculate_wavefunctions( int ihel,
const fptype* allmomenta, // input[(npar=4)*(np4=4)*nevt]
fptype &matrix
#ifndef __CUDACC__
fptype &matrix
#ifdef __CUDACC__
#if defined MGONGPU_WFMEM_GLOBAL
, cxtype* tmpWFs // tmp[(nwf=5)*(nw6=6)*(nevt=nblk*ntpb)]
#endif
#else
, const int ievt
#endif
)
Expand All @@ -741,7 +703,8 @@ namespace Proc
// eventually move to same AOSOA everywhere, blocks and threads
#if defined MGONGPU_WFMEM_GLOBAL
const int iblk = blockIdx.x; // index of block in grid
cxtype* bwf = dwf[iblk];
const int ntpb = blockDim.x; // number of threads per block
cxtype* bwf = &tmpWFs[iblk*nwf*nw6*ntpb];
#elif defined MGONGPU_WFMEM_SHARED
extern __shared__ cxtype bwf[];
#endif
Expand Down Expand Up @@ -916,6 +879,9 @@ namespace Proc
fptype* output // output[nevt]
#ifdef __CUDACC__
// NB: nevt == ndim=gpublocks*gputhreads in CUDA
#if defined MGONGPU_WFMEM_GLOBAL
, cxtype* tmpWFs // tmp[(nwf=5)*(nw6=6)*nevt]
#endif
#else
, const int nevt // input: #events
#endif
Expand Down Expand Up @@ -959,8 +925,13 @@ namespace Proc
for ( int ihel = 0; ihel < ncomb; ihel++ )
{
if ( sigmakin_itry>maxtry && !sigmakin_goodhel[ihel] ) continue;
// Adds ME for ihel to matrix_element[0]
#ifdef __CUDACC__
calculate_wavefunctions(ihel, allmomenta, matrix_element[0]); // adds ME for ihel to matrix_element[0]
#if defined MGONGPU_WFMEM_GLOBAL
calculate_wavefunctions(ihel, allmomenta, matrix_element[0], tmpWFs);
#else
calculate_wavefunctions(ihel, allmomenta, matrix_element[0]);
#endif
#else
calculate_wavefunctions(ihel, allmomenta, matrix_element[0], ievt); // adds ME for ihel to matrix_element[0]
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,31 +115,17 @@ namespace Proc

#ifdef __CUDACC__
__global__
#if defined MGONGPU_WFMEM_GLOBAL
void sigmaKin( const fptype* allmomenta, fptype* output, cxtype* tmpWFs );
#else
void sigmaKin( const fptype* allmomenta, fptype* output );
#endif
#else
void sigmaKin( const fptype* allmomenta, fptype* output, const int nevt );
#endif

//--------------------------------------------------------------------------

#ifdef __CUDACC__
#if defined MGONGPU_WFMEM_GLOBAL
__global__
void sigmakin_alloc();
#endif
#endif

//--------------------------------------------------------------------------

#ifdef __CUDACC__
#if defined MGONGPU_WFMEM_GLOBAL
__global__
void sigmakin_free();
#endif
#endif

//--------------------------------------------------------------------------

#ifdef __CUDACC__
#if defined MGONGPU_WFMEM_SHARED
int sigmakin_sharedmem_nbytes( const int ntpb ); // input: #threads per block
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,15 @@ int main(int argc, char **argv)
hstMomenta = new fptype[nMomenta]();
#endif

#if defined __CUDACC__ && defined MGONGPU_WFMEM_GLOBAL
using mgOnGpu::nwf;
using mgOnGpu::nw6;
const int nAllWFs = nwf * nw6 * ndim;
const int nbytesAllWFs = nAllWFs * sizeof(cxtype);
cxtype* devAllWFs = 0; // AOSOA[nblk][nwf][nw6][ntpb] (NB: ndim=nblk*ntpb)
checkCuda( cudaMalloc( &devAllWFs, nbytesAllWFs ) );
#endif

const int nWeights = ndim; // (NB: ndim=npag*nepp for ASA layouts)
fptype* hstWeights = 0; // (previously was: meHostPtr)
#ifdef __CUDACC__
Expand All @@ -188,9 +197,7 @@ int main(int argc, char **argv)
fptype* matrixelementvector = new fptype[niter * ndim * process.nprocesses]();

#ifdef __CUDACC__
#if defined MGONGPU_WFMEM_GLOBAL
gProc::sigmakin_alloc<<<gpublocks, gputhreads>>>();
#elif defined MGONGPU_WFMEM_SHARED
#if defined MGONGPU_WFMEM_SHARED
const int nbytesSharedSK = gProc::sigmakin_sharedmem_nbytes(gputhreads);
#endif
#endif
Expand Down Expand Up @@ -300,7 +307,9 @@ int main(int argc, char **argv)
const std::string skinKey = "3a SigmaKin";
timermap.start( skinKey );
#ifdef __CUDACC__
#if defined MGONGPU_WFMEM_SHARED
#if defined MGONGPU_WFMEM_GLOBAL
gProc::sigmaKin<<<gpublocks, gputhreads>>>(devMomenta, devMEs, devAllWFs);
#elif defined MGONGPU_WFMEM_SHARED
gProc::sigmaKin<<<gpublocks, gputhreads, nbytesSharedSK>>>(devMomenta, devMEs);
#else
gProc::sigmaKin<<<gpublocks, gputhreads>>>(devMomenta, devMEs);
Expand Down Expand Up @@ -540,16 +549,16 @@ int main(int argc, char **argv)
timermap.start( freeKey );

#ifdef __CUDACC__
#if defined MGONGPU_WFMEM_GLOBAL
gProc::sigmakin_free<<<gpublocks, gputhreads>>>();
#endif
checkCuda( cudaFreeHost( hstMEs ) );
checkCuda( cudaFreeHost( hstWeights ) );
checkCuda( cudaFreeHost( hstMomenta ) );
#if defined MGONGPU_CURAND_ONHOST
checkCuda( cudaFreeHost( hstRnarray ) );
#endif
checkCuda( cudaFree( devMEs ) );
#if defined MGONGPU_WFMEM_GLOBAL
checkCuda( cudaFree( devAllWFs ) );
#endif
checkCuda( cudaFree( devWeights ) );
checkCuda( cudaFree( devMomenta ) );
checkCuda( cudaFree( devRnarray ) );
Expand Down
6 changes: 3 additions & 3 deletions examples/gpu/eemumu_AV/src/mgOnGpuConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
//#define MGONGPU_CURAND_ONHOST 1

// Use global memory or shared memory for wavefunctions (CHOOSE ONLY ONE)
#define MGONGPU_WFMEM_LOCAL 1 // default (~5.00E8)
//#define MGONGPU_WFMEM_GLOBAL 1 // 30% slower, limited to 256*32 threads (1.18E8 against 1.78E8 for "-p 256 32 12")
//#define MGONGPU_WFMEM_LOCAL 1 // default (~5.00E8)
#define MGONGPU_WFMEM_GLOBAL 1 // much slower than LOCAL, no longer limited in #blocks (1.79E8 for "-p 16384 32 12")
//#define MGONGPU_WFMEM_SHARED 1 // 30% slower, limited to 32 threads/block (~3.5E8 against 5.0E8)

// Floating point precision (CHOOSE ONLY ONE)
Expand Down Expand Up @@ -46,7 +46,7 @@ namespace mgOnGpu
// Maximum number of blocks per grid
// ** NB Some arrays of pointers will be allocated statically to fit all these blocks
// ** (the actual memory for each block will then be allocated dynamically only for existing blocks)
const int nbpgMAX = 2048;
//const int nbpgMAX = 2048;

// Maximum number of threads per block
const int ntpbMAX = 256;
Expand Down

0 comments on commit 649bdc4

Please sign in to comment.