Skip to content

Commit

Permalink
IMPROVE GLOBAL so that it can use many more blocks.
Browse files Browse the repository at this point in the history
Much slower than LOCAL or SHARED.
NB 1. Numbers are now on a 1GPU/4CPU system, throughputs are 10% higher
NB 2. There is a functional bug also for LOCAL (mean ME decreases with #iterations)

time ./gcheck.exe -p 16384 32 12
***************************************
NumIterations             = 12
NumThreadsPerBlock        = 32
NumBlocksPerGrid          = 16384
---------------------------------------
FP precision              = DOUBLE (nan=0)
Complex type              = THRUST::COMPLEX
Momenta memory layout     = AOSOA[32]
Wavefunction GPU memory   = GLOBAL
Curand generation         = DEVICE (CUDA code)
---------------------------------------
NumberOfEntries           = 12
TotalTimeInWaveFuncs      = 3.505193e-02 sec
MeanTimeInWaveFuncs       = 2.920994e-03 sec
StdDevTimeInWaveFuncs     = 6.692049e-05 sec
MinTimeInWaveFuncs        = 2.893571e-03 sec
MaxTimeInWaveFuncs        = 2.903941e-03 sec
---------------------------------------
NumMatrixElementsComputed = 6291456
MatrixElementsPerSec      = 1.794896e+08 sec^-1
***************************************
NumMatrixElements(notNan) = 6291456
MeanMatrixElemValue       = 1.393760e-02 GeV^0
StdErrMatrixElemValue     = 3.035624e-06 GeV^0
StdDevMatrixElemValue     = 7.614188e-03 GeV^0
MinMatrixElemValue        = 1.807353e-11 GeV^0
MaxMatrixElemValue        = 3.374925e-02 GeV^0
***************************************
00 CudaFree : 0.160871 sec
0a ProcInit : 0.000533 sec
0b MemAlloc : 0.078608 sec
0c GenCreat : 0.015293 sec
1a GenSeed  : 0.000021 sec
1b GenRnGen : 0.007990 sec
2a RamboIni : 0.000112 sec
2b RamboFin : 0.000056 sec
2c CpDTHwgt : 0.007330 sec
2d CpDTHmom : 0.074818 sec
3a SigmaKin : 0.000111 sec
3b CpDTHmes : 0.034941 sec
4a DumpLoop : 0.022739 sec
9a DumpAll  : 0.046494 sec
9b GenDestr : 0.000237 sec
9c MemFree  : 0.022495 sec
9d CudReset : 0.041520 sec
TOTAL       : 0.514170 sec
TOTAL(n-2)  : 0.311779 sec
***************************************
real    0m0.530s
user    0m0.206s
sys     0m0.313s
  • Loading branch information
valassi committed Aug 13, 2020
1 parent b482fc5 commit 649bdc4
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 75 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -661,48 +661,7 @@ namespace Proc
using mgOnGpu::nwf;
using mgOnGpu::nw6;

#ifdef __CUDACC__
#if defined MGONGPU_WFMEM_GLOBAL

using mgOnGpu::nbpgMAX;
// Allocate global or shared memory for the wavefunctions of all (external and internal) particles
// See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#allocation-persisting-kernel-launches
__device__ cxtype* dwf[nbpgMAX]; // device dwf[#blocks][5 * 6 * #threads_in_block]

__global__
void sigmakin_alloc()
{
// Wavefunctions for this block: bwf[5 * 6 * #threads_in_block]
cxtype*& bwf = dwf[blockIdx.x];

// Only the first thread in the block does the allocation (we need one allocation per block)
if ( threadIdx.x == 0 )
{
bwf = (cxtype*)malloc( nwf * nw6 * blockDim.x * sizeof(cxtype) ); // cxtype bwf[5 * 6 * #threads_in_block]
if ( bwf == NULL )
{
printf( "ERROR in sigmakin_alloc (block #%4d): malloc failed\n", blockIdx.x );
assert( bwf != NULL );
}
//else printf( "INFO in sigmakin_alloc (block #%4d): malloc successful\n", blockIdx.x );
}
__syncthreads();

// All threads in the block should see the allocation by now
assert( bwf != NULL );
}

__global__
void sigmakin_free()
{
// Only free from one thread!
// [NB: if this free is missing, cuda-memcheck fails to detect it]
// [NB: but if free is called twice, cuda-memcheck does detect it]
cxtype* bwf = dwf[blockIdx.x];
if ( threadIdx.x == 0 ) free( bwf );
}

#elif defined MGONGPU_WFMEM_SHARED
#if defined __CUDACC__ && defined MGONGPU_WFMEM_SHARED

int sigmakin_sharedmem_nbytes( const int ntpb ) // input: #threads per block
{
Expand All @@ -711,7 +670,6 @@ namespace Proc
return nbytesBwf;
}

#endif
#endif

//--------------------------------------------------------------------------
Expand All @@ -726,8 +684,12 @@ namespace Proc
// AOS: allmomenta[ndim][npar][np4]
void calculate_wavefunctions( int ihel,
const fptype* allmomenta, // input[(npar=4)*(np4=4)*nevt]
fptype &matrix
#ifndef __CUDACC__
fptype &matrix
#ifdef __CUDACC__
#if defined MGONGPU_WFMEM_GLOBAL
, cxtype* tmpWFs // tmp[(nwf=5)*(nw6=6)*(nevt=nblk*ntpb)]
#endif
#else
, const int ievt
#endif
)
Expand All @@ -741,7 +703,8 @@ namespace Proc
// eventually move to same AOSOA everywhere, blocks and threads
#if defined MGONGPU_WFMEM_GLOBAL
const int iblk = blockIdx.x; // index of block in grid
cxtype* bwf = dwf[iblk];
const int ntpb = blockDim.x; // number of threads per block
cxtype* bwf = &tmpWFs[iblk*nwf*nw6*ntpb];
#elif defined MGONGPU_WFMEM_SHARED
extern __shared__ cxtype bwf[];
#endif
Expand Down Expand Up @@ -916,6 +879,9 @@ namespace Proc
fptype* output // output[nevt]
#ifdef __CUDACC__
// NB: nevt == ndim=gpublocks*gputhreads in CUDA
#if defined MGONGPU_WFMEM_GLOBAL
, cxtype* tmpWFs // tmp[(nwf=5)*(nw6=6)*nevt]
#endif
#else
, const int nevt // input: #events
#endif
Expand Down Expand Up @@ -959,8 +925,13 @@ namespace Proc
for ( int ihel = 0; ihel < ncomb; ihel++ )
{
if ( sigmakin_itry>maxtry && !sigmakin_goodhel[ihel] ) continue;
// Adds ME for ihel to matrix_element[0]
#ifdef __CUDACC__
calculate_wavefunctions(ihel, allmomenta, matrix_element[0]); // adds ME for ihel to matrix_element[0]
#if defined MGONGPU_WFMEM_GLOBAL
calculate_wavefunctions(ihel, allmomenta, matrix_element[0], tmpWFs);
#else
calculate_wavefunctions(ihel, allmomenta, matrix_element[0]);
#endif
#else
calculate_wavefunctions(ihel, allmomenta, matrix_element[0], ievt); // adds ME for ihel to matrix_element[0]
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,31 +115,17 @@ namespace Proc

#ifdef __CUDACC__
__global__
#if defined MGONGPU_WFMEM_GLOBAL
void sigmaKin( const fptype* allmomenta, fptype* output, cxtype* tmpWFs );
#else
void sigmaKin( const fptype* allmomenta, fptype* output );
#endif
#else
void sigmaKin( const fptype* allmomenta, fptype* output, const int nevt );
#endif

//--------------------------------------------------------------------------

#ifdef __CUDACC__
#if defined MGONGPU_WFMEM_GLOBAL
__global__
void sigmakin_alloc();
#endif
#endif

//--------------------------------------------------------------------------

#ifdef __CUDACC__
#if defined MGONGPU_WFMEM_GLOBAL
__global__
void sigmakin_free();
#endif
#endif

//--------------------------------------------------------------------------

#ifdef __CUDACC__
#if defined MGONGPU_WFMEM_SHARED
int sigmakin_sharedmem_nbytes( const int ntpb ); // input: #threads per block
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,15 @@ int main(int argc, char **argv)
hstMomenta = new fptype[nMomenta]();
#endif

#if defined __CUDACC__ && defined MGONGPU_WFMEM_GLOBAL
using mgOnGpu::nwf;
using mgOnGpu::nw6;
const int nAllWFs = nwf * nw6 * ndim;
const int nbytesAllWFs = nAllWFs * sizeof(cxtype);
cxtype* devAllWFs = 0; // AOSOA[nblk][nwf][nw6][ntpb] (NB: ndim=nblk*ntpb)
checkCuda( cudaMalloc( &devAllWFs, nbytesAllWFs ) );
#endif

const int nWeights = ndim; // (NB: ndim=npag*nepp for ASA layouts)
fptype* hstWeights = 0; // (previously was: meHostPtr)
#ifdef __CUDACC__
Expand All @@ -188,9 +197,7 @@ int main(int argc, char **argv)
fptype* matrixelementvector = new fptype[niter * ndim * process.nprocesses]();

#ifdef __CUDACC__
#if defined MGONGPU_WFMEM_GLOBAL
gProc::sigmakin_alloc<<<gpublocks, gputhreads>>>();
#elif defined MGONGPU_WFMEM_SHARED
#if defined MGONGPU_WFMEM_SHARED
const int nbytesSharedSK = gProc::sigmakin_sharedmem_nbytes(gputhreads);
#endif
#endif
Expand Down Expand Up @@ -300,7 +307,9 @@ int main(int argc, char **argv)
const std::string skinKey = "3a SigmaKin";
timermap.start( skinKey );
#ifdef __CUDACC__
#if defined MGONGPU_WFMEM_SHARED
#if defined MGONGPU_WFMEM_GLOBAL
gProc::sigmaKin<<<gpublocks, gputhreads>>>(devMomenta, devMEs, devAllWFs);
#elif defined MGONGPU_WFMEM_SHARED
gProc::sigmaKin<<<gpublocks, gputhreads, nbytesSharedSK>>>(devMomenta, devMEs);
#else
gProc::sigmaKin<<<gpublocks, gputhreads>>>(devMomenta, devMEs);
Expand Down Expand Up @@ -540,16 +549,16 @@ int main(int argc, char **argv)
timermap.start( freeKey );

#ifdef __CUDACC__
#if defined MGONGPU_WFMEM_GLOBAL
gProc::sigmakin_free<<<gpublocks, gputhreads>>>();
#endif
checkCuda( cudaFreeHost( hstMEs ) );
checkCuda( cudaFreeHost( hstWeights ) );
checkCuda( cudaFreeHost( hstMomenta ) );
#if defined MGONGPU_CURAND_ONHOST
checkCuda( cudaFreeHost( hstRnarray ) );
#endif
checkCuda( cudaFree( devMEs ) );
#if defined MGONGPU_WFMEM_GLOBAL
checkCuda( cudaFree( devAllWFs ) );
#endif
checkCuda( cudaFree( devWeights ) );
checkCuda( cudaFree( devMomenta ) );
checkCuda( cudaFree( devRnarray ) );
Expand Down
6 changes: 3 additions & 3 deletions examples/gpu/eemumu_AV/src/mgOnGpuConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
//#define MGONGPU_CURAND_ONHOST 1

// Use global memory or shared memory for wavefunctions (CHOOSE ONLY ONE)
#define MGONGPU_WFMEM_LOCAL 1 // default (~5.00E8)
//#define MGONGPU_WFMEM_GLOBAL 1 // 30% slower, limited to 256*32 threads (1.18E8 against 1.78E8 for "-p 256 32 12")
//#define MGONGPU_WFMEM_LOCAL 1 // default (~5.00E8)
#define MGONGPU_WFMEM_GLOBAL 1 // much slower than LOCAL, no longer limited in #blocks (1.79E8 for "-p 16384 32 12")
//#define MGONGPU_WFMEM_SHARED 1 // 30% slower, limited to 32 threads/block (~3.5E8 against 5.0E8)

// Floating point precision (CHOOSE ONLY ONE)
Expand Down Expand Up @@ -46,7 +46,7 @@ namespace mgOnGpu
// Maximum number of blocks per grid
// ** NB Some arrays of pointers will be allocated statically to fit all these blocks
// ** (the actual memory for each block will then be allocated dynamically only for existing blocks)
const int nbpgMAX = 2048;
//const int nbpgMAX = 2048;

// Maximum number of threads per block
const int ntpbMAX = 256;
Expand Down

0 comments on commit 649bdc4

Please sign in to comment.