diff --git a/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 395dfd6cf0..c46d260219 100644 --- a/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -11,7 +11,6 @@ #include #include "mgOnGpuConfig.h" -using mgOnGpu::cxtype; namespace MG5_sm { @@ -28,7 +27,7 @@ namespace MG5_sm #ifdef __CUDACC__ __device__ #endif - inline const double& pIparIp4Ievt( const double* allmomenta, // input[(npar=4)*(np4=4)*nevt] + inline const fptype& pIparIp4Ievt( const fptype* allmomenta, // input[(npar=4)*(np4=4)*nevt] const int ipar, const int ip4, const int ievt ) @@ -60,8 +59,8 @@ namespace MG5_sm #ifdef __CUDACC__ __device__ #endif - void imzxxxM0( const double* allmomenta, // input[(npar=4)*(np4=4)*nevt] - //const double fmass, + void imzxxxM0( const fptype* allmomenta, // input[(npar=4)*(np4=4)*nevt] + //const fptype fmass, const int nhel, const int nsf, #ifndef __CUDACC__ @@ -89,10 +88,10 @@ namespace MG5_sm const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "imzxxxM0: ievt=%d ieib=%d\n", ievt, threadIdx.x ); #endif - const double& pvec0 = pIparIp4Ievt( allmomenta, ipar, 0, ievt ); - const double& pvec1 = pIparIp4Ievt( allmomenta, ipar, 1, ievt ); - const double& pvec2 = pIparIp4Ievt( allmomenta, ipar, 2, ievt ); - const double& pvec3 = pIparIp4Ievt( allmomenta, ipar, 3, ievt ); + const fptype& pvec0 = pIparIp4Ievt( allmomenta, ipar, 0, ievt ); + const fptype& pvec1 = pIparIp4Ievt( allmomenta, ipar, 1, ievt ); + const fptype& pvec2 = pIparIp4Ievt( allmomenta, ipar, 2, ievt ); + const fptype& pvec3 = pIparIp4Ievt( allmomenta, ipar, 3, ievt ); #if defined __CUDACC__ && !defined MGONGPU_WFMEM_LOCAL cxtype& fi0 = fiv[ipar*nw6*neib + 0*neib + ieib]; cxtype& fi1 = fiv[ipar*nw6*neib + 1*neib + ieib]; @@ -141,8 +140,8 @@ namespace MG5_sm #ifdef __CUDACC__ __device__ #endif - void ixzxxxM0( const double* allmomenta, // input[(npar=4)*(np4=4)*nevt] - //const double fmass, + void ixzxxxM0( const fptype* allmomenta, // input[(npar=4)*(np4=4)*nevt] + //const fptype fmass, const int nhel, const int nsf, #ifndef __CUDACC__ @@ -170,10 +169,10 @@ namespace MG5_sm const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "ixzxxxM0: ievt=%d ieib=%d\n", ievt, threadIdx.x ); #endif - const double& pvec0 = pIparIp4Ievt( allmomenta, ipar, 0, ievt ); - const double& pvec1 = pIparIp4Ievt( allmomenta, ipar, 1, ievt ); - const double& pvec2 = pIparIp4Ievt( allmomenta, ipar, 2, ievt ); - const double& pvec3 = pIparIp4Ievt( allmomenta, ipar, 3, ievt ); + const fptype& pvec0 = pIparIp4Ievt( allmomenta, ipar, 0, ievt ); + const fptype& pvec1 = pIparIp4Ievt( allmomenta, ipar, 1, ievt ); + const fptype& pvec2 = pIparIp4Ievt( allmomenta, ipar, 2, ievt ); + const fptype& pvec3 = pIparIp4Ievt( allmomenta, ipar, 3, ievt ); #if defined __CUDACC__ && !defined MGONGPU_WFMEM_LOCAL cxtype& fi0 = fiv[ipar*nw6*neib + 0*neib + ieib]; cxtype& fi1 = fiv[ipar*nw6*neib + 1*neib + ieib]; @@ -195,7 +194,7 @@ namespace MG5_sm // ASSUMPTIONS FMASS = 0 and // (PX and PY are not 0) { - const double sqp0p3 = sqrt( pvec0 + pvec3 ) * nsf; + const fptype sqp0p3 = sqrt( pvec0 + pvec3 ) * nsf; const cxtype chi0( sqp0p3, 0 ); const cxtype chi1( nh * pvec1 / sqp0p3, pvec2 / sqp0p3 ); if ( nh == 1 ) @@ -223,8 +222,8 @@ namespace MG5_sm #ifdef __CUDACC__ __device__ #endif - void oxzxxxM0( const double* allmomenta, // input[(npar=4)*(np4=4)*nevt] - //const double fmass, + void oxzxxxM0( const fptype* allmomenta, // input[(npar=4)*(np4=4)*nevt] + //const fptype fmass, const int nhel, const int nsf, #ifndef __CUDACC__ @@ -252,10 +251,10 @@ namespace MG5_sm const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "oxzxxxM0: ievt=%d ieib=%d\n", ievt, threadIdx.x ); #endif - const double& pvec0 = pIparIp4Ievt( allmomenta, ipar, 0, ievt ); - const double& pvec1 = pIparIp4Ievt( allmomenta, ipar, 1, ievt ); - const double& pvec2 = pIparIp4Ievt( allmomenta, ipar, 2, ievt ); - const double& pvec3 = pIparIp4Ievt( allmomenta, ipar, 3, ievt ); + const fptype& pvec0 = pIparIp4Ievt( allmomenta, ipar, 0, ievt ); + const fptype& pvec1 = pIparIp4Ievt( allmomenta, ipar, 1, ievt ); + const fptype& pvec2 = pIparIp4Ievt( allmomenta, ipar, 2, ievt ); + const fptype& pvec3 = pIparIp4Ievt( allmomenta, ipar, 3, ievt ); #if defined __CUDACC__ && !defined MGONGPU_WFMEM_LOCAL cxtype& fo0 = fov[ipar*nw6*neib + 0*neib + ieib]; cxtype& fo1 = fov[ipar*nw6*neib + 1*neib + ieib]; @@ -278,7 +277,7 @@ namespace MG5_sm // EITHER (Px and Py are not zero) // OR (PX = PY = 0 and E = P3 > 0) { - const double sqp0p3 = sqrt( pvec0 + pvec3 ) * nsf; + const fptype sqp0p3 = sqrt( pvec0 + pvec3 ) * nsf; const cxtype chi0( sqp0p3, 0 ); const cxtype chi1( nh * pvec1 / sqp0p3, -pvec2 / sqp0p3 ); if( nh == 1 ) @@ -312,7 +311,7 @@ namespace MG5_sm const cxtype COUP, cxtype * vertex) { - const cxtype cI = cxtype (0., 1.); + const cxtype cI = cxtype( 0, 1 ); const cxtype TMP4 = (F1[2] * (F2[4] * (V3[2] + V3[5]) + F2[5] * (V3[3] + cI * (V3[4]))) + (F1[3] * (F2[4] * (V3[3] - cI * (V3[4])) + F2[5] * (V3[2] - V3[5])) + @@ -329,14 +328,14 @@ namespace MG5_sm void FFV1P0_3(const cxtype F1[], const cxtype F2[], const cxtype COUP, - const double M3, - const double W3, + const fptype M3, + const fptype W3, cxtype V3[]) { - const cxtype cI = cxtype (0., 1.); - V3[0] = +F1[0] + F2[0]; - V3[1] = +F1[1] + F2[1]; - const double P3[4] = { -V3[0].real(), + const cxtype cI = cxtype( 0, 1 ); + V3[0] = + F1[0] + F2[0]; + V3[1] = + F1[1] + F2[1]; + const fptype P3[4] = { -V3[0].real(), -V3[1].real(), -V3[1].imag(), -V3[0].imag() }; @@ -360,14 +359,16 @@ namespace MG5_sm const cxtype COUP2, cxtype * vertex) { - const cxtype cI = cxtype (0., 1.); + const fptype fp1 = 1; + const fptype fp2 = 2; + const cxtype cI = cxtype( 0, 1 ); const cxtype TMP2 = (F1[4] * (F2[2] * (V3[2] - V3[5]) - F2[3] * (V3[3] + cI * (V3[4]))) + F1[5] * (F2[2] * (-V3[3] + cI * (V3[4])) + F2[3] * (V3[2] + V3[5]))); const cxtype TMP0 = (F1[2] * (F2[4] * (V3[2] + V3[5]) + F2[5] * (V3[3] + cI * (V3[4]))) + F1[3] * (F2[4] * (V3[3] - cI * (V3[4])) + F2[5] * (V3[2] - V3[5]))); - (*vertex) = (-1.) * (COUP2 * (+cI * (TMP0) + 2. * cI * (TMP2)) + cI * (TMP0 * COUP1)); + (*vertex) = -fp1 * (COUP2 * (+cI * (TMP0) + fp2 * cI * (TMP2)) + cI * (TMP0 * COUP1)); } //-------------------------------------------------------------------------- @@ -379,16 +380,18 @@ namespace MG5_sm const cxtype F2[], const cxtype COUP1, const cxtype COUP2, - const double M3, - const double W3, + const fptype M3, + const fptype W3, cxtype V3[]) { - const cxtype cI = cxtype (0., 1.); - double OM3 = 0.; - if (M3 != 0.) OM3 = 1./(M3 * M3); - V3[0] = +F1[0] + F2[0]; - V3[1] = +F1[1] + F2[1]; - const double P3[4] = { -V3[0].real(), + const fptype fp1 = 1; + const fptype fp2 = 2; + const cxtype cI = cxtype( 0, 1 ); + fptype OM3 = 0; + if ( M3 != 0 ) OM3 = fp1 / ( M3 * M3 ); + V3[0] = + F1[0] + F2[0]; + V3[1] = + F1[1] + F2[1]; + const fptype P3[4] = { -V3[0].real(), -V3[1].real(), -V3[1].imag(), -V3[0].imag() }; @@ -399,26 +402,26 @@ namespace MG5_sm (F1[4] * (F2[2] * (P3[0] - P3[3]) - F2[3] * (P3[1] + cI * (P3[2]))) + F1[5] * (F2[2] * (-P3[1] + cI * (P3[2])) + F2[3] * (P3[0] + P3[3]))); const cxtype denom = - 1./((P3[0] * P3[0]) - (P3[1] * P3[1]) - (P3[2] * P3[2]) - + fp1 / ((P3[0] * P3[0]) - (P3[1] * P3[1]) - (P3[2] * P3[2]) - (P3[3] * P3[3]) - M3 * (M3 - cI * W3)); - V3[2] = denom * (-2. * cI) * - (COUP2 * (OM3 * - 1./2. * P3[0] * (TMP1 + 2. * (TMP3)) - + (+1./2. * (F1[2] * F2[4] + F1[3] * F2[5]) + F1[4] * F2[2] + F1[5] * F2[3])) - + 1./2. * (COUP1 * (F1[2] * F2[4] + F1[3] * F2[5] - P3[0] * OM3 * TMP1))); - V3[3] = denom * (-2. * cI) * - (COUP2 * (OM3 * - 1./2. * P3[1] * (TMP1 + 2. * (TMP3)) - + (-1./2. * (F1[2] * F2[5] + F1[3] * F2[4]) + F1[4] * F2[3] + F1[5] * F2[2])) - - 1./2. * (COUP1 * (F1[2] * F2[5] + F1[3] * F2[4] + P3[1] * OM3 * TMP1))); + V3[2] = denom * (-fp2 * cI) * + (COUP2 * (OM3 * - fp1/fp2 * P3[0] * (TMP1 + fp2 * (TMP3)) + + (+fp1/fp2 * (F1[2] * F2[4] + F1[3] * F2[5]) + F1[4] * F2[2] + F1[5] * F2[3])) + + fp1/fp2 * (COUP1 * (F1[2] * F2[4] + F1[3] * F2[5] - P3[0] * OM3 * TMP1))); + V3[3] = denom * (-fp2 * cI) * + (COUP2 * (OM3 * - fp1/fp2 * P3[1] * (TMP1 + fp2 * (TMP3)) + + (-fp1/fp2 * (F1[2] * F2[5] + F1[3] * F2[4]) + F1[4] * F2[3] + F1[5] * F2[2])) + - fp1/fp2 * (COUP1 * (F1[2] * F2[5] + F1[3] * F2[4] + P3[1] * OM3 * TMP1))); V3[4] = denom * cI * - (COUP2 * (OM3 * P3[2] * (TMP1 + 2. * (TMP3)) + (COUP2 * (OM3 * P3[2] * (TMP1 + fp2 * (TMP3)) + (+cI * (F1[2] * F2[5]) - cI * (F1[3] * F2[4]) - - 2. * cI * (F1[4] * F2[3]) - + 2. * cI * (F1[5] * F2[2]))) + - fp2 * cI * (F1[4] * F2[3]) + + fp2 * cI * (F1[5] * F2[2]))) + COUP1 * (+cI * (F1[2] * F2[5]) - cI * (F1[3] * F2[4]) + P3[2] * OM3 * TMP1)); - V3[5] = denom * 2. * cI * - (COUP2 * (OM3 * 1./2. * P3[3] * (TMP1 + 2. * (TMP3)) + - (+1./2. * (F1[2] * F2[4]) - 1./2. * (F1[3] * F2[5]) - F1[4] * F2[2] + F1[5] * F2[3])) - + 1./2. * (COUP1 * (F1[2] * F2[4] + P3[3] * OM3 * TMP1 - F1[3] * F2[5]))); + V3[5] = denom * fp2 * cI * + (COUP2 * (OM3 * fp1/fp2 * P3[3] * (TMP1 + fp2 * (TMP3)) + + (+fp1/fp2 * (F1[2] * F2[4]) - fp1/fp2 * (F1[3] * F2[5]) - F1[4] * F2[2] + F1[5] * F2[3])) + + fp1/fp2 * (COUP1 * (F1[2] * F2[4] + P3[3] * OM3 * TMP1 - F1[3] * F2[5]))); } @@ -455,12 +458,12 @@ namespace Proc #ifdef __CUDACC__ __device__ __constant__ int cHel[ncomb][npar]; - __device__ __constant__ double cIPC[6]; // coupling ? - __device__ __constant__ double cIPD[2]; + __device__ __constant__ fptype cIPC[6]; // coupling ? + __device__ __constant__ fptype cIPD[2]; #else static int cHel[ncomb][npar]; - static double cIPC[6]; // coupling ? - static double cIPD[2]; + static fptype cIPC[6]; // coupling ? + static fptype cIPD[2]; #endif #ifdef __CUDACC__ @@ -547,8 +550,8 @@ namespace Proc // SOA: allmomenta[npar][np4][ndim] // AOS: allmomenta[ndim][npar][np4] void calculate_wavefunctions( int ihel, - const double* allmomenta, // input[(npar=4)*(np4=4)*nevt] - double &matrix + const fptype* allmomenta, // input[(npar=4)*(np4=4)*nevt] + fptype &matrix #ifndef __CUDACC__ , const int ievt #endif @@ -603,8 +606,8 @@ namespace Proc cxtype jamp[ncolor]; // The color matrix; - static const double denom[ncolor] = {1}; - static const double cf[ncolor][ncolor] = {{1}}; + static const fptype denom[ncolor] = {1}; + static const fptype cf[ncolor][ncolor] = {{1}}; // Calculate color flows jamp[0] = -amp[0] - amp[1]; @@ -649,8 +652,8 @@ namespace Proc #else memcpy( cHel, tHel, ncomb * nexternal * sizeof(int) ); #endif - // SANITY CHECK: GPU shared memory usage is based on casts of double[2] to complex - assert( sizeof(cxtype) == 2*sizeof(double) ); + // SANITY CHECK: GPU shared memory usage is based on casts of fptype[2] to cxtype + assert( sizeof(cxtype) == 2*sizeof(fptype) ); } //-------------------------------------------------------------------------- @@ -659,7 +662,7 @@ namespace Proc //-------------------------------------------------------------------------- - const std::vector &CPPProcess::getMasses() const {return mME;} + const std::vector &CPPProcess::getMasses() const {return mME;} //-------------------------------------------------------------------------- // Initialize process. @@ -682,16 +685,15 @@ namespace Proc mME.push_back(pars->ZERO); mME.push_back(pars->ZERO); mME.push_back(pars->ZERO); - static cxtype tIPC[3] = {pars->GC_3, pars->GC_50, - pars->GC_59}; - static double tIPD[2] = {pars->mdl_MZ, pars->mdl_WZ}; + static cxtype tIPC[3] = {(cxtype)pars->GC_3, (cxtype)pars->GC_50, (cxtype)pars->GC_59}; + static fptype tIPD[2] = {(fptype)pars->mdl_MZ, (fptype)pars->mdl_WZ}; #ifdef __CUDACC__ checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 3 * sizeof(cxtype ) ) ); - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof(double) ) ); + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof(fptype) ) ); #else memcpy( cIPC, tIPC, 3 * sizeof(cxtype ) ); - memcpy( cIPD, tIPD, 2 * sizeof(double) ); + memcpy( cIPD, tIPD, 2 * sizeof(fptype) ); #endif } @@ -706,8 +708,8 @@ namespace Proc #ifdef __CUDACC__ __global__ #endif - void sigmaKin( const double* allmomenta, // input[(npar=4)*(np4=4)*nevt] - double* output // output[nevt] + void sigmaKin( const fptype* allmomenta, // input[(npar=4)*(np4=4)*nevt] + fptype* output // output[nevt] #ifdef __CUDACC__ // NB: nevt == ndim=gpublocks*gputhreads in CUDA #else @@ -743,7 +745,7 @@ namespace Proc const int denominators[nprocesses] = {4}; // Reset the matrix elements - double matrix_element[nprocesses]; + fptype matrix_element[nprocesses]; for(int iproc = 0; iproc < nprocesses; iproc++ ) { matrix_element[iproc] = 0.; @@ -754,7 +756,7 @@ namespace Proc sigmakin_alloc(); #endif #endif - double melast = matrix_element[0]; + fptype melast = matrix_element[0]; for (int ihel = 0; ihel < ncomb; ihel++ ) { if ( sigmakin_itry>maxtry && !sigmakin_goodhel[ihel] ) continue; diff --git a/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index 372bf94e3d..9ebd5aec93 100644 --- a/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -1,5 +1,4 @@ #include "mgOnGpuConfig.h" -using mgOnGpu::cxtype; //========================================================================== // This file has been automatically generated for C++ Standalone by @@ -64,7 +63,7 @@ namespace Proc virtual int code() const {return 1;} - const std::vector &getMasses() const; + const std::vector &getMasses() const; void setInitial(int inid1, int inid2) { @@ -108,7 +107,7 @@ namespace Proc Parameters_sm * pars; // vector with external particle masses - std::vector mME; + std::vector mME; // Initial particle ids int id1, id2; @@ -119,9 +118,9 @@ namespace Proc #ifdef __CUDACC__ __global__ - void sigmaKin( const double* allmomenta, double* output ); + void sigmaKin( const fptype* allmomenta, fptype* output ); #else - void sigmaKin( const double* allmomenta, double* output, const int nevt ); + void sigmaKin( const fptype* allmomenta, fptype* output, const int nevt ); #endif //-------------------------------------------------------------------------- diff --git a/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/check.cc b/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/check.cc index a63194712e..025cd2e0bd 100644 --- a/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/check.cc +++ b/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/check.cc @@ -118,7 +118,7 @@ int main(int argc, char **argv) // Read param_card and set parameters process.initProc("../../Cards/param_card.dat"); - const double energy = 1500; + const fptype energy = 1500; const int meGeVexponent = -(2 * process.nexternal - 8); // --- 0b. Allocate memory structures @@ -131,59 +131,59 @@ int main(int argc, char **argv) using mgOnGpu::npar; const int nRnarray = np4*nparf*ndim; // (NB: ndim=npag*nepp for ASA layouts) #ifdef __CUDACC__ - const int nbytesRnarray = nRnarray * sizeof(double); - double* devRnarray = 0; // AOSOA[npag][nparf][np4][nepp] (NB: ndim=npag*nepp) + const int nbytesRnarray = nRnarray * sizeof(fptype); + fptype* devRnarray = 0; // AOSOA[npag][nparf][np4][nepp] (NB: ndim=npag*nepp) checkCuda( cudaMalloc( &devRnarray, nbytesRnarray ) ); #if defined MGONGPU_CURAND_ONHOST - double* hstRnarray = 0; // AOSOA[npag][nparf][np4][nepp] (NB: ndim=npag*nepp) + fptype* hstRnarray = 0; // AOSOA[npag][nparf][np4][nepp] (NB: ndim=npag*nepp) checkCuda( cudaMallocHost( &hstRnarray, nbytesRnarray ) ); #endif #else - double* hstRnarray = 0; // AOSOA[npag][nparf][np4][nepp] (NB: ndim=npag*nepp) - hstRnarray = new double[nRnarray](); + fptype* hstRnarray = 0; // AOSOA[npag][nparf][np4][nepp] (NB: ndim=npag*nepp) + hstRnarray = new fptype[nRnarray](); #endif const int nMomenta = np4*npar*ndim; // (NB: ndim=npag*nepp for ASA layouts) #if defined MGONGPU_LAYOUT_ASA - double* hstMomenta = 0; // AOSOA[npag][npar][np4][nepp] (previously was: lp) + fptype* hstMomenta = 0; // AOSOA[npag][npar][np4][nepp] (previously was: lp) #elif defined MGONGPU_LAYOUT_SOA - double* hstMomenta = 0; // SOA[npar][np4][ndim] (previously was: lp) + fptype* hstMomenta = 0; // SOA[npar][np4][ndim] (previously was: lp) #elif defined MGONGPU_LAYOUT_AOS - double* hstMomenta = 0; // AOS[ndim][npar][np4] (previously was: lp) + fptype* hstMomenta = 0; // AOS[ndim][npar][np4] (previously was: lp) #endif #ifdef __CUDACC__ - const int nbytesMomenta = nMomenta * sizeof(double); + const int nbytesMomenta = nMomenta * sizeof(fptype); checkCuda( cudaMallocHost( &hstMomenta, nbytesMomenta ) ); - double* devMomenta = 0; // (previously was: allMomenta) + fptype* devMomenta = 0; // (previously was: allMomenta) checkCuda( cudaMalloc( &devMomenta, nbytesMomenta ) ); #else - hstMomenta = new double[nMomenta](); + hstMomenta = new fptype[nMomenta](); #endif const int nWeights = ndim; // (NB: ndim=npag*nepp for ASA layouts) - double* hstWeights = 0; // (previously was: meHostPtr) + fptype* hstWeights = 0; // (previously was: meHostPtr) #ifdef __CUDACC__ - const int nbytesWeights = nWeights * sizeof(double); + const int nbytesWeights = nWeights * sizeof(fptype); checkCuda( cudaMallocHost( &hstWeights, nbytesWeights ) ); - double* devWeights = 0; // (previously was: meDevPtr) + fptype* devWeights = 0; // (previously was: meDevPtr) checkCuda( cudaMalloc( &devWeights, nbytesWeights ) ); #else - hstWeights = new double[nWeights](); + hstWeights = new fptype[nWeights](); #endif const int nMEs = ndim; // (NB: ndim=npag*nepp for ASA layouts) - double* hstMEs = 0; // (previously was: meHostPtr) + fptype* hstMEs = 0; // (previously was: meHostPtr) #ifdef __CUDACC__ - const int nbytesMEs = nMEs * sizeof(double); + const int nbytesMEs = nMEs * sizeof(fptype); checkCuda( cudaMallocHost( &hstMEs, nbytesMEs ) ); - double* devMEs = 0; // (previously was: meDevPtr) + fptype* devMEs = 0; // (previously was: meDevPtr) checkCuda( cudaMalloc( &devMEs, nbytesMEs ) ); #else - hstMEs = new double[nMEs](); + hstMEs = new fptype[nMEs](); #endif float* wavetimes = new float[niter](); - double* matrixelementvector = new double[niter * ndim * process.nprocesses](); + fptype* matrixelementvector = new fptype[niter * ndim * process.nprocesses](); #ifdef __CUDACC__ #if defined MGONGPU_WFMEM_GLOBAL diff --git a/examples/gpu/eemumu_AV/src/mgOnGpuConfig.h b/examples/gpu/eemumu_AV/src/mgOnGpuConfig.h index a0c8eeb28e..7f8fbc5388 100644 --- a/examples/gpu/eemumu_AV/src/mgOnGpuConfig.h +++ b/examples/gpu/eemumu_AV/src/mgOnGpuConfig.h @@ -15,6 +15,10 @@ //#define MGONGPU_WFMEM_GLOBAL 1 //#define MGONGPU_WFMEM_SHARED 1 +// Floating point precision (CHOOSE ONLY ONE) +//#define MGONGPU_FPTYPE_DOUBLE 1 // default +#define MGONGPU_FPTYPE_FLOAT 1 + #ifdef __CUDACC__ #include #else @@ -52,14 +56,17 @@ namespace mgOnGpu //const int nepp = 4; // FOR DEBUGGING! // Floating point type: fptype (tflpoint? tfloatpt?) - //typedef double fptype; // double precision (8 bytes, fp64) - //typedef float fptype; // single precision (4 bytes, fp32) +#if defined MGONGPU_FPTYPE_DOUBLE + typedef double fptype; // double precision (8 bytes, fp64) +#elif defined MGONGPU_FPTYPE_FLOAT + typedef float fptype; // single precision (4 bytes, fp32) +#endif // Complex type: cxtype (tcomplex?) #ifdef __CUDACC__ - typedef thrust::complex cxtype; // two doubles: RI + typedef thrust::complex cxtype; // two doubles: RI #else - typedef std::complex cxtype; // two doubles: RI + typedef std::complex cxtype; // two doubles: RI #endif // Vector types: _v is a [256] @@ -68,4 +75,8 @@ namespace mgOnGpu } +// Expose typedefs outside the namespace +using mgOnGpu::fptype; +using mgOnGpu::cxtype; + #endif // MGONGPUCONFIG_H diff --git a/examples/gpu/eemumu_AV/src/rambo2toNm0.cc b/examples/gpu/eemumu_AV/src/rambo2toNm0.cc index 800dbfdd47..7f33aabb15 100644 --- a/examples/gpu/eemumu_AV/src/rambo2toNm0.cc +++ b/examples/gpu/eemumu_AV/src/rambo2toNm0.cc @@ -19,28 +19,28 @@ namespace rambo2toNm0 #ifdef __CUDACC__ __global__ #endif - void getMomentaInitial( const double energy, // input: energy + void getMomentaInitial( const fptype energy, // input: energy #if defined MGONGPU_LAYOUT_ASA - double momenta1d[], // output: momenta as AOSOA[npag][npar][4][nepp] + fptype momenta1d[], // output: momenta as AOSOA[npag][npar][4][nepp] #elif defined MGONGPU_LAYOUT_SOA - double momenta1d[], // output: momenta as SOA[npar][4][nevt] + fptype momenta1d[], // output: momenta as SOA[npar][4][nevt] #elif defined MGONGPU_LAYOUT_AOS - double momenta1d[], // output: momenta as AOS[nevt][npar][4] + fptype momenta1d[], // output: momenta as AOS[nevt][npar][4] #endif const int nevt ) // input: #events { #if defined MGONGPU_LAYOUT_ASA using mgOnGpu::nepp; - double (*momenta)[npar][np4][nepp] = (double (*)[npar][np4][nepp]) momenta1d; // cast to multiD array pointer (AOSOA) + fptype (*momenta)[npar][np4][nepp] = (fptype (*)[npar][np4][nepp]) momenta1d; // cast to multiD array pointer (AOSOA) #elif defined MGONGPU_LAYOUT_SOA // Cast is impossible in CUDA C ("error: expression must have a constant value") - //double (*momenta)[np4][nevt] = (double (*)[np4][nevt]) momenta1d; // cast to multiD array pointer (SOA) + //fptype (*momenta)[np4][nevt] = (fptype (*)[np4][nevt]) momenta1d; // cast to multiD array pointer (SOA) #elif defined MGONGPU_LAYOUT_AOS - double (*momenta)[npar][np4] = (double (*)[npar][np4]) momenta1d; // cast to multiD array pointer (AOS) + fptype (*momenta)[npar][np4] = (fptype (*)[npar][np4]) momenta1d; // cast to multiD array pointer (AOS) #endif - const double energy1 = energy/2; - const double energy2 = energy/2; - const double mom = energy/2; + const fptype energy1 = energy/2; + const fptype energy2 = energy/2; + const fptype mom = energy/2; #ifndef __CUDACC__ // ** START LOOP ON IEVT ** for (int ievt = 0; ievt < nevt; ++ievt) @@ -92,16 +92,16 @@ namespace rambo2toNm0 #ifdef __CUDACC__ __global__ #endif - void getMomentaFinal( const double energy, // input: energy - const double rnarray1d[], // input: randomnumbers in [0,1] as AOSOA[npag][nparf][4][nepp] + void getMomentaFinal( const fptype energy, // input: energy + const fptype rnarray1d[], // input: randomnumbers in [0,1] as AOSOA[npag][nparf][4][nepp] #if defined MGONGPU_LAYOUT_ASA - double momenta1d[], // output: momenta as AOSOA[npag][npar][4][nepp] + fptype momenta1d[], // output: momenta as AOSOA[npag][npar][4][nepp] #elif defined MGONGPU_LAYOUT_SOA - double momenta1d[], // output: momenta as SOA[npar][4][nevt] + fptype momenta1d[], // output: momenta as SOA[npar][4][nevt] #elif defined MGONGPU_LAYOUT_AOS - double momenta1d[], // output: momenta as AOS[nevt][npar][4] + fptype momenta1d[], // output: momenta as AOS[nevt][npar][4] #endif - double wgts[], // output: weights[nevt] + fptype wgts[], // output: weights[nevt] const int nevt ) // input: #events { /**************************************************************************** @@ -117,14 +117,14 @@ namespace rambo2toNm0 ****************************************************************************/ // initialization step: factorials for the phase space weight - const double twopi = 8. * atan(1.); - const double po2log = log(twopi / 4.); - double z[nparf]; + const fptype twopi = 8. * atan(1.); + const fptype po2log = log(twopi / 4.); + fptype z[nparf]; z[1] = po2log; for (int kpar = 2; kpar < nparf; kpar++) - z[kpar] = z[kpar - 1] + po2log - 2. * log(double(kpar - 1)); + z[kpar] = z[kpar - 1] + po2log - 2. * log(fptype(kpar - 1)); for (int kpar = 2; kpar < nparf; kpar++) - z[kpar] = (z[kpar] - log(double(kpar))); + z[kpar] = (z[kpar] - log(fptype(kpar))); #ifndef __CUDACC__ // ** START LOOP ON IEVT ** @@ -140,27 +140,27 @@ namespace rambo2toNm0 using mgOnGpu::nepp; const int ipag = ievt/nepp; // #eventpage in this iteration const int iepp = ievt%nepp; // #event in the current eventpage in this iteration - double (*rnarray)[nparf][np4][nepp] = (double (*)[nparf][np4][nepp]) rnarray1d; // cast to multiD array pointer (AOSOA) + fptype (*rnarray)[nparf][np4][nepp] = (fptype (*)[nparf][np4][nepp]) rnarray1d; // cast to multiD array pointer (AOSOA) #if defined MGONGPU_LAYOUT_ASA - double (*momenta)[npar][np4][nepp] = (double (*)[npar][np4][nepp]) momenta1d; // cast to multiD array pointer (AOSOA) + fptype (*momenta)[npar][np4][nepp] = (fptype (*)[npar][np4][nepp]) momenta1d; // cast to multiD array pointer (AOSOA) #elif defined MGONGPU_LAYOUT_SOA // Cast is impossible in CUDA C ("error: expression must have a constant value") - //double (*momenta)[np4][nevt] = (double (*)[np4][nevt]) momenta1d; // cast to multiD array pointer (SOA) + //fptype (*momenta)[np4][nevt] = (fptype (*)[np4][nevt]) momenta1d; // cast to multiD array pointer (SOA) #elif defined MGONGPU_LAYOUT_AOS - double (*momenta)[npar][np4] = (double (*)[npar][np4]) momenta1d; // cast to multiD array pointer (AOS) + fptype (*momenta)[npar][np4] = (fptype (*)[npar][np4]) momenta1d; // cast to multiD array pointer (AOS) #endif - double& wt = wgts[ievt]; + fptype& wt = wgts[ievt]; // generate n massless momenta in infinite phase space - double q[nparf][np4]; + fptype q[nparf][np4]; for (int iparf = 0; iparf < nparf; iparf++) { - const double r1 = rnarray[ipag][iparf][0][iepp]; - const double r2 = rnarray[ipag][iparf][1][iepp]; - const double r3 = rnarray[ipag][iparf][2][iepp]; - const double r4 = rnarray[ipag][iparf][3][iepp]; - const double c = 2. * r1 - 1.; - const double s = sqrt(1. - c * c); - const double f = twopi * r2; + const fptype r1 = rnarray[ipag][iparf][0][iepp]; + const fptype r2 = rnarray[ipag][iparf][1][iepp]; + const fptype r3 = rnarray[ipag][iparf][2][iepp]; + const fptype r4 = rnarray[ipag][iparf][3][iepp]; + const fptype c = 2. * r1 - 1.; + const fptype s = sqrt(1. - c * c); + const fptype f = twopi * r2; q[iparf][0] = -log(r3 * r4); q[iparf][3] = q[iparf][0] * c; q[iparf][2] = q[iparf][0] * s * cos(f); @@ -168,24 +168,24 @@ namespace rambo2toNm0 } // calculate the parameters of the conformal transformation - double r[np4]; - double b[np4-1]; + fptype r[np4]; + fptype b[np4-1]; for (int i4 = 0; i4 < np4; i4++) r[i4] = 0.; for (int iparf = 0; iparf < nparf; iparf++) { for (int i4 = 0; i4 < np4; i4++) r[i4] = r[i4] + q[iparf][i4]; } - const double rmas = sqrt(pow(r[0], 2) - pow(r[3], 2) - pow(r[2], 2) - pow(r[1], 2)); + const fptype rmas = sqrt(pow(r[0], 2) - pow(r[3], 2) - pow(r[2], 2) - pow(r[1], 2)); for (int i4 = 1; i4 < np4; i4++) b[i4-1] = -r[i4] / rmas; - const double g = r[0] / rmas; - const double a = 1. / (1. + g); - const double x0 = energy / rmas; + const fptype g = r[0] / rmas; + const fptype a = 1. / (1. + g); + const fptype x0 = energy / rmas; // transform the q's conformally into the p's (i.e. the 'momenta') for (int iparf = 0; iparf < nparf; iparf++) { - double bq = b[0] * q[iparf][1] + b[1] * q[iparf][2] + b[2] * q[iparf][3]; + fptype bq = b[0] * q[iparf][1] + b[1] * q[iparf][2] + b[2] * q[iparf][3]; #if defined MGONGPU_LAYOUT_ASA for (int i4 = 1; i4 < np4; i4++) momenta[ipag][iparf+npari][i4][iepp] = x0 * (q[iparf][i4] + b[i4-1] * (q[iparf][0] + a * bq)); @@ -276,10 +276,14 @@ namespace rambo2toNm0 // ** NB: the random numbers are always produced in the same order and are interpreted as an AOSOA // AOSOA: rnarray[npag][nparf][np4][nepp] where nevt=npag*nepp void generateRnArray( curandGenerator_t gen, // input: curand generator - double rnarray1d[], // output: randomnumbers in [0,1] + fptype rnarray1d[], // output: randomnumbers in [0,1] const int nevt ) // input: #events { +#if defined MGONGPU_FPTYPE_DOUBLE checkCurand( curandGenerateUniformDouble( gen, rnarray1d, np4*nparf*nevt ) ); +#elif defined MGONGPU_FPTYPE_FLOAT + checkCurand( curandGenerateUniform( gen, rnarray1d, np4*nparf*nevt ) ); +#endif } //-------------------------------------------------------------------------- diff --git a/examples/gpu/eemumu_AV/src/rambo2toNm0.h b/examples/gpu/eemumu_AV/src/rambo2toNm0.h index 27c5005163..8f5e0497ec 100644 --- a/examples/gpu/eemumu_AV/src/rambo2toNm0.h +++ b/examples/gpu/eemumu_AV/src/rambo2toNm0.h @@ -43,13 +43,13 @@ namespace rambo2toNm0 #ifdef __CUDACC__ __global__ #endif - void getMomentaInitial( const double energy, // input: energy + void getMomentaInitial( const fptype energy, // input: energy #if defined MGONGPU_LAYOUT_ASA - double momenta1d[], // output: momenta as AOSOA[npag][npar][4][nepp] + fptype momenta1d[], // output: momenta as AOSOA[npag][npar][4][nepp] #elif defined MGONGPU_LAYOUT_SOA - double momenta1d[], // output: momenta as SOA[npar][4][nevt] + fptype momenta1d[], // output: momenta as SOA[npar][4][nevt] #elif defined MGONGPU_LAYOUT_AOS - double momenta1d[], // output: momenta as AOS[nevt][npar][4] + fptype momenta1d[], // output: momenta as AOS[nevt][npar][4] #endif const int nevt ); // input: #events @@ -60,16 +60,16 @@ namespace rambo2toNm0 #ifdef __CUDACC__ __global__ #endif - void getMomentaFinal( const double energy, // input: energy - const double rnarray1d[], // input: randomnumbers in [0,1] as AOSOA[npag][nparf][4][nepp] + void getMomentaFinal( const fptype energy, // input: energy + const fptype rnarray1d[], // input: randomnumbers in [0,1] as AOSOA[npag][nparf][4][nepp] #if defined MGONGPU_LAYOUT_ASA - double momenta1d[], // output: momenta as AOSOA[npag][npar][4][nepp] + fptype momenta1d[], // output: momenta as AOSOA[npag][npar][4][nepp] #elif defined MGONGPU_LAYOUT_SOA - double momenta1d[], // output: momenta as SOA[npar][4][nevt] + fptype momenta1d[], // output: momenta as SOA[npar][4][nevt] #elif defined MGONGPU_LAYOUT_AOS - double momenta1d[], // output: momenta as AOS[nevt][npar][4] + fptype momenta1d[], // output: momenta as AOS[nevt][npar][4] #endif - double wgts[], // output: weights[nevt] + fptype wgts[], // output: weights[nevt] const int nevt ); // input: #events //-------------------------------------------------------------------------- @@ -93,7 +93,7 @@ namespace rambo2toNm0 // ** NB: the random numbers are always produced in the same order and are interpreted as an AOSOA // AOSOA: rnarray[npag][nparf][np4][nepp] where nevt=npag*nepp void generateRnArray( curandGenerator_t gen, // input: curand generator - double rnarray1d[], // output: randomnumbers in [0,1] + fptype rnarray1d[], // output: randomnumbers in [0,1] const int nevt ); // input: #events //--------------------------------------------------------------------------