diff --git a/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc
index 395dfd6cf0..c46d260219 100644
--- a/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc
+++ b/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc
@@ -11,7 +11,6 @@
 #include <iostream>
 
 #include "mgOnGpuConfig.h"
-using mgOnGpu::cxtype;
 
 namespace MG5_sm
 {
@@ -28,7 +27,7 @@ namespace MG5_sm
 #ifdef __CUDACC__
   __device__
 #endif
-  inline const double& pIparIp4Ievt( const double* allmomenta, // input[(npar=4)*(np4=4)*nevt]
+  inline const fptype& pIparIp4Ievt( const fptype* allmomenta, // input[(npar=4)*(np4=4)*nevt]
                                      const int ipar,
                                      const int ip4,
                                      const int ievt )
@@ -60,8 +59,8 @@ namespace MG5_sm
 #ifdef __CUDACC__
   __device__
 #endif
-  void imzxxxM0( const double* allmomenta, // input[(npar=4)*(np4=4)*nevt]
-                 //const double fmass,
+  void imzxxxM0( const fptype* allmomenta, // input[(npar=4)*(np4=4)*nevt]
+                 //const fptype fmass,
                  const int nhel,
                  const int nsf,
 #ifndef __CUDACC__
@@ -89,10 +88,10 @@ namespace MG5_sm
       const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
       //printf( "imzxxxM0: ievt=%d ieib=%d\n", ievt, threadIdx.x );
 #endif
-      const double& pvec0 = pIparIp4Ievt( allmomenta, ipar, 0, ievt );
-      const double& pvec1 = pIparIp4Ievt( allmomenta, ipar, 1, ievt );
-      const double& pvec2 = pIparIp4Ievt( allmomenta, ipar, 2, ievt );
-      const double& pvec3 = pIparIp4Ievt( allmomenta, ipar, 3, ievt );
+      const fptype& pvec0 = pIparIp4Ievt( allmomenta, ipar, 0, ievt );
+      const fptype& pvec1 = pIparIp4Ievt( allmomenta, ipar, 1, ievt );
+      const fptype& pvec2 = pIparIp4Ievt( allmomenta, ipar, 2, ievt );
+      const fptype& pvec3 = pIparIp4Ievt( allmomenta, ipar, 3, ievt );
 #if defined __CUDACC__ && !defined MGONGPU_WFMEM_LOCAL
       cxtype& fi0 = fiv[ipar*nw6*neib + 0*neib + ieib];
       cxtype& fi1 = fiv[ipar*nw6*neib + 1*neib + ieib];
@@ -141,8 +140,8 @@ namespace MG5_sm
 #ifdef __CUDACC__
   __device__
 #endif
-  void ixzxxxM0( const double* allmomenta, // input[(npar=4)*(np4=4)*nevt]
-                 //const double fmass,
+  void ixzxxxM0( const fptype* allmomenta, // input[(npar=4)*(np4=4)*nevt]
+                 //const fptype fmass,
                  const int nhel,
                  const int nsf,
 #ifndef __CUDACC__
@@ -170,10 +169,10 @@ namespace MG5_sm
       const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
       //printf( "ixzxxxM0: ievt=%d ieib=%d\n", ievt, threadIdx.x );
 #endif
-      const double& pvec0 = pIparIp4Ievt( allmomenta, ipar, 0, ievt );
-      const double& pvec1 = pIparIp4Ievt( allmomenta, ipar, 1, ievt );
-      const double& pvec2 = pIparIp4Ievt( allmomenta, ipar, 2, ievt );
-      const double& pvec3 = pIparIp4Ievt( allmomenta, ipar, 3, ievt );
+      const fptype& pvec0 = pIparIp4Ievt( allmomenta, ipar, 0, ievt );
+      const fptype& pvec1 = pIparIp4Ievt( allmomenta, ipar, 1, ievt );
+      const fptype& pvec2 = pIparIp4Ievt( allmomenta, ipar, 2, ievt );
+      const fptype& pvec3 = pIparIp4Ievt( allmomenta, ipar, 3, ievt );
 #if defined __CUDACC__ && !defined MGONGPU_WFMEM_LOCAL
       cxtype& fi0 = fiv[ipar*nw6*neib + 0*neib + ieib];
       cxtype& fi1 = fiv[ipar*nw6*neib + 1*neib + ieib];
@@ -195,7 +194,7 @@ namespace MG5_sm
       // ASSUMPTIONS FMASS = 0 and
       // (PX and PY are not 0)
       {
-        const double sqp0p3 = sqrt( pvec0 + pvec3 ) * nsf;
+        const fptype sqp0p3 = sqrt( pvec0 + pvec3 ) * nsf;
         const cxtype chi0( sqp0p3, 0 );
         const cxtype chi1( nh * pvec1 / sqp0p3, pvec2 / sqp0p3 );
         if ( nh == 1 )
@@ -223,8 +222,8 @@ namespace MG5_sm
 #ifdef __CUDACC__
   __device__
 #endif
-  void oxzxxxM0( const double* allmomenta, // input[(npar=4)*(np4=4)*nevt]
-                 //const double fmass,
+  void oxzxxxM0( const fptype* allmomenta, // input[(npar=4)*(np4=4)*nevt]
+                 //const fptype fmass,
                  const int nhel,
                  const int nsf,
 #ifndef __CUDACC__
@@ -252,10 +251,10 @@ namespace MG5_sm
       const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
       //printf( "oxzxxxM0: ievt=%d ieib=%d\n", ievt, threadIdx.x );
 #endif
-      const double& pvec0 = pIparIp4Ievt( allmomenta, ipar, 0, ievt );
-      const double& pvec1 = pIparIp4Ievt( allmomenta, ipar, 1, ievt );
-      const double& pvec2 = pIparIp4Ievt( allmomenta, ipar, 2, ievt );
-      const double& pvec3 = pIparIp4Ievt( allmomenta, ipar, 3, ievt );
+      const fptype& pvec0 = pIparIp4Ievt( allmomenta, ipar, 0, ievt );
+      const fptype& pvec1 = pIparIp4Ievt( allmomenta, ipar, 1, ievt );
+      const fptype& pvec2 = pIparIp4Ievt( allmomenta, ipar, 2, ievt );
+      const fptype& pvec3 = pIparIp4Ievt( allmomenta, ipar, 3, ievt );
 #if defined __CUDACC__ && !defined MGONGPU_WFMEM_LOCAL
       cxtype& fo0 = fov[ipar*nw6*neib + 0*neib + ieib];
       cxtype& fo1 = fov[ipar*nw6*neib + 1*neib + ieib];
@@ -278,7 +277,7 @@ namespace MG5_sm
       // EITHER (Px and Py are not zero)
       // OR (PX = PY = 0 and E = P3 > 0)
       {
-        const double sqp0p3 = sqrt( pvec0 + pvec3 ) * nsf;
+        const fptype sqp0p3 = sqrt( pvec0 + pvec3 ) * nsf;
         const cxtype chi0( sqp0p3, 0 );
         const cxtype chi1( nh * pvec1 / sqp0p3, -pvec2 / sqp0p3 );
         if( nh == 1 )
@@ -312,7 +311,7 @@ namespace MG5_sm
               const cxtype COUP,
               cxtype * vertex)
   {
-    const cxtype cI = cxtype (0., 1.);
+    const cxtype cI = cxtype( 0, 1 );
     const cxtype TMP4 =
       (F1[2] * (F2[4] * (V3[2] + V3[5]) + F2[5] * (V3[3] + cI * (V3[4]))) +
        (F1[3] * (F2[4] * (V3[3] - cI * (V3[4])) + F2[5] * (V3[2] - V3[5])) +
@@ -329,14 +328,14 @@ namespace MG5_sm
   void FFV1P0_3(const cxtype F1[],
                 const cxtype F2[],
                 const cxtype COUP,
-                const double M3,
-                const double W3,
+                const fptype M3,
+                const fptype W3,
                 cxtype V3[])
   {
-    const cxtype cI = cxtype (0., 1.);
-    V3[0] = +F1[0] + F2[0];
-    V3[1] = +F1[1] + F2[1];
-    const double P3[4] = { -V3[0].real(),
+    const cxtype cI = cxtype( 0, 1 );
+    V3[0] = + F1[0] + F2[0];
+    V3[1] = + F1[1] + F2[1];
+    const fptype P3[4] = { -V3[0].real(),
                            -V3[1].real(),
                            -V3[1].imag(),
                            -V3[0].imag() };
@@ -360,14 +359,16 @@ namespace MG5_sm
                 const cxtype COUP2,
                 cxtype * vertex)
   {
-    const cxtype cI = cxtype (0., 1.);
+    const fptype fp1 = 1;
+    const fptype fp2 = 2;
+    const cxtype cI = cxtype( 0, 1 );
     const cxtype TMP2 =
       (F1[4] * (F2[2] * (V3[2] - V3[5]) - F2[3] * (V3[3] + cI * (V3[4]))) +
        F1[5] * (F2[2] * (-V3[3] + cI * (V3[4])) + F2[3] * (V3[2] + V3[5])));
     const cxtype TMP0 =
       (F1[2] * (F2[4] * (V3[2] + V3[5]) + F2[5] * (V3[3] + cI * (V3[4]))) +
        F1[3] * (F2[4] * (V3[3] - cI * (V3[4])) + F2[5] * (V3[2] - V3[5])));
-    (*vertex) = (-1.) * (COUP2 * (+cI * (TMP0) + 2. * cI * (TMP2)) + cI * (TMP0 * COUP1));
+    (*vertex) = -fp1 * (COUP2 * (+cI * (TMP0) + fp2 * cI * (TMP2)) + cI * (TMP0 * COUP1));
   }
 
   //--------------------------------------------------------------------------
@@ -379,16 +380,18 @@ namespace MG5_sm
                 const cxtype F2[],
                 const cxtype COUP1,
                 const cxtype COUP2,
-                const double M3,
-                const double W3,
+                const fptype M3,
+                const fptype W3,
                 cxtype V3[])
   {
-    const cxtype cI = cxtype (0., 1.);
-    double OM3 = 0.;
-    if (M3 != 0.) OM3 = 1./(M3 * M3);
-    V3[0] = +F1[0] + F2[0];
-    V3[1] = +F1[1] + F2[1];
-    const double P3[4] = { -V3[0].real(),
+    const fptype fp1 = 1;
+    const fptype fp2 = 2;
+    const cxtype cI = cxtype( 0, 1 );
+    fptype OM3 = 0;
+    if ( M3 != 0 ) OM3 = fp1 / ( M3 * M3 );
+    V3[0] = + F1[0] + F2[0];
+    V3[1] = + F1[1] + F2[1];
+    const fptype P3[4] = { -V3[0].real(),
                            -V3[1].real(),
                            -V3[1].imag(),
                            -V3[0].imag() };
@@ -399,26 +402,26 @@ namespace MG5_sm
       (F1[4] * (F2[2] * (P3[0] - P3[3]) - F2[3] * (P3[1] + cI * (P3[2]))) +
        F1[5] * (F2[2] * (-P3[1] + cI * (P3[2])) + F2[3] * (P3[0] + P3[3])));
     const cxtype denom =
-      1./((P3[0] * P3[0]) - (P3[1] * P3[1]) - (P3[2] * P3[2]) -
+      fp1 / ((P3[0] * P3[0]) - (P3[1] * P3[1]) - (P3[2] * P3[2]) -
           (P3[3] * P3[3]) - M3 * (M3 - cI * W3));
-    V3[2] = denom * (-2. * cI) *
-      (COUP2 * (OM3 * - 1./2. * P3[0] * (TMP1 + 2. * (TMP3))
-                + (+1./2. * (F1[2] * F2[4] + F1[3] * F2[5]) + F1[4] * F2[2] + F1[5] * F2[3]))
-       + 1./2. * (COUP1 * (F1[2] * F2[4] + F1[3] * F2[5] - P3[0] * OM3 * TMP1)));
-    V3[3] = denom * (-2. * cI) *
-      (COUP2 * (OM3 * - 1./2. * P3[1] * (TMP1 + 2. * (TMP3))
-                + (-1./2. * (F1[2] * F2[5] + F1[3] * F2[4]) + F1[4] * F2[3] + F1[5] * F2[2]))
-       - 1./2. * (COUP1 * (F1[2] * F2[5] + F1[3] * F2[4] + P3[1] * OM3 * TMP1)));
+    V3[2] = denom * (-fp2 * cI) *
+      (COUP2 * (OM3 * - fp1/fp2 * P3[0] * (TMP1 + fp2 * (TMP3))
+                + (+fp1/fp2 * (F1[2] * F2[4] + F1[3] * F2[5]) + F1[4] * F2[2] + F1[5] * F2[3]))
+       + fp1/fp2 * (COUP1 * (F1[2] * F2[4] + F1[3] * F2[5] - P3[0] * OM3 * TMP1)));
+    V3[3] = denom * (-fp2 * cI) *
+      (COUP2 * (OM3 * - fp1/fp2 * P3[1] * (TMP1 + fp2 * (TMP3))
+                + (-fp1/fp2 * (F1[2] * F2[5] + F1[3] * F2[4]) + F1[4] * F2[3] + F1[5] * F2[2]))
+       - fp1/fp2 * (COUP1 * (F1[2] * F2[5] + F1[3] * F2[4] + P3[1] * OM3 * TMP1)));
     V3[4] = denom * cI *
-      (COUP2 * (OM3 * P3[2] * (TMP1 + 2. * (TMP3))
+      (COUP2 * (OM3 * P3[2] * (TMP1 + fp2 * (TMP3))
                 + (+cI * (F1[2] * F2[5]) - cI * (F1[3] * F2[4])
-                   - 2. * cI * (F1[4] * F2[3])
-                   + 2. * cI * (F1[5] * F2[2])))
+                   - fp2 * cI * (F1[4] * F2[3])
+                   + fp2 * cI * (F1[5] * F2[2])))
        + COUP1 * (+cI * (F1[2] * F2[5]) - cI * (F1[3] * F2[4]) + P3[2] * OM3 * TMP1));
-    V3[5] = denom * 2. * cI *
-      (COUP2 * (OM3 * 1./2. * P3[3] * (TMP1 + 2. * (TMP3)) +
-                (+1./2. * (F1[2] * F2[4]) - 1./2. * (F1[3] * F2[5]) - F1[4] * F2[2] + F1[5] * F2[3]))
-       + 1./2. * (COUP1 * (F1[2] * F2[4] + P3[3] * OM3 * TMP1 - F1[3] * F2[5])));
+    V3[5] = denom * fp2 * cI *
+      (COUP2 * (OM3 * fp1/fp2 * P3[3] * (TMP1 + fp2 * (TMP3)) +
+                (+fp1/fp2 * (F1[2] * F2[4]) - fp1/fp2 * (F1[3] * F2[5]) - F1[4] * F2[2] + F1[5] * F2[3]))
+       + fp1/fp2 * (COUP1 * (F1[2] * F2[4] + P3[3] * OM3 * TMP1 - F1[3] * F2[5])));
   }
 
 
@@ -455,12 +458,12 @@ namespace Proc
 
 #ifdef __CUDACC__
   __device__ __constant__ int cHel[ncomb][npar];
-  __device__ __constant__ double cIPC[6];  // coupling ?
-  __device__ __constant__ double cIPD[2];
+  __device__ __constant__ fptype cIPC[6];  // coupling ?
+  __device__ __constant__ fptype cIPD[2];
 #else
   static int cHel[ncomb][npar];
-  static double cIPC[6];  // coupling ?
-  static double cIPD[2];
+  static fptype cIPC[6];  // coupling ?
+  static fptype cIPD[2];
 #endif
 
 #ifdef __CUDACC__
@@ -547,8 +550,8 @@ namespace Proc
   // SOA: allmomenta[npar][np4][ndim]
   // AOS: allmomenta[ndim][npar][np4]
   void calculate_wavefunctions( int ihel,
-                                const double* allmomenta, // input[(npar=4)*(np4=4)*nevt]
-                                double &matrix
+                                const fptype* allmomenta, // input[(npar=4)*(np4=4)*nevt]
+                                fptype &matrix
 #ifndef __CUDACC__
                                 , const int ievt
 #endif
@@ -603,8 +606,8 @@ namespace Proc
     cxtype jamp[ncolor];
 
     // The color matrix;
-    static const double denom[ncolor] = {1};
-    static const double cf[ncolor][ncolor] = {{1}};
+    static const fptype denom[ncolor] = {1};
+    static const fptype cf[ncolor][ncolor] = {{1}};
 
     // Calculate color flows
     jamp[0] = -amp[0] - amp[1];
@@ -649,8 +652,8 @@ namespace Proc
 #else
     memcpy( cHel, tHel, ncomb * nexternal * sizeof(int) );
 #endif
-    // SANITY CHECK: GPU shared memory usage is based on casts of double[2] to complex
-    assert( sizeof(cxtype) == 2*sizeof(double) );
+    // SANITY CHECK: GPU shared memory usage is based on casts of fptype[2] to cxtype
+    assert( sizeof(cxtype) == 2*sizeof(fptype) );
   }
 
   //--------------------------------------------------------------------------
@@ -659,7 +662,7 @@ namespace Proc
 
   //--------------------------------------------------------------------------
 
-  const std::vector<double> &CPPProcess::getMasses() const {return mME;}
+  const std::vector<fptype> &CPPProcess::getMasses() const {return mME;}
 
   //--------------------------------------------------------------------------
   // Initialize process.
@@ -682,16 +685,15 @@ namespace Proc
     mME.push_back(pars->ZERO);
     mME.push_back(pars->ZERO);
     mME.push_back(pars->ZERO);
-    static cxtype tIPC[3] = {pars->GC_3, pars->GC_50,
-                               pars->GC_59};
-    static double tIPD[2] = {pars->mdl_MZ, pars->mdl_WZ};
+    static cxtype tIPC[3] = {(cxtype)pars->GC_3, (cxtype)pars->GC_50, (cxtype)pars->GC_59};
+    static fptype tIPD[2] = {(fptype)pars->mdl_MZ, (fptype)pars->mdl_WZ};
 
 #ifdef __CUDACC__
     checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 3 * sizeof(cxtype ) ) );
-    checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof(double) ) );
+    checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof(fptype) ) );
 #else
     memcpy( cIPC, tIPC, 3 * sizeof(cxtype ) );
-    memcpy( cIPD, tIPD, 2 * sizeof(double) );
+    memcpy( cIPD, tIPD, 2 * sizeof(fptype) );
 #endif
 
   }
@@ -706,8 +708,8 @@ namespace Proc
 #ifdef __CUDACC__
   __global__
 #endif
-  void sigmaKin( const double* allmomenta, // input[(npar=4)*(np4=4)*nevt]
-                 double* output            // output[nevt]
+  void sigmaKin( const fptype* allmomenta, // input[(npar=4)*(np4=4)*nevt]
+                 fptype* output            // output[nevt]
 #ifdef __CUDACC__
                  // NB: nevt == ndim=gpublocks*gputhreads in CUDA
 #else
@@ -743,7 +745,7 @@ namespace Proc
       const int denominators[nprocesses] = {4};
 
       // Reset the matrix elements
-      double matrix_element[nprocesses];
+      fptype matrix_element[nprocesses];
       for(int iproc = 0; iproc < nprocesses; iproc++ )
       {
         matrix_element[iproc] = 0.;
@@ -754,7 +756,7 @@ namespace Proc
       sigmakin_alloc();
 #endif
 #endif
-      double melast = matrix_element[0];
+      fptype melast = matrix_element[0];
       for (int ihel = 0; ihel < ncomb; ihel++ )
       {
         if ( sigmakin_itry>maxtry && !sigmakin_goodhel[ihel] ) continue;
diff --git a/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h
index 372bf94e3d..9ebd5aec93 100644
--- a/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h
+++ b/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h
@@ -1,5 +1,4 @@
 #include "mgOnGpuConfig.h"
-using mgOnGpu::cxtype;
 
 //==========================================================================
 // This file has been automatically generated for C++ Standalone by
@@ -64,7 +63,7 @@ namespace Proc
 
     virtual int code() const {return 1;}
 
-    const std::vector<double> &getMasses() const;
+    const std::vector<fptype> &getMasses() const;
 
     void setInitial(int inid1, int inid2)
     {
@@ -108,7 +107,7 @@ namespace Proc
     Parameters_sm * pars;
 
     // vector with external particle masses
-    std::vector<double> mME;
+    std::vector<fptype> mME;
 
     // Initial particle ids
     int id1, id2;
@@ -119,9 +118,9 @@ namespace Proc
 
 #ifdef __CUDACC__
   __global__
-  void sigmaKin( const double* allmomenta, double* output );
+  void sigmaKin( const fptype* allmomenta, fptype* output );
 #else
-  void sigmaKin( const double* allmomenta, double* output, const int nevt );
+  void sigmaKin( const fptype* allmomenta, fptype* output, const int nevt );
 #endif
 
   //--------------------------------------------------------------------------
diff --git a/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/check.cc b/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/check.cc
index a63194712e..025cd2e0bd 100644
--- a/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/check.cc
+++ b/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/check.cc
@@ -118,7 +118,7 @@ int main(int argc, char **argv)
   // Read param_card and set parameters
   process.initProc("../../Cards/param_card.dat");
 
-  const double energy = 1500;
+  const fptype energy = 1500;
   const int meGeVexponent = -(2 * process.nexternal - 8);
 
   // --- 0b. Allocate memory structures
@@ -131,59 +131,59 @@ int main(int argc, char **argv)
   using mgOnGpu::npar;
   const int nRnarray = np4*nparf*ndim; // (NB: ndim=npag*nepp for ASA layouts) 
 #ifdef __CUDACC__
-  const int nbytesRnarray = nRnarray * sizeof(double);
-  double* devRnarray = 0; // AOSOA[npag][nparf][np4][nepp] (NB: ndim=npag*nepp)
+  const int nbytesRnarray = nRnarray * sizeof(fptype);
+  fptype* devRnarray = 0; // AOSOA[npag][nparf][np4][nepp] (NB: ndim=npag*nepp)
   checkCuda( cudaMalloc( &devRnarray, nbytesRnarray ) );
 #if defined MGONGPU_CURAND_ONHOST
-  double* hstRnarray = 0; // AOSOA[npag][nparf][np4][nepp] (NB: ndim=npag*nepp)
+  fptype* hstRnarray = 0; // AOSOA[npag][nparf][np4][nepp] (NB: ndim=npag*nepp)
   checkCuda( cudaMallocHost( &hstRnarray, nbytesRnarray ) );
 #endif
 #else
-  double* hstRnarray = 0; // AOSOA[npag][nparf][np4][nepp] (NB: ndim=npag*nepp)
-  hstRnarray = new double[nRnarray]();
+  fptype* hstRnarray = 0; // AOSOA[npag][nparf][np4][nepp] (NB: ndim=npag*nepp)
+  hstRnarray = new fptype[nRnarray]();
 #endif
 
   const int nMomenta = np4*npar*ndim; // (NB: ndim=npag*nepp for ASA layouts)
 #if defined MGONGPU_LAYOUT_ASA
-  double* hstMomenta = 0; // AOSOA[npag][npar][np4][nepp] (previously was: lp)
+  fptype* hstMomenta = 0; // AOSOA[npag][npar][np4][nepp] (previously was: lp)
 #elif defined MGONGPU_LAYOUT_SOA
-  double* hstMomenta = 0; // SOA[npar][np4][ndim] (previously was: lp)
+  fptype* hstMomenta = 0; // SOA[npar][np4][ndim] (previously was: lp)
 #elif defined MGONGPU_LAYOUT_AOS
-  double* hstMomenta = 0; // AOS[ndim][npar][np4] (previously was: lp)
+  fptype* hstMomenta = 0; // AOS[ndim][npar][np4] (previously was: lp)
 #endif
 #ifdef __CUDACC__
-  const int nbytesMomenta = nMomenta * sizeof(double);
+  const int nbytesMomenta = nMomenta * sizeof(fptype);
   checkCuda( cudaMallocHost( &hstMomenta, nbytesMomenta ) );
-  double* devMomenta = 0; // (previously was: allMomenta)
+  fptype* devMomenta = 0; // (previously was: allMomenta)
   checkCuda( cudaMalloc( &devMomenta, nbytesMomenta ) );
 #else
-  hstMomenta = new double[nMomenta]();
+  hstMomenta = new fptype[nMomenta]();
 #endif
 
   const int nWeights = ndim; //  (NB: ndim=npag*nepp for ASA layouts)
-  double* hstWeights = 0; // (previously was: meHostPtr)
+  fptype* hstWeights = 0; // (previously was: meHostPtr)
 #ifdef __CUDACC__
-  const int nbytesWeights = nWeights * sizeof(double);
+  const int nbytesWeights = nWeights * sizeof(fptype);
   checkCuda( cudaMallocHost( &hstWeights, nbytesWeights ) );
-  double* devWeights = 0; // (previously was: meDevPtr)
+  fptype* devWeights = 0; // (previously was: meDevPtr)
   checkCuda( cudaMalloc( &devWeights, nbytesWeights ) );
 #else
-  hstWeights = new double[nWeights]();
+  hstWeights = new fptype[nWeights]();
 #endif
 
   const int nMEs = ndim; //  (NB: ndim=npag*nepp for ASA layouts)
-  double* hstMEs = 0; // (previously was: meHostPtr)
+  fptype* hstMEs = 0; // (previously was: meHostPtr)
 #ifdef __CUDACC__
-  const int nbytesMEs = nMEs * sizeof(double);
+  const int nbytesMEs = nMEs * sizeof(fptype);
   checkCuda( cudaMallocHost( &hstMEs, nbytesMEs ) );
-  double* devMEs = 0; // (previously was: meDevPtr)
+  fptype* devMEs = 0; // (previously was: meDevPtr)
   checkCuda( cudaMalloc( &devMEs, nbytesMEs ) );
 #else
-  hstMEs = new double[nMEs]();
+  hstMEs = new fptype[nMEs]();
 #endif
 
   float* wavetimes = new float[niter]();
-  double* matrixelementvector = new double[niter * ndim * process.nprocesses]();
+  fptype* matrixelementvector = new fptype[niter * ndim * process.nprocesses]();
   
 #ifdef __CUDACC__
 #if defined MGONGPU_WFMEM_GLOBAL
diff --git a/examples/gpu/eemumu_AV/src/mgOnGpuConfig.h b/examples/gpu/eemumu_AV/src/mgOnGpuConfig.h
index a0c8eeb28e..7f8fbc5388 100644
--- a/examples/gpu/eemumu_AV/src/mgOnGpuConfig.h
+++ b/examples/gpu/eemumu_AV/src/mgOnGpuConfig.h
@@ -15,6 +15,10 @@
 //#define MGONGPU_WFMEM_GLOBAL 1
 //#define MGONGPU_WFMEM_SHARED 1
 
+// Floating point precision (CHOOSE ONLY ONE)
+//#define MGONGPU_FPTYPE_DOUBLE 1 // default
+#define MGONGPU_FPTYPE_FLOAT 1
+
 #ifdef __CUDACC__
 #include <thrust/complex.h>
 #else
@@ -52,14 +56,17 @@ namespace mgOnGpu
   //const int nepp = 4; // FOR DEBUGGING!
 
   // Floating point type: fptype (tflpoint? tfloatpt?)
-  //typedef double fptype; // double precision (8 bytes, fp64)
-  //typedef float fptype; // single precision (4 bytes, fp32)
+#if defined MGONGPU_FPTYPE_DOUBLE
+  typedef double fptype; // double precision (8 bytes, fp64)
+#elif defined MGONGPU_FPTYPE_FLOAT
+  typedef float fptype; // single precision (4 bytes, fp32)
+#endif
 
   // Complex type: cxtype (tcomplex?)
 #ifdef __CUDACC__
-  typedef thrust::complex<double> cxtype; // two doubles: RI
+  typedef thrust::complex<fptype> cxtype; // two doubles: RI
 #else
-  typedef std::complex<double> cxtype; // two doubles: RI
+  typedef std::complex<fptype> cxtype; // two doubles: RI
 #endif
 
   // Vector types: <type>_v is a <type>[256]
@@ -68,4 +75,8 @@ namespace mgOnGpu
 
 }
 
+// Expose typedefs outside the namespace
+using mgOnGpu::fptype;
+using mgOnGpu::cxtype;
+
 #endif // MGONGPUCONFIG_H
diff --git a/examples/gpu/eemumu_AV/src/rambo2toNm0.cc b/examples/gpu/eemumu_AV/src/rambo2toNm0.cc
index 800dbfdd47..7f33aabb15 100644
--- a/examples/gpu/eemumu_AV/src/rambo2toNm0.cc
+++ b/examples/gpu/eemumu_AV/src/rambo2toNm0.cc
@@ -19,28 +19,28 @@ namespace rambo2toNm0
 #ifdef __CUDACC__
   __global__
 #endif
-  void getMomentaInitial( const double energy,    // input: energy
+  void getMomentaInitial( const fptype energy,    // input: energy
 #if defined MGONGPU_LAYOUT_ASA
-                          double momenta1d[],     // output: momenta as AOSOA[npag][npar][4][nepp]
+                          fptype momenta1d[],     // output: momenta as AOSOA[npag][npar][4][nepp]
 #elif defined MGONGPU_LAYOUT_SOA
-                          double momenta1d[],     // output: momenta as SOA[npar][4][nevt]
+                          fptype momenta1d[],     // output: momenta as SOA[npar][4][nevt]
 #elif defined MGONGPU_LAYOUT_AOS
-                          double momenta1d[],     // output: momenta as AOS[nevt][npar][4]
+                          fptype momenta1d[],     // output: momenta as AOS[nevt][npar][4]
 #endif
                           const int nevt )        // input: #events
   {
 #if defined MGONGPU_LAYOUT_ASA
     using mgOnGpu::nepp;
-    double (*momenta)[npar][np4][nepp] = (double (*)[npar][np4][nepp]) momenta1d; // cast to multiD array pointer (AOSOA)
+    fptype (*momenta)[npar][np4][nepp] = (fptype (*)[npar][np4][nepp]) momenta1d; // cast to multiD array pointer (AOSOA)
 #elif defined MGONGPU_LAYOUT_SOA
     // Cast is impossible in CUDA C ("error: expression must have a constant value")
-    //double (*momenta)[np4][nevt] = (double (*)[np4][nevt]) momenta1d; // cast to multiD array pointer (SOA)
+    //fptype (*momenta)[np4][nevt] = (fptype (*)[np4][nevt]) momenta1d; // cast to multiD array pointer (SOA)
 #elif defined MGONGPU_LAYOUT_AOS
-    double (*momenta)[npar][np4] = (double (*)[npar][np4]) momenta1d; // cast to multiD array pointer (AOS)
+    fptype (*momenta)[npar][np4] = (fptype (*)[npar][np4]) momenta1d; // cast to multiD array pointer (AOS)
 #endif
-    const double energy1 = energy/2;
-    const double energy2 = energy/2;
-    const double mom = energy/2;
+    const fptype energy1 = energy/2;
+    const fptype energy2 = energy/2;
+    const fptype mom = energy/2;
 #ifndef __CUDACC__
     // ** START LOOP ON IEVT **
     for (int ievt = 0; ievt < nevt; ++ievt)
@@ -92,16 +92,16 @@ namespace rambo2toNm0
 #ifdef __CUDACC__
   __global__
 #endif
-  void getMomentaFinal( const double energy,      // input: energy
-                        const double rnarray1d[], // input: randomnumbers in [0,1] as AOSOA[npag][nparf][4][nepp]
+  void getMomentaFinal( const fptype energy,      // input: energy
+                        const fptype rnarray1d[], // input: randomnumbers in [0,1] as AOSOA[npag][nparf][4][nepp]
 #if defined MGONGPU_LAYOUT_ASA
-                        double momenta1d[],       // output: momenta as AOSOA[npag][npar][4][nepp]
+                        fptype momenta1d[],       // output: momenta as AOSOA[npag][npar][4][nepp]
 #elif defined MGONGPU_LAYOUT_SOA
-                        double momenta1d[],       // output: momenta as SOA[npar][4][nevt]
+                        fptype momenta1d[],       // output: momenta as SOA[npar][4][nevt]
 #elif defined MGONGPU_LAYOUT_AOS
-                        double momenta1d[],       // output: momenta as AOS[nevt][npar][4]
+                        fptype momenta1d[],       // output: momenta as AOS[nevt][npar][4]
 #endif
-                        double wgts[],            // output: weights[nevt]
+                        fptype wgts[],            // output: weights[nevt]
                         const int nevt )          // input: #events
   {
     /****************************************************************************
@@ -117,14 +117,14 @@ namespace rambo2toNm0
      ****************************************************************************/
 
     // initialization step: factorials for the phase space weight
-    const double twopi = 8. * atan(1.);
-    const double po2log = log(twopi / 4.);
-    double z[nparf];
+    const fptype twopi = 8. * atan(1.);
+    const fptype po2log = log(twopi / 4.);
+    fptype z[nparf];
     z[1] = po2log;
     for (int kpar = 2; kpar < nparf; kpar++)
-      z[kpar] = z[kpar - 1] + po2log - 2. * log(double(kpar - 1));
+      z[kpar] = z[kpar - 1] + po2log - 2. * log(fptype(kpar - 1));
     for (int kpar = 2; kpar < nparf; kpar++)
-      z[kpar] = (z[kpar] - log(double(kpar)));
+      z[kpar] = (z[kpar] - log(fptype(kpar)));
 
 #ifndef __CUDACC__
     // ** START LOOP ON IEVT **
@@ -140,27 +140,27 @@ namespace rambo2toNm0
       using mgOnGpu::nepp;
       const int ipag = ievt/nepp; // #eventpage in this iteration
       const int iepp = ievt%nepp; // #event in the current eventpage in this iteration
-      double (*rnarray)[nparf][np4][nepp] = (double (*)[nparf][np4][nepp]) rnarray1d; // cast to multiD array pointer (AOSOA)
+      fptype (*rnarray)[nparf][np4][nepp] = (fptype (*)[nparf][np4][nepp]) rnarray1d; // cast to multiD array pointer (AOSOA)
 #if defined MGONGPU_LAYOUT_ASA
-      double (*momenta)[npar][np4][nepp] = (double (*)[npar][np4][nepp]) momenta1d; // cast to multiD array pointer (AOSOA)
+      fptype (*momenta)[npar][np4][nepp] = (fptype (*)[npar][np4][nepp]) momenta1d; // cast to multiD array pointer (AOSOA)
 #elif defined MGONGPU_LAYOUT_SOA
       // Cast is impossible in CUDA C ("error: expression must have a constant value")
-      //double (*momenta)[np4][nevt] = (double (*)[np4][nevt]) momenta1d; // cast to multiD array pointer (SOA)
+      //fptype (*momenta)[np4][nevt] = (fptype (*)[np4][nevt]) momenta1d; // cast to multiD array pointer (SOA)
 #elif defined MGONGPU_LAYOUT_AOS
-      double (*momenta)[npar][np4] = (double (*)[npar][np4]) momenta1d; // cast to multiD array pointer (AOS)
+      fptype (*momenta)[npar][np4] = (fptype (*)[npar][np4]) momenta1d; // cast to multiD array pointer (AOS)
 #endif
-      double& wt = wgts[ievt];
+      fptype& wt = wgts[ievt];
 
       // generate n massless momenta in infinite phase space
-      double q[nparf][np4];
+      fptype q[nparf][np4];
       for (int iparf = 0; iparf < nparf; iparf++) {
-        const double r1 = rnarray[ipag][iparf][0][iepp];
-        const double r2 = rnarray[ipag][iparf][1][iepp];
-        const double r3 = rnarray[ipag][iparf][2][iepp];
-        const double r4 = rnarray[ipag][iparf][3][iepp];
-        const double c = 2. * r1 - 1.;
-        const double s = sqrt(1. - c * c);
-        const double f = twopi * r2;
+        const fptype r1 = rnarray[ipag][iparf][0][iepp];
+        const fptype r2 = rnarray[ipag][iparf][1][iepp];
+        const fptype r3 = rnarray[ipag][iparf][2][iepp];
+        const fptype r4 = rnarray[ipag][iparf][3][iepp];
+        const fptype c = 2. * r1 - 1.;
+        const fptype s = sqrt(1. - c * c);
+        const fptype f = twopi * r2;
         q[iparf][0] = -log(r3 * r4);
         q[iparf][3] = q[iparf][0] * c;
         q[iparf][2] = q[iparf][0] * s * cos(f);
@@ -168,24 +168,24 @@ namespace rambo2toNm0
       }
 
       // calculate the parameters of the conformal transformation
-      double r[np4];
-      double b[np4-1];
+      fptype r[np4];
+      fptype b[np4-1];
       for (int i4 = 0; i4 < np4; i4++)
         r[i4] = 0.;
       for (int iparf = 0; iparf < nparf; iparf++) {
         for (int i4 = 0; i4 < np4; i4++)
           r[i4] = r[i4] + q[iparf][i4];
       }
-      const double rmas = sqrt(pow(r[0], 2) - pow(r[3], 2) - pow(r[2], 2) - pow(r[1], 2));
+      const fptype rmas = sqrt(pow(r[0], 2) - pow(r[3], 2) - pow(r[2], 2) - pow(r[1], 2));
       for (int i4 = 1; i4 < np4; i4++)
         b[i4-1] = -r[i4] / rmas;
-      const double g = r[0] / rmas;
-      const double a = 1. / (1. + g);
-      const double x0 = energy / rmas;
+      const fptype g = r[0] / rmas;
+      const fptype a = 1. / (1. + g);
+      const fptype x0 = energy / rmas;
 
       // transform the q's conformally into the p's (i.e. the 'momenta')
       for (int iparf = 0; iparf < nparf; iparf++) {
-        double bq = b[0] * q[iparf][1] + b[1] * q[iparf][2] + b[2] * q[iparf][3];
+        fptype bq = b[0] * q[iparf][1] + b[1] * q[iparf][2] + b[2] * q[iparf][3];
 #if defined MGONGPU_LAYOUT_ASA
         for (int i4 = 1; i4 < np4; i4++)
           momenta[ipag][iparf+npari][i4][iepp] = x0 * (q[iparf][i4] + b[i4-1] * (q[iparf][0] + a * bq));
@@ -276,10 +276,14 @@ namespace rambo2toNm0
   // ** NB: the random numbers are always produced in the same order and are interpreted as an AOSOA
   // AOSOA: rnarray[npag][nparf][np4][nepp] where nevt=npag*nepp
   void generateRnArray( curandGenerator_t gen, // input: curand generator
-                        double rnarray1d[],    // output: randomnumbers in [0,1]
+                        fptype rnarray1d[],    // output: randomnumbers in [0,1]
                         const int nevt )       // input: #events
   {
+#if defined MGONGPU_FPTYPE_DOUBLE
     checkCurand( curandGenerateUniformDouble( gen, rnarray1d, np4*nparf*nevt ) );
+#elif defined MGONGPU_FPTYPE_FLOAT
+    checkCurand( curandGenerateUniform( gen, rnarray1d, np4*nparf*nevt ) );
+#endif
   }
 
   //--------------------------------------------------------------------------
diff --git a/examples/gpu/eemumu_AV/src/rambo2toNm0.h b/examples/gpu/eemumu_AV/src/rambo2toNm0.h
index 27c5005163..8f5e0497ec 100644
--- a/examples/gpu/eemumu_AV/src/rambo2toNm0.h
+++ b/examples/gpu/eemumu_AV/src/rambo2toNm0.h
@@ -43,13 +43,13 @@ namespace rambo2toNm0
 #ifdef __CUDACC__
   __global__
 #endif
-  void getMomentaInitial( const double energy,    // input: energy
+  void getMomentaInitial( const fptype energy,    // input: energy
 #if defined MGONGPU_LAYOUT_ASA
-                          double momenta1d[],     // output: momenta as AOSOA[npag][npar][4][nepp]
+                          fptype momenta1d[],     // output: momenta as AOSOA[npag][npar][4][nepp]
 #elif defined MGONGPU_LAYOUT_SOA
-                          double momenta1d[],     // output: momenta as SOA[npar][4][nevt]
+                          fptype momenta1d[],     // output: momenta as SOA[npar][4][nevt]
 #elif defined MGONGPU_LAYOUT_AOS
-                          double momenta1d[],     // output: momenta as AOS[nevt][npar][4]
+                          fptype momenta1d[],     // output: momenta as AOS[nevt][npar][4]
 #endif
                           const int nevt );       // input: #events
 
@@ -60,16 +60,16 @@ namespace rambo2toNm0
 #ifdef __CUDACC__
   __global__
 #endif
-  void getMomentaFinal( const double energy,      // input: energy
-                        const double rnarray1d[], // input: randomnumbers in [0,1] as AOSOA[npag][nparf][4][nepp]
+  void getMomentaFinal( const fptype energy,      // input: energy
+                        const fptype rnarray1d[], // input: randomnumbers in [0,1] as AOSOA[npag][nparf][4][nepp]
 #if defined MGONGPU_LAYOUT_ASA
-                        double momenta1d[],       // output: momenta as AOSOA[npag][npar][4][nepp]
+                        fptype momenta1d[],       // output: momenta as AOSOA[npag][npar][4][nepp]
 #elif defined MGONGPU_LAYOUT_SOA
-                        double momenta1d[],       // output: momenta as SOA[npar][4][nevt]
+                        fptype momenta1d[],       // output: momenta as SOA[npar][4][nevt]
 #elif defined MGONGPU_LAYOUT_AOS
-                        double momenta1d[],       // output: momenta as AOS[nevt][npar][4]
+                        fptype momenta1d[],       // output: momenta as AOS[nevt][npar][4]
 #endif
-                        double wgts[],            // output: weights[nevt]
+                        fptype wgts[],            // output: weights[nevt]
                         const int nevt );         // input: #events
 
   //--------------------------------------------------------------------------
@@ -93,7 +93,7 @@ namespace rambo2toNm0
   // ** NB: the random numbers are always produced in the same order and are interpreted as an AOSOA
   // AOSOA: rnarray[npag][nparf][np4][nepp] where nevt=npag*nepp
   void generateRnArray( curandGenerator_t gen, // input: curand generator
-                        double rnarray1d[],    // output: randomnumbers in [0,1]
+                        fptype rnarray1d[],    // output: randomnumbers in [0,1]
                         const int nevt );      // input: #events
 
   //--------------------------------------------------------------------------