From 4a9d474fbeb2801fc47d07b1e4f627dd740563d3 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Wed, 23 May 2018 16:21:20 +0200
Subject: [PATCH] Clean up some GPU- and non-GPU-related pixel tracking code
 (#49)

  - clean up GPU- and non-GPU-related pixel tracking code and build files
  - drop obsolete or unused classes
  - rename RawToDigiGPU to SiPixelRawToDigiGPUKernel
  - move the definitions in SiPixelRawToDigiGPUKernel to the `pixelgpudetails` namespace
---
 .../SiPixelRawToDigi/plugins/BuildFile.xml    |   4 +-
 .../SiPixelRawToDigi/plugins/DetParamBits.h   |  39 -
 .../SiPixelRawToDigi/plugins/EventInfoGPU.h   |  11 -
 .../SiPixelRawToDigi/plugins/RawToDigiGPU.cu  | 693 -----------------
 .../SiPixelRawToDigi/plugins/RawToDigiGPU.h   | 216 ------
 .../SiPixelRawToDigi/plugins/SealModule.cc    |   8 -
 .../plugins/SiPixelDigiToRaw.cc               |   5 +-
 .../plugins/SiPixelFedCablingMapGPU.cc        |  45 +-
 .../plugins/SiPixelFedCablingMapGPU.h         |  54 +-
 .../plugins/SiPixelRawToDigi.cc               |   4 +
 .../plugins/SiPixelRawToDigiGPU.cc            |  44 +-
 .../plugins/SiPixelRawToDigiGPU.h             |  32 +-
 .../plugins/SiPixelRawToDigiGPUKernel.cu      | 696 ++++++++++++++++++
 .../plugins/SiPixelRawToDigiGPUKernel.h       | 212 ++++++
 .../SiPixelRecHits/interface/pixelCPEforGPU.h | 317 ++++----
 .../SiPixelRecHits/plugins/PixelRecHits.cu    |   4 +-
 .../SiPixelRecHits/plugins/PixelRecHits.h     |   6 +-
 .../plugins/SiPixelRecHitGPU.cc               |  18 +-
 .../SiPixelRecHits/plugins/gpuPixelRecHits.h  |  96 ++-
 19 files changed, 1221 insertions(+), 1283 deletions(-)
 delete mode 100644 EventFilter/SiPixelRawToDigi/plugins/DetParamBits.h
 delete mode 100644 EventFilter/SiPixelRawToDigi/plugins/EventInfoGPU.h
 delete mode 100644 EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.cu
 delete mode 100644 EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.h
 delete mode 100644 EventFilter/SiPixelRawToDigi/plugins/SealModule.cc
 create mode 100644 EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPUKernel.cu
 create mode 100644 EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPUKernel.h
diff --git a/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml b/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml
index 790b772fc6feb..3e047e2f90b57 100644
--- a/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml
+++ b/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml
@@ -1,8 +1,8 @@
 <use name="EventFilter/SiPixelRawToDigi"/>
-<library file="SiPixelDigiToRaw.cc SiPixelRawToDigi.cc SealModule.cc" name="EventFilterSiPixelRawToDigiPlugins">
+<library file="SiPixelDigiToRaw.cc SiPixelRawToDigi.cc" name="EventFilterSiPixelRawToDigiPlugins">
   <flags EDM_PLUGIN="1"/>
 </library>
-<library file="SiPixelRawToDigiGPU.cc SiPixelFedCablingMapGPU.cc RawToDigiGPU.cu" name="EventFilterSiPixelRawToDigiGPUPlugins">
+<library file="SiPixelRawToDigiGPU.cc SiPixelFedCablingMapGPU.cc SiPixelRawToDigiGPUKernel.cu" name="EventFilterSiPixelRawToDigiGPUPlugins">
   <use name="cuda"/>
   <use name="CalibTracker/SiPixelESProducers"/>
   <flags EDM_PLUGIN="1"/>
diff --git a/EventFilter/SiPixelRawToDigi/plugins/DetParamBits.h b/EventFilter/SiPixelRawToDigi/plugins/DetParamBits.h
deleted file mode 100644
index 8cc944c3c7f25..0000000000000
--- a/EventFilter/SiPixelRawToDigi/plugins/DetParamBits.h
+++ /dev/null
@@ -1,39 +0,0 @@
-// Sushil Dubey, Shashi Dugad, TIFR
-#ifndef DETPARAMBITS_H
-#define DETPARAMBITS_H
-typedef unsigned int uint;
-//reference
-//http://cmsdoxygen.web.cern.ch/cmsdoxygen/CMSSW_9_2_0/doc/html/d3/db2/PixelROC_8cc_source.html#l00197 
-const uint layerStartBit_  = 20;
-const uint ladderStartBit_ = 12;
-const uint moduleStartBit_ = 2;
-
-const uint panelStartBit_  = 10;
-const uint diskStartBit_   = 18;
-const uint bladeStartBit_  = 12;
-
-const uint layerMask_      = 0xF;
-const uint ladderMask_     = 0xFF;
-const uint moduleMask_     = 0x3FF;
-const uint panelMask_      = 0x3;
-const uint diskMask_       = 0xF;
-const uint bladeMask_      = 0x3F;
-
-// __host__ __device__ bool isBarrel(uint rawId) {
-//   return (1==((rawId>>25)&0x7));
-// }
-
-__host__ __device__ int getLayer(uint rawId) {
-  int layer  = (rawId >> layerStartBit_)  & layerMask_;
-  return layer;
-}
-
-__host__ __device__ int getDisk(uint rawId) {
-  // int side =1;
-  // unsigned int panel = ((rawId>>panelStartBit_) & panelMask_);
-  // if(panel==1) side = -1;
-   unsigned int disk = int((rawId>>diskStartBit_) & diskMask_);
-  // return disk*side;
-  return disk;
-}
-#endif
diff --git a/EventFilter/SiPixelRawToDigi/plugins/EventInfoGPU.h b/EventFilter/SiPixelRawToDigi/plugins/EventInfoGPU.h
deleted file mode 100644
index 1170717e4adf8..0000000000000
--- a/EventFilter/SiPixelRawToDigi/plugins/EventInfoGPU.h
+++ /dev/null
@@ -1,11 +0,0 @@
-/*Sushil Dubey, Shashi Dugad, TIFR
-*/
-
-#ifndef EVENTINFO_GPU
-#define EVENTINFO_GPU
-
-const int NEVENT = 1 ;  //optimal number of events to run simultaneously,
-// using 4 cuda stream, hence it should be multiple of 4
-const int NMODULE = 1856; // for phase 1, we have 1856 modules
-
-#endif 
diff --git a/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.cu b/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.cu
deleted file mode 100644
index 88f68f460441e..0000000000000
--- a/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.cu
+++ /dev/null
@@ -1,693 +0,0 @@
-/* Sushil Dubey, Shashi Dugad, TIFR, July 2017
- *
- * File Name: RawToDigiGPU.cu
- * Description: It converts Raw data into Digi Format on GPU
- * then it converts adc -> electron and
- * applies the adc threshold to needed for clustering
- * Finaly the Output of RawToDigi data is given to pixelClusterizer
- *
-**/
-
-#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h"
-#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
-
-
-
-// System includes
-#include <cstdio>
-#include <cstdlib>
-#include <cassert>
-#include <iomanip>
-#include <string>
-#include <iostream>
-#include <fstream>
-#include <chrono>
-
-// CUDA runtime
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <thrust/host_vector.h>
-#include <thrust/device_vector.h>
-#include <thrust/sort.h>
-#include <thrust/unique.h>
-#include <thrust/execution_policy.h>
-
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "EventInfoGPU.h"
-#include "RawToDigiGPU.h"
-#include "SiPixelFedCablingMapGPU.h"
-
-context initDeviceMemory() {
-
-  using namespace gpuClustering;
-  context c;
-
-  // Number of words for all the feds
-  constexpr uint32_t MAX_WORD08_SIZE = MAX_FED * MAX_WORD  * sizeof(uint8_t);
-  constexpr uint32_t MAX_WORD32_SIZE = MAX_FED * MAX_WORD  * sizeof(uint32_t);
-  constexpr uint32_t MAX_WORD16_SIZE = MAX_FED * MAX_WORD  * sizeof(uint16_t);
-  constexpr uint32_t vsize = sizeof(GPU::SimpleVector<error_obj>);
-  constexpr uint32_t esize = sizeof(error_obj);
-  constexpr uint32_t MAX_ERROR_SIZE  = MAX_FED * MAX_WORD * esize;
-
-  cudaCheck(cudaMalloc((void**) & c.word_d,        MAX_WORD32_SIZE));
-  cudaCheck(cudaMalloc((void**) & c.fedId_d,       MAX_WORD08_SIZE));
-  cudaCheck(cudaMalloc((void**) & c.pdigi_d,       MAX_WORD32_SIZE)); // to store thepacked digi
-  cudaCheck(cudaMalloc((void**) & c.xx_d,          MAX_WORD16_SIZE)); // to store the x and y coordinate
-  cudaCheck(cudaMalloc((void**) & c.yy_d,          MAX_WORD16_SIZE));
-  cudaCheck(cudaMalloc((void**) & c.adc_d,         MAX_WORD16_SIZE));
-
-  cudaCheck(cudaMalloc((void**) & c.moduleInd_d,   MAX_WORD16_SIZE));
-  cudaCheck(cudaMalloc((void**) & c.rawIdArr_d,    MAX_WORD32_SIZE));
-  cudaCheck(cudaMalloc((void**) & c.error_d,       vsize));
-  cudaCheck(cudaMalloc((void**) & c.data_d,        MAX_ERROR_SIZE));
-
-  // for the clusterizer
-  cudaCheck(cudaMalloc((void**) & c.clus_d,        MAX_WORD32_SIZE)); // cluser index in module
-
-  cudaCheck(cudaMalloc((void**) & c.moduleStart_d,   (MaxNumModules+1)*sizeof(uint32_t) ));
-  cudaCheck(cudaMalloc((void**) & c.clusInModule_d,   (MaxNumModules)*sizeof(uint32_t) ));
-  cudaCheck(cudaMalloc((void**) & c.moduleId_d,   (MaxNumModules)*sizeof(uint32_t) ));
-
-  cudaCheck(cudaMalloc((void**) & c.debug_d,        MAX_WORD32_SIZE));
-
-  // create a CUDA stream
-  cudaCheck(cudaStreamCreate(&c.stream));
-
-  return c;
-}
-
-
-void freeMemory(context & c) {
-  // free the GPU memory
-  cudaCheck(cudaFree(c.word_d));
-  cudaCheck(cudaFree(c.fedId_d));
-  cudaCheck(cudaFree(c.pdigi_d));
-  cudaCheck(cudaFree(c.xx_d));
-  cudaCheck(cudaFree(c.yy_d));
-  cudaCheck(cudaFree(c.adc_d));
-  cudaCheck(cudaFree(c.moduleInd_d));
-  cudaCheck(cudaFree(c.rawIdArr_d));
-  cudaCheck(cudaFree(c.error_d));
-  cudaCheck(cudaFree(c.data_d));
-
- // these are for the clusterizer (to be moved)
- cudaCheck(cudaFree(c.moduleStart_d));
- cudaCheck(cudaFree(c.clus_d));
- cudaCheck(cudaFree(c.clusInModule_d));
- cudaCheck(cudaFree(c.moduleId_d));
- cudaCheck(cudaFree(c.debug_d));
-
-
-  // destroy the CUDA stream
-  cudaCheck(cudaStreamDestroy(c.stream));
-}
-
-
-__device__ uint32_t getLink(uint32_t ww)  {
-  return ((ww >> LINK_shift) & LINK_mask);
-}
-
-
-__device__ uint32_t getRoc(uint32_t ww) {
-  return ((ww >> ROC_shift ) & ROC_mask);
-}
-
-
-__device__ uint32_t getADC(uint32_t ww) {
-  return ((ww >> ADC_shift) & ADC_mask);
-}
-
-
-__device__ bool isBarrel(uint32_t rawId) {
-  return (1==((rawId>>25)&0x7));
-}
-
-
-
-__device__ DetIdGPU getRawId(const SiPixelFedCablingMapGPU * Map, uint32_t fed, uint32_t link, uint32_t roc) {
-  uint32_t index = fed * MAX_LINK * MAX_ROC + (link-1) * MAX_ROC + roc;
-  DetIdGPU detId = { Map->RawId[index], Map->rocInDet[index], Map->moduleId[index] };
-  return detId;
-}
-
-
-//reference http://cmsdoxygen.web.cern.ch/cmsdoxygen/CMSSW_9_2_0/doc/html/dd/d31/FrameConversion_8cc_source.html
-//http://cmslxr.fnal.gov/source/CondFormats/SiPixelObjects/src/PixelROC.cc?v=CMSSW_9_2_0#0071
-// Convert local pixel to global pixel
-__device__ Pixel frameConversion(bool bpix, int side, uint32_t layer, uint32_t rocIdInDetUnit, Pixel local) {
-
-  int slopeRow  = 0, slopeCol = 0;
-  int rowOffset = 0, colOffset = 0;
-
-  if (bpix) {
-
-    if (side == -1 && layer != 1) { // -Z side: 4 non-flipped modules oriented like 'dddd', except Layer 1
-      if (rocIdInDetUnit < 8) {
-        slopeRow = 1;
-        slopeCol = -1;
-        rowOffset = 0;
-        colOffset = (8-rocIdInDetUnit)*numColsInRoc-1;
-      }
-      else {
-        slopeRow  = -1;
-        slopeCol  = 1;
-        rowOffset = 2*numRowsInRoc-1;
-        colOffset = (rocIdInDetUnit-8)*numColsInRoc;
-      } // if roc
-    }
-    else { // +Z side: 4 non-flipped modules oriented like 'pppp', but all 8 in layer1
-      if (rocIdInDetUnit < 8) {
-        slopeRow  = -1;
-        slopeCol  =  1;
-        rowOffset = 2*numRowsInRoc-1;
-        colOffset = rocIdInDetUnit * numColsInRoc;
-      }
-      else {
-        slopeRow  = 1;
-        slopeCol  = -1;
-        rowOffset = 0;
-        colOffset = (16-rocIdInDetUnit)*numColsInRoc-1;
-      }
-    }
-
-  }
-  else { // fpix
-    if (side==-1) { // pannel 1
-      if (rocIdInDetUnit < 8) {
-        slopeRow = 1;
-        slopeCol = -1;
-        rowOffset = 0;
-        colOffset = (8-rocIdInDetUnit)*numColsInRoc-1;
-      }
-      else {
-        slopeRow = -1;
-        slopeCol = 1;
-        rowOffset = 2*numRowsInRoc-1;
-        colOffset = (rocIdInDetUnit-8)*numColsInRoc;
-      }
-    }
-    else { // pannel 2
-      if (rocIdInDetUnit < 8) {
-        slopeRow = 1;
-        slopeCol = -1;
-        rowOffset = 0;
-        colOffset = (8-rocIdInDetUnit)*numColsInRoc-1;
-      }
-      else {
-        slopeRow = -1;
-        slopeCol = 1;
-        rowOffset = 2*numRowsInRoc-1;
-        colOffset = (rocIdInDetUnit-8)*numColsInRoc;
-      }
-
-    } // side
-
-  }
-
-  uint32_t gRow = rowOffset+slopeRow*local.row;
-  uint32_t gCol = colOffset+slopeCol*local.col;
-  //printf("Inside frameConversion row: %u, column: %u\n",gRow, gCol);
-  Pixel global = {gRow, gCol};
-  return global;
-}
-
-
-__device__ uint32_t conversionError(uint32_t fedId, uint32_t status, bool debug = false)
-{
-
-  uint32_t errorType = 0;
-
-  // debug = true;
-
-  switch (status) {
-      case(1) : {
-        if (debug) printf("Error in Fed: %i, invalid channel Id (errorType = 35\n)", fedId );
-        errorType = 35;
-        break;
-      }
-      case(2) : {
-        if (debug) printf("Error in Fed: %i, invalid ROC Id (errorType = 36)\n", fedId);
-        errorType = 36;
-        break;
-      }
-      case(3) : {
-        if (debug) printf("Error in Fed: %i, invalid dcol/pixel value (errorType = 37)\n", fedId);
-        errorType = 37;
-        break;
-      }
-      case(4) : {
-        if (debug) printf("Error in Fed: %i, dcol/pixel read out of order (errorType = 38)\n", fedId);
-        errorType = 38;
-        break;
-      }
-      default: if (debug) printf("Cabling check returned unexpected result, status = %i\n", status);
-  };
-
-  return errorType;
-
-}
-
-
-__device__ bool rocRowColIsValid(uint32_t rocRow, uint32_t rocCol)
-{
-    uint32_t numRowsInRoc = 80;
-    uint32_t numColsInRoc = 52;
-
-    /// row and collumn in ROC representation
-    return ( (rocRow < numRowsInRoc) & (rocCol < numColsInRoc) );
-}
-
-
-__device__ bool dcolIsValid(uint32_t dcol, uint32_t pxid)
-{
-    return ( (dcol < 26) &  (2 <= pxid) & (pxid < 162) );
-}
-
-
-__device__ uint32_t checkROC(uint32_t errorWord, uint32_t fedId, uint32_t link, const SiPixelFedCablingMapGPU *Map, bool debug = false)
-{
-
- int errorType = (errorWord >> ROC_shift) & ERROR_mask;
- if (errorType < 25) return false;
- bool errorFound = false;
-
- switch (errorType) {
-    case(25) : {
-     errorFound = true;
-     uint32_t index = fedId * MAX_LINK * MAX_ROC + (link-1) * MAX_ROC + 1;
-     if (index > 1 && index <= Map->size){
-       if (!(link == Map->link[index] && 1 == Map->roc[index])) errorFound = false;
-     }
-     if (debug&errorFound) printf("Invalid ROC = 25 found (errorType = 25)\n");
-     break;
-   }
-   case(26) : {
-     if (debug) printf("Gap word found (errorType = 26)\n");
-     errorFound = true;
-     break;
-   }
-   case(27) : {
-     if (debug) printf("Dummy word found (errorType = 27)\n");
-     errorFound = true;
-     break;
-   }
-   case(28) : {
-     if (debug) printf("Error fifo nearly full (errorType = 28)\n");
-     errorFound = true;
-     break;
-   }
-   case(29) : {
-     if (debug) printf("Timeout on a channel (errorType = 29)\n");
-     if ((errorWord >> OMIT_ERR_shift) & OMIT_ERR_mask) {
-       if (debug) printf("...first errorType=29 error, this gets masked out\n");
-     }
-     errorFound = true;
-     break;
-   }
-   case(30) : {
-     if (debug) printf("TBM error trailer (errorType = 30)\n");
-     int StateMatch_bits = 4;
-     int StateMatch_shift = 8;
-     uint32_t StateMatch_mask = ~(~uint32_t(0) << StateMatch_bits);
-     int StateMatch = (errorWord >> StateMatch_shift) & StateMatch_mask;
-     if ( StateMatch != 1 && StateMatch != 8 ) {
-       if (debug) printf("FED error 30 with unexpected State Bits (errorType = 30)\n");
-     }
-     if ( StateMatch == 1 ) errorType = 40; // 1=Overflow -> 40, 8=number of ROCs -> 30
-     errorFound = true;
-     break;
-   }
-   case(31) : {
-     if (debug) printf("Event number error (errorType = 31)\n");
-     errorFound = true;
-     break;
-   }
-   default: errorFound = false;
-
- };
-
- return errorFound? errorType : 0;
-
-}
-
-
-__device__ uint32_t getErrRawID(uint32_t fedId, uint32_t errWord, uint32_t errorType, const SiPixelFedCablingMapGPU *Map, bool debug = false)
-{
-
-  uint32_t rID = 0xffffffff;
-
-  switch (errorType) {
-    case  25 : case  30 : case  31 : case  36 : case 40 : {
-      //set dummy values for cabling just to get detId from link
-      //cabling.dcol = 0;
-      //cabling.pxid = 2;
-      uint32_t roc  = 1;
-      uint32_t link = (errWord >> LINK_shift) & LINK_mask;
-
-      uint32_t rID_temp = getRawId(Map, fedId, link, roc).RawId;
-      if(rID_temp != 9999) rID = rID_temp;
-      break;
-    }
-    case  29 : {
-      int chanNmbr = 0;
-      const int DB0_shift = 0;
-      const int DB1_shift = DB0_shift + 1;
-      const int DB2_shift = DB1_shift + 1;
-      const int DB3_shift = DB2_shift + 1;
-      const int DB4_shift = DB3_shift + 1;
-      const uint32_t DataBit_mask = ~(~uint32_t(0) << 1);
-
-      int CH1 = (errWord >> DB0_shift) & DataBit_mask;
-      int CH2 = (errWord >> DB1_shift) & DataBit_mask;
-      int CH3 = (errWord >> DB2_shift) & DataBit_mask;
-      int CH4 = (errWord >> DB3_shift) & DataBit_mask;
-      int CH5 = (errWord >> DB4_shift) & DataBit_mask;
-      int BLOCK_bits      = 3;
-      int BLOCK_shift     = 8;
-      uint32_t BLOCK_mask = ~(~uint32_t(0) << BLOCK_bits);
-      int BLOCK = (errWord >> BLOCK_shift) & BLOCK_mask;
-      int localCH = 1*CH1+2*CH2+3*CH3+4*CH4+5*CH5;
-      if (BLOCK%2==0) chanNmbr=(BLOCK/2)*9+localCH;
-      else chanNmbr = ((BLOCK-1)/2)*9+4+localCH;
-      if ((chanNmbr < 1)||(chanNmbr > 36)) break;  // signifies unexpected result
-
-      // set dummy values for cabling just to get detId from link if in Barrel
-      //cabling.dcol = 0;
-      //cabling.pxid = 2;
-      uint32_t roc  = 1;
-      uint32_t link = chanNmbr;
-      uint32_t rID_temp = getRawId(Map, fedId, link, roc).RawId;
-      if(rID_temp != 9999) rID = rID_temp;
-      break;
-    }
-    case  37 : case  38: {
-      //cabling.dcol = 0;
-      //cabling.pxid = 2;
-      uint32_t roc  = (errWord >> ROC_shift) & ROC_mask;
-      uint32_t link = (errWord >> LINK_shift) & LINK_mask;
-      uint32_t rID_temp = getRawId(Map, fedId, link, roc).RawId;
-      if(rID_temp != 9999) rID = rID_temp;
-      break;
-    }
-
-  default : break;
-
-  };
-
-  return rID;
-
-}
-
-
-/*----------
-* Name: applyADCthreshold_kernel()
-* Desc: converts adc count to electrons and then applies the
-* threshold on each channel.
-* make pixel to 0 if it is below the threshold
-* Input: xx_d[], yy_d[], layer_d[], wordCounter, adc[], ADCThreshold
-*-----------
-* Output: xx_adc[], yy_adc[] with pixel threshold applied
-*/
-// kernel to apply adc threshold on the channels
-
-
-// Felice: gains and pedestals are not the same for each pixel. This code should be rewritten to take
-// in account local gains/pedestals
-// __global__ void applyADCthreshold_kernel(const uint32_t *xx_d, const uint32_t *yy_d, const uint32_t *layer_d, uint32_t *adc, const uint32_t wordCounter,
-//  const ADCThreshold adcThreshold, uint32_t *xx_adc, uint32_t *yy_adc ) {
-//   int tid = threadIdx.x;
-//   int gIndex = blockDim.x*blockIdx.x+tid;
-//   if (gIndex<wordCounter) {
-//     uint32_t adcOld = adc[gIndex];
-//     const float gain = adcThreshold.theElectronPerADCGain_; // default: 1 ADC = 135 electrons
-//     const float pedestal = 0; //
-//     int adcNew = int(adcOld*gain+pedestal);
-//     // rare chance of entering into the if ()
-//     if (layer_d[gIndex]>=adcThreshold.theFirstStack_) {
-//       if (adcThreshold.theStackADC_==1 && adcOld==1) {
-//         adcNew = int(255*135); // Arbitrarily use overflow value.
-//       }
-//       if (adcThreshold.theStackADC_ >1 && adcThreshold.theStackADC_!=255 && adcOld>=1){
-//         adcNew = int((adcOld-1) * gain * 255/float(adcThreshold.theStackADC_-1));
-//       }
-//     }
-//
-//     if (adcNew >adcThreshold.thePixelThreshold ) {
-//       xx_adc[gIndex]=xx_d[gIndex];
-//       yy_adc[gIndex]=yy_d[gIndex];
-//     }
-//     else {
-//       xx_adc[gIndex]=0; // 0: dead pixel
-//       yy_adc[gIndex]=0;
-//     }
-//     adc[gIndex] = adcNew;
-//   }
-// }
-
-
-// Kernel to perform Raw to Digi conversion
-__global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint32_t wordCounter, const uint32_t *Word, const uint8_t *fedIds,
-                                 uint16_t * XX, uint16_t * YY, uint16_t * ADC,
-                                 uint32_t * pdigi, uint32_t *rawIdArr, uint16_t * moduleId,
-                                 GPU::SimpleVector<error_obj> *err,
-                                 bool useQualityInfo, bool includeErrors, bool debug)
-{
-  uint32_t blockId  = blockIdx.x;
-  uint32_t threadId  = threadIdx.x;
-
-  bool skipROC = false;
-  //if (threadId==0) printf("Event: %u blockId: %u start: %u end: %u\n", eventno, blockId, begin, end);
-  
-  for (int aaa=0; aaa<1; ++aaa) {  // too many coninue below.... (to be fixed)
-    auto gIndex = threadId + blockId*blockDim.x;
-    if (gIndex < wordCounter) {
-   
-      uint32_t fedId = fedIds[gIndex/2]; // +1200;    
-
-      // initialize (too many coninue below)
-      pdigi[gIndex]  = 0;
-      rawIdArr[gIndex] = 0;
-      moduleId[gIndex] = 9999; 
-
-      uint32_t ww = Word[gIndex]; // Array containing 32 bit raw data
-      if (ww == 0) {
-        //noise and dead channels are ignored
-        XX[gIndex]    = 0;  // 0 is an indicator of a noise/dead channel
-        YY[gIndex]    = 0; // skip these pixels during clusterization
-        ADC[gIndex]   = 0;
-        continue ; // 0: bad word
-      }
-
-      uint32_t link  = getLink(ww);            // Extract link
-      uint32_t roc   = getRoc(ww);             // Extract Roc in link
-      DetIdGPU detId = getRawId(Map, fedId, link, roc);
-
-      uint32_t errorType = checkROC(ww, fedId, link, Map, debug);
-      skipROC = (roc < maxROCIndex) ? false : (errorType != 0);
-      if (includeErrors and skipROC)
-      {
-        uint32_t rID = getErrRawID(fedId, ww, errorType, Map, debug);
-        err->emplace_back(rID, ww, errorType, fedId);
-        continue;
-      }
-
-      uint32_t rawId  = detId.RawId;
-      uint32_t rocIdInDetUnit = detId.rocInDet;
-      bool barrel = isBarrel(rawId);
-
-      uint32_t index = fedId * MAX_LINK * MAX_ROC + (link-1) * MAX_ROC + roc;
-      if (useQualityInfo) {
-
-          skipROC = Map->badRocs[index];
-          if (skipROC) continue;
-
-      }
-      skipROC = Map->modToUnp[index];
-      if (skipROC) continue;
-
-      uint32_t layer = 0;//, ladder =0;
-      int side = 0, panel = 0, module = 0;//disk = 0,blade = 0
-
-      if (barrel)
-      {
-        layer  = (rawId >> layerStartBit_) & layerMask_;
-        module = (rawId >> moduleStartBit_) & moduleMask_;
-        side   = (module < 5)? -1 : 1;
-      }
-      else {
-        // endcap ids
-        layer = 0;
-        panel = (rawId >> panelStartBit_) & panelMask_;
-        //disk  = (rawId >> diskStartBit_)  & diskMask_ ;
-        side  = (panel == 1)? -1 : 1;
-        //blade = (rawId>>bladeStartBit_) & bladeMask_;
-      }
-
-      // ***special case of layer to 1 be handled here
-      Pixel localPix;
-      if (layer == 1) {
-        uint32_t col = (ww >> COL_shift) & COL_mask;
-        uint32_t row = (ww >> ROW_shift) & ROW_mask;
-        localPix.row = row;
-        localPix.col = col;
-        if (includeErrors) {
-          if (not rocRowColIsValid(row, col)) {
-            uint32_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays
-            err->emplace_back(rawId, ww, error, fedId);
-            if(debug) printf("BPIX1  Error status: %i\n", error);
-            continue;
-          }
-        }
-      } else {
-        // ***conversion rules for dcol and pxid
-        uint32_t dcol = (ww >> DCOL_shift) & DCOL_mask;
-        uint32_t pxid = (ww >> PXID_shift) & PXID_mask;
-        uint32_t row  = numRowsInRoc - pxid/2;
-        uint32_t col  = dcol*2 + pxid%2;
-        localPix.row = row;
-        localPix.col = col;
-        if (includeErrors and not dcolIsValid(dcol, pxid)) {
-          uint32_t error = conversionError(fedId, 3, debug);
-          err->emplace_back(rawId, ww, error, fedId);
-          if(debug) printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc);
-          continue;
-        }
-      }
-
-      Pixel globalPix = frameConversion(barrel, side, layer, rocIdInDetUnit, localPix);
-      XX[gIndex]    = globalPix.row  ; // origin shifting by 1 0-159
-      YY[gIndex]    = globalPix.col ; // origin shifting by 1 0-415
-      ADC[gIndex]   = getADC(ww);
-      pdigi[gIndex] = pack(globalPix.row,globalPix.col,ADC[gIndex]);
-      moduleId[gIndex] = detId.moduleId;
-      rawIdArr[gIndex] = rawId;
-    } // end of if (gIndex < end)
-   } // end fake loop
-} // end of Raw to Digi kernel
-
-
-// kernel wrapper called from runRawToDigi_kernel
-void RawToDigi_wrapper(
-    context & c,
-    const SiPixelFedCablingMapGPU* cablingMapDevice, SiPixelGainForHLTonGPU * const ped,
-    const uint32_t wordCounter, uint32_t *word, const uint32_t fedCounter,  uint8_t *fedId_h,
-    bool convertADCtoElectrons, 
-    uint32_t * pdigi_h, uint32_t *rawIdArr_h,
-    GPU::SimpleVector<error_obj> *error_h, GPU::SimpleVector<error_obj> *error_h_tmp, error_obj *data_h,
-    uint16_t * adc_h, int32_t * clus_h,
-    bool useQualityInfo, bool includeErrors, bool debug, uint32_t & nModulesActive)
-{
-  const int threadsPerBlock = 512;
-  const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all
-
-
-  assert(0 == wordCounter%2);
-  // wordCounter is the total no of words in each event to be trasfered on device
-  cudaCheck(cudaMemcpyAsync(&c.word_d[0],     &word[0],     wordCounter*sizeof(uint32_t), cudaMemcpyDefault, c.stream));
-  cudaCheck(cudaMemcpyAsync(&c.fedId_d[0], &fedId_h[0], wordCounter*sizeof(uint8_t)/2, cudaMemcpyDefault, c.stream));
-    
-  constexpr uint32_t vsize = sizeof(GPU::SimpleVector<error_obj>);
-  constexpr uint32_t esize = sizeof(error_obj);
-  cudaCheck(cudaMemcpyAsync(c.error_d, error_h_tmp, vsize, cudaMemcpyDefault, c.stream));
-    
-  // Launch rawToDigi kernel
-  RawToDigi_kernel<<<blocks, threadsPerBlock, 0, c.stream>>>(
-      cablingMapDevice,
-      wordCounter,
-      c.word_d,
-      c.fedId_d,
-      c.xx_d, c.yy_d, c.adc_d,
-      c.pdigi_d,
-      c.rawIdArr_d,
-      c.moduleInd_d,
-      c.error_d,
-      useQualityInfo,
-      includeErrors,
-      debug);
-  cudaCheck(cudaGetLastError());
-
-  // copy data to host variable
-
-  cudaCheck(cudaMemcpyAsync(pdigi_h, c.pdigi_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, c.stream));
-  cudaCheck(cudaMemcpyAsync(rawIdArr_h, c.rawIdArr_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, c.stream));
-
-  if (includeErrors) {
-      cudaCheck(cudaMemcpyAsync(error_h, c.error_d, vsize, cudaMemcpyDefault, c.stream));
-      cudaStreamSynchronize(c.stream);
-      error_h->set_data(data_h);
-      int size = error_h->size();
-      cudaCheck(cudaMemcpyAsync(data_h, c.data_d, size*esize, cudaMemcpyDefault, c.stream));
-  }
-  // End  of Raw2Digi and passing data for cluserisation
-
- {
-   // clusterizer ...
-   using namespace gpuClustering;
-  int threadsPerBlock = 256;
-  int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock;
-
-
-  assert(ped);
-  gpuCalibPixel::calibDigis<<<blocks, threadsPerBlock, 0, c.stream>>>(
-               c.moduleInd_d,
-               c.xx_d, c.yy_d, c.adc_d,
-               ped,
-               wordCounter
-             );
-
-  cudaCheck(cudaGetLastError());
-
-  // calibrated adc
-  cudaCheck(cudaMemcpyAsync(adc_h, c.adc_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, c.stream));
-
-  /*
-  std::cout
-    << "CUDA countModules kernel launch with " << blocks
-    << " blocks of " << threadsPerBlock << " threads\n";
-  */
-
-  uint32_t nModules=0;
-  cudaCheck(cudaMemcpyAsync(c.moduleStart_d, &nModules, sizeof(uint32_t), cudaMemcpyDefault, c.stream));
-
-  countModules<<<blocks, threadsPerBlock, 0, c.stream>>>(c.moduleInd_d, c.moduleStart_d, c.clus_d, wordCounter);
-  cudaCheck(cudaGetLastError());
-
-  cudaCheck(cudaMemcpyAsync(&nModules, c.moduleStart_d, sizeof(uint32_t), cudaMemcpyDefault, c.stream));
-
-  // std::cout << "found " << nModules << " Modules active" << std::endl;
-
-  
-  threadsPerBlock = 256;
-  blocks = nModules;
-
-  /*
-  std::cout
-    << "CUDA findClus kernel launch with " << blocks
-    << " blocks of " << threadsPerBlock << " threads\n";
-  */
-
-  cudaCheck(cudaMemsetAsync(c.clusInModule_d, 0, (MaxNumModules)*sizeof(uint32_t),c.stream));
-
-  findClus<<<blocks, threadsPerBlock, 0, c.stream>>>(
-               c.moduleInd_d,
-               c.xx_d, c.yy_d, c.adc_d,
-	       c.moduleStart_d,
-	       c.clusInModule_d, c.moduleId_d,
-	       c.clus_d,
-	       c.debug_d,
-               wordCounter
-  );
-
-  // clusters
-  cudaCheck(cudaMemcpyAsync(clus_h, c.clus_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, c.stream));
-
-
-  cudaStreamSynchronize(c.stream);
-  cudaCheck(cudaGetLastError());
-
-  nModulesActive = nModules;
-
- } // end clusterizer scope
-
-}
diff --git a/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.h b/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.h
deleted file mode 100644
index 44a820db45aba..0000000000000
--- a/EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.h
+++ /dev/null
@@ -1,216 +0,0 @@
-/*Sushil Dubey, Shashi Dugad, TIFR
- *
- */
-
-#ifndef RAWTODIGIGPU_H
-#define RAWTODIGIGPU_H
-
-#include <cuda_runtime.h>
-
-#include "SiPixelFedCablingMapGPU.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
-#include<algorithm>
-
-const uint32_t layerStartBit_   = 20;
-const uint32_t ladderStartBit_  = 12;
-const uint32_t moduleStartBit_  = 2;
-
-const uint32_t panelStartBit_   = 10;
-const uint32_t diskStartBit_    = 18;
-const uint32_t bladeStartBit_   = 12;
-
-const uint32_t layerMask_       = 0xF;
-const uint32_t ladderMask_      = 0xFF;
-const uint32_t moduleMask_      = 0x3FF;
-const uint32_t panelMask_       = 0x3;
-const uint32_t diskMask_        = 0xF;
-const uint32_t bladeMask_       = 0x3F;
-
-const uint32_t LINK_bits        = 6;
-const uint32_t ROC_bits         = 5;
-const uint32_t DCOL_bits        = 5;
-const uint32_t PXID_bits        = 8;
-const uint32_t ADC_bits         = 8;
-// special for layer 1
-const uint32_t LINK_bits1       = 6;
-const uint32_t ROC_bits1        = 5;
-const uint32_t COL_bits1_l1     = 6;
-const uint32_t ROW_bits1_l1     = 7;
-const uint32_t OMIT_ERR_bits    = 1;
-
-const uint32_t maxROCIndex      = 8;
-const uint32_t numRowsInRoc     = 80;
-const uint32_t numColsInRoc     = 52;
-
-const uint32_t MAX_WORD = 2000;
-
-const uint32_t ADC_shift  = 0;
-const uint32_t PXID_shift = ADC_shift + ADC_bits;
-const uint32_t DCOL_shift = PXID_shift + PXID_bits;
-const uint32_t ROC_shift  = DCOL_shift + DCOL_bits;
-const uint32_t LINK_shift = ROC_shift + ROC_bits1;
-// special for layer 1 ROC
-const uint32_t ROW_shift = ADC_shift + ADC_bits;
-const uint32_t COL_shift = ROW_shift + ROW_bits1_l1;
-const uint32_t OMIT_ERR_shift = 20;
-
-const uint32_t LINK_mask = ~(~uint32_t(0) << LINK_bits1);
-const uint32_t ROC_mask  = ~(~uint32_t(0) << ROC_bits1);
-const uint32_t COL_mask  = ~(~uint32_t(0) << COL_bits1_l1);
-const uint32_t ROW_mask  = ~(~uint32_t(0) << ROW_bits1_l1);
-const uint32_t DCOL_mask = ~(~uint32_t(0) << DCOL_bits);
-const uint32_t PXID_mask = ~(~uint32_t(0) << PXID_bits);
-const uint32_t ADC_mask  = ~(~uint32_t(0) << ADC_bits);
-const uint32_t ERROR_mask = ~(~uint32_t(0) << ROC_bits1);
-const uint32_t OMIT_ERR_mask = ~(~uint32_t(0) << OMIT_ERR_bits);
-
-struct DetIdGPU {
-  uint32_t RawId;
-  uint32_t rocInDet;
-  uint32_t moduleId;
-};
-
-struct Pixel {
- uint32_t row;
- uint32_t col;
-};
-
-
-namespace gpudetails{
-
-class Packing {
-  public:
-  using PackedDigiType = uint32_t;
-    
-    // Constructor: pre-computes masks and shifts from field widths
-__host__ __device__
-inline
-    constexpr Packing(unsigned int row_w, unsigned int column_w,
-	    unsigned int time_w, unsigned int adc_w) :
-      row_width(row_w), column_width(column_w), adc_width(adc_w)
-      ,row_shift(0)
-      ,column_shift(row_shift + row_w)
-      ,time_shift(column_shift + column_w)
-      ,adc_shift(time_shift + time_w)
-      ,row_mask(~(~0U << row_w))
-      ,column_mask( ~(~0U << column_w))
-      ,time_mask(~(~0U << time_w))
-      ,adc_mask(~(~0U << adc_w))
-      ,rowcol_mask(~(~0U << (column_w+row_w)))
-      ,max_row(row_mask)
-      ,max_column(column_mask)
-      ,max_adc(adc_mask){}
-
-							   
-    uint32_t  row_width;
-    uint32_t  column_width;
-    uint32_t  adc_width;
-    
-    uint32_t  row_shift;
-    uint32_t  column_shift;
-    uint32_t  time_shift;
-    uint32_t  adc_shift;
-   
-    PackedDigiType row_mask;
-    PackedDigiType column_mask;
-    PackedDigiType time_mask;
-    PackedDigiType adc_mask;
-    PackedDigiType rowcol_mask;
-    
-    
-    uint32_t  max_row;
-    uint32_t  max_column;
-    uint32_t  max_adc;
-  };
-
-
-// const PixelChannelIdentifier::Packing PixelChannelIdentifier::thePacking( 11, 11, 0, 10); // row, col, time, adc
-
-
-__host__ __device__
-inline
-constexpr gpudetails::Packing packing() { return gpudetails::Packing(11, 11, 0, 10);}
-
-}
-
-// constexpr Packing thePacking = packing();
-
-__host__ __device__
-inline uint32_t pack(uint32_t row, uint32_t col, uint32_t adc) {
-  constexpr gpudetails::Packing thePacking = gpudetails::packing();
-  adc = std::min(adc, thePacking.max_adc);
-
-  return (row << thePacking.row_shift) |
-    (col << thePacking.column_shift) |
-    (adc << thePacking.adc_shift);
-
-}
-
-struct error_obj {
-  uint32_t rawId;
-  uint32_t word;
-  unsigned char errorType;
-  unsigned char fedId;
-  __host__ __device__ error_obj(uint32_t a_, uint32_t b_, unsigned char c_, unsigned char d_):
-    rawId(a_), word(b_), errorType(c_), fedId(d_) {}
-};
-
-// configuration and memory buffers alocated on the GPU
-struct context {
-  cudaStream_t stream;
-
-  uint32_t * word_d;
-  uint8_t * fedId_d;
-  uint32_t * pdigi_d;
-  uint16_t * xx_d;
-  uint16_t * yy_d;
-  uint16_t * adc_d;
-
-  uint16_t * moduleInd_d;
-  uint32_t * rawIdArr_d;
-
-  GPU::SimpleVector<error_obj> * error_d;
-  error_obj * data_d;
-    
-  // these are for the clusterizer (to be moved)
-  uint32_t * moduleStart_d;
-  int32_t *  clus_d;
-  uint32_t * clusInModule_d;
-  uint32_t * moduleId_d;
-
-  uint32_t * debug_d;
-};
-
-
-// wrapper function to call RawToDigi on the GPU from host side
-void RawToDigi_wrapper(context &, const SiPixelFedCablingMapGPU* cablingMapDevice,
-                       SiPixelGainForHLTonGPU * const ped,
-                       const uint32_t wordCounter, uint32_t *word,
-                       const uint32_t fedCounter,  uint8_t *fedId_h,
-                       bool convertADCtoElectrons, uint32_t * pdigi_h,
-                       uint32_t *rawIdArr_h, GPU::SimpleVector<error_obj> *error_h,
-                       GPU::SimpleVector<error_obj> *error_h_tmp, error_obj *data_h,
-                       uint16_t * adc_h, int32_t * clus_h,
-                       bool useQualityInfo, bool includeErrors, bool debug,
-                       uint32_t & nModulesActive);
-
-// void initCablingMap();
-context initDeviceMemory();
-void freeMemory(context &);
-
-// reference cmssw/RecoLocalTracker/SiPixelClusterizer
-// all are runtime const, should be specified in python _cfg.py
-struct ADCThreshold {
-  const int thePixelThreshold = 1000; // default Pixel threshold in electrons
-  const int theSeedThreshold = 1000; //seed thershold in electrons not used in our algo
-  const float theClusterThreshold = 4000; // Cluster threshold in electron
-  const int ConversionFactor = 65;  // adc to electron conversion factor
-
-  // following are the default value
-  // it should be i python script
-  const int theStackADC_  = 255; // the maximum adc count for stack layer
-  const int theFirstStack_ = 5; // the index of the fits stack layer
-  const double theElectronPerADCGain_ = 600; //ADC to electron conversion
-};
-
-#endif
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SealModule.cc b/EventFilter/SiPixelRawToDigi/plugins/SealModule.cc
deleted file mode 100644
index 30ed2eb6e8850..0000000000000
--- a/EventFilter/SiPixelRawToDigi/plugins/SealModule.cc
+++ /dev/null
@@ -1,8 +0,0 @@
-#include "FWCore/PluginManager/interface/ModuleDef.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-
-#include "SiPixelRawToDigi.h"
-#include "SiPixelDigiToRaw.h"
-
-DEFINE_FWK_MODULE(SiPixelDigiToRaw);
-DEFINE_FWK_MODULE(SiPixelRawToDigi);
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiToRaw.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiToRaw.cc
index cb0999d51c4db..cd6199e05ae9c 100644
--- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiToRaw.cc
+++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiToRaw.cc
@@ -147,5 +147,6 @@ void SiPixelDigiToRaw::produce( edm::Event& ev,
   
 }
 
-// -----------------------------------------------------------------------------
-
+// declare this as a framework plugin
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(SiPixelDigiToRaw);
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelFedCablingMapGPU.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelFedCablingMapGPU.cc
index e96af4789b1ce..16fa6281e5c7b 100644
--- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelFedCablingMapGPU.cc
+++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelFedCablingMapGPU.cc
@@ -1,3 +1,4 @@
+// C++ includes
 #include <algorithm>
 #include <iomanip>
 #include <iostream>
@@ -5,21 +6,51 @@
 #include <set>
 #include <vector>
 
+// CUDA includes
 #include <cuda_runtime.h>
 
-#include "SiPixelFedCablingMapGPU.h"
-
-#include "FWCore/MessageLogger/interface/MessageLogger.h"
+// CMSSW includes
 #include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h"
 #include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h"
+#include "CondFormats/SiPixelObjects/interface/SiPixelGainCalibrationForHLT.h"
+#include "CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h"
 #include "CondFormats/SiPixelObjects/interface/SiPixelQuality.h"
-#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
 #include "Geometry/CommonDetUnit/interface/GeomDetType.h"
+#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-#include "CondFormats/SiPixelObjects/interface/SiPixelGainCalibrationForHLT.h"
-#include "CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h"
+// local includes
+#include "SiPixelFedCablingMapGPU.h"
+
+void allocateCablingMap(SiPixelFedCablingMapGPU* & cablingMapHost, SiPixelFedCablingMapGPU* & cablingMapDevice) {
+  cudaCheck(cudaMallocHost((void**) & cablingMapHost, sizeof(SiPixelFedCablingMapGPU)));
+  cudaCheck(cudaMalloc((void**) & cablingMapDevice, sizeof(SiPixelFedCablingMapGPU)));
+  cudaCheck(cudaMalloc((void**) & cablingMapHost->fed,      MAX_SIZE_BYTE_INT));
+  cudaCheck(cudaMalloc((void**) & cablingMapHost->link,     MAX_SIZE_BYTE_INT));
+  cudaCheck(cudaMalloc((void**) & cablingMapHost->roc,      MAX_SIZE_BYTE_INT));
+  cudaCheck(cudaMalloc((void**) & cablingMapHost->RawId,    MAX_SIZE_BYTE_INT));
+  cudaCheck(cudaMalloc((void**) & cablingMapHost->rocInDet, MAX_SIZE_BYTE_INT));
+  cudaCheck(cudaMalloc((void**) & cablingMapHost->moduleId, MAX_SIZE_BYTE_INT));
+  cudaCheck(cudaMalloc((void**) & cablingMapHost->badRocs,  MAX_SIZE_BYTE_BOOL));
+  cudaCheck(cudaMalloc((void**) & cablingMapHost->modToUnp, MAX_SIZE_BYTE_BOOL));
+  cudaCheck(cudaMemcpy(cablingMapDevice, cablingMapHost, sizeof(SiPixelFedCablingMapGPU), cudaMemcpyDefault));
+}
+
+void deallocateCablingMap(SiPixelFedCablingMapGPU* cablingMapHost, SiPixelFedCablingMapGPU* cablingMapDevice) {
+  cudaCheck(cudaFree(cablingMapHost->fed));
+  cudaCheck(cudaFree(cablingMapHost->link));
+  cudaCheck(cudaFree(cablingMapHost->roc));
+  cudaCheck(cudaFree(cablingMapHost->RawId));
+  cudaCheck(cudaFree(cablingMapHost->rocInDet));
+  cudaCheck(cudaFree(cablingMapHost->moduleId));
+  cudaCheck(cudaFree(cablingMapHost->modToUnp));
+  cudaCheck(cudaFree(cablingMapHost->badRocs));
+  cudaCheck(cudaFree(cablingMapDevice));
+  cudaCheck(cudaFreeHost(cablingMapHost));
+}
 
-#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h"
 
 void processCablingMap(SiPixelFedCablingMap const& cablingMap,  TrackerGeometry const& trackerGeom,
                        SiPixelFedCablingMapGPU* cablingMapHost, SiPixelFedCablingMapGPU* cablingMapDevice, 
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelFedCablingMapGPU.h b/EventFilter/SiPixelRawToDigi/plugins/SiPixelFedCablingMapGPU.h
index 3cd35f54480d5..27355cb9d96c7 100644
--- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelFedCablingMapGPU.h
+++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelFedCablingMapGPU.h
@@ -1,10 +1,9 @@
-#ifndef SiPixelFedCablingMapGPU_h
-#define SiPixelFedCablingMapGPU_h
+#ifndef EventFilter_SiPixelRawToDigi_plugins_SiPixelFedCablingMapGPU_h
+#define EventFilter_SiPixelRawToDigi_plugins_SiPixelFedCablingMapGPU_h
 
+// C++ includes
 #include <set>
 
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-
 class SiPixelFedCablingMap;
 class SiPixelQuality;
 class TrackerGeometry;
@@ -34,39 +33,22 @@ struct SiPixelFedCablingMapGPU {
   unsigned char * badRocs;
 };
 
-inline
-void allocateCablingMap(SiPixelFedCablingMapGPU* & cablingMapHost, SiPixelFedCablingMapGPU* & cablingMapDevice) {
-  cudaCheck(cudaMallocHost((void**) & cablingMapHost, sizeof(SiPixelFedCablingMapGPU)));
-  cudaCheck(cudaMalloc((void**) & cablingMapDevice, sizeof(SiPixelFedCablingMapGPU)));
-  cudaCheck(cudaMalloc((void**) & cablingMapHost->fed,      MAX_SIZE_BYTE_INT));
-  cudaCheck(cudaMalloc((void**) & cablingMapHost->link,     MAX_SIZE_BYTE_INT));
-  cudaCheck(cudaMalloc((void**) & cablingMapHost->roc,      MAX_SIZE_BYTE_INT));
-  cudaCheck(cudaMalloc((void**) & cablingMapHost->RawId,    MAX_SIZE_BYTE_INT));
-  cudaCheck(cudaMalloc((void**) & cablingMapHost->rocInDet, MAX_SIZE_BYTE_INT));
-  cudaCheck(cudaMalloc((void**) & cablingMapHost->moduleId, MAX_SIZE_BYTE_INT));
-  cudaCheck(cudaMalloc((void**) & cablingMapHost->badRocs,  MAX_SIZE_BYTE_BOOL));
-  cudaCheck(cudaMalloc((void**) & cablingMapHost->modToUnp, MAX_SIZE_BYTE_BOOL));
-  cudaCheck(cudaMemcpy(cablingMapDevice, cablingMapHost, sizeof(SiPixelFedCablingMapGPU), cudaMemcpyDefault));
-}
-
-inline
-void deallocateCablingMap(SiPixelFedCablingMapGPU* cablingMapHost, SiPixelFedCablingMapGPU* cablingMapDevice) {
-  cudaCheck(cudaFree(cablingMapHost->fed));
-  cudaCheck(cudaFree(cablingMapHost->link));
-  cudaCheck(cudaFree(cablingMapHost->roc));
-  cudaCheck(cudaFree(cablingMapHost->RawId));
-  cudaCheck(cudaFree(cablingMapHost->rocInDet));
-  cudaCheck(cudaFree(cablingMapHost->moduleId));
-  cudaCheck(cudaFree(cablingMapHost->modToUnp));
-  cudaCheck(cudaFree(cablingMapHost->badRocs));
-  cudaCheck(cudaFree(cablingMapDevice));
-  cudaCheck(cudaFreeHost(cablingMapHost));
-}
+void allocateCablingMap(SiPixelFedCablingMapGPU* & cablingMapHost,
+                        SiPixelFedCablingMapGPU* & cablingMapDevice);
 
-void processCablingMap(SiPixelFedCablingMap const& cablingMap, TrackerGeometry const& trackerGeom,
-                       SiPixelFedCablingMapGPU* cablingMapHost, SiPixelFedCablingMapGPU* cablingMapDevice, const SiPixelQuality* badPixelInfo, std::set<unsigned int> const& modules);
+void deallocateCablingMap(SiPixelFedCablingMapGPU* cablingMapHost,
+                          SiPixelFedCablingMapGPU* cablingMapDevice);
 
-void processGainCalibration(SiPixelGainCalibrationForHLT const & gains, TrackerGeometry const& trackerGeom, SiPixelGainForHLTonGPU * & gainsOnGPU, SiPixelGainForHLTonGPU_DecodingStructure * & gainDataOnGPU);
+void processCablingMap(SiPixelFedCablingMap const& cablingMap,
+                       TrackerGeometry const& trackerGeom,
+                       SiPixelFedCablingMapGPU* cablingMapHost,
+                       SiPixelFedCablingMapGPU* cablingMapDevice,
+                       SiPixelQuality const* badPixelInfo,
+                       std::set<unsigned int> const& modules);
 
-#endif // SiPixelFedCablingMapGPU_h
+void processGainCalibration(SiPixelGainCalibrationForHLT const& gains,
+                            TrackerGeometry const& trackerGeom,
+                            SiPixelGainForHLTonGPU * & gainsOnGPU,
+                            SiPixelGainForHLTonGPU_DecodingStructure * & gainDataOnGPU);
 
+#endif // EventFilter_SiPixelRawToDigi_plugins_SiPixelFedCablingMapGPU_h
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigi.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigi.cc
index 9b93f63741df4..dc3c9e4c68f68 100644
--- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigi.cc
+++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigi.cc
@@ -324,3 +324,7 @@ void SiPixelRawToDigi::produce( edm::Event& ev,
     ev.put(std::move(disabled_channelcollection));
   }
 }
+
+// declare this as a framework plugin
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(SiPixelRawToDigi);
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.cc
index e9f4d3654f4c9..92089d50cb58e 100644
--- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.cc
+++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.cc
@@ -1,17 +1,21 @@
 // This code is an entry point for GPU based pixel track reconstruction for HLT
 // Modified by Sushil and Shashi for this purpose July-2017
 
+// C++ includes
 #include <string>
 #include <chrono>
 #include <iostream>
 #include <fstream>
 
+// CUDA kincludes
 #include <cuda.h>
 #include <cuda_runtime.h>
 
+// ROOT includes
 #include <TH1D.h>
 #include <TFile.h>
 
+// CMSSW includes
 #include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h"
 #include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h"
 #include "CondFormats/SiPixelObjects/interface/SiPixelQuality.h"
@@ -21,34 +25,28 @@
 #include "DataFormats/FEDRawData/interface/FEDNumbering.h"
 #include "DataFormats/FEDRawData/interface/FEDRawData.h"
 #include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
+#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h"
 #include "DataFormats/SiPixelDetId/interface/PixelFEDChannel.h"
 #include "DataFormats/SiPixelDigi/interface/PixelDigi.h"
 #include "DataFormats/SiPixelRawData/interface/SiPixelRawDataError.h"
-
-#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h"
-
-
-
-#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
-#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
-
 #include "EventFilter/SiPixelRawToDigi/interface/PixelDataFormatter.h"
 #include "EventFilter/SiPixelRawToDigi/interface/PixelUnpackingRegions.h"
 #include "FWCore/Framework/interface/ConsumesCollector.h"
 #include "FWCore/Framework/interface/ESHandle.h"
 #include "FWCore/Framework/interface/ESTransientHandle.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
 #include "FWCore/MessageLogger/interface/MessageLogger.h"
 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
 #include "FWCore/PluginManager/interface/ModuleDef.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
+#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
+#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
-#include "EventInfoGPU.h"
-#include "RawToDigiGPU.h"
+// local includes
 #include "SiPixelFedCablingMapGPU.h"
 #include "SiPixelRawToDigiGPU.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-
+#include "SiPixelRawToDigiGPUKernel.h"
 
 namespace {
 struct AccretionCluster {
@@ -159,7 +157,7 @@ SiPixelRawToDigiGPU::SiPixelRawToDigiGPU( const edm::ParameterSet& conf )
   // device copy of GPU friendly cablng map
   allocateCablingMap(cablingMapGPUHost_, cablingMapGPUDevice_);
 
-  int WSIZE = MAX_FED * MAX_WORD * sizeof(unsigned int);
+  int WSIZE = MAX_FED * pixelgpudetails::MAX_WORD * sizeof(unsigned int);
   cudaMallocHost(&word,       sizeof(unsigned int)*WSIZE);
   cudaMallocHost(&fedId_h,    sizeof(unsigned char)*WSIZE);
 
@@ -170,21 +168,21 @@ SiPixelRawToDigiGPU::SiPixelRawToDigiGPU( const edm::ParameterSet& conf )
   cudaMallocHost(&adc_h, sizeof(uint16_t)*WSIZE);
   cudaMallocHost(&clus_h, sizeof(int32_t)*WSIZE);
 
-  uint32_t vsize = sizeof(GPU::SimpleVector<error_obj>);
-  uint32_t esize = sizeof(error_obj);
+  uint32_t vsize = sizeof(GPU::SimpleVector<pixelgpudetails::error_obj>);
+  uint32_t esize = sizeof(pixelgpudetails::error_obj);
   cudaCheck(cudaMallocHost(&error_h, vsize));
   cudaCheck(cudaMallocHost(&error_h_tmp, vsize));
-  cudaCheck(cudaMallocHost(&data_h, MAX_FED*MAX_WORD*esize));
+  cudaCheck(cudaMallocHost(&data_h, MAX_FED*pixelgpudetails::MAX_WORD*esize));
 
   // allocate memory for RawToDigi on GPU
-  context_ = initDeviceMemory();
+  context_ = pixelgpudetails::initDeviceMemory();
 
-  new (error_h) GPU::SimpleVector<error_obj>(MAX_FED*MAX_WORD, data_h);
-  new (error_h_tmp) GPU::SimpleVector<error_obj>(MAX_FED*MAX_WORD, context_.data_d);
+  new (error_h) GPU::SimpleVector<pixelgpudetails::error_obj>(MAX_FED*pixelgpudetails::MAX_WORD, data_h);
+  new (error_h_tmp) GPU::SimpleVector<pixelgpudetails::error_obj>(MAX_FED*pixelgpudetails::MAX_WORD, context_.data_d);
   assert(error_h->size() == 0);
-  assert(error_h->capacity() == static_cast<int>(MAX_FED*MAX_WORD));
+  assert(error_h->capacity() == static_cast<int>(MAX_FED*pixelgpudetails::MAX_WORD));
   assert(error_h_tmp->size() == 0);
-  assert(error_h_tmp->capacity() == static_cast<int>(MAX_FED*MAX_WORD));
+  assert(error_h_tmp->capacity() == static_cast<int>(MAX_FED*pixelgpudetails::MAX_WORD));
 }
 
 // -----------------------------------------------------------------------------
@@ -479,7 +477,7 @@ SiPixelRawToDigiGPU::produce( edm::Event& ev, const edm::EventSetup& es)
 
   auto size = error_h->size();
   for (auto i = 0; i < size; i++) {
-    error_obj err = (*error_h)[i];
+    pixelgpudetails::error_obj err = (*error_h)[i];
     if (err.errorType != 0) {
         SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200);
         errors[err.rawId].push_back(error);
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.h b/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.h
index 2a4e0d1df3ccd..76f7f7f256e70 100644
--- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.h
+++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPU.h
@@ -6,32 +6,30 @@
  *  for pixel subdetector
  */
 
-#include "FWCore/Framework/interface/ESWatcher.h"
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTService.h"
 #include "CondFormats/DataRecord/interface/SiPixelFedCablingMapRcd.h"
 #include "CondFormats/DataRecord/interface/SiPixelQualityRcd.h"
 #include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h"
-
-#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTService.h"
 #include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
-
-
 #include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/Framework/interface/ESWatcher.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "FWCore/Utilities/interface/CPUTimer.h"
-#include "RawToDigiGPU.h"
+#include "SiPixelRawToDigiGPUKernel.h"
 
+class PixelUnpackingRegions;
 class SiPixelFedCablingTree;
 class SiPixelFedCabling;
 class SiPixelQuality;
 class TH1D;
-class PixelUnpackingRegions;
-
 class SiPixelGainForHLTonGPU;
 struct SiPixelGainForHLTonGPU_DecodingStructure;
-class SiPixelRawToDigiGPU : public edm::stream::EDProducer<> {
+
+class SiPixelRawToDigiGPU : public edm::stream::EDProducer<>
+{
 public:
 
   /// ctor
@@ -77,12 +75,12 @@ class SiPixelRawToDigiGPU : public edm::stream::EDProducer<> {
   // to store the output
   uint32_t *pdigi_h, *rawIdArr_h;                   // host copy of output
   uint16_t * adc_h; int32_t * clus_h; // host copy of calib&clus output
-  error_obj *data_h = nullptr;
-  GPU::SimpleVector<error_obj> *error_h = nullptr;
-  GPU::SimpleVector<error_obj> *error_h_tmp = nullptr;
+  pixelgpudetails::error_obj *data_h = nullptr;
+  GPU::SimpleVector<pixelgpudetails::error_obj> *error_h = nullptr;
+  GPU::SimpleVector<pixelgpudetails::error_obj> *error_h_tmp = nullptr;
 
   // configuration and memory buffers alocated on the GPU
-  context context_;
+  pixelgpudetails::context context_;
   SiPixelFedCablingMapGPU * cablingMapGPUHost_;
   SiPixelFedCablingMapGPU * cablingMapGPUDevice_;
 
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPUKernel.cu b/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPUKernel.cu
new file mode 100644
index 0000000000000..d8cfc643f4461
--- /dev/null
+++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPUKernel.cu
@@ -0,0 +1,696 @@
+/* Sushil Dubey, Shashi Dugad, TIFR, July 2017
+ *
+ * File Name: RawToDigiGPU.cu
+ * Description: It converts Raw data into Digi Format on GPU
+ * then it converts adc -> electron and
+ * applies the adc threshold to needed for clustering
+ * Finaly the Output of RawToDigi data is given to pixelClusterizer
+ *
+**/
+
+// C++ includes
+#include <cassert>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <string>
+
+// CUDA includes
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/sort.h>
+#include <thrust/unique.h>
+#include <thrust/execution_policy.h>
+
+// CMSSW includes
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuCalibPixel.h"
+#include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
+
+// local includes
+#include "SiPixelFedCablingMapGPU.h"
+#include "SiPixelRawToDigiGPUKernel.h"
+
+namespace pixelgpudetails {
+
+  pixelgpudetails::context initDeviceMemory() {
+
+    using namespace gpuClustering;
+    pixelgpudetails::context c;
+
+    // Number of words for all the feds
+    constexpr uint32_t MAX_WORD08_SIZE = MAX_FED * pixelgpudetails::MAX_WORD  * sizeof(uint8_t);
+    constexpr uint32_t MAX_WORD32_SIZE = MAX_FED * pixelgpudetails::MAX_WORD  * sizeof(uint32_t);
+    constexpr uint32_t MAX_WORD16_SIZE = MAX_FED * pixelgpudetails::MAX_WORD  * sizeof(uint16_t);
+    constexpr uint32_t vsize = sizeof(GPU::SimpleVector<pixelgpudetails::error_obj>);
+    constexpr uint32_t esize = sizeof(pixelgpudetails::error_obj);
+    constexpr uint32_t MAX_ERROR_SIZE  = MAX_FED * pixelgpudetails::MAX_WORD * esize;
+
+    cudaCheck(cudaMalloc((void**) & c.word_d,        MAX_WORD32_SIZE));
+    cudaCheck(cudaMalloc((void**) & c.fedId_d,       MAX_WORD08_SIZE));
+    cudaCheck(cudaMalloc((void**) & c.pdigi_d,       MAX_WORD32_SIZE)); // to store thepacked digi
+    cudaCheck(cudaMalloc((void**) & c.xx_d,          MAX_WORD16_SIZE)); // to store the x and y coordinate
+    cudaCheck(cudaMalloc((void**) & c.yy_d,          MAX_WORD16_SIZE));
+    cudaCheck(cudaMalloc((void**) & c.adc_d,         MAX_WORD16_SIZE));
+
+    cudaCheck(cudaMalloc((void**) & c.moduleInd_d,   MAX_WORD16_SIZE));
+    cudaCheck(cudaMalloc((void**) & c.rawIdArr_d,    MAX_WORD32_SIZE));
+    cudaCheck(cudaMalloc((void**) & c.error_d,       vsize));
+    cudaCheck(cudaMalloc((void**) & c.data_d,        MAX_ERROR_SIZE));
+
+    // for the clusterizer
+    cudaCheck(cudaMalloc((void**) & c.clus_d,        MAX_WORD32_SIZE)); // cluser index in module
+
+    cudaCheck(cudaMalloc((void**) & c.moduleStart_d,   (MaxNumModules+1)*sizeof(uint32_t) ));
+    cudaCheck(cudaMalloc((void**) & c.clusInModule_d,   (MaxNumModules)*sizeof(uint32_t) ));
+    cudaCheck(cudaMalloc((void**) & c.moduleId_d,   (MaxNumModules)*sizeof(uint32_t) ));
+
+    cudaCheck(cudaMalloc((void**) & c.debug_d,        MAX_WORD32_SIZE));
+
+    // create a CUDA stream
+    cudaCheck(cudaStreamCreate(&c.stream));
+
+    return c;
+  }
+
+
+  void freeMemory(pixelgpudetails::context & c) {
+    // free the GPU memory
+    cudaCheck(cudaFree(c.word_d));
+    cudaCheck(cudaFree(c.fedId_d));
+    cudaCheck(cudaFree(c.pdigi_d));
+    cudaCheck(cudaFree(c.xx_d));
+    cudaCheck(cudaFree(c.yy_d));
+    cudaCheck(cudaFree(c.adc_d));
+    cudaCheck(cudaFree(c.moduleInd_d));
+    cudaCheck(cudaFree(c.rawIdArr_d));
+    cudaCheck(cudaFree(c.error_d));
+    cudaCheck(cudaFree(c.data_d));
+
+   // these are for the clusterizer (to be moved)
+   cudaCheck(cudaFree(c.moduleStart_d));
+   cudaCheck(cudaFree(c.clus_d));
+   cudaCheck(cudaFree(c.clusInModule_d));
+   cudaCheck(cudaFree(c.moduleId_d));
+   cudaCheck(cudaFree(c.debug_d));
+
+
+    // destroy the CUDA stream
+    cudaCheck(cudaStreamDestroy(c.stream));
+  }
+
+
+  __device__ uint32_t getLink(uint32_t ww)  {
+    return ((ww >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask);
+  }
+
+
+  __device__ uint32_t getRoc(uint32_t ww) {
+    return ((ww >> pixelgpudetails::ROC_shift ) & pixelgpudetails::ROC_mask);
+  }
+
+
+  __device__ uint32_t getADC(uint32_t ww) {
+    return ((ww >> pixelgpudetails::ADC_shift) & pixelgpudetails::ADC_mask);
+  }
+
+
+  __device__ bool isBarrel(uint32_t rawId) {
+    return (1==((rawId>>25)&0x7));
+  }
+
+
+
+  __device__ pixelgpudetails::DetIdGPU getRawId(const SiPixelFedCablingMapGPU * Map, uint32_t fed, uint32_t link, uint32_t roc) {
+    uint32_t index = fed * MAX_LINK * MAX_ROC + (link-1) * MAX_ROC + roc;
+    pixelgpudetails::DetIdGPU detId = { Map->RawId[index], Map->rocInDet[index], Map->moduleId[index] };
+    return detId;
+  }
+
+
+  //reference http://cmsdoxygen.web.cern.ch/cmsdoxygen/CMSSW_9_2_0/doc/html/dd/d31/FrameConversion_8cc_source.html
+  //http://cmslxr.fnal.gov/source/CondFormats/SiPixelObjects/src/PixelROC.cc?v=CMSSW_9_2_0#0071
+  // Convert local pixel to pixelgpudetails::global pixel
+  __device__ pixelgpudetails::Pixel frameConversion(bool bpix, int side, uint32_t layer, uint32_t rocIdInDetUnit, pixelgpudetails::Pixel local) {
+
+    int slopeRow  = 0, slopeCol = 0;
+    int rowOffset = 0, colOffset = 0;
+
+    if (bpix) {
+
+      if (side == -1 && layer != 1) { // -Z side: 4 non-flipped modules oriented like 'dddd', except Layer 1
+        if (rocIdInDetUnit < 8) {
+          slopeRow = 1;
+          slopeCol = -1;
+          rowOffset = 0;
+          colOffset = (8-rocIdInDetUnit)*pixelgpudetails::numColsInRoc-1;
+        }
+        else {
+          slopeRow  = -1;
+          slopeCol  = 1;
+          rowOffset = 2*pixelgpudetails::numRowsInRoc-1;
+          colOffset = (rocIdInDetUnit-8)*pixelgpudetails::numColsInRoc;
+        } // if roc
+      }
+      else { // +Z side: 4 non-flipped modules oriented like 'pppp', but all 8 in layer1
+        if (rocIdInDetUnit < 8) {
+          slopeRow  = -1;
+          slopeCol  =  1;
+          rowOffset = 2*pixelgpudetails::numRowsInRoc-1;
+          colOffset = rocIdInDetUnit * pixelgpudetails::numColsInRoc;
+        }
+        else {
+          slopeRow  = 1;
+          slopeCol  = -1;
+          rowOffset = 0;
+          colOffset = (16-rocIdInDetUnit)*pixelgpudetails::numColsInRoc-1;
+        }
+      }
+
+    }
+    else { // fpix
+      if (side==-1) { // pannel 1
+        if (rocIdInDetUnit < 8) {
+          slopeRow = 1;
+          slopeCol = -1;
+          rowOffset = 0;
+          colOffset = (8-rocIdInDetUnit)*pixelgpudetails::numColsInRoc-1;
+        }
+        else {
+          slopeRow = -1;
+          slopeCol = 1;
+          rowOffset = 2*pixelgpudetails::numRowsInRoc-1;
+          colOffset = (rocIdInDetUnit-8)*pixelgpudetails::numColsInRoc;
+        }
+      }
+      else { // pannel 2
+        if (rocIdInDetUnit < 8) {
+          slopeRow = 1;
+          slopeCol = -1;
+          rowOffset = 0;
+          colOffset = (8-rocIdInDetUnit)*pixelgpudetails::numColsInRoc-1;
+        }
+        else {
+          slopeRow = -1;
+          slopeCol = 1;
+          rowOffset = 2*pixelgpudetails::numRowsInRoc-1;
+          colOffset = (rocIdInDetUnit-8)*pixelgpudetails::numColsInRoc;
+        }
+
+      } // side
+
+    }
+
+    uint32_t gRow = rowOffset+slopeRow*local.row;
+    uint32_t gCol = colOffset+slopeCol*local.col;
+    //printf("Inside frameConversion row: %u, column: %u\n",gRow, gCol);
+    pixelgpudetails::Pixel global = {gRow, gCol};
+    return global;
+  }
+
+
+  __device__ uint32_t conversionError(uint32_t fedId, uint32_t status, bool debug = false)
+  {
+
+    uint32_t errorType = 0;
+
+    // debug = true;
+
+    switch (status) {
+        case(1) : {
+          if (debug) printf("Error in Fed: %i, invalid channel Id (errorType = 35\n)", fedId );
+          errorType = 35;
+          break;
+        }
+        case(2) : {
+          if (debug) printf("Error in Fed: %i, invalid ROC Id (errorType = 36)\n", fedId);
+          errorType = 36;
+          break;
+        }
+        case(3) : {
+          if (debug) printf("Error in Fed: %i, invalid dcol/pixel value (errorType = 37)\n", fedId);
+          errorType = 37;
+          break;
+        }
+        case(4) : {
+          if (debug) printf("Error in Fed: %i, dcol/pixel read out of order (errorType = 38)\n", fedId);
+          errorType = 38;
+          break;
+        }
+        default: if (debug) printf("Cabling check returned unexpected result, status = %i\n", status);
+    };
+
+    return errorType;
+
+  }
+
+
+  __device__ bool rocRowColIsValid(uint32_t rocRow, uint32_t rocCol)
+  {
+      uint32_t numRowsInRoc = 80;
+      uint32_t numColsInRoc = 52;
+
+      /// row and collumn in ROC representation
+      return ((rocRow < numRowsInRoc) & (rocCol < numColsInRoc));
+  }
+
+
+  __device__ bool dcolIsValid(uint32_t dcol, uint32_t pxid)
+  {
+      return ((dcol < 26) &  (2 <= pxid) & (pxid < 162));
+  }
+
+
+  __device__ uint32_t checkROC(uint32_t errorWord, uint32_t fedId, uint32_t link, const SiPixelFedCablingMapGPU *Map, bool debug = false)
+  {
+
+   int errorType = (errorWord >> pixelgpudetails::ROC_shift) & pixelgpudetails::ERROR_mask;
+   if (errorType < 25) return false;
+   bool errorFound = false;
+
+   switch (errorType) {
+      case(25) : {
+       errorFound = true;
+       uint32_t index = fedId * MAX_LINK * MAX_ROC + (link-1) * MAX_ROC + 1;
+       if (index > 1 && index <= Map->size){
+         if (!(link == Map->link[index] && 1 == Map->roc[index])) errorFound = false;
+       }
+       if (debug&errorFound) printf("Invalid ROC = 25 found (errorType = 25)\n");
+       break;
+     }
+     case(26) : {
+       if (debug) printf("Gap word found (errorType = 26)\n");
+       errorFound = true;
+       break;
+     }
+     case(27) : {
+       if (debug) printf("Dummy word found (errorType = 27)\n");
+       errorFound = true;
+       break;
+     }
+     case(28) : {
+       if (debug) printf("Error fifo nearly full (errorType = 28)\n");
+       errorFound = true;
+       break;
+     }
+     case(29) : {
+       if (debug) printf("Timeout on a channel (errorType = 29)\n");
+       if ((errorWord >> pixelgpudetails::OMIT_ERR_shift) & pixelgpudetails::OMIT_ERR_mask) {
+         if (debug) printf("...first errorType=29 error, this gets masked out\n");
+       }
+       errorFound = true;
+       break;
+     }
+     case(30) : {
+       if (debug) printf("TBM error trailer (errorType = 30)\n");
+       int StateMatch_bits = 4;
+       int StateMatch_shift = 8;
+       uint32_t StateMatch_mask = ~(~uint32_t(0) << StateMatch_bits);
+       int StateMatch = (errorWord >> StateMatch_shift) & StateMatch_mask;
+       if ( StateMatch != 1 && StateMatch != 8 ) {
+         if (debug) printf("FED error 30 with unexpected State Bits (errorType = 30)\n");
+       }
+       if ( StateMatch == 1 ) errorType = 40; // 1=Overflow -> 40, 8=number of ROCs -> 30
+       errorFound = true;
+       break;
+     }
+     case(31) : {
+       if (debug) printf("Event number error (errorType = 31)\n");
+       errorFound = true;
+       break;
+     }
+     default: errorFound = false;
+
+   };
+
+   return errorFound? errorType : 0;
+
+  }
+
+
+  __device__ uint32_t getErrRawID(uint32_t fedId, uint32_t errWord, uint32_t errorType, const SiPixelFedCablingMapGPU *Map, bool debug = false)
+  {
+
+    uint32_t rID = 0xffffffff;
+
+    switch (errorType) {
+      case  25 : case  30 : case  31 : case  36 : case 40 : {
+        //set dummy values for cabling just to get detId from link
+        //cabling.dcol = 0;
+        //cabling.pxid = 2;
+        uint32_t roc  = 1;
+        uint32_t link = (errWord >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask;
+
+        uint32_t rID_temp = getRawId(Map, fedId, link, roc).RawId;
+        if(rID_temp != 9999) rID = rID_temp;
+        break;
+      }
+      case  29 : {
+        int chanNmbr = 0;
+        const int DB0_shift = 0;
+        const int DB1_shift = DB0_shift + 1;
+        const int DB2_shift = DB1_shift + 1;
+        const int DB3_shift = DB2_shift + 1;
+        const int DB4_shift = DB3_shift + 1;
+        const uint32_t DataBit_mask = ~(~uint32_t(0) << 1);
+
+        int CH1 = (errWord >> DB0_shift) & DataBit_mask;
+        int CH2 = (errWord >> DB1_shift) & DataBit_mask;
+        int CH3 = (errWord >> DB2_shift) & DataBit_mask;
+        int CH4 = (errWord >> DB3_shift) & DataBit_mask;
+        int CH5 = (errWord >> DB4_shift) & DataBit_mask;
+        int BLOCK_bits      = 3;
+        int BLOCK_shift     = 8;
+        uint32_t BLOCK_mask = ~(~uint32_t(0) << BLOCK_bits);
+        int BLOCK = (errWord >> BLOCK_shift) & BLOCK_mask;
+        int localCH = 1*CH1+2*CH2+3*CH3+4*CH4+5*CH5;
+        if (BLOCK%2==0) chanNmbr=(BLOCK/2)*9+localCH;
+        else chanNmbr = ((BLOCK-1)/2)*9+4+localCH;
+        if ((chanNmbr < 1)||(chanNmbr > 36)) break;  // signifies unexpected result
+
+        // set dummy values for cabling just to get detId from link if in Barrel
+        //cabling.dcol = 0;
+        //cabling.pxid = 2;
+        uint32_t roc  = 1;
+        uint32_t link = chanNmbr;
+        uint32_t rID_temp = getRawId(Map, fedId, link, roc).RawId;
+        if(rID_temp != 9999) rID = rID_temp;
+        break;
+      }
+      case  37 : case  38: {
+        //cabling.dcol = 0;
+        //cabling.pxid = 2;
+        uint32_t roc  = (errWord >> pixelgpudetails::ROC_shift) & pixelgpudetails::ROC_mask;
+        uint32_t link = (errWord >> pixelgpudetails::LINK_shift) & pixelgpudetails::LINK_mask;
+        uint32_t rID_temp = getRawId(Map, fedId, link, roc).RawId;
+        if(rID_temp != 9999) rID = rID_temp;
+        break;
+      }
+
+    default : break;
+
+    };
+
+    return rID;
+
+  }
+
+
+  /*----------
+  * Name: applyADCthreshold_kernel()
+  * Desc: converts adc count to electrons and then applies the
+  * threshold on each channel.
+  * make pixel to 0 if it is below the threshold
+  * Input: xx_d[], yy_d[], layer_d[], wordCounter, adc[], ADCThreshold
+  *-----------
+  * Output: xx_adc[], yy_adc[] with pixel threshold applied
+  */
+  // kernel to apply adc threshold on the channels
+
+
+  // Felice: gains and pedestals are not the same for each pixel. This code should be rewritten to take
+  // in account local gains/pedestals
+  // __global__ void applyADCthreshold_kernel(const uint32_t *xx_d, const uint32_t *yy_d, const uint32_t *layer_d, uint32_t *adc, const uint32_t wordCounter,
+  //  const ADCThreshold adcThreshold, uint32_t *xx_adc, uint32_t *yy_adc ) {
+  //   int tid = threadIdx.x;
+  //   int gIndex = blockDim.x*blockIdx.x+tid;
+  //   if (gIndex<wordCounter) {
+  //     uint32_t adcOld = adc[gIndex];
+  //     const float gain = adcThreshold.theElectronPerADCGain_; // default: 1 ADC = 135 electrons
+  //     const float pedestal = 0; //
+  //     int adcNew = int(adcOld*gain+pedestal);
+  //     // rare chance of entering into the if ()
+  //     if (layer_d[gIndex]>=adcThreshold.theFirstStack_) {
+  //       if (adcThreshold.theStackADC_==1 && adcOld==1) {
+  //         adcNew = int(255*135); // Arbitrarily use overflow value.
+  //       }
+  //       if (adcThreshold.theStackADC_ >1 && adcThreshold.theStackADC_!=255 && adcOld>=1){
+  //         adcNew = int((adcOld-1) * gain * 255/float(adcThreshold.theStackADC_-1));
+  //       }
+  //     }
+  //
+  //     if (adcNew >adcThreshold.thePixelThreshold ) {
+  //       xx_adc[gIndex]=xx_d[gIndex];
+  //       yy_adc[gIndex]=yy_d[gIndex];
+  //     }
+  //     else {
+  //       xx_adc[gIndex]=0; // 0: dead pixel
+  //       yy_adc[gIndex]=0;
+  //     }
+  //     adc[gIndex] = adcNew;
+  //   }
+  // }
+
+
+  // Kernel to perform Raw to Digi conversion
+  __global__ void RawToDigi_kernel(const SiPixelFedCablingMapGPU *Map, const uint32_t wordCounter, const uint32_t *Word, const uint8_t *fedIds,
+                                   uint16_t * XX, uint16_t * YY, uint16_t * ADC,
+                                   uint32_t * pdigi, uint32_t *rawIdArr, uint16_t * moduleId,
+                                   GPU::SimpleVector<pixelgpudetails::error_obj> *err,
+                                   bool useQualityInfo, bool includeErrors, bool debug)
+  {
+    uint32_t blockId  = blockIdx.x;
+    uint32_t threadId  = threadIdx.x;
+
+    bool skipROC = false;
+    //if (threadId==0) printf("Event: %u blockId: %u start: %u end: %u\n", eventno, blockId, begin, end);
+    
+    for (int aaa=0; aaa<1; ++aaa) {  // too many coninue below.... (to be fixed)
+      auto gIndex = threadId + blockId*blockDim.x;
+      if (gIndex < wordCounter) {
+     
+        uint32_t fedId = fedIds[gIndex/2]; // +1200;    
+
+        // initialize (too many coninue below)
+        pdigi[gIndex]  = 0;
+        rawIdArr[gIndex] = 0;
+        moduleId[gIndex] = 9999; 
+
+        uint32_t ww = Word[gIndex]; // Array containing 32 bit raw data
+        if (ww == 0) {
+          //noise and dead channels are ignored
+          XX[gIndex]    = 0;  // 0 is an indicator of a noise/dead channel
+          YY[gIndex]    = 0; // skip these pixels during clusterization
+          ADC[gIndex]   = 0;
+          continue ; // 0: bad word
+        }
+
+        uint32_t link  = getLink(ww);            // Extract link
+        uint32_t roc   = getRoc(ww);             // Extract Roc in link
+        pixelgpudetails::DetIdGPU detId = getRawId(Map, fedId, link, roc);
+
+        uint32_t errorType = checkROC(ww, fedId, link, Map, debug);
+        skipROC = (roc < pixelgpudetails::maxROCIndex) ? false : (errorType != 0);
+        if (includeErrors and skipROC)
+        {
+          uint32_t rID = getErrRawID(fedId, ww, errorType, Map, debug);
+          err->emplace_back(rID, ww, errorType, fedId);
+          continue;
+        }
+
+        uint32_t rawId  = detId.RawId;
+        uint32_t rocIdInDetUnit = detId.rocInDet;
+        bool barrel = isBarrel(rawId);
+
+        uint32_t index = fedId * MAX_LINK * MAX_ROC + (link-1) * MAX_ROC + roc;
+        if (useQualityInfo) {
+
+            skipROC = Map->badRocs[index];
+            if (skipROC) continue;
+
+        }
+        skipROC = Map->modToUnp[index];
+        if (skipROC) continue;
+
+        uint32_t layer = 0;//, ladder =0;
+        int side = 0, panel = 0, module = 0;//disk = 0,blade = 0
+
+        if (barrel)
+        {
+          layer  = (rawId >> pixelgpudetails::layerStartBit) & pixelgpudetails::layerMask;
+          module = (rawId >> pixelgpudetails::moduleStartBit) & pixelgpudetails::moduleMask;
+          side   = (module < 5)? -1 : 1;
+        }
+        else {
+          // endcap ids
+          layer = 0;
+          panel = (rawId >> pixelgpudetails::panelStartBit) & pixelgpudetails::panelMask;
+          //disk  = (rawId >> diskStartBit_)  & diskMask_ ;
+          side  = (panel == 1)? -1 : 1;
+          //blade = (rawId>>bladeStartBit_) & bladeMask_;
+        }
+
+        // ***special case of layer to 1 be handled here
+        pixelgpudetails::Pixel localPix;
+        if (layer == 1) {
+          uint32_t col = (ww >> pixelgpudetails::COL_shift) & pixelgpudetails::COL_mask;
+          uint32_t row = (ww >> pixelgpudetails::ROW_shift) & pixelgpudetails::ROW_mask;
+          localPix.row = row;
+          localPix.col = col;
+          if (includeErrors) {
+            if (not rocRowColIsValid(row, col)) {
+              uint32_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays
+              err->emplace_back(rawId, ww, error, fedId);
+              if(debug) printf("BPIX1  Error status: %i\n", error);
+              continue;
+            }
+          }
+        } else {
+          // ***conversion rules for dcol and pxid
+          uint32_t dcol = (ww >> pixelgpudetails::DCOL_shift) & pixelgpudetails::DCOL_mask;
+          uint32_t pxid = (ww >> pixelgpudetails::PXID_shift) & pixelgpudetails::PXID_mask;
+          uint32_t row  = pixelgpudetails::numRowsInRoc - pxid/2;
+          uint32_t col  = dcol*2 + pxid%2;
+          localPix.row = row;
+          localPix.col = col;
+          if (includeErrors and not dcolIsValid(dcol, pxid)) {
+            uint32_t error = conversionError(fedId, 3, debug);
+            err->emplace_back(rawId, ww, error, fedId);
+            if(debug) printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc);
+            continue;
+          }
+        }
+
+        pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, rocIdInDetUnit, localPix);
+        XX[gIndex]    = globalPix.row  ; // origin shifting by 1 0-159
+        YY[gIndex]    = globalPix.col ; // origin shifting by 1 0-415
+        ADC[gIndex]   = getADC(ww);
+        pdigi[gIndex] = pixelgpudetails::pack(globalPix.row,globalPix.col,ADC[gIndex]);
+        moduleId[gIndex] = detId.moduleId;
+        rawIdArr[gIndex] = rawId;
+      } // end of if (gIndex < end)
+     } // end fake loop
+  } // end of Raw to Digi kernel
+
+
+  // kernel wrapper called from runRawToDigi_kernel
+  void RawToDigi_wrapper(
+      pixelgpudetails::context & c,
+      const SiPixelFedCablingMapGPU* cablingMapDevice, SiPixelGainForHLTonGPU * const ped,
+      const uint32_t wordCounter, uint32_t *word, const uint32_t fedCounter,  uint8_t *fedId_h,
+      bool convertADCtoElectrons, 
+      uint32_t * pdigi_h, uint32_t *rawIdArr_h,
+      GPU::SimpleVector<pixelgpudetails::error_obj> *error_h, GPU::SimpleVector<pixelgpudetails::error_obj> *error_h_tmp, pixelgpudetails::error_obj *data_h,
+      uint16_t * adc_h, int32_t * clus_h,
+      bool useQualityInfo, bool includeErrors, bool debug, uint32_t & nModulesActive)
+  {
+    const int threadsPerBlock = 512;
+    const int blocks = (wordCounter + threadsPerBlock-1) /threadsPerBlock; // fill it all
+
+
+    assert(0 == wordCounter%2);
+    // wordCounter is the total no of words in each event to be trasfered on device
+    cudaCheck(cudaMemcpyAsync(&c.word_d[0],     &word[0],     wordCounter*sizeof(uint32_t), cudaMemcpyDefault, c.stream));
+    cudaCheck(cudaMemcpyAsync(&c.fedId_d[0], &fedId_h[0], wordCounter*sizeof(uint8_t)/2, cudaMemcpyDefault, c.stream));
+      
+    constexpr uint32_t vsize = sizeof(GPU::SimpleVector<pixelgpudetails::error_obj>);
+    constexpr uint32_t esize = sizeof(pixelgpudetails::error_obj);
+    cudaCheck(cudaMemcpyAsync(c.error_d, error_h_tmp, vsize, cudaMemcpyDefault, c.stream));
+      
+    // Launch rawToDigi kernel
+    RawToDigi_kernel<<<blocks, threadsPerBlock, 0, c.stream>>>(
+        cablingMapDevice,
+        wordCounter,
+        c.word_d,
+        c.fedId_d,
+        c.xx_d, c.yy_d, c.adc_d,
+        c.pdigi_d,
+        c.rawIdArr_d,
+        c.moduleInd_d,
+        c.error_d,
+        useQualityInfo,
+        includeErrors,
+        debug);
+    cudaCheck(cudaGetLastError());
+
+    // copy data to host variable
+
+    cudaCheck(cudaMemcpyAsync(pdigi_h, c.pdigi_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, c.stream));
+    cudaCheck(cudaMemcpyAsync(rawIdArr_h, c.rawIdArr_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, c.stream));
+
+    if (includeErrors) {
+        cudaCheck(cudaMemcpyAsync(error_h, c.error_d, vsize, cudaMemcpyDefault, c.stream));
+        cudaStreamSynchronize(c.stream);
+        error_h->set_data(data_h);
+        int size = error_h->size();
+        cudaCheck(cudaMemcpyAsync(data_h, c.data_d, size*esize, cudaMemcpyDefault, c.stream));
+    }
+    // End  of Raw2Digi and passing data for cluserisation
+
+   {
+     // clusterizer ...
+     using namespace gpuClustering;
+    int threadsPerBlock = 256;
+    int blocks = (wordCounter + threadsPerBlock - 1) / threadsPerBlock;
+
+
+    assert(ped);
+    gpuCalibPixel::calibDigis<<<blocks, threadsPerBlock, 0, c.stream>>>(
+                 c.moduleInd_d,
+                 c.xx_d, c.yy_d, c.adc_d,
+                 ped,
+                 wordCounter
+               );
+
+    cudaCheck(cudaGetLastError());
+
+    // calibrated adc
+    cudaCheck(cudaMemcpyAsync(adc_h, c.adc_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, c.stream));
+
+    /*
+    std::cout
+      << "CUDA countModules kernel launch with " << blocks
+      << " blocks of " << threadsPerBlock << " threads\n";
+    */
+
+    uint32_t nModules=0;
+    cudaCheck(cudaMemcpyAsync(c.moduleStart_d, &nModules, sizeof(uint32_t), cudaMemcpyDefault, c.stream));
+
+    countModules<<<blocks, threadsPerBlock, 0, c.stream>>>(c.moduleInd_d, c.moduleStart_d, c.clus_d, wordCounter);
+    cudaCheck(cudaGetLastError());
+
+    cudaCheck(cudaMemcpyAsync(&nModules, c.moduleStart_d, sizeof(uint32_t), cudaMemcpyDefault, c.stream));
+
+    // std::cout << "found " << nModules << " Modules active" << std::endl;
+
+    
+    threadsPerBlock = 256;
+    blocks = nModules;
+
+    /*
+    std::cout
+      << "CUDA findClus kernel launch with " << blocks
+      << " blocks of " << threadsPerBlock << " threads\n";
+    */
+
+    cudaCheck(cudaMemsetAsync(c.clusInModule_d, 0, (MaxNumModules)*sizeof(uint32_t),c.stream));
+
+    findClus<<<blocks, threadsPerBlock, 0, c.stream>>>(
+                 c.moduleInd_d,
+                 c.xx_d, c.yy_d, c.adc_d,
+                 c.moduleStart_d,
+                 c.clusInModule_d, c.moduleId_d,
+                 c.clus_d,
+                 c.debug_d,
+                 wordCounter
+    );
+
+    // clusters
+    cudaCheck(cudaMemcpyAsync(clus_h, c.clus_d, wordCounter*sizeof(uint32_t), cudaMemcpyDefault, c.stream));
+
+
+    cudaStreamSynchronize(c.stream);
+    cudaCheck(cudaGetLastError());
+
+    nModulesActive = nModules;
+
+   } // end clusterizer scope
+
+  }
+
+}
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPUKernel.h b/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPUKernel.h
new file mode 100644
index 0000000000000..0bebde676bc28
--- /dev/null
+++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPUKernel.h
@@ -0,0 +1,212 @@
+#ifndef EventFilter_SiPixelRawToDigi_plugins_SiPixelRawToDigiGPUKernel_h
+#define EventFilter_SiPixelRawToDigi_plugins_SiPixelRawToDigiGPUKernel_h
+
+#include <algorithm>
+#include <cuda_runtime.h>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
+#include "SiPixelFedCablingMapGPU.h"
+
+namespace pixelgpudetails {
+
+  // Phase 1 geometry constants
+  const uint32_t layerStartBit    = 20;
+  const uint32_t ladderStartBit   = 12;
+  const uint32_t moduleStartBit   = 2;
+
+  const uint32_t panelStartBit    = 10;
+  const uint32_t diskStartBit     = 18;
+  const uint32_t bladeStartBit    = 12;
+
+  const uint32_t layerMask        = 0xF;
+  const uint32_t ladderMask       = 0xFF;
+  const uint32_t moduleMask       = 0x3FF;
+  const uint32_t panelMask        = 0x3;
+  const uint32_t diskMask         = 0xF;
+  const uint32_t bladeMask        = 0x3F;
+
+  const uint32_t LINK_bits        = 6;
+  const uint32_t ROC_bits         = 5;
+  const uint32_t DCOL_bits        = 5;
+  const uint32_t PXID_bits        = 8;
+  const uint32_t ADC_bits         = 8;
+
+  // special for layer 1
+  const uint32_t LINK_bits_l1     = 6;
+  const uint32_t ROC_bits_l1      = 5;
+  const uint32_t COL_bits_l1      = 6;
+  const uint32_t ROW_bits_l1      = 7;
+  const uint32_t OMIT_ERR_bits    = 1;
+
+  const uint32_t maxROCIndex      = 8;
+  const uint32_t numRowsInRoc     = 80;
+  const uint32_t numColsInRoc     = 52;
+
+  const uint32_t MAX_WORD = 2000;
+
+  const uint32_t ADC_shift  = 0;
+  const uint32_t PXID_shift = ADC_shift + ADC_bits;
+  const uint32_t DCOL_shift = PXID_shift + PXID_bits;
+  const uint32_t ROC_shift  = DCOL_shift + DCOL_bits;
+  const uint32_t LINK_shift = ROC_shift + ROC_bits_l1;
+  // special for layer 1 ROC
+  const uint32_t ROW_shift = ADC_shift + ADC_bits;
+  const uint32_t COL_shift = ROW_shift + ROW_bits_l1;
+  const uint32_t OMIT_ERR_shift = 20;
+
+  const uint32_t LINK_mask = ~(~uint32_t(0) << LINK_bits_l1);
+  const uint32_t ROC_mask  = ~(~uint32_t(0) << ROC_bits_l1);
+  const uint32_t COL_mask  = ~(~uint32_t(0) << COL_bits_l1);
+  const uint32_t ROW_mask  = ~(~uint32_t(0) << ROW_bits_l1);
+  const uint32_t DCOL_mask = ~(~uint32_t(0) << DCOL_bits);
+  const uint32_t PXID_mask = ~(~uint32_t(0) << PXID_bits);
+  const uint32_t ADC_mask  = ~(~uint32_t(0) << ADC_bits);
+  const uint32_t ERROR_mask = ~(~uint32_t(0) << ROC_bits_l1);
+  const uint32_t OMIT_ERR_mask = ~(~uint32_t(0) << OMIT_ERR_bits);
+
+  struct DetIdGPU {
+    uint32_t RawId;
+    uint32_t rocInDet;
+    uint32_t moduleId;
+  };
+
+  struct Pixel {
+   uint32_t row;
+   uint32_t col;
+  };
+
+  class Packing {
+  public:
+    using PackedDigiType = uint32_t;
+      
+    // Constructor: pre-computes masks and shifts from field widths
+    __host__ __device__
+    inline
+    constexpr Packing(unsigned int row_w, unsigned int column_w,
+                      unsigned int time_w, unsigned int adc_w) :
+      row_width(row_w),
+      column_width(column_w),
+      adc_width(adc_w),
+      row_shift(0),
+      column_shift(row_shift + row_w),
+      time_shift(column_shift + column_w),
+      adc_shift(time_shift + time_w),
+      row_mask(~(~0U << row_w)),
+      column_mask( ~(~0U << column_w)),
+      time_mask(~(~0U << time_w)),
+      adc_mask(~(~0U << adc_w)),
+      rowcol_mask(~(~0U << (column_w+row_w))),
+      max_row(row_mask),
+      max_column(column_mask),
+      max_adc(adc_mask)
+    { }
+
+    uint32_t  row_width;
+    uint32_t  column_width;
+    uint32_t  adc_width;
+
+    uint32_t  row_shift;
+    uint32_t  column_shift;
+    uint32_t  time_shift;
+    uint32_t  adc_shift;
+
+    PackedDigiType row_mask;
+    PackedDigiType column_mask;
+    PackedDigiType time_mask;
+    PackedDigiType adc_mask;
+    PackedDigiType rowcol_mask;
+
+    uint32_t  max_row;
+    uint32_t  max_column;
+    uint32_t  max_adc;
+  };
+
+  __host__ __device__
+  inline
+  constexpr Packing packing() {
+    return Packing(11, 11, 0, 10);
+  }
+
+
+  __host__ __device__
+  inline
+  uint32_t pack(uint32_t row, uint32_t col, uint32_t adc) {
+    constexpr Packing thePacking = packing();
+    adc = std::min(adc, thePacking.max_adc);
+
+    return (row << thePacking.row_shift) |
+           (col << thePacking.column_shift) |
+           (adc << thePacking.adc_shift);
+  }
+
+  struct error_obj {
+    uint32_t rawId;
+    uint32_t word;
+    unsigned char errorType;
+    unsigned char fedId;
+
+    __host__ __device__
+    error_obj(uint32_t a, uint32_t b, unsigned char c, unsigned char d):
+      rawId(a),
+      word(b),
+      errorType(c),
+      fedId(d)
+    { }
+  };
+
+  // configuration and memory buffers alocated on the GPU
+  struct context {
+    cudaStream_t stream;
+
+    uint32_t * word_d;
+    uint8_t *  fedId_d;
+    uint32_t * pdigi_d;
+    uint16_t * xx_d;
+    uint16_t * yy_d;
+    uint16_t * adc_d;
+    uint16_t * moduleInd_d;
+    uint32_t * rawIdArr_d;
+
+    GPU::SimpleVector<error_obj> * error_d;
+    error_obj * data_d;
+      
+    // these are for the clusterizer (to be moved)
+    uint32_t * moduleStart_d;
+    int32_t *  clus_d;
+    uint32_t * clusInModule_d;
+    uint32_t * moduleId_d;
+    uint32_t * debug_d;
+  };
+
+  // wrapper function to call RawToDigi on the GPU from host side
+  void RawToDigi_wrapper(context &, const SiPixelFedCablingMapGPU* cablingMapDevice,
+                         SiPixelGainForHLTonGPU * const ped,
+                         const uint32_t wordCounter, uint32_t *word,
+                         const uint32_t fedCounter,  uint8_t *fedId_h,
+                         bool convertADCtoElectrons, uint32_t * pdigi_h,
+                         uint32_t *rawIdArr_h, GPU::SimpleVector<error_obj> *error_h,
+                         GPU::SimpleVector<error_obj> *error_h_tmp, error_obj *data_h,
+                         uint16_t * adc_h, int32_t * clus_h,
+                         bool useQualityInfo, bool includeErrors, bool debug,
+                         uint32_t & nModulesActive);
+
+  // void initCablingMap();
+  context initDeviceMemory();
+  void freeMemory(context &);
+
+  // see RecoLocalTracker/SiPixelClusterizer
+  // all are runtime const, should be specified in python _cfg.py
+  struct ADCThreshold {
+    const int     thePixelThreshold       = 1000;     // default Pixel threshold in electrons
+    const int     theSeedThreshold        = 1000;     // seed thershold in electrons not used in our algo
+    const float   theClusterThreshold     = 4000;     // cluster threshold in electron
+    const int     ConversionFactor        =   65;     // adc to electron conversion factor
+
+    const int     theStackADC_            =  255;     // the maximum adc count for stack layer
+    const int     theFirstStack_          =    5;     // the index of the fits stack layer
+    const double  theElectronPerADCGain_  =  600;     // ADC to electron conversion
+  };
+
+}
+
+#endif // EventFilter_SiPixelRawToDigi_plugins_SiPixelRawToDigiGPUKernel_h
diff --git a/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h b/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h
index a76ca821e70da..6add3a78b96e6 100644
--- a/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h
+++ b/RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h
@@ -19,12 +19,11 @@ namespace pixelCPEforGPU {
   struct CommonParams {
     float theThicknessB;
     float theThicknessE;
-     float thePitchX;
+    float thePitchX;
     float thePitchY;
   };
 
   struct DetParams {
-
     bool isBarrel;
     bool isPosZ;
     uint16_t layer;
@@ -39,7 +38,6 @@ namespace pixelCPEforGPU {
     float x0,y0,z0;  // the vertex in the local coord of the detector
 
     Frame frame;
-
   };
 
 
@@ -51,10 +49,9 @@ namespace pixelCPEforGPU {
     CommonParams const & commonParams() const {return *m_commonParams;}
     constexpr
     DetParams const &  detParams(int i) const {return m_detParams[i];}
-
   };
 
-   // SOA!  (on device)
+  // SOA (on device)
   template<uint32_t N>
   struct ClusParamsT {
     uint32_t minRow[N];
@@ -66,12 +63,12 @@ namespace pixelCPEforGPU {
     int32_t Q_l_X[N];
     int32_t Q_f_Y[N];
     int32_t Q_l_Y[N];
-    
+
     int32_t charge[N];
 
     float xpos[N];
     float ypos[N];
-    
+
     float xerr[N];
     float yerr[N];
   };
@@ -85,177 +82,171 @@ namespace pixelCPEforGPU {
     // x,y local position on det
     auto gvx = x - detParams.x0;
     auto gvy = y - detParams.y0;
-    auto gvz = -1.f/detParams.z0;
-    //  normalization not required as only ratio used...
+    auto gvz = -1.f / detParams.z0;
+    // normalization not required as only ratio used...
     // calculate angles
-    cotalpha = gvx*gvz;
-    cotbeta  = gvy*gvz;  
+    cotalpha = gvx * gvz;
+    cotbeta  = gvy * gvz;
   }
 
   constexpr inline
-  float correction( 
-                         int sizeM1,
-                         int Q_f,              //!< Charge in the first pixel.
-                         int Q_l,              //!< Charge in the last pixel.
-                         uint16_t upper_edge_first_pix, //!< As the name says.
-                         uint16_t lower_edge_last_pix,  //!< As the name says.
-                         float lorentz_shift,   //!< L-shift at half thickness
-                         float theThickness,   //detector thickness
-                         float cot_angle,        //!< cot of alpha_ or beta_
-                         float pitch,            //!< thePitchX or thePitchY
-                         bool first_is_big,       //!< true if the first is big
-                         bool last_is_big        //!< true if the last is big
-                   )
-{
-   if (0==sizeM1) return 0;  // size1
-   float W_eff=0;
-   bool simple=true;
-   if (1==sizeM1) {   // size 2   
-     //--- Width of the clusters minus the edge (first and last) pixels.
-     //--- In the note, they are denoted x_F and x_L (and y_F and y_L)
-     // assert(lower_edge_last_pix>=upper_edge_first_pix);
-     auto W_inner      =  pitch * float(lower_edge_last_pix-upper_edge_first_pix);  // in cm
-
-     //--- Predicted charge width from geometry
-     auto W_pred = theThickness * cot_angle                     // geometric correction (in cm)
-                    - lorentz_shift;                    // (in cm) &&& check fpix!
-   
-     W_eff = std::abs( W_pred ) - W_inner;
-
-     //--- If the observed charge width is inconsistent with the expectations
-     //--- based on the track, do *not* use W_pred-W_innner.  Instead, replace
-     //--- it with an *average* effective charge width, which is the average
-     //--- length of the edge pixels.
-     //
-     simple = ( W_eff < 0.0f ) | ( W_eff > pitch ); // this produces "large" regressions for very small	numeric differences...
-
-   }
-   if (simple) {
-     //--- Total length of the two edge pixels (first+last)
-     float sum_of_edge = 2.0f;
-     if (first_is_big) sum_of_edge += 1.0f;
-     if (last_is_big)  sum_of_edge += 1.0f;
-     W_eff = pitch * 0.5f * sum_of_edge;  // ave. length of edge pixels (first+last) (cm)
-   }
-   
-   
-   //--- Finally, compute the position in this projection
-   float Qdiff = Q_l - Q_f;
-   float Qsum  = Q_l + Q_f;
-   
-   //--- Temporary fix for clusters with both first and last pixel with charge = 0
-   if(Qsum==0) Qsum=1.0f;
-   return 0.5f*(Qdiff/Qsum) * W_eff;   
-
+  float correction(
+      int sizeM1,
+      int Q_f,                          //!< Charge in the first pixel.
+      int Q_l,                          //!< Charge in the last pixel.
+      uint16_t upper_edge_first_pix,    //!< As the name says.
+      uint16_t lower_edge_last_pix,     //!< As the name says.
+      float lorentz_shift,              //!< L-shift at half thickness
+      float theThickness,               //detector thickness
+      float cot_angle,                  //!< cot of alpha_ or beta_
+      float pitch,                      //!< thePitchX or thePitchY
+      bool first_is_big,                //!< true if the first is big
+      bool last_is_big )                //!< true if the last is big
+  {
+    if (0 == sizeM1)    // size 1
+      return 0;
+
+    float W_eff = 0;
+    bool simple = true;
+    if (1 == sizeM1) {  // size 2
+      //--- Width of the clusters minus the edge (first and last) pixels.
+      //--- In the note, they are denoted x_F and x_L (and y_F and y_L)
+      // assert(lower_edge_last_pix >= upper_edge_first_pix);
+      auto W_inner = pitch * float(lower_edge_last_pix - upper_edge_first_pix);     // in cm
+
+      //--- Predicted charge width from geometry
+      auto W_pred  = theThickness * cot_angle       // geometric correction (in cm)
+                   - lorentz_shift;                 // (in cm) &&& check fpix!
+
+      W_eff = std::abs(W_pred) - W_inner;
+
+      //--- If the observed charge width is inconsistent with the expectations
+      //--- based on the track, do *not* use W_pred-W_inner.  Instead, replace
+      //--- it with an *average* effective charge width, which is the average
+      //--- length of the edge pixels.
+      simple = (W_eff < 0.0f) | (W_eff > pitch);    // this produces "large" regressions for very small numeric differences...
+    }
+
+    if (simple) {
+      //--- Total length of the two edge pixels (first+last)
+      float sum_of_edge = 2.0f;
+      if (first_is_big) sum_of_edge += 1.0f;
+      if (last_is_big)  sum_of_edge += 1.0f;
+      W_eff = pitch * 0.5f * sum_of_edge;           // ave. length of edge pixels (first+last) (cm)
+    }
+
+    //--- Finally, compute the position in this projection
+    float Qdiff = Q_l - Q_f;
+    float Qsum  = Q_l + Q_f;
+
+    //--- Temporary fix for clusters with both first and last pixel with charge = 0
+    if (Qsum == 0)
+      Qsum = 1.0f;
+
+    return 0.5f * (Qdiff/Qsum) * W_eff;
   }
 
   constexpr inline
   void position(CommonParams const & comParams, DetParams const & detParams, ClusParams & cp, uint32_t ic) {
 
-   //--- Upper Right corner of Lower Left pixel -- in measurement frame
-   uint16_t llx = cp.minRow[ic]+1;
-   uint16_t lly = cp.minCol[ic]+1;
-   
-   //--- Lower Left corner of Upper Right pixel -- in measurement frame
-   uint16_t urx = cp.maxRow[ic];
-   uint16_t ury = cp.maxCol[ic];
-   
-   auto llxl = phase1PixelTopology::localX(llx);   
-   auto llyl = phase1PixelTopology::localY(lly);
-   auto urxl = phase1PixelTopology::localX(urx);
-   auto uryl = phase1PixelTopology::localY(ury);
-
-   auto mx = llxl+urxl;
-   auto my = llyl+uryl;   
-
-   // apply the lorentz offset correction
-   auto xPos = detParams.shiftX + comParams.thePitchX*(0.5f*float(mx)+float(phase1PixelTopology::xOffset));
-   auto yPos = detParams.shiftY + comParams.thePitchY*(0.5f*float(my)+float(phase1PixelTopology::yOffset));
- 
-   float cotalpha=0, cotbeta=0;
-
-
-   computeAnglesFromDet(detParams, xPos,  yPos, cotalpha, cotbeta);
-
-   auto thickness = detParams.isBarrel ? comParams.theThicknessB : comParams.theThicknessE;
-
-   auto xcorr = correction(
-                            cp.maxRow[ic]-cp.minRow[ic],
-                            cp.Q_f_X[ic], cp.Q_l_X[ic],
-                            llxl, urxl,
-                            detParams.chargeWidthX,   // lorentz shift in cm
-                            thickness,
-                            cotalpha,
-                            comParams.thePitchX,
-                            phase1PixelTopology::isBigPixX( cp.minRow[ic] ),
-                            phase1PixelTopology::isBigPixX( cp.maxRow[ic] )
-                           );   
-
-
-   auto ycorr = correction(
-                            cp.maxCol[ic]-cp.minCol[ic],
-                            cp.Q_f_Y[ic], cp.Q_l_Y[ic],
-                            llyl, uryl,
-                            detParams.chargeWidthY,   // lorentz shift in cm
-                            thickness,
-                            cotbeta,
-                            comParams.thePitchY,
-                            phase1PixelTopology::isBigPixY( cp.minCol[ic] ),
-                            phase1PixelTopology::isBigPixY( cp.maxCol[ic] )
-                           );
-
-   cp.xpos[ic]=xPos+xcorr;
-   cp.ypos[ic]=yPos+ycorr;
-
+    //--- Upper Right corner of Lower Left pixel -- in measurement frame
+    uint16_t llx = cp.minRow[ic]+1;
+    uint16_t lly = cp.minCol[ic]+1;
+
+    //--- Lower Left corner of Upper Right pixel -- in measurement frame
+    uint16_t urx = cp.maxRow[ic];
+    uint16_t ury = cp.maxCol[ic];
+
+    auto llxl = phase1PixelTopology::localX(llx);
+    auto llyl = phase1PixelTopology::localY(lly);
+    auto urxl = phase1PixelTopology::localX(urx);
+    auto uryl = phase1PixelTopology::localY(ury);
+
+    auto mx = llxl+urxl;
+    auto my = llyl+uryl;
+
+    // apply the lorentz offset correction
+    auto xPos = detParams.shiftX + comParams.thePitchX*(0.5f*float(mx)+float(phase1PixelTopology::xOffset));
+    auto yPos = detParams.shiftY + comParams.thePitchY*(0.5f*float(my)+float(phase1PixelTopology::yOffset));
+
+    float cotalpha=0, cotbeta=0;
+
+    computeAnglesFromDet(detParams, xPos,  yPos, cotalpha, cotbeta);
+
+    auto thickness = detParams.isBarrel ? comParams.theThicknessB : comParams.theThicknessE;
+
+    auto xcorr = correction(
+        cp.maxRow[ic]-cp.minRow[ic],
+        cp.Q_f_X[ic], cp.Q_l_X[ic],
+        llxl, urxl,
+        detParams.chargeWidthX,   // lorentz shift in cm
+        thickness,
+        cotalpha,
+        comParams.thePitchX,
+        phase1PixelTopology::isBigPixX(cp.minRow[ic]),
+        phase1PixelTopology::isBigPixX(cp.maxRow[ic]) );
+
+    auto ycorr = correction(
+        cp.maxCol[ic]-cp.minCol[ic],
+        cp.Q_f_Y[ic], cp.Q_l_Y[ic],
+        llyl, uryl,
+        detParams.chargeWidthY,   // lorentz shift in cm
+        thickness,
+        cotbeta,
+        comParams.thePitchY,
+        phase1PixelTopology::isBigPixY(cp.minCol[ic]),
+        phase1PixelTopology::isBigPixY(cp.maxCol[ic]) );
+
+    cp.xpos[ic]=xPos+xcorr;
+    cp.ypos[ic]=yPos+ycorr;
   }
 
-  // FIXME these are errors form Run1
   constexpr inline
   void error(CommonParams const & comParams, DetParams const & detParams, ClusParams & cp, uint32_t ic) {
-     // Edge cluster errors
-     cp.xerr[ic]= 0.0050;
-     cp.yerr[ic]= 0.0085;
-
-
-     constexpr float xerr_barrel_l1[] = {0.00115, 0.00120, 0.00088};
-     constexpr float xerr_barrel_l1_def = 0.01030;
-     constexpr float yerr_barrel_l1[] = {0.00375,0.00230,0.00250,0.00250,0.00230,0.00230,0.00210,0.00210,0.00240};
-     constexpr float yerr_barrel_l1_def=0.00210;
-     constexpr float xerr_barrel_ln[]= {0.00115, 0.00120, 0.00088};
-     constexpr float xerr_barrel_ln_def=0.01030;
-     constexpr float yerr_barrel_ln[]= {0.00375,0.00230,0.00250,0.00250,0.00230,0.00230,0.00210,0.00210,0.00240};
-     constexpr float yerr_barrel_ln_def=0.00210;
-     constexpr float xerr_endcap[]= {0.0020, 0.0020};
-     constexpr float xerr_endcap_def=0.0020;
-     constexpr float yerr_endcap[]= {0.00210};
-     constexpr float yerr_endcap_def=0.00210;
-
-     // is edgy?
-     bool isEdgeX = cp.minRow[ic]==0 || cp.maxRow[ic]==phase1PixelTopology::lastRowInModule;
-     bool isEdgeY = cp.minCol[ic]==0 || cp.maxCol[ic]==phase1PixelTopology::lastColInModule;
- 
-     if (!isEdgeX) {
-       auto sx = cp.maxRow[ic]-cp.minRow[ic];
-       if (!detParams.isBarrel ) {
-          cp.xerr[ic] =  sx <std::size(xerr_endcap) ? xerr_endcap[sx] : xerr_endcap_def;
-       } else if (detParams.layer==1) {
-          cp.xerr[ic] =  sx <std::size(xerr_barrel_l1) ?  xerr_barrel_l1[sx]: xerr_barrel_l1_def;
-       } else {
-          cp.xerr[ic] =  sx <std::size(xerr_barrel_ln) ?  xerr_barrel_ln[sx]: xerr_barrel_ln_def;
+    // Edge cluster errors
+    cp.xerr[ic]= 0.0050;
+    cp.yerr[ic]= 0.0085;
+
+    // FIXME these are errors form Run1
+    constexpr float xerr_barrel_l1[] = { 0.00115, 0.00120, 0.00088 };
+    constexpr float xerr_barrel_l1_def = 0.01030;
+    constexpr float yerr_barrel_l1[] = { 0.00375, 0.00230, 0.00250, 0.00250, 0.00230, 0.00230, 0.00210, 0.00210, 0.00240 };
+    constexpr float yerr_barrel_l1_def = 0.00210;
+    constexpr float xerr_barrel_ln[] = { 0.00115, 0.00120, 0.00088 };
+    constexpr float xerr_barrel_ln_def = 0.01030;
+    constexpr float yerr_barrel_ln[] = { 0.00375, 0.00230, 0.00250, 0.00250, 0.00230, 0.00230, 0.00210, 0.00210, 0.00240 };
+    constexpr float yerr_barrel_ln_def = 0.00210;
+    constexpr float xerr_endcap[] = { 0.0020, 0.0020 };
+    constexpr float xerr_endcap_def = 0.0020;
+    constexpr float yerr_endcap[] = { 0.00210 };
+    constexpr float yerr_endcap_def = 0.00210;
+
+    // is edgy ?
+    bool isEdgeX = cp.minRow[ic] == 0 or cp.maxRow[ic] == phase1PixelTopology::lastRowInModule;
+    bool isEdgeY = cp.minCol[ic] == 0 or cp.maxCol[ic] == phase1PixelTopology::lastColInModule;
+
+    if (not isEdgeX) {
+      auto sx = cp.maxRow[ic] - cp.minRow[ic];
+      if (not detParams.isBarrel) {
+        cp.xerr[ic] = sx <std::size(xerr_endcap) ? xerr_endcap[sx] : xerr_endcap_def;
+      } else if (detParams.layer == 1) {
+        cp.xerr[ic] = sx <std::size(xerr_barrel_l1) ? xerr_barrel_l1[sx]: xerr_barrel_l1_def;
+      } else {
+        cp.xerr[ic] = sx <std::size(xerr_barrel_ln) ? xerr_barrel_ln[sx]: xerr_barrel_ln_def;
       }
-     } 
-
-     if (!isEdgeY) {
-       auto sy =cp.maxCol[ic]-cp.minCol[ic];;
-       if (!detParams.isBarrel ) {
-          cp.yerr[ic] =  sy <std::size(yerr_endcap) ? yerr_endcap[sy] : yerr_endcap_def;
-       } else if (detParams.layer==1) {
-          cp.yerr[ic] =  sy <std::size(yerr_barrel_l1) ?  yerr_barrel_l1[sy]: yerr_barrel_l1_def;
-       } else {
-          cp.yerr[ic] =  sy <std::size(yerr_barrel_ln) ?  yerr_barrel_ln[sy]: yerr_barrel_ln_def;
+    }
+
+    if (not isEdgeY) {
+      auto sy = cp.maxCol[ic] - cp.minCol[ic];;
+      if (not detParams.isBarrel) {
+        cp.yerr[ic] = sy <std::size(yerr_endcap) ? yerr_endcap[sy] : yerr_endcap_def;
+      } else if (detParams.layer == 1) {
+        cp.yerr[ic] = sy <std::size(yerr_barrel_l1) ? yerr_barrel_l1[sy]: yerr_barrel_l1_def;
+      } else {
+        cp.yerr[ic] = sy <std::size(yerr_barrel_ln) ? yerr_barrel_ln[sy]: yerr_barrel_ln_def;
       }
-     }
-   }
+    }
+  }
 
 }
 
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
index 43c9f1a8a9236..b931815dc2623 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -10,7 +10,7 @@
 #include <thrust/system/cuda/execution_policy.h>
 
 // CMSSW headers
-#include "EventFilter/SiPixelRawToDigi/plugins/RawToDigiGPU.h"
+#include "EventFilter/SiPixelRawToDigi/plugins/SiPixelRawToDigiGPUKernel.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h"
 #include "PixelRecHits.h"
@@ -35,7 +35,7 @@ HitsOnGPU allocHitsOnGPU() {
 
 HitsOnCPU
 pixelRecHits_wrapper(
-      context const & c,
+      pixelgpudetails::context const & c,
       pixelCPEforGPU::ParamsOnGPU const * cpeParams,
       uint32_t ndigis,
       uint32_t nModules, // active modules (with digis)
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
index 75976ad697a1a..9126ab1c8abd1 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
@@ -8,7 +8,9 @@ namespace pixelCPEforGPU {
   struct ParamsOnGPU;
 }
 
-struct context;
+namespace pixelgpudetails {
+  struct context;
+}
 
 struct HitsOnGPU{
    uint32_t * hitsModuleStart_d;
@@ -32,7 +34,7 @@ struct HitsOnCPU {
 HitsOnGPU allocHitsOnGPU();
 
 HitsOnCPU pixelRecHits_wrapper(
-      context const & c,
+      pixelgpudetails::context const & c,
       pixelCPEforGPU::ParamsOnGPU const * cpeParams,
       uint32_t ndigis,
       uint32_t nModules, // active modules (with digis)
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitGPU.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitGPU.cc
index aa9c2751c3848..d2ad7d5cc2806 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitGPU.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitGPU.cc
@@ -136,7 +136,7 @@ using namespace std;
     // Step B*: create CPE
     edm::ESHandle<PixelClusterParameterEstimator> hCPE;
     std::string cpeName_ = conf_.getParameter<std::string>("CPE");
-    es.get<TkPixelCPERecord>().get(cpeName_,hCPE);
+    es.get<TkPixelCPERecord>().get(cpeName_, hCPE);
     cpe_ = dynamic_cast< const PixelCPEBase* >(&(*hCPE));
     
     ///  do it on GPU....
@@ -152,14 +152,14 @@ using namespace std;
     }
     assert(fcpe->d_paramsOnGPU);
 
-    auto hoc = pixelRecHits_wrapper(* (context const *)(gprod[0]),fcpe->d_paramsOnGPU,gprod[1],gprod[2], hitsOnGPU_);
+    auto hoc = pixelRecHits_wrapper(* (pixelgpudetails::context const *)(gprod[0]), fcpe->d_paramsOnGPU, gprod[1], gprod[2], hitsOnGPU_);
 
     // Step C: Iterate over DetIds and invoke the strip CPE algorithm
     // on each DetUnit
 
     // std::cout << "Number of Clusers on CPU " << (*input).data().size() << std::endl;
 
-    run( input, *output, geom,hoc );
+    run( input, *output, geom, hoc );
     // std::cout << "Number of Hits on CPU " << (*output).data().size() << std::endl;
 
     output->shrink_to_fit();
@@ -192,7 +192,7 @@ using namespace std;
       auto gind = genericDet->index();
       const PixelGeomDetUnit * pixDet = dynamic_cast<const PixelGeomDetUnit*>(genericDet);
       assert(pixDet); 
-      SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(output,detid);
+      SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(output, detid);
       auto fc = hoc.hitsModuleStart[gind];
       auto lc = hoc.hitsModuleStart[gind+1];
       auto nhits = lc-fc;
@@ -201,10 +201,10 @@ using namespace std;
       uint32_t ngh=0;
       for (uint32_t i=0; i<nhits;++i) {
         if( hoc.charge[fc+i]<2000 || (gind>=96 && hoc.charge[fc+i]<4000) ) continue;
-        ind[ngh]=i;std::push_heap(ind,ind+ngh+1,[&](auto a, auto b) { return mrp[a]<mrp[b];});
+        ind[ngh]=i;std::push_heap(ind, ind+ngh+1,[&](auto a, auto b) { return mrp[a]<mrp[b];});
         ++ngh;
       }
-      std::sort_heap(ind,ind+ngh,[&](auto a, auto b) { return mrp[a]<mrp[b];});
+      std::sort_heap(ind, ind+ngh,[&](auto a, auto b) { return mrp[a]<mrp[b];});
       uint32_t ic=0;
       assert(ngh==DSViter->size());
       for (auto const & clust : *DSViter) {
@@ -233,8 +233,8 @@ using namespace std;
                         << gind <<'/'<<fc<<'/'<<ic<<'/'<<ij << ' ' << clust.charge()<<"!="<<hoc.charge[ij] 
                         << ' ' << clust.minPixelRow()<<'/'<< mrp[ij] <<'/'<< mrp[fc+ind[ic]] << std::endl;
 
-        LocalPoint lp(hoc.xl[ij],hoc.yl[ij]);
-        LocalError le(hoc.xe[ij]*hoc.xe[ij],0,hoc.ye[ij]*hoc.ye[ij]);
+        LocalPoint lp(hoc.xl[ij], hoc.yl[ij]);
+        LocalError le(hoc.xe[ij]*hoc.xe[ij], 0, hoc.ye[ij]*hoc.ye[ij]);
         SiPixelRecHitQuality::QualWordType rqw=0;
 
 
@@ -242,7 +242,7 @@ using namespace std;
 	numberOfClusters++;
 
         /*   cpu version....  (for reference)
-	std::tuple<LocalPoint, LocalError,SiPixelRecHitQuality::QualWordType> tuple = cpe_->getParameters( clust, *genericDet );
+	std::tuple<LocalPoint, LocalError, SiPixelRecHitQuality::QualWordType> tuple = cpe_->getParameters( clust, *genericDet );
 	LocalPoint lp( std::get<0>(tuple) );
 	LocalError le( std::get<1>(tuple) );
         SiPixelRecHitQuality::QualWordType rqw( std::get<2>(tuple) );
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
index 14e4f5f26aaa1..51cbb54dcb8e3 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
@@ -13,8 +13,8 @@ namespace gpuPixelRecHits {
   // to be moved in common namespace...
   constexpr uint16_t InvId=9999; // must be > MaxNumModules
 
-  
-  constexpr uint32_t MaxClusInModule=  pixelCPEforGPU::MaxClusInModule;
+
+  constexpr uint32_t MaxClusInModule = pixelCPEforGPU::MaxClusInModule;
 
   using ClusParams = pixelCPEforGPU::ClusParams;
 
@@ -33,101 +33,91 @@ namespace gpuPixelRecHits {
                           int32_t * chargeh,
 			  float * xh, float * yh, float * zh,
                           float * xe, float * ye, uint16_t * mr,
-			  bool local // if true fill just x & y in local coord...
-			  ){
-
+			  bool local)   // if true fill just x & y in local coord
+  {
     // as usual one block per module
-
     __shared__ ClusParams clusParams;
 
-
-    auto first = digiModuleStart[1 + blockIdx.x];  
-    
+    auto first = digiModuleStart[1 + blockIdx.x];
     auto me = id[first];
-    assert (moduleId[blockIdx.x]==me);
-
+    assert(moduleId[blockIdx.x] == me);
     auto nclus = clusInModule[me];
 
 #ifdef GPU_DEBUG
     if (me%100==1)
-      if (threadIdx.x==0) printf("hitbuilder: %d clusters in module %d. will write at %d\n",nclus,me,hitsModuleStart[me]);
+      if (threadIdx.x==0) printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, hitsModuleStart[me]);
 #endif
 
-    assert(blockDim.x>=MaxClusInModule);
-    assert(nclus<=MaxClusInModule);
+    assert(blockDim.x >= MaxClusInModule);
+    assert(nclus <= MaxClusInModule);
 
     auto ic = threadIdx.x;
-    
-    if (ic<nclus) {
+
+    if (ic < nclus) {
       clusParams.minRow[ic] = std::numeric_limits<uint32_t>::max();
       clusParams.maxRow[ic] = 0;
       clusParams.minCol[ic] = std::numeric_limits<uint32_t>::max();
       clusParams.maxCol[ic] = 0;
-      
       clusParams.charge[ic] = 0;
-
       clusParams.Q_f_X[ic] = 0;
       clusParams.Q_l_X[ic] = 0;
       clusParams.Q_f_Y[ic] = 0;
       clusParams.Q_l_Y[ic] = 0;
     }
 
-    
-    first+=threadIdx.x;
-    
-    __syncthreads();
+    first += threadIdx.x;
 
+    __syncthreads();
 
     // one thead per "digi"
-    
-    for (int i=first; i<numElements; i+=blockDim.x) {
-      if (id[i]==InvId) continue;  // not valid
-      if (id[i]!=me) break;  // end of module
-      assert(clus[i]<nclus);
-      atomicMin(&clusParams.minRow[clus[i]],x[i]);
-      atomicMax(&clusParams.maxRow[clus[i]],x[i]);
-      atomicMin(&clusParams.minCol[clus[i]],y[i]);
-      atomicMax(&clusParams.maxCol[clus[i]],y[i]);
+
+    for (int i = first; i < numElements; i += blockDim.x) {
+      if (id[i] == InvId) continue;     // not valid
+      if (id[i] != me) break;           // end of module
+      assert(clus[i] < nclus);
+      atomicMin(&clusParams.minRow[clus[i]], x[i]);
+      atomicMax(&clusParams.maxRow[clus[i]], x[i]);
+      atomicMin(&clusParams.minCol[clus[i]], y[i]);
+      atomicMax(&clusParams.maxCol[clus[i]], y[i]);
     }
 
     __syncthreads();
 
-    for (int i=first; i<numElements; i+=blockDim.x) {
-      if (id[i]==InvId) continue;  // not valid
-      if (id[i]!=me) break;  // end of module
-      atomicAdd(&clusParams.charge[clus[i]],adc[i]);
-      if (clusParams.minRow[clus[i]]==x[i]) atomicAdd(&clusParams.Q_f_X[clus[i]],adc[i]); 
-      if (clusParams.maxRow[clus[i]]==x[i]) atomicAdd(&clusParams.Q_l_X[clus[i]],adc[i]); 
-      if (clusParams.minCol[clus[i]]==y[i]) atomicAdd(&clusParams.Q_f_Y[clus[i]],adc[i]);
-      if (clusParams.maxCol[clus[i]]==y[i]) atomicAdd(&clusParams.Q_l_Y[clus[i]],adc[i]); 
+    for (int i = first; i < numElements; i += blockDim.x) {
+      if (id[i] == InvId) continue;     // not valid
+      if (id[i] != me) break;           // end of module
+      atomicAdd(&clusParams.charge[clus[i]], adc[i]);
+      if (clusParams.minRow[clus[i]]==x[i]) atomicAdd(&clusParams.Q_f_X[clus[i]], adc[i]);
+      if (clusParams.maxRow[clus[i]]==x[i]) atomicAdd(&clusParams.Q_l_X[clus[i]], adc[i]);
+      if (clusParams.minCol[clus[i]]==y[i]) atomicAdd(&clusParams.Q_f_Y[clus[i]], adc[i]);
+      if (clusParams.maxCol[clus[i]]==y[i]) atomicAdd(&clusParams.Q_l_Y[clus[i]], adc[i]);
     }
 
     __syncthreads();
 
     // next one cluster per thread...
-    if (ic>=nclus) return;
+    if (ic >= nclus) return;
 
     first = hitsModuleStart[me];
     auto h = first+ic;  // output index in global memory
 
-    assert(h<2000*256);
+    assert(h < 2000*256);
+
+    pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
+    pixelCPEforGPU::error(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic);
 
-    pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams,ic);
-    pixelCPEforGPU::error(cpeParams->commonParams(), cpeParams->detParams(me), clusParams,ic);
- 
     chargeh[h] = clusParams.charge[ic];
 
-    if (local) {   
-     xh[h]= clusParams.xpos[ic];   
-     yh[h]= clusParams.ypos[ic]; 
+    if (local) {
+      xh[h] = clusParams.xpos[ic];
+      yh[h] = clusParams.ypos[ic];
     } else {
-      cpeParams->detParams(me).frame.toGlobal(clusParams.xpos[ic],clusParams.ypos[ic],
-                                              xh[h],yh[h],zh[h]
-                                             );
+      cpeParams->detParams(me).frame.toGlobal(clusParams.xpos[ic], clusParams.ypos[ic],
+                                              xh[h], yh[h], zh[h] );
     }
-    xe[h]= clusParams.xerr[ic];
-    ye[h]= clusParams.yerr[ic];
-    mr[h]= clusParams.minRow[ic];
+    xe[h] = clusParams.xerr[ic];
+    ye[h] = clusParams.yerr[ic];
+    mr[h] = clusParams.minRow[ic];
   }
 
 }