diff --git a/Configuration/ProcessModifiers/python/alpakaValidationPixel_cff.py b/Configuration/ProcessModifiers/python/alpakaValidationPixel_cff.py
new file mode 100644
index 0000000000000..ebdb7d9e6981a
--- /dev/null
+++ b/Configuration/ProcessModifiers/python/alpakaValidationPixel_cff.py
@@ -0,0 +1,6 @@
+import FWCore.ParameterSet.Config as cms
+
+# This modifier is for turning on the DQM modules used for alpaka device/host validation of the pixel reconstruction
+
+alpakaValidationPixel = cms.Modifier()
+
diff --git a/Configuration/ProcessModifiers/python/alpakaValidation_cff.py b/Configuration/ProcessModifiers/python/alpakaValidation_cff.py
new file mode 100644
index 0000000000000..3399bdda7c4df
--- /dev/null
+++ b/Configuration/ProcessModifiers/python/alpakaValidation_cff.py
@@ -0,0 +1,11 @@
+import FWCore.ParameterSet.Config as cms
+
+from Configuration.ProcessModifiers.alpaka_cff import *
+from Configuration.ProcessModifiers.alpakaValidationPixel_cff import *
+
+# This modifier chain is for turning on DQM modules used for alpaka device/host validation
+
+alpakaValidation =  cms.ModifierChain(
+    alpaka,
+    alpakaValidationPixel
+)
diff --git a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py
index ef7525a26b540..3f9a6ed96c9e5 100644
--- a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py
+++ b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py
@@ -894,6 +894,7 @@ def setup_(self, step, stepName, stepDict, k, properties):
 #  - HLT on CPU
 #  - Pixel-only reconstruction on CPU, with DQM and validation
 #  - harvesting
+
 upgradeWFs['PatatrackPixelOnlyCPU'] = PatatrackWorkflow(
     digi = {
         # the HLT menu is already set up for using GPUs if available and if the "gpu" modifier is enabled
@@ -1513,6 +1514,53 @@ def setup_(self, step, stepName, stepDict, k, properties):
     offset = 0.597,
 )
 
+
+# Alpaka workflows
+
+upgradeWFs['PatatrackPixelOnlyAlpaka'] = PatatrackWorkflow(
+    digi = {
+        '--procModifiers': 'alpaka'
+    },
+    reco = {
+        '-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,VALIDATION:@pixelTrackingOnlyValidation,DQM:@pixelTrackingOnlyDQM',
+        '--procModifiers': 'alpaka'
+    },
+    harvest = {
+        '-s': 'HARVESTING:@trackingOnlyValidation+@pixelTrackingOnlyDQM'
+    },
+    suffix = 'Patatrack_PixelOnlyAlpaka',
+    offset = 0.402,
+)
+
+upgradeWFs['PatatrackPixelOnlyAlpakaValidation'] = PatatrackWorkflow(
+    digi = {
+        '--procModifiers': 'alpaka'
+    },
+    reco = {
+        '-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,VALIDATION:@pixelTrackingOnlyValidation,DQM:@pixelTrackingOnlyDQM',
+        '--procModifiers': 'alpakaValidation'
+    },
+    harvest = {
+        '-s': 'HARVESTING:@trackingOnlyValidation+@pixelTrackingOnlyDQM'
+    },
+    suffix = 'Patatrack_PixelOnlyAlpaka_Validation',
+    offset = 0.403,
+)
+
+upgradeWFs['PatatrackPixelOnlyAlpakaProfiling'] = PatatrackWorkflow(
+    digi = {
+        '--procModifiers': 'alpaka'
+    },
+    reco = {
+        '-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly',
+        '--procModifiers': 'alpaka',
+        '--customise' : 'RecoTracker/Configuration/customizePixelOnlyForProfiling.customizePixelOnlyForProfilingGPUOnly'
+    },
+    harvest = None,
+    suffix = 'Patatrack_PixelOnlyAlpaka_Profiling',
+    offset = 0.404,
+)
+
 # end of Patatrack workflows
 
 class UpgradeWorkflow_ProdLike(UpgradeWorkflow):
@@ -2718,7 +2766,7 @@ def condition(self, fragment, stepList, key, hasHarvest):
     },
     '2022HI' : {
         'Geom' : 'DB:Extended',
-        'GT':'auto:phase1_2022_realistic_hi', 
+        'GT':'auto:phase1_2022_realistic_hi',
         'HLTmenu': '@fake2',
         'Era':'Run3_pp_on_PbPb',
         'BeamSpot': 'DBrealistic',
@@ -2726,7 +2774,7 @@ def condition(self, fragment, stepList, key, hasHarvest):
     },
     '2022HIRP' : {
         'Geom' : 'DB:Extended',
-        'GT':'auto:phase1_2022_realistic_hi', 
+        'GT':'auto:phase1_2022_realistic_hi',
         'HLTmenu': '@fake2',
         'Era':'Run3_pp_on_PbPb_approxSiStripClusters',
         'BeamSpot': 'DBrealistic',
@@ -2734,7 +2782,7 @@ def condition(self, fragment, stepList, key, hasHarvest):
     },
     '2023HI' : {
         'Geom' : 'DB:Extended',
-        'GT':'auto:phase1_2023_realistic_hi', 
+        'GT':'auto:phase1_2023_realistic_hi',
         'HLTmenu': '@fake2',
         'Era':'Run3_pp_on_PbPb',
         'BeamSpot': 'DBrealistic',
@@ -2742,7 +2790,7 @@ def condition(self, fragment, stepList, key, hasHarvest):
     },
     '2023HIRP' : {
         'Geom' : 'DB:Extended',
-        'GT':'auto:phase1_2023_realistic_hi', 
+        'GT':'auto:phase1_2023_realistic_hi',
         'HLTmenu': '@fake2',
         'Era':'Run3_pp_on_PbPb_approxSiStripClusters',
         'BeamSpot': 'DBrealistic',
diff --git a/DQM/SiPixelHeterogeneous/plugins/BuildFile.xml b/DQM/SiPixelHeterogeneous/plugins/BuildFile.xml
index 66adf1666762e..79925fdcb6cf8 100644
--- a/DQM/SiPixelHeterogeneous/plugins/BuildFile.xml
+++ b/DQM/SiPixelHeterogeneous/plugins/BuildFile.xml
@@ -5,8 +5,11 @@
 <use name="DataFormats/Common"/>
 <use name="Geometry/Records"/>
 <use name="Geometry/TrackerGeometryBuilder"/>
+<use name="DataFormats/TrackingRecHitSoA"/>
+<use name="DataFormats/TrackSoA"/>
+<use name="DataFormats/VertexSoA"/>
+<use name="DataFormats/BeamSpot"/>
 <use name="CUDADataFormats/TrackingRecHit"/>
 <use name="CUDADataFormats/Track"/>
 <use name="CUDADataFormats/Vertex"/>
-<use name="DataFormats/BeamSpot"/>
 <flags EDM_PLUGIN="1"/>
diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareRecHitsSoAAlpaka.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareRecHitsSoAAlpaka.cc
new file mode 100644
index 0000000000000..474194ad72616
--- /dev/null
+++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareRecHitsSoAAlpaka.cc
@@ -0,0 +1,244 @@
+#include "DQMServices/Core/interface/MonitorElement.h"
+#include "DQMServices/Core/interface/DQMEDAnalyzer.h"
+#include "DQMServices/Core/interface/DQMStore.h"
+#include "DataFormats/Math/interface/approx_atan2.h"
+#include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h"
+#include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
+#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsHost.h"
+#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "Geometry/CommonDetUnit/interface/PixelGeomDetUnit.h"
+#include "Geometry/CommonTopologies/interface/PixelTopology.h"
+#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
+#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+
+template <typename T>
+class SiPixelCompareRecHitsSoAAlpaka : public DQMEDAnalyzer {
+public:
+  using HitsOnHost = TrackingRecHitHost<T>;
+
+  explicit SiPixelCompareRecHitsSoAAlpaka(const edm::ParameterSet&);
+  ~SiPixelCompareRecHitsSoAAlpaka() override = default;
+  void dqmBeginRun(const edm::Run&, const edm::EventSetup&) override;
+  void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override;
+  void analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  const edm::ESGetToken<TrackerGeometry, TrackerDigiGeometryRecord> geomToken_;
+  const edm::ESGetToken<TrackerTopology, TrackerTopologyRcd> topoToken_;
+  const edm::EDGetTokenT<HitsOnHost> tokenSoAHitsHost_;    //these two are both on Host but originally they have been
+  const edm::EDGetTokenT<HitsOnHost> tokenSoAHitsDevice_;  //produced on Host or on Device
+  const std::string topFolderName_;
+  const float mind2cut_;
+  static constexpr uint32_t invalidHit_ = std::numeric_limits<uint32_t>::max();
+  static constexpr float micron_ = 10000.;
+  const TrackerGeometry* tkGeom_ = nullptr;
+  const TrackerTopology* tTopo_ = nullptr;
+  MonitorElement* hnHits_;
+  MonitorElement* hBchargeL_[4];  // max 4 barrel hits
+  MonitorElement* hBsizexL_[4];
+  MonitorElement* hBsizeyL_[4];
+  MonitorElement* hBposxL_[4];
+  MonitorElement* hBposyL_[4];
+  MonitorElement* hFchargeD_[2][12];  // max 12 endcap disks
+  MonitorElement* hFsizexD_[2][12];
+  MonitorElement* hFsizeyD_[2][12];
+  MonitorElement* hFposxD_[2][12];
+  MonitorElement* hFposyD_[2][12];
+  //differences
+  MonitorElement* hBchargeDiff_;
+  MonitorElement* hFchargeDiff_;
+  MonitorElement* hBsizeXDiff_;
+  MonitorElement* hFsizeXDiff_;
+  MonitorElement* hBsizeYDiff_;
+  MonitorElement* hFsizeYDiff_;
+  MonitorElement* hBposXDiff_;
+  MonitorElement* hFposXDiff_;
+  MonitorElement* hBposYDiff_;
+  MonitorElement* hFposYDiff_;
+};
+
+//
+// constructors
+//
+template <typename T>
+SiPixelCompareRecHitsSoAAlpaka<T>::SiPixelCompareRecHitsSoAAlpaka(const edm::ParameterSet& iConfig)
+    : geomToken_(esConsumes<TrackerGeometry, TrackerDigiGeometryRecord, edm::Transition::BeginRun>()),
+      topoToken_(esConsumes<TrackerTopology, TrackerTopologyRcd, edm::Transition::BeginRun>()),
+      tokenSoAHitsHost_(consumes(iConfig.getParameter<edm::InputTag>("pixelHitsSrcHost"))),
+      tokenSoAHitsDevice_(consumes(iConfig.getParameter<edm::InputTag>("pixelHitsSrcDevice"))),
+      topFolderName_(iConfig.getParameter<std::string>("topFolderName")),
+      mind2cut_(iConfig.getParameter<double>("minD2cut")) {}
+
+//
+// Begin Run
+//
+template <typename T>
+void SiPixelCompareRecHitsSoAAlpaka<T>::dqmBeginRun(const edm::Run& iRun, const edm::EventSetup& iSetup) {
+  tkGeom_ = &iSetup.getData(geomToken_);
+  tTopo_ = &iSetup.getData(topoToken_);
+}
+
+// -- Analyze: for every Host rechit find the closest Device rechit on the same module (squared local
+//    distance below minD2cut) and fill the Host-vs-Device correlation and difference histograms.
+//    Host hits without a match are filled against the Device placeholders (charge 0, size -99, position -999).
+template <typename T>
+void SiPixelCompareRecHitsSoAAlpaka<T>::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) {
+  const auto& rhsoaHandleHost = iEvent.getHandle(tokenSoAHitsHost_);
+  const auto& rhsoaHandleDevice = iEvent.getHandle(tokenSoAHitsDevice_);
+  if (not rhsoaHandleHost or not rhsoaHandleDevice) {  // nothing to compare unless both collections are present
+    edm::LogWarning out("SiPixelCompareRecHitsSoAAlpaka");
+    if (not rhsoaHandleHost) {
+      out << "reference (Host) rechits not found; ";
+    }
+    if (not rhsoaHandleDevice) {
+      out << "target (Device) rechits not found; ";
+    }
+    out << "the comparison will not run.";
+    return;
+  }
+
+  auto const& rhsoaHost = *rhsoaHandleHost;
+  auto const& rhsoaDevice = *rhsoaHandleDevice;
+
+  auto const& soa2dHost = rhsoaHost.const_view();
+  auto const& soa2dDevice = rhsoaDevice.const_view();
+
+  uint32_t nHitsHost = soa2dHost.metadata().size();
+  uint32_t nHitsDevice = soa2dDevice.metadata().size();
+
+  hnHits_->Fill(nHitsHost, nHitsDevice);
+  auto const& detIds = tkGeom_->detUnitIds();  // bind by reference: avoid copying the id container on every event
+  for (uint32_t i = 0; i < nHitsHost; i++) {
+    float minD = mind2cut_;  // best squared distance so far; starts at the cut, so only closer hits can match
+    uint32_t matchedHit = invalidHit_;
+    uint16_t indHost = soa2dHost[i].detectorIndex();
+    float xLocalHost = soa2dHost[i].xLocal();
+    float yLocalHost = soa2dHost[i].yLocal();
+    for (uint32_t j = 0; j < nHitsDevice; j++) {
+      if (soa2dDevice.detectorIndex(j) == indHost) {  // only hits on the same module are candidates
+        float dx = xLocalHost - soa2dDevice[j].xLocal();
+        float dy = yLocalHost - soa2dDevice[j].yLocal();
+        float distance = dx * dx + dy * dy;  // squared distance in the local frame
+        if (distance < minD) {
+          minD = distance;
+          matchedHit = j;
+        }
+      }
+    }
+    DetId id = detIds[indHost];
+    uint32_t chargeHost = soa2dHost[i].chargeAndStatus().charge;
+    int16_t sizeXHost = std::ceil(float(std::abs(soa2dHost[i].clusterSizeX()) / 8.));
+    int16_t sizeYHost = std::ceil(float(std::abs(soa2dHost[i].clusterSizeY()) / 8.));
+    uint32_t chargeDevice = 0;  // placeholders used when no Device hit was matched
+    int16_t sizeXDevice = -99;
+    int16_t sizeYDevice = -99;
+    float xLocalDevice = -999.;
+    float yLocalDevice = -999.;
+    if (matchedHit != invalidHit_) {
+      chargeDevice = soa2dDevice[matchedHit].chargeAndStatus().charge;
+      sizeXDevice = std::ceil(float(std::abs(soa2dDevice[matchedHit].clusterSizeX()) / 8.));
+      sizeYDevice = std::ceil(float(std::abs(soa2dDevice[matchedHit].clusterSizeY()) / 8.));
+      xLocalDevice = soa2dDevice[matchedHit].xLocal();
+      yLocalDevice = soa2dDevice[matchedHit].yLocal();
+    }
+    switch (id.subdetId()) {  // hits outside the pixel barrel/endcap are not histogrammed
+      case PixelSubdetector::PixelBarrel:
+        hBchargeL_[tTopo_->pxbLayer(id) - 1]->Fill(chargeHost, chargeDevice);
+        hBsizexL_[tTopo_->pxbLayer(id) - 1]->Fill(sizeXHost, sizeXDevice);
+        hBsizeyL_[tTopo_->pxbLayer(id) - 1]->Fill(sizeYHost, sizeYDevice);
+        hBposxL_[tTopo_->pxbLayer(id) - 1]->Fill(xLocalHost, xLocalDevice);
+        hBposyL_[tTopo_->pxbLayer(id) - 1]->Fill(yLocalHost, yLocalDevice);
+        hBchargeDiff_->Fill(static_cast<double>(chargeHost) - static_cast<double>(chargeDevice));  // signed: no uint32 wrap-around
+        hBsizeXDiff_->Fill(sizeXHost - sizeXDevice);
+        hBsizeYDiff_->Fill(sizeYHost - sizeYDevice);
+        hBposXDiff_->Fill(micron_ * (xLocalHost - xLocalDevice));
+        hBposYDiff_->Fill(micron_ * (yLocalHost - yLocalDevice));
+        break;
+      case PixelSubdetector::PixelEndcap:
+        hFchargeD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(chargeHost, chargeDevice);
+        hFsizexD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeXHost, sizeXDevice);
+        hFsizeyD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeYHost, sizeYDevice);
+        hFposxD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(xLocalHost, xLocalDevice);
+        hFposyD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(yLocalHost, yLocalDevice);
+        hFchargeDiff_->Fill(static_cast<double>(chargeHost) - static_cast<double>(chargeDevice));  // signed: no uint32 wrap-around
+        hFsizeXDiff_->Fill(sizeXHost - sizeXDevice);
+        hFsizeYDiff_->Fill(sizeYHost - sizeYDevice);
+        hFposXDiff_->Fill(micron_ * (xLocalHost - xLocalDevice));
+        hFposYDiff_->Fill(micron_ * (yLocalHost - yLocalDevice));
+        break;
+    }
+  }
+}
+
+//
+// -- Book Histograms: one set of Host-vs-Device 2D plots per barrel layer and per endcap side/disk, plus 1D differences
+//
+template <typename T>
+void SiPixelCompareRecHitsSoAAlpaka<T>::bookHistograms(DQMStore::IBooker& iBook,
+                                                       edm::Run const& iRun,
+                                                       edm::EventSetup const& iSetup) {
+  iBook.cd();
+  iBook.setCurrentFolder(topFolderName_);
+
+  // clang-format off
+  //Global
+  hnHits_ = iBook.book2I("nHits", "HostvsDevice RecHits per event;#Host RecHits;#Device RecHits", 200, 0, 5000,200, 0, 5000);
+  //Barrel Layer
+  for(unsigned int il=0;il<tkGeom_->numberOfLayers(PixelSubdetector::PixelBarrel);il++){
+    hBchargeL_[il] = iBook.book2I(Form("recHitsBLay%dCharge",il+1), Form("HostvsDevice RecHits Charge Barrel Layer%d;Host Charge;Device Charge",il+1), 250, 0, 100000, 250, 0, 100000);
+    hBsizexL_[il] = iBook.book2I(Form("recHitsBLay%dSizex",il+1), Form("HostvsDevice RecHits SizeX Barrel Layer%d;Host SizeX;Device SizeX",il+1), 30, 0, 30, 30, 0, 30);
+    hBsizeyL_[il] = iBook.book2I(Form("recHitsBLay%dSizey",il+1), Form("HostvsDevice RecHits SizeY Barrel Layer%d;Host SizeY;Device SizeY",il+1), 30, 0, 30, 30, 0, 30);
+    hBposxL_[il] = iBook.book2D(Form("recHitsBLay%dPosx",il+1), Form("HostvsDevice RecHits x-pos in Barrel Layer%d;Host pos x;Device pos x",il+1), 200, -5, 5, 200,-5,5);
+    hBposyL_[il] = iBook.book2D(Form("recHitsBLay%dPosy",il+1), Form("HostvsDevice RecHits y-pos in Barrel Layer%d;Host pos y;Device pos y",il+1), 200, -5, 5, 200,-5,5);
+  }
+  //Endcaps Disk
+  for(int is=0;is<2;is++){
+    int sign=is==0? -1:1;
+    for(unsigned int id=0;id<tkGeom_->numberOfLayers(PixelSubdetector::PixelEndcap);id++){
+      const int disk = sign * static_cast<int>(id + 1);  // signed disk number (-1, -2, ... / +1, +2, ...); computed in int so that %+d never receives a wrapped unsigned value
+      hFchargeD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dCharge",disk), Form("HostvsDevice RecHits Charge Endcaps Disk%+d;Host Charge;Device Charge",disk), 250, 0, 100000, 250, 0, 100000);
+      hFsizexD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dSizex",disk), Form("HostvsDevice RecHits SizeX Endcaps Disk%+d;Host SizeX;Device SizeX",disk), 30, 0, 30, 30, 0, 30);
+      hFsizeyD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dSizey",disk), Form("HostvsDevice RecHits SizeY Endcaps Disk%+d;Host SizeY;Device SizeY",disk), 30, 0, 30, 30, 0, 30);
+      hFposxD_[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosx",disk), Form("HostvsDevice RecHits x-pos Endcaps Disk%+d;Host pos x;Device pos x",disk), 200, -5, 5, 200, -5, 5);
+      hFposyD_[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosy",disk), Form("HostvsDevice RecHits y-pos Endcaps Disk%+d;Host pos y;Device pos y",disk), 200, -5, 5, 200, -5, 5);
+    }
+  }
+  //1D differences
+  hBchargeDiff_ = iBook.book1D("rechitChargeDiffBpix","Charge difference of rechits in BPix; rechit charge difference (Host - Device)", 101, -50.5, 50.5);
+  hFchargeDiff_ = iBook.book1D("rechitChargeDiffFpix","Charge difference of rechits in FPix; rechit charge difference (Host - Device)", 101, -50.5, 50.5);
+  hBsizeXDiff_ = iBook.book1D("rechitsizeXDiffBpix","SizeX difference of rechits in BPix; rechit sizex difference (Host - Device)", 21, -10.5, 10.5);
+  hFsizeXDiff_ = iBook.book1D("rechitsizeXDiffFpix","SizeX difference of rechits in FPix; rechit sizex difference (Host - Device)", 21, -10.5, 10.5);
+  hBsizeYDiff_ = iBook.book1D("rechitsizeYDiffBpix","SizeY difference of rechits in BPix; rechit sizey difference (Host - Device)", 21, -10.5, 10.5);
+  hFsizeYDiff_ = iBook.book1D("rechitsizeYDiffFpix","SizeY difference of rechits in FPix; rechit sizey difference (Host - Device)", 21, -10.5, 10.5);
+  hBposXDiff_ = iBook.book1D("rechitsposXDiffBpix","x-position difference of rechits in BPix; rechit x-pos difference (Host - Device)", 1000, -10, 10);
+  hFposXDiff_ = iBook.book1D("rechitsposXDiffFpix","x-position difference of rechits in FPix; rechit x-pos difference (Host - Device)", 1000, -10, 10);
+  hBposYDiff_ = iBook.book1D("rechitsposYDiffBpix","y-position difference of rechits in BPix; rechit y-pos difference (Host - Device)", 1000, -10, 10);
+  hFposYDiff_ = iBook.book1D("rechitsposYDiffFpix","y-position difference of rechits in FPix; rechit y-pos difference (Host - Device)", 1000, -10, 10);
+}
+
+template<typename T>
+void SiPixelCompareRecHitsSoAAlpaka<T>::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  // monitorpixelRecHitsSoAAlpaka
+  edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("pixelHitsSrcHost", edm::InputTag("siPixelRecHitsPreSplittingAlpakaSerial"));
+  desc.add<edm::InputTag>("pixelHitsSrcDevice", edm::InputTag("siPixelRecHitsPreSplittingAlpaka"));
+  desc.add<std::string>("topFolderName", "SiPixelHeterogeneous/PixelRecHitsCompareDeviceVSHost");
+  desc.add<double>("minD2cut", 0.0001);
+  descriptions.addWithDefaultLabel(desc);
+}
+
+using SiPixelPhase1CompareRecHitsSoAAlpaka = SiPixelCompareRecHitsSoAAlpaka<pixelTopology::Phase1>;
+using SiPixelPhase2CompareRecHitsSoAAlpaka = SiPixelCompareRecHitsSoAAlpaka<pixelTopology::Phase2>;
+using SiPixelHIonPhase1CompareRecHitsSoAAlpaka = SiPixelCompareRecHitsSoAAlpaka<pixelTopology::HIonPhase1>;
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(SiPixelPhase1CompareRecHitsSoAAlpaka);
+DEFINE_FWK_MODULE(SiPixelPhase2CompareRecHitsSoAAlpaka);
+DEFINE_FWK_MODULE(SiPixelHIonPhase1CompareRecHitsSoAAlpaka);
diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareTrackSoAAlpaka.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareTrackSoAAlpaka.cc
new file mode 100644
index 0000000000000..65a6dc2802831
--- /dev/null
+++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareTrackSoAAlpaka.cc
@@ -0,0 +1,308 @@
+// for string manipulations
+#include <fmt/printf.h>
+#include "DataFormats/Common/interface/Handle.h"
+#include "DataFormats/Math/interface/deltaR.h"
+#include "DataFormats/Math/interface/deltaPhi.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+// DQM Histograming
+#include "DQMServices/Core/interface/MonitorElement.h"
+#include "DQMServices/Core/interface/DQMEDAnalyzer.h"
+#include "DQMServices/Core/interface/DQMStore.h"
+// DataFormats
+#include "DataFormats/TrackSoA/interface/TracksHost.h"
+#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h"
+
+namespace {
+  // same logic used for the MTV:
+  // cf https://github.com/cms-sw/cmssw/blob/master/Validation/RecoTrack/src/MTVHistoProducerAlgoForTracker.cc
+  typedef dqm::reco::DQMStore DQMStore;
+
+  void setBinLog(TAxis* axis) {  // replace the uniform bin edges e_i of the axis with 10^(e_i)
+    int bins = axis->GetNbins();
+    float from = axis->GetXmin();  // the current axis limits are used as exponents of 10
+    float to = axis->GetXmax();
+    float width = (to - from) / bins;  // uniform step in the exponent
+    std::vector<float> new_bins(bins + 1, 0);  // bins + 1 edges
+    for (int i = 0; i <= bins; i++) {
+      new_bins[i] = TMath::Power(10, from + i * width);
+    }
+    axis->Set(bins, new_bins.data());
+  }
+
+  void setBinLogX(TH1* h) {  // apply setBinLog to the x axis of h
+    TAxis* axis = h->GetXaxis();
+    setBinLog(axis);
+  }
+  void setBinLogY(TH1* h) {  // apply setBinLog to the y axis of h
+    TAxis* axis = h->GetYaxis();
+    setBinLog(axis);
+  }
+
+  template <typename... Args>  // args are forwarded unchanged to the TH2I constructor
+  dqm::reco::MonitorElement* make2DIfLog(DQMStore::IBooker& ibook, bool logx, bool logy, Args&&... args) {
+    auto h = std::make_unique<TH2I>(std::forward<Args>(args)...);
+    if (logx)
+      setBinLogX(h.get());
+    if (logy)
+      setBinLogY(h.get());
+    const auto& name = h->GetName();  // fetch the name first: h is emptied by release() in the call below
+    return ibook.book2I(name, h.release());  // the raw histogram pointer is handed over to book2I
+  }
+}  // namespace
+
+template <typename T>
+class SiPixelCompareTrackSoAAlpaka : public DQMEDAnalyzer {
+public:
+  using PixelTrackSoA = TracksHost<T>;
+
+  explicit SiPixelCompareTrackSoAAlpaka(const edm::ParameterSet&);
+  ~SiPixelCompareTrackSoAAlpaka() override = default;
+  void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override;
+  void analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  const edm::EDGetTokenT<PixelTrackSoA> tokenSoATrackHost_;
+  const edm::EDGetTokenT<PixelTrackSoA> tokenSoATrackDevice_;
+  const std::string topFolderName_;
+  const bool useQualityCut_;
+  const pixelTrack::Quality minQuality_;
+  const float dr2cut_;
+  MonitorElement* hnTracks_;
+  MonitorElement* hnLooseAndAboveTracks_;
+  MonitorElement* hnLooseAndAboveTracks_matched_;
+  MonitorElement* hnHits_;
+  MonitorElement* hnHitsVsPhi_;
+  MonitorElement* hnHitsVsEta_;
+  MonitorElement* hnLayers_;
+  MonitorElement* hnLayersVsPhi_;
+  MonitorElement* hnLayersVsEta_;
+  MonitorElement* hCharge_;
+  MonitorElement* hchi2_;
+  MonitorElement* hChi2VsPhi_;
+  MonitorElement* hChi2VsEta_;
+  MonitorElement* hpt_;
+  MonitorElement* hptLogLog_;
+  MonitorElement* heta_;
+  MonitorElement* hphi_;
+  MonitorElement* hz_;
+  MonitorElement* htip_;
+  MonitorElement* hquality_;
+  //1D differences
+  MonitorElement* hptdiffMatched_;
+  MonitorElement* hCurvdiffMatched_;
+  MonitorElement* hetadiffMatched_;
+  MonitorElement* hphidiffMatched_;
+  MonitorElement* hzdiffMatched_;
+  MonitorElement* htipdiffMatched_;
+
+  //for matching eff vs region: derive the ratio at harvesting
+  MonitorElement* hpt_eta_tkAllHost_;
+  MonitorElement* hpt_eta_tkAllHostMatched_;
+  MonitorElement* hphi_z_tkAllHost_;
+  MonitorElement* hphi_z_tkAllHostMatched_;
+};
+
+//
+// constructors
+//
+
+template <typename T>
+SiPixelCompareTrackSoAAlpaka<T>::SiPixelCompareTrackSoAAlpaka(const edm::ParameterSet& iConfig)
+    : tokenSoATrackHost_(consumes<PixelTrackSoA>(iConfig.getParameter<edm::InputTag>("pixelTrackSrcHost"))),
+      tokenSoATrackDevice_(consumes<PixelTrackSoA>(iConfig.getParameter<edm::InputTag>("pixelTrackSrcDevice"))),
+      topFolderName_(iConfig.getParameter<std::string>("topFolderName")),
+      useQualityCut_(iConfig.getParameter<bool>("useQualityCut")),
+      minQuality_(pixelTrack::qualityByName(iConfig.getParameter<std::string>("minQuality"))),
+      dr2cut_(iConfig.getParameter<double>("deltaR2cut")) {}
+
+// -- Analyze: select the Device tracks passing the quality cut, then match every selected Host track to
+//    the closest selected Device track in DeltaR^2 (below deltaR2cut) and fill the Host-vs-Device
+//    correlation, difference and matching-efficiency histograms.
+template <typename T>
+void SiPixelCompareTrackSoAAlpaka<T>::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) {
+  using helper = TracksUtilities<T>;
+  const auto& tsoaHandleHost = iEvent.getHandle(tokenSoATrackHost_);
+  const auto& tsoaHandleDevice = iEvent.getHandle(tokenSoATrackDevice_);
+  if (not tsoaHandleHost or not tsoaHandleDevice) {  // nothing to compare unless both collections are present
+    edm::LogWarning out("SiPixelCompareTrackSoAAlpaka");
+    if (not tsoaHandleHost) {
+      out << "reference (cpu) tracks not found; ";
+    }
+    if (not tsoaHandleDevice) {
+      out << "target (gpu) tracks not found; ";
+    }
+    out << "the comparison will not run.";
+    return;
+  }
+
+  auto const& tsoaHost = *tsoaHandleHost;
+  auto const& tsoaDevice = *tsoaHandleDevice;
+  auto maxTracksHost = tsoaHost.view().metadata().size();      //this should be same for both?
+  auto maxTracksDevice = tsoaDevice.view().metadata().size();  //this should be same for both?
+  auto const* qualityHost = tsoaHost.view().quality();
+  auto const* qualityDevice = tsoaDevice.view().quality();
+  int32_t nTracksHost = 0;
+  int32_t nTracksDevice = 0;
+  int32_t nLooseAndAboveTracksHost = 0;
+  int32_t nLooseAndAboveTracksHost_matchedDevice = 0;
+  int32_t nLooseAndAboveTracksDevice = 0;
+
+  // Loop over the Device tracks and store the indices of those passing the quality cut (every track with pt > 0 if useQualityCut_ is false)
+  std::vector<int32_t> looseTrkidxDevice;
+  for (int32_t jt = 0; jt < maxTracksDevice; ++jt) {
+    if (helper::nHits(tsoaDevice.view(), jt) == 0)
+      break;  // this is a guard
+    if (!(tsoaDevice.view()[jt].pt() > 0.))
+      continue;
+    nTracksDevice++;
+    if (useQualityCut_ && qualityDevice[jt] < minQuality_)
+      continue;
+    nLooseAndAboveTracksDevice++;
+    looseTrkidxDevice.emplace_back(jt);
+  }
+
+  // Now loop over the Host tracks; the nested loop runs over the selected Device tracks
+  for (int32_t it = 0; it < maxTracksHost; ++it) {
+    int nHitsHost = helper::nHits(tsoaHost.view(), it);
+
+    if (nHitsHost == 0)
+      break;  // this is a guard
+
+    float ptHost = tsoaHost.view()[it].pt();
+    float etaHost = tsoaHost.view()[it].eta();
+    float phiHost = helper::phi(tsoaHost.view(), it);
+    float zipHost = helper::zip(tsoaHost.view(), it);
+    float tipHost = helper::tip(tsoaHost.view(), it);
+
+    if (!(ptHost > 0.))
+      continue;
+    nTracksHost++;
+    if (useQualityCut_ && qualityHost[it] < minQuality_)
+      continue;
+    nLooseAndAboveTracksHost++;
+    // Find the closest selected Device track in DeltaR^2 (open question: do we need a pt cut?)
+    const int32_t notFound = -1;
+    int32_t closestTkidx = notFound;
+    float mindr2 = dr2cut_;
+
+    for (auto gid : looseTrkidxDevice) {
+      float etaDevice = tsoaDevice.view()[gid].eta();
+      float phiDevice = helper::phi(tsoaDevice.view(), gid);
+      float dr2 = reco::deltaR2(etaHost, phiHost, etaDevice, phiDevice);
+      if (dr2 > dr2cut_)
+        continue;  // this is arbitrary
+      if (mindr2 > dr2) {
+        mindr2 = dr2;
+        closestTkidx = gid;
+      }
+    }
+
+    hpt_eta_tkAllHost_->Fill(etaHost, ptHost);  //all Host tk
+    hphi_z_tkAllHost_->Fill(phiHost, zipHost);
+    if (closestTkidx == notFound)
+      continue;
+    nLooseAndAboveTracksHost_matchedDevice++;
+
+    hchi2_->Fill(tsoaHost.view()[it].chi2(), tsoaDevice.view()[closestTkidx].chi2());
+    hCharge_->Fill(reco::charge<T>(tsoaHost.view(), it), reco::charge<T>(tsoaDevice.view(), closestTkidx));
+    hnHits_->Fill(helper::nHits(tsoaHost.view(), it), helper::nHits(tsoaDevice.view(), closestTkidx));
+    hnLayers_->Fill(tsoaHost.view()[it].nLayers(), tsoaDevice.view()[closestTkidx].nLayers());
+    hpt_->Fill(tsoaHost.view()[it].pt(), tsoaDevice.view()[closestTkidx].pt());
+    hptLogLog_->Fill(tsoaHost.view()[it].pt(), tsoaDevice.view()[closestTkidx].pt());
+    heta_->Fill(etaHost, tsoaDevice.view()[closestTkidx].eta());
+    hphi_->Fill(phiHost, helper::phi(tsoaDevice.view(), closestTkidx));
+    hz_->Fill(zipHost, helper::zip(tsoaDevice.view(), closestTkidx));
+    htip_->Fill(tipHost, helper::tip(tsoaDevice.view(), closestTkidx));
+    hptdiffMatched_->Fill(ptHost - tsoaDevice.view()[closestTkidx].pt());
+    hCurvdiffMatched_->Fill((reco::charge<T>(tsoaHost.view(), it) / tsoaHost.view()[it].pt()) -
+                            (reco::charge<T>(tsoaDevice.view(), closestTkidx) / tsoaDevice.view()[closestTkidx].pt()));
+    hetadiffMatched_->Fill(etaHost - tsoaDevice.view()[closestTkidx].eta());
+    hphidiffMatched_->Fill(reco::deltaPhi(phiHost, helper::phi(tsoaDevice.view(), closestTkidx)));
+    hzdiffMatched_->Fill(zipHost - helper::zip(tsoaDevice.view(), closestTkidx));
+    htipdiffMatched_->Fill(tipHost - helper::tip(tsoaDevice.view(), closestTkidx));
+    hpt_eta_tkAllHostMatched_->Fill(etaHost, tsoaHost.view()[it].pt());  // matched to Device
+    hphi_z_tkAllHostMatched_->Fill(phiHost, zipHost);  // same (phi, z) variables as the hphi_z_tkAllHost_ denominator
+  }
+  hnTracks_->Fill(nTracksHost, nTracksDevice);
+  hnLooseAndAboveTracks_->Fill(nLooseAndAboveTracksHost, nLooseAndAboveTracksDevice);
+  hnLooseAndAboveTracks_matched_->Fill(nLooseAndAboveTracksHost, nLooseAndAboveTracksHost_matchedDevice);
+}
+
+//
+// -- Book Histograms
+//
+template <typename T>
+void SiPixelCompareTrackSoAAlpaka<T>::bookHistograms(DQMStore::IBooker& iBook,
+                                                     edm::Run const& iRun,
+                                                     edm::EventSetup const& iSetup) {
+  iBook.cd();
+  iBook.setCurrentFolder(topFolderName_);
+
+  // clang-format off
+  std::string toRep = "Number of tracks";
+  // FIXME: all the 2D correlation plots are quite heavy in terms of memory consumption, so as soon as DQM supports
+  // THnSparse these should be moved to a less resource-consuming format
+  hnTracks_ = iBook.book2I("nTracks", fmt::format("{} per event; Host; Device",toRep), 501, -0.5, 500.5, 501, -0.5, 500.5);
+  hnLooseAndAboveTracks_ = iBook.book2I("nLooseAndAboveTracks", fmt::format("{} (quality #geq loose) per event; Host; Device",toRep), 501, -0.5, 500.5, 501, -0.5, 500.5);
+  hnLooseAndAboveTracks_matched_ = iBook.book2I("nLooseAndAboveTracks_matched", fmt::format("{} (quality #geq loose) per event; Host; Device",toRep), 501, -0.5, 500.5, 501, -0.5, 500.5);
+
+  toRep = "Number of all RecHits per track (quality #geq loose)";
+  hnHits_ = iBook.book2I("nRecHits", fmt::format("{};Host;Device",toRep), 15, -0.5, 14.5, 15, -0.5, 14.5);
+
+  toRep = "Number of all layers per track (quality #geq loose)";
+  hnLayers_ = iBook.book2I("nLayers", fmt::format("{};Host;Device",toRep), 15, -0.5, 14.5, 15, -0.5, 14.5);
+
+  toRep = "Track (quality #geq loose) #chi^{2}/ndof";
+  hchi2_ = iBook.book2I("nChi2ndof", fmt::format("{};Host;Device",toRep), 40, 0., 20., 40, 0., 20.);
+
+  toRep = "Track (quality #geq loose) charge";
+  hCharge_ = iBook.book2I("charge",fmt::format("{};Host;Device",toRep),3, -1.5, 1.5, 3, -1.5, 1.5);
+
+  hpt_ = iBook.book2I("pt", "Track (quality #geq loose) p_{T} [GeV];Host;Device", 200, 0., 200., 200, 0., 200.);
+  hptLogLog_ = make2DIfLog(iBook, true, true, "ptLogLog", "Track (quality #geq loose) p_{T} [GeV];Host;Device", 200, log10(0.5), log10(200.), 200, log10(0.5), log10(200.));
+  heta_ = iBook.book2I("eta", "Track (quality #geq loose) #eta;Host;Device", 30, -3., 3., 30, -3., 3.);
+  hphi_ = iBook.book2I("phi", "Track (quality #geq loose) #phi;Host;Device", 30, -M_PI, M_PI, 30, -M_PI, M_PI);
+  hz_ = iBook.book2I("z", "Track (quality #geq loose) z [cm];Host;Device", 30, -30., 30., 30, -30., 30.);
+  htip_ = iBook.book2I("tip", "Track (quality #geq loose) TIP [cm];Host;Device", 100, -0.5, 0.5, 100, -0.5, 0.5);
+  //1D difference plots
+  hptdiffMatched_ = iBook.book1D("ptdiffmatched", " p_{T} diff [GeV] between matched tracks; #Delta p_{T} [GeV]", 60, -30., 30.);
+  hCurvdiffMatched_ = iBook.book1D("curvdiffmatched", "q/p_{T} diff [GeV] between matched tracks; #Delta q/p_{T} [GeV]", 60, -30., 30.);
+  hetadiffMatched_ = iBook.book1D("etadiffmatched", " #eta diff between matched tracks; #Delta #eta", 160, -0.04 ,0.04);
+  hphidiffMatched_ = iBook.book1D("phidiffmatched", " #phi diff between matched tracks; #Delta #phi",  160, -0.04 ,0.04);
+  hzdiffMatched_ = iBook.book1D("zdiffmatched", " z diff between matched tracks; #Delta z [cm]", 300, -1.5, 1.5);
+  htipdiffMatched_ = iBook.book1D("tipdiffmatched", " TIP diff between matched tracks; #Delta TIP [cm]", 300, -1.5, 1.5);
+  //2D plots for eff
+  hpt_eta_tkAllHost_ = iBook.book2I("ptetatrkAllHost", "Track (quality #geq loose) on Host; #eta; p_{T} [GeV];", 30, -M_PI, M_PI, 200, 0., 200.);
+  hpt_eta_tkAllHostMatched_ = iBook.book2I("ptetatrkAllHostmatched", "Track (quality #geq loose) on Host matched to Device track; #eta; p_{T} [GeV];", 30, -M_PI, M_PI, 200, 0., 200.);
+
+  hphi_z_tkAllHost_ = iBook.book2I("phiztrkAllHost", "Track (quality #geq loose) on Host; #phi; z [cm];",  30, -M_PI, M_PI, 30, -30., 30.);
+  hphi_z_tkAllHostMatched_ = iBook.book2I("phiztrkAllHostmatched", "Track (quality #geq loose) on Host; #phi; z [cm];", 30, -M_PI, M_PI, 30, -30., 30.);
+
+}
+
+template<typename T>
+void SiPixelCompareTrackSoAAlpaka<T>::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  // monitorpixelTrackSoA
+  edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("pixelTrackSrcHost", edm::InputTag("pixelTracksAlpakaSerial"));
+  desc.add<edm::InputTag>("pixelTrackSrcDevice", edm::InputTag("pixelTracksAlpaka"));
+  desc.add<std::string>("topFolderName", "SiPixelHeterogeneous/PixelTrackCompareDeviceVSHost");
+  desc.add<bool>("useQualityCut", true);
+  desc.add<std::string>("minQuality", "loose");
+  desc.add<double>("deltaR2cut", 0.04);
+  descriptions.addWithDefaultLabel(desc);
+}
+
+using SiPixelPhase1CompareTrackSoAAlpaka = SiPixelCompareTrackSoAAlpaka<pixelTopology::Phase1>;
+using SiPixelPhase2CompareTrackSoAAlpaka = SiPixelCompareTrackSoAAlpaka<pixelTopology::Phase2>;
+using SiPixelHIonPhase1CompareTrackSoAAlpaka = SiPixelCompareTrackSoAAlpaka<pixelTopology::HIonPhase1>;
+
+DEFINE_FWK_MODULE(SiPixelPhase1CompareTrackSoAAlpaka);
+DEFINE_FWK_MODULE(SiPixelPhase2CompareTrackSoAAlpaka);
+DEFINE_FWK_MODULE(SiPixelHIonPhase1CompareTrackSoAAlpaka);
diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoAAlpaka.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoAAlpaka.cc
new file mode 100644
index 0000000000000..2eea6a980d9c5
--- /dev/null
+++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoAAlpaka.cc
@@ -0,0 +1,186 @@
+// -*- C++ -*-
+// Package:    SiPixelCompareVertexSoAAlpaka
+// Class:      SiPixelCompareVertexSoAAlpaka
+//
+/**\class SiPixelCompareVertexSoAAlpaka SiPixelCompareVertexSoAAlpaka.cc
+*/
+//
+// Author: Suvankar Roy Chowdhury
+//
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "DataFormats/Common/interface/Handle.h"
+// DQM Histograming
+#include "DQMServices/Core/interface/MonitorElement.h"
+#include "DQMServices/Core/interface/DQMEDAnalyzer.h"
+#include "DQMServices/Core/interface/DQMStore.h"
+#include "DataFormats/VertexSoA/interface/ZVertexHost.h"
+#include "DataFormats/BeamSpot/interface/BeamSpot.h"
+
+// DQM analyzer that compares two ZVertexHost collections (labelled "Host" and "Device")
+// vertex by vertex and fills correlation and difference histograms.
+class SiPixelCompareVertexSoAAlpaka : public DQMEDAnalyzer {
+public:
+  // NOTE(review): this alias is not used anywhere in the class as seen in this file
+  using IndToEdm = std::vector<uint16_t>;
+  explicit SiPixelCompareVertexSoAAlpaka(const edm::ParameterSet&);
+  ~SiPixelCompareVertexSoAAlpaka() override = default;
+  // books all histograms in the folder given by topFolderName_
+  void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override;
+  // matches host vertices to device vertices and fills the histograms
+  void analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  // event-data tokens: the two vertex collections and the beam spot
+  const edm::EDGetTokenT<ZVertexHost> tokenSoAVertexHost_;
+  const edm::EDGetTokenT<ZVertexHost> tokenSoAVertexDevice_;
+  const edm::EDGetTokenT<reco::BeamSpot> tokenBeamSpot_;
+  // DQM folder in which the histograms are booked
+  const std::string topFolderName_;
+  // maximum |z(host) - z(device)| for a device vertex to be matched to a host vertex
+  const float dzCut_;
+  // 2D Host-vs-Device correlation histograms
+  MonitorElement* hnVertex_;
+  MonitorElement* hx_;
+  MonitorElement* hy_;
+  MonitorElement* hz_;
+  MonitorElement* hchi2_;
+  MonitorElement* hchi2oNdof_;
+  MonitorElement* hptv2_;
+  MonitorElement* hntrks_;
+  // 1D Host - Device position differences
+  MonitorElement* hxdiff_;
+  MonitorElement* hydiff_;
+  MonitorElement* hzdiff_;
+};
+
+//
+// constructors
+//
+
+// Reads the module configuration and registers the consumed products.
+// Note tokenSoAVertexDevice_ contains data copied from device to host, hence is a HostCollection
+SiPixelCompareVertexSoAAlpaka::SiPixelCompareVertexSoAAlpaka(const edm::ParameterSet& iConfig)
+    : tokenSoAVertexHost_(consumes<ZVertexHost>(iConfig.getParameter<edm::InputTag>("pixelVertexSrcHost"))),
+      tokenSoAVertexDevice_(consumes<ZVertexHost>(iConfig.getParameter<edm::InputTag>("pixelVertexSrcDevice"))),
+      tokenBeamSpot_(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpotSrc"))),
+      topFolderName_(iConfig.getParameter<std::string>("topFolderName")),
+      // the parameter is a double, stored here as a float
+      dzCut_(iConfig.getParameter<double>("dzCut")) {}
+
+//
+// -- Analyze
+//
+// Compare the host ("reference") and device ("target") vertex collections of one event.
+// Every host vertex is matched to the device vertex closest in z (closer than dzCut_); the
+// properties of each matched pair fill the correlation and difference histograms. The number
+// of vertices in the two collections is filled once per event. If either collection is
+// missing, a warning is issued and nothing is filled.
+void SiPixelCompareVertexSoAAlpaka::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) {
+  const auto& vsoaHandleHost = iEvent.getHandle(tokenSoAVertexHost_);
+  const auto& vsoaHandleDevice = iEvent.getHandle(tokenSoAVertexDevice_);
+  if (not vsoaHandleHost or not vsoaHandleDevice) {
+    // build one warning that names every missing input
+    edm::LogWarning out("SiPixelCompareVertexSoAAlpaka");
+    if (not vsoaHandleHost) {
+      out << "reference (host) vertices not found; ";
+    }
+    if (not vsoaHandleDevice) {
+      out << "target (device) vertices not found; ";
+    }
+    out << "the comparison will not run.";
+    return;
+  }
+
+  // fetch the views once instead of on every access
+  auto const& hostView = vsoaHandleHost->view();
+  auto const& deviceView = vsoaHandleDevice->view();
+  const int nVerticesHost = hostView.nvFinal();
+  const int nVerticesDevice = deviceView.nvFinal();
+
+  // beam-spot parameters; they stay at zero when no beam spot is available
+  auto bsHandle = iEvent.getHandle(tokenBeamSpot_);
+  float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.;
+  if (!bsHandle.isValid()) {
+    edm::LogWarning("SiPixelCompareVertexSoAAlpaka") << "No beamspot found. returning vertexes with (0,0,Z) ";
+  } else {
+    const reco::BeamSpot& bs = *bsHandle;
+    x0 = bs.x0();
+    y0 = bs.y0();
+    z0 = bs.z0();
+    dxdz = bs.dxdz();
+    dydz = bs.dydz();
+  }
+
+  for (int ivc = 0; ivc < nVerticesHost; ivc++) {
+    // sortInd() maps the loop position to the index of the vertex to read
+    auto sic = hostView[ivc].sortInd();
+    auto zc = hostView[sic].zv();
+    // x and y are derived from the beam-spot position and slopes at the vertex z
+    auto xc = x0 + dxdz * zc;
+    auto yc = y0 + dydz * zc;
+    zc += z0;
+
+    auto ndofHost = hostView[sic].ndof();
+    auto chi2Host = hostView[sic].chi2();
+
+    const int32_t notFound = -1;
+    int32_t closestVtxidx = notFound;
+    // starting from dzCut_ means only vertices strictly closer than the cut can be selected
+    float mindz = dzCut_;
+
+    for (int ivg = 0; ivg < nVerticesDevice; ivg++) {
+      auto sig = deviceView[ivg].sortInd();
+      auto zgc = deviceView[sig].zv() + z0;
+      auto zDist = std::abs(zc - zgc);
+      if (mindz > zDist) {
+        mindz = zDist;
+        closestVtxidx = sig;
+      }
+    }
+    // host vertices without a device partner within the cut are not histogrammed
+    if (closestVtxidx == notFound)
+      continue;
+
+    auto zg = deviceView[closestVtxidx].zv();
+    auto xg = x0 + dxdz * zg;
+    auto yg = y0 + dydz * zg;
+    zg += z0;
+    auto ndofDevice = deviceView[closestVtxidx].ndof();
+    auto chi2Device = deviceView[closestVtxidx].chi2();
+
+    hx_->Fill(xc - x0, xg - x0);
+    hy_->Fill(yc - y0, yg - y0);
+    hz_->Fill(zc, zg);
+    hxdiff_->Fill(xc - xg);
+    hydiff_->Fill(yc - yg);
+    hzdiff_->Fill(zc - zg);
+    hchi2_->Fill(chi2Host, chi2Device);
+    hchi2oNdof_->Fill(chi2Host / ndofHost, chi2Device / ndofDevice);
+    hptv2_->Fill(hostView[sic].ptv2(), deviceView[closestVtxidx].ptv2());
+    // the number of associated tracks is filled as ndof + 1
+    hntrks_->Fill(ndofHost + 1, ndofDevice + 1);
+  }
+  hnVertex_->Fill(nVerticesHost, nVerticesDevice);
+}
+
+//
+// -- Book Histograms
+//
+// Book the Host-vs-Device comparison histograms in the folder given by topFolderName_.
+void SiPixelCompareVertexSoAAlpaka::bookHistograms(DQMStore::IBooker& ibooker,
+                                                   edm::Run const& iRun,
+                                                   edm::EventSetup const& iSetup) {
+  ibooker.cd();
+  ibooker.setCurrentFolder(topFolderName_);
+
+  // 2D correlation plots (x axis: Host, y axis: Device), booked with book2I.
+  // NOTE: 2D correlation plots are heavy in terms of memory consumption; should DQM support THnSparse,
+  // these could be moved to a less resource consuming format.
+  hnVertex_ = ibooker.book2I("nVertex", "# of Vertices;Host;Device", 101, -0.5, 100.5, 101, -0.5, 100.5);
+  hx_ = ibooker.book2I("vx", "Vertex x - Beamspot x;Host;Device", 50, -0.1, 0.1, 50, -0.1, 0.1);
+  hy_ = ibooker.book2I("vy", "Vertex y - Beamspot y;Host;Device", 50, -0.1, 0.1, 50, -0.1, 0.1);
+  hz_ = ibooker.book2I("vz", "Vertex z;Host;Device", 30, -30., 30., 30, -30., 30.);
+  hchi2_ = ibooker.book2I("chi2", "Vertex chi-squared;Host;Device", 40, 0., 20., 40, 0., 20.);
+  hchi2oNdof_ = ibooker.book2I("chi2oNdof", "Vertex chi-squared/Ndof;Host;Device", 40, 0., 20., 40, 0., 20.);
+  hptv2_ = ibooker.book2I("ptsq", "Vertex #sum (p_{T})^{2};Host;Device", 200, 0., 200., 200, 0., 200.);
+  hntrks_ = ibooker.book2I("ntrk", "#tracks associated;Host;Device", 100, -0.5, 99.5, 100, -0.5, 99.5);
+
+  // 1D distributions of the Host - Device position differences of matched vertices
+  hxdiff_ = ibooker.book1D("vxdiff", ";Vertex x difference (Host - Device);#entries", 100, -0.001, 0.001);
+  hydiff_ = ibooker.book1D("vydiff", ";Vertex y difference (Host - Device);#entries", 100, -0.001, 0.001);
+  hzdiff_ = ibooker.book1D("vzdiff", ";Vertex z difference (Host - Device);#entries", 100, -2.5, 2.5);
+}
+
+// Declare the configuration parameters of this module together with their default values.
+void SiPixelCompareVertexSoAAlpaka::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription psetDesc;
+
+  // input collections: the two vertex SoAs to compare and the beam spot
+  psetDesc.add<edm::InputTag>("pixelVertexSrcHost", edm::InputTag("pixelVerticesAlpakaSerial"));
+  psetDesc.add<edm::InputTag>("pixelVertexSrcDevice", edm::InputTag("pixelVerticesAlpaka"));
+  psetDesc.add<edm::InputTag>("beamSpotSrc", edm::InputTag("offlineBeamSpot"));
+
+  // DQM folder in which the histograms are booked
+  psetDesc.add<std::string>("topFolderName", "SiPixelHeterogeneous/PixelVertexCompareSoADeviceVSHost");
+
+  // maximum z distance used when matching a device vertex to a host vertex
+  psetDesc.add<double>("dzCut", 1.);
+
+  descriptions.addWithDefaultLabel(psetDesc);
+}
+
+// Register the analyzer as a framework module.
+DEFINE_FWK_MODULE(SiPixelCompareVertexSoAAlpaka);
diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorRecHitsSoAAlpaka.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorRecHitsSoAAlpaka.cc
new file mode 100644
index 0000000000000..f4c8968fafb16
--- /dev/null
+++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorRecHitsSoAAlpaka.cc
@@ -0,0 +1,198 @@
+#include "DQMServices/Core/interface/MonitorElement.h"
+#include "DQMServices/Core/interface/DQMEDAnalyzer.h"
+#include "DQMServices/Core/interface/DQMStore.h"
+#include "DataFormats/Math/interface/approx_atan2.h"
+#include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h"
+#include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
+#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsHost.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "Geometry/CommonDetUnit/interface/PixelGeomDetUnit.h"
+#include "Geometry/CommonTopologies/interface/PixelTopology.h"
+#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
+#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+
+// DQM analyzer that fills monitoring histograms (multiplicity, global position, charge and
+// cluster size) for the pixel RecHits SoA, globally and split by barrel layer / endcap disk.
+// T is the pixel topology the hit collection is templated on.
+template <typename T>
+class SiPixelMonitorRecHitsSoAAlpaka : public DQMEDAnalyzer {
+public:
+  using HitsOnHost = TrackingRecHitHost<T>;
+
+  explicit SiPixelMonitorRecHitsSoAAlpaka(const edm::ParameterSet&);
+  ~SiPixelMonitorRecHitsSoAAlpaka() override = default;
+  // caches the tracker geometry and topology for the run
+  void dqmBeginRun(const edm::Run&, const edm::EventSetup&) override;
+  void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override;
+  void analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  const edm::ESGetToken<TrackerGeometry, TrackerDigiGeometryRecord> geomToken_;
+  const edm::ESGetToken<TrackerTopology, TrackerTopologyRcd> topoToken_;
+  const edm::EDGetTokenT<HitsOnHost> tokenSoAHits_;
+  const std::string topFolderName_;
+  // set in dqmBeginRun, used in bookHistograms and analyze
+  const TrackerGeometry* tkGeom_ = nullptr;
+  const TrackerTopology* tTopo_ = nullptr;
+  // all hits
+  MonitorElement* hnHits;
+  MonitorElement* hBFposZP;
+  MonitorElement* hBFposZR;
+  // barrel hits
+  MonitorElement* hBposXY;
+  MonitorElement* hBposZP;
+  MonitorElement* hBcharge;
+  MonitorElement* hBsizex;
+  MonitorElement* hBsizey;
+  // per barrel layer, indexed by (layer number - 1)
+  MonitorElement* hBposZPL[4];  // max 4 barrel layers
+  MonitorElement* hBchargeL[4];
+  MonitorElement* hBsizexL[4];
+  MonitorElement* hBsizeyL[4];
+  // endcap hits
+  MonitorElement* hFposXY;
+  MonitorElement* hFposZP;
+  MonitorElement* hFcharge;
+  MonitorElement* hFsizex;
+  MonitorElement* hFsizey;
+  // per endcap disk, indexed by [side - 1][disk number - 1]
+  MonitorElement* hFposXYD[2][12];  // max 12 endcap disks
+  MonitorElement* hFchargeD[2][12];
+  MonitorElement* hFsizexD[2][12];
+  MonitorElement* hFsizeyD[2][12];
+};
+
+//
+// constructors
+//
+// Registers the consumed products and reads the output folder name from the configuration.
+// The geometry and topology tokens are requested for the BeginRun transition.
+template <typename T>
+SiPixelMonitorRecHitsSoAAlpaka<T>::SiPixelMonitorRecHitsSoAAlpaka(const edm::ParameterSet& iConfig)
+    : geomToken_(esConsumes<TrackerGeometry, TrackerDigiGeometryRecord, edm::Transition::BeginRun>()),
+      topoToken_(esConsumes<TrackerTopology, TrackerTopologyRcd, edm::Transition::BeginRun>()),
+      tokenSoAHits_(consumes(iConfig.getParameter<edm::InputTag>("pixelHitsSrc"))),
+      // NOTE: this module spells the parameter "TopFolderName" (capital T)
+      topFolderName_(iConfig.getParameter<std::string>("TopFolderName")) {}
+
+//
+// Begin Run
+//
+// Cache pointers to the tracker geometry and topology; both are used by bookHistograms and analyze.
+template <typename T>
+void SiPixelMonitorRecHitsSoAAlpaka<T>::dqmBeginRun(const edm::Run& iRun, const edm::EventSetup& iSetup) {
+  tkGeom_ = &iSetup.getData(geomToken_);
+  tTopo_ = &iSetup.getData(topoToken_);
+}
+
+//
+// -- Analyze
+//
+// Fill the RecHit monitoring histograms for one event: the hit multiplicity, the global position
+// of every hit, and position/charge/cluster-size plots split into barrel (total and per layer)
+// and endcap (total and per side/disk). If the hit collection is missing, a warning is issued
+// and nothing is filled.
+template <typename T>
+void SiPixelMonitorRecHitsSoAAlpaka<T>::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) {
+  const auto& rhsoaHandle = iEvent.getHandle(tokenSoAHits_);
+  if (!rhsoaHandle.isValid()) {
+    edm::LogWarning("SiPixelMonitorRecHitsSoAAlpaka") << "No RecHits SoA found \n returning!";
+    return;
+  }
+  auto const& rhsoa = *rhsoaHandle;
+  auto const& soa2d = rhsoa.const_view();
+
+  const uint32_t nHits = soa2d.metadata().size();
+  hnHits->Fill(nHits);
+  // bind by const reference so the DetId container is not copied on every event
+  // (valid whether detUnitIds() returns a reference or a value)
+  auto const& detIds = tkGeom_->detUnitIds();
+  for (uint32_t i = 0; i < nHits; i++) {
+    DetId id = detIds[soa2d[i].detectorIndex()];
+    float xG = soa2d[i].xGlobal();
+    float yG = soa2d[i].yGlobal();
+    float zG = soa2d[i].zGlobal();
+    float rG = soa2d[i].rGlobal();
+    float fphi = short2phi(soa2d[i].iphi());
+    uint32_t charge = soa2d[i].chargeAndStatus().charge;
+    // the stored cluster sizes are divided by 8 and rounded up before being histogrammed
+    int16_t sizeX = std::ceil(float(std::abs(soa2d[i].clusterSizeX()) / 8.));
+    int16_t sizeY = std::ceil(float(std::abs(soa2d[i].clusterSizeY()) / 8.));
+    hBFposZP->Fill(zG, fphi);
+    // R carries the sign of y in the Z-R map
+    int16_t ysign = yG >= 0 ? 1 : -1;
+    hBFposZR->Fill(zG, rG * ysign);
+    switch (id.subdetId()) {
+      case PixelSubdetector::PixelBarrel: {
+        hBposXY->Fill(xG, yG);
+        hBposZP->Fill(zG, fphi);
+        hBcharge->Fill(charge);
+        hBsizex->Fill(sizeX);
+        hBsizey->Fill(sizeY);
+        // layer numbers start at 1, the per-layer arrays at 0
+        const auto layerIdx = tTopo_->pxbLayer(id) - 1;
+        hBposZPL[layerIdx]->Fill(zG, fphi);
+        hBchargeL[layerIdx]->Fill(charge);
+        hBsizexL[layerIdx]->Fill(sizeX);
+        hBsizeyL[layerIdx]->Fill(sizeY);
+        break;
+      }
+      case PixelSubdetector::PixelEndcap: {
+        hFposXY->Fill(xG, yG);
+        hFposZP->Fill(zG, fphi);
+        hFcharge->Fill(charge);
+        hFsizex->Fill(sizeX);
+        hFsizey->Fill(sizeY);
+        // side and disk numbers start at 1, the per-disk arrays at 0
+        const auto sideIdx = tTopo_->pxfSide(id) - 1;
+        const auto diskIdx = tTopo_->pxfDisk(id) - 1;
+        hFposXYD[sideIdx][diskIdx]->Fill(xG, yG);
+        hFchargeD[sideIdx][diskIdx]->Fill(charge);
+        hFsizexD[sideIdx][diskIdx]->Fill(sizeX);
+        hFsizeyD[sideIdx][diskIdx]->Fill(sizeY);
+        break;
+      }
+    }
+  }
+}
+
+//
+// -- Book Histograms
+//
+// Book all RecHit histograms in the folder given by topFolderName_. The number of per-layer and
+// per-disk histograms is taken from the tracker geometry cached in dqmBeginRun.
+template <typename T>
+void SiPixelMonitorRecHitsSoAAlpaka<T>::bookHistograms(DQMStore::IBooker& iBook,
+                                                       edm::Run const& iRun,
+                                                       edm::EventSetup const& iSetup) {
+  iBook.cd();
+  iBook.setCurrentFolder(topFolderName_);
+
+  // clang-format off
+  //Global
+  hnHits = iBook.book1D("nHits", "RecHits per event;RecHits;#events", 200, 0, 5000);
+  hBFposZP = iBook.book2D("recHitsGlobalPosZP", "RecHits position Global;Z;#phi", 1000, -60, 60, 200,-3.2,3.2);
+  hBFposZR = iBook.book2D("recHitsGlobalPosZR", "RecHits position Global;Z;R", 1000, -60, 60, 200,-20,20);
+  //Barrel
+  hBposXY = iBook.book2D("recHitsBarrelPosXY", "RecHits position Barrel;X;Y", 200, -20, 20, 200,-20,20);
+  hBposZP = iBook.book2D("recHitsBarrelPosZP", "RecHits position Barrel;Z;#phi", 300, -30, 30, 200,-3.2,3.2);
+  hBcharge = iBook.book1D("recHitsBarrelCharge", "RecHits Charge Barrel;Charge;#events", 250, 0, 100000);
+  hBsizex = iBook.book1D("recHitsBarrelSizex", "RecHits SizeX Barrel;SizeX;#events", 50, 0, 50);
+  hBsizey = iBook.book1D("recHitsBarrelSizey", "RecHits SizeY Barrel;SizeY;#events", 50, 0, 50);
+  //Barrel Layer (array index il corresponds to layer il+1)
+  for(unsigned int il=0;il<tkGeom_->numberOfLayers(PixelSubdetector::PixelBarrel);il++){
+    hBposZPL[il] = iBook.book2D(Form("recHitsBLay%dPosZP",il+1), Form("RecHits position Barrel Layer%d;Z;#phi",il+1), 300, -30, 30, 200,-3.2,3.2);
+    hBchargeL[il] = iBook.book1D(Form("recHitsBLay%dCharge",il+1), Form("RecHits Charge Barrel Layer%d;Charge;#events",il+1), 250, 0, 100000);
+    hBsizexL[il] = iBook.book1D(Form("recHitsBLay%dSizex",il+1), Form("RecHits SizeX Barrel Layer%d;SizeX;#events",il+1), 50, 0, 50);
+    hBsizeyL[il] = iBook.book1D(Form("recHitsBLay%dSizey",il+1), Form("RecHits SizeY Barrel Layer%d;SizeY;#events",il+1), 50, 0, 50);
+  }
+  //Endcaps
+  hFposXY = iBook.book2D("recHitsEndcapsPosXY", "RecHits position Endcaps;X;Y", 200, -20, 20, 200,-20, 20);
+  hFposZP = iBook.book2D("recHitsEndcapsPosZP", "RecHits position Endcaps;Z;#phi", 600, -60, 60, 200,-3.2,3.2);
+  hFcharge = iBook.book1D("recHitsEndcapsCharge", "RecHits Charge Endcaps;Charge;#events", 250, 0, 100000);
+  hFsizex = iBook.book1D("recHitsEndcapsSizex", "RecHits SizeX Endcaps;SizeX;#events", 50, 0, 50);
+  hFsizey = iBook.book1D("recHitsEndcapsSizey", "RecHits SizeY Endcaps;SizeY;#events", 50, 0, 50);
+  //Endcaps Disk: side index 0 gets negative disk labels (-1, -2, ...), side index 1 positive ones (+1, +2, ...)
+  for(int is=0;is<2;is++){
+    const int sign=is==0? -1:1;
+    for(unsigned int idisk=0;idisk<tkGeom_->numberOfLayers(PixelSubdetector::PixelEndcap);idisk++){
+      // computed in signed arithmetic so that the value matches the %+d conversion
+      const int diskLabel = sign*(static_cast<int>(idisk)+1);
+      hFposXYD[is][idisk] = iBook.book2D(Form("recHitsFDisk%+dPosXY",diskLabel), Form("RecHits position Endcaps Disk%+d;X;Y",diskLabel), 200, -20, 20, 200,-20,20);
+      hFchargeD[is][idisk] = iBook.book1D(Form("recHitsFDisk%+dCharge",diskLabel), Form("RecHits Charge Endcaps Disk%+d;Charge;#events",diskLabel), 250, 0, 100000);
+      hFsizexD[is][idisk] = iBook.book1D(Form("recHitsFDisk%+dSizex",diskLabel), Form("RecHits SizeX Endcaps Disk%+d;SizeX;#events",diskLabel), 50, 0, 50);
+      hFsizeyD[is][idisk] = iBook.book1D(Form("recHitsFDisk%+dSizey",diskLabel), Form("RecHits SizeY Endcaps Disk%+d;SizeY;#events",diskLabel), 50, 0, 50);
+    }
+  }
+  // clang-format on
+}
+
+// Declare the configuration parameters of this module together with their default values.
+template <typename T>
+void SiPixelMonitorRecHitsSoAAlpaka<T>::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription psetDesc;
+  // input RecHits collection
+  psetDesc.add<edm::InputTag>("pixelHitsSrc", edm::InputTag("siPixelRecHitsPreSplittingAlpaka"));
+  // DQM folder in which the histograms are booked
+  psetDesc.add<std::string>("TopFolderName", "SiPixelHeterogeneous/PixelRecHitsAlpaka");
+  descriptions.addWithDefaultLabel(psetDesc);
+}
+
+// Concrete instantiations of the RecHits monitor, one per pixel topology.
+using SiPixelPhase1MonitorRecHitsSoAAlpaka = SiPixelMonitorRecHitsSoAAlpaka<pixelTopology::Phase1>;
+using SiPixelPhase2MonitorRecHitsSoAAlpaka = SiPixelMonitorRecHitsSoAAlpaka<pixelTopology::Phase2>;
+using SiPixelHIonPhase1MonitorRecHitsSoAAlpaka = SiPixelMonitorRecHitsSoAAlpaka<pixelTopology::HIonPhase1>;
+
+// Register each instantiation as a framework module.
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(SiPixelPhase1MonitorRecHitsSoAAlpaka);
+DEFINE_FWK_MODULE(SiPixelPhase2MonitorRecHitsSoAAlpaka);
+DEFINE_FWK_MODULE(SiPixelHIonPhase1MonitorRecHitsSoAAlpaka);
diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorTrackSoAAlpaka.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorTrackSoAAlpaka.cc
new file mode 100644
index 0000000000000..fd98957ee8492
--- /dev/null
+++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorTrackSoAAlpaka.cc
@@ -0,0 +1,197 @@
+// -*- C++ -*-
+// Package:    SiPixelMonitorTrackSoAAlpaka
+// Class:      SiPixelMonitorTrackSoAAlpaka
+//
+/**\class SiPixelMonitorTrackSoAAlpaka SiPixelMonitorTrackSoAAlpaka.cc
+*/
+//
+// Author: Suvankar Roy Chowdhury
+//
+
+// for string manipulations
+#include <fmt/printf.h>
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+// DQM Histograming
+#include "DQMServices/Core/interface/MonitorElement.h"
+#include "DQMServices/Core/interface/DQMEDAnalyzer.h"
+#include "DQMServices/Core/interface/DQMStore.h"
+#include "DataFormats/TrackSoA/interface/TracksHost.h"
+
+// DQM analyzer that fills monitoring histograms for the pixel track SoA: track multiplicity,
+// quality, and kinematic/fit properties of the tracks passing the configured quality cut.
+// T is the pixel topology the track collection is templated on.
+template <typename T>
+class SiPixelMonitorTrackSoAAlpaka : public DQMEDAnalyzer {
+public:
+  using PixelTrackHeterogeneous = TracksHost<T>;
+  explicit SiPixelMonitorTrackSoAAlpaka(const edm::ParameterSet&);
+  ~SiPixelMonitorTrackSoAAlpaka() override = default;
+  void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override;
+  void analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  edm::EDGetTokenT<PixelTrackHeterogeneous> tokenSoATrack_;
+  // DQM folder in which the histograms are booked
+  std::string topFolderName_;
+  // when true, tracks with quality below minQuality_ fill only the quality and nTracks histograms
+  bool useQualityCut_;
+  pixelTrack::Quality minQuality_;
+  // per-event multiplicities
+  MonitorElement* hnTracks;
+  MonitorElement* hnLooseAndAboveTracks;
+  // per-track quantities, filled for tracks passing the quality cut
+  MonitorElement* hnHits;
+  MonitorElement* hnHitsVsPhi;
+  MonitorElement* hnHitsVsEta;
+  MonitorElement* hnLayers;
+  MonitorElement* hnLayersVsPhi;
+  MonitorElement* hnLayersVsEta;
+  MonitorElement* hchi2;
+  MonitorElement* hChi2VsPhi;
+  MonitorElement* hChi2VsEta;
+  MonitorElement* hpt;
+  MonitorElement* heta;
+  MonitorElement* hphi;
+  MonitorElement* hz;
+  MonitorElement* htip;
+  // quality of every track with pt > 0, filled before the quality cut
+  MonitorElement* hquality;
+};
+
+//
+// constructors
+//
+
+// Registers the consumed track collection and reads the folder name and the quality selection
+// from the configuration; "minQuality" is given by name and translated with qualityByName.
+template <typename T>
+SiPixelMonitorTrackSoAAlpaka<T>::SiPixelMonitorTrackSoAAlpaka(const edm::ParameterSet& iConfig)
+    : tokenSoATrack_(consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"))),
+      topFolderName_(iConfig.getParameter<std::string>("topFolderName")),
+      useQualityCut_(iConfig.getParameter<bool>("useQualityCut")),
+      minQuality_(pixelTrack::qualityByName(iConfig.getParameter<std::string>("minQuality"))) {}
+
+//
+// -- Analyze
+//
+// Fill the track monitoring histograms for one event. Every track with pt > 0 contributes to the
+// quality histogram and to the total track count; the remaining histograms are filled only for
+// tracks passing the quality cut (when enabled). If the collection is missing, a warning is
+// issued and nothing is filled.
+template <typename T>
+void SiPixelMonitorTrackSoAAlpaka<T>::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) {
+  const auto& trackHandle = iEvent.getHandle(tokenSoATrack_);
+  if (!trackHandle.isValid()) {
+    edm::LogWarning("SiPixelMonitorTrackSoAAlpaka") << "No Track SoA found \n returning!" << std::endl;
+    return;
+  }
+
+  auto const& tracks = *trackHandle.product();
+  auto const& trackView = tracks.view();
+  auto const trackCapacity = trackView.metadata().size();
+  auto const* qualities = trackView.quality();
+  int32_t nAllTracks = 0;
+  int32_t nSelectedTracks = 0;
+
+  for (int32_t idx = 0; idx < trackCapacity; ++idx) {
+    auto const nHits = trackView.detIndices().size(idx);
+    auto const nLayers = trackView[idx].nLayers();
+    // stop at the first entry without hits
+    if (nHits == 0)
+      break;
+    // skip entries whose pt is not strictly positive (this also rejects NaN)
+    const float pt = trackView[idx].pt();
+    if (!(pt > 0.))
+      continue;
+
+    // the quality histogram and the total count include tracks failing the quality cut
+    const pixelTrack::Quality trackQuality = qualities[idx];
+    hquality->Fill(int(trackQuality));
+    nAllTracks++;
+
+    if (useQualityCut_ && trackQuality < minQuality_)
+      continue;
+
+    // from here on: only tracks that passed the quality selection
+    const float chi2 = trackView[idx].chi2();
+    const float phi = trackView[idx].state()(0);  //TODO: put these numbers in enum
+    const float zip = trackView[idx].state()(4);
+    const float eta = trackView[idx].eta();
+    const float tip = trackView[idx].state()(1);
+
+    hchi2->Fill(chi2);
+    hChi2VsPhi->Fill(phi, chi2);
+    hChi2VsEta->Fill(eta, chi2);
+    hnHits->Fill(nHits);
+    hnLayers->Fill(nLayers);
+    hnHitsVsPhi->Fill(phi, nHits);
+    hnHitsVsEta->Fill(eta, nHits);
+    hnLayersVsPhi->Fill(phi, nLayers);
+    hnLayersVsEta->Fill(eta, nLayers);
+    hpt->Fill(pt);
+    heta->Fill(eta);
+    hphi->Fill(phi);
+    hz->Fill(zip);
+    htip->Fill(tip);
+    nSelectedTracks++;
+  }
+  hnTracks->Fill(nAllTracks);
+  hnLooseAndAboveTracks->Fill(nSelectedTracks);
+}
+
+//
+// -- Book Histograms
+//
+// Book all track histograms in the folder given by topFolderName_ and label the bins of the
+// quality histogram with the names listed in pixelTrack::qualityName.
+template <typename T>
+void SiPixelMonitorTrackSoAAlpaka<T>::bookHistograms(DQMStore::IBooker& iBook,
+                                                     edm::Run const& iRun,
+                                                     edm::EventSetup const& iSetup) {
+  iBook.cd();
+  iBook.setCurrentFolder(topFolderName_);
+
+  // toRep holds the quantity description that is substituted into the titles of each group below
+  // clang-format off
+  std::string toRep = "Number of tracks";
+  hnTracks = iBook.book1D("nTracks", fmt::format(";{} per event;#events",toRep), 1001, -0.5, 1000.5);
+  hnLooseAndAboveTracks = iBook.book1D("nLooseAndAboveTracks", fmt::format(";{} (quality #geq loose) per event;#events",toRep), 1001, -0.5, 1000.5);
+
+  toRep = "Number of all RecHits per track (quality #geq loose)";
+  hnHits = iBook.book1D("nRecHits", fmt::format(";{};#tracks",toRep), 15, -0.5, 14.5);
+  hnHitsVsPhi = iBook.bookProfile("nHitsPerTrackVsPhi", fmt::format("{} vs track #phi;Track #phi;{}",toRep,toRep), 30, -M_PI, M_PI,0., 15.);
+  hnHitsVsEta = iBook.bookProfile("nHitsPerTrackVsEta", fmt::format("{} vs track #eta;Track #eta;{}",toRep,toRep), 30, -3., 3., 0., 15.);
+
+  toRep = "Number of all layers per track (quality #geq loose)";
+  hnLayers = iBook.book1D("nLayers", fmt::format(";{};#tracks",toRep), 15, -0.5, 14.5);
+  hnLayersVsPhi = iBook.bookProfile("nLayersPerTrackVsPhi", fmt::format("{} vs track #phi;Track #phi;{}",toRep,toRep), 30, -M_PI, M_PI,0., 15.);
+  hnLayersVsEta = iBook.bookProfile("nLayersPerTrackVsEta", fmt::format("{} vs track #eta;Track #eta;{}",toRep,toRep), 30, -3., 3., 0., 15.);
+
+  toRep = "Track (quality #geq loose) #chi^{2}/ndof";
+  hchi2 = iBook.book1D("nChi2ndof", fmt::format(";{};#tracks",toRep), 40, 0., 20.);
+  hChi2VsPhi = iBook.bookProfile("nChi2ndofVsPhi", fmt::format("{} vs track #phi;Track #phi;{}",toRep,toRep), 30, -M_PI, M_PI, 0., 20.);
+  hChi2VsEta = iBook.bookProfile("nChi2ndofVsEta", fmt::format("{} vs track #eta;Track #eta;{}",toRep,toRep), 30, -3., 3., 0., 20.);
+  // clang-format on
+
+  hpt = iBook.book1D("pt", ";Track (quality #geq loose) p_{T} [GeV];#tracks", 200, 0., 200.);
+  heta = iBook.book1D("eta", ";Track (quality #geq loose) #eta;#tracks", 30, -3., 3.);
+  hphi = iBook.book1D("phi", ";Track (quality #geq loose) #phi;#tracks", 30, -M_PI, M_PI);
+  hz = iBook.book1D("z", ";Track (quality #geq loose) z [cm];#tracks", 30, -30., 30.);
+  htip = iBook.book1D("tip", ";Track (quality #geq loose) TIP [cm];#tracks", 100, -0.5, 0.5);
+  hquality = iBook.book1D("quality", ";Track Quality;#tracks", 7, -0.5, 6.5);
+  // one label per quality name, starting from bin 1
+  int bin = 1;
+  for (const auto& q : pixelTrack::qualityName) {
+    hquality->setBinLabel(bin, q.data(), 1);
+    ++bin;
+  }
+}
+
+// Declare the configuration parameters of this module together with their default values.
+template <typename T>
+void SiPixelMonitorTrackSoAAlpaka<T>::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription psetDesc;
+
+  // input track collection and DQM output folder
+  psetDesc.add<edm::InputTag>("pixelTrackSrc", edm::InputTag("pixelTracksAlpaka"));
+  psetDesc.add<std::string>("topFolderName", "SiPixelHeterogeneous/PixelTrackAlpaka");
+
+  // quality selection: whether to apply it, and the minimum quality given by name
+  psetDesc.add<bool>("useQualityCut", true);
+  psetDesc.add<std::string>("minQuality", "loose");
+
+  descriptions.addWithDefaultLabel(psetDesc);
+}
+
+// Concrete instantiations of the track monitor, one per pixel topology.
+using SiPixelPhase1MonitorTrackSoAAlpaka = SiPixelMonitorTrackSoAAlpaka<pixelTopology::Phase1>;
+using SiPixelPhase2MonitorTrackSoAAlpaka = SiPixelMonitorTrackSoAAlpaka<pixelTopology::Phase2>;
+using SiPixelHIonPhase1MonitorTrackSoAAlpaka = SiPixelMonitorTrackSoAAlpaka<pixelTopology::HIonPhase1>;
+
+// Register each instantiation as a framework module.
+DEFINE_FWK_MODULE(SiPixelPhase1MonitorTrackSoAAlpaka);
+DEFINE_FWK_MODULE(SiPixelPhase2MonitorTrackSoAAlpaka);
+DEFINE_FWK_MODULE(SiPixelHIonPhase1MonitorTrackSoAAlpaka);
diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoAAlpaka.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoAAlpaka.cc
new file mode 100644
index 0000000000000..d3121f77bccb8
--- /dev/null
+++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoAAlpaka.cc
@@ -0,0 +1,131 @@
+// -*- C++ -*-
+// Package:    SiPixelMonitorVertexSoAAlpaka
+// Class:      SiPixelMonitorVertexSoAAlpaka
+//
+/**\class SiPixelMonitorVertexSoAAlpaka SiPixelMonitorVertexSoAAlpaka.cc
+*/
+//
+// Author: Suvankar Roy Chowdhury
+//
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "DataFormats/Common/interface/Handle.h"
+// DQM Histograming
+#include "DQMServices/Core/interface/MonitorElement.h"
+#include "DQMServices/Core/interface/DQMEDAnalyzer.h"
+#include "DQMServices/Core/interface/DQMStore.h"
+#include "DataFormats/VertexSoA/interface/ZVertexHost.h"
+#include "DataFormats/BeamSpot/interface/BeamSpot.h"
+
+// DQM analyzer that fills monitoring histograms for the pixel vertex SoA: vertex multiplicity,
+// position, chi-squared, sum of pT squared and number of associated tracks.
+class SiPixelMonitorVertexSoAAlpaka : public DQMEDAnalyzer {
+public:
+  // NOTE(review): this alias is not used anywhere in the class as seen in this file
+  using IndToEdm = std::vector<uint16_t>;
+  explicit SiPixelMonitorVertexSoAAlpaka(const edm::ParameterSet&);
+  ~SiPixelMonitorVertexSoAAlpaka() override = default;
+  void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override;
+  void analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  // event-data tokens: the vertex collection and the beam spot
+  const edm::EDGetTokenT<ZVertexHost> tokenSoAVertex_;
+  const edm::EDGetTokenT<reco::BeamSpot> tokenBeamSpot_;
+  // DQM folder in which the histograms are booked
+  std::string topFolderName_;
+  // per-event vertex multiplicity
+  MonitorElement* hnVertex;
+  // per-vertex quantities
+  MonitorElement* hx;
+  MonitorElement* hy;
+  MonitorElement* hz;
+  MonitorElement* hchi2;
+  MonitorElement* hchi2oNdof;
+  MonitorElement* hptv2;
+  MonitorElement* hntrks;
+};
+
+//
+// constructors
+//
+
+// Registers the consumed vertex collection and beam spot, and reads the output folder name.
+SiPixelMonitorVertexSoAAlpaka::SiPixelMonitorVertexSoAAlpaka(const edm::ParameterSet& iConfig)
+    : tokenSoAVertex_(consumes<ZVertexHost>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"))),
+      tokenBeamSpot_(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpotSrc"))),
+      topFolderName_(iConfig.getParameter<std::string>("topFolderName")) {}
+
+//
+// -- Analyze
+//
+// Fill the vertex monitoring histograms for one event. The x and y of each vertex are derived
+// from the beam-spot position and slopes at the vertex z; without a beam spot they are zero.
+// If the vertex collection is missing, a warning is issued and nothing is filled.
+void SiPixelMonitorVertexSoAAlpaka::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) {
+  const auto& vertexHandle = iEvent.getHandle(tokenSoAVertex_);
+  if (!vertexHandle.isValid()) {
+    edm::LogWarning("SiPixelMonitorVertexSoAAlpaka") << "No Vertex SoA found \n returning!" << std::endl;
+    return;
+  }
+
+  auto const& vertexView = vertexHandle->view();
+  const int nVertices = vertexView.nvFinal();
+
+  // beam-spot parameters; they stay at zero when no beam spot is available
+  float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.;
+  const auto beamSpotHandle = iEvent.getHandle(tokenBeamSpot_);
+  if (beamSpotHandle.isValid()) {
+    const reco::BeamSpot& beamSpot = *beamSpotHandle;
+    x0 = beamSpot.x0();
+    y0 = beamSpot.y0();
+    z0 = beamSpot.z0();
+    dxdz = beamSpot.dxdz();
+    dydz = beamSpot.dydz();
+  } else {
+    edm::LogWarning("SiPixelMonitorVertexSoAAlpaka") << "No beamspot found. returning vertexes with (0,0,Z) ";
+  }
+
+  for (int iv = 0; iv < nVertices; iv++) {
+    // sortInd() maps the loop position to the index of the vertex to read
+    const auto sortedIdx = vertexView[iv].sortInd();
+    const auto zLocal = vertexView[sortedIdx].zv();
+
+    hx->Fill(x0 + dxdz * zLocal);
+    hy->Fill(y0 + dydz * zLocal);
+    // the stored z is shifted by the beam-spot z0
+    hz->Fill(zLocal + z0);
+
+    const auto ndof = vertexView[sortedIdx].ndof();
+    hchi2->Fill(vertexView[sortedIdx].chi2());
+    hchi2oNdof->Fill(vertexView[sortedIdx].chi2() / ndof);
+    hptv2->Fill(vertexView[sortedIdx].ptv2());
+    // the number of associated tracks is filled as ndof + 1
+    hntrks->Fill(ndof + 1);
+  }
+  hnVertex->Fill(nVertices);
+}
+
+//
+// -- Book Histograms
+//
+// Book the vertex histograms in the folder given by topFolderName_.
+void SiPixelMonitorVertexSoAAlpaka::bookHistograms(DQMStore::IBooker& ibooker,
+                                                   edm::Run const& iRun,
+                                                   edm::EventSetup const& iSetup) {
+  ibooker.cd();
+  ibooker.setCurrentFolder(topFolderName_);
+
+  // per-event multiplicity
+  hnVertex = ibooker.book1D("nVertex", ";# of Vertices;#entries", 101, -0.5, 100.5);
+
+  // vertex position
+  hx = ibooker.book1D("vx", ";Vertex x;#entries", 10, -5., 5.);
+  hy = ibooker.book1D("vy", ";Vertex y;#entries", 10, -5., 5.);
+  hz = ibooker.book1D("vz", ";Vertex z;#entries", 30, -30., 30);
+
+  // fit quality, sum of pT squared and number of associated tracks
+  hchi2 = ibooker.book1D("chi2", ";Vertex chi-squared;#entries", 40, 0., 20.);
+  hchi2oNdof = ibooker.book1D("chi2oNdof", ";Vertex chi-squared/Ndof;#entries", 40, 0., 20.);
+  hptv2 = ibooker.book1D("ptsq", ";Vertex #sum (p_{T})^{2};#entries", 200, 0., 200.);
+  hntrks = ibooker.book1D("ntrk", ";#tracks associated;#entries", 100, -0.5, 99.5);
+}
+
+// Declare the configuration parameters of this module together with their default values.
+void SiPixelMonitorVertexSoAAlpaka::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription psetDesc;
+
+  // input collections: the vertex SoA and the beam spot
+  psetDesc.add<edm::InputTag>("pixelVertexSrc", edm::InputTag("pixelVerticesAlpaka"));
+  psetDesc.add<edm::InputTag>("beamSpotSrc", edm::InputTag("offlineBeamSpot"));
+
+  // DQM folder in which the histograms are booked
+  psetDesc.add<std::string>("topFolderName", "SiPixelHeterogeneous/PixelVertexAlpaka");
+
+  descriptions.addWithDefaultLabel(psetDesc);
+}
+
+// Register the analyzer as a framework module.
+DEFINE_FWK_MODULE(SiPixelMonitorVertexSoAAlpaka);
diff --git a/DQM/SiPixelHeterogeneous/python/SiPixelHeterogenousDQM_FirstStep_cff.py b/DQM/SiPixelHeterogeneous/python/SiPixelHeterogenousDQM_FirstStep_cff.py
index dfb83708c95cf..95245a3fea968 100644
--- a/DQM/SiPixelHeterogeneous/python/SiPixelHeterogenousDQM_FirstStep_cff.py
+++ b/DQM/SiPixelHeterogeneous/python/SiPixelHeterogenousDQM_FirstStep_cff.py
@@ -7,20 +7,35 @@
 from DQM.SiPixelHeterogeneous.siPixelPhase2MonitorTrackSoA_cfi import *
 from DQM.SiPixelHeterogeneous.siPixelHIonPhase1MonitorTrackSoA_cfi import *
 from DQM.SiPixelHeterogeneous.siPixelMonitorVertexSoA_cfi import *
+# Alpaka Modules
+from Configuration.ProcessModifiers.alpaka_cff import alpaka
+from DQM.SiPixelHeterogeneous.siPixelPhase1MonitorRecHitsSoAAlpaka_cfi import *
+from DQM.SiPixelHeterogeneous.siPixelPhase2MonitorRecHitsSoAAlpaka_cfi import *
+from DQM.SiPixelHeterogeneous.siPixelHIonPhase1MonitorRecHitsSoAAlpaka_cfi import *
+from DQM.SiPixelHeterogeneous.siPixelPhase1MonitorTrackSoAAlpaka_cfi import *
+from DQM.SiPixelHeterogeneous.siPixelPhase2MonitorTrackSoAAlpaka_cfi import *
+from DQM.SiPixelHeterogeneous.siPixelHIonPhase1MonitorTrackSoAAlpaka_cfi import *
+from DQM.SiPixelHeterogeneous.siPixelMonitorVertexSoAAlpaka_cfi import *
 
 # Run-3 sequence
 monitorpixelSoASource = cms.Sequence(siPixelPhase1MonitorRecHitsSoA * siPixelPhase1MonitorTrackSoA * siPixelMonitorVertexSoA)
-
+# Run-3 Alpaka sequence: takes the place of monitorpixelSoASource when the alpaka modifier is active
+monitorpixelSoASourceAlpaka = cms.Sequence(siPixelPhase1MonitorRecHitsSoAAlpaka * siPixelPhase1MonitorTrackSoAAlpaka * siPixelMonitorVertexSoAAlpaka)
+alpaka.toReplaceWith(monitorpixelSoASource, monitorpixelSoASourceAlpaka)
 # Phase-2 sequence
 from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker
 _monitorpixelSoARecHitsSource = cms.Sequence(siPixelPhase2MonitorRecHitsSoA * siPixelPhase2MonitorTrackSoA * siPixelMonitorVertexSoA)
-phase2_tracker.toReplaceWith(monitorpixelSoASource, _monitorpixelSoARecHitsSource)
+(phase2_tracker & ~alpaka).toReplaceWith(monitorpixelSoASource, _monitorpixelSoARecHitsSource)
+_monitorpixelSoARecHitsSourceAlpaka = cms.Sequence(siPixelPhase2MonitorRecHitsSoAAlpaka * siPixelPhase2MonitorTrackSoAAlpaka * siPixelMonitorVertexSoAAlpaka)
+(phase2_tracker & alpaka).toReplaceWith(monitorpixelSoASource, _monitorpixelSoARecHitsSourceAlpaka)
 
 # HIon Phase 1 sequence
 from Configuration.ProcessModifiers.pp_on_AA_cff import pp_on_AA
 
 _monitorpixelSoARecHitsSourceHIon = cms.Sequence(siPixelHIonPhase1MonitorRecHitsSoA * siPixelHIonPhase1MonitorTrackSoA * siPixelMonitorVertexSoA)
 (pp_on_AA & ~phase2_tracker).toReplaceWith(monitorpixelSoASource, _monitorpixelSoARecHitsSourceHIon)
+_monitorpixelSoARecHitsSourceHIonAlpaka = cms.Sequence(siPixelHIonPhase1MonitorRecHitsSoAAlpaka * siPixelHIonPhase1MonitorTrackSoAAlpaka * siPixelMonitorVertexSoAAlpaka)
+(pp_on_AA & ~phase2_tracker & alpaka).toReplaceWith(monitorpixelSoASource, _monitorpixelSoARecHitsSourceHIonAlpaka)
 
 #Define the sequence for GPU vs CPU validation
 #This should run:- individual monitor for the 2 collections + comparison module
@@ -33,6 +48,14 @@
 from DQM.SiPixelHeterogeneous.siPixelCompareVertexSoA_cfi import *
 from DQM.SiPixelHeterogeneous.siPixelPhase1RawDataErrorComparator_cfi import *
 from DQM.SiPixelPhase1Common.SiPixelPhase1RawData_cfi import *
+#Alpaka
+from DQM.SiPixelHeterogeneous.siPixelPhase1CompareRecHitsSoAAlpaka_cfi import *
+from DQM.SiPixelHeterogeneous.siPixelPhase2CompareRecHitsSoAAlpaka_cfi import *
+from DQM.SiPixelHeterogeneous.siPixelHIonPhase1CompareRecHitsSoAAlpaka_cfi import *
+from DQM.SiPixelHeterogeneous.siPixelPhase1CompareTrackSoAAlpaka_cfi import *
+from DQM.SiPixelHeterogeneous.siPixelPhase2CompareTrackSoAAlpaka_cfi import *
+from DQM.SiPixelHeterogeneous.siPixelHIonPhase1CompareTrackSoAAlpaka_cfi import *
+from DQM.SiPixelHeterogeneous.siPixelCompareVertexSoAAlpaka_cfi import *
 
 # digi errors
 SiPixelPhase1RawDataConfForCPU = copy.deepcopy(SiPixelPhase1RawDataConf)
@@ -126,6 +149,43 @@
   topFolderName = 'SiPixelHeterogeneous/PixelVertexSoAGPU',
 )
 
+### Alpaka
+
+# PixelRecHits: monitor of CPUSerial product (Alpaka backend: 'serial_sync')
+siPixelRecHitsSoAMonitorSerial = siPixelPhase1MonitorRecHitsSoAAlpaka.clone(
+    pixelHitsSrc = cms.InputTag( 'siPixelRecHitsPreSplittingAlpakaSerial' ),
+    TopFolderName = cms.string( 'SiPixelHeterogeneous/PixelRecHitsSerial' )
+)
+
+# PixelRecHits: monitor of Device product (Alpaka backend: '')
+siPixelRecHitsSoAMonitorDevice = siPixelPhase1MonitorRecHitsSoAAlpaka.clone(
+    pixelHitsSrc = cms.InputTag( 'siPixelRecHitsPreSplittingAlpaka' ),
+    TopFolderName = cms.string( 'SiPixelHeterogeneous/PixelRecHitsDevice' )
+)
+
+# PixelTracks: monitor of CPUSerial product (Alpaka backend: 'serial_sync')
+siPixelTrackSoAMonitorSerial = siPixelPhase1MonitorTrackSoAAlpaka.clone(
+    pixelTrackSrc = cms.InputTag('pixelTracksAlpakaSerial'),
+    topFolderName = cms.string('SiPixelHeterogeneous/PixelTrackSerial')
+)
+
+# PixelTracks: monitor of Device product (Alpaka backend: '')
+siPixelTrackSoAMonitorDevice = siPixelPhase1MonitorTrackSoAAlpaka.clone(
+    pixelTrackSrc = cms.InputTag('pixelTracksAlpaka'),
+    topFolderName = cms.string('SiPixelHeterogeneous/PixelTrackDevice')
+)
+
+# PixelVertices: monitor of CPUSerial product (Alpaka backend: 'serial_sync')
+siPixelVertexSoAMonitorSerial = siPixelMonitorVertexSoAAlpaka.clone(
+    pixelVertexSrc = cms.InputTag("pixelVerticesAlpakaSerial"),
+    topFolderName = cms.string('SiPixelHeterogeneous/PixelVertexSerial')
+)
+
+siPixelVertexSoAMonitorDevice = siPixelMonitorVertexSoAAlpaka.clone(
+    pixelVertexSrc = cms.InputTag("pixelVerticesAlpaka"),
+    topFolderName = cms.string('SiPixelHeterogeneous/PixelVertexDevice')
+)
+
+
 # Run-3 sequence
 monitorpixelSoACompareSource = cms.Sequence(siPixelPhase1MonitorRawDataACPU *
                                             siPixelPhase1MonitorRawDataAGPU *
@@ -139,6 +199,17 @@
                                             siPixelMonitorVertexSoAGPU *
                                             siPixelCompareVertexSoA *
                                             siPixelPhase1RawDataErrorComparator)
+# and the Alpaka version: Serial monitor, Device monitor and their comparison, for hits, tracks and vertices
+monitorpixelSoACompareSourceAlpaka = cms.Sequence(
+                                            siPixelRecHitsSoAMonitorSerial *
+                                            siPixelRecHitsSoAMonitorDevice *
+                                            siPixelPhase1CompareRecHitsSoAAlpaka *
+                                            siPixelTrackSoAMonitorSerial *
+                                            siPixelTrackSoAMonitorDevice *
+                                            siPixelPhase1CompareTrackSoAAlpaka *
+                                            siPixelVertexSoAMonitorSerial *
+                                            siPixelVertexSoAMonitorDevice *
+                                            siPixelCompareVertexSoAAlpaka )
 
 # Phase-2 sequence
 _monitorpixelSoACompareSource =  cms.Sequence(siPixelPhase2MonitorRecHitsSoACPU *
@@ -166,3 +237,6 @@
 
 from Configuration.ProcessModifiers.gpuValidationPixel_cff import gpuValidationPixel
 gpuValidationPixel.toReplaceWith(monitorpixelSoASource, monitorpixelSoACompareSource)
+
+from Configuration.ProcessModifiers.alpakaValidationPixel_cff import alpakaValidationPixel
+(alpakaValidationPixel & ~gpuValidationPixel).toReplaceWith(monitorpixelSoASource, monitorpixelSoACompareSourceAlpaka)
diff --git a/DataFormats/TrackSoA/BuildFile.xml b/DataFormats/TrackSoA/BuildFile.xml
new file mode 100644
index 0000000000000..ac764cf5b95ff
--- /dev/null
+++ b/DataFormats/TrackSoA/BuildFile.xml
@@ -0,0 +1,12 @@
+<use name="alpaka"/>
+<use name="rootcore"/>
+<use name="eigen"/>
+<use name="DataFormats/Common"/>
+<use name="DataFormats/Portable"/>
+<use name="DataFormats/SoATemplate" source_only="1"/>
+<use name="DataFormats/TrackerCommon" source_only="1"/>
+<use name="HeterogeneousCore/AlpakaInterface"/>
+<flags ALPAKA_BACKENDS="!serial"/>
+<export>
+    <lib name="1"/>
+</export>
diff --git a/DataFormats/TrackSoA/README.md b/DataFormats/TrackSoA/README.md
new file mode 100644
index 0000000000000..433dfb0d656c7
--- /dev/null
+++ b/DataFormats/TrackSoA/README.md
@@ -0,0 +1,60 @@
+# TrackSoA Data Formats
+
+`DataFormat`s meant to be used on Host (CPU) or Device (GPU) for
+storing information about `TrackSoA`s created during the Pixel-local Reconstruction
+chain. It stores data in an SoA manner.  
+
+The host format is inheriting from `DataFormats/Portable/interface/PortableHostCollection.h`,
+while the device format is inheriting from `DataFormats/Portable/interface/PortableDeviceCollection.h`
+
+Both formats use the same SoA Layout (`TrackSoA::Layout`) which is generated
+via the `GENERATE_SOA_LAYOUT` macro in the `TracksSoA.h` file.
+
+## Notes
+
+- `hitIndices` and `detIndices`, instances of `HitContainer`, have been added into the
+layout as `SOA_SCALAR`s, meaning that they manage their own data independently from the SoA
+`Layout`. This could be improved in the future, if `HitContainer` (aka a `OneToManyAssoc` of fixed size)
+is replaced, but there don't seem to be any conflicts in including it in the `Layout` like this.
+- Host and Device classes should **not** be created via inheritance, as they're done here,
+but via composition. See [this discussion](https://github.com/cms-sw/cmssw/pull/40465#discussion_r1066039309).
+
+## TracksHost
+
+The version of the data format to be used for storing `TrackSoA` data on the CPU. 
+Instances of this class are to be used for:
+
+- Having a place to copy data to host from device, via `Memcpy`, or
+- Running host-side algorithms using data stored in an SoA manner.
+
+## TracksDevice
+
+The version of the data format to be used for storing `TrackSoA` data on the GPU.
+
+Instances of `TracksDevice` are to be created on host and be
+used on device only. To do so, the instance's `view()` method is to be called
+to pass a `View` to any kernel launched. Accessing data from the `view()` is not
+possible on the host side.
+
+## TracksSoACollection
+
+Depending on the Alpaka accelerator back-end enabled, `TracksSoACollection` is an alias to either the Host or Device SoA:
+
+```cpp
+template <typename TrackerTraits>
+  using TracksSoACollection = std::conditional_t<std::is_same_v<Device, alpaka::DevCpu>,
+                                                 TracksHost<TrackerTraits>,
+                                                 TracksDevice<TrackerTraits, Device>>;
+```
+
+## Utilities
+
+`alpaka/TrackUtilities.h` contains a collection of methods which were originally
+defined as class methods inside either `TrackSoAHeterogeneousT` or `TrajectoryStateSoAT`
+and which have been adapted to operate on `View` instances, so that they are callable
+from within `__global__` kernels, on both CPU and GPU.
+
+## Use case
+
+See `test/TrackSoAHeterogeneous_test.cpp` for a simple example of instantiation,
+processing and copying from device to host.
diff --git a/DataFormats/TrackSoA/interface/TrackDefinitions.h b/DataFormats/TrackSoA/interface/TrackDefinitions.h
new file mode 100644
index 0000000000000..6bd36b5bd3cd1
--- /dev/null
+++ b/DataFormats/TrackSoA/interface/TrackDefinitions.h
@@ -0,0 +1,32 @@
+#ifndef DataFormats_Track_interface_TrackDefinitions_h
+#define DataFormats_Track_interface_TrackDefinitions_h
+#include <string>
+#include <algorithm>
+#include <stdexcept>
+
+namespace pixelTrack {
+
+  enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality };
+  constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)};
+  constexpr std::string_view qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"};
+  inline Quality qualityByName(std::string_view name) {  // name -> Quality; throws std::invalid_argument if unknown
+    auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName;
+    auto ret = static_cast<Quality>(qp);
+    // when the name is not found std::find returns the end of the array: index qualitySize, i.e. notQuality
+    if (ret == pixelTrack::Quality::notQuality)
+      throw std::invalid_argument(std::string(name) + " is not a pixelTrack::Quality!");
+
+    return ret;
+  }
+
+#ifdef GPU_SMALL_EVENTS
+  // kept for testing and debugging
+  constexpr uint32_t maxNumber() { return 2 * 1024; }
+#else
+  // tested on MC events with 55-75 pileup events
+  constexpr uint32_t maxNumber() { return 32 * 1024; }
+#endif
+
+}  // namespace pixelTrack
+
+#endif
diff --git a/DataFormats/TrackSoA/interface/TracksDevice.h b/DataFormats/TrackSoA/interface/TracksDevice.h
new file mode 100644
index 0000000000000..6ef28014bab63
--- /dev/null
+++ b/DataFormats/TrackSoA/interface/TracksDevice.h
@@ -0,0 +1,38 @@
+#ifndef DataFormats_Track_interface_TracksDevice_h
+#define DataFormats_Track_interface_TracksDevice_h
+
+#include <cstdint>
+#include <alpaka/alpaka.hpp>
+#include "DataFormats/TrackSoA/interface/TracksSoA.h"
+#include "DataFormats/TrackSoA/interface/TrackDefinitions.h"
+#include "DataFormats/Portable/interface/PortableDeviceCollection.h"
+
+// TODO: The class is created via inheritance of the PortableCollection.
+// This is generally discouraged, and should be done via composition.
+// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306
+template <typename TrackerTraits, typename TDev>
+class TracksDevice : public PortableDeviceCollection<reco::TrackLayout<TrackerTraits>, TDev> {
+public:
+  static constexpr int32_t S = TrackerTraits::maxNumberOfTuples;  //TODO: this could be made configurable at runtime
+  TracksDevice() = default;                                       // necessary for ROOT dictionaries
+
+  using PortableDeviceCollection<reco::TrackLayout<TrackerTraits>, TDev>::view;
+  using PortableDeviceCollection<reco::TrackLayout<TrackerTraits>, TDev>::const_view;
+  using PortableDeviceCollection<reco::TrackLayout<TrackerTraits>, TDev>::buffer;
+
+  // Forwards the fixed size S and the queue to the base collection (a template-id ctor name is invalid in C++20)
+  template <typename TQueue>
+  explicit TracksDevice(TQueue& queue)
+      : PortableDeviceCollection<reco::TrackLayout<TrackerTraits>, TDev>(S, queue) {}
+};
+
+namespace pixelTrack {
+
+  template <typename TDev>
+  using TracksDevicePhase1 = TracksDevice<pixelTopology::Phase1, TDev>;
+  template <typename TDev>
+  using TracksDevicePhase2 = TracksDevice<pixelTopology::Phase2, TDev>;
+
+}  // namespace pixelTrack
+
+#endif  // DataFormats_Track_interface_TracksDevice_h
diff --git a/DataFormats/TrackSoA/interface/TracksHost.h b/DataFormats/TrackSoA/interface/TracksHost.h
new file mode 100644
index 0000000000000..a8f459eac066c
--- /dev/null
+++ b/DataFormats/TrackSoA/interface/TracksHost.h
@@ -0,0 +1,42 @@
+#ifndef DataFormats_Track_TracksHost_H
+#define DataFormats_Track_TracksHost_H
+
+#include <cstdint>
+#include <alpaka/alpaka.hpp>
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+#include "DataFormats/TrackSoA/interface/TracksSoA.h"
+#include "DataFormats/TrackSoA/interface/TrackDefinitions.h"
+#include "DataFormats/Portable/interface/PortableHostCollection.h"
+
+// TODO: The class is created via inheritance of the PortableHostCollection.
+// This is generally discouraged, and should be done via composition.
+// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306
+template <typename TrackerTraits>
+class TracksHost : public PortableHostCollection<reco::TrackLayout<TrackerTraits>> {
+public:
+  static constexpr int32_t S = TrackerTraits::maxNumberOfTuples;  //TODO: this could be made configurable at runtime
+  TracksHost() = default;                                         // needed for the dictionary
+
+  using PortableHostCollection<reco::TrackLayout<TrackerTraits>>::view;
+  using PortableHostCollection<reco::TrackLayout<TrackerTraits>>::const_view;
+  using PortableHostCollection<reco::TrackLayout<TrackerTraits>>::buffer;
+
+  // Forwards the fixed size S and the queue to the base collection (a template-id ctor name is invalid in C++20)
+  template <typename TQueue>
+  explicit TracksHost(TQueue& queue)
+      : PortableHostCollection<reco::TrackLayout<TrackerTraits>>(S, queue) {}
+
+  // Constructor which specifies the DevHost
+  explicit TracksHost(alpaka_common::DevHost const& host)
+      : PortableHostCollection<reco::TrackLayout<TrackerTraits>>(S, host) {}
+};
+
+namespace pixelTrack {
+
+  using TracksHostPhase1 = TracksHost<pixelTopology::Phase1>;
+  using TracksHostPhase2 = TracksHost<pixelTopology::Phase2>;
+  using TracksHostHIonPhase1 = TracksHost<pixelTopology::HIonPhase1>;
+
+}  // namespace pixelTrack
+
+#endif  // DataFormats_Track_TracksHost_H
diff --git a/DataFormats/TrackSoA/interface/TracksSoA.h b/DataFormats/TrackSoA/interface/TracksSoA.h
new file mode 100644
index 0000000000000..bc3a8c4be9cb5
--- /dev/null
+++ b/DataFormats/TrackSoA/interface/TracksSoA.h
@@ -0,0 +1,56 @@
+#ifndef DataFormats_Track_interface_TrackLayout_h
+#define DataFormats_Track_interface_TrackLayout_h
+
+#include <Eigen/Core>
+#include "HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h"
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+#include "DataFormats/SoATemplate/interface/SoALayout.h"
+#include "DataFormats/TrackSoA/interface/TrackDefinitions.h"
+
+namespace reco {
+
+  template <typename TrackerTraits>
+  struct TrackSoA {
+    static constexpr int32_t S = TrackerTraits::maxNumberOfTuples;  // used below to size HitContainer
+    static constexpr int32_t H = TrackerTraits::avgHitsPerTrack;    // used below to size HitContainer
+    // Aliases so that the column types handed to the GENERATE_SOA_LAYOUT
+    // macro are plain identifiers, without colons or angled brackets.
+    using Vector5f = Eigen::Matrix<float, 5, 1>;
+    using Vector15f = Eigen::Matrix<float, 15, 1>;
+    using Quality = pixelTrack::Quality;
+
+    using hindex_type = uint32_t;
+
+    using HitContainer = cms::alpakatools::OneToManyAssocSequential<hindex_type, S + 1, H * S>;
+
+    GENERATE_SOA_LAYOUT(Layout,
+                        SOA_COLUMN(Quality, quality),
+                        SOA_COLUMN(float, chi2),
+                        SOA_COLUMN(int8_t, nLayers),
+                        SOA_COLUMN(float, eta),
+                        SOA_COLUMN(float, pt),
+                        SOA_EIGEN_COLUMN(Vector5f, state),
+                        SOA_EIGEN_COLUMN(Vector15f, covariance),
+                        SOA_SCALAR(int, nTracks),
+                        SOA_SCALAR(HitContainer, hitIndices),
+                        SOA_SCALAR(HitContainer, detIndices))
+  };
+
+  template <typename TrackerTraits>
+  using TrackLayout = typename reco::TrackSoA<TrackerTraits>::template Layout<>;
+  template <typename TrackerTraits>
+  using TrackSoAView = typename reco::TrackSoA<TrackerTraits>::template Layout<>::View;
+  template <typename TrackerTraits>
+  using TrackSoAConstView = typename reco::TrackSoA<TrackerTraits>::template Layout<>::ConstView;
+
+  template <typename TrackerTraits>
+  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr float charge(const TrackSoAConstView<TrackerTraits> &tracks,
+                                                                    int32_t i) {
+    // sign of state()(2) as -1, 0 or +1; std::copysign(1.f, v), which never yields 0, is constexpr only from C++23
+    float v = tracks[i].state()(2);
+    return float((0.0f < v) - (v < 0.0f));
+  }
+
+}  // namespace reco
+
+#endif
diff --git a/DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h b/DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h
new file mode 100644
index 0000000000000..8affb29845779
--- /dev/null
+++ b/DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h
@@ -0,0 +1,197 @@
+#ifndef DataFormats_Track_interface_alpaka_TrackUtilities_h
+#define DataFormats_Track_interface_alpaka_TrackUtilities_h
+
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+#include "DataFormats/TrackSoA/interface/TrackDefinitions.h"
+#include "DataFormats/TrackSoA/interface/TracksSoA.h"
+
+// Methods that operate on View and ConstView of the TrackSoA, and cannot be class methods.
+template <typename TrackerTraits>
+struct TracksUtilities {
+  using TrackSoAView = typename reco::TrackSoA<TrackerTraits>::template Layout<>::View;
+  using TrackSoAConstView = typename reco::TrackSoA<TrackerTraits>::template Layout<>::ConstView;
+  using hindex_type = typename reco::TrackSoA<TrackerTraits>::hindex_type;
+
+  // State at the Beam spot
+  // phi,tip,1/pt,cotan(theta),zip
+  //
+  // The accessors below read components 0 (phi), 1 (tip) and 4 (zip) of state().
+  // The charge, i.e. the sign of component 2, is not provided here: it is the
+  // free function reco::charge() in DataFormats/TrackSoA/interface/TracksSoA.h.
+  // Every function receives the view explicitly, together with the index i of
+  // the track it operates on.
+  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr float phi(const TrackSoAConstView &tracks, int32_t i) {
+    return tracks[i].state()(0);
+  }
+
+  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr float tip(const TrackSoAConstView &tracks, int32_t i) {
+    return tracks[i].state()(1);
+  }
+
+  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr float zip(const TrackSoAConstView &tracks, int32_t i) {
+    return tracks[i].state()(4);
+  }
+
+  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr bool isTriplet(const TrackSoAConstView &tracks, int i) {
+    return tracks[i].nLayers() == 3;
+  }
+
+  template <typename V3, typename M3, typename V2, typename M2>
+  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr void copyFromCircle(
+      TrackSoAView &tracks, V3 const &cp, M3 const &ccov, V2 const &lp, M2 const &lcov, float b, int32_t i) {
+    tracks[i].state() << cp.template cast<float>(), lp.template cast<float>();
+
+    tracks[i].state()(2) = tracks[i].state()(2) * b;  // component 2 is rescaled by b, as are its covariance terms
+    auto cov = tracks[i].covariance();
+    cov(0) = ccov(0, 0);
+    cov(1) = ccov(0, 1);
+    cov(2) = b * float(ccov(0, 2));
+    cov(4) = cov(3) = 0;
+    cov(5) = ccov(1, 1);
+    cov(6) = b * float(ccov(1, 2));
+    cov(8) = cov(7) = 0;
+    cov(9) = b * b * float(ccov(2, 2));
+    cov(11) = cov(10) = 0;
+    cov(12) = lcov(0, 0);
+    cov(13) = lcov(0, 1);
+    cov(14) = lcov(1, 1);
+  }
+
+  template <typename V5, typename M5>
+  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr void copyFromDense(TrackSoAView &tracks,
+                                                                          V5 const &v,
+                                                                          M5 const &cov,
+                                                                          int32_t i) {
+    tracks[i].state() = v.template cast<float>();
+    for (int j = 0, ind = 0; j < 5; ++j)  // upper triangle of cov, row by row, into the packed covariance
+      for (auto k = j; k < 5; ++k)
+        tracks[i].covariance()(ind++) = cov(j, k);
+  }
+
+  template <typename V5, typename M5>
+  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr void copyToDense(const TrackSoAConstView &tracks,
+                                                                        V5 &v,
+                                                                        M5 &cov,
+                                                                        int32_t i) {
+    v = tracks[i].state().template cast<typename V5::Scalar>();
+    for (int j = 0, ind = 0; j < 5; ++j) {  // inverse of copyFromDense: unpack and mirror into both triangles
+      cov(j, j) = tracks[i].covariance()(ind++);
+      for (auto k = j + 1; k < 5; ++k)
+        cov(k, j) = cov(j, k) = tracks[i].covariance()(ind++);
+    }
+  }
+
+  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr int computeNumberOfLayers(const TrackSoAConstView &tracks,
+                                                                                 int32_t i) {
+    auto pdet = tracks.detIndices().begin(i);
+    int nl = 1;
+    auto ol = pixelTopology::getLayer<TrackerTraits>(*pdet);  // NOTE(review): reads *begin(i) even if track i is empty
+    for (; pdet < tracks.detIndices().end(i); ++pdet) {
+      auto il = pixelTopology::getLayer<TrackerTraits>(*pdet);
+      if (il != ol)  // one more layer each time the layer differs from that of the previous hit
+        ++nl;
+      ol = il;
+    }
+    return nl;
+  }
+
+  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr int nHits(const TrackSoAConstView &tracks, int i) {
+    return tracks.detIndices().size(i);
+  }
+};
+
+namespace pixelTrack {
+
+  template <typename TrackerTraits, typename Enable = void>
+  struct QualityCutsT {};
+
+  template <typename TrackerTraits>
+  struct QualityCutsT<TrackerTraits, pixelTopology::isPhase1Topology<TrackerTraits>> {
+    using TrackSoAView = typename reco::TrackSoA<TrackerTraits>::template Layout<>::View;
+    using TrackSoAConstView = typename reco::TrackSoA<TrackerTraits>::template Layout<>::ConstView;
+    using tracksHelper = TracksUtilities<TrackerTraits>;
+    float chi2Coeff[4];  // strictCut() below uses only elements 0 and 1
+    float chi2MaxPt;     // GeV
+    float chi2Scale;
+
+    struct Region {
+      float maxTip;  // cm
+      float minPt;   // GeV
+      float maxZip;  // cm
+    };
+
+    Region triplet;
+    Region quadruplet;
+
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool isHP(const TrackSoAConstView &tracks, int nHits, int it) const {
+      // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip)
+      // default cuts:
+      //   - for triplets:    |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm
+      //   - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm
+      // (see CAHitNtupletGeneratorGPU.cc)
+      auto const &region = (nHits > 3) ? quadruplet : triplet;
+      return (std::abs(tracksHelper::tip(tracks, it)) < region.maxTip) and (tracks.pt(it) > region.minPt) and
+             (std::abs(tracksHelper::zip(tracks, it)) < region.maxZip);
+    }
+
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool strictCut(const TrackSoAConstView &tracks, int it) const {
+      auto roughLog = [](float x) {
+        // max diff [0.5,12] at 1.25 0.16143
+        // average diff  0.0662998
+        union IF {
+          uint32_t i;
+          float f;
+        };
+        IF z;
+        z.f = x;
+        uint32_t lsb = 1 < 21;  // NOTE(review): a comparison, not a shift: evaluates to 1 (was '<<' intended?)
+        z.i += lsb;
+        z.i >>= 21;  // drops the 21 low mantissa bits: the two top mantissa bits and the exponent remain
+        auto f = z.i & 3;
+        int ex = int(z.i >> 2) - 127;
+
+        // log2(1+0.25*f)
+        // averaged over bins
+        const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f};
+        return float(ex) + frac[f];
+      };
+
+      float pt = std::min<float>(tracks.pt(it), chi2MaxPt);
+      float chi2Cut = chi2Scale * (chi2Coeff[0] + roughLog(pt) * chi2Coeff[1]);
+      if (tracks.chi2(it) >= chi2Cut) {  // true means the chi2 is at or above the pt-dependent threshold
+#ifdef NTUPLE_FIT_DEBUG
+        printf("Bad chi2 %d pt %f eta %f chi2 %f\n", it, tracks.pt(it), tracks.eta(it), tracks.chi2(it));
+#endif
+        return true;
+      }
+      return false;
+    }
+  };
+
+  template <typename TrackerTraits>
+  struct QualityCutsT<TrackerTraits, pixelTopology::isPhase2Topology<TrackerTraits>> {
+    using TrackSoAView = typename reco::TrackSoA<TrackerTraits>::template Layout<>::View;
+    using TrackSoAConstView = typename reco::TrackSoA<TrackerTraits>::template Layout<>::ConstView;
+    using tracksHelper = TracksUtilities<TrackerTraits>;
+
+    float maxChi2;  // strictCut() returns true for chi2 >= maxChi2
+    float minPt;    // isHP() requires pt > minPt
+    float maxTip;   // isHP() requires |tip| < maxTip
+    float maxZip;   // isHP() requires |zip| < maxZip
+
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool isHP(const TrackSoAConstView &tracks, int nHits, int it) const {
+      return (std::abs(tracksHelper::tip(tracks, it)) < maxTip) and (tracks.pt(it) > minPt) and
+             (std::abs(tracksHelper::zip(tracks, it)) < maxZip);  // nHits is not used by this specialization
+    }
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool strictCut(const TrackSoAConstView &tracks, int it) const {
+      return tracks.chi2(it) >= maxChi2;
+    }
+  };
+
+}  // namespace pixelTrack
+
+// TODO: Should those be placed in the ALPAKA_ACCELERATOR_NAMESPACE
+template struct TracksUtilities<pixelTopology::Phase1>;
+template struct TracksUtilities<pixelTopology::Phase2>;
+
+#endif
diff --git a/DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h b/DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h
new file mode 100644
index 0000000000000..62e9f69e34636
--- /dev/null
+++ b/DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h
@@ -0,0 +1,52 @@
+#ifndef DataFormats_Track_interface_alpaka_TracksSoACollection_h
+#define DataFormats_Track_interface_alpaka_TracksSoACollection_h
+
+#include <cstdint>
+#include <alpaka/alpaka.hpp>
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "DataFormats/Portable/interface/alpaka/PortableCollection.h"
+#include "DataFormats/TrackSoA/interface/TracksSoA.h"
+#include "DataFormats/TrackSoA/interface/TrackDefinitions.h"
+#include "DataFormats/TrackSoA/interface/TracksHost.h"
+#include "DataFormats/TrackSoA/interface/TracksDevice.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/CopyToHost.h"
+
+// TODO: The class is created via inheritance of the PortableCollection.
+// This is generally discouraged, and should be done via composition.
+// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+
+  template <typename TrackerTraits>
+  using TracksSoACollection = std::conditional_t<std::is_same_v<Device, alpaka::DevCpu>,
+                                                 TracksHost<TrackerTraits>,
+                                                 TracksDevice<TrackerTraits, Device>>;
+
+  //Classes definition for Phase1/Phase2/HIonPhase1, to make the classes_def lighter. Not actually used in the code.
+  namespace pixelTrack {
+    using TracksSoACollectionPhase1 = TracksSoACollection<pixelTopology::Phase1>;
+    using TracksSoACollectionPhase2 = TracksSoACollection<pixelTopology::Phase2>;
+    using TracksSoACollectionHIonPhase1 = TracksSoACollection<pixelTopology::HIonPhase1>;
+  }  // namespace pixelTrack
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+
+namespace cms::alpakatools {
+  template <typename TrackerTraits, typename TDevice>
+  struct CopyToHost<TracksDevice<TrackerTraits, TDevice>> {  // specialization for the track device collection
+    template <typename TQueue>
+    static auto copyAsync(TQueue& queue, TracksDevice<TrackerTraits, TDevice> const& deviceData) {
+      ::TracksHost<TrackerTraits> hostData(queue);  // host collection with the same fixed size S
+      alpaka::memcpy(queue, hostData.buffer(), deviceData.buffer());  // whole-buffer copy issued on queue
+#ifdef GPU_DEBUG
+      printf("TracksSoACollection: I'm copying to host.\n");
+#endif
+      return hostData;
+    }
+  };
+}  // namespace cms::alpakatools
+
+ASSERT_DEVICE_MATCHES_HOST_COLLECTION(pixelTrack::TracksSoACollectionPhase1, pixelTrack::TracksHostPhase1);
+ASSERT_DEVICE_MATCHES_HOST_COLLECTION(pixelTrack::TracksSoACollectionPhase2, pixelTrack::TracksHostPhase2);
+ASSERT_DEVICE_MATCHES_HOST_COLLECTION(pixelTrack::TracksSoACollectionHIonPhase1, pixelTrack::TracksHostHIonPhase1);
+
+#endif  // DataFormats_Track_interface_alpaka_TracksSoACollection_h
diff --git a/DataFormats/TrackSoA/src/alpaka/classes_cuda.h b/DataFormats/TrackSoA/src/alpaka/classes_cuda.h
new file mode 100644
index 0000000000000..4783184611401
--- /dev/null
+++ b/DataFormats/TrackSoA/src/alpaka/classes_cuda.h
@@ -0,0 +1,14 @@
+
+#ifndef DataFormats_TrackSoA_src_alpaka_classes_cuda_h
+#define DataFormats_TrackSoA_src_alpaka_classes_cuda_h
+
+#include "DataFormats/Common/interface/DeviceProduct.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/TrackSoA/interface/TracksSoA.h"
+#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h"
+#include "DataFormats/TrackSoA/interface/TracksDevice.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+
+using namespace reco;
+
+#endif  // DataFormats_TrackSoA_src_alpaka_classes_cuda_h
diff --git a/DataFormats/TrackSoA/src/alpaka/classes_cuda_def.xml b/DataFormats/TrackSoA/src/alpaka/classes_cuda_def.xml
new file mode 100644
index 0000000000000..c04ca173c49f9
--- /dev/null
+++ b/DataFormats/TrackSoA/src/alpaka/classes_cuda_def.xml
@@ -0,0 +1,10 @@
+<lcgdict>
+  <class name="alpaka_cuda_async::PortableCollection<TrackLayout<pixelTopology::Phase1>>" persistent="false"/>
+  <class name="alpaka_cuda_async::pixelTrack::TracksSoACollectionPhase1" persistent="false"/>
+  <class name="edm::DeviceProduct<alpaka_cuda_async::pixelTrack::TracksSoACollectionPhase1>" persistent="false"/>
+  <class name="edm::Wrapper<edm::DeviceProduct<alpaka_cuda_async::pixelTrack::TracksSoACollectionPhase1>>" persistent="false"/>
+  <class name="alpaka_cuda_async::PortableCollection<TrackLayout<pixelTopology::Phase2>>" persistent="false"/>
+  <class name="alpaka_cuda_async::pixelTrack::TracksSoACollectionPhase2" persistent="false"/>
+  <class name="edm::DeviceProduct<alpaka_cuda_async::pixelTrack::TracksSoACollectionPhase2>" persistent="false"/>
+  <class name="edm::Wrapper<edm::DeviceProduct<alpaka_cuda_async::pixelTrack::TracksSoACollectionPhase2>>" persistent="false"/>
+</lcgdict>
diff --git a/DataFormats/TrackSoA/src/alpaka/classes_rocm.h b/DataFormats/TrackSoA/src/alpaka/classes_rocm.h
new file mode 100644
index 0000000000000..38143a6058c36
--- /dev/null
+++ b/DataFormats/TrackSoA/src/alpaka/classes_rocm.h
@@ -0,0 +1,14 @@
+
+#ifndef DataFormats_TrackSoA_src_alpaka_classes_rocm_h
+#define DataFormats_TrackSoA_src_alpaka_classes_rocm_h
+
+#include "DataFormats/Common/interface/DeviceProduct.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/TrackSoA/interface/TracksSoA.h"
+#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h"
+#include "DataFormats/TrackSoA/interface/TracksDevice.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+
+using namespace reco;
+
+#endif  // DataFormats_TrackSoA_src_alpaka_classes_rocm_h
diff --git a/DataFormats/TrackSoA/src/alpaka/classes_rocm_def.xml b/DataFormats/TrackSoA/src/alpaka/classes_rocm_def.xml
new file mode 100644
index 0000000000000..b7e40aedead42
--- /dev/null
+++ b/DataFormats/TrackSoA/src/alpaka/classes_rocm_def.xml
@@ -0,0 +1,10 @@
+<lcgdict>
+  <class name="alpaka_rocm_async::PortableCollection<TrackLayout<pixelTopology::Phase1>>" persistent="false"/>
+  <class name="alpaka_rocm_async::pixelTrack::TracksSoACollectionPhase1" persistent="false"/>
+  <class name="edm::DeviceProduct<alpaka_rocm_async::pixelTrack::TracksSoACollectionPhase1>" persistent="false"/>
+  <class name="edm::Wrapper<edm::DeviceProduct<alpaka_rocm_async::pixelTrack::TracksSoACollectionPhase1>>" persistent="false"/>
+  <class name="alpaka_rocm_async::PortableCollection<TrackLayout<pixelTopology::Phase2>>" persistent="false"/>
+  <class name="alpaka_rocm_async::pixelTrack::TracksSoACollectionPhase2" persistent="false"/>
+  <class name="edm::DeviceProduct<alpaka_rocm_async::pixelTrack::TracksSoACollectionPhase2>" persistent="false"/>
+  <class name="edm::Wrapper<edm::DeviceProduct<alpaka_rocm_async::pixelTrack::TracksSoACollectionPhase2>>" persistent="false"/>
+</lcgdict>
diff --git a/DataFormats/TrackSoA/src/classes.cc b/DataFormats/TrackSoA/src/classes.cc
new file mode 100644
index 0000000000000..97e00cc5b5638
--- /dev/null
+++ b/DataFormats/TrackSoA/src/classes.cc
@@ -0,0 +1,9 @@
+#include "DataFormats/Portable/interface/PortableHostCollectionReadRules.h"
+#include "DataFormats/TrackSoA/interface/TracksSoA.h"
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+
+using namespace reco;
+
+SET_PORTABLEHOSTCOLLECTION_READ_RULES(PortableHostCollection<TrackLayout<pixelTopology::Phase1>>);
+SET_PORTABLEHOSTCOLLECTION_READ_RULES(PortableHostCollection<TrackLayout<pixelTopology::Phase2>>);
+// SET_PORTABLEHOSTCOLLECTION_READ_RULES(PortableHostCollection<TrackLayout<pixelTopology::HIonPhase1>>); //TODO: For the moment we live without HIons
diff --git a/DataFormats/TrackSoA/src/classes.h b/DataFormats/TrackSoA/src/classes.h
new file mode 100644
index 0000000000000..43d40e5f8f3ac
--- /dev/null
+++ b/DataFormats/TrackSoA/src/classes.h
@@ -0,0 +1,11 @@
+#ifndef DataFormats_TrackSoA_src_classes_h
+#define DataFormats_TrackSoA_src_classes_h
+
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/TrackSoA/interface/TracksSoA.h"
+#include "DataFormats/TrackSoA/interface/TracksHost.h"
+
+using namespace pixelTopology;
+using namespace reco;
+
+#endif  // DataFormats_TrackSoA_src_classes_h
diff --git a/DataFormats/TrackSoA/src/classes_def.xml b/DataFormats/TrackSoA/src/classes_def.xml
new file mode 100644
index 0000000000000..fd8fc0781ee25
--- /dev/null
+++ b/DataFormats/TrackSoA/src/classes_def.xml
@@ -0,0 +1,34 @@
+<lcgdict>
+  <class name="TrackSoA<pixelTopology::Phase1>"/>
+  <class name="TrackSoA<pixelTopology::Phase1>::Layout<>"/>
+  <class name="TrackLayout<pixelTopology::Phase1>"/>
+  <class name="TrackSoAView<pixelTopology::Phase1>"/>
+
+  <class name="PortableHostCollection<TrackLayout<pixelTopology::Phase1>>"/>
+  <class name="pixelTrack::TracksHostPhase1" ClassVersion="3">
+    <version ClassVersion="3" checksum="794224446"/>
+  </class>
+  <class name="edm::Wrapper<pixelTrack::TracksHostPhase1>" splitLevel="0"/>
+
+  <class name="TrackSoA<pixelTopology::Phase2>"/>
+  <class name="TrackSoA<pixelTopology::Phase2>::Layout<>"/>
+  <class name="TrackLayout<pixelTopology::Phase2>"/>
+  <class name="TrackSoAView<pixelTopology::Phase2>"/>
+
+  <class name="PortableHostCollection<TrackLayout<pixelTopology::Phase2>>"/>
+  <class name="pixelTrack::TracksHostPhase2" ClassVersion="3">
+    <version ClassVersion="3" checksum="3813159454"/>
+  </class>
+  <class name="edm::Wrapper<pixelTrack::TracksHostPhase2>" splitLevel="0"/>
+
+  <class name="TrackSoA<pixelTopology::HIonPhase1>"/>
+  <class name="TrackSoA<pixelTopology::HIonPhase1>::Layout<>"/>
+  <class name="TrackLayout<pixelTopology::HIonPhase1>"/>
+  <class name="TrackSoAView<pixelTopology::HIonPhase1>"/>
+
+  <class name="PortableHostCollection<TrackLayout<pixelTopology::HIonPhase1>>"/>
+  <class name="pixelTrack::TracksHostHIonPhase1" ClassVersion="3">
+    <version ClassVersion="3" checksum="3935456462"/>
+  </class>
+  <class name="edm::Wrapper<pixelTrack::TracksHostHIonPhase1>" splitLevel="0"/>
+</lcgdict>
diff --git a/DataFormats/TrackSoA/test/BuildFile.xml b/DataFormats/TrackSoA/test/BuildFile.xml
new file mode 100644
index 0000000000000..ce2b273d90577
--- /dev/null
+++ b/DataFormats/TrackSoA/test/BuildFile.xml
@@ -0,0 +1,6 @@
+<use name="eigen"/>
+<bin file="alpaka/TrackSoAHeterogeneous_test.cc alpaka/TrackSoAHeterogeneous_test.dev.cc" name="TrackSoAHeterogeneousAlpaka_test">
+  <use name="alpaka"/>
+  <use name="HeterogeneousCore/AlpakaInterface"/>
+<flags ALPAKA_BACKENDS="1"/>
+</bin>
diff --git a/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.cc b/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.cc
new file mode 100644
index 0000000000000..f4af0688ca1bf
--- /dev/null
+++ b/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.cc
@@ -0,0 +1,82 @@
+/**
+   Simple test for the pixelTrack::TrackSoA data structure
+   which inherits from PortableDeviceCollection.
+
+   Creates an instance of the class (automatically allocates
+   memory on device), passes the view of the SoA data to
+   the alpaka kernels which:
+   - Fill the SoA with data.
+   - Verify that the data written is correct.
+
+   Then, the SoA data are copied back to a host-side collection
+   (TracksHost), whose view is used to access the data on host
+   and print the first few elements.
+ */
+
+#include <alpaka/alpaka.hpp>
+#include <unistd.h>
+#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h"
+#include "DataFormats/TrackSoA/interface/TracksDevice.h"
+#include "DataFormats/TrackSoA/interface/TracksHost.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/devices.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/host.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+
+using namespace std;
+using namespace reco;
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+using namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelTrack;
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace testTrackSoA {
+
+    // Host-side entry point that enqueues the fill and verify kernels on `queue`,
+    // operating on the SoA referenced by `tracks_view`.
+    // Only declared here: the definition and its explicit instantiations
+    // (Phase1, Phase2) live in TrackSoAHeterogeneous_test.dev.cc.
+    template <typename TrackerTraits>
+    void runKernels(TrackSoAView<TrackerTraits> tracks_view, Queue& queue);
+  }
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+
+// Test driver: fills a Phase1 track SoA on the first device of the current
+// backend, verifies it there, copies it to the host and prints the first
+// ten elements. Returns 0 on success, 1 if the backend has no device.
+int main() {
+  // Refuse to run (instead of indexing into an empty list, which is undefined
+  // behaviour) when the current backend exposes no device.
+  auto const& devices = cms::alpakatools::devices<Platform>();
+  if (devices.empty()) {
+    std::cerr << "No devices available for this backend, the test cannot run." << std::endl;
+    return 1;
+  }
+  const auto device = devices[0];
+  Queue queue(device);
+
+  // Inner scope to deallocate memory before destroying the queue
+  {
+    // Instantiate tracks on device. PortableDeviceCollection allocates
+    // SoA on device automatically.
+    TracksSoACollection<pixelTopology::Phase1> tracks_d(queue);
+    testTrackSoA::runKernels<pixelTopology::Phase1>(tracks_d.view(), queue);
+
+    // Instantiate tracks on host. This is where the data will be
+    // copied to from device.
+    TracksHost<pixelTopology::Phase1> tracks_h(queue);
+
+    std::cout << tracks_h.view().metadata().size() << std::endl;
+    alpaka::memcpy(queue, tracks_h.buffer(), tracks_d.const_buffer());
+    // The copy is only enqueued above: block until it has completed before reading
+    alpaka::wait(queue);
+
+    // Print results: one header line, then the first ten elements
+    std::cout << "pt\teta\tchi2\tquality\tnLayers\thitIndices off" << std::endl;
+
+    for (int i = 0; i < 10; ++i) {
+      std::cout << tracks_h.view()[i].pt() << "\t" << tracks_h.view()[i].eta() << "\t" << tracks_h.view()[i].chi2()
+                << "\t" << static_cast<int>(tracks_h.view()[i].quality()) << "\t"
+                << static_cast<int>(tracks_h.view()[i].nLayers()) << "\t" << tracks_h.view().hitIndices().off[i]
+                << std::endl;
+    }
+  }
+
+  return 0;
+}
diff --git a/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.dev.cc b/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.dev.cc
new file mode 100644
index 0000000000000..2c2d0961eb106
--- /dev/null
+++ b/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.dev.cc
@@ -0,0 +1,74 @@
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+#include "DataFormats/TrackSoA/interface/TrackDefinitions.h"
+#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h"
+#include "DataFormats/TrackSoA/interface/TracksDevice.h"
+#include "DataFormats/TrackSoA/interface/TracksHost.h"
+
+using namespace reco;
+
+using Quality = pixelTrack::Quality;
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  using namespace cms::alpakatools;
+  namespace testTrackSoA {
+
+    // Kernel that populates the TrackSoAView with values derived from the
+    // element index, so that the writes can be checked afterwards
+    template <typename TrackerTraits>
+    class TestFillKernel {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const& acc, TrackSoAView<TrackerTraits> tracks_view) const {
+        // the scalar track counter is set under the once_per_grid guard
+        if (cms::alpakatools::once_per_grid(acc)) {
+          tracks_view.nTracks() = 420;
+        }
+
+        for (int32_t idx : elements_with_stride(acc, tracks_view.metadata().size())) {
+          const auto value = static_cast<float>(idx);
+          tracks_view[idx].pt() = value;
+          tracks_view[idx].eta() = value;
+          tracks_view[idx].chi2() = value;
+          tracks_view[idx].quality() = static_cast<Quality>(idx % 256);
+          tracks_view[idx].nLayers() = idx % 128;
+          tracks_view.hitIndices().off[idx] = idx;
+        }
+      }
+    };
+
+    // Kernel that reads the first nTracks() elements back through a const view
+    // and asserts that they hold what TestFillKernel wrote
+    template <typename TrackerTraits>
+    class TestVerifyKernel {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const& acc, TrackSoAConstView<TrackerTraits> tracks_view) const {
+        // the scalar is checked once, under the once_per_grid guard
+        if (cms::alpakatools::once_per_grid(acc)) {
+          ALPAKA_ASSERT_OFFLOAD(tracks_view.nTracks() == 420);
+        }
+        for (int32_t idx : elements_with_stride(acc, tracks_view.nTracks())) {
+          assert(abs(tracks_view[idx].pt() - static_cast<float>(idx)) < .0001);
+          assert(abs(tracks_view[idx].eta() - static_cast<float>(idx)) < .0001);
+          assert(abs(tracks_view[idx].chi2() - static_cast<float>(idx)) < .0001);
+          assert(tracks_view[idx].quality() == static_cast<Quality>(idx % 256));
+          assert(tracks_view[idx].nLayers() == idx % 128);
+          assert(tracks_view.hitIndices().off[idx] == static_cast<uint32_t>(idx));
+        }
+      }
+    };
+
+    // Host-side driver: enqueues TestFillKernel and then TestVerifyKernel on
+    // the same queue, using one work division sized from the SoA
+    template <typename TrackerTraits>
+    void runKernels(TrackSoAView<TrackerTraits> tracks_view, Queue& queue) {
+      uint32_t itemsPerGroup = 64;
+      uint32_t numGroups = divide_up_by(tracks_view.metadata().size(), itemsPerGroup);
+      auto workDivision = make_workdiv<Acc1D>(numGroups, itemsPerGroup);
+
+      alpaka::exec<Acc1D>(queue, workDivision, TestFillKernel<TrackerTraits>{}, tracks_view);
+
+      //TODO: wait for some PR that solves this and then check it!!!
+      // (the verify kernel takes a TrackSoAConstView but is handed the mutable view)
+      alpaka::exec<Acc1D>(queue, workDivision, TestVerifyKernel<TrackerTraits>{}, tracks_view);
+    }
+
+    template void runKernels<pixelTopology::Phase1>(TrackSoAView<pixelTopology::Phase1> tracks_view, Queue& queue);
+    template void runKernels<pixelTopology::Phase2>(TrackSoAView<pixelTopology::Phase2> tracks_view, Queue& queue);
+
+  }  // namespace testTrackSoA
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
diff --git a/DataFormats/VertexSoA/BuildFile.xml b/DataFormats/VertexSoA/BuildFile.xml
new file mode 100644
index 0000000000000..af53fc68f5a45
--- /dev/null
+++ b/DataFormats/VertexSoA/BuildFile.xml
@@ -0,0 +1,11 @@
+<use name="alpaka"/>
+<use name="rootcore"/>
+<use name="eigen"/>
+<use name="DataFormats/Common"/>
+<use name="DataFormats/Portable"/>
+<use name="DataFormats/SoATemplate" source_only="1"/>
+<use name="HeterogeneousCore/AlpakaInterface"/>
+<flags ALPAKA_BACKENDS="!serial"/>
+<export>
+  <lib name="1"/>
+</export>
diff --git a/DataFormats/VertexSoA/README.md b/DataFormats/VertexSoA/README.md
new file mode 100644
index 0000000000000..54172eda14281
--- /dev/null
+++ b/DataFormats/VertexSoA/README.md
@@ -0,0 +1,45 @@
+# Vertex Portable Data Formats
+
+`DataFormat`s meant to be used on Host (CPU) or Device (GPU) for
+storing information about vertices created during the Pixel-local Reconstruction
+chain. It stores data in an SoA manner. It contains the data that was previously
+contained in the deprecated `ZVertexSoA` class. 
+
+The host format inherits from `PortableHostCollection` (`DataFormats/Portable/interface/PortableHostCollection.h`),
+while the device format inherits from `PortableDeviceCollection` (`DataFormats/Portable/interface/PortableDeviceCollection.h`).
+
+Both formats use the same SoA Layout (`ZVertexLayout`) which is generated
+via the `GENERATE_SOA_LAYOUT` macro in the `ZVertexSoA.h` file.
+
+## Notes
+
+- Initially, `ZVertexSoA` had distinct array sizes for each attribute (e.g. `zv` was `MAXVTX` elements 
+long, `ndof` was `MAXTRACKS` elements long). All columns are now of uniform `MAXTRACKS` size, 
+meaning that there will be some wasted space (appx. 190kB). 
+- Host and Device classes should **not** be created via inheritance, as they're done here,
+but via composition. See [this discussion](https://github.com/cms-sw/cmssw/pull/40465#discussion_r1066039309).
+
+## ZVertexHost
+
+The version of the data format to be used for storing vertex data on the CPU. 
+Instances of this class are to be used for:
+
+- Having a place to copy data to host from device, via `alpaka::memcpy`, or
+- Running host-side algorithms using data stored in an SoA manner.
+
+## ZVertexDevice
+
+The version of the data format to be used for storing vertex data on the GPU.
+
+Instances of `ZVertexDevice` are to be created on the host and used on the
+device only. To do so, call the instance's `view()` method and pass the resulting
+`View` to any kernel that is launched. Accessing data through the `view()` is not
+possible on the host side.
+
+## Utilities
+
+Apart from `ZVertexLayout`, `ZVertexSoA.h` also contains
+helper functions (such as `init`) which were originally
+defined as class methods inside the `ZVertexSoA` class
+and which have been adapted to operate on `View` instances, so that they are callable
+from within kernels, on both CPU and GPU.
diff --git a/DataFormats/VertexSoA/interface/ZVertexDefinitions.h b/DataFormats/VertexSoA/interface/ZVertexDefinitions.h
new file mode 100644
index 0000000000000..028668d1ff52a
--- /dev/null
+++ b/DataFormats/VertexSoA/interface/ZVertexDefinitions.h
@@ -0,0 +1,13 @@
+#ifndef DataFormats_VertexSoA_ZVertexDefinitions_h
+#define DataFormats_VertexSoA_ZVertexDefinitions_h
+
+#include <cstdint>
+
+// Compile-time capacities used by the vertex SoA data formats.
+namespace zVertex {
+
+  // 32768: the element count used to size the ZVertexHost / ZVertexDevice collections.
+  constexpr uint32_t MAXTRACKS = 32 * 1024;
+  // NOTE(review): presumably the maximum number of vertices; it is not used to size
+  // the collections declared in ZVertexHost.h / ZVertexDevice.h — confirm where it is consumed.
+  constexpr uint32_t MAXVTX = 1024;
+
+}  // namespace zVertex
+
+#endif
diff --git a/DataFormats/VertexSoA/interface/ZVertexDevice.h b/DataFormats/VertexSoA/interface/ZVertexDevice.h
new file mode 100644
index 0000000000000..8d120ae190f3c
--- /dev/null
+++ b/DataFormats/VertexSoA/interface/ZVertexDevice.h
@@ -0,0 +1,26 @@
+#ifndef DataFormats_VertexSoA_interface_ZVertexDevice_h
+#define DataFormats_VertexSoA_interface_ZVertexDevice_h
+
+#include <cstdint>
+
+#include <alpaka/alpaka.hpp>
+#include "DataFormats/VertexSoA/interface/ZVertexSoA.h"
+#include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h"
+#include "DataFormats/VertexSoA/interface/ZVertexHost.h"
+#include "DataFormats/Portable/interface/PortableDeviceCollection.h"
+
+// Device-side vertex collection: a PortableDeviceCollection of the
+// reco::ZVertexSoA layout, holding S elements on a device of type TDev.
+template <int32_t S, typename TDev>
+class ZVertexDeviceSoA : public PortableDeviceCollection<reco::ZVertexSoA, TDev> {
+  using Base = PortableDeviceCollection<reco::ZVertexSoA, TDev>;
+
+public:
+  // Default constructor, required by the ROOT dictionaries
+  ZVertexDeviceSoA() = default;
+
+  // Allocate the S-element SoA through the given queue
+  template <typename TQueue>
+  explicit ZVertexDeviceSoA(TQueue queue) : Base(S, queue) {}
+};
+
+// NOTE(review): this using-directive sits at global scope in a header, so every
+// translation unit that includes it gets all zVertex names injected. ZVertexHost.h
+// spells out zVertex::MAXTRACKS instead; consider doing the same here once any
+// code relying on the unqualified names has been checked.
+using namespace ::zVertex;
+// Device vertex collection with MAXTRACKS elements for the device type TDev.
+template <typename TDev>
+using ZVertexDevice = ZVertexDeviceSoA<MAXTRACKS, TDev>;
+
+#endif  // DataFormats_VertexSoA_interface_ZVertexDevice_h
diff --git a/DataFormats/VertexSoA/interface/ZVertexHost.h b/DataFormats/VertexSoA/interface/ZVertexHost.h
new file mode 100644
index 0000000000000..2d72b83bfe385
--- /dev/null
+++ b/DataFormats/VertexSoA/interface/ZVertexHost.h
@@ -0,0 +1,29 @@
+#ifndef DataFormats_VertexSoA_ZVertexHost_H
+#define DataFormats_VertexSoA_ZVertexHost_H
+
+#include <cstdint>
+
+#include <alpaka/alpaka.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "DataFormats/VertexSoA/interface/ZVertexSoA.h"
+#include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h"
+#include "DataFormats/Portable/interface/PortableHostCollection.h"
+
+// Host-side vertex collection: a PortableHostCollection of the
+// reco::ZVertexSoA layout, holding S elements.
+template <int32_t S>
+class ZVertexHostSoA : public PortableHostCollection<reco::ZVertexSoA> {
+public:
+  ZVertexHostSoA() = default;
+
+  // Allocate the S-element SoA through the given queue
+  template <typename TQueue>
+  explicit ZVertexHostSoA(TQueue queue) : PortableHostCollection(S, queue) {}
+
+  // Allocate the S-element SoA on the given host device
+  explicit ZVertexHostSoA(alpaka_common::DevHost const& host) : PortableHostCollection(S, host) {}
+};
+
+//using namespace ::zVertex;
+using ZVertexHost = ZVertexHostSoA<zVertex::MAXTRACKS>;
+
+#endif  // DataFormats_VertexSoA_ZVertexHost_H
diff --git a/DataFormats/VertexSoA/interface/ZVertexSoA.h b/DataFormats/VertexSoA/interface/ZVertexSoA.h
new file mode 100644
index 0000000000000..045603618acd7
--- /dev/null
+++ b/DataFormats/VertexSoA/interface/ZVertexSoA.h
@@ -0,0 +1,31 @@
+#ifndef DataFormats_VertexSoA_interface_ZVertexSoA_h
+#define DataFormats_VertexSoA_interface_ZVertexSoA_h
+
+#include <alpaka/alpaka.hpp>
+
+#include <Eigen/Core>
+
+#include "DataFormats/SoATemplate/interface/SoALayout.h"
+
+namespace reco {
+
+  // SoA layout for the vertex data: seven per-element columns
+  // (idv, zv, wv, chi2, ptv2, ndof, sortInd) plus one scalar, nvFinal.
+  // NOTE(review): the physical meaning of each column is not documented in this
+  // header — presumably per-track and per-vertex quantities; confirm against the vertex finder.
+  GENERATE_SOA_LAYOUT(ZVertexLayout,
+                      SOA_COLUMN(int16_t, idv),
+                      SOA_COLUMN(float, zv),
+                      SOA_COLUMN(float, wv),
+                      SOA_COLUMN(float, chi2),
+                      SOA_COLUMN(float, ptv2),
+                      SOA_COLUMN(int32_t, ndof),
+                      SOA_COLUMN(uint16_t, sortInd),
+                      SOA_SCALAR(uint32_t, nvFinal))
+
+  // Common types for both Host and Device code
+  using ZVertexSoA = ZVertexLayout<>;
+  using ZVertexSoAView = ZVertexSoA::View;
+  using ZVertexSoAConstView = ZVertexSoA::ConstView;
+
+  // Reset the scalar nvFinal to zero; the columns are left untouched.
+  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void init(ZVertexSoAView &vertices) { vertices.nvFinal() = 0; }
+
+}  // namespace reco
+
+#endif  // DataFormats_VertexSoA_interface_ZVertexSoA_h
diff --git a/DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h b/DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h
new file mode 100644
index 0000000000000..636a07e2bd978
--- /dev/null
+++ b/DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h
@@ -0,0 +1,39 @@
+#ifndef DataFormats_VertexSoA_interface_ZVertexSoACollection_h
+#define DataFormats_VertexSoA_interface_ZVertexSoACollection_h
+
+#include <cstdint>
+
+#include <alpaka/alpaka.hpp>
+#include "DataFormats/Portable/interface/alpaka/PortableCollection.h"
+#include "DataFormats/VertexSoA/interface/ZVertexSoA.h"
+#include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h"
+#include "DataFormats/VertexSoA/interface/ZVertexHost.h"
+#include "DataFormats/VertexSoA/interface/ZVertexDevice.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/CopyToHost.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+
+  // Backend-specific vertex collection: when the backend's Device is the CPU
+  // device this is the host collection itself (ZVertexHost), otherwise it is
+  // ZVertexDevice instantiated for that Device.
+  using ZVertexSoACollection =
+      std::conditional_t<std::is_same_v<Device, alpaka::DevCpu>, ZVertexHost, ZVertexDevice<Device>>;
+
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+
+namespace cms::alpakatools {
+  // CopyToHost specialisation for ZVertexDevice: creates a ZVertexHost on the
+  // given queue, enqueues a copy of the device buffer into it and returns it.
+  template <typename TDevice>
+  struct CopyToHost<ZVertexDevice<TDevice>> {
+    template <typename TQueue>
+    static auto copyAsync(TQueue& queue, ZVertexDevice<TDevice> const& deviceData) {
+      ZVertexHost hostCopy(queue);
+      alpaka::memcpy(queue, hostCopy.buffer(), deviceData.buffer());
+#ifdef GPU_DEBUG
+      printf("ZVertexSoACollection: I'm copying to host.\n");
+#endif
+      return hostCopy;
+    }
+  };
+}  // namespace cms::alpakatools
+
+ASSERT_DEVICE_MATCHES_HOST_COLLECTION(ZVertexSoACollection, ZVertexHost);
+
+#endif  // DataFormats_VertexSoA_interface_ZVertexSoACollection_h
diff --git a/DataFormats/VertexSoA/src/alpaka/classes_cuda.h b/DataFormats/VertexSoA/src/alpaka/classes_cuda.h
new file mode 100644
index 0000000000000..e76f6ca1365c1
--- /dev/null
+++ b/DataFormats/VertexSoA/src/alpaka/classes_cuda.h
@@ -0,0 +1,10 @@
+#ifndef DataFormats_VertexSoA_src_alpaka_classes_cuda_h
+#define DataFormats_VertexSoA_src_alpaka_classes_cuda_h
+
+#include "DataFormats/Common/interface/DeviceProduct.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/VertexSoA/interface/ZVertexSoA.h"
+#include "DataFormats/VertexSoA/interface//ZVertexDevice.h"
+#include "DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h"
+
+#endif  // DataFormats_VertexSoA_src_alpaka_classes_cuda_h
diff --git a/DataFormats/VertexSoA/src/alpaka/classes_cuda_def.xml b/DataFormats/VertexSoA/src/alpaka/classes_cuda_def.xml
new file mode 100644
index 0000000000000..606937a5bd3e5
--- /dev/null
+++ b/DataFormats/VertexSoA/src/alpaka/classes_cuda_def.xml
@@ -0,0 +1,6 @@
+<lcgdict>
+  <class name="alpaka_cuda_async::PortableCollection<reco::ZVertexSoA>" persistent="false"/>
+  <class name="alpaka_cuda_async::ZVertexSoACollection" persistent="false"/>
+  <class name="edm::DeviceProduct<alpaka_cuda_async::ZVertexSoACollection>" persistent="false"/>
+  <class name="edm::Wrapper<edm::DeviceProduct<alpaka_cuda_async::ZVertexSoACollection>>" persistent="false"/>
+</lcgdict>
diff --git a/DataFormats/VertexSoA/src/alpaka/classes_rocm.h b/DataFormats/VertexSoA/src/alpaka/classes_rocm.h
new file mode 100644
index 0000000000000..f5ea845c028b1
--- /dev/null
+++ b/DataFormats/VertexSoA/src/alpaka/classes_rocm.h
@@ -0,0 +1,9 @@
+#ifndef DataFormats_VertexSoA_src_alpaka_classes_rocm_h
+#define DataFormats_VertexSoA_src_alpaka_classes_rocm_h
+
+#include "DataFormats/Common/interface/DeviceProduct.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/VertexSoA/interface/ZVertexSoA.h"
+#include "DataFormats/VertexSoA/interface//ZVertexDevice.h"
+#include "DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h"
+#endif  // DataFormats_VertexSoA_src_alpaka_classes_rocm_h
diff --git a/DataFormats/VertexSoA/src/alpaka/classes_rocm_def.xml b/DataFormats/VertexSoA/src/alpaka/classes_rocm_def.xml
new file mode 100644
index 0000000000000..94deb6fff7d61
--- /dev/null
+++ b/DataFormats/VertexSoA/src/alpaka/classes_rocm_def.xml
@@ -0,0 +1,6 @@
+<lcgdict>
+  <class name="alpaka_rocm_async::PortableCollection<reco::ZVertexSoA>" persistent="false"/>
+  <class name="alpaka_rocm_async::ZVertexSoACollection" persistent="false"/>
+  <class name="edm::DeviceProduct<alpaka_rocm_async::ZVertexSoACollection>" persistent="false"/>
+  <class name="edm::Wrapper<edm::DeviceProduct<alpaka_rocm_async::ZVertexSoACollection>>" persistent="false"/>
+</lcgdict>
diff --git a/DataFormats/VertexSoA/src/classes.cc b/DataFormats/VertexSoA/src/classes.cc
new file mode 100644
index 0000000000000..edffb6e08a9e5
--- /dev/null
+++ b/DataFormats/VertexSoA/src/classes.cc
@@ -0,0 +1,4 @@
+#include "DataFormats/Portable/interface/PortableHostCollectionReadRules.h"
+#include "DataFormats/VertexSoA/interface/ZVertexSoA.h"
+
+SET_PORTABLEHOSTCOLLECTION_READ_RULES(PortableHostCollection<reco::ZVertexSoA>);
diff --git a/DataFormats/VertexSoA/src/classes.h b/DataFormats/VertexSoA/src/classes.h
new file mode 100644
index 0000000000000..883182c01dcf9
--- /dev/null
+++ b/DataFormats/VertexSoA/src/classes.h
@@ -0,0 +1,8 @@
+#ifndef DataFormats_VertexSoA_src_classes_h
+#define DataFormats_VertexSoA_src_classes_h
+
+#include "DataFormats/Common/interface/Wrapper.h"
+#include "DataFormats/VertexSoA/interface/ZVertexSoA.h"
+#include "DataFormats/VertexSoA/interface/ZVertexHost.h"
+
+#endif  // DataFormats_VertexSoA_src_classes_h
diff --git a/DataFormats/VertexSoA/src/classes_def.xml b/DataFormats/VertexSoA/src/classes_def.xml
new file mode 100644
index 0000000000000..820d28ecc3493
--- /dev/null
+++ b/DataFormats/VertexSoA/src/classes_def.xml
@@ -0,0 +1,8 @@
+<lcgdict>
+  <class name="reco::ZVertexSoA"/>
+  <class name="PortableHostCollection<reco::ZVertexSoA>"/>
+  <class name="ZVertexHost" ClassVersion="3">
+    <version ClassVersion="3" checksum="1989784241"/>
+  </class>
+  <class name="edm::Wrapper<ZVertexHost>" splitLevel="0"/>
+</lcgdict>
diff --git a/DataFormats/VertexSoA/test/BuildFile.xml b/DataFormats/VertexSoA/test/BuildFile.xml
new file mode 100644
index 0000000000000..49dee4babd8a1
--- /dev/null
+++ b/DataFormats/VertexSoA/test/BuildFile.xml
@@ -0,0 +1,6 @@
+<bin file="alpaka/ZVertexSoA_test.cc alpaka/ZVertexSoA_test.dev.cc" name="ZVertexSoA_test">
+  <use name="alpaka"/>
+  <use name="eigen"/>
+  <use name="HeterogeneousCore/AlpakaInterface"/>
+<flags ALPAKA_BACKENDS="1"/>
+</bin>
diff --git a/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.cc b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.cc
new file mode 100644
index 0000000000000..0c0c8e8591df9
--- /dev/null
+++ b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.cc
@@ -0,0 +1,82 @@
+/**
+   Simple test for the reco::ZVertexSoA data structure
+   which inherits from Portable{Host}Collection.
+
+   Creates an instance of the class (automatically allocates
+   memory on device), passes the view of the SoA data to
+   the kernels which:
+   - Fill the SoA with data.
+   - Verify that the data written is correct.
+
+   Then, the SoA data are copied back to a host-side collection
+   (ZVertexHost), whose view is used to access the data on host
+   and print the first few elements.
+ */
+
+#include <alpaka/alpaka.hpp>
+#include <unistd.h>
+#include "DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h"
+#include "DataFormats/VertexSoA/interface/ZVertexDevice.h"
+#include "DataFormats/VertexSoA/interface/ZVertexHost.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/devices.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/host.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+
+using namespace std;
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+using namespace reco;
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace testZVertexSoAT {
+    // Host-side entry point that enqueues the fill and verify kernels on `queue`,
+    // operating on the SoA referenced by `zvertex_view`.
+    // Only declared here: the definition lives in ZVertexSoA_test.dev.cc.
+    void runKernels(ZVertexSoAView zvertex_view, Queue& queue);
+  }
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+
+// Test driver: fills a vertex SoA on the first device of the current backend,
+// verifies it there, copies it to the host and prints the first ten elements.
+// Returns 0 on success, 1 if the backend has no device.
+int main() {
+  // Refuse to run (instead of indexing into an empty list, which is undefined
+  // behaviour) when the current backend exposes no device.
+  auto const& devices = cms::alpakatools::devices<Platform>();
+  if (devices.empty()) {
+    std::cerr << "No devices available for this backend, the test cannot run." << std::endl;
+    return 1;
+  }
+  const auto device = devices[0];
+  Queue queue(device);
+
+  // Inner scope to deallocate memory before destroying the queue
+  {
+    // Instantiate vertices on device. PortableCollection allocates
+    // SoA on device automatically.
+    ZVertexSoACollection zvertex_d(queue);
+    testZVertexSoAT::runKernels(zvertex_d.view(), queue);
+
+    // Instantiate vertices on host. This is where the data will be
+    // copied to from device.
+    ZVertexHost zvertex_h(queue);
+    std::cout << zvertex_h.view().metadata().size() << std::endl;
+    alpaka::memcpy(queue, zvertex_h.buffer(), zvertex_d.const_buffer());
+    // The copy is only enqueued above: block until it has completed before reading
+    alpaka::wait(queue);
+
+    // Print results: one header line, then the first ten elements
+    std::cout << "idv\tzv\twv\tchi2\tptv2\tndof\tsortInd\tnvFinal" << std::endl;
+
+    for (int i = 0; i < 10; ++i) {
+      std::cout << static_cast<int>(zvertex_h.view()[i].idv()) << "\t" << zvertex_h.view()[i].zv() << "\t"
+                << zvertex_h.view()[i].wv() << "\t" << zvertex_h.view()[i].chi2() << "\t" << zvertex_h.view()[i].ptv2()
+                << "\t" << static_cast<int>(zvertex_h.view()[i].ndof()) << "\t"
+                << static_cast<int>(zvertex_h.view()[i].sortInd()) << "\t"
+                << static_cast<int>(zvertex_h.view().nvFinal()) << std::endl;
+    }
+  }
+
+  return 0;
+}
diff --git a/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.dev.cc b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.dev.cc
new file mode 100644
index 0000000000000..1b22159a53b88
--- /dev/null
+++ b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.dev.cc
@@ -0,0 +1,62 @@
+#include "DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h"
+#include "DataFormats/VertexSoA/interface/ZVertexDevice.h"
+#include "DataFormats/VertexSoA/interface/ZVertexHost.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"  // Check if this is really needed; code doesn't compile without it
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  using namespace alpaka;
+  using namespace cms::alpakatools;
+
+  namespace testZVertexSoAT {
+
+    // Kernel that populates the ZVertexSoAView with values derived from the
+    // element index, so that the writes can be checked afterwards
+    class TestFillKernel {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const& acc, reco::ZVertexSoAView zvertex_view) const {
+        // the scalar vertex counter is set under the once_per_grid guard
+        if (cms::alpakatools::once_per_grid(acc)) {
+          zvertex_view.nvFinal() = 420;
+        }
+
+        for (int32_t idx : elements_with_stride(acc, zvertex_view.metadata().size())) {
+          const auto value = static_cast<float>(idx);
+          zvertex_view[idx].idv() = static_cast<int16_t>(idx);
+          zvertex_view[idx].zv() = value;
+          zvertex_view[idx].wv() = value;
+          zvertex_view[idx].chi2() = value;
+          zvertex_view[idx].ptv2() = value;
+          zvertex_view[idx].ndof() = idx;
+          zvertex_view[idx].sortInd() = static_cast<uint16_t>(idx);
+        }
+      }
+    };
+
+    // Kernel that reads the first nvFinal() elements back and asserts that
+    // they hold the values written by TestFillKernel
+    class TestVerifyKernel {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const& acc, reco::ZVertexSoAView zvertex_view) const {
+        if (cms::alpakatools::once_per_grid(acc)) {
+          ALPAKA_ASSERT_OFFLOAD(zvertex_view.nvFinal() == 420);
+        }
+
+        for (int32_t j : elements_with_stride(acc, zvertex_view.nvFinal())) {
+          assert(zvertex_view[j].idv() == j);
+          // Float columns are compared with a two-sided tolerance: the previous
+          // one-sided check (value - j < 0.0001) accepted any value that was too small.
+          assert(isClose(zvertex_view[j].zv(), static_cast<float>(j)));
+          assert(isClose(zvertex_view[j].wv(), static_cast<float>(j)));
+          assert(isClose(zvertex_view[j].chi2(), static_cast<float>(j)));
+          assert(isClose(zvertex_view[j].ptv2(), static_cast<float>(j)));
+          assert(zvertex_view[j].ndof() == j);
+          assert(zvertex_view[j].sortInd() == uint32_t(j));
+        }
+      }
+
+    private:
+      // True when `value` lies strictly within 1e-4 of `expected`, on either side.
+      // Written without abs() so it does not depend on which overload is visible
+      // in host or device code.
+      ALPAKA_FN_ACC static constexpr bool isClose(float value, float expected) {
+        return (value - expected < 0.0001f) && (expected - value < 0.0001f);
+      }
+    };
+
+    // Host-side driver: enqueues TestFillKernel and then TestVerifyKernel on
+    // the same queue, using one work division sized from the SoA
+    void runKernels(reco::ZVertexSoAView zvertex_view, Queue& queue) {
+      uint32_t itemsPerGroup = 64;
+      uint32_t numGroups = divide_up_by(zvertex_view.metadata().size(), itemsPerGroup);
+      auto workDivision = make_workdiv<Acc1D>(numGroups, itemsPerGroup);
+
+      alpaka::exec<Acc1D>(queue, workDivision, TestFillKernel{}, zvertex_view);
+      alpaka::exec<Acc1D>(queue, workDivision, TestVerifyKernel{}, zvertex_view);
+    }
+
+  }  // namespace testZVertexSoAT
+
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
diff --git a/HLTrigger/Configuration/python/customizeHLTforPatatrack.py b/HLTrigger/Configuration/python/customizeHLTforPatatrack.py
new file mode 100644
index 0000000000000..f96716d82ae2d
--- /dev/null
+++ b/HLTrigger/Configuration/python/customizeHLTforPatatrack.py
@@ -0,0 +1,485 @@
+import FWCore.ParameterSet.Config as cms
+
+def customizeHLTforDQMGPUvsCPUPixel(process):
+    '''Ad-hoc changes to test an HLT config containing only the DQM_PixelReconstruction_v Path and the DQMGPUvsCPU stream:
+       defines the Alpaka SoA monitors and GPU-vs-CPU comparisons of pixel RecHits, Tracks and Vertices, and adds the
+       CPUSerial sequences to the Path. Returns the process, untouched if it has no DQM_PixelReconstruction_v* Path.'''
+    dqmPixelRecoPathName = None
+    for pathName in process.paths_():
+        if pathName.startswith('DQM_PixelReconstruction_v'):
+            dqmPixelRecoPathName = pathName
+            break
+
+    if dqmPixelRecoPathName is None:
+        return process
+
+    process.hltPixelConsumerGPU.eventProducts = [
+        'hltSiPixelClusters',
+        'hltSiPixelClustersLegacyFormat',
+        'hltSiPixelDigiErrorsLegacyFormat',
+        'hltSiPixelRecHits',
+        'hltSiPixelRecHitsLegacyFormat',
+        'hltPixelTracks',
+        'hltPixelTracksLegacyFormat',
+        'hltPixelVertices',
+        'hltPixelVerticesLegacyFormat',
+    ]
+
+    process.hltPixelConsumerCPU.eventProducts = [
+        label + 'CPUSerial' for label in process.hltPixelConsumerGPU.eventProducts
+    ]
+
+    # modify EventContent of DQMGPUvsCPU stream
+    if hasattr(process, 'hltOutputDQMGPUvsCPU'):
+        process.hltOutputDQMGPUvsCPU.outputCommands = [
+            'drop *',
+            'keep *Cluster*_hltSiPixelClustersLegacyFormat_*_*',
+            'keep *Cluster*_hltSiPixelClustersLegacyFormatCPUSerial_*_*',
+            'keep *_hltSiPixelDigiErrorsLegacyFormat_*_*',
+            'keep *_hltSiPixelDigiErrorsLegacyFormatCPUSerial_*_*',
+            'keep *RecHit*_hltSiPixelRecHitsLegacyFormat_*_*',
+            'keep *RecHit*_hltSiPixelRecHitsLegacyFormatCPUSerial_*_*',
+            'keep *_hltPixelTracksLegacyFormat_*_*',
+            'keep *_hltPixelTracksLegacyFormatCPUSerial_*_*',
+            'keep *_hltPixelVerticesLegacyFormat_*_*',
+            'keep *_hltPixelVerticesLegacyFormatCPUSerial_*_*',
+        ]
+
+    # PixelRecHits: monitor of CPUSerial product (Alpaka backend: 'serial_sync')
+    process.hltSiPixelRecHitsSoAMonitorCPU = cms.EDProducer('SiPixelPhase1MonitorRecHitsSoAAlpaka',
+        pixelHitsSrc = cms.InputTag( 'hltSiPixelRecHitsCPUSerial' ),
+        TopFolderName = cms.string( 'SiPixelHeterogeneous/PixelRecHitsCPU' )
+    )
+
+    # PixelRecHits: monitor of GPU product (Alpaka backend: '')
+    process.hltSiPixelRecHitsSoAMonitorGPU = cms.EDProducer('SiPixelPhase1MonitorRecHitsSoAAlpaka',
+        pixelHitsSrc = cms.InputTag( 'hltSiPixelRecHits' ),
+        TopFolderName = cms.string( 'SiPixelHeterogeneous/PixelRecHitsGPU' )
+    )
+
+    # PixelRecHits: 'GPUvsCPU' comparisons
+    process.hltSiPixelRecHitsSoACompareGPUvsCPU = cms.EDProducer('SiPixelPhase1CompareRecHitsSoAAlpaka',
+        pixelHitsSrcCPU = cms.InputTag( 'hltSiPixelRecHitsCPUSerial' ),
+        pixelHitsSrcGPU = cms.InputTag( 'hltSiPixelRecHits' ),
+        topFolderName = cms.string( 'SiPixelHeterogeneous/PixelRecHitsCompareGPUvsCPU' ),
+        minD2cut = cms.double( 1.0E-4 )
+    )
+
+    process.hltSiPixelTrackSoAMonitorCPU = cms.EDProducer("SiPixelPhase1MonitorTrackSoAAlpaka",
+        mightGet = cms.optional.untracked.vstring,
+        minQuality = cms.string('loose'),
+        pixelTrackSrc = cms.InputTag('hltPixelTracksCPUSerial'),
+        topFolderName = cms.string('SiPixelHeterogeneous/PixelTrackCPU'),
+        useQualityCut = cms.bool(True)
+    )
+
+    process.hltSiPixelTrackSoAMonitorGPU = cms.EDProducer("SiPixelPhase1MonitorTrackSoAAlpaka",
+        mightGet = cms.optional.untracked.vstring,
+        minQuality = cms.string('loose'),
+        pixelTrackSrc = cms.InputTag('hltPixelTracks'),
+        topFolderName = cms.string('SiPixelHeterogeneous/PixelTrackGPU'),
+        useQualityCut = cms.bool(True)
+    )
+
+    process.hltSiPixelTrackSoACompareGPUvsCPU = cms.EDProducer("SiPixelPhase1CompareTrackSoAAlpaka",
+        deltaR2cut = cms.double(0.04),
+        mightGet = cms.optional.untracked.vstring,
+        minQuality = cms.string('loose'),
+        pixelTrackSrcCPU = cms.InputTag("hltPixelTracksCPUSerial"),
+        pixelTrackSrcGPU = cms.InputTag("hltPixelTracks"),
+        topFolderName = cms.string('SiPixelHeterogeneous/PixelTrackCompareGPUvsCPU'),
+        useQualityCut = cms.bool(True)
+    )
+
+    process.hltSiPixelVertexSoAMonitorCPU = cms.EDProducer("SiPixelMonitorVertexSoAAlpaka",
+        beamSpotSrc = cms.InputTag("hltOnlineBeamSpot"),
+        mightGet = cms.optional.untracked.vstring,
+        pixelVertexSrc = cms.InputTag("hltPixelVerticesCPUSerial"),
+        topFolderName = cms.string('SiPixelHeterogeneous/PixelVertexCPU')
+    )
+
+    process.hltSiPixelVertexSoAMonitorGPU = cms.EDProducer("SiPixelMonitorVertexSoAAlpaka",
+        beamSpotSrc = cms.InputTag("hltOnlineBeamSpot"),
+        mightGet = cms.optional.untracked.vstring,
+        pixelVertexSrc = cms.InputTag("hltPixelVertices"),
+        topFolderName = cms.string('SiPixelHeterogeneous/PixelVertexGPU')
+    )
+
+    process.hltSiPixelVertexSoACompareGPUvsCPU = cms.EDProducer("SiPixelCompareVertexSoAAlpaka",
+        beamSpotSrc = cms.InputTag("hltOnlineBeamSpot"),
+        dzCut = cms.double(1),
+        mightGet = cms.optional.untracked.vstring,
+        pixelVertexSrcCPU = cms.InputTag("hltPixelVerticesCPUSerial"),
+        pixelVertexSrcGPU = cms.InputTag("hltPixelVertices"),
+        topFolderName = cms.string('SiPixelHeterogeneous/PixelVertexCompareGPUvsCPU')
+    )
+
+    process.HLTDQMPixelReconstruction = cms.Sequence(
+        process.hltSiPixelRecHitsSoAMonitorCPU
+      + process.hltSiPixelRecHitsSoAMonitorGPU
+      + process.hltSiPixelRecHitsSoACompareGPUvsCPU
+      + process.hltSiPixelTrackSoAMonitorCPU
+      + process.hltSiPixelTrackSoAMonitorGPU
+      + process.hltSiPixelTrackSoACompareGPUvsCPU
+      + process.hltSiPixelVertexSoAMonitorCPU
+      + process.hltSiPixelVertexSoAMonitorGPU
+      + process.hltSiPixelVertexSoACompareGPUvsCPU
+    )
+
+    # Add CPUSerial sequences to DQM_PixelReconstruction_v Path, right after HLTRecopixelvertexingSequence
+    dqmPixelRecoPath = getattr(process, dqmPixelRecoPathName)
+    try:
+        dqmPixelRecoPathIndex = dqmPixelRecoPath.index(process.HLTRecopixelvertexingSequence) + 1
+        for cpuSeqName in [
+            'HLTDoLocalPixelCPUSerialSequence',
+            'HLTRecopixelvertexingCPUSerialSequence',
+        ]:
+            dqmPixelRecoPath.insert(dqmPixelRecoPathIndex, getattr(process, cpuSeqName))
+            dqmPixelRecoPathIndex += 1
+    except Exception:
+        pass  # best effort: stop here if the Path or one of the sequences is not as expected
+
+    return process
+
+def customizeHLTforAlpakaPixelRecoLocal(process):
+    '''Customisation to introduce the Local Pixel Reconstruction in Alpaka: defines the ESProducers, the EDProducers,
+       the HLTDoLocalPixelTask and their CPUSerial ('serial_sync' backend) versions. Returns the modified process.'''
+    process.hltESPSiPixelCablingSoA = cms.ESProducer('SiPixelCablingSoAESProducer@alpaka', 
+        CablingMapLabel = cms.string(''),
+        UseQualityInfo = cms.bool(False),
+        appendToDataLabel = cms.string(''),
+        alpaka = cms.untracked.PSet(
+            backend = cms.untracked.string('')
+        )
+    )
+
+    process.hltESPSiPixelGainCalibrationForHLTSoA = cms.ESProducer('SiPixelGainCalibrationForHLTSoAESProducer@alpaka',
+        appendToDataLabel = cms.string(''),
+        alpaka = cms.untracked.PSet(
+            backend = cms.untracked.string('')
+        )
+    )
+
+    process.hltESPPixelCPEFastParamsPhase1 = cms.ESProducer('PixelCPEFastParamsESProducerAlpakaPhase1@alpaka', 
+        appendToDataLabel = cms.string(''),
+        alpaka = cms.untracked.PSet(
+            backend = cms.untracked.string('')
+        )
+    )
+
+    ### EDProducers (the ESProducers are defined above)
+
+    # alpaka EDProducer
+    # consumes
+    #  - reco::BeamSpot
+    # produces
+    #  - BeamSpotDeviceProduct
+    process.hltOnlineBeamSpotDevice = cms.EDProducer('BeamSpotDeviceProducer@alpaka',
+        src = cms.InputTag('hltOnlineBeamSpot'),
+        alpaka = cms.untracked.PSet(
+            backend = cms.untracked.string('')
+        )
+    )
+
+    # alpaka EDProducer
+    # consumes
+    #  - FEDRawDataCollection
+    # produces (* optional)
+    #  - SiPixelClustersSoA
+    #  - SiPixelDigisSoACollection
+    #  - SiPixelDigiErrorsSoACollection *
+    #  - SiPixelFormatterErrors *
+    process.hltSiPixelClusters = cms.EDProducer('SiPixelRawToClusterPhase1@alpaka',
+        mightGet = cms.optional.untracked.vstring,
+        IncludeErrors = cms.bool(True),
+        UseQualityInfo = cms.bool(False),
+        clusterThreshold_layer1 = cms.int32(4000),
+        clusterThreshold_otherLayers = cms.int32(4000),
+        VCaltoElectronGain      = cms.double(1),  # all gains=1, pedestals=0
+        VCaltoElectronGain_L1   = cms.double(1),
+        VCaltoElectronOffset    = cms.double(0),
+        VCaltoElectronOffset_L1 = cms.double(0),
+        InputLabel = cms.InputTag('rawDataCollector'),
+        Regions = cms.PSet(
+            inputs = cms.optional.VInputTag,
+            deltaPhi = cms.optional.vdouble,
+            maxZ = cms.optional.vdouble,
+            beamSpot = cms.optional.InputTag
+        ),
+        CablingMapLabel = cms.string(''),
+        # autoselect the alpaka backend
+        alpaka = cms.untracked.PSet(
+            backend = cms.untracked.string('')
+        )
+    )
+
+    process.hltSiPixelClustersLegacyFormat = cms.EDProducer('SiPixelDigisClustersFromSoAAlpakaPhase1',
+        src = cms.InputTag('hltSiPixelClusters'),
+        clusterThreshold_layer1 = cms.int32(4000),
+        clusterThreshold_otherLayers = cms.int32(4000),
+        produceDigis = cms.bool(False),
+        storeDigis = cms.bool(False)
+    )
+
+    process.hltSiPixelClustersCache = cms.EDProducer('SiPixelClusterShapeCacheProducer',
+        src = cms.InputTag( 'hltSiPixelClustersLegacyFormat' ),
+        onDemand = cms.bool( False )
+    )
+
+    # legacy EDProducer
+    # consumes
+    #  - SiPixelDigiErrorsHost
+    #  - SiPixelFormatterErrors
+    # produces
+    #  - edm::DetSetVector<SiPixelRawDataError>
+    #  - DetIdCollection
+    #  - DetIdCollection, 'UserErrorModules'
+    #  - edmNew::DetSetVector<PixelFEDChannel>
+    process.hltSiPixelDigiErrorsLegacyFormat = cms.EDProducer('SiPixelDigiErrorsFromSoAAlpaka',
+        digiErrorSoASrc = cms.InputTag('hltSiPixelClusters'),
+        fmtErrorsSoASrc = cms.InputTag('hltSiPixelClusters'),
+        CablingMapLabel = cms.string(''),
+        UsePhase1 = cms.bool(True),
+        ErrorList = cms.vint32(29),
+        UserErrorList = cms.vint32(40)
+    )
+
+    # alpaka EDProducer
+    # consumes
+    #  - BeamSpotDeviceProduct
+    #  - SiPixelClustersSoA
+    #  - SiPixelDigisCollection
+    # produces
+    #  - TrackingRecHitAlpakaCollection<TrackerTraits>
+    process.hltSiPixelRecHits = cms.EDProducer('SiPixelRecHitAlpakaPhase1@alpaka',
+        beamSpot = cms.InputTag('hltOnlineBeamSpotDevice'),
+        src = cms.InputTag('hltSiPixelClusters'),
+        CPE = cms.string('PixelCPEFastParams'),
+        mightGet = cms.optional.untracked.vstring,
+        # autoselect the alpaka backend
+        alpaka = cms.untracked.PSet(
+            backend = cms.untracked.string('')
+        )
+    )
+
+    process.hltSiPixelRecHitsLegacyFormat = cms.EDProducer('SiPixelRecHitFromSoAAlpakaPhase1',
+        pixelRecHitSrc = cms.InputTag('hltSiPixelRecHits'),
+        src = cms.InputTag('hltSiPixelClustersLegacyFormat'),
+    )
+
+    ###
+    ### Task: Pixel Local Reconstruction
+    ###
+    process.HLTDoLocalPixelTask = cms.ConditionalTask(
+        process.hltOnlineBeamSpotDevice,
+        process.hltSiPixelClusters,
+        process.hltSiPixelClustersLegacyFormat,   # was: hltSiPixelClusters
+        process.hltSiPixelClustersCache,          # really needed ??
+        process.hltSiPixelDigiErrorsLegacyFormat, # was: hltSiPixelDigis
+        process.hltSiPixelRecHits,
+        process.hltSiPixelRecHitsLegacyFormat,    # was: hltSiPixelRecHits
+    )
+
+    ###
+    ### CPUSerial version of Pixel Local Reconstruction: clones with the 'serial_sync' backend and '*CPUSerial' inputs
+    ###
+    process.hltOnlineBeamSpotDeviceCPUSerial = process.hltOnlineBeamSpotDevice.clone(
+        alpaka = dict( backend = 'serial_sync' )
+    )
+
+    process.hltSiPixelClustersCPUSerial = process.hltSiPixelClusters.clone(
+        alpaka = dict( backend = 'serial_sync' )
+    )
+
+    process.hltSiPixelClustersLegacyFormatCPUSerial = process.hltSiPixelClustersLegacyFormat.clone(
+        src = 'hltSiPixelClustersCPUSerial'
+    )
+
+    process.hltSiPixelDigiErrorsLegacyFormatCPUSerial = process.hltSiPixelDigiErrorsLegacyFormat.clone(
+        digiErrorSoASrc = 'hltSiPixelClustersCPUSerial',
+        fmtErrorsSoASrc = 'hltSiPixelClustersCPUSerial',
+    )
+
+    process.hltSiPixelRecHitsCPUSerial = process.hltSiPixelRecHits.clone(
+        beamSpot = 'hltOnlineBeamSpotDeviceCPUSerial',
+        src = 'hltSiPixelClustersCPUSerial',
+        alpaka = dict( backend = 'serial_sync' )
+    )
+
+    process.hltSiPixelRecHitsLegacyFormatCPUSerial = process.hltSiPixelRecHitsLegacyFormat.clone(
+        pixelRecHitSrc = 'hltSiPixelRecHitsCPUSerial',
+        src = 'hltSiPixelClustersLegacyFormatCPUSerial',
+    )
+
+    process.HLTDoLocalPixelCPUSerialTask = cms.ConditionalTask(
+        process.hltOnlineBeamSpotDeviceCPUSerial,
+        process.hltSiPixelClustersCPUSerial,
+        process.hltSiPixelClustersLegacyFormatCPUSerial,
+        process.hltSiPixelDigiErrorsLegacyFormatCPUSerial,
+        process.hltSiPixelRecHitsCPUSerial,
+        process.hltSiPixelRecHitsLegacyFormatCPUSerial,
+    )
+
+    process.HLTDoLocalPixelCPUSerialSequence = cms.Sequence( process.HLTDoLocalPixelCPUSerialTask )
+
+    return process
+
+def customizeHLTforAlpakaPixelRecoTracking(process):
+    '''Customisation to introduce the Pixel-Track Reconstruction in Alpaka: defines hltPixelTracks, its legacy-format
+       converter, their CPUSerial ('serial_sync' backend) versions and the corresponding Tasks. Returns the process.'''
+
+    # alpaka EDProducer
+    # consumes
+    #  - TrackingRecHitsSoACollection<TrackerTraits>
+    # produces
+    #  - TkSoADevice
+    process.hltPixelTracks = cms.EDProducer('CAHitNtupletAlpakaPhase1@alpaka',
+        pixelRecHitSrc = cms.InputTag('hltSiPixelRecHits'),
+        CPE = cms.string('PixelCPEFastParams'),
+        ptmin = cms.double(0.89999997615814209),
+        CAThetaCutBarrel = cms.double(0.0020000000949949026),
+        CAThetaCutForward = cms.double(0.0030000000260770321),
+        hardCurvCut = cms.double(0.032840722495894911),
+        dcaCutInnerTriplet = cms.double(0.15000000596046448),
+        dcaCutOuterTriplet = cms.double(0.25),
+        earlyFishbone = cms.bool(True),
+        lateFishbone = cms.bool(False),
+        fillStatistics = cms.bool(False),
+        minHitsPerNtuplet = cms.uint32(3),
+        maxNumberOfDoublets = cms.uint32(524288),
+        minHitsForSharingCut = cms.uint32(10),
+        fitNas4 = cms.bool(False),
+        doClusterCut = cms.bool(True),
+        doZ0Cut = cms.bool(True),
+        doPtCut = cms.bool(True),
+        useRiemannFit = cms.bool(False),
+        doSharedHitCut = cms.bool(True),
+        dupPassThrough = cms.bool(False),
+        useSimpleTripletCleaner = cms.bool(True),
+        idealConditions = cms.bool(False),
+        includeJumpingForwardDoublets = cms.bool(True),
+        trackQualityCuts = cms.PSet(
+            chi2MaxPt = cms.double(10),
+            chi2Coeff = cms.vdouble(0.9, 1.8),
+            chi2Scale = cms.double(8),
+            tripletMinPt = cms.double(0.5),
+            tripletMaxTip = cms.double(0.3),
+            tripletMaxZip = cms.double(12),
+            quadrupletMinPt = cms.double(0.3),
+            quadrupletMaxTip = cms.double(0.5),
+            quadrupletMaxZip = cms.double(12)
+        ),
+        # autoselect the alpaka backend
+        alpaka = cms.untracked.PSet(
+            backend = cms.untracked.string('')
+        )
+    )
+
+    process.hltPixelTracksCPUSerial = process.hltPixelTracks.clone(
+        pixelRecHitSrc = 'hltSiPixelRecHitsCPUSerial',
+        alpaka = dict( backend = 'serial_sync' )
+    )
+
+    process.hltPixelTracksLegacyFormat = cms.EDProducer("PixelTrackProducerFromSoAAlpakaPhase1",
+        beamSpot = cms.InputTag("hltOnlineBeamSpot"),
+        minNumberOfHits = cms.int32(0),
+        minQuality = cms.string('loose'),
+        pixelRecHitLegacySrc = cms.InputTag("hltSiPixelRecHitsLegacyFormat"),
+        trackSrc = cms.InputTag("hltPixelTracks")
+    )
+
+    process.hltPixelTracksLegacyFormatCPUSerial = process.hltPixelTracksLegacyFormat.clone(
+        pixelRecHitLegacySrc = cms.InputTag("hltSiPixelRecHitsLegacyFormatCPUSerial"),
+        trackSrc = cms.InputTag("hltPixelTracksCPUSerial")
+    )
+
+    process.HLTRecoPixelTracksTask = cms.ConditionalTask(
+        process.hltPixelTracks,
+        process.hltPixelTracksLegacyFormat,
+    )
+
+    process.HLTRecoPixelTracksCPUSerialTask = cms.ConditionalTask(
+        process.hltPixelTracksCPUSerial,
+        process.hltPixelTracksLegacyFormatCPUSerial,
+    )
+
+    process.HLTRecoPixelTracksCPUSerialSequence = cms.Sequence( process.HLTRecoPixelTracksCPUSerialTask )
+
+    return process
+
+def customizeHLTforAlpakaPixelRecoVertexing(process):
+    '''Customisation to introduce the Pixel-Vertex Reconstruction in Alpaka: defines hltPixelVertices, its legacy-format
+       converter, their CPUSerial versions and the Tasks (which include the HLTRecoPixelTracks*Task). Returns the process.'''
+
+    # alpaka EDProducer
+    # consumes
+    #  - TkSoADevice
+    # produces
+    #  - ZVertexDevice
+    process.hltPixelVertices = cms.EDProducer('PixelVertexProducerAlpakaPhase1@alpaka',
+        oneKernel = cms.bool(True),
+        useDensity = cms.bool(True),
+        useDBSCAN = cms.bool(False),
+        useIterative = cms.bool(False),
+        minT = cms.int32(2),
+        eps = cms.double(0.07),
+        errmax = cms.double(0.01),
+        chi2max = cms.double(9),
+        PtMin = cms.double(0.5),
+        PtMax = cms.double(75),
+        pixelTrackSrc = cms.InputTag('hltPixelTracks'),
+        # autoselect the alpaka backend
+        alpaka = cms.untracked.PSet(
+            backend = cms.untracked.string('')
+        )
+    )
+
+    process.hltPixelVerticesCPUSerial = process.hltPixelVertices.clone(
+        pixelTrackSrc = 'hltPixelTracksCPUSerial',
+        alpaka = dict( backend = 'serial_sync' )
+    )
+
+    process.hltPixelVerticesLegacyFormat = cms.EDProducer("PixelVertexProducerFromSoAAlpaka",
+        TrackCollection = cms.InputTag("hltPixelTracksLegacyFormat"),
+        beamSpot = cms.InputTag("hltOnlineBeamSpot"),
+        src = cms.InputTag("hltPixelVertices")
+    )
+
+    process.hltPixelVerticesLegacyFormatCPUSerial = process.hltPixelVerticesLegacyFormat.clone(
+        TrackCollection = cms.InputTag("hltPixelTracksLegacyFormatCPUSerial"),
+        src = cms.InputTag("hltPixelVerticesCPUSerial")
+    )
+
+    process.HLTRecopixelvertexingTask = cms.ConditionalTask(
+        process.HLTRecoPixelTracksTask,
+        process.hltPixelVertices,
+        process.hltPixelVerticesLegacyFormat,
+    )
+
+    process.HLTRecopixelvertexingCPUSerialTask = cms.ConditionalTask(
+        process.HLTRecoPixelTracksCPUSerialTask,
+        process.hltPixelVerticesCPUSerial,
+        process.hltPixelVerticesLegacyFormatCPUSerial,
+    )
+
+    process.HLTRecopixelvertexingCPUSerialSequence = cms.Sequence( process.HLTRecopixelvertexingCPUSerialTask )
+
+    return process
+
+def customizeHLTforAlpakaPixelReco(process):
+    '''Load the accelerator and Alpaka configuration fragments, then introduce the Pixel Local, Track and Vertex
+       Reconstruction in Alpaka. Returns the customised process.'''
+    process.load('Configuration.StandardSequences.Accelerators_cff')
+    process.load('HeterogeneousCore.AlpakaCore.ProcessAcceleratorAlpaka_cfi')
+
+    # apply the Local, Tracking and Vertexing customisations, in this order
+    for customise in (customizeHLTforAlpakaPixelRecoLocal, customizeHLTforAlpakaPixelRecoTracking,
+                      customizeHLTforAlpakaPixelRecoVertexing):
+        process = customise(process)
+    return process
+
+def customizeHLTforPatatrack(process):
+    '''Customise the HLT configuration with the latest Patatrack developments:
+       currently the Alpaka pixel reconstruction. Returns the customised process.
+    '''
+    return customizeHLTforAlpakaPixelReco(process)
diff --git a/HeterogeneousCore/AlpakaCore/python/functions.py b/HeterogeneousCore/AlpakaCore/python/functions.py
new file mode 100644
index 0000000000000..5b79a1b205631
--- /dev/null
+++ b/HeterogeneousCore/AlpakaCore/python/functions.py
@@ -0,0 +1,23 @@
+def makeSerialClone(module, **kwargs):
+    '''Return a clone of an alpaka module (kwargs are passed to clone()) whose type is 'alpaka_serial_sync::<base>'.
+
+    Raises TypeError if the module type is neither '<base>@alpaka' nor one of the 'alpaka_<backend>::<base>' below.
+    '''
+    moduleType = module._TypedParameterizable__type
+    explicitBackends = ('alpaka_serial_sync::', 'alpaka_cuda_async::', 'alpaka_rocm_async::')
+    if moduleType.endswith('@alpaka'):
+        # alpaka module with automatic backend selection
+        base = moduleType.removesuffix('@alpaka')
+    elif moduleType.startswith(explicitBackends):
+        # alpaka module with explicit serial_sync, cuda_async or rocm_async backend: drop the '<namespace>::' prefix
+        base = moduleType.split('::', 1)[1]
+    else:
+        # non-alpaka module
+        raise TypeError('%s is not an alpaka-based module, and cannot be used with makeSerialClone()' % str(module))
+
+    serialClone = module.clone(**kwargs)
+    serialClone._TypedParameterizable__type = 'alpaka_serial_sync::' + base
+    # the clone must not carry an 'alpaka' parameter
+    if 'alpaka' in serialClone.parameterNames_():
+        del serialClone.alpaka
+    return serialClone
diff --git a/HeterogeneousCore/AlpakaTest/test/writer.py b/HeterogeneousCore/AlpakaTest/test/writer.py
index bd8d2775b31ed..d23ac528629b8 100644
--- a/HeterogeneousCore/AlpakaTest/test/writer.py
+++ b/HeterogeneousCore/AlpakaTest/test/writer.py
@@ -1,4 +1,5 @@
 import FWCore.ParameterSet.Config as cms
+from HeterogeneousCore.AlpakaCore.functions import *
 
 process = cms.Process('Writer')
 
@@ -31,16 +32,9 @@
 )
 
 # run a second producer explicitly on the cpu
-process.testProducerSerial = cms.EDProducer('alpaka_serial_sync::TestAlpakaProducer',
+process.testProducerSerial = makeSerialClone(process.testProducer,
     size = cms.int32(99)
 )
-# an alternative approach would be to use
-#process.testProducerSerial = cms.EDProducer('TestAlpakaProducer@alpaka',
-#    size = cms.int32(99),
-#    alpaka = cms.untracked.PSet(
-#        backend = cms.untracked.string("serial_sync")
-#    )
-#)
 
 # analyse the second set of products
 process.testAnalyzerSerial = cms.EDAnalyzer('TestAlpakaAnalyzer',
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc
index 820b6b237c7e5..0bfa989c92969 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc
@@ -21,7 +21,8 @@
 // local include(s)
 #include "PixelClusterizerBase.h"
 
-// #define GPU_DEBUG
+//#define GPU_DEBUG
+
 template <typename TrackerTraits>
 class SiPixelDigisClustersFromSoAT : public edm::global::EDProducer<> {
 public:
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoAAlpaka.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoAAlpaka.cc
index ad05ad3ff60c9..423951f4cb74f 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoAAlpaka.cc
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoAAlpaka.cc
@@ -21,8 +21,9 @@
 // local include(s)
 #include "PixelClusterizerBase.h"
 
-// #define EDM_ML_DEBUG
-// #define GPU_DEBUG
+//#define EDM_ML_DEBUG
+//#define GPU_DEBUG
+
 template <typename TrackerTraits>
 class SiPixelDigisClustersFromSoAAlpaka : public edm::global::EDProducer<> {
 public:
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
index 56718b4bdae14..452b0e2097071 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -33,7 +33,7 @@
 #include "gpuClusterChargeCut.h"
 #include "gpuClustering.h"
 
-// #define GPU_DEBUG
+//#define GPU_DEBUG
 
 namespace pixelgpudetails {
 
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
index 06b30da68c8cd..fe9cc260a5853 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -18,7 +18,7 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelClusterThresholds.h"
 
-// #define GPU_DEBUG
+//#define GPU_DEBUG
 
 struct SiPixelROCsStatusAndMapping;
 class SiPixelGainForHLTonGPU;
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h
index ff885b5bad07f..d1f5509052468 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h
@@ -18,7 +18,7 @@
 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelClusterThresholds.h"
 
-// #define GPU_DEBUG
+//#define GPU_DEBUG
 
 namespace calibPixel {
   using namespace cms::alpakatools;
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/ClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/ClusterChargeCut.h
index c149707e41d9a..4056090517aee 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/ClusterChargeCut.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/ClusterChargeCut.h
@@ -10,7 +10,7 @@
 #include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h"
 #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelClusterThresholds.h"
 
-// #define GPU_DEBUG
+//#define GPU_DEBUG
 
 namespace pixelClustering {
 
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h
index 616ccbd3eb8c7..7da68c7b2f5da 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h
@@ -5,15 +5,16 @@
 #include <cstdint>
 #include <cstdio>
 #include <type_traits>
+
 #include <alpaka/alpaka.hpp>
 
-#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
-#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
 #include "DataFormats/SiPixelClusterSoA/interface/ClusteringConstants.h"
 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
 #include "HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
 
-// #define GPU_DEBUG
+//#define GPU_DEBUG
 
 namespace ALPAKA_ACCELERATOR_NAMESPACE {
 
@@ -140,7 +141,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
         // find the index of the first pixel not belonging to this module (or invalid)
         auto& msize = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
 
-        const uint32_t blockIdx(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+        const uint32_t blockIdx = alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u];
         if (blockIdx >= clus_view[0].moduleStart())
           return;
 
@@ -274,11 +275,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
           ALPAKA_ASSERT_OFFLOAD((hist.size() / blockDimension) <= maxiter);
 
           // NB: can be tuned.
-          constexpr uint32_t threadDimension = cms::alpakatools::requires_single_thread_per_block_v<TAcc> ? 1 : 256;
+          constexpr uint32_t threadDimension = cms::alpakatools::requires_single_thread_per_block_v<TAcc> ? 256 : 1;
 
 #ifndef NDEBUG
-          [[maybe_unused]] const uint32_t runTimeThreadDimension(
-              alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u]);
+          [[maybe_unused]] const uint32_t runTimeThreadDimension =
+              alpaka::getWorkDiv<alpaka::Thread, alpaka::Elems>(acc)[0u];
           ALPAKA_ASSERT_OFFLOAD(runTimeThreadDimension <= threadDimension);
 #endif
 
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc
index 3e7caf8b2b3a4..597aaa70987f4 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc
@@ -29,7 +29,7 @@
 #include "PixelClustering.h"
 #include "SiPixelRawToClusterKernel.h"
 
-// #define GPU_DEBUG
+//#define GPU_DEBUG
 
 namespace ALPAKA_ACCELERATOR_NAMESPACE {
   namespace pixelDetails {
diff --git a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py
index 8d78599d07d9c..a6dd2bea80e2a 100644
--- a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py
+++ b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py
@@ -1,4 +1,5 @@
 import FWCore.ParameterSet.Config as cms
+from HeterogeneousCore.AlpakaCore.functions import *
 from Configuration.Eras.Modifier_run3_common_cff import run3_common
 from Configuration.ProcessModifiers.gpu_cff import gpu
 from Configuration.ProcessModifiers.alpaka_cff import alpaka
@@ -130,11 +131,7 @@ def _addProcessCalibTrackerAlpakaES(process):
 ))
 
 # reconstruct the pixel digis and clusters with alpaka on the cpu, for validation
-siPixelClustersPreSplittingAlpakaSerial = siPixelClustersPreSplittingAlpaka.clone(
-    #alpaka = dict( backend = '*' )
-    alpaka = None
-)
-siPixelClustersPreSplittingAlpakaSerial._TypedParameterizable__type = 'alpaka_serial_sync' + siPixelClustersPreSplittingAlpaka._TypedParameterizable__type.removesuffix('@alpaka')
+siPixelClustersPreSplittingAlpakaSerial = makeSerialClone(siPixelClustersPreSplittingAlpaka)
 
 from RecoLocalTracker.SiPixelClusterizer.siPixelDigisClustersFromSoAAlpakaPhase1_cfi import siPixelDigisClustersFromSoAAlpakaPhase1 as _siPixelDigisClustersFromSoAAlpakaPhase1
 from RecoLocalTracker.SiPixelClusterizer.siPixelDigisClustersFromSoAAlpakaPhase2_cfi import siPixelDigisClustersFromSoAAlpakaPhase2 as _siPixelDigisClustersFromSoAAlpakaPhase2
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu
index 61442ea9d2b8c..b1e5e1c3c90e9 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu
@@ -12,7 +12,8 @@
 
 #include "PixelRecHitGPUKernel.h"
 #include "gpuPixelRecHits.h"
-// #define GPU_DEBUG
+
+//#define GPU_DEBUG
 
 namespace {
   template <typename TrackerTraits>
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h
index 25cc724cd4c4a..407a18be04fa9 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h
@@ -10,7 +10,9 @@
 #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h"
 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+
 //#define GPU_DEBUG
+
 namespace pixelgpudetails {
 
   template <typename TrackerTraits>
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSoAAlpaka.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSoAAlpaka.cc
index 9881aeab46bab..a76ff6af49ac9 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSoAAlpaka.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSoAAlpaka.cc
@@ -9,7 +9,6 @@
 #include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h"
 #include "FWCore/Framework/interface/Event.h"
 #include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
 #include "FWCore/Framework/interface/global/EDProducer.h"
 #include "FWCore/MessageLogger/interface/MessageLogger.h"
 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
@@ -181,7 +180,10 @@ void SiPixelRecHitFromSoAAlpaka<TrackerTraits>::produce(edm::StreamID streamID,
 }
 
 using SiPixelRecHitFromSoAAlpakaPhase1 = SiPixelRecHitFromSoAAlpaka<pixelTopology::Phase1>;
-DEFINE_FWK_MODULE(SiPixelRecHitFromSoAAlpakaPhase1);
-
 using SiPixelRecHitFromSoAAlpakaPhase2 = SiPixelRecHitFromSoAAlpaka<pixelTopology::Phase2>;
+using SiPixelRecHitFromSoAAlpakaHIonPhase1 = SiPixelRecHitFromSoAAlpaka<pixelTopology::HIonPhase1>;
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(SiPixelRecHitFromSoAAlpakaPhase1);
 DEFINE_FWK_MODULE(SiPixelRecHitFromSoAAlpakaPhase2);
+DEFINE_FWK_MODULE(SiPixelRecHitFromSoAAlpakaHIonPhase1);
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHits.h
index 220a91b85ced3..45587034b572b 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHits.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHits.h
@@ -19,7 +19,8 @@
 #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h"
 
-//#define GPU_DEBUG 1
+//#define GPU_DEBUG
+
 namespace ALPAKA_ACCELERATOR_NAMESPACE {
   namespace pixelRecHits {
 
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
index 94ae258cc16fb..55c556bd63048 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
@@ -7,13 +7,14 @@
 
 #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
 #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
+#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h"
 #include "DataFormats/Math/interface/approx_atan2.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
-#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
 
-//#define GPU_DEBUG 1
+//#define GPU_DEBUG
+
 namespace gpuPixelRecHits {
 
   template <typename TrackerTraits>
diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py
index e6b2c9832600c..7e8910a8e0918 100644
--- a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py
+++ b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py
@@ -1,4 +1,5 @@
 import FWCore.ParameterSet.Config as cms
+from HeterogeneousCore.AlpakaCore.functions import *
 from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA
 from Configuration.ProcessModifiers.gpu_cff import gpu
 from Configuration.ProcessModifiers.alpaka_cff import alpaka
@@ -139,12 +140,9 @@
 ))
 
 # Hit SoA producer on the cpu, for validation
-siPixelRecHitsPreSplittingAlpakaSerial = siPixelRecHitsPreSplittingAlpaka.clone(
-    src = "siPixelClustersPreSplittingAlpakaSerial",
-    #alpaka = dict( backend = '*' )
-    alpaka = None
+siPixelRecHitsPreSplittingAlpakaSerial = makeSerialClone(siPixelRecHitsPreSplittingAlpaka,
+    src = "siPixelClustersPreSplittingAlpakaSerial"
 )
-siPixelRecHitsPreSplittingAlpakaSerial._TypedParameterizable__type = 'alpaka_serial_sync' + siPixelRecHitsPreSplittingAlpaka._TypedParameterizable__type.removesuffix('@alpaka')
 
 from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromSoAAlpakaPhase1_cfi import siPixelRecHitFromSoAAlpakaPhase1 as _siPixelRecHitFromSoAAlpakaPhase1
 from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromSoAAlpakaPhase2_cfi import siPixelRecHitFromSoAAlpakaPhase2 as _siPixelRecHitFromSoAAlpakaPhase2
diff --git a/RecoTracker/Configuration/python/RecoPixelVertexing_cff.py b/RecoTracker/Configuration/python/RecoPixelVertexing_cff.py
index c08a0987d3f59..f5ba3ad7df1da 100644
--- a/RecoTracker/Configuration/python/RecoPixelVertexing_cff.py
+++ b/RecoTracker/Configuration/python/RecoPixelVertexing_cff.py
@@ -1,4 +1,5 @@
 import FWCore.ParameterSet.Config as cms
+from HeterogeneousCore.AlpakaCore.functions import *
 from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA
 
 from RecoTracker.PixelTrackFitting.PixelTracks_cff import *
@@ -98,6 +99,31 @@
     pixelVerticesTask.copy()
 ))
 
+## pixel vertex reconstruction with Alpaka
+
+# pixel vertex SoA producer with alpaka on the device
+from RecoTracker.PixelVertexFinding.pixelVertexProducerAlpakaPhase1_cfi import pixelVertexProducerAlpakaPhase1 as _pixelVerticesAlpakaPhase1
+from RecoTracker.PixelVertexFinding.pixelVertexProducerAlpakaPhase2_cfi import pixelVertexProducerAlpakaPhase2 as _pixelVerticesAlpakaPhase2
+pixelVerticesAlpaka = _pixelVerticesAlpakaPhase1.clone()  # Phase-1 producer by default
+phase2_tracker.toReplaceWith(pixelVerticesAlpaka,_pixelVerticesAlpakaPhase2.clone())  # Phase-2 producer when phase2_tracker is active
+
+from RecoTracker.PixelVertexFinding.pixelVertexFromSoAAlpaka_cfi import pixelVertexFromSoAAlpaka as _pixelVertexFromSoAAlpaka
+alpaka.toReplaceWith(pixelVertices, _pixelVertexFromSoAAlpaka.clone())  # with the alpaka modifier, pixelVertices becomes the SoA-to-legacy converter
+
+# pixel vertex SoA producer with alpaka on the cpu, for validation
+pixelVerticesAlpakaSerial = makeSerialClone(pixelVerticesAlpaka,
+    pixelTrackSrc = 'pixelTracksAlpakaSerial'  # read the tracks from the serial clone of the track producer
+)
+
+alpaka.toReplaceWith(pixelVerticesTask, cms.Task(  # with the alpaka modifier, the task holds the three modules below
+    # Build the pixel vertices in SoA format with alpaka on the device
+    pixelVerticesAlpaka,
+    # Build the pixel vertices in SoA format with alpaka on the cpu (if requested by the validation)
+    pixelVerticesAlpakaSerial,
+    # Convert the pixel vertices from SoA format (on the host) to the legacy format
+    pixelVertices
+))
+
 # Tasks and Sequences
 recopixelvertexingTask = cms.Task(
     pixelTracksTask,
diff --git a/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py b/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py
index 3d121a8736f8e..55a02f83f913c 100644
--- a/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py
+++ b/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py
@@ -3,10 +3,12 @@
 # Customise the Pixel-only reconstruction to run on GPU
 #
 # Run the unpacker, clustering, ntuplets, track fit and vertex reconstruction on GPU.
+# The CUDA and Alpaka products coexist here for the moment
+
 def customizePixelOnlyForProfilingGPUOnly(process):
 
   process.consumer = cms.EDAnalyzer("GenericConsumer",
-      eventProducts = cms.untracked.vstring('pixelTracksCUDA', 'pixelVerticesCUDA')
+      eventProducts = cms.untracked.vstring('pixelTracksCUDA', 'pixelVerticesCUDA', '*DeviceProduct_pixelTracksAlpaka_*_*', '*DeviceProduct_pixelVerticesAlpaka_*_*')
   )
 
   process.consume_step = cms.EndPath(process.consumer)
@@ -25,10 +27,8 @@ def customizePixelOnlyForProfilingGPUOnly(process):
 # tracks and vertices on the CPU in SoA format, without conversion to legacy format.
 def customizePixelOnlyForProfilingGPUWithHostCopy(process):
 
-  #? process.siPixelRecHitSoAFromLegacy.convertToLegacy = False
-
   process.consumer = cms.EDAnalyzer("GenericConsumer",
-      eventProducts = cms.untracked.vstring('pixelTracksSoA', 'pixelVerticesSoA')
+      eventProducts = cms.untracked.vstring('pixelTracksSoA', 'pixelVerticesSoA', 'pixelTracksAlpaka', 'pixelVerticesAlpaka')
   )
 
   process.consume_step = cms.EndPath(process.consumer)
diff --git a/RecoTracker/PixelSeeding/plugins/BuildFile.xml b/RecoTracker/PixelSeeding/plugins/BuildFile.xml
index 82b80e1c55b66..f9863a6a8c292 100644
--- a/RecoTracker/PixelSeeding/plugins/BuildFile.xml
+++ b/RecoTracker/PixelSeeding/plugins/BuildFile.xml
@@ -1,21 +1,36 @@
 <use name="ofast-flag"/>
-<use name="CUDADataFormats/Track"/>
-<use name="CUDADataFormats/TrackingRecHit"/>
 <use name="CommonTools/RecoAlgos"/>
 <use name="FWCore/Framework"/>
 <use name="FWCore/ParameterSet"/>
 <use name="FWCore/PluginManager"/>
-<use name="HeterogeneousCore/CUDACore"/>
 <use name="RecoTracker/PixelSeeding"/>
 <use name="RecoTracker/Record"/>
 <use name="RecoTracker/TkTrackingRegions"/>
 <use name="RecoTracker/TkSeedingLayers"/>
+
 <iftool name="cuda-gcc-support">
   <use name="cuda"/>
   <set name="cuda_src" value="*.cu"/>
 <else/>
   <set name="cuda_src" value=""/>
 </iftool>
-<library file="*.cc ${cuda_src}" name="RecoPixelVertexingPixelTripletsPlugins">
+<library file="*.cc ${cuda_src}" name="RecoTrackerPixelSeedingPlugins">
+ <use name="HeterogeneousCore/CUDACore"/>
+ <use name="CUDADataFormats/Track"/>
+ <use name="CUDADataFormats/TrackingRecHit"/>
+ <flags EDM_PLUGIN="1"/>
+</library>
+
+<library file="alpaka/*.cc" name="RecoTrackerPixelSeedingPortable">
+ <use name="alpaka"/>
+  <use name="DataFormats/Portable"/>
+  <use name="DataFormats/TrackSoA"/>
+  <use name="DataFormats/TrackingRecHitSoA"/>
+  <use name="FWCore/Utilities"/>
+  <use name="HeterogeneousCore/AlpakaCore"/>
+  <use name="HeterogeneousCore/AlpakaInterface"/>
+  <use name="RecoLocalTracker/Records"/>
+  <use name="RecoLocalTracker/SiPixelRecHits"/>
+  <flags ALPAKA_BACKENDS="1"/>
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cu b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cu
index efb2a2e17715c..6e07126e9e428 100644
--- a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cu
+++ b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cu
@@ -1,8 +1,9 @@
-#include "RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsImpl.h"
 #include <mutex>
 
-// #define NTUPLE_DEBUG
-// #define GPU_DEBUG
+#include "RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsImpl.h"
+
+//#define GPU_DEBUG
+//#define NTUPLE_DEBUG
 
 template <typename TrackerTraits>
 void CAHitNtupletGeneratorKernelsGPU<TrackerTraits>::launchKernels(const HitsConstView &hh,
diff --git a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.h b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.h
index 0865fa5cbc46a..250aef21c1d6a 100644
--- a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.h
+++ b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.h
@@ -1,18 +1,17 @@
 #ifndef RecoTracker_PixelSeeding_plugins_CAHitNtupletGeneratorKernels_h
 #define RecoTracker_PixelSeeding_plugins_CAHitNtupletGeneratorKernels_h
 
-// #define GPU_DEBUG
+//#define GPU_DEBUG
+//#define DUMP_GPU_TK_TUPLES
 
-#include "GPUCACell.h"
-#include "gpuPixelDoublets.h"
-
-#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
-#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h"
 #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
-#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h"
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h"
 
-// #define DUMP_GPU_TK_TUPLES
+#include "GPUCACell.h"
+#include "gpuPixelDoublets.h"
 
 namespace caHitNtupletGenerator {
 
diff --git a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsAlloc.cc b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsAlloc.cc
index 6acff4abbd531..64148d5f5ba81 100644
--- a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsAlloc.cc
+++ b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsAlloc.cc
@@ -2,7 +2,8 @@
 
 #include "CAHitNtupletGeneratorKernels.h"
 
-// #define GPU_DEBUG
+//#define GPU_DEBUG
+
 template <typename TrackerTraits>
 #ifdef __CUDACC__
 void CAHitNtupletGeneratorKernelsGPU<TrackerTraits>::allocateOnGPU(int32_t nHits, cudaStream_t stream) {
diff --git a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsImpl.h
index 540c0b92f9015..57e4ea6f9441f 100644
--- a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsImpl.h
+++ b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsImpl.h
@@ -2,8 +2,8 @@
 // Original Author: Felice Pantaleo, CERN
 //
 
-// #define NTUPLE_DEBUG
-// #define GPU_DEBUG
+//#define NTUPLE_DEBUG
+//#define GPU_DEBUG
 
 #include <cmath>
 #include <cstdint>
@@ -11,15 +11,14 @@
 
 #include <cuda_runtime.h>
 
+#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
 
-#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"
-#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h"
-
-#include "CAStructures.h"
 #include "CAHitNtupletGeneratorKernels.h"
+#include "CAStructures.h"
 #include "GPUCACell.h"
 #include "gpuFishbone.h"
 #include "gpuPixelDoublets.h"
diff --git a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorOnGPU.cc
index faf0bae6fb0a9..5100cf734142c 100644
--- a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorOnGPU.cc
+++ b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -2,8 +2,8 @@
 // Original Author: Felice Pantaleo, CERN
 //
 
-// #define GPU_DEBUG
-// #define DUMP_GPU_TK_TUPLES
+//#define GPU_DEBUG
+//#define DUMP_GPU_TK_TUPLES
 
 #include <array>
 #include <cassert>
diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/BrokenLineFit.dev.cc b/RecoTracker/PixelSeeding/plugins/alpaka/BrokenLineFit.dev.cc
new file mode 100644
index 0000000000000..a21fed668b54c
--- /dev/null
+++ b/RecoTracker/PixelSeeding/plugins/alpaka/BrokenLineFit.dev.cc
@@ -0,0 +1,412 @@
+//
+// Author: Felice Pantaleo, CERN
+//
+
+//#define BROKENLINE_DEBUG
+//#define BL_DUMP_HITS
+#include <alpaka/alpaka.hpp>
+#include <cstdint>
+
+#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h"
+#include "RecoTracker/PixelTrackFitting/interface/alpaka/BrokenLine.h"
+
+#include "HelixFit.h"
+
+template <typename TrackerTraits>
+using Tuples = typename reco::TrackSoA<TrackerTraits>::HitContainer;  // per-track lists of hit indices (the n-tuplets)
+template <typename TrackerTraits>
+using OutputSoAView = reco::TrackSoAView<TrackerTraits>;  // track SoA view that receives the fit results
+template <typename TrackerTraits>
+using TupleMultiplicity = caStructures::TupleMultiplicityT<TrackerTraits>;  // track ids binned by number of hits
+
+// #define BL_DUMP_HITS
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  template <int N, typename TrackerTraits>
+  class Kernel_BLFastFit {  // kernel functor: collects N hits per n-tuplet and runs brokenline::fastFit on them
+  public:
+    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+    ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                  Tuples<TrackerTraits> const *__restrict__ foundNtuplets,
+                                  TupleMultiplicity<TrackerTraits> const *__restrict__ tupleMultiplicity,
+                                  TrackingRecHitSoAConstView<TrackerTraits> hh,
+                                  pixelCPEforDevice::ParamsOnDeviceT<TrackerTraits> const *__restrict__ cpeParams,
+                                  typename TrackerTraits::tindex_type *__restrict__ ptkids,  // out: track id of each fit slot
+                                  double *__restrict__ phits,                                // out: global hit positions
+                                  float *__restrict__ phits_ge,                              // out: global hit errors
+                                  double *__restrict__ pfast_fit,                            // out: fast-fit results
+                                  uint32_t nHitsL,                                           // lowest hit multiplicity selected
+                                  uint32_t nHitsH,                                           // highest hit multiplicity selected
+                                  int32_t offset) const {  // index of the first selected tuple handled by this launch
+      constexpr uint32_t hitsInFit = N;
+      constexpr auto invalidTkId = std::numeric_limits<typename TrackerTraits::tindex_type>::max();
+
+      ALPAKA_ASSERT_OFFLOAD(hitsInFit <= nHitsL);
+      ALPAKA_ASSERT_OFFLOAD(nHitsL <= nHitsH);
+      ALPAKA_ASSERT_OFFLOAD(phits);
+      ALPAKA_ASSERT_OFFLOAD(pfast_fit);
+      ALPAKA_ASSERT_OFFLOAD(foundNtuplets);
+      ALPAKA_ASSERT_OFFLOAD(tupleMultiplicity);
+
+      // look in bin for this hit multiplicity: count the entries between the start of bin nHitsL and the end of bin nHitsH
+      int totTK = tupleMultiplicity->end(nHitsH) - tupleMultiplicity->begin(nHitsL);
+      ALPAKA_ASSERT_OFFLOAD(totTK <= int(tupleMultiplicity->size()));
+      ALPAKA_ASSERT_OFFLOAD(totTK >= 0);
+
+#ifdef BROKENLINE_DEBUG
+      const uint32_t threadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+      if (cms::alpakatools::once_per_grid(acc)) {
+        printf("%d total Ntuple\n", tupleMultiplicity->size());
+        printf("%d Ntuple of size %d/%d for %d hits to fit\n", totTK, nHitsL, nHitsH, hitsInFit);
+      }
+#endif
+      const auto nt = riemannFit::maxNumberOfConcurrentFits;
+      for (auto local_idx : cms::alpakatools::elements_with_stride(acc, nt)) {
+        auto tuple_idx = local_idx + offset;  // position of this tuple among the selected ones
+        if ((int)tuple_idx >= totTK) {
+          ptkids[local_idx] = invalidTkId;  // sentinel: no tuple for this slot (Kernel_BLFit stops when it reads it)
+          break;
+        }
+        // get it from the ntuple container (one to one to helix)
+        auto tkid = *(tupleMultiplicity->begin(nHitsL) + tuple_idx);
+        ALPAKA_ASSERT_OFFLOAD(static_cast<int>(tkid) < foundNtuplets->nOnes());
+
+        ptkids[local_idx] = tkid;
+
+        auto nHits = foundNtuplets->size(tkid);
+
+        ALPAKA_ASSERT_OFFLOAD(nHits >= nHitsL);
+        ALPAKA_ASSERT_OFFLOAD(nHits <= nHitsH);
+
+        riemannFit::Map3xNd<N> hits(phits + local_idx);  // maps over the scratch buffers, starting at element local_idx
+        riemannFit::Map4d fast_fit(pfast_fit + local_idx);
+        riemannFit::Map6xNf<N> hits_ge(phits_ge + local_idx);
+
+#ifdef BL_DUMP_HITS
+        auto &&done = alpaka::declareSharedVar<int, __COUNTER__>(acc);
+        done = 0;
+        alpaka::syncBlockThreads(acc);
+        bool dump =
+            (foundNtuplets->size(tkid) == 5 && 0 == alpaka::atomicAdd(acc, &done, 1, alpaka::hierarchy::Blocks{}));
+#endif
+
+        // Prepare data structure: hitId points to the hit indices of this tuple
+        auto const *hitId = foundNtuplets->begin(tkid);
+
+        // #define YERR_FROM_DC
+#ifdef YERR_FROM_DC
+        // try to compute more precise error in y
+        auto dx = hh[hitId[hitsInFit - 1]].xGlobal() - hh[hitId[0]].xGlobal();
+        auto dy = hh[hitId[hitsInFit - 1]].yGlobal() - hh[hitId[0]].yGlobal();
+        auto dz = hh[hitId[hitsInFit - 1]].zGlobal() - hh[hitId[0]].zGlobal();
+        float ux, uy, uz;
+#endif
+
+        float incr = std::max(1.f, float(nHits) / float(hitsInFit));  // step that spreads the hitsInFit picks over the nHits available
+        float n = 0;
+        for (uint32_t i = 0; i < hitsInFit; ++i) {
+          int j = int(n + 0.5f);  // round
+          if (hitsInFit - 1 == i)
+            j = nHits - 1;  // force last hit to ensure max lever arm.
+          ALPAKA_ASSERT_OFFLOAD(j < int(nHits));
+          n += incr;
+          auto hit = hitId[j];
+          float ge[6];
+
+#ifdef YERR_FROM_DC
+          auto const &dp = cpeParams->detParams(hh.detectorIndex(hit));
+          auto status = hh[hit].chargeAndStatus().status;
+          int qbin = CPEFastParametrisation::kGenErrorQBins - 1 - status.qBin;
+          ALPAKA_ASSERT_OFFLOAD(qbin >= 0 && qbin < 5);
+          bool nok = (status.isBigY | status.isOneY);
+          // compute cotanbeta and use it to recompute error
+          dp.frame.rotation().multiply(dx, dy, dz, ux, uy, uz);
+          auto cb = std::abs(uy / uz);
+          int bin =
+              int(cb * (float(phase1PixelTopology::pixelThickess) / float(phase1PixelTopology::pixelPitchY)) * 8.f) - 4;
+          int low_value = 0;
+          int high_value = CPEFastParametrisation::kNumErrorBins - 1;
+          // return estimated bin value truncated to [0, 15]
+          bin = std::clamp(bin, low_value, high_value);
+          float yerr = dp.sigmay[bin] * 1.e-4f;  // toCM
+          yerr *= dp.yfact[qbin];                // inflate
+          yerr *= yerr;
+          yerr += dp.apeYY;
+          yerr = nok ? hh[hit].yerrLocal() : yerr;
+          dp.frame.toGlobal(hh[hit].xerrLocal(), 0, yerr, ge);
+#else
+          cpeParams->detParams(hh[hit].detectorIndex()).frame.toGlobal(hh[hit].xerrLocal(), 0, hh[hit].yerrLocal(), ge);
+#endif
+
+#ifdef BL_DUMP_HITS
+          bool dump = foundNtuplets->size(tkid) == 5;
+          if (dump) {
+            printf("Track id %d %d Hit %d on %d\nGlobal: hits.col(%d) << %f,%f,%f\n",
+                   local_idx,
+                   tkid,
+                   hit,
+                   hh[hit].detectorIndex(),
+                   i,
+                   hh[hit].xGlobal(),
+                   hh[hit].yGlobal(),
+                   hh[hit].zGlobal());
+            printf("Error: hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", i, ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]);
+          }
+#endif
+
+          hits.col(i) << hh[hit].xGlobal(), hh[hit].yGlobal(), hh[hit].zGlobal();
+          hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5];
+        }
+        brokenline::fastFit(acc, hits, fast_fit);
+
+        // no NaN here: x == x is false only when x is NaN
+        ALPAKA_ASSERT_OFFLOAD(fast_fit(0) == fast_fit(0));
+        ALPAKA_ASSERT_OFFLOAD(fast_fit(1) == fast_fit(1));
+        ALPAKA_ASSERT_OFFLOAD(fast_fit(2) == fast_fit(2));
+        ALPAKA_ASSERT_OFFLOAD(fast_fit(3) == fast_fit(3));
+      }
+    }
+  };
+
+  template <int N, typename TrackerTraits>
+  struct Kernel_BLFit {  // kernel functor: runs the broken-line line and circle fits and writes the track parameters to results_view
+  public:
+    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+    ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                  TupleMultiplicity<TrackerTraits> const *__restrict__ tupleMultiplicity,
+                                  double bField,
+                                  OutputSoAView<TrackerTraits> results_view,
+                                  typename TrackerTraits::tindex_type const *__restrict__ ptkids,  // in: track id of each fit slot
+                                  double *__restrict__ phits,                                      // in: hit positions per slot
+                                  float *__restrict__ phits_ge,                                    // in: hit errors per slot
+                                  double *__restrict__ pfast_fit) const {                          // in: fast-fit result per slot
+      ALPAKA_ASSERT_OFFLOAD(results_view.pt());
+      ALPAKA_ASSERT_OFFLOAD(results_view.eta());
+      ALPAKA_ASSERT_OFFLOAD(results_view.chi2());
+      ALPAKA_ASSERT_OFFLOAD(pfast_fit);
+      constexpr auto invalidTkId = std::numeric_limits<typename TrackerTraits::tindex_type>::max();
+
+      // loop over the fit slots with the same stride as Kernel_BLFastFit,
+      // stopping at the first slot whose track id is invalidTkId
+      const auto nt = riemannFit::maxNumberOfConcurrentFits;
+      for (auto local_idx : cms::alpakatools::elements_with_stride(acc, nt)) {
+        if (invalidTkId == ptkids[local_idx])
+          break;
+        auto tkid = ptkids[local_idx];
+
+        ALPAKA_ASSERT_OFFLOAD(tkid < TrackerTraits::maxNumberOfTuples);
+
+        riemannFit::Map3xNd<N> hits(phits + local_idx);  // maps over the scratch buffers, starting at element local_idx
+        riemannFit::Map4d fast_fit(pfast_fit + local_idx);
+        riemannFit::Map6xNf<N> hits_ge(phits_ge + local_idx);
+
+        brokenline::PreparedBrokenLineData<N> data;
+
+        brokenline::karimaki_circle_fit circle;
+        riemannFit::LineFit line;
+
+        brokenline::prepareBrokenLineData(acc, hits, fast_fit, bField, data);
+        brokenline::lineFit(acc, hits_ge, fast_fit, bField, data, line);
+        brokenline::circleFit(acc, hits, hits_ge, fast_fit, bField, data, circle);
+
+        TracksUtilities<TrackerTraits>::copyFromCircle(
+            results_view, circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid);
+        results_view[tkid].pt() = float(bField) / float(std::abs(circle.par(2)));
+        results_view[tkid].eta() = alpaka::math::asinh(acc, line.par(0));
+        results_view[tkid].chi2() = (circle.chi2 + line.chi2) / (2 * N - 5);  // presumably ndof = 2 coordinates per hit - 5 track parameters; TODO confirm
+
+#ifdef BROKENLINE_DEBUG
+        if (!(circle.chi2 >= 0) || !(line.chi2 >= 0))
+          printf("kernelBLFit failed! %f/%f\n", circle.chi2, line.chi2);
+        printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n",
+               N,
+               N,
+               tkid,
+               circle.par(0),
+               circle.par(1),
+               circle.par(2));
+        printf("kernelBLHits line.par(0,1): %d %f,%f\n", tkid, line.par(0), line.par(1));
+        printf("kernelBLHits chi2 cov %f/%f  %e,%e,%e,%e,%e\n",
+               circle.chi2,
+               line.chi2,
+               circle.cov(0, 0),
+               circle.cov(1, 1),
+               circle.cov(2, 2),
+               line.cov(0, 0),
+               line.cov(1, 1));
+#endif
+      }
+    }
+  };
+
+  template <typename TrackerTraits>
+  void HelixFit<TrackerTraits>::launchBrokenLineKernels(
+      const TrackingRecHitSoAConstView<TrackerTraits> &hv,
+      pixelCPEforDevice::ParamsOnDeviceT<TrackerTraits> const *cpeParams,
+      uint32_t hitsInFit,
+      uint32_t maxNumberOfTuples,
+      Queue &queue) {  // enqueues the fast-fit and fit kernels on queue, one pair per hit multiplicity
+    ALPAKA_ASSERT_OFFLOAD(tuples_);
+
+    uint32_t blockSize = 64;
+    uint32_t numberOfBlocks = cms::alpakatools::divide_up_by(maxNumberOfConcurrentFits_, blockSize);
+    const WorkDiv1D workDivTriplets = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+    const WorkDiv1D workDivQuadsPenta = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks / 4, blockSize);  // a quarter of the blocks for fits with 4 or more hits
+
+    // Fit internals: scratch buffers sized for maxNumberOfConcurrentFits_ fits of 6 hits, shared by all the launches below
+    auto tkidDevice =
+        cms::alpakatools::make_device_buffer<typename TrackerTraits::tindex_type[]>(queue, maxNumberOfConcurrentFits_);
+    auto hitsDevice = cms::alpakatools::make_device_buffer<double[]>(
+        queue, maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double));
+    auto hits_geDevice = cms::alpakatools::make_device_buffer<float[]>(
+        queue, maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<6>) / sizeof(float));
+    auto fast_fit_resultsDevice = cms::alpakatools::make_device_buffer<double[]>(
+        queue, maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double));
+
+    for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) {  // batches of maxNumberOfConcurrentFits_ tuples
+      // fit triplets
+
+      alpaka::exec<Acc1D>(queue,
+                          workDivTriplets,
+                          Kernel_BLFastFit<3, TrackerTraits>{},
+                          tuples_,
+                          tupleMultiplicity_,
+                          hv,
+                          cpeParams,
+                          tkidDevice.data(),
+                          hitsDevice.data(),
+                          hits_geDevice.data(),
+                          fast_fit_resultsDevice.data(),
+                          3,
+                          3,
+                          offset);
+
+      alpaka::exec<Acc1D>(queue,
+                          workDivTriplets,
+                          Kernel_BLFit<3, TrackerTraits>{},
+                          tupleMultiplicity_,
+                          bField_,
+                          outputSoa_,
+                          tkidDevice.data(),
+                          hitsDevice.data(),
+                          hits_geDevice.data(),
+                          fast_fit_resultsDevice.data());
+
+      if (fitNas4_) {
+        // fit all as 4 -- NOTE(review): the rolling index i is unused and every iteration launches with nHitsL = nHitsH = 4; confirm that tuples with more hits are meant to be skipped
+        riemannFit::rolling_fits<4, TrackerTraits::maxHitsOnTrack, 1>([this,
+                                                                       &hv,
+                                                                       &cpeParams,
+                                                                       &tkidDevice,
+                                                                       &hitsDevice,
+                                                                       &hits_geDevice,
+                                                                       &fast_fit_resultsDevice,
+                                                                       &offset,
+                                                                       &queue,
+                                                                       &workDivQuadsPenta](auto i) {
+          alpaka::exec<Acc1D>(queue,
+                              workDivQuadsPenta,
+                              Kernel_BLFastFit<4, TrackerTraits>{},
+                              tuples_,
+                              tupleMultiplicity_,
+                              hv,
+                              cpeParams,
+                              tkidDevice.data(),
+                              hitsDevice.data(),
+                              hits_geDevice.data(),
+                              fast_fit_resultsDevice.data(),
+                              4,
+                              4,
+                              offset);
+
+          alpaka::exec<Acc1D>(queue,
+                              workDivQuadsPenta,
+                              Kernel_BLFit<4, TrackerTraits>{},
+                              tupleMultiplicity_,
+                              bField_,
+                              outputSoa_,
+                              tkidDevice.data(),
+                              hitsDevice.data(),
+                              hits_geDevice.data(),
+                              fast_fit_resultsDevice.data());
+        });
+
+      } else {  // fit tuples of i hits using i hits, for each i produced by rolling_fits<4, maxHitsOnTrackForFullFit, 1>
+        riemannFit::rolling_fits<4, TrackerTraits::maxHitsOnTrackForFullFit, 1>([this,
+                                                                                 &hv,
+                                                                                 &cpeParams,
+                                                                                 &tkidDevice,
+                                                                                 &hitsDevice,
+                                                                                 &hits_geDevice,
+                                                                                 &fast_fit_resultsDevice,
+                                                                                 &offset,
+                                                                                 &queue,
+                                                                                 &workDivQuadsPenta](auto i) {
+          alpaka::exec<Acc1D>(queue,
+                              workDivQuadsPenta,
+                              Kernel_BLFastFit<i, TrackerTraits>{},
+                              tuples_,
+                              tupleMultiplicity_,
+                              hv,
+                              cpeParams,
+                              tkidDevice.data(),
+                              hitsDevice.data(),
+                              hits_geDevice.data(),
+                              fast_fit_resultsDevice.data(),
+                              i,
+                              i,
+                              offset);
+
+          alpaka::exec<Acc1D>(queue,
+                              workDivQuadsPenta,
+                              Kernel_BLFit<i, TrackerTraits>{},
+                              tupleMultiplicity_,
+                              bField_,
+                              outputSoa_,
+                              tkidDevice.data(),
+                              hitsDevice.data(),
+                              hits_geDevice.data(),
+                              fast_fit_resultsDevice.data());
+        });
+
+        static_assert(TrackerTraits::maxHitsOnTrackForFullFit < TrackerTraits::maxHitsOnTrack);
+
+        // Fit all the rest (multiplicities maxHitsOnTrackForFullFit to maxHitsOnTrack - 1) using maxHitsOnTrackForFullFit hits
+        alpaka::exec<Acc1D>(queue,
+                            workDivQuadsPenta,
+                            Kernel_BLFastFit<TrackerTraits::maxHitsOnTrackForFullFit, TrackerTraits>{},
+                            tuples_,
+                            tupleMultiplicity_,
+                            hv,
+                            cpeParams,
+                            tkidDevice.data(),
+                            hitsDevice.data(),
+                            hits_geDevice.data(),
+                            fast_fit_resultsDevice.data(),
+                            TrackerTraits::maxHitsOnTrackForFullFit,
+                            TrackerTraits::maxHitsOnTrack - 1,
+                            offset);
+
+        alpaka::exec<Acc1D>(queue,
+                            workDivQuadsPenta,
+                            Kernel_BLFit<TrackerTraits::maxHitsOnTrackForFullFit, TrackerTraits>{},
+                            tupleMultiplicity_,
+                            bField_,
+                            outputSoa_,
+                            tkidDevice.data(),
+                            hitsDevice.data(),
+                            hits_geDevice.data(),
+                            fast_fit_resultsDevice.data());
+      }
+
+    }  // loop on concurrent fits
+  }
+
+  template class HelixFit<pixelTopology::Phase1>;  // explicit instantiations for the three pixel topologies
+  template class HelixFit<pixelTopology::Phase2>;
+  template class HelixFit<pixelTopology::HIonPhase1>;
+
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CACell.h b/RecoTracker/PixelSeeding/plugins/alpaka/CACell.h
new file mode 100644
index 0000000000000..d0142f78415ae
--- /dev/null
+++ b/RecoTracker/PixelSeeding/plugins/alpaka/CACell.h
@@ -0,0 +1,391 @@
+#ifndef RecoPixelVertexing_PixelTriplets_CACellT_h
+#define RecoPixelVertexing_PixelTriplets_CACellT_h
+
+//
+// Author: Felice Pantaleo, CERN
+//
+
+// #define ONLY_TRIPLETS_IN_HOLE
+
+#include <alpaka/alpaka.hpp>
+
+#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/VecArray.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h"
+#include "RecoTracker/PixelSeeding/interface/CircleEq.h"
+#include "DataFormats/TrackSoA/interface/TrackDefinitions.h"
+#include "DataFormats/TrackSoA/interface/TracksSoA.h"
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+#include "CAStructures.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  template <typename TrackerTraits>
+  class CACellT {
+  public:
+    using PtrAsInt = unsigned long long;
+
+    static constexpr auto maxCellsPerHit = TrackerTraits::maxCellsPerHit;
+    using OuterHitOfCellContainer = caStructures::OuterHitOfCellContainerT<TrackerTraits>;
+    using OuterHitOfCell = caStructures::OuterHitOfCellT<TrackerTraits>;
+    using CellNeighbors = caStructures::CellNeighborsT<TrackerTraits>;
+    using CellTracks = caStructures::CellTracksT<TrackerTraits>;
+    using CellNeighborsVector = caStructures::CellNeighborsVectorT<TrackerTraits>;
+    using CellTracksVector = caStructures::CellTracksVectorT<TrackerTraits>;
+
+    using HitsConstView = TrackingRecHitSoAConstView<TrackerTraits>;
+    using hindex_type = typename TrackerTraits::hindex_type;
+    using tindex_type = typename TrackerTraits::tindex_type;
+    static constexpr auto invalidHitId = std::numeric_limits<hindex_type>::max();
+
+    using TmpTuple = cms::alpakatools::VecArray<uint32_t, TrackerTraits::maxDepth>;
+
+    using HitContainer = typename reco::TrackSoA<TrackerTraits>::HitContainer;
+    using Quality = ::pixelTrack::Quality;
+    static constexpr auto bad = ::pixelTrack::Quality::bad;
+
+    enum class StatusBit : uint16_t { kUsed = 1, kInTrack = 2, kKilled = 1 << 15 };
+
+    CACellT() = default;
+
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void init(CellNeighborsVector& cellNeighbors,
+                                             CellTracksVector& cellTracks,
+                                             const HitsConstView& hh,
+                                             int layerPairId,
+                                             hindex_type innerHitId,
+                                             hindex_type outerHitId) {
+      theInnerHitId = innerHitId;
+      theOuterHitId = outerHitId;
+      theLayerPairId_ = layerPairId;  // stored in the int16_t member
+      theStatus_ = 0;                 // no StatusBit set
+      theFishboneId = invalidHitId;   // hasFishbone() is false until setFishbone() stores an id
+
+      // cache z and r of the inner hit in the cell: optimization that depends on access pattern
+      theInnerZ = hh[innerHitId].zGlobal();
+      theInnerR = hh[innerHitId].rGlobal();
+
+      // link to the default containers at index 0, which are expected to be empty
+      theOuterNeighbors = &cellNeighbors[0];
+      theTracks = &cellTracks[0];
+      ALPAKA_ASSERT_OFFLOAD(outerNeighbors().empty());  // same assert macro as find_ntuplets
+      ALPAKA_ASSERT_OFFLOAD(tracks().empty());
+    }
+
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) int addOuterNeighbor(
+        const TAcc& acc, typename TrackerTraits::cindex_type t, CellNeighborsVector& cellNeighbors) {
+      // use smart cache: while this cell still sees an empty neighbour list, try to claim a slot of its own
+      if (outerNeighbors().empty()) {
+        auto i = cellNeighbors.extend(acc);  // maybe wasted.... (see the CAS below)
+        if (i > 0) {
+          cellNeighbors[i].reset();
+          alpaka::mem_fence(acc, alpaka::memory_scope::Grid{});  // fence between reset() and the pointer update
+#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+          theOuterNeighbors = &cellNeighbors[i];  // plain store when the serial CPU backend is enabled
+#else
+          auto zero = (PtrAsInt)(&cellNeighbors[0]);  // address of the default container linked by init()
+          alpaka::atomicCas(acc,
+                            (PtrAsInt*)(&theOuterNeighbors),
+                            zero,
+                            (PtrAsInt)(&cellNeighbors[i]),
+                            alpaka::hierarchy::Blocks{});  // if fails we cannot give "i" back...
+#endif
+        } else
+          return -1;  // extend() gave no usable slot (i <= 0): report failure
+      }
+      alpaka::mem_fence(acc, alpaka::memory_scope::Grid{});
+      return outerNeighbors().push_back(acc, t);  // the result of push_back is passed through to the caller
+    }
+
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) int addTrack(TAcc const& acc,
+                                                                               tindex_type t,
+                                                                               CellTracksVector& cellTracks) {
+      if (tracks().empty()) {  // still seeing an empty track list: try to claim a slot for this cell
+        auto i = cellTracks.extend(acc);  // maybe wasted.... (see the CAS below)
+        if (i > 0) {
+          cellTracks[i].reset();
+          alpaka::mem_fence(acc, alpaka::memory_scope::Grid{});  // fence between reset() and the pointer update
+#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED
+          theTracks = &cellTracks[i];  // plain store when the serial CPU backend is enabled
+#else
+          auto zero = (PtrAsInt)(&cellTracks[0]);  // address of the default container linked by init()
+          alpaka::atomicCas(acc,
+                            (PtrAsInt*)(&theTracks),
+                            zero,
+                            (PtrAsInt)(&cellTracks[i]),
+                            alpaka::hierarchy::Blocks{});  // if fails we cannot give "i" back...
+          // (theTracks is replaced only if it still holds the address of cellTracks[0])
+#endif
+        } else
+          return -1;  // extend() gave no usable slot (i <= 0): report failure
+      }
+      alpaka::mem_fence(acc, alpaka::memory_scope::Grid{});
+      return tracks().push_back(acc, t);  // the result of push_back is passed through to the caller
+    }
+
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE CellTracks& tracks() { return *theTracks; }
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE CellTracks const& tracks() const { return *theTracks; }
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE CellNeighbors& outerNeighbors() { return *theOuterNeighbors; }
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; }
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE float inner_x(const HitsConstView& hh) const { return hh[theInnerHitId].xGlobal(); }
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE float outer_x(const HitsConstView& hh) const { return hh[theOuterHitId].xGlobal(); }
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE float inner_y(const HitsConstView& hh) const { return hh[theInnerHitId].yGlobal(); }
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE float outer_y(const HitsConstView& hh) const { return hh[theOuterHitId].yGlobal(); }
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE float inner_z(const HitsConstView& hh) const { return theInnerZ; }  // cached
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE float outer_z(const HitsConstView& hh) const { return hh[theOuterHitId].zGlobal(); }
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE float inner_r(const HitsConstView& hh) const { return theInnerR; }  // cached
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE float outer_r(const HitsConstView& hh) const { return hh[theOuterHitId].rGlobal(); }
+    // iphi() of the inner/outer hit, read from the hit view (hole0/hole4 treat it as a signed value)
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE auto inner_iphi(const HitsConstView& hh) const { return hh[theInnerHitId].iphi(); }
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE auto outer_iphi(const HitsConstView& hh) const { return hh[theOuterHitId].iphi(); }
+    // NOTE(review): detectorIndex() is returned as float here, while CAFishbone stores it in a uint16_t - confirm
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE float inner_detIndex(const HitsConstView& hh) const {
+      return hh[theInnerHitId].detectorIndex();
+    }
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE float outer_detIndex(const HitsConstView& hh) const {
+      return hh[theOuterHitId].detectorIndex();
+    }
+    // ids of the two hits of the doublet, returned as unsigned int
+    constexpr unsigned int inner_hit_id() const { return theInnerHitId; }
+    constexpr unsigned int outer_hit_id() const { return theOuterHitId; }
+
+    ALPAKA_FN_ACC void print_cell() const {
+      printf("printing cell: on layerPair: %d, innerHitId: %u, outerHitId: %u \n",
+             theLayerPairId_,  // int16_t, promoted to int for %d
+             inner_hit_id(),   // the accessors return unsigned int, which is what %u expects
+             outer_hit_id());
+    }
+
+    ALPAKA_FN_ACC bool check_alignment(const HitsConstView& hh,
+                                       CACellT const& otherCell,
+                                       const float ptmin,
+                                       const float hardCurvCut,
+                                       const float caThetaCutBarrel,
+                                       const float caThetaCutForward,
+                                       const float dcaCutInnerTriplet,
+                                       const float dcaCutOuterTriplet) const {
+      // Decide whether the inner hit of otherCell plus the two hits of this cell pass the triplet cuts:
+      // first the r-z alignment cut, then the transverse dca/curvature cut.
+      // detIndex of the layerStart for the Phase1 Pixel Detector:
+      // [BPX1, BPX2, BPX3, BPX4,  FP1,  FP2,  FP3,  FN1,  FN2,  FN3, LAST_VALID]
+      // [   0,   96,  320,  672, 1184, 1296, 1408, 1520, 1632, 1744,       1856]
+      auto const firstR = otherCell.inner_r(hh);
+      auto const firstZ = otherCell.inner_z(hh);
+      auto const midR = inner_r(hh);
+      auto const midZ = inner_z(hh);
+      auto const lastR = outer_r(hh);
+      auto const lastZ = outer_z(hh);
+      // TODO tune CA cuts below (theta and dca)
+      auto const outerInBarrel = otherCell.outer_detIndex(hh) < TrackerTraits::last_barrel_detIndex;
+      auto const thetaCut = outerInBarrel ? caThetaCutBarrel : caThetaCutForward;
+      if (not areAlignedRZ(firstR, firstZ, midR, midZ, lastR, lastZ, ptmin, thetaCut))
+        return false;
+      // inner- or outer-triplet dca cut, chosen from the detector index of otherCell's inner hit
+      auto const startsOnBpix1 = otherCell.inner_detIndex(hh) < TrackerTraits::last_bpix1_detIndex;
+      return dcaCut(hh, otherCell, startsOnBpix1 ? dcaCutInnerTriplet : dcaCutOuterTriplet, hardCurvCut);
+    }
+
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) static bool areAlignedRZ(
+        float r1, float z1, float ri, float zi, float ro, float zo, const float ptmin, const float thetaCut) {
+      float radius_diff = std::abs(r1 - ro);
+      float distance_13_squared = radius_diff * radius_diff + (z1 - zo) * (z1 - zo);
+      // r-z alignment test for the points (r1, z1), (ri, zi), (ro, zo), written without divisions
+      float pMin = ptmin * std::sqrt(distance_13_squared);  // this needs to be divided by
+                                                            // radius_diff later
+      // twice the area of the r-z triangle; std::abs keeps the float overload, like the rest of the class
+      float tan_12_13_half_mul_distance_13_squared = std::abs(z1 * (ri - ro) + zi * (ro - r1) + zo * (r1 - ri));
+      return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff;
+    }
+
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool dcaCut(const HitsConstView& hh,
+                                               CACellT const& otherCell,
+                                               const float region_origin_radius_plus_tolerance,
+                                               const float maxCurv) const {
+      // Transverse-plane cut on the circle through three hits: the inner hit of otherCell
+      // followed by the inner and outer hits of this cell.
+      // The triplet is rejected when the (signed) curvature of that circle exceeds maxCurv;
+      // otherwise it is accepted if |dca0| < region_origin_radius_plus_tolerance * |curvature|.
+      // The arithmetic is the same as in dcaCutH, so the coordinates are simply forwarded to it.
+      auto const firstX = otherCell.inner_x(hh);
+      auto const firstY = otherCell.inner_y(hh);
+
+      auto const midX = inner_x(hh);
+      auto const midY = inner_y(hh);
+
+      auto const lastX = outer_x(hh);
+      auto const lastY = outer_y(hh);
+
+      return dcaCutH(firstX, firstY, midX, midY, lastX, lastY, region_origin_radius_plus_tolerance, maxCurv);
+    }
+
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) static bool dcaCutH(
+        float x1,
+        float y1,
+        float x2,
+        float y2,
+        float x3,
+        float y3,
+        const float region_origin_radius_plus_tolerance,
+        const float maxCurv) {
+      // Cut on the circle through the transverse points (x1, y1), (x2, y2), (x3, y3).
+      CircleEq<float> circle(x1, y1, x2, y2, x3, y3);
+      // reject circles whose (signed) curvature exceeds maxCurv ...
+      bool const tooCurved = circle.curvature() > maxCurv;
+      // ... and otherwise require |dca0| to stay below the tolerance scaled by |curvature|
+      return not tooCurved && std::abs(circle.dca0()) < region_origin_radius_plus_tolerance * std::abs(circle.curvature());
+    }
+
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool hole0(const HitsConstView& hh, CACellT const& innerCell) const {
+      using namespace phase1PixelTopology;
+      // true if the r-z line from innerCell's inner hit to this cell's outer hit meets the ladder near a module edge
+      int p = innerCell.inner_iphi(hh);  // iphi of innerCell's inner hit
+      if (p < 0)                         // move negative values to the positive range
+        p += std::numeric_limits<unsigned short>::max();  // NOTE(review): adds 65535, not 65536 - confirm
+      p = (max_ladder_bpx0 * p) / std::numeric_limits<unsigned short>::max();  // scale to a ladder number
+      p %= max_ladder_bpx0;
+      auto il = first_ladder_bpx0 + p;  // index into the averageGeometry() ladder arrays
+      auto r0 = hh.averageGeometry().ladderR[il];
+      auto ri = innerCell.inner_r(hh);
+      auto zi = innerCell.inner_z(hh);
+      auto ro = outer_r(hh);
+      auto zo = outer_z(hh);
+      auto z0 = zi + (r0 - ri) * (zo - zi) / (ro - ri);  // linear extrapolation in r-z to r0; no guard for ro == ri
+      auto z_in_ladder = std::abs(z0 - hh.averageGeometry().ladderZ[il]);
+      auto z_in_module = z_in_ladder - module_length_bpx0 * int(z_in_ladder / module_length_bpx0);  // remainder
+      auto gap = z_in_module < module_tolerance_bpx0 || z_in_module > (module_length_bpx0 - module_tolerance_bpx0);
+      return gap;
+    }
+
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool hole4(const HitsConstView& hh, CACellT const& innerCell) const {
+      using namespace phase1PixelTopology;
+      // true if the r-z line through innerCell's inner hit and this cell's outer hit meets the ladder in a gap or hole
+      int p = outer_iphi(hh);  // iphi of this cell's outer hit
+      if (p < 0)               // move negative values to the positive range
+        p += std::numeric_limits<unsigned short>::max();  // NOTE(review): adds 65535, not 65536 - confirm
+      p = (max_ladder_bpx4 * p) / std::numeric_limits<unsigned short>::max();  // scale to a ladder number
+      p %= max_ladder_bpx4;
+      auto il = first_ladder_bpx4 + p;  // index into the averageGeometry() ladder arrays
+      auto r4 = hh.averageGeometry().ladderR[il];
+      auto ri = innerCell.inner_r(hh);
+      auto zi = innerCell.inner_z(hh);
+      auto ro = outer_r(hh);
+      auto zo = outer_z(hh);
+      auto z4 = zo + (r4 - ro) * (zo - zi) / (ro - ri);  // linear extrapolation in r-z to r4; no guard for ro == ri
+      auto z_in_ladder = std::abs(z4 - hh.averageGeometry().ladderZ[il]);
+      auto z_in_module = z_in_ladder - module_length_bpx4 * int(z_in_ladder / module_length_bpx4);  // remainder
+      auto gap = z_in_module < module_tolerance_bpx4 || z_in_module > (module_length_bpx4 - module_tolerance_bpx4);
+      auto holeP = z4 > hh.averageGeometry().ladderMaxZ[il] && z4 < hh.averageGeometry().endCapZ[0];
+      auto holeN = z4 < hh.averageGeometry().ladderMinZ[il] && z4 > hh.averageGeometry().endCapZ[1];
+      return gap || holeP || holeN;  // holeP/holeN: z4 between the ladder end and endCapZ[0]/endCapZ[1]
+    }
+
+    // Depth-first visit of the cell graph through the outer-neighbour links (no hardcoded layers):
+    // every chain that cannot be extended further is stored as an ntuplet if it is long enough.
+    template <int DEPTH, typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void find_ntuplets(TAcc const& acc,
+                                                      const HitsConstView& hh,
+                                                      CACellT* __restrict__ cells,
+                                                      CellTracksVector& cellTracks,
+                                                      HitContainer& foundNtuplets,
+                                                      cms::alpakatools::AtomicPairCounter& apc,
+                                                      Quality* __restrict__ quality,
+                                                      TmpTuple& tmpNtuplet,
+                                                      const unsigned int minHitsPerNtuplet,
+                                                      bool startAt0) const {
+      // DEPTH bounds the recursion at compile time; reaching DEPTH <= 0 is reported as an error.
+      // The building process for a track ends if:
+      //  - it has no right neighbor
+      //  - it has no compatible neighbor
+      // The ntuplet is then saved if it has at least minHitsPerNtuplet - 1 cells; a negative
+      // return value from bulkFill (overflow) means that nothing is recorded for that candidate.
+      if constexpr (DEPTH <= 0) {
+        printf("ERROR: CACellT::find_ntuplets reached full depth!\n");
+        ALPAKA_ASSERT_OFFLOAD(false);
+      } else {
+        auto doubletId = this - cells;  // index of this cell inside cells[]
+        tmpNtuplet.push_back_unsafe(doubletId);
+        ALPAKA_ASSERT_OFFLOAD(tmpNtuplet.size() <= int(TrackerTraits::maxHitsOnTrack - 3));
+        // recurse into every outer neighbour that is still alive; "last" stays true if there is none
+        bool last = true;
+        for (unsigned int otherCell : outerNeighbors()) {
+          if (cells[otherCell].isKilled())
+            continue;  // killed by earlyFishbone
+          last = false;
+          cells[otherCell].template find_ntuplets<DEPTH - 1>(
+              acc, hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0);
+        }
+        if (last) {  // if long enough save...
+          if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) {
+#ifdef ONLY_TRIPLETS_IN_HOLE
+            // triplets accepted only pointing to the hole
+            if (tmpNtuplet.size() >= 3 || (startAt0 && hole4(hh, cells[tmpNtuplet[0]])) ||
+                ((!startAt0) && hole0(hh, cells[tmpNtuplet[0]])))
+#endif
+            {
+              hindex_type hits[TrackerTraits::maxDepth + 2];
+              auto nh = 0U;
+              constexpr int maxFB = 2;  // for the time being let's limit this
+              int nfb = 0;
+              for (auto c : tmpNtuplet) {
+                hits[nh++] = cells[c].theInnerHitId;
+                if (nfb < maxFB && cells[c].hasFishbone()) {
+                  ++nfb;
+                  hits[nh++] = cells[c].theFishboneId;  // Fishbone hit is always outer than inner hit
+                }
+              }
+              // NOTE(review): checked against maxHitsOnTrack, while hits[] holds maxDepth + 2 entries
+              ALPAKA_ASSERT_OFFLOAD(nh < TrackerTraits::maxHitsOnTrack);
+              hits[nh] = theOuterHitId;  // the outer hit of the last cell closes the ntuplet
+              auto it = foundNtuplets.bulkFill(acc, apc, hits, nh + 1);
+              if (it >= 0) {  // if negative is overflow....
+                for (auto c : tmpNtuplet)
+                  cells[c].addTrack(acc, it, cellTracks);
+                quality[it] = bad;  // initialize to bad
+              }
+            }
+          }
+        }
+        tmpNtuplet.pop_back();  // undo the push above before returning to the caller
+        ALPAKA_ASSERT_OFFLOAD(tmpNtuplet.size() < int(TrackerTraits::maxHitsOnTrack - 1));
+      }
+    }
+
+    // Cell status management: theStatus_ is a bit mask of StatusBit values, updated with a plain (non-atomic) |=
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void kill() { theStatus_ |= uint16_t(StatusBit::kKilled); }
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool isKilled() const { return theStatus_ & uint16_t(StatusBit::kKilled); }
+    // layer pair id stored by init()
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE int16_t layerPairId() const { return theLayerPairId_; }
+    // a cell counts as "unused" as long as the kUsed bit is not set in theStatus_
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool unused() const { return 0 == (uint16_t(StatusBit::kUsed) & theStatus_); }
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void setStatusBits(StatusBit mask) { theStatus_ |= uint16_t(mask); }
+
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void setFishbone(TAcc const& acc, hindex_type id, float z, const HitsConstView& hh) {
+      // make it deterministic: keep as fishbone the candidate hit whose z is farther from theInnerZ
+      auto old = theFishboneId;  // current value, used as the expected value of the CAS
+      while (old !=
+             alpaka::atomicCas(
+                 acc,
+                 &theFishboneId,
+                 old,
+                 (invalidHitId == old || std::abs(z - theInnerZ) > std::abs(hh[old].zGlobal() - theInnerZ)) ? id : old,
+                 alpaka::hierarchy::Blocks{}))
+        old = theFishboneId;  // the stored id was not the expected one: re-read it and try again
+    }
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE auto fishboneId() const { return theFishboneId; }  // invalidHitId if none is set
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE bool hasFishbone() const { return theFishboneId != invalidHitId; }
+
+  private:
+    CellNeighbors* theOuterNeighbors;
+    CellTracks* theTracks;
+
+    int16_t theLayerPairId_;
+    uint16_t theStatus_;  // tbd
+
+    float theInnerZ;
+    float theInnerR;
+    hindex_type theInnerHitId;
+    hindex_type theOuterHitId;
+    hindex_type theFishboneId;
+  };
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+#endif  // RecoPixelVertexing_PixelTriplets_CACellT_h
diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h
new file mode 100644
index 0000000000000..343e0cf9ad005
--- /dev/null
+++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h
@@ -0,0 +1,148 @@
+#ifndef RecoPixelVertexing_PixelTriplets_alpaka_CAFishbone_h
+#define RecoPixelVertexing_PixelTriplets_alpaka_CAFishbone_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <limits>
+
+#include <alpaka/alpaka.hpp>
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/VecArray.h"
+#include "DataFormats/Math/interface/approx_atan2.h"
+
+#include "CACell.h"
+#include "CAStructures.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace caPixelDoublets {
+
+    template <typename TrackerTraits>
+    using CellNeighbors = caStructures::CellNeighborsT<TrackerTraits>;
+    template <typename TrackerTraits>
+    using CellTracks = caStructures::CellTracksT<TrackerTraits>;
+    template <typename TrackerTraits>
+    using CellNeighborsVector = caStructures::CellNeighborsVectorT<TrackerTraits>;
+    template <typename TrackerTraits>
+    using CellTracksVector = caStructures::CellTracksVectorT<TrackerTraits>;
+    template <typename TrackerTraits>
+    using OuterHitOfCell = caStructures::OuterHitOfCellT<TrackerTraits>;
+    template <typename TrackerTraits>
+    using HitsConstView = typename CACellT<TrackerTraits>::HitsConstView;
+
+    template <typename TrackerTraits>
+    class CAFishbone {  // kernel functor: kills one doublet out of every aligned pair found for the same hit
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const& acc,
+                                    HitsConstView<TrackerTraits> hh,
+                                    CACellT<TrackerTraits>* cells,
+                                    uint32_t const* __restrict__ nCells,  // NOTE(review): not read in this kernel
+                                    OuterHitOfCell<TrackerTraits> const* isOuterHitOfCellWrap,
+                                    int32_t nHits,
+                                    bool checkTrack) const {
+        if (nHits <= isOuterHitOfCellWrap->offset)
+          return;
+        constexpr auto maxCellsPerHit = CACellT<TrackerTraits>::maxCellsPerHit;
+        // isOuterHitOfCell[h] holds the ids (indices into cells) of the cells associated with hit h
+        auto const isOuterHitOfCell = isOuterHitOfCellWrap->container;
+        // dimension Y runs over the hits, dimension X (inside a block) over the cells of one hit
+        // x runs faster...
+        // local scratch, one entry per selected cell: inner hit relative to the outer hit and its squared norm
+        float x[maxCellsPerHit], y[maxCellsPerHit], z[maxCellsPerHit], n[maxCellsPerHit];
+        uint16_t d[maxCellsPerHit];  // detector index of the inner hit
+        uint32_t cc[maxCellsPerHit];  // index of the cell in cells[]
+        uint8_t l[maxCellsPerHit];  // layer pair id of the cell
+        const uint32_t dimIndexY = 0u;
+        const uint32_t dimIndexX = 1u;
+        const uint32_t blockDimensionX(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[dimIndexX]);
+        const auto& [firstElementIdxNoStrideX, endElementIdxNoStrideX] =
+            cms::alpakatools::element_index_range_in_block(acc, 0u, dimIndexX);
+
+        // Outermost loop on Y
+        const uint32_t gridDimensionY(alpaka::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[dimIndexY]);
+        const auto& [firstElementIdxNoStrideY, endElementIdxNoStrideY] =
+            cms::alpakatools::element_index_range_in_grid(acc, 0u, dimIndexY);
+        uint32_t firstElementIdxY = firstElementIdxNoStrideY;
+        uint32_t endElementIdxY = endElementIdxNoStrideY;
+        // the helper below may move idy (and the range bounds) forward; presumably it implements the stride
+        for (uint32_t idy = firstElementIdxY, nt = nHits; idy < nt; ++idy) {
+          if (not cms::alpakatools::next_valid_element_index_strided(
+                  idy, firstElementIdxY, endElementIdxY, gridDimensionY, nt))
+            break;
+          // all the cells listed for this hit; fewer than two means there is no pair to compare
+          auto const& vc = isOuterHitOfCell[idy];
+          auto s = vc.size();
+          if (s < 2)
+            continue;
+          // gather the candidate cells; the outer hit of the first listed cell is used as the origin
+          auto const& c0 = cells[vc[0]];
+          auto xo = c0.outer_x(hh);
+          auto yo = c0.outer_y(hh);
+          auto zo = c0.outer_z(hh);
+          auto sg = 0;  // number of candidates selected so far
+          for (int32_t ic = 0; ic < s; ++ic) {
+            auto& ci = cells[vc[ic]];
+            if (ci.unused())
+              continue;  // for triplets equivalent to next
+            if (checkTrack && ci.tracks().empty())
+              continue;  // with checkTrack only cells that already belong to a track are considered
+            cc[sg] = vc[ic];
+            d[sg] = ci.inner_detIndex(hh);
+            l[sg] = ci.layerPairId();
+            x[sg] = ci.inner_x(hh) - xo;
+            y[sg] = ci.inner_y(hh) - yo;
+            z[sg] = ci.inner_z(hh) - zo;
+            n[sg] = x[sg] * x[sg] + y[sg] * y[sg] + z[sg] * z[sg];
+            ++sg;
+          }
+          if (sg < 2)
+            continue;
+          // here we parallelize in X
+          uint32_t firstElementIdxX = firstElementIdxNoStrideX;
+          uint32_t endElementIdxX = endElementIdxNoStrideX;
+          for (uint32_t ic = firstElementIdxX; (int)ic < sg - 1; ++ic) {
+            if (not cms::alpakatools::next_valid_element_index_strided(
+                    ic, firstElementIdxX, endElementIdxX, blockDimensionX, sg - 1))
+              break;
+            // compare candidate ic with every following candidate jc
+            auto& ci = cells[cc[ic]];
+            for (auto jc = ic + 1; (int)jc < sg; ++jc) {
+              auto& cj = cells[cc[jc]];
+              // must be different detectors (in the same layer)
+              //        if (d[ic]==d[jc]) continue;
+              // || l[ic]!=l[jc]) continue;
+              auto cos12 = x[ic] * x[jc] + y[ic] * y[jc] + z[ic] * z[jc];
+              // cos12 is the dot product: the test below requires cos^2 of the angle to be >= 0.99999
+              if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * (n[ic] * n[jc])) {
+                // aligned: kill farthest (prefer consecutive layers)
+                // if same layer prefer farthest (longer lever arm) and make space for intermediate hit
+                bool sameLayer = l[ic] == l[jc];
+                if (n[ic] > n[jc]) {
+                  if (sameLayer) {
+                    cj.kill();  // closest
+                    ci.setFishbone(acc, cj.inner_hit_id(), cj.inner_z(hh), hh);
+                  } else {
+                    ci.kill();  // farthest
+                    // break;  // removed to improve reproducibility. keep it for reference and tests
+                  }
+                } else {
+                  if (!sameLayer) {
+                    cj.kill();  // farthest
+                  } else {
+                    ci.kill();  // closest
+                    cj.setFishbone(acc, ci.inner_hit_id(), ci.inner_z(hh), hh);
+                    // break;  // removed to improve reproducibility. keep it for reference and tests
+                  }
+                }
+              }
+            }  //cj
+          }    // ci
+        }      // hits
+      }
+    };
+  }  // namespace caPixelDoublets
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+#endif  // RecoPixelVertexing_PixelTriplets_alpaka_CAFishbone_h
diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtuplet.cc b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtuplet.cc
new file mode 100644
index 0000000000000..c16aed2e0b1e8
--- /dev/null
+++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtuplet.cc
@@ -0,0 +1,95 @@
+#include <alpaka/alpaka.hpp>
+
+#include "DataFormats/TrackSoA/interface/TracksHost.h"
+#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h"
+#include "DataFormats/TrackSoA/interface/TracksDevice.h"
+#include "DataFormats/TrackingRecHitSoA/interface/alpaka/TrackingRecHitsSoACollection.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/ESGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/RunningAverage.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EDGetToken.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EDPutToken.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/Event.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EventSetup.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/stream/EDProducer.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
+#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"
+#include "RecoLocalTracker/Records/interface/PixelCPEFastParamsRecord.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/alpaka/PixelCPEFastParamsCollection.h"
+
+#include "CAHitNtupletGenerator.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  template <typename TrackerTraits>
+  class CAHitNtupletAlpaka : public stream::EDProducer<> {  // producer wrapping CAHitNtupletGenerator
+    using HitsConstView = TrackingRecHitSoAConstView<TrackerTraits>;
+    using HitsOnDevice = TrackingRecHitsSoACollection<TrackerTraits>;
+    using HitsOnHost = TrackingRecHitHost<TrackerTraits>;
+    // NOTE(review): HitsConstView, HitsOnHost and TkSoAHost are not used anywhere else in this file
+    using TkSoAHost = TracksHost<TrackerTraits>;
+    using TkSoADevice = TracksSoACollection<TrackerTraits>;
+    // the algorithm that builds the ntuplets
+    using Algo = CAHitNtupletGenerator<TrackerTraits>;
+
+  public:
+    explicit CAHitNtupletAlpaka(const edm::ParameterSet& iConfig);
+    ~CAHitNtupletAlpaka() override = default;
+    void produce(device::Event& iEvent, const device::EventSetup& es) override;
+    static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+  private:
+    const edm::ESGetToken<MagneticField, IdealMagneticFieldRecord> tokenField_;  // magnetic field
+    const device::ESGetToken<PixelCPEFastParams<TrackerTraits>, PixelCPEFastParamsRecord> cpeToken_;  // CPE parameters
+    const device::EDGetToken<HitsOnDevice> tokenHit_;  // input hits
+    const device::EDPutToken<TkSoADevice> tokenTrack_;  // output tracks
+    // generator configured from the module ParameterSet; produce() calls its makeTuplesAsync()
+    Algo deviceAlgo_;
+  };
+
+  template <typename TrackerTraits>
+  CAHitNtupletAlpaka<TrackerTraits>::CAHitNtupletAlpaka(const edm::ParameterSet& iConfig)
+      : tokenField_(esConsumes()),  // magnetic field, read in produce()
+        cpeToken_(esConsumes(edm::ESInputTag("", iConfig.getParameter<std::string>("CPE")))),  // label from "CPE"
+        tokenHit_(consumes(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"))),  // input hit collection
+        tokenTrack_(produces()),  // output track collection
+        deviceAlgo_(iConfig) {}  // the generator reads its own parameters from the same ParameterSet
+
+  template <typename TrackerTraits>
+  void CAHitNtupletAlpaka<TrackerTraits>::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+    // Module parameters: the input hit collection, the label of the CPE parameters (which carries the
+    // traits-specific name modifier) and everything the ntuplet generator declares for itself.
+    edm::ParameterSetDescription psetDescription;
+    edm::InputTag const defaultHits("siPixelRecHitsPreSplittingAlpaka");
+    psetDescription.add<edm::InputTag>("pixelRecHitSrc", defaultHits);
+    std::string cpeLabel("PixelCPEFastParams");
+    cpeLabel += TrackerTraits::nameModifier;
+    psetDescription.add<std::string>("CPE", cpeLabel);
+    Algo::fillPSetDescription(psetDescription);
+    descriptions.addWithDefaultLabel(psetDescription);
+  }
+
+  template <typename TrackerTraits>
+  void CAHitNtupletAlpaka<TrackerTraits>::produce(device::Event& iEvent, const device::EventSetup& es) {
+    // Read the field, the CPE parameters and the hits (in this order), then store what the generator returns.
+    auto const& magneticField = es.getData(tokenField_);
+    auto const fieldFactor = 1. / magneticField.inverseBzAtOriginInGeV();
+    auto const& cpeParams = es.getData(cpeToken_);
+    auto const& recHits = iEvent.get(tokenHit_);
+    iEvent.emplace(tokenTrack_,
+                   deviceAlgo_.makeTuplesAsync(recHits, cpeParams.const_buffer().data(), fieldFactor, iEvent.queue()));
+  }
+
+  using CAHitNtupletAlpakaPhase1 = CAHitNtupletAlpaka<pixelTopology::Phase1>;
+  using CAHitNtupletAlpakaPhase2 = CAHitNtupletAlpaka<pixelTopology::Phase2>;
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/MakerMacros.h"
+
+DEFINE_FWK_ALPAKA_MODULE(CAHitNtupletAlpakaPhase1);
+DEFINE_FWK_ALPAKA_MODULE(CAHitNtupletAlpakaPhase2);
diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.cc b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.cc
new file mode 100644
index 0000000000000..8f898872a66f4
--- /dev/null
+++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.cc
@@ -0,0 +1,329 @@
+//
+// Original Author: Felice Pantaleo, CERN
+//
+
+//#define GPU_DEBUG
+//#define DUMP_GPU_TK_TUPLES
+
+#include <array>
+#include <cassert>
+#include <functional>
+#include <vector>
+
+#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h"
+#include "DataFormats/TrackSoA/interface/TracksDevice.h"
+#include "DataFormats/TrackSoA/interface/TracksHost.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/Exception.h"
+
+#include "CAHitNtupletGenerator.h"
+#include "CAHitNtupletGeneratorKernels.h"
+#include "CAPixelDoublets.h"
+#include "CAPixelDoubletsAlgos.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace {
+
+    using namespace caHitNtupletGenerator;
+    using namespace caPixelDoublets;
+    using namespace pixelTopology;
+    using namespace pixelTrack;
+
+    // Returns the square of its argument, computed as x * x.
+    template <typename T>
+    T sqr(T x) {
+      T const squared = x * x;
+      return squared;
+    }
+
+    // Common Params
+    //
+    // Adds to `desc` the parameters shared by every tracker topology.
+    //
+    // The defaults of the double parameters are written as double literals: a float literal such as
+    // 0.9f would be widened to 0.89999997615814208984375 and recorded with that rounding noise in the
+    // description. The consumers in this file narrow these parameters back to float, so the values
+    // used by the algorithm are unchanged.
+    void fillDescriptionsCommon(edm::ParameterSetDescription& desc) {
+      // 87 cm/GeV = 1/(3.8T * 0.3)
+      // take less than radius given by the hardPtCut and reject everything below
+      desc.add<double>("ptmin", 0.9)->setComment("Cut on minimum pt");
+      desc.add<double>("CAThetaCutBarrel", 0.002)->setComment("Cut on RZ alignement for Barrel");
+      desc.add<double>("CAThetaCutForward", 0.003)->setComment("Cut on RZ alignment for Forward");
+      desc.add<double>("hardCurvCut", 1. / (0.35 * 87.))
+          ->setComment("Cut on minimum curvature, used in DCA ntuplet selection");
+      desc.add<double>("dcaCutInnerTriplet", 0.15)->setComment("Cut on origin radius when the inner hit is on BPix1");
+      desc.add<double>("dcaCutOuterTriplet", 0.25)->setComment("Cut on origin radius when the outer hit is on BPix1");
+      desc.add<bool>("earlyFishbone", true);
+      desc.add<bool>("lateFishbone", false);
+      desc.add<bool>("fillStatistics", false);
+      desc.add<unsigned int>("minHitsPerNtuplet", 4);
+      desc.add<unsigned int>("minHitsForSharingCut", 10)
+          ->setComment("Maximum number of hits in a tuple to clean also if the shared hit is on bpx1");
+
+      // fit and cleaning switches
+      desc.add<bool>("fitNas4", false)->setComment("fit only 4 hits out of N");
+      desc.add<bool>("doClusterCut", true);
+      desc.add<bool>("doZ0Cut", true);
+      desc.add<bool>("doPtCut", true);
+      desc.add<bool>("useRiemannFit", false)->setComment("true for Riemann, false for BrokenLine");
+      desc.add<bool>("doSharedHitCut", true)->setComment("Sharing hit nTuples cleaning");
+      desc.add<bool>("dupPassThrough", false)->setComment("Do not reject duplicate");
+      desc.add<bool>("useSimpleTripletCleaner", true)->setComment("use alternate implementation");
+    }
+
+    // Reads the algorithm-wide settings from the configuration and packs them into an AlgoParams.
+    // The values are passed positionally, so the order of the initialiser list must not change.
+    AlgoParams makeCommonParams(edm::ParameterSet const& cfg) {
+      auto const minHitsForSharingCut = cfg.getParameter<unsigned int>("minHitsForSharingCut");
+      auto const useRiemannFit = cfg.getParameter<bool>("useRiemannFit");
+      auto const fitNas4 = cfg.getParameter<bool>("fitNas4");
+      auto const includeJumpingForwardDoublets = cfg.getParameter<bool>("includeJumpingForwardDoublets");
+      auto const earlyFishbone = cfg.getParameter<bool>("earlyFishbone");
+      auto const lateFishbone = cfg.getParameter<bool>("lateFishbone");
+      auto const fillStatistics = cfg.getParameter<bool>("fillStatistics");
+      auto const doSharedHitCut = cfg.getParameter<bool>("doSharedHitCut");
+      auto const dupPassThrough = cfg.getParameter<bool>("dupPassThrough");
+      auto const useSimpleTripletCleaner = cfg.getParameter<bool>("useSimpleTripletCleaner");
+
+      return AlgoParams({minHitsForSharingCut,
+                         useRiemannFit,
+                         fitNas4,
+                         includeJumpingForwardDoublets,
+                         earlyFishbone,
+                         lateFishbone,
+                         fillStatistics,
+                         doSharedHitCut,
+                         dupPassThrough,
+                         useSimpleTripletCleaner});
+    }
+
+    // Primary template, intentionally empty: the second parameter exists only so that the partial
+    // specialisations below can be selected through isPhase1Topology / isPhase2Topology.
+    template <typename TrackerTraits, typename Enable = void>
+    struct TopologyCuts {};
+
+    // Specialisation selected by isPhase1Topology: CA cuts plus a pT-dependent chi2 quality selection.
+    template <typename TrackerTraits>
+    struct TopologyCuts<TrackerTraits, isPhase1Topology<TrackerTraits>> {
+      // Collects the cellular-automaton cuts from the module configuration (positional initialiser).
+      static constexpr CAParamsT<TrackerTraits> makeCACuts(edm::ParameterSet const& cfg) {
+        return CAParamsT<TrackerTraits>{{cfg.getParameter<unsigned int>("maxNumberOfDoublets"),
+                                         cfg.getParameter<unsigned int>("minHitsPerNtuplet"),
+                                         static_cast<float>(cfg.getParameter<double>("ptmin")),
+                                         static_cast<float>(cfg.getParameter<double>("CAThetaCutBarrel")),
+                                         static_cast<float>(cfg.getParameter<double>("CAThetaCutForward")),
+                                         static_cast<float>(cfg.getParameter<double>("hardCurvCut")),
+                                         static_cast<float>(cfg.getParameter<double>("dcaCutInnerTriplet")),
+                                         static_cast<float>(cfg.getParameter<double>("dcaCutOuterTriplet"))}};
+      }
+
+      // Builds the quality cuts from the "trackQualityCuts" parameter set passed as `pset`.
+      static constexpr ::pixelTrack::QualityCutsT<TrackerTraits> makeQualityCuts(edm::ParameterSet const& pset) {
+        auto chi2Coeff = pset.getParameter<std::array<double, 2>>("chi2Coeff");
+        auto chi2MaxPt = pset.getParameter<double>("chi2MaxPt");
+
+        // the second coefficient is replaced by (c1 - c0) / log2(chi2MaxPt) before being stored
+        chi2Coeff[1] = (chi2Coeff[1] - chi2Coeff[0]) / log2(chi2MaxPt);
+
+        return ::pixelTrack::QualityCutsT<TrackerTraits>{
+            // polynomial coefficients for the pT-dependent chi2 cut
+            {static_cast<float>(chi2Coeff[0]), static_cast<float>(chi2Coeff[1]), 0.f, 0.f},
+            // max pT used to determine the chi2 cut
+            static_cast<float>(chi2MaxPt),
+            // chi2 scale factor: 8 for broken line fit, ?? for Riemann fit
+            static_cast<float>(pset.getParameter<double>("chi2Scale")),
+            // regional cuts for triplets
+            {static_cast<float>(pset.getParameter<double>("tripletMaxTip")),
+             static_cast<float>(pset.getParameter<double>("tripletMinPt")),
+             static_cast<float>(pset.getParameter<double>("tripletMaxZip"))},
+            // regional cuts for quadruplets
+            {static_cast<float>(pset.getParameter<double>("quadrupletMaxTip")),
+             static_cast<float>(pset.getParameter<double>("quadrupletMinPt")),
+             static_cast<float>(pset.getParameter<double>("quadrupletMaxZip"))}};
+      }
+    };
+
+    // Specialisation selected by isPhase2Topology: the CA cuts carry the extra "includeFarForwards"
+    // switch and the quality selection is a flat set of four cuts.
+    template <typename TrackerTraits>
+    struct TopologyCuts<TrackerTraits, isPhase2Topology<TrackerTraits>> {
+      // Collects the cellular-automaton cuts from the module configuration (positional initialiser).
+      static constexpr CAParamsT<TrackerTraits> makeCACuts(edm::ParameterSet const& cfg) {
+        return CAParamsT<TrackerTraits>{{cfg.getParameter<unsigned int>("maxNumberOfDoublets"),
+                                         cfg.getParameter<unsigned int>("minHitsPerNtuplet"),
+                                         static_cast<float>(cfg.getParameter<double>("ptmin")),
+                                         static_cast<float>(cfg.getParameter<double>("CAThetaCutBarrel")),
+                                         static_cast<float>(cfg.getParameter<double>("CAThetaCutForward")),
+                                         static_cast<float>(cfg.getParameter<double>("hardCurvCut")),
+                                         static_cast<float>(cfg.getParameter<double>("dcaCutInnerTriplet")),
+                                         static_cast<float>(cfg.getParameter<double>("dcaCutOuterTriplet"))},
+                                        {cfg.getParameter<bool>("includeFarForwards")}};
+      }
+
+      // Builds the quality cuts from the "trackQualityCuts" parameter set passed as `pset`.
+      static constexpr ::pixelTrack::QualityCutsT<TrackerTraits> makeQualityCuts(edm::ParameterSet const& pset) {
+        return ::pixelTrack::QualityCutsT<TrackerTraits>{
+            (float)pset.getParameter<double>("maxChi2"),
+            (float)pset.getParameter<double>("minPt"),
+            (float)pset.getParameter<double>("maxTip"),
+            (float)pset.getParameter<double>("maxZip"),
+        };
+      }
+    };
+
+    // Cell cuts. They currently follow the same logic for Phase 1 and Phase 2; should the two need
+    // to diverge, this function can be moved into TopologyCuts and specialised with the same syntax.
+    template <typename TrackerTraits>
+    CellCutsT<TrackerTraits> makeCellCuts(edm::ParameterSet const& cfg) {
+      return CellCutsT<TrackerTraits>{cfg.getParameter<bool>("doClusterCut"),
+                                      cfg.getParameter<bool>("doZ0Cut"),
+                                      cfg.getParameter<bool>("doPtCut"),
+                                      cfg.getParameter<bool>("idealConditions"),
+                                      static_cast<float>(cfg.getParameter<double>("cellZ0Cut")),
+                                      static_cast<float>(cfg.getParameter<double>("cellPtCut")),
+                                      cfg.getParameter<std::vector<int>>("phiCuts")};
+    }
+
+  }  // namespace
+
+  using namespace std;
+
+  // Builds m_params from the module configuration: the common algorithm parameters, the cell cuts,
+  // the quality cuts (read from the nested "trackQualityCuts" parameter set) and the CA cuts.
+  // With DUMP_GPU_TK_TUPLES defined, the constructor also prints the column header of the track dump.
+  template <typename TrackerTraits>
+  CAHitNtupletGenerator<TrackerTraits>::CAHitNtupletGenerator(const edm::ParameterSet& cfg)
+      : m_params(makeCommonParams(cfg),
+                 makeCellCuts<TrackerTraits>(cfg),
+                 TopologyCuts<TrackerTraits>::makeQualityCuts(cfg.getParameterSet("trackQualityCuts")),
+                 TopologyCuts<TrackerTraits>::makeCACuts(cfg)) {
+#ifdef DUMP_GPU_TK_TUPLES
+    // one "%s" per column name listed below
+    printf("TK: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
+           "tid",
+           "qual",
+           "nh",
+           "nl",
+           "charge",
+           "pt",
+           "eta",
+           "phi",
+           "tip",
+           "zip",
+           "chi2",
+           "h1",
+           "h2",
+           "h3",
+           "h4",
+           "h5",
+           "hn");
+#endif
+  }
+
+  // Generic fallback: never meant to be instantiated. The static_assert depends on TrackerTraits, so it
+  // only fires when a topology without an explicit specialisation (see below) tries to use this function.
+  template <typename TrackerTraits>
+  void CAHitNtupletGenerator<TrackerTraits>::fillPSetDescription(edm::ParameterSetDescription& desc) {
+    static_assert(sizeof(TrackerTraits) == 0,
+                  "Note: this fillPSetDescription is a dummy one. Please specialise it for the correct version of "
+                  "CAHitNtupletGenerator<TrackerTraits>.");
+  }
+
+  // Parameter description for the Phase1 topology: the common parameters, the Phase1 doublet and cell
+  // defaults, the nested "trackQualityCuts" description and the default phi cuts of the topology.
+  template <>
+  void CAHitNtupletGenerator<pixelTopology::Phase1>::fillPSetDescription(edm::ParameterSetDescription& desc) {
+    using Topology = pixelTopology::Phase1;
+
+    // nested description of the fit-based quality cuts
+    edm::ParameterSetDescription qualityDesc;
+    qualityDesc.add<double>("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut");
+    qualityDesc.add<std::vector<double>>("chi2Coeff", {0.9, 1.8})->setComment("chi2 at 1GeV and at ptMax above");
+    qualityDesc.add<double>("chi2Scale", 8.)
+        ->setComment(
+            "Factor to multiply the pT-dependent chi2 cut (currently: 8 for the broken line fit, ?? for the Riemann "
+            "fit)");
+    qualityDesc.add<double>("tripletMinPt", 0.5)->setComment("Min pT for triplets, in GeV");
+    qualityDesc.add<double>("tripletMaxTip", 0.3)->setComment("Max |Tip| for triplets, in cm");
+    qualityDesc.add<double>("tripletMaxZip", 12.)->setComment("Max |Zip| for triplets, in cm");
+    qualityDesc.add<double>("quadrupletMinPt", 0.3)->setComment("Min pT for quadruplets, in GeV");
+    qualityDesc.add<double>("quadrupletMaxTip", 0.5)->setComment("Max |Tip| for quadruplets, in cm");
+    qualityDesc.add<double>("quadrupletMaxZip", 12.)->setComment("Max |Zip| for quadruplets, in cm");
+
+    // default phi cuts, copied from the topology's constant table
+    std::vector<int> const defaultPhiCuts(std::begin(phase1PixelTopology::phicuts),
+                                          std::end(phase1PixelTopology::phicuts));
+
+    // registration order is kept: common, topology-specific scalars, quality cuts, phi cuts
+    fillDescriptionsCommon(desc);
+
+    desc.add<unsigned int>("maxNumberOfDoublets", Topology::maxNumberOfDoublets);
+    desc.add<bool>("idealConditions", true);
+    desc.add<bool>("includeJumpingForwardDoublets", false);
+    desc.add<double>("cellZ0Cut", 12.0);
+    desc.add<double>("cellPtCut", 0.5);
+
+    desc.add<edm::ParameterSetDescription>("trackQualityCuts", qualityDesc)
+        ->setComment(
+            "Quality cuts based on the results of the track fit:\n  - apply a pT-dependent chi2 cut;\n  - apply "
+            "\"region "
+            "cuts\" based on the fit results (pT, Tip, Zip).");
+
+    desc.add<std::vector<int>>("phiCuts", defaultPhiCuts)->setComment("Cuts in phi for cells");
+  }
+
+  // Parameter description for the HIonPhase1 topology: same layout as the Phase1 one, with its own
+  // defaults for the doublet limit, the cell cuts and the region cuts; the phi cuts default to the
+  // phase1PixelTopology table.
+  template <>
+  void CAHitNtupletGenerator<pixelTopology::HIonPhase1>::fillPSetDescription(edm::ParameterSetDescription& desc) {
+    using Topology = pixelTopology::HIonPhase1;
+
+    // nested description of the fit-based quality cuts
+    edm::ParameterSetDescription qualityDesc;
+    qualityDesc.add<double>("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut");
+    qualityDesc.add<std::vector<double>>("chi2Coeff", {0.9, 1.8})->setComment("chi2 at 1GeV and at ptMax above");
+    qualityDesc.add<double>("chi2Scale", 8.)
+        ->setComment(
+            "Factor to multiply the pT-dependent chi2 cut (currently: 8 for the broken line fit, ?? for the Riemann "
+            "fit)");
+    qualityDesc.add<double>("tripletMinPt", 0.0)->setComment("Min pT for triplets, in GeV");
+    qualityDesc.add<double>("tripletMaxTip", 0.1)->setComment("Max |Tip| for triplets, in cm");
+    qualityDesc.add<double>("tripletMaxZip", 6.)->setComment("Max |Zip| for triplets, in cm");
+    qualityDesc.add<double>("quadrupletMinPt", 0.0)->setComment("Min pT for quadruplets, in GeV");
+    qualityDesc.add<double>("quadrupletMaxTip", 0.5)->setComment("Max |Tip| for quadruplets, in cm");
+    qualityDesc.add<double>("quadrupletMaxZip", 6.)->setComment("Max |Zip| for quadruplets, in cm");
+
+    // default phi cuts, copied from the topology's constant table
+    std::vector<int> const defaultPhiCuts(std::begin(phase1PixelTopology::phicuts),
+                                          std::end(phase1PixelTopology::phicuts));
+
+    // registration order is kept: common, topology-specific scalars, quality cuts, phi cuts
+    fillDescriptionsCommon(desc);
+
+    desc.add<unsigned int>("maxNumberOfDoublets", Topology::maxNumberOfDoublets);
+    desc.add<bool>("idealConditions", false);
+    desc.add<bool>("includeJumpingForwardDoublets", false);
+    desc.add<double>("cellZ0Cut", 10.0);
+    desc.add<double>("cellPtCut", 0.0);
+
+    desc.add<edm::ParameterSetDescription>("trackQualityCuts", qualityDesc)
+        ->setComment(
+            "Quality cuts based on the results of the track fit:\n  - apply a pT-dependent chi2 cut;\n  - apply "
+            "\"region "
+            "cuts\" based on the fit results (pT, Tip, Zip).");
+
+    desc.add<std::vector<int>>("phiCuts", defaultPhiCuts)->setComment("Cuts in phi for cells");
+  }
+
+  // Parameter description for the Phase2 topology: the common parameters, the Phase2 doublet and cell
+  // defaults (including "includeFarForwards"), a flat set of quality cuts and the Phase2 phi cuts.
+  template <>
+  void CAHitNtupletGenerator<pixelTopology::Phase2>::fillPSetDescription(edm::ParameterSetDescription& desc) {
+    using Topology = pixelTopology::Phase2;
+
+    // nested description of the fit-based quality cuts
+    edm::ParameterSetDescription qualityDesc;
+    qualityDesc.add<double>("maxChi2", 5.)->setComment("Max normalized chi2");
+    qualityDesc.add<double>("minPt", 0.5)->setComment("Min pT in GeV");
+    qualityDesc.add<double>("maxTip", 0.3)->setComment("Max |Tip| in cm");
+    qualityDesc.add<double>("maxZip", 12.)->setComment("Max |Zip|, in cm");
+
+    // default phi cuts, copied from the topology's constant table
+    std::vector<int> const defaultPhiCuts(std::begin(phase2PixelTopology::phicuts),
+                                          std::end(phase2PixelTopology::phicuts));
+
+    // registration order is kept: common, topology-specific scalars, quality cuts, phi cuts
+    fillDescriptionsCommon(desc);
+
+    desc.add<unsigned int>("maxNumberOfDoublets", Topology::maxNumberOfDoublets);
+    desc.add<bool>("idealConditions", false);
+    desc.add<bool>("includeFarForwards", true);
+    desc.add<bool>("includeJumpingForwardDoublets", true);
+    desc.add<double>("cellZ0Cut", 7.5);
+    desc.add<double>("cellPtCut", 0.85);
+
+    desc.add<edm::ParameterSetDescription>("trackQualityCuts", qualityDesc)
+        ->setComment(
+            "Quality cuts based on the results of the track fit:\n  - apply cuts based on the fit results (pT, Tip, "
+            "Zip).");
+
+    desc.add<std::vector<int>>("phiCuts", defaultPhiCuts)->setComment("Cuts in phi for cells");
+  }
+
+  // Enqueues the whole n-tuplet chain on `queue` and returns the resulting track collection:
+  // doublet building, the CA kernels, the helix fit (Riemann or broken line, chosen by
+  // m_params.useRiemannFit_) and the final classification of the tuples.
+  // NOTE(review): there is no early exit for an empty hit collection here - confirm that the
+  // kernels launched below cope with hits_d.view().metadata().size() == 0.
+  template <typename TrackerTraits>
+  TracksSoACollection<TrackerTraits> CAHitNtupletGenerator<TrackerTraits>::makeTuplesAsync(
+      HitsOnDevice const& hits_d, ParamsOnDevice const* cpeParams, float bfield, Queue& queue) const {
+    using Fitter = HelixFit<TrackerTraits>;
+    using Tracks = TracksSoACollection<TrackerTraits>;
+    using Kernels = CAHitNtupletGeneratorKernels<TrackerTraits>;
+
+    Tracks tracks(queue);
+
+    auto const& hitsView = hits_d.view();
+    auto const nHits = hitsView.metadata().size();
+
+    // the kernels object holds the per-call workspace, dimensioned with the number of hits
+    Kernels kernels(m_params, nHits, queue);
+
+    kernels.buildDoublets(hitsView, queue);
+    kernels.launchKernels(hitsView, tracks.view(), queue);
+
+    Fitter fitter(bfield, m_params.fitNas4_);
+    fitter.allocate(kernels.tupleMultiplicity(), tracks.view());
+    if (not m_params.useRiemannFit_) {
+      fitter.launchBrokenLineKernels(hitsView, cpeParams, nHits, TrackerTraits::maxNumberOfQuadruplets, queue);
+    } else {
+      fitter.launchRiemannKernels(hitsView, cpeParams, nHits, TrackerTraits::maxNumberOfQuadruplets, queue);
+    }
+
+    kernels.classifyTuples(hitsView, tracks.view(), queue);
+#ifdef GPU_DEBUG
+    alpaka::wait(queue);
+    std::cout << "finished building pixel tracks on GPU" << std::endl;
+#endif
+
+    return tracks;
+  }
+
+  // Explicit instantiations for the three topologies that have a fillPSetDescription specialisation above.
+  template class CAHitNtupletGenerator<pixelTopology::Phase1>;
+  template class CAHitNtupletGenerator<pixelTopology::Phase2>;
+  template class CAHitNtupletGenerator<pixelTopology::HIonPhase1>;
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.h
new file mode 100644
index 0000000000000..826b92d4a195a
--- /dev/null
+++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.h
@@ -0,0 +1,86 @@
+#ifndef RecoPixelVertexing_PixelTriplets_Alpaka_CAHitNtupletGenerator_h
+#define RecoPixelVertexing_PixelTriplets_Alpaka_CAHitNtupletGenerator_h
+
+#include <alpaka/alpaka.hpp>
+
+#include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h"
+#include "DataFormats/TrackSoA/interface/TrackDefinitions.h"
+#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h"
+#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h"
+#include "DataFormats/TrackingRecHitSoA/interface/alpaka/TrackingRecHitsSoACollection.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h"
+
+#include "CAHitNtupletGeneratorKernels.h"
+#include "CACell.h"
+#include "HelixFit.h"
+
+namespace edm {
+  class ParameterSetDescription;
+}  // namespace edm
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+
+  // Host-side driver of the pixel n-tuplet building for one tracker topology.
+  // It keeps the parameters read from the configuration (m_params) and exposes makeTuplesAsync(),
+  // which returns a track collection for a given hit collection and queue.
+  template <typename TrackerTraits>
+  class CAHitNtupletGenerator {
+  public:
+    // hit collections and views
+    using HitsView = TrackingRecHitSoAView<TrackerTraits>;
+    using HitsConstView = TrackingRecHitSoAConstView<TrackerTraits>;
+    using HitsOnDevice = TrackingRecHitsSoACollection<TrackerTraits>;
+    using HitsOnHost = TrackingRecHitHost<TrackerTraits>;
+    using hindex_type = typename TrackingRecHitSoA<TrackerTraits>::hindex_type;
+
+    // association containers from caStructures
+    using HitToTuple = caStructures::HitToTupleT<TrackerTraits>;
+    using TupleMultiplicity = caStructures::TupleMultiplicityT<TrackerTraits>;
+    using OuterHitOfCell = caStructures::OuterHitOfCellT<TrackerTraits>;
+
+    // cells and track collections
+    using CACell = CACellT<TrackerTraits>;
+    using TkSoAHost = TracksHost<TrackerTraits>;
+    using TkSoADevice = TracksSoACollection<TrackerTraits>;
+    using HitContainer = typename reco::TrackSoA<TrackerTraits>::HitContainer;
+    using Tuple = HitContainer;
+
+    using CellNeighborsVector = caStructures::CellNeighborsVectorT<TrackerTraits>;
+    using CellTracksVector = caStructures::CellTracksVectorT<TrackerTraits>;
+
+    using Quality = ::pixelTrack::Quality;
+
+    // cuts, parameters and counters
+    using QualityCuts = ::pixelTrack::QualityCutsT<TrackerTraits>;
+    using Params = caHitNtupletGenerator::ParamsT<TrackerTraits>;
+    using Counters = caHitNtupletGenerator::Counters;
+
+    using ParamsOnDevice = pixelCPEforDevice::ParamsOnDeviceT<TrackerTraits>;
+
+  public:
+    // Reads all the algorithm parameters from the module configuration.
+    CAHitNtupletGenerator(const edm::ParameterSet& cfg);
+
+    // Adds the parameters read by the constructor to `desc`; specialised per topology.
+    static void fillPSetDescription(edm::ParameterSetDescription& desc);
+
+    // NOTE: beginJob and endJob were meant to be used
+    // to fill the statistics. This is still not implemented in Alpaka
+    // since we are missing the begin/endJob functionality for the Alpaka
+    // producers.
+    //
+    // void beginJob();
+    // void endJob();
+
+    // Enqueues the n-tuplet building and fitting on `queue` and returns the track collection.
+    //   hits_d    : hit collection to build the n-tuplets from
+    //   cpeParams : pointer to the CPE parameters, forwarded to the fit
+    //   bfield    : magnetic field value, forwarded to the fit
+    TkSoADevice makeTuplesAsync(HitsOnDevice const& hits_d,
+                                ParamsOnDevice const* cpeParams,
+                                float bfield,
+                                Queue& queue) const;
+
+  private:
+    // NOTE(review): the three member functions below are only declared here; no definition or call
+    // is present in CAHitNtupletGenerator.cc - confirm whether they can be removed.
+    void buildDoublets(const HitsConstView& hh, Queue& queue) const;
+
+    void hitNtuplets(const HitsConstView& hh, const edm::EventSetup& es, bool useRiemannFit, Queue& queue);
+
+    void launchKernels(const HitsConstView& hh, bool useRiemannFit, Queue& queue) const;
+
+    // parameters built once from the configuration in the constructor
+    Params m_params;
+  };
+
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+
+#endif  // RecoPixelVertexing_PixelTriplets_Alpaka_CAHitNtupletGenerator_h
diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.dev.cc b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.dev.cc
new file mode 100644
index 0000000000000..44e3295bdb606
--- /dev/null
+++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.dev.cc
@@ -0,0 +1,538 @@
+#include <alpaka/alpaka.hpp>
+#include "HeterogeneousCore/AlpakaInterface/interface/devices.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/host.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
+#include "CAHitNtupletGeneratorKernels.h"
+#include "CAHitNtupletGeneratorKernelsImpl.h"
+#ifdef DUMP_GPU_TK_TUPLES
+#include <mutex>
+#endif
+
+//#define GPU_DEBUG
+//#define NTUPLE_DEBUG
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+
+  // Allocates on `queue` the device workspace used by the n-tuplet kernels and enqueues its
+  // initialisation.
+  //   params : algorithm parameters; copied into m_params, the cell cuts are also copied to the device
+  //   nhits  : number of hits, used to dimension device_isOuterHitOfCell_ (at least one element)
+  //   queue  : queue used for every allocation, memset, copy and launch below
+  // The three counters device_hitTuple_apc_, device_hitToTuple_apc_ and device_nCells_ share the
+  // three-element device_storage_ buffer; the two cell containers share cellStorage_.
+  template <typename TrackerTraits>
+  CAHitNtupletGeneratorKernels<TrackerTraits>::CAHitNtupletGeneratorKernels(Params const &params,
+                                                                            uint32_t nhits,
+                                                                            Queue &queue)
+      : m_params(params),
+        //////////////////////////////////////////////////////////
+        // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER)
+        //////////////////////////////////////////////////////////
+        counters_{cms::alpakatools::make_device_buffer<Counters>(queue)},
+
+        // workspace
+        device_hitToTuple_{cms::alpakatools::make_device_buffer<HitToTuple>(queue)},
+        device_tupleMultiplicity_{cms::alpakatools::make_device_buffer<TupleMultiplicity>(queue)},
+
+        // NB: In legacy, device_theCells_ and device_isOuterHitOfCell_ were allocated inside buildDoublets
+        device_theCells_{
+            cms::alpakatools::make_device_buffer<CACell[]>(queue, m_params.caParams_.maxNumberOfDoublets_)},
+        // in principle we can use "nhits" to heuristically dimension the workspace...
+        device_isOuterHitOfCell_{
+            cms::alpakatools::make_device_buffer<OuterHitOfCellContainer[]>(queue, std::max(1u, nhits))},
+        isOuterHitOfCell_{cms::alpakatools::make_device_buffer<OuterHitOfCell>(queue)},
+
+        device_theCellNeighbors_{cms::alpakatools::make_device_buffer<CellNeighborsVector>(queue)},
+        device_theCellTracks_{cms::alpakatools::make_device_buffer<CellTracksVector>(queue)},
+        // NB: In legacy, cellStorage_ was allocated inside buildDoublets
+        cellStorage_{cms::alpakatools::make_device_buffer<unsigned char[]>(
+            queue,
+            TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors) +
+                TrackerTraits::maxNumOfActiveDoublets * sizeof(CellTracks))},
+        device_cellCuts_{cms::alpakatools::make_device_buffer<CellCuts>(queue)},
+        device_theCellNeighborsContainer_{reinterpret_cast<CellNeighbors *>(cellStorage_.data())},
+        device_theCellTracksContainer_{reinterpret_cast<CellTracks *>(
+            cellStorage_.data() + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors))},
+
+        // NB: In legacy, device_storage_ was allocated inside allocateOnGPU
+        device_storage_{
+            cms::alpakatools::make_device_buffer<cms::alpakatools::AtomicPairCounter::DoubleWord[]>(queue, 3u)},
+        device_hitTuple_apc_{reinterpret_cast<cms::alpakatools::AtomicPairCounter *>(device_storage_.data())},
+        device_hitToTuple_apc_{reinterpret_cast<cms::alpakatools::AtomicPairCounter *>(device_storage_.data() + 1)},
+        device_nCells_{cms::alpakatools::make_device_view(alpaka::getDev(queue),
+                                                          *reinterpret_cast<uint32_t *>(device_storage_.data() + 2))} {
+    // reset the counters, the number of cells and the shared cell storage
+    alpaka::memset(queue, counters_, 0);
+    alpaka::memset(queue, device_nCells_, 0);
+    alpaka::memset(queue, cellStorage_, 0);
+
+    // copy the cell cuts from the host-side parameters to the device
+    auto cellCuts_h = cms::alpakatools::make_host_view(m_params.cellCuts_);
+    alpaka::memcpy(queue, device_cellCuts_, cellCuts_h);
+
+    // Zero the two association containers, once each: a second launchZero on the buffer held by
+    // device_tupleMultiplicity_ would only repeat the same reset.
+    [[maybe_unused]] TupleMultiplicity *tupleMultiplicityDeviceData = device_tupleMultiplicity_.data();
+    [[maybe_unused]] HitToTuple *hitToTupleDeviceData = device_hitToTuple_.data();
+    TupleMultiplicity::template launchZero<Acc1D>(tupleMultiplicityDeviceData, queue);
+    HitToTuple::template launchZero<Acc1D>(hitToTupleDeviceData, queue);
+  }
+
+  // Enqueues on `queue`, in this order, the kernels that turn the cells (doublets) into n-tuplets
+  // stored in `tracks_view`:
+  //   1. reset of the hit-index container of the tracks
+  //   2. Kernel_connect
+  //   3. CAFishbone, only if m_params.earlyFishbone_ is set
+  //   4. Kernel_find_ntuplets, then Kernel_mark_used if m_params.doStats_ is set
+  //   5. HitContainer::finalizeBulk, Kernel_fillHitDetIndices, Kernel_fillNLayers
+  //   6. Kernel_earlyDuplicateRemover
+  //   7. Kernel_countMultiplicity, TupleMultiplicity::launchFinalize, Kernel_fillMultiplicity
+  //   8. CAFishbone, only if m_params.lateFishbone_ is set
+  // With GPU_DEBUG defined the queue is waited on after most of the steps.
+  //   hh          : const view of the hits
+  //   tracks_view : view of the tracks to be filled
+  template <typename TrackerTraits>
+  void CAHitNtupletGeneratorKernels<TrackerTraits>::launchKernels(const HitsConstView &hh,
+                                                                  TkSoAView &tracks_view,
+                                                                  Queue &queue) {
+    using namespace caPixelDoublets;
+    using namespace caHitNtupletGeneratorKernels;
+
+    // zero tuples
+    HitContainer::template launchZero<Acc1D>(&(tracks_view.hitIndices()), queue);
+
+    int32_t nhits = hh.metadata().size();
+
+#ifdef NTUPLE_DEBUG
+    std::cout << "start tuple building. N hits " << nhits << std::endl;
+    if (nhits < 2)
+      std::cout << "too few hits " << nhits << std::endl;
+#endif
+
+    //
+    // applying combinatoric cleaning such as fishbone at this stage is too expensive
+    //
+
+    // 2D work division for Kernel_connect: start from 64 threads arranged as 16 x 4 (blockSize x stride),
+    // sized on 3/4 of maxNumberOfDoublets_; blockSize is then scaled up by (numberOfBlocks / 65536 + 1)
+    // so that the recomputed number of blocks satisfies the assert below.
+    const auto nthTot = 64;
+    const auto stride = 4;
+    auto blockSize = nthTot / stride;
+    auto numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize);
+    const auto rescale = numberOfBlocks / 65536;
+    blockSize *= (rescale + 1);
+    numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize);
+    assert(numberOfBlocks < 65536);
+    assert(blockSize > 0 && 0 == blockSize % 16);
+    const Vec2D blks{numberOfBlocks, 1u};
+    const Vec2D thrs{blockSize, stride};
+    const auto kernelConnectWorkDiv = cms::alpakatools::make_workdiv<Acc2D>(blks, thrs);
+
+    alpaka::exec<Acc2D>(queue,
+                        kernelConnectWorkDiv,
+                        Kernel_connect<TrackerTraits>{},
+                        this->device_hitTuple_apc_,
+                        this->device_hitToTuple_apc_,  // needed only to be reset, ready for next kernel
+                        hh,
+                        this->device_theCells_.data(),
+                        this->device_nCells_.data(),
+                        this->device_theCellNeighbors_.data(),
+                        this->isOuterHitOfCell_.data(),
+                        this->m_params.caParams_);
+
+    // early fishbone: guarded only by the earlyFishbone_ switch
+    // (the local names below deliberately shadow the ones of the enclosing scope)
+    if (this->m_params.earlyFishbone_) {
+      const auto nthTot = 128;
+      const auto stride = 16;
+      const auto blockSize = nthTot / stride;
+      const auto numberOfBlocks = cms::alpakatools::divide_up_by(nhits, blockSize);
+      const Vec2D blks{numberOfBlocks, 1u};
+      const Vec2D thrs{blockSize, stride};
+      const auto fishboneWorkDiv = cms::alpakatools::make_workdiv<Acc2D>(blks, thrs);
+      alpaka::exec<Acc2D>(queue,
+                          fishboneWorkDiv,
+                          CAFishbone<TrackerTraits>{},
+                          hh,
+                          this->device_theCells_.data(),
+                          this->device_nCells_.data(),
+                          this->isOuterHitOfCell_.data(),
+                          nhits,
+                          false);
+    }
+    // n-tuplet finding: 1D work division, 64 threads per block, sized on 3/4 of maxNumberOfDoublets_
+    blockSize = 64;
+    numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize);
+    auto workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+    alpaka::exec<Acc1D>(queue,
+                        workDiv1D,
+                        Kernel_find_ntuplets<TrackerTraits>{},
+                        hh,
+                        tracks_view,
+                        this->device_theCells_.data(),
+                        this->device_nCells_.data(),
+                        this->device_theCellTracks_.data(),
+                        this->device_hitTuple_apc_,
+                        this->m_params.caParams_);
+#ifdef GPU_DEBUG
+    alpaka::wait(queue);
+#endif
+
+    // only when statistics are requested; reuses the work division of the previous kernel
+    if (this->m_params.doStats_)
+      alpaka::exec<Acc1D>(queue,
+                          workDiv1D,
+                          Kernel_mark_used<TrackerTraits>{},
+                          this->device_theCells_.data(),
+                          this->device_nCells_.data());
+
+#ifdef GPU_DEBUG
+    alpaka::wait(queue);
+#endif
+
+    // the next three launches share one work division, sized on HitContainer{}.totOnes()
+    blockSize = 128;
+    numberOfBlocks = cms::alpakatools::divide_up_by(HitContainer{}.totOnes(), blockSize);
+    workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+
+    alpaka::exec<Acc1D>(
+        queue, workDiv1D, typename HitContainer::finalizeBulk{}, this->device_hitTuple_apc_, &tracks_view.hitIndices());
+
+#ifdef GPU_DEBUG
+    alpaka::wait(queue);
+#endif
+
+    alpaka::exec<Acc1D>(queue, workDiv1D, Kernel_fillHitDetIndices<TrackerTraits>{}, tracks_view, hh);
+
+#ifdef GPU_DEBUG
+    alpaka::wait(queue);
+#endif
+    alpaka::exec<Acc1D>(queue, workDiv1D, Kernel_fillNLayers<TrackerTraits>{}, tracks_view, this->device_hitTuple_apc_);
+
+#ifdef GPU_DEBUG
+    alpaka::wait(queue);
+#endif
+
+    // remove duplicates (tracks that share a doublet)
+    // blockSize is still 128 here; the grid is again sized on 3/4 of maxNumberOfDoublets_
+    numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize);
+    workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+
+    alpaka::exec<Acc1D>(queue,
+                        workDiv1D,
+                        Kernel_earlyDuplicateRemover<TrackerTraits>{},
+                        this->device_theCells_.data(),
+                        this->device_nCells_.data(),
+                        tracks_view,
+                        this->m_params.dupPassThrough_);
+#ifdef GPU_DEBUG
+    alpaka::wait(queue);
+#endif
+
+    // tuple multiplicity: count, finalize, fill; the grid is sized on 3/4 of maxNumberOfTuples
+    blockSize = 128;
+    numberOfBlocks = cms::alpakatools::divide_up_by(3 * TrackerTraits::maxNumberOfTuples / 4, blockSize);
+    workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+
+    alpaka::exec<Acc1D>(queue,
+                        workDiv1D,
+                        Kernel_countMultiplicity<TrackerTraits>{},
+                        tracks_view,
+                        this->device_tupleMultiplicity_.data());
+    TupleMultiplicity::template launchFinalize<Acc1D>(this->device_tupleMultiplicity_.data(), queue);
+
+    // same numberOfBlocks and blockSize as for Kernel_countMultiplicity
+    workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+    alpaka::exec<Acc1D>(
+        queue, workDiv1D, Kernel_fillMultiplicity<TrackerTraits>{}, tracks_view, this->device_tupleMultiplicity_.data());
+#ifdef GPU_DEBUG
+    alpaka::wait(queue);
+#endif
+    // late fishbone: guarded only by the lateFishbone_ switch; same launch as the early one except
+    // for the trailing flag (true here, false above)
+    if (this->m_params.lateFishbone_) {
+      const auto nthTot = 128;
+      const auto stride = 16;
+      const auto blockSize = nthTot / stride;
+      const auto numberOfBlocks = cms::alpakatools::divide_up_by(nhits, blockSize);
+      const Vec2D blks{numberOfBlocks, 1u};
+      const Vec2D thrs{blockSize, stride};
+      const auto workDiv2D = cms::alpakatools::make_workdiv<Acc2D>(blks, thrs);
+
+      alpaka::exec<Acc2D>(queue,
+                          workDiv2D,
+                          CAFishbone<TrackerTraits>{},
+                          hh,
+                          this->device_theCells_.data(),
+                          this->device_nCells_.data(),
+                          this->isOuterHitOfCell_.data(),
+                          nhits,
+                          true);
+    }
+
+#ifdef GPU_DEBUG
+    alpaka::wait(queue);
+#endif
+  }
+
+  template <typename TrackerTraits>
+  void CAHitNtupletGeneratorKernels<TrackerTraits>::buildDoublets(const HitsConstView &hh, Queue &queue) {
+    auto nhits = hh.metadata().size();  // size reported by the hits view metadata
+    // Enqueues on `queue`, in order: the OuterHitOfCell setup task, InitDoublets and, if nhits > 0, GetDoubletsFromHisto.
+    using namespace caPixelDoublets;
+
+    using CACell = CACellT<TrackerTraits>;
+    using OuterHitOfCell = typename CACell::OuterHitOfCell;
+    using CellNeighbors = typename CACell::CellNeighbors;  // NOTE(review): alias not referenced in this function
+    using CellTracks = typename CACell::CellTracks;        // NOTE(review): alias not referenced in this function
+    using OuterHitOfCellContainer = typename CACell::OuterHitOfCellContainer;
+
+#ifdef NTUPLE_DEBUG
+    std::cout << "building Doublets out of " << nhits << " Hits" << std::endl;
+#endif
+
+#ifdef GPU_DEBUG
+    alpaka::wait(queue);
+#endif
+
+    // in principle we can use "nhits" to heuristically dimension the workspace...
+    ALPAKA_ASSERT_OFFLOAD(this->device_isOuterHitOfCell_.data());  // the container buffer pointer must be non-null
+    // single-thread (1 x 1) task: store the container pointer and the value of hh.offsetBPIX2() in isOuterHitOfCell_
+    alpaka::exec<Acc1D>(
+        queue,
+        cms::alpakatools::make_workdiv<Acc1D>(1, 1),
+        [] ALPAKA_FN_ACC(Acc1D const &acc,
+                         OuterHitOfCell *isOuterHitOfCell,
+                         OuterHitOfCellContainer *container,
+                         int32_t const *offset) {
+          // this code runs on the device
+          isOuterHitOfCell->container = container;
+          isOuterHitOfCell->offset = *offset;
+        },
+        this->isOuterHitOfCell_.data(),
+        this->device_isOuterHitOfCell_.data(),
+        &hh.offsetBPIX2());
+
+    {
+      int threadsPerBlock = 128;
+      // at least one block, so that InitDoublets is launched even when nhits is 0
+      int blocks = std::max(1u, cms::alpakatools::divide_up_by(nhits, threadsPerBlock));
+      const auto workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(blocks, threadsPerBlock);
+
+      alpaka::exec<Acc1D>(queue,
+                          workDiv1D,
+                          InitDoublets<TrackerTraits>{},
+                          this->isOuterHitOfCell_.data(),
+                          nhits,
+                          this->device_theCellNeighbors_.data(),
+                          this->device_theCellNeighborsContainer_,
+                          this->device_theCellTracks_.data(),
+                          this->device_theCellTracksContainer_);
+    }
+
+#ifdef GPU_DEBUG
+    alpaka::wait(queue);
+#endif
+
+    if (0 == nhits)
+      return;  // protect against empty events: GetDoubletsFromHisto is not launched
+
+    // number of layer pairs to process, as selected by the configured parameters (see ParamsT::nPairs())
+    auto nActualPairs = this->m_params.nPairs();
+
+    const int stride = 4;  // extent of the second dimension of the 2D work division
+    const int threadsPerBlock = TrackerTraits::getDoubletsFromHistoMaxBlockSize / stride;
+    int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock;  // ceil(4 * nhits / threadsPerBlock)
+    const Vec2D blks{blocks, 1u};
+    const Vec2D thrs{threadsPerBlock, stride};
+    const auto workDiv2D = cms::alpakatools::make_workdiv<Acc2D>(blks, thrs);
+
+    alpaka::exec<Acc2D>(queue,
+                        workDiv2D,
+                        GetDoubletsFromHisto<TrackerTraits>{},
+                        this->device_theCells_.data(),
+                        this->device_nCells_.data(),
+                        this->device_theCellNeighbors_.data(),
+                        this->device_theCellTracks_.data(),
+                        hh,
+                        this->isOuterHitOfCell_.data(),
+                        nActualPairs,
+                        this->m_params.caParams_.maxNumberOfDoublets_,
+                        this->m_params.cellCuts_);
+
+#ifdef GPU_DEBUG
+    alpaka::wait(queue);
+#endif
+  }
+
+  template <typename TrackerTraits>
+  void CAHitNtupletGeneratorKernels<TrackerTraits>::classifyTuples(const HitsConstView &hh,
+                                                                   TkSoAView &tracks_view,
+                                                                   Queue &queue) {
+    using namespace caHitNtupletGeneratorKernels;
+    // Enqueues the track classification and duplicate-cleaning kernels on `queue`; optional steps follow m_params flags.
+    uint32_t nhits = hh.metadata().size();
+
+    auto blockSize = 64;  // threads per block for every 1D launch below except the DUMP_GPU_TK_TUPLES printout
+
+    // classify tracks based on kinematics
+    auto numberOfBlocks = cms::alpakatools::divide_up_by(3 * TrackerTraits::maxNumberOfQuadruplets / 4, blockSize);
+    auto workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+    alpaka::exec<Acc1D>(
+        queue, workDiv1D, Kernel_classifyTracks<TrackerTraits>{}, tracks_view, this->m_params.qualityCuts_);
+
+    if (this->m_params.lateFishbone_) {
+      // apply fishbone cleaning to good tracks
+      numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize);
+      workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+      alpaka::exec<Acc1D>(queue,
+                          workDiv1D,
+                          Kernel_fishboneCleaner<TrackerTraits>{},
+                          this->device_theCells_.data(),
+                          this->device_nCells_.data(),
+                          tracks_view);
+    }
+
+    // mark duplicates (tracks that share a doublet)
+    numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize);
+    workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+    alpaka::exec<Acc1D>(queue,
+                        workDiv1D,
+                        Kernel_fastDuplicateRemover<TrackerTraits>{},
+                        this->device_theCells_.data(),
+                        this->device_nCells_.data(),
+                        tracks_view,
+                        this->m_params.dupPassThrough_);
+#ifdef GPU_DEBUG
+    alpaka::wait(queue);
+#endif
+
+    if (this->m_params.doSharedHitCut_ || this->m_params.doStats_) {
+      // fill hit->track "map": count, finalize, then fill (the same work division is used for count and fill)
+      numberOfBlocks = cms::alpakatools::divide_up_by(3 * TrackerTraits::maxNumberOfQuadruplets / 4, blockSize);
+      workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+      alpaka::exec<Acc1D>(queue,
+                          workDiv1D,
+                          Kernel_countHitInTracks<TrackerTraits>{},
+                          tracks_view,
+                          this->device_hitToTuple_.data());  //CHECK
+
+      HitToTuple::template launchFinalize<Acc1D>(this->device_hitToTuple_.data(), queue);
+      alpaka::exec<Acc1D>(
+          queue, workDiv1D, Kernel_fillHitInTracks<TrackerTraits>{}, tracks_view, this->device_hitToTuple_.data());
+#ifdef GPU_DEBUG
+      alpaka::wait(queue);
+#endif
+    }
+
+    if (this->m_params.doSharedHitCut_) {
+      // mark duplicates (tracks that share at least one hit)
+      numberOfBlocks = cms::alpakatools::divide_up_by(3 * TrackerTraits::maxNumberOfQuadruplets / 4,
+                                                      blockSize);  // TODO: Check if correct
+      workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+      alpaka::exec<Acc1D>(queue,
+                          workDiv1D,
+                          Kernel_rejectDuplicate<TrackerTraits>{},
+                          tracks_view,
+                          this->m_params.minHitsForSharingCut_,
+                          this->m_params.dupPassThrough_,
+                          this->device_hitToTuple_.data());
+
+      alpaka::exec<Acc1D>(queue,
+                          workDiv1D,
+                          Kernel_sharedHitCleaner<TrackerTraits>{},
+                          hh,
+                          tracks_view,
+                          this->m_params.minHitsForSharingCut_,
+                          this->m_params.dupPassThrough_,
+                          this->device_hitToTuple_.data());
+
+      if (this->m_params.useSimpleTripletCleaner_) {
+        // grid sized from the HitToTuple capacity (identical for both cleaner variants)
+        numberOfBlocks = cms::alpakatools::divide_up_by(HitToTuple{}.capacity(), blockSize);
+        workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+        alpaka::exec<Acc1D>(queue,
+                            workDiv1D,
+                            Kernel_simpleTripletCleaner<TrackerTraits>{},
+                            tracks_view,
+                            this->m_params.minHitsForSharingCut_,
+                            this->m_params.dupPassThrough_,
+                            this->device_hitToTuple_.data());
+      } else {
+        numberOfBlocks = cms::alpakatools::divide_up_by(HitToTuple{}.capacity(), blockSize);
+        workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+        alpaka::exec<Acc1D>(queue,
+                            workDiv1D,
+                            Kernel_tripletCleaner<TrackerTraits>{},
+                            tracks_view,
+                            this->m_params.minHitsForSharingCut_,
+                            this->m_params.dupPassThrough_,
+                            this->device_hitToTuple_.data());
+      }
+#ifdef GPU_DEBUG
+      alpaka::wait(queue);
+#endif
+    }
+
+    if (this->m_params.doStats_) {
+      numberOfBlocks =
+          cms::alpakatools::divide_up_by(std::max(nhits, m_params.caParams_.maxNumberOfDoublets_), blockSize);
+      workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+
+      alpaka::exec<Acc1D>(queue,
+                          workDiv1D,
+                          Kernel_checkOverflows<TrackerTraits>{},
+                          tracks_view,
+                          this->device_tupleMultiplicity_.data(),
+                          this->device_hitToTuple_.data(),
+                          this->device_hitTuple_apc_,
+                          this->device_theCells_.data(),
+                          this->device_nCells_.data(),
+                          this->device_theCellNeighbors_.data(),
+                          this->device_theCellTracks_.data(),
+                          this->isOuterHitOfCell_.data(),
+                          nhits,
+                          this->m_params.caParams_.maxNumberOfDoublets_,
+                          this->counters_.data());
+    }
+
+    if (this->m_params.doStats_) {
+      // counters (add flag???) -- NOTE(review): same condition as the block above; the two could be merged
+
+      numberOfBlocks = cms::alpakatools::divide_up_by(HitToTuple{}.capacity(), blockSize);
+      workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+      alpaka::exec<Acc1D>(queue,
+                          workDiv1D,
+                          Kernel_doStatsForHitInTracks<TrackerTraits>{},
+                          this->device_hitToTuple_.data(),
+                          this->counters_.data());
+
+      numberOfBlocks = cms::alpakatools::divide_up_by(3 * TrackerTraits::maxNumberOfQuadruplets / 4, blockSize);
+      workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+      alpaka::exec<Acc1D>(
+          queue, workDiv1D, Kernel_doStatsForTracks<TrackerTraits>{}, tracks_view, this->counters_.data());
+    }
+#ifdef GPU_DEBUG
+    alpaka::wait(queue);
+#endif
+
+#ifdef DUMP_GPU_TK_TUPLES
+    static std::atomic<int> iev(0);  // incremented once per call while the lock below is held
+    static std::mutex lock;          // serialises the whole dump across concurrent callers
+    workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(1u, 32u);
+    {
+      std::lock_guard<std::mutex> guard(lock);
+      ++iev;
+      for (int k = 0; k < 20000; k += 500) {  // slices (k, k + 500); presumably a track index range -- confirm in kernel
+        alpaka::exec<Acc1D>(queue,
+                            workDiv1D,
+                            Kernel_print_found_ntuplets<TrackerTraits>{},
+                            hh,
+                            tracks_view,
+                            this->device_hitToTuple_.data(),
+                            k,
+                            k + 500,
+                            iev);
+        alpaka::wait(queue);  // wait for this slice before launching the next one
+      }
+      alpaka::exec<Acc1D>(queue,
+                          workDiv1D,
+                          Kernel_print_found_ntuplets<TrackerTraits>{},
+                          hh,
+                          tracks_view,
+                          this->device_hitToTuple_.data(),
+                          20000,
+                          1000000,
+                          iev);
+
+      alpaka::wait(queue);
+    }
+#endif
+  }
+  // This will make sense once we are able to run it once per job in Alpaka
+  /*
+template <typename TrackerTraits>
+void CAHitNtupletGeneratorKernels<TrackerTraits>::printCounters() {
+    auto workDiv1D = cms::alpakatools::make_workdiv<Acc1D>(1,1);
+    alpaka::exec<Acc1D>(queue_,workDiv1D,Kernel_printCounters{},this->counters_.data());
+}
+*/
+  template class CAHitNtupletGeneratorKernels<pixelTopology::Phase1>;  // explicit instantiations, one per topology
+  template class CAHitNtupletGeneratorKernels<pixelTopology::Phase2>;
+  template class CAHitNtupletGeneratorKernels<pixelTopology::HIonPhase1>;
+
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.h
new file mode 100644
index 0000000000000..d55be09e6e497
--- /dev/null
+++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.h
@@ -0,0 +1,273 @@
+#ifndef RecoPixelVertexing_PixelTriplets_CAHitNtupletGeneratorKernels_h
+#define RecoPixelVertexing_PixelTriplets_CAHitNtupletGeneratorKernels_h
+
+//#define GPU_DEBUG
+//#define DUMP_GPU_TK_TUPLES
+
+#include <cstdint>
+
+#include <alpaka/alpaka.hpp>
+
+#include "DataFormats/TrackSoA/interface/TrackDefinitions.h"
+#include "DataFormats/TrackSoA/interface/TracksHost.h"
+#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h"
+#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+
+#include "CACell.h"
+#include "CAPixelDoublets.h"
+#include "CAStructures.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace caHitNtupletGenerator {
+
+    // Algorithm configuration common to all topologies (public base of every ParamsT specialisation)
+    struct AlgoParams {
+      const uint32_t minHitsForSharingCut_;  // forwarded to the duplicate/shared-hit cleaning kernels
+      const bool useRiemannFit_;
+      const bool fitNas4_;
+      const bool includeJumpingForwardDoublets_;  // read by ParamsT::nPairs() to choose the layer-pair count
+      const bool earlyFishbone_;
+      const bool lateFishbone_;             // gates the CAFishbone and Kernel_fishboneCleaner launches
+      const bool doStats_;                  // gates the statistics kernels (e.g. Kernel_checkOverflows)
+      const bool doSharedHitCut_;           // gates the shared-hit duplicate rejection in classifyTuples
+      const bool dupPassThrough_;           // forwarded unchanged to the duplicate-removal kernels
+      const bool useSimpleTripletCleaner_;  // selects Kernel_simpleTripletCleaner over Kernel_tripletCleaner
+    };
+
+    // CA parameters common to all topologies (public base of every CAParamsT specialisation)
+    struct CACommon {
+      const uint32_t maxNumberOfDoublets_;  // sizes several kernel grids and is passed to GetDoubletsFromHisto
+      const uint32_t minHitsPerNtuplet_;    // values > 3 select the quadruplet layer-pair choices
+      const float ptmin_;
+      const float CAThetaCutBarrel_;
+      const float CAThetaCutForward_;
+      const float hardCurvCut_;
+      const float dcaCutInnerTriplet_;
+      const float dcaCutOuterTriplet_;
+    };
+
+    template <typename TrackerTraits, typename Enable = void>  // generic fallback: both predicates return false
+    struct CAParamsT : public CACommon {
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE bool startingLayerPair(int16_t pid) const { return false; };
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE bool startAt0(int16_t pid) const { return false; };
+    };
+
+    template <typename TrackerTraits>  // specialisation selected through pixelTopology::isPhase1Topology
+    struct CAParamsT<TrackerTraits, pixelTopology::isPhase1Topology<TrackerTraits>> : public CACommon {
+      /// Is this a starting layer pair?
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE bool startingLayerPair(int16_t pid) const {
+        return minHitsPerNtuplet_ > 3 ? pid < 3 : pid < 8 || pid > 12;  // more than 3 hits: pid < 3 only
+      }
+
+      /// Is this a pair with inner == 0?
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE bool startAt0(int16_t pid) const {
+        assert((pixelTopology::Phase1::layerPairs[pid * 2] == 0) ==
+               (pid < 3 || pid == 13 || pid == 15 || pid == 16));  // to be 100% sure it's working, may be removed
+        return pixelTopology::Phase1::layerPairs[pid * 2] == 0;  // entry 2 * pid of layerPairs is compared with 0
+      }
+    };
+
+    template <typename TrackerTraits>  // specialisation selected through pixelTopology::isPhase2Topology
+    struct CAParamsT<TrackerTraits, pixelTopology::isPhase2Topology<TrackerTraits>> : public CACommon {
+      const bool includeFarForwards_;  // read by ParamsT::nPairs() for the Phase2 topology
+      /// Is this a starting layer pair?
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE bool startingLayerPair(int16_t pid) const {
+        return pid < 33;  // in principle one could remove 5,6,7 23, 28 and 29
+      }
+
+      /// Is this a pair with inner == 0?
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE bool startAt0(int16_t pid) const {
+        assert((pixelTopology::Phase2::layerPairs[pid * 2] == 0) == ((pid < 3) | (pid >= 23 && pid < 28)));
+        return pixelTopology::Phase2::layerPairs[pid * 2] == 0;  // entry 2 * pid of layerPairs is compared with 0
+      }
+    };
+
+    // Full list of params = algo params + CA params + cell cuts + quality cuts
+    // Generic template, used when no topology-specific specialisation matches
+    template <typename TrackerTraits, typename Enable = void>
+    struct ParamsT : public AlgoParams {
+      // each pixel topology is expected to provide its own specialisation;
+      // nothing is defined here apart from a pair count of zero
+      inline uint32_t nPairs() const { return 0; }
+    };
+
+    template <typename TrackerTraits>  // specialisation selected through pixelTopology::isPhase1Topology
+    struct ParamsT<TrackerTraits, pixelTopology::isPhase1Topology<TrackerTraits>> : public AlgoParams {
+      using TT = TrackerTraits;
+      using QualityCuts = ::pixelTrack::QualityCutsT<TT>;  //track quality cuts
+      using CellCuts = caPixelDoublets::CellCutsT<TT>;     //cell building cuts
+      using CAParams = CAParamsT<TT>;                      //params to be used on device
+
+      ParamsT(AlgoParams const& commonCuts,
+              CellCuts const& cellCuts,
+              QualityCuts const& cutsCuts,
+              CAParams const& caParams)
+          : AlgoParams(commonCuts), cellCuts_(cellCuts), qualityCuts_(cutsCuts), caParams_(caParams) {}
+      // NOTE: this constructor initialises qualityCuts_ explicitly, so the default initialiser below is not used by it
+      const CellCuts cellCuts_;
+      const QualityCuts qualityCuts_{// polynomial coefficients for the pT-dependent chi2 cut
+                                     {0.68177776, 0.74609577, -0.08035491, 0.00315399},
+                                     // max pT used to determine the chi2 cut
+                                     10.,
+                                     // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit
+                                     30.,
+                                     // regional cuts for triplets
+                                     {
+                                         0.3,  // |Tip| < 0.3 cm
+                                         0.5,  // pT > 0.5 GeV
+                                         12.0  // |Zip| < 12.0 cm
+                                     },
+                                     // regional cuts for quadruplets
+                                     {
+                                         0.5,  // |Tip| < 0.5 cm
+                                         0.3,  // pT > 0.3 GeV
+                                         12.0  // |Zip| < 12.0 cm
+                                     }};
+      const CAParams caParams_;
+      /// Compute the number of layer pairs to use, from the jumping-doublet flag and minHitsPerNtuplet_
+      inline uint32_t nPairs() const {
+        // take all layer pairs into account
+        uint32_t nActualPairs = TT::nPairs;
+        if (not includeJumpingForwardDoublets_) {
+          // exclude forward "jumping" layer pairs
+          nActualPairs = TT::nPairsForTriplets;
+        }
+        if (caParams_.minHitsPerNtuplet_ > 3) {
+          // for quadruplets, exclude all "jumping" layer pairs (this overrides the choice made above)
+          nActualPairs = TT::nPairsForQuadruplets;
+        }
+
+        return nActualPairs;
+      }
+
+    };  // Params Phase1
+
+    template <typename TrackerTraits>  // specialisation selected through pixelTopology::isPhase2Topology
+    struct ParamsT<TrackerTraits, pixelTopology::isPhase2Topology<TrackerTraits>> : public AlgoParams {
+      using TT = TrackerTraits;
+      using QualityCuts = ::pixelTrack::QualityCutsT<TT>;
+      using CellCuts = caPixelDoublets::CellCutsT<TT>;
+      using CAParams = CAParamsT<TT>;
+
+      ParamsT(AlgoParams const& commonCuts,
+              CellCuts const& cellCuts,
+              QualityCuts const& qualityCuts,
+              CAParams const& caParams)
+          : AlgoParams(commonCuts), cellCuts_(cellCuts), qualityCuts_(qualityCuts), caParams_(caParams) {}
+      // NOTE(review): the zip/tip labels below look swapped w.r.t. the Phase1 cuts (|Tip| 0.3-0.5 cm, |Zip| 12 cm) - confirm
+      // cell cuts, track quality cuts and CA parameters (the constructor above always sets all three)
+      const CellCuts cellCuts_;
+      const QualityCuts qualityCuts_{5.0f, /*chi2*/ 0.9f, /* pT in GeV*/ 0.4f, /*zip in cm*/ 12.0f /*tip in cm*/};
+      const CAParams caParams_;
+
+      inline uint32_t nPairs() const {
+        // start from the minimal set of layer pairs
+        uint32_t nActualPairs = TT::nPairsMinimal;
+        if (caParams_.includeFarForwards_) {
+          // consider far forwards (> 11 & > 23)
+          nActualPairs = TT::nPairsFarForwards;
+        }
+        if (includeJumpingForwardDoublets_) {
+          // include jumping forwards (this overrides the far-forward choice made above)
+          nActualPairs = TT::nPairs;
+        }
+
+        return nActualPairs;
+      }
+
+    };  // Params Phase2
+
+    // statistics counters; a device buffer of this type is passed to the doStats_ kernels in classifyTuples
+    struct Counters {
+      unsigned long long nEvents;
+      unsigned long long nHits;
+      unsigned long long nCells;
+      unsigned long long nTuples;
+      unsigned long long nFitTracks;
+      unsigned long long nLooseTracks;
+      unsigned long long nGoodTracks;
+      unsigned long long nUsedHits;
+      unsigned long long nDupHits;
+      unsigned long long nFishCells;
+      unsigned long long nKilledCells;
+      unsigned long long nEmptyCells;
+      unsigned long long nZeroTrackCells;
+    };
+
+    using Quality = ::pixelTrack::Quality;
+
+  }  // namespace caHitNtupletGenerator
+
+  template <typename TTTraits>  // host-side driver that owns the device workspace and enqueues the CA kernels
+  class CAHitNtupletGeneratorKernels {
+  public:
+    using TrackerTraits = TTTraits;
+    using QualityCuts = ::pixelTrack::QualityCutsT<TrackerTraits>;
+    using CellCuts = caPixelDoublets::CellCutsT<TrackerTraits>;
+    using Params = caHitNtupletGenerator::ParamsT<TrackerTraits>;
+    using CAParams = caHitNtupletGenerator::CAParamsT<TrackerTraits>;
+    using Counters = caHitNtupletGenerator::Counters;
+
+    using HitsView = TrackingRecHitSoAView<TrackerTraits>;
+    using HitsConstView = TrackingRecHitSoAConstView<TrackerTraits>;
+    using TkSoAView = reco::TrackSoAView<TrackerTraits>;
+
+    using HitToTuple = caStructures::template HitToTupleT<TrackerTraits>;
+    using TupleMultiplicity = caStructures::template TupleMultiplicityT<TrackerTraits>;
+    struct Testttt {  // NOTE(review): looks like leftover test scaffolding; not referenced in the code visible here
+      TupleMultiplicity tm;
+    };
+    using CellNeighborsVector = caStructures::CellNeighborsVectorT<TrackerTraits>;
+    using CellNeighbors = caStructures::CellNeighborsT<TrackerTraits>;
+    using CellTracksVector = caStructures::CellTracksVectorT<TrackerTraits>;
+    using CellTracks = caStructures::CellTracksT<TrackerTraits>;
+    using OuterHitOfCellContainer = caStructures::OuterHitOfCellContainerT<TrackerTraits>;
+    using OuterHitOfCell = caStructures::OuterHitOfCellT<TrackerTraits>;
+
+    using CACell = CACellT<TrackerTraits>;
+
+    using Quality = ::pixelTrack::Quality;
+    using HitContainer = typename reco::TrackSoA<TrackerTraits>::HitContainer;
+
+    CAHitNtupletGeneratorKernels(Params const& params, uint32_t nhits, Queue& queue);
+    ~CAHitNtupletGeneratorKernels() = default;
+
+    TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.data(); }  // device pointer
+
+    void launchKernels(const HitsConstView& hh, TkSoAView& track_view, Queue& queue);
+
+    void classifyTuples(const HitsConstView& hh, TkSoAView& track_view, Queue& queue);
+
+    void buildDoublets(const HitsConstView& hh, Queue& queue);
+
+    static void printCounters();  // NOTE(review): the only definition visible in this patch is commented out
+
+  private:
+    // params (held by reference: the Params object must outlive this instance)
+    Params const& m_params;
+    cms::alpakatools::device_buffer<Device, Counters> counters_;
+
+    // workspace (device buffers, plus raw pointers that are handed to the kernels)
+    cms::alpakatools::device_buffer<Device, HitToTuple> device_hitToTuple_;
+    cms::alpakatools::device_buffer<Device, TupleMultiplicity> device_tupleMultiplicity_;
+    cms::alpakatools::device_buffer<Device, CACell[]> device_theCells_;
+    cms::alpakatools::device_buffer<Device, OuterHitOfCellContainer[]> device_isOuterHitOfCell_;
+    cms::alpakatools::device_buffer<Device, OuterHitOfCell> isOuterHitOfCell_;
+    cms::alpakatools::device_buffer<Device, CellNeighborsVector> device_theCellNeighbors_;
+    cms::alpakatools::device_buffer<Device, CellTracksVector> device_theCellTracks_;
+    cms::alpakatools::device_buffer<Device, unsigned char[]> cellStorage_;
+    cms::alpakatools::device_buffer<Device, CellCuts> device_cellCuts_;
+    CellNeighbors* device_theCellNeighborsContainer_;  // raw pointer passed to InitDoublets
+    CellTracks* device_theCellTracksContainer_;        // raw pointer passed to InitDoublets
+    cms::alpakatools::device_buffer<Device, cms::alpakatools::AtomicPairCounter::DoubleWord[]> device_storage_;
+    cms::alpakatools::AtomicPairCounter* device_hitTuple_apc_;
+    cms::alpakatools::AtomicPairCounter* device_hitToTuple_apc_;
+    cms::alpakatools::device_view<Device, uint32_t> device_nCells_;  // a device_view, unlike the buffers above
+  };
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+
+#endif  // RecoPixelVertexing_PixelTriplets_CAHitNtupletGeneratorKernels_h
diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h
new file mode 100644
index 0000000000000..dd6ba7c5cf51e
--- /dev/null
+++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h
@@ -0,0 +1,1048 @@
+//
+// Original Author: Felice Pantaleo, CERN
+//
+
+//#define GPU_DEBUG
+//#define NTUPLE_DEBUG
+
+#include <alpaka/alpaka.hpp>
+#include <cmath>
+#include <cstdint>
+#include <limits>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h"
+#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h"
+#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h"
+
+#include "CAStructures.h"
+#include "CAHitNtupletGeneratorKernels.h"
+#include "CACell.h"
+#include "CAFishbone.h"
+#include "CAPixelDoublets.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace caHitNtupletGeneratorKernels {
+
+    // Sentinel for "no track selected": the largest value representable in a uint16_t.
+    constexpr uint32_t tkNotFound = std::numeric_limits<uint16_t>::max();
+    // Largest finite float, used as the starting value when searching for a minimum score.
+    constexpr float maxScore = std::numeric_limits<float>::max();
+    // Factor multiplying the summed covariance terms in the duplicate-compatibility cuts
+    // (presumably (5 sigma)^2 -- TODO confirm).
+    constexpr float nSigma2 = 25.f;
+
+    // All of the aliases below are mostly there to avoid bringing around the relative namespace
+
+    template <typename TrackerTraits>
+    using HitToTuple = caStructures::HitToTupleT<TrackerTraits>;
+
+    template <typename TrackerTraits>
+    using TupleMultiplicity = caStructures::TupleMultiplicityT<TrackerTraits>;
+
+    template <typename TrackerTraits>
+    using CellNeighborsVector = caStructures::CellNeighborsVectorT<TrackerTraits>;
+
+    template <typename TrackerTraits>
+    using CellTracksVector = caStructures::CellTracksVectorT<TrackerTraits>;
+
+    template <typename TrackerTraits>
+    using OuterHitOfCell = caStructures::OuterHitOfCellT<TrackerTraits>;
+
+    using Quality = ::pixelTrack::Quality;
+
+    template <typename TrackerTraits>
+    using TkSoAView = reco::TrackSoAView<TrackerTraits>;
+
+    template <typename TrackerTraits>
+    using HitContainer = typename reco::TrackSoA<TrackerTraits>::HitContainer;
+
+    template <typename TrackerTraits>
+    using HitsConstView = typename CACellT<TrackerTraits>::HitsConstView;
+
+    template <typename TrackerTraits>
+    using QualityCuts = ::pixelTrack::QualityCutsT<TrackerTraits>;
+
+    template <typename TrackerTraits>
+    using CAParams = caHitNtupletGenerator::CAParamsT<TrackerTraits>;
+
+    using Counters = caHitNtupletGenerator::Counters;
+
+    // Kernel that accumulates statistics into `counters` and prints a diagnostic for every
+    // container found full or overflowing (tuples, cells, cell neighbours, cell tracks,
+    // hit-to-tuple map, per-cell neighbour/track lists, outer-hit lists).
+    //
+    //   tracks_view          track SoA view; only read by the NTUPLE_DEBUG consistency checks
+    //   tupleMultiplicity    its size() is added to counters->nFitTracks
+    //   hitToTuple           hit -> tuple association used by the overflow and zero-track checks
+    //   apc                  pair counter; get().first is compared to maxNumberOfQuadruplets
+    //   cells, nCells        doublet cells and their number
+    //   cellNeighbors        checked for full() when not null
+    //   cellTracks           checked for full() when not null
+    //   isOuterHitOfCell     per-hit lists of cells, each checked for full()
+    //   nHits                number of hits
+    //   maxNumberOfDoublets  capacity against which *nCells is compared
+    //   counters             statistics, updated with atomic additions
+    template <typename TrackerTraits>
+    class Kernel_checkOverflows {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    TkSoAView<TrackerTraits> tracks_view,
+                                    TupleMultiplicity<TrackerTraits> const *tupleMultiplicity,
+                                    HitToTuple<TrackerTraits> const *hitToTuple,
+                                    cms::alpakatools::AtomicPairCounter *apc,
+                                    CACellT<TrackerTraits> const *__restrict__ cells,
+                                    uint32_t const *__restrict__ nCells,
+                                    CellNeighborsVector<TrackerTraits> const *cellNeighbors,
+                                    CellTracksVector<TrackerTraits> const *cellTracks,
+                                    OuterHitOfCell<TrackerTraits> const *isOuterHitOfCell,
+                                    int32_t nHits,
+                                    uint32_t maxNumberOfDoublets,
+                                    Counters *counters) const {
+        auto &c = *counters;
+        // counters once per event
+        if (cms::alpakatools::once_per_grid(acc)) {
+          alpaka::atomicAdd(acc, &c.nEvents, 1ull, alpaka::hierarchy::Blocks{});
+          alpaka::atomicAdd(acc, &c.nHits, static_cast<unsigned long long>(nHits), alpaka::hierarchy::Blocks{});
+          alpaka::atomicAdd(acc, &c.nCells, static_cast<unsigned long long>(*nCells), alpaka::hierarchy::Blocks{});
+          alpaka::atomicAdd(
+              acc, &c.nTuples, static_cast<unsigned long long>(apc->get().first), alpaka::hierarchy::Blocks{});
+          alpaka::atomicAdd(acc,
+                            &c.nFitTracks,
+                            static_cast<unsigned long long>(tupleMultiplicity->size()),
+                            alpaka::hierarchy::Blocks{});
+        }
+
+        // Extra consistency checks, enabled with the NTUPLE_DEBUG switch defined at the top of the file
+#ifdef NTUPLE_DEBUG
+        if (cms::alpakatools::once_per_grid(acc)) {
+          printf("number of found cells %d \n found tuples %d with total hits %d out of %d\n",
+                 *nCells,
+                 apc->get().first,
+                 apc->get().second,
+                 nHits);
+          if (apc->get().first < TrackerTraits::maxNumberOfQuadruplets) {
+            ALPAKA_ASSERT_OFFLOAD(tracks_view.hitIndices().size(apc->get().first) == 0);
+            ALPAKA_ASSERT_OFFLOAD(tracks_view.hitIndices().size() == apc->get().second);
+          }
+        }
+        const auto ntNbins = tracks_view.hitIndices().nbins();
+
+        for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNbins)) {
+          if (tracks_view.hitIndices().size(idx) > TrackerTraits::maxHitsOnTrack)  // current real limit
+            printf("ERROR %d, %d\n", idx, tracks_view.hitIndices().size(idx));
+          ALPAKA_ASSERT_OFFLOAD(tracks_view.hitIndices().size(idx) <= TrackerTraits::maxHitsOnTrack);
+          for (auto ih = tracks_view.hitIndices().begin(idx); ih != tracks_view.hitIndices().end(idx); ++ih)
+            ALPAKA_ASSERT_OFFLOAD(int(*ih) < nHits);
+        }
+#endif
+
+        // global overflow diagnostics, printed once
+        if (cms::alpakatools::once_per_grid(acc)) {
+          if (apc->get().first >= TrackerTraits::maxNumberOfQuadruplets)
+            printf("Tuples overflow\n");
+          if (*nCells >= maxNumberOfDoublets)
+            printf("Cells overflow\n");
+          if (cellNeighbors && cellNeighbors->full())
+            printf("cellNeighbors overflow %d %d \n", cellNeighbors->capacity(), cellNeighbors->size());
+          if (cellTracks && cellTracks->full())
+            printf("cellTracks overflow\n");
+          if (int(hitToTuple->nOnes()) < nHits)
+            printf("ERROR hitToTuple  overflow %d %d\n", hitToTuple->nOnes(), nHits);
+#ifdef GPU_DEBUG
+          printf("size of cellNeighbors %d \n cellTracks %d \n hitToTuple %d \n",
+                 cellNeighbors->size(),
+                 cellTracks->size(),
+                 hitToTuple->size());
+#endif
+        }
+
+        // per-cell statistics and overflow diagnostics
+        const auto ntNCells = (*nCells);
+        for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) {
+          auto const &thisCell = cells[idx];
+          if (thisCell.hasFishbone() && !thisCell.isKilled())
+            alpaka::atomicAdd(acc, &c.nFishCells, 1ull, alpaka::hierarchy::Blocks{});
+          if (thisCell.outerNeighbors().full())  //++tooManyNeighbors[thisCell.theLayerPairId];
+            printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId());
+          if (thisCell.tracks().full())  //++tooManyTracks[thisCell.theLayerPairId];
+            printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId());
+          if (thisCell.isKilled())
+            alpaka::atomicAdd(acc, &c.nKilledCells, 1ull, alpaka::hierarchy::Blocks{});
+          // NOTE(review): cells for which unused() is false are counted as "empty"; confirm the polarity
+          if (!thisCell.unused())
+            alpaka::atomicAdd(acc, &c.nEmptyCells, 1ull, alpaka::hierarchy::Blocks{});
+          if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id())))
+            alpaka::atomicAdd(acc, &c.nZeroTrackCells, 1ull, alpaka::hierarchy::Blocks{});
+        }
+
+        // `container` is indexed directly here, while Kernel_connect only addresses it through
+        // operator[] for hit ids >= offset: presumably it holds nHits - offset entries (confirm
+        // against the allocation), so the scan stops there instead of running up to nHits.
+        // The bound is clamped at zero because the loop extent is converted to an unsigned index.
+        const int32_t nOuterHits = nHits > isOuterHitOfCell->offset ? nHits - isOuterHitOfCell->offset : 0;
+        for (auto idx : cms::alpakatools::elements_with_stride(acc, nOuterHits))
+          if ((*isOuterHitOfCell).container[idx].full())  // ++tooManyOuterHitOfCell;
+            printf("OuterHitOfCell overflow %d\n", idx);
+      }
+    };
+
+    // Kernel that sets the quality of every track attached to a killed cell to Quality::dup.
+    template <typename TrackerTraits>
+    class Kernel_fishboneCleaner {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    CACellT<TrackerTraits> const *cells,
+                                    uint32_t const *__restrict__ nCells,
+                                    TkSoAView<TrackerTraits> tracks_view) const {
+        const auto numberOfCells = *nCells;
+
+        for (auto cellIdx : cms::alpakatools::elements_with_stride(acc, numberOfCells)) {
+          auto const &cell = cells[cellIdx];
+          if (cell.isKilled()) {
+            // all the tracks going through a killed cell are rejected
+            for (auto trackIdx : cell.tracks())
+              tracks_view[trackIdx].quality() = Quality::dup;
+          }
+        }
+      }
+    };
+    // Among the tracks sharing a cell, reject (Quality::edup) those with fewer layers than
+    // the track with the most layers.
+    // It does not seem to affect efficiency in any way!
+    template <typename TrackerTraits>
+    class Kernel_earlyDuplicateRemover {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    CACellT<TrackerTraits> const *cells,
+                                    uint32_t const *__restrict__ nCells,
+                                    TkSoAView<TrackerTraits> tracks_view,
+                                    bool dupPassThrough) const {
+        ALPAKA_ASSERT_OFFLOAD(nCells);
+        const auto numberOfCells = *nCells;
+
+        for (auto cellIdx : cms::alpakatools::elements_with_stride(acc, numberOfCells)) {
+          auto const &cell = cells[cellIdx];
+
+          // nothing to compare unless at least two tracks share the cell
+          if (cell.tracks().size() >= 2) {
+            // largest number of layers among the tracks of this cell
+            int8_t mostLayers = 0;
+            for (auto trackIdx : cell.tracks()) {
+              auto layers = tracks_view[trackIdx].nLayers();
+              if (mostLayers < layers)
+                mostLayers = layers;
+            }
+
+            // if (mostLayers<4) continue;
+            // quad pass through (leave it here for tests)
+            //  mostLayers = std::min(4, mostLayers);
+
+            // the rejected quality cannot be loose
+            for (auto trackIdx : cell.tracks()) {
+              if (tracks_view[trackIdx].nLayers() < mostLayers)
+                tracks_view[trackIdx].quality() = Quality::edup;  // no race: same constant written by everybody
+            }
+          }
+        }
+      }
+    };
+
+    // assume the above (so, short tracks already removed)
+    //
+    // For every cell shared by at least two tracks:
+    //  1. compare all pairs of tracks still above the `reject` quality; when a pair is compatible
+    //     in state(2) and state(3) (squared difference within nSigma2 times the summed covariance
+    //     entries 9 and 12), reject the track of lower quality or, at equal quality, the one with
+    //     the larger |tip|;
+    //  2. among the tracks left above Quality::loose, keep the one with the highest quality and
+    //     smallest |tip|, and demote all the others to Quality::loose.
+    // `reject` is Quality::loose when dupPassThrough is set, Quality::dup otherwise.
+    template <typename TrackerTraits>
+    class Kernel_fastDuplicateRemover {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    CACellT<TrackerTraits> const *__restrict__ cells,
+                                    uint32_t const *__restrict__ nCells,
+                                    TkSoAView<TrackerTraits> tracks_view,
+                                    bool dupPassThrough) const {
+        // quality to mark rejected
+        auto const reject = dupPassThrough ? Quality::loose : Quality::dup;
+        constexpr auto loose = Quality::loose;
+
+        ALPAKA_ASSERT_OFFLOAD(nCells);
+        const auto ntNCells = (*nCells);
+
+        for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) {
+          auto const &thisCell = cells[idx];
+          if (thisCell.tracks().size() < 2)
+            continue;
+
+          // best (smallest) score found so far and the index of the track holding it
+          float mc = maxScore;
+          uint16_t im = tkNotFound;
+
+          // score of a track: absolute value of its tip; smaller is better
+          auto score = [&](auto it) { return std::abs(TracksUtilities<TrackerTraits>::tip(tracks_view, it)); };
+
+          // full crazy combinatorics
+          int ntr = thisCell.tracks().size();
+          for (int i = 0; i < ntr - 1; ++i) {
+            auto it = thisCell.tracks()[i];
+            auto qi = tracks_view[it].quality();
+            if (qi <= reject)
+              continue;
+            // state components 2 and 3 with their covariance entries 9 and 12
+            // (presumably 1/pt and cot(theta) with their variances -- TODO confirm)
+            auto opi = tracks_view[it].state()(2);
+            auto e2opi = tracks_view[it].covariance()(9);
+            auto cti = tracks_view[it].state()(3);
+            auto e2cti = tracks_view[it].covariance()(12);
+            for (auto j = i + 1; j < ntr; ++j) {
+              auto jt = thisCell.tracks()[j];
+              auto qj = tracks_view[jt].quality();
+              if (qj <= reject)
+                continue;
+              auto opj = tracks_view[jt].state()(2);
+              auto ctj = tracks_view[jt].state()(3);
+              // not a duplicate if the two tracks differ too much in state(3) ...
+              auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti);
+              if ((cti - ctj) * (cti - ctj) > dct)
+                continue;
+              // ... or in state(2)
+              auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi);
+              if ((opi - opj) * (opi - opj) > dop)
+                continue;
+              // compatible pair: drop the worse of the two
+              if ((qj < qi) || (qj == qi && score(it) < score(jt)))
+                tracks_view[jt].quality() = reject;
+              else {
+                // track i has been rejected: no point in comparing it any further
+                tracks_view[it].quality() = reject;
+                break;
+              }
+            }
+          }
+
+          // find maxQual
+          auto maxQual = reject;  // no duplicate!
+          for (auto it : thisCell.tracks()) {
+            if (tracks_view[it].quality() > maxQual)
+              maxQual = tracks_view[it].quality();
+          }
+
+          // nothing above loose is left in this cell
+          if (maxQual <= loose)
+            continue;
+
+          // find min score
+          for (auto it : thisCell.tracks()) {
+            if (tracks_view[it].quality() == maxQual && score(it) < mc) {
+              mc = score(it);
+              im = it;
+            }
+          }
+
+          if (tkNotFound == im)
+            continue;
+
+          // mark all other duplicates  (not yet, keep it loose)
+          for (auto it : thisCell.tracks()) {
+            if (tracks_view[it].quality() > loose && it != im)
+              tracks_view[it].quality() = loose;  //no race:  simple assignment of the same constant
+          }
+        }
+      }
+    };
+
+    // Kernel that links compatible cells: for every cell whose inner hit id is at or beyond
+    // isOuterHitOfCell->offset, the cells listed in isOuterHitOfCell for that hit are tested
+    // against it; each one passing the alignment (areAlignedRZ) and dcaCut requirements gets
+    // this cell added as an outer neighbour, and both cells are flagged as used.
+    // The kernel also resets the two pair counters apc1 and apc2.
+    template <typename TrackerTraits>
+    class Kernel_connect {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    cms::alpakatools::AtomicPairCounter *apc1,
+                                    cms::alpakatools::AtomicPairCounter *apc2,  // just to zero them
+                                    HitsConstView<TrackerTraits> hh,
+                                    CACellT<TrackerTraits> *cells,
+                                    uint32_t *nCells,
+                                    CellNeighborsVector<TrackerTraits> *cellNeighbors,
+                                    OuterHitOfCell<TrackerTraits> const *isOuterHitOfCell,
+                                    CAParams<TrackerTraits> params) const {
+        using Cell = CACellT<TrackerTraits>;
+
+        // dimension 0 (Y) runs over the cells, dimension 1 (X) over their candidate neighbours
+        const uint32_t dimIndexY = 0u;
+        const uint32_t dimIndexX = 1u;
+        // grid-wide thread index along Y and block-local thread index along X
+        const uint32_t threadIdxY(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[dimIndexY]);
+        const uint32_t threadIdxLocalX(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[dimIndexX]);
+
+        if (0 == (threadIdxY + threadIdxLocalX)) {
+          (*apc1) = 0;
+          (*apc2) = 0;
+        }  // ready for next kernel
+
+        constexpr uint32_t last_bpix1_detIndex = TrackerTraits::last_bpix1_detIndex;
+        constexpr uint32_t last_barrel_detIndex = TrackerTraits::last_barrel_detIndex;
+
+        cms::alpakatools::for_each_element_in_grid_strided(
+            acc,
+            (*nCells),
+            0u,
+            [&](uint32_t idx) {
+              auto cellIndex = idx;
+              auto &thisCell = cells[idx];
+              auto innerHitId = thisCell.inner_hit_id();
+              // only hits at or beyond the offset have an entry in isOuterHitOfCell
+              if (int(innerHitId) >= isOuterHitOfCell->offset) {
+                // cells listed for the inner hit of this cell: the candidates to be connected
+                uint32_t numberOfPossibleNeighbors = (*isOuterHitOfCell)[innerHitId].size();
+                auto vi = (*isOuterHitOfCell)[innerHitId].data();
+
+                auto ri = thisCell.inner_r(hh);
+                auto zi = thisCell.inner_z(hh);
+
+                auto ro = thisCell.outer_r(hh);
+                auto zo = thisCell.outer_z(hh);
+                // selects which of the two theta cuts is used
+                auto isBarrel = thisCell.inner_detIndex(hh) < last_barrel_detIndex;
+
+                cms::alpakatools::for_each_element_in_block_strided(
+                    acc,
+                    numberOfPossibleNeighbors,
+                    0u,
+                    [&](uint32_t j) {
+                      auto otherCell = (vi[j]);
+                      auto &oc = cells[otherCell];
+                      auto r1 = oc.inner_r(hh);
+                      auto z1 = oc.inner_z(hh);
+                      bool aligned = Cell::areAlignedRZ(
+                          r1,
+                          z1,
+                          ri,
+                          zi,
+                          ro,
+                          zo,
+                          params.ptmin_,
+                          isBarrel ? params.CAThetaCutBarrel_
+                                   : params.CAThetaCutForward_);  // 2.f*thetaCut); // FIXME tune cuts
+                      // the dca cut depends on whether the other cell starts before last_bpix1_detIndex
+                      if (aligned &&
+                          thisCell.dcaCut(hh,
+                                          oc,
+                                          oc.inner_detIndex(hh) < last_bpix1_detIndex ? params.dcaCutInnerTriplet_
+                                                                                      : params.dcaCutOuterTriplet_,
+                                          params.hardCurvCut_)) {  // FIXME tune cuts
+                        // record this cell as an outer neighbour of the other one and flag both as used
+                        oc.addOuterNeighbor(acc, cellIndex, *cellNeighbors);
+                        thisCell.setStatusBits(Cell::StatusBit::kUsed);
+                        oc.setStatusBits(Cell::StatusBit::kUsed);
+                      }
+                    },
+                    dimIndexX);  // loop on inner cells
+              }
+            },
+            dimIndexY);  // loop on outer cells
+      }
+    };
+    // Kernel that starts the ntuplet search (CACellT::find_ntuplets) from every live cell that
+    // has at least one outer neighbour and belongs to a starting layer pair.
+    template <typename TrackerTraits>
+    class Kernel_find_ntuplets {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    HitsConstView<TrackerTraits> hh,
+                                    TkSoAView<TrackerTraits> tracks_view,
+                                    CACellT<TrackerTraits> *__restrict__ cells,
+                                    uint32_t const *nCells,
+                                    CellTracksVector<TrackerTraits> *cellTracks,
+                                    cms::alpakatools::AtomicPairCounter *apc,
+                                    CAParams<TrackerTraits> params) const {
+        // recursive: not obvious to widen
+
+        using Cell = CACellT<TrackerTraits>;
+
+#ifdef GPU_DEBUG
+        if (cms::alpakatools::once_per_grid(acc))
+          printf("starting producing ntuplets from %d cells \n", *nCells);
+#endif
+
+        for (auto cellIdx : cms::alpakatools::elements_with_stride(acc, (*nCells))) {
+          auto const &cell = cells[cellIdx];
+
+          // skip the cells cut by earlyFishbone and, since at least three hits are required,
+          // those without any outer neighbour
+          if (cell.isKilled() || cell.outerNeighbors().empty())
+            continue;
+
+          // only the cells of a starting layer pair can seed an ntuplet
+          auto pairId = cell.layerPairId();
+          if (!params.startingLayerPair(pairId))
+            continue;
+
+          constexpr uint32_t maxDepth = TrackerTraits::maxDepth;
+
+          typename Cell::TmpTuple stack;
+          stack.reset();
+          bool bpix1Start = params.startAt0(pairId);
+          cell.template find_ntuplets<maxDepth, TAcc>(acc,
+                                                      hh,
+                                                      cells,
+                                                      *cellTracks,
+                                                      tracks_view.hitIndices(),
+                                                      *apc,
+                                                      tracks_view.quality(),
+                                                      stack,
+                                                      params.minHitsPerNtuplet_,
+                                                      bpix1Start);
+          // the search must leave the temporary stack empty
+          ALPAKA_ASSERT_OFFLOAD(stack.empty());
+        }
+      }
+    };
+
+    // Kernel that sets the kInTrack status bit on every cell with a non-empty track list.
+    template <typename TrackerTraits>
+    class Kernel_mark_used {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    CACellT<TrackerTraits> *__restrict__ cells,
+                                    uint32_t const *nCells) const {
+        using Cell = CACellT<TrackerTraits>;
+        for (auto cellIdx : cms::alpakatools::elements_with_stride(acc, (*nCells))) {
+          auto &cell = cells[cellIdx];
+          if (cell.tracks().empty())
+            continue;
+          cell.setStatusBits(Cell::StatusBit::kInTrack);
+        }
+      }
+    };
+
+    // Kernel that counts, in tupleMultiplicity, the tuples with at least three hits that were
+    // not marked Quality::edup, binned by their number of hits.
+    template <typename TrackerTraits>
+    class Kernel_countMultiplicity {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    TkSoAView<TrackerTraits> tracks_view,
+                                    TupleMultiplicity<TrackerTraits> *tupleMultiplicity) const {
+        for (auto trackIdx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) {
+          auto hitsOnTrack = tracks_view.hitIndices().size(trackIdx);
+          // too short, or already removed as an early duplicate
+          if (hitsOnTrack < 3 || tracks_view[trackIdx].quality() == Quality::edup)
+            continue;
+          ALPAKA_ASSERT_OFFLOAD(tracks_view[trackIdx].quality() == Quality::bad);
+          // current limit
+          if (hitsOnTrack > TrackerTraits::maxHitsOnTrack)
+            printf("wrong mult %d %d\n", trackIdx, hitsOnTrack);
+          ALPAKA_ASSERT_OFFLOAD(hitsOnTrack <= TrackerTraits::maxHitsOnTrack);
+          tupleMultiplicity->count(acc, hitsOnTrack);
+        }
+      }
+    };
+
+    // Kernel that fills tupleMultiplicity with the indices of the tuples having at least three
+    // hits and not marked Quality::edup, binned by their number of hits.
+    template <typename TrackerTraits>
+    class Kernel_fillMultiplicity {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    TkSoAView<TrackerTraits> tracks_view,
+                                    TupleMultiplicity<TrackerTraits> *tupleMultiplicity) const {
+        for (auto trackIdx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) {
+          auto hitsOnTrack = tracks_view.hitIndices().size(trackIdx);
+          const bool tooShort = hitsOnTrack < 3;
+          // skip the short tuples and those already removed as early duplicates
+          if (tooShort || tracks_view[trackIdx].quality() == Quality::edup)
+            continue;
+          ALPAKA_ASSERT_OFFLOAD(tracks_view[trackIdx].quality() == Quality::bad);
+          if (hitsOnTrack > TrackerTraits::maxHitsOnTrack)
+            printf("wrong mult %d %d\n", trackIdx, hitsOnTrack);
+          ALPAKA_ASSERT_OFFLOAD(hitsOnTrack <= TrackerTraits::maxHitsOnTrack);
+          tupleMultiplicity->fill(acc, hitsOnTrack, trackIdx);
+        }
+      }
+    };
+
+    // Kernel that assigns a quality to the fitted tracks: tracks with at least three hits and
+    // no NaN among the five state parameters become strict, then tight if they are not removed
+    // by cuts.strictCut, then highPurity if cuts.isHP accepts them.
+    template <typename TrackerTraits>
+    class Kernel_classifyTracks {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    TkSoAView<TrackerTraits> tracks_view,
+                                    QualityCuts<TrackerTraits> cuts) const {
+        for (auto trackIdx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) {
+          auto hitsOnTrack = tracks_view.hitIndices().size(trackIdx);
+          // guard: an empty tuple ends the loop
+          if (hitsOnTrack == 0)
+            break;
+
+          // early duplicates are not even fitted
+          if (tracks_view[trackIdx].quality() == Quality::edup)
+            continue;
+
+          ALPAKA_ASSERT_OFFLOAD(tracks_view[trackIdx].quality() == Quality::bad);
+
+          // doublets stay bad
+          if (hitsOnTrack < 3)
+            continue;
+
+          // a fit with any invalid parameter stays bad as well
+          bool anyNaN = false;
+          for (int par = 0; par < 5; ++par) {
+            if (std::isnan(tracks_view[trackIdx].state()(par)))
+              anyNaN = true;
+          }
+          if (anyNaN) {
+#ifdef NTUPLE_DEBUG
+            printf("NaN in fit %d size %d chi2 %f\n",
+                   trackIdx,
+                   tracks_view.hitIndices().size(trackIdx),
+                   tracks_view[trackIdx].chi2());
+#endif
+            continue;
+          }
+
+          tracks_view[trackIdx].quality() = Quality::strict;
+
+          if (!cuts.strictCut(tracks_view, trackIdx)) {
+            tracks_view[trackIdx].quality() = Quality::tight;
+
+            if (cuts.isHP(tracks_view, hitsOnTrack, trackIdx))
+              tracks_view[trackIdx].quality() = Quality::highPurity;
+          }
+        }
+      }
+    };
+
+    // Kernel that counts the tracks of quality at least loose (nLooseTracks) and, among them,
+    // those of quality at least strict (nGoodTracks).
+    template <typename TrackerTraits>
+    class Kernel_doStatsForTracks {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc, TkSoAView<TrackerTraits> tracks_view, Counters *counters) const {
+        for (auto trackIdx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) {
+          // guard: an empty tuple ends the loop
+          if (tracks_view.hitIndices().size(trackIdx) == 0)
+            break;
+          auto const trackQuality = tracks_view[trackIdx].quality();
+          if (trackQuality >= Quality::loose) {
+            alpaka::atomicAdd(acc, &(counters->nLooseTracks), 1ull, alpaka::hierarchy::Blocks{});
+            if (trackQuality >= Quality::strict)
+              alpaka::atomicAdd(acc, &(counters->nGoodTracks), 1ull, alpaka::hierarchy::Blocks{});
+          }
+        }
+      }
+    };
+
+    // Kernel that counts in hitToTuple every hit of every tuple.
+    template <typename TrackerTraits>
+    class Kernel_countHitInTracks {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    TkSoAView<TrackerTraits> tracks_view,
+                                    HitToTuple<TrackerTraits> *hitToTuple) const {
+        for (auto trackIdx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) {
+          // guard: an empty tuple ends the loop
+          if (tracks_view.hitIndices().size(trackIdx) == 0)
+            break;
+          auto hit = tracks_view.hitIndices().begin(trackIdx);
+          auto const lastHit = tracks_view.hitIndices().end(trackIdx);
+          while (hit != lastHit) {
+            hitToTuple->count(acc, *hit);
+            ++hit;
+          }
+        }
+      }
+    };
+
+    // Kernel that records in hitToTuple, for every hit of every tuple, the index of the tuple.
+    template <typename TrackerTraits>
+    class Kernel_fillHitInTracks {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    TkSoAView<TrackerTraits> tracks_view,
+                                    HitToTuple<TrackerTraits> *hitToTuple) const {
+        for (auto trackIdx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) {
+          // guard: an empty tuple ends the loop
+          if (tracks_view.hitIndices().size(trackIdx) == 0)
+            break;
+          auto hit = tracks_view.hitIndices().begin(trackIdx);
+          auto const lastHit = tracks_view.hitIndices().end(trackIdx);
+          while (hit != lastHit) {
+            hitToTuple->fill(acc, *hit, trackIdx);
+            ++hit;
+          }
+        }
+      }
+    };
+
+    // Kernel that builds detIndices from hitIndices: the offsets are copied as they are, and
+    // every stored hit index is replaced by the detector index of that hit.
+    template <typename TrackerTraits>
+    class Kernel_fillHitDetIndices {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    TkSoAView<TrackerTraits> tracks_view,
+                                    HitsConstView<TrackerTraits> hh) const {
+        // the two containers share the same offsets
+        for (auto bin : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) {
+          tracks_view.detIndices().off[bin] = tracks_view.hitIndices().off[bin];
+        }
+        // translate each hit index into the detector index of the hit
+        for (auto slot : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().size())) {
+          auto const hitIdx = tracks_view.hitIndices().content[slot];
+          ALPAKA_ASSERT_OFFLOAD(hitIdx < (uint32_t)hh.metadata().size());
+          tracks_view.detIndices().content[slot] = hh[hitIdx].detectorIndex();
+        }
+      }
+    };
+
+    // Kernel that stores the number of tracks in the SoA and computes the number of layers of
+    // each of them.
+    template <typename TrackerTraits>
+    class Kernel_fillNLayers {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    TkSoAView<TrackerTraits> tracks_view,
+                                    cms::alpakatools::AtomicPairCounter *apc) const {
+        // the number of tracks cannot exceed the capacity of the SoA
+        const int found = apc->get().first;
+        const int capacity = tracks_view.metadata().size() - 1;
+        const int ntracks = capacity < found ? capacity : found;
+
+        if (cms::alpakatools::once_per_grid(acc))
+          tracks_view.nTracks() = ntracks;
+        for (auto trackIdx : cms::alpakatools::elements_with_stride(acc, ntracks)) {
+          ALPAKA_ASSERT_OFFLOAD(TracksUtilities<TrackerTraits>::nHits(tracks_view, trackIdx) >= 3);
+          tracks_view[trackIdx].nLayers() = TracksUtilities<TrackerTraits>::computeNumberOfLayers(tracks_view, trackIdx);
+        }
+      }
+    };
+
+    // Kernel that counts the hits associated to at least one tuple (nUsedHits) and those
+    // associated to more than one (nDupHits).
+    template <typename TrackerTraits>
+    class Kernel_doStatsForHitInTracks {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    HitToTuple<TrackerTraits> const *__restrict__ hitToTuple,
+                                    Counters *counters) const {
+        for (auto hitIdx : cms::alpakatools::elements_with_stride(acc, hitToTuple->nOnes())) {
+          auto const tuplesOnHit = hitToTuple->size(hitIdx);
+          // an unused hit is skipped: this SHALL NOT BE a break
+          if (tuplesOnHit == 0)
+            continue;
+          alpaka::atomicAdd(acc, &(counters->nUsedHits), 1ull, alpaka::hierarchy::Blocks{});
+          if (tuplesOnHit > 1)
+            alpaka::atomicAdd(acc, &(counters->nDupHits), 1ull, alpaka::hierarchy::Blocks{});
+        }
+      }
+    };
+
+    // Kernel that counts, for every tuple with at most three hits, how many of its hits are
+    // shared by at least two tracks of quality loose or better.
+    //
+    //   nshared      per-tuple counter of shared hits, incremented atomically
+    //   ptuples      hits of each tuple; only the tuple sizes are read
+    //   quality      per-tuple quality
+    //   phitToTuple  for each hit, the tuples that use it
+    template <typename TrackerTraits>
+    class Kernel_countSharedHit {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    int *__restrict__ nshared,
+                                    HitContainer<TrackerTraits> const *__restrict__ ptuples,
+                                    Quality const *__restrict__ quality,
+                                    HitToTuple<TrackerTraits> const *__restrict__ phitToTuple) const {
+        constexpr auto loose = Quality::loose;
+
+        auto &hitToTuple = *phitToTuple;
+        auto const &foundNtuplets = *ptuples;
+        // hitToTuple is a reference: its members are accessed with '.', not '->'
+        for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nbins())) {
+          if (hitToTuple.size(idx) < 2)
+            continue;
+
+          int nt = 0;
+
+          // count "good" tracks
+          for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
+            if (quality[*it] < loose)
+              continue;
+            ++nt;
+          }
+
+          if (nt < 2)
+            continue;
+
+          // now mark  each track triplet as sharing a hit
+          for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
+            if (foundNtuplets.size(*it) > 3)
+              continue;
+            // the increment must have the same type as the counter (int) for atomicAdd to deduce it
+            alpaka::atomicAdd(acc, &nshared[*it], 1, alpaka::hierarchy::Blocks{});
+          }
+
+        }  //  hit loop
+      }
+    };
+
+    // Kernel that rejects the tuples sharing more than two hits, according to nshared.
+    //
+    //   nshared         per-tuple number of shared hits
+    //   tuples          hits of each tuple; only nbins() and the tuple sizes are read
+    //   quality         per-tuple quality, lowered to the rejected value where needed
+    //   dupPassThrough  if set, rejected tuples are marked loose instead of dup
+    template <typename TrackerTraits>
+    class Kernel_markSharedHit {
+    public:  // the call operator must be accessible for the kernel to be launched
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    int const *__restrict__ nshared,
+                                    HitContainer<TrackerTraits> const *__restrict__ tuples,
+                                    Quality *__restrict__ quality,
+                                    bool dupPassThrough) const {
+        // constexpr auto bad = Quality::bad;
+        constexpr auto dup = Quality::dup;
+        constexpr auto loose = Quality::loose;
+        // constexpr auto strict = Quality::strict;
+
+        // quality to mark rejected
+        auto const reject = dupPassThrough ? loose : dup;
+        for (auto idx : cms::alpakatools::elements_with_stride(acc, tuples->nbins())) {
+          if (tuples->size(idx) == 0)
+            break;  //guard
+          // already at or below the rejected quality: nothing to do
+          if (quality[idx] <= reject)
+            continue;
+          if (nshared[idx] > 2)
+            quality[idx] = reject;
+        }
+      }
+    };
+
+    // mostly for very forward triplets.....
+    //
+    // Kernel functor resolving duplicates among the tracks that share a hit.
+    // Every pair of tracks on the same hit with quality above `reject` is compared:
+    // if state components 3 and 2 agree within nSigma2 times the summed
+    // covariance entries 12 and 9 respectively, one track of the pair is set to `reject`.
+    // The survivor is the one with more layers, then the higher quality, then the
+    // smaller |tip|.
+    template <typename TrackerTraits>
+    class Kernel_rejectDuplicate {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    TkSoAView<TrackerTraits> tracks_view,
+                                    uint16_t nmin,
+                                    bool dupPassThrough,
+                                    HitToTuple<TrackerTraits> const *__restrict__ phitToTuple) const {
+        // quality assigned to the losing track of a pair
+        auto const rejected = dupPassThrough ? Quality::loose : Quality::dup;
+
+        auto &hitToTuple = *phitToTuple;
+
+        // tie-breaker: absolute transverse impact parameter of a track
+        auto absTip = [&](auto trk) { return std::abs(TracksUtilities<TrackerTraits>::tip(tracks_view, trk)); };
+
+        for (auto hit : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) {
+          if (hitToTuple.size(hit) < 2) {
+            continue;
+          }
+
+          auto const last = hitToTuple.end(hit);
+
+          // all the (i, j > i) pairs of tracks on this hit
+          for (auto pi = hitToTuple.begin(hit); pi < last - 1; ++pi) {
+            auto const it = *pi;
+            auto const qi = tracks_view[it].quality();
+            if (qi <= rejected) {
+              continue;
+            }
+            auto const opi = tracks_view[it].state()(2);
+            auto const e2opi = tracks_view[it].covariance()(9);
+            auto const cti = tracks_view[it].state()(3);
+            auto const e2cti = tracks_view[it].covariance()(12);
+            auto const nli = tracks_view[it].nLayers();
+
+            for (auto pj = pi + 1; pj < last; ++pj) {
+              auto const jt = *pj;
+              auto const qj = tracks_view[jt].quality();
+              if (qj <= rejected) {
+                continue;
+              }
+
+              // compatibility of state component 3
+              auto const dct = cti - tracks_view[jt].state()(3);
+              if (dct * dct > nSigma2 * (tracks_view[jt].covariance()(12) + e2cti)) {
+                continue;
+              }
+              // compatibility of state component 2
+              auto const dop = opi - tracks_view[jt].state()(2);
+              if (dop * dop > nSigma2 * (tracks_view[jt].covariance()(9) + e2opi)) {
+                continue;
+              }
+
+              auto const nlj = tracks_view[jt].nLayers();
+              bool const keepI = nlj < nli || (nlj == nli && (qj < qi || (qj == qi && absTip(it) < absTip(jt))));
+              if (not keepI) {
+                // track i lost: nothing left to compare it with
+                tracks_view[it].quality() = rejected;
+                break;
+              }
+              tracks_view[jt].quality() = rejected;
+            }
+          }
+        }
+      }
+    };
+
+    // Kernel functor: on every hit used by at least two tracks, demote to `reject`
+    // the tracks that cross fewer layers than the longest highPurity track on that hit.
+    // Nothing is done when that longest track has fewer than 4 layers.
+    // Tracks with more than nmin layers are spared when the hit index is below
+    // hh.hitsLayerStart()[1] (hit on bpix1, according to the original comment).
+    template <typename TrackerTraits>
+    class Kernel_sharedHitCleaner {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    HitsConstView<TrackerTraits> hh,
+                                    TkSoAView<TrackerTraits> tracks_view,
+                                    int nmin,
+                                    bool dupPassThrough,
+                                    HitToTuple<TrackerTraits> const *__restrict__ phitToTuple) const {
+        // quality assigned to the demoted tracks
+        auto const rejected = dupPassThrough ? Quality::loose : Quality::dup;
+        // minimal quality of the tracks defining the reference length
+        auto const refQuality = Quality::highPurity;
+
+        auto &hitToTuple = *phitToTuple;
+
+        uint32_t const firstLayerEnd = hh.hitsLayerStart()[1];
+
+        for (auto hit : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) {
+          if (hitToTuple.size(hit) < 2) {
+            continue;
+          }
+
+          // largest number of layers among the tracks of sufficient quality
+          int8_t longest = 0;
+          for (auto trk = hitToTuple.begin(hit); trk != hitToTuple.end(hit); ++trk) {
+            if (tracks_view[*trk].quality() >= refQuality) {
+              auto const layers = tracks_view[*trk].nLayers();
+              if (layers > longest) {
+                longest = layers;
+              }
+            }
+          }
+
+          if (longest < 4) {
+            continue;
+          }
+
+          // quad pass through (leave for tests)
+          // longest = std::min(4, longest);
+
+          bool const onFirstLayer = hit < firstLayerEnd;
+
+          // demote all the tracks shorter than the longest one
+          for (auto trk = hitToTuple.begin(hit); trk != hitToTuple.end(hit); ++trk) {
+            auto const layers = tracks_view[*trk].nLayers();
+
+            // shared hit on the first layer and tuple long enough: keep it
+            bool const spared = onFirstLayer and layers > nmin;
+            if (not spared && layers < longest && tracks_view[*trk].quality() > rejected) {
+              tracks_view[*trk].quality() = rejected;
+            }
+          }
+        }
+      }
+    };
+    // Kernel functor: on every hit used by at least two tracks, if all the tracks
+    // with quality above strict are triplets, keep only the one with the smallest
+    // |tip| (chosen among the tracks of quality >= strict) and set all the others
+    // that are above loose to loose.
+    // nmin and dupPassThrough are not used by this kernel.
+    template <typename TrackerTraits>
+    class Kernel_tripletCleaner {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    TkSoAView<TrackerTraits> tracks_view,
+                                    uint16_t nmin,
+                                    bool dupPassThrough,
+                                    HitToTuple<TrackerTraits> const *__restrict__ phitToTuple) const {
+        // quality assigned to the discarded tracks
+        auto const rejected = Quality::loose;
+        // minimal quality of a good track
+        auto const goodQuality = Quality::strict;
+
+        auto &hitToTuple = *phitToTuple;
+
+        for (auto hit : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) {
+          if (hitToTuple.size(hit) < 2) {
+            continue;
+          }
+
+          // is there any track above the good quality that is not a triplet?
+          bool onlyTriplets = true;
+          for (auto trk = hitToTuple.begin(hit); trk != hitToTuple.end(hit); ++trk) {
+            if (tracks_view[*trk].quality() <= goodQuality) {
+              continue;
+            }
+            if (not TracksUtilities<TrackerTraits>::isTriplet(tracks_view, *trk)) {
+              onlyTriplets = false;
+              break;
+            }
+          }
+          if (not onlyTriplets) {
+            continue;
+          }
+
+          // for triplets choose best tip!  (should we first find best quality???)
+          float bestTip = maxScore;
+          uint16_t best = tkNotFound;
+          for (auto trk = hitToTuple.begin(hit); trk != hitToTuple.end(hit); ++trk) {
+            auto const it = *trk;
+            if (tracks_view[it].quality() < goodQuality) {
+              continue;
+            }
+            auto const absTip = std::abs(TracksUtilities<TrackerTraits>::tip(tracks_view, it));
+            if (absTip < bestTip) {
+              bestTip = absTip;
+              best = it;
+            }
+          }
+
+          if (best == tkNotFound) {
+            continue;
+          }
+
+          // mark worse ambiguities
+          for (auto trk = hitToTuple.begin(hit); trk != hitToTuple.end(hit); ++trk) {
+            auto const it = *trk;
+            if (tracks_view[it].quality() > rejected && it != best) {
+              tracks_view[it].quality() = rejected;  //no race:  simple assignment of the same constant
+            }
+          }
+
+        }  // loop over hits
+      }
+    };
+
+    // Kernel functor: on every hit used by at least two tracks, find the track of
+    // quality >= loose with the smallest |tip| and set to loose every other
+    // triplet on that hit whose quality is above loose.
+    // nmin and dupPassThrough are not used by this kernel.
+    template <typename TrackerTraits>
+    class Kernel_simpleTripletCleaner {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    TkSoAView<TrackerTraits> tracks_view,
+                                    uint16_t nmin,
+                                    bool dupPassThrough,
+                                    HitToTuple<TrackerTraits> const *__restrict__ phitToTuple) const {
+        // quality assigned to the discarded tracks
+        auto const rejected = Quality::loose;
+        // minimal quality of a good track
+        auto const goodQuality = Quality::loose;
+
+        auto &hitToTuple = *phitToTuple;
+
+        for (auto hit : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) {
+          if (hitToTuple.size(hit) < 2) {
+            continue;
+          }
+
+          // choose best tip!  (should we first find best quality???)
+          float bestTip = maxScore;
+          uint16_t best = tkNotFound;
+          for (auto trk = hitToTuple.begin(hit); trk != hitToTuple.end(hit); ++trk) {
+            auto const it = *trk;
+            if (tracks_view[it].quality() < goodQuality) {
+              continue;
+            }
+            auto const absTip = std::abs(TracksUtilities<TrackerTraits>::tip(tracks_view, it));
+            if (absTip < bestTip) {
+              bestTip = absTip;
+              best = it;
+            }
+          }
+
+          if (best == tkNotFound) {
+            continue;
+          }
+
+          // mark worse ambiguities
+          for (auto trk = hitToTuple.begin(hit); trk != hitToTuple.end(hit); ++trk) {
+            auto const it = *trk;
+            if (tracks_view[it].quality() <= rejected) {
+              continue;
+            }
+            if (not TracksUtilities<TrackerTraits>::isTriplet(tracks_view, it)) {
+              continue;
+            }
+            if (it != best) {
+              tracks_view[it].quality() = rejected;  //no race:  simple assignment of the same constant
+            }
+          }
+
+        }  // loop over hits
+      }
+    };
+
+    // Debug kernel functor printing one "TK:" line per track.
+    //
+    // Only tracks with index in [firstPrint, lastPrint), at least 3 hits and
+    // quality >= loose are printed. Each line reports the track identifier
+    // (10000 * iev + index), quality, number of hits and layers, charge, pt, eta,
+    // phi, tip, zip, chi2 and the global z of the first six hits and of the last one
+    // (0 is printed for the missing ones).
+    //   hh            hits, read to get zGlobal()
+    //   tracks_view   tracks to print
+    //   phitToTuple   not used by this kernel
+    //   firstPrint    index of the first track to print
+    //   lastPrint     one past the index of the last track to print
+    //   iev           event number, used to build the track identifier
+    template <typename TrackerTraits>
+    class Kernel_print_found_ntuplets {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                    HitsConstView<TrackerTraits> hh,
+                                    TkSoAView<TrackerTraits> tracks_view,
+                                    HitToTuple<TrackerTraits> const *__restrict__ phitToTuple,
+                                    int32_t firstPrint,
+                                    int32_t lastPrint,
+                                    int iev) const {
+        constexpr auto loose = Quality::loose;
+
+        for (auto i : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nbins())) {
+          // honour the requested window: without this check the two parameters were
+          // ignored and every track was printed
+          auto const itk = static_cast<int32_t>(i);
+          if (itk < firstPrint || itk >= lastPrint)
+            continue;
+          auto nh = tracks_view.hitIndices().size(i);
+          if (nh < 3)
+            continue;
+          if (tracks_view[i].quality() < loose)
+            continue;
+          printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n",
+                 10000 * iev + i,
+                 int(tracks_view[i].quality()),
+                 nh,
+                 tracks_view[i].nLayers(),
+                 reco::charge(tracks_view, i),
+                 //TracksUtilities<TrackerTraits>::charge(tracks_view, i),
+                 tracks_view[i].pt(),
+                 tracks_view[i].eta(),
+                 TracksUtilities<TrackerTraits>::phi(tracks_view, i),
+                 TracksUtilities<TrackerTraits>::tip(tracks_view, i),
+                 TracksUtilities<TrackerTraits>::zip(tracks_view, i),
+                 tracks_view[i].chi2(),
+                 hh[*tracks_view.hitIndices().begin(i)].zGlobal(),
+                 hh[*(tracks_view.hitIndices().begin(i) + 1)].zGlobal(),
+                 hh[*(tracks_view.hitIndices().begin(i) + 2)].zGlobal(),
+                 nh > 3 ? hh[int(*(tracks_view.hitIndices().begin(i) + 3))].zGlobal() : 0,
+                 nh > 4 ? hh[int(*(tracks_view.hitIndices().begin(i) + 4))].zGlobal() : 0,
+                 nh > 5 ? hh[int(*(tracks_view.hitIndices().begin(i) + 5))].zGlobal() : 0,
+                 nh > 6 ? hh[int(*(tracks_view.hitIndices().begin(i) + nh - 1))].zGlobal() : 0);
+        }
+      }
+    };
+
+    // Debug kernel functor printing the content of the Counters structure:
+    // a header line, the raw values, and the values normalised to the number of
+    // events (first eight columns) or to the number of cells (last four columns).
+    class Kernel_printCounters {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const &acc, Counters const *counters) const {
+        auto const &cnt = *counters;
+
+        // denominators of the normalised table
+        double const nEvents = double(cnt.nEvents);
+        double const nCells = double(cnt.nCells);
+
+        printf(
+            "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks  |  nLooseTracks  |  nGoodTracks | "
+            "nUsedHits "
+            "| "
+            "nDupHits | "
+            "nFishCells | "
+            "nKilledCells | "
+            "nUsedCells | nZeroTrackCells ||\n");
+
+        printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
+               cnt.nEvents,
+               cnt.nHits,
+               cnt.nCells,
+               cnt.nTuples,
+               cnt.nFitTracks,
+               cnt.nLooseTracks,
+               cnt.nGoodTracks,
+               cnt.nUsedHits,
+               cnt.nDupHits,
+               cnt.nFishCells,
+               cnt.nKilledCells,
+               cnt.nEmptyCells,
+               cnt.nZeroTrackCells);
+
+        printf(
+            "Counters Norm %lld ||  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.3f|  %.3f|  "
+            "%.3f|  "
+            "%.3f||\n",
+            cnt.nEvents,
+            cnt.nHits / nEvents,
+            cnt.nCells / nEvents,
+            cnt.nTuples / nEvents,
+            cnt.nFitTracks / nEvents,
+            cnt.nLooseTracks / nEvents,
+            cnt.nGoodTracks / nEvents,
+            cnt.nUsedHits / nEvents,
+            cnt.nDupHits / nEvents,
+            cnt.nFishCells / nCells,
+            cnt.nKilledCells / nCells,
+            cnt.nEmptyCells / nCells,
+            cnt.nZeroTrackCells / nCells);
+      }
+    };
+  }  // namespace caHitNtupletGeneratorKernels
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h
new file mode 100644
index 0000000000000..0b5ab0a985163
--- /dev/null
+++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h
@@ -0,0 +1,71 @@
+#ifndef RecoPixelVertexing_PixelTriplets_alpaka_CAPixelDoublets_h
+#define RecoPixelVertexing_PixelTriplets_alpaka_CAPixelDoublets_h
+
+#include <alpaka/alpaka.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
+#include "CAPixelDoubletsAlgos.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  using namespace alpaka;
+  using namespace cms::alpakatools;
+  namespace caPixelDoublets {
+
+    // Kernel functor preparing the containers used to build the doublets:
+    //  - resets the first nHits entries of isOuterHitOfCell->container;
+    //  - once per grid, constructs cellNeighbors and cellTracks on top of the
+    //    given storage, then reserves and resets their element 0.
+    template <typename TrackerTraits>
+    class InitDoublets {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const& acc,
+                                    OuterHitOfCell<TrackerTraits>* isOuterHitOfCell,
+                                    int nHits,
+                                    CellNeighborsVector<TrackerTraits>* cellNeighbors,
+                                    CellNeighbors<TrackerTraits>* cellNeighborsContainer,
+                                    CellTracksVector<TrackerTraits>* cellTracks,
+                                    CellTracks<TrackerTraits>* cellTracksContainer) const {
+        ALPAKA_ASSERT_OFFLOAD(isOuterHitOfCell->container);
+
+        for (auto hit : cms::alpakatools::elements_with_stride(acc, nHits)) {
+          isOuterHitOfCell->container[hit].reset();
+        }
+
+        // the vectors are set up only once per grid
+        if (not cms::alpakatools::once_per_grid(acc)) {
+          return;
+        }
+
+        cellNeighbors->construct(TrackerTraits::maxNumOfActiveDoublets, cellNeighborsContainer);
+        cellTracks->construct(TrackerTraits::maxNumOfActiveDoublets, cellTracksContainer);
+
+        // the first extension of each vector is expected to return index 0
+        [[maybe_unused]] auto first = cellNeighbors->extend(acc);
+        ALPAKA_ASSERT_OFFLOAD(0 == first);
+        (*cellNeighbors)[0].reset();
+
+        first = cellTracks->extend(acc);
+        ALPAKA_ASSERT_OFFLOAD(0 == first);
+        (*cellTracks)[0].reset();
+      }
+    };
+
+    // Not used for the moment, see below.
+    //constexpr auto getDoubletsFromHistoMaxBlockSize = 64;  // for both x and y
+    //constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16;
+
+    // Kernel functor forwarding its arguments to doubletsFromHisto<TrackerTraits>(),
+    // with isOuterHitOfCell dereferenced.
+    //
+    // The launch bounds below are kept commented out for reference:
+    // #ifdef __CUDACC__
+    //       __launch_bounds__(getDoubletsFromHistoMaxBlockSize, getDoubletsFromHistoMinBlocksPerMP)  // TODO: Alpakify
+    // #endif
+    template <typename TrackerTraits>
+    class GetDoubletsFromHisto {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(TAcc const& acc,
+                                    CACellT<TrackerTraits>* cells,
+                                    uint32_t* nCells,
+                                    CellNeighborsVector<TrackerTraits>* cellNeighbors,
+                                    CellTracksVector<TrackerTraits>* cellTracks,
+                                    HitsConstView<TrackerTraits> hh,
+                                    OuterHitOfCell<TrackerTraits>* isOuterHitOfCell,
+                                    uint32_t nActualPairs,
+                                    const uint32_t maxNumOfDoublets,
+                                    CellCutsT<TrackerTraits> cuts) const {
+        auto& outerHitOfCell = *isOuterHitOfCell;
+        doubletsFromHisto<TrackerTraits>(acc,
+                                         nActualPairs,
+                                         maxNumOfDoublets,
+                                         cells,
+                                         nCells,
+                                         cellNeighbors,
+                                         cellTracks,
+                                         hh,
+                                         outerHitOfCell,
+                                         cuts);
+      }
+    };
+  }  // namespace caPixelDoublets
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+#endif  // RecoPixelVertexing_PixelTriplets_alpaka_CAPixelDoublets_h
diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoubletsAlgos.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoubletsAlgos.h
new file mode 100644
index 0000000000000..29ce8d7d76e3c
--- /dev/null
+++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoubletsAlgos.h
@@ -0,0 +1,333 @@
+#ifndef RecoPixelVertexing_PixelTriplets_alpaka_CAPixelDoubletsAlgos_h
+#define RecoPixelVertexing_PixelTriplets_alpaka_CAPixelDoubletsAlgos_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <limits>
+
+#include <alpaka/alpaka.hpp>
+
+#include "DataFormats/Math/interface/approx_atan2.h"
+#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h"
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/VecArray.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+
+#include "CACell.h"
+#include "CAStructures.h"
+
+//#define GPU_DEBUG
+//#define NTUPLE_DEBUG
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace caPixelDoublets {
+    using namespace cms::alpakatools;
+
+    // Shorthands making the caStructures alias templates, and the hit view type
+    // declared by CACellT, available under shorter names in this namespace.
+    template <typename TrackerTraits>
+    using CellNeighbors = caStructures::CellNeighborsT<TrackerTraits>;
+    template <typename TrackerTraits>
+    using CellTracks = caStructures::CellTracksT<TrackerTraits>;
+    template <typename TrackerTraits>
+    using CellNeighborsVector = caStructures::CellNeighborsVectorT<TrackerTraits>;
+    template <typename TrackerTraits>
+    using CellTracksVector = caStructures::CellTracksVectorT<TrackerTraits>;
+    template <typename TrackerTraits>
+    using OuterHitOfCell = caStructures::OuterHitOfCellT<TrackerTraits>;
+    // constant view of the hits, as declared by the cell class
+    template <typename TrackerTraits>
+    using HitsConstView = typename CACellT<TrackerTraits>::HitsConstView;
+
+    // Configuration of the cuts applied when pairing hits into doublets, together
+    // with the two cluster-size based cuts evaluated on the device.
+    // Both zSizeCut() and clusterCut() return true when the candidate has to be
+    // discarded (the caller skips the candidate on a true result).
+    template <typename TrackerTraits>
+    struct CellCutsT {
+      using H = HitsConstView<TrackerTraits>;
+      using T = TrackerTraits;
+
+      // NOTE: the defaulted constructor leaves all the members uninitialised
+      CellCutsT() = default;
+
+      // Stores the flags and thresholds and copies the per-pair phi cuts;
+      // phiCutsV must hold exactly TrackerTraits::nPairs values.
+      CellCutsT(const bool doClusterCut,
+                const bool doZ0Cut,
+                const bool doPtCut,
+                const bool idealConditions,
+                const float z0Cut,
+                const float ptCut,
+                const std::vector<int>& phiCutsV)
+          : doClusterCut_(doClusterCut),
+            doZ0Cut_(doZ0Cut),
+            doPtCut_(doPtCut),
+            idealConditions_(idealConditions),
+            z0Cut_(z0Cut),
+            ptCut_(ptCut) {
+        assert(phiCutsV.size() == TrackerTraits::nPairs);
+        std::copy(phiCutsV.begin(), phiCutsV.end(), &phiCuts[0]);
+      }
+
+      bool doClusterCut_;     // enables clusterCut() and zSizeCut() in doubletsFromHisto
+      bool doZ0Cut_;          // enables the z0 cut in doubletsFromHisto
+      bool doPtCut_;          // enables the pt cut in doubletsFromHisto
+      bool idealConditions_;  //this is actually not used by phase2
+
+      float z0Cut_;  //FIXME: check if could be const now
+      float ptCut_;  // GeV (unit taken from the comment in doubletsFromHisto)
+
+      // phi window of each layer pair, in the integer units of the hits' iphi()
+      int phiCuts[T::nPairs];
+
+      // Cut on the compatibility of the cluster sizes along y of the inner hit i
+      // and of the outer hit o. Returns true if the pair has to be discarded.
+      // acc is not used.
+      template <typename TAcc>
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE bool __attribute__((always_inline))
+      zSizeCut(const TAcc& acc, H hh, int i, int o) const {
+        const uint32_t mi = hh[i].detectorIndex();
+
+        bool innerB1 = mi < T::last_bpix1_detIndex;
+        // with ideal conditions every module is treated as an outer ladder
+        bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2;
+        // -1 marks an inner hit for which the size is not used
+        auto mes = (!innerB1) || isOuterLadder ? hh[i].clusterSizeY() : -1;
+
+        if (mes < 0)
+          return false;
+
+        const uint32_t mo = hh[o].detectorIndex();
+        auto so = hh[o].clusterSizeY();
+
+        auto dz = hh[i].zGlobal() - hh[o].zGlobal();
+        auto dr = hh[i].rGlobal() - hh[o].rGlobal();
+
+        auto innerBarrel = mi < T::last_barrel_detIndex;
+        auto onlyBarrel = mo < T::last_barrel_detIndex;
+
+        // no cut when neither hit has a detector index below last_barrel_detIndex
+        if (not innerBarrel and not onlyBarrel)
+          return false;
+        auto dy = innerB1 ? T::maxDYsize12 : T::maxDYsize;
+
+        // outer hit below last_barrel_detIndex: compare the two measured sizes;
+        // otherwise compare the inner size with the one predicted from |dz/dr|
+        return onlyBarrel ? so > 0 && std::abs(so - mes) > dy
+                          : innerBarrel && std::abs(mes - int(std::abs(dz / dr) * T::dzdrFact + 0.5f)) > T::maxDYPred;
+      }
+
+      // Cut on the cluster size along y of hit i, applied only to detector indices
+      // below last_bpix2_detIndex. Returns true if the hit has to be discarded,
+      // i.e. if its size is positive and below the minimum of its index range.
+      // acc is not used.
+      template <typename TAcc>
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE bool __attribute__((always_inline))
+      clusterCut(const TAcc& acc, H hh, uint32_t i) const {
+        const uint32_t mi = hh[i].detectorIndex();
+        bool innerB1orB2 = mi < T::last_bpix2_detIndex;
+
+        if (!innerB1orB2)
+          return false;
+
+        bool innerB1 = mi < T::last_bpix1_detIndex;
+        bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2;
+        // -1 disables the checks below (both require mes > 0)
+        auto mes = (!innerB1) || isOuterLadder ? hh[i].clusterSizeY() : -1;
+
+        if (innerB1)  // B1
+          if (mes > 0 && mes < T::minYsizeB1)
+            return true;                                                                 // only long cluster  (5*8)
+        bool innerB2 = (mi >= T::last_bpix1_detIndex) && (mi < T::last_bpix2_detIndex);  //FIXME number
+        if (innerB2)                                                                     // B2 and F1
+          if (mes > 0 && mes < T::minYsizeB2)
+            return true;
+
+        return false;
+      }
+    };
+
+    // Builds the doublets (cells) by pairing each hit of the inner layer of a layer
+    // pair with the compatible hits of the outer layer.
+    //
+    // The work is split over two dimensions: index Y runs over the inner hits of
+    // all the layer pairs (with a stride equal to the Y extent of the grid), index X
+    // runs over the candidate outer hits of a phi bin (with a stride equal to the X
+    // extent of the block).
+    // A candidate passing all the enabled cuts gets a slot in cells[] through an
+    // atomic increment of *nCells, is initialised there, and its index is appended
+    // to isOuterHitOfCell[outer hit].
+    //
+    //   nPairs            number of layer pairs to process
+    //   maxNumOfDoublets  capacity of cells[]
+    //   cells, nCells     output cells and their counter
+    //   cellNeighbors, cellTracks  containers handed over to the cell initialisation
+    //   hh                hits (positions, cluster sizes, phi binning, layer offsets)
+    //   isOuterHitOfCell  per-hit list of the cells having that hit as outer hit
+    //   cuts              flags and thresholds of the cuts
+    template <typename TrackerTraits, typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void __attribute__((always_inline))
+    doubletsFromHisto(const TAcc& acc,
+                      uint32_t nPairs,
+                      const uint32_t maxNumOfDoublets,
+                      CACellT<TrackerTraits>* cells,
+                      uint32_t* nCells,
+                      CellNeighborsVector<TrackerTraits>* cellNeighbors,
+                      CellTracksVector<TrackerTraits>* cellTracks,
+                      HitsConstView<TrackerTraits> hh,
+                      OuterHitOfCell<TrackerTraits> isOuterHitOfCell,
+                      CellCutsT<TrackerTraits> const& cuts) {  // ysize cuts (z in the barrel)  times 8
+                                                               // these are used if doClusterCut is true
+
+      const bool doClusterCut = cuts.doClusterCut_;
+      const bool doZ0Cut = cuts.doZ0Cut_;
+      const bool doPtCut = cuts.doPtCut_;
+
+      const float z0cut = cuts.z0Cut_;      // cm
+      const float hardPtCut = cuts.ptCut_;  // GeV
+      // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field)
+      const float minRadius = hardPtCut * 87.78f;
+      const float minRadius2T4 = 4.f * minRadius * minRadius;
+
+      using PhiBinner = typename TrackingRecHitSoA<TrackerTraits>::PhiBinner;
+
+      auto const& __restrict__ phiBinner = hh.phiBinner();
+      uint32_t const* __restrict__ offsets = hh.hitsLayerStart().data();
+      ALPAKA_ASSERT_OFFLOAD(offsets);
+
+      // number of hits of layer li, from the difference of consecutive offsets
+      auto layerSize = [=](uint8_t li) { return offsets[li + 1] - offsets[li]; };
+
+      // nPairsMax to be optimized later (originally was 64).
+      // If it should much be bigger, consider using a block-wide parallel prefix scan,
+      // e.g. see  https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html
+      // block-shared variables (alpaka::declareSharedVar): running sum of the inner
+      // layer sizes of the pairs, and its total
+      auto& innerLayerCumulativeSize = alpaka::declareSharedVar<uint32_t[TrackerTraits::nPairs], __COUNTER__>(acc);
+      auto& ntot = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
+
+      constexpr uint32_t dimIndexY = 0u;
+      constexpr uint32_t dimIndexX = 1u;
+      const uint32_t threadIdxLocalY(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[dimIndexY]);
+      const uint32_t threadIdxLocalX(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[dimIndexX]);
+
+      // thread (0, 0) of the block fills the shared variables, the other threads
+      // wait at the barrier below before reading them
+      if (threadIdxLocalY == 0 && threadIdxLocalX == 0) {
+        innerLayerCumulativeSize[0] = layerSize(TrackerTraits::layerPairs[0]);
+        for (uint32_t i = 1; i < nPairs; ++i) {
+          innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i - 1] + layerSize(TrackerTraits::layerPairs[2 * i]);
+        }
+        ntot = innerLayerCumulativeSize[nPairs - 1];
+      }
+      alpaka::syncBlockThreads(acc);
+
+      // x runs faster
+      const uint32_t blockDimensionX(alpaka::getWorkDiv<alpaka::Block, alpaka::Elems>(acc)[dimIndexX]);
+      const auto& [firstElementIdxNoStrideX, endElementIdxNoStrideX] =
+          cms::alpakatools::element_index_range_in_block(acc, 0u, dimIndexX);
+
+      uint32_t pairLayerId = 0;  // cannot go backward
+
+      // Outermost loop on Y
+      const uint32_t gridDimensionY(alpaka::getWorkDiv<alpaka::Grid, alpaka::Elems>(acc)[dimIndexY]);
+      const auto& [firstElementIdxNoStrideY, endElementIdxNoStrideY] =
+          cms::alpakatools::element_index_range_in_grid(acc, 0u, dimIndexY);
+      uint32_t firstElementIdxY = firstElementIdxNoStrideY;
+
+      for (uint32_t j = firstElementIdxY; j < ntot; j += gridDimensionY) {
+        // find the layer pair whose cumulative range contains j; the search resumes
+        // from the pair found at the previous iteration, since j only grows
+        while (j >= innerLayerCumulativeSize[pairLayerId++])
+          ;
+        --pairLayerId;  // move to lower_bound ??
+
+        ALPAKA_ASSERT_OFFLOAD(pairLayerId < nPairs);
+        ALPAKA_ASSERT_OFFLOAD(j < innerLayerCumulativeSize[pairLayerId]);
+        ALPAKA_ASSERT_OFFLOAD(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId - 1]);
+
+        uint8_t inner = TrackerTraits::layerPairs[2 * pairLayerId];
+        uint8_t outer = TrackerTraits::layerPairs[2 * pairLayerId + 1];
+        ALPAKA_ASSERT_OFFLOAD(outer > inner);
+
+        auto hoff = PhiBinner::histOff(outer);
+        // i: index of the inner hit, i.e. the position of j within its pair's range
+        // shifted by the offset of the inner layer
+        auto i = (0 == pairLayerId) ? j : j - innerLayerCumulativeSize[pairLayerId - 1];
+        i += offsets[inner];
+
+        ALPAKA_ASSERT_OFFLOAD(i >= offsets[inner]);
+        ALPAKA_ASSERT_OFFLOAD(i < offsets[inner + 1]);
+
+        // found hit corresponding to our cuda thread, now do the job
+        if (hh[i].detectorIndex() > pixelClustering::maxNumModules)
+          continue;  // invalid
+
+        /* maybe clever, not effective when zoCut is on
+      auto bpos = (mi%8)/4;  // if barrel is 1 for z>0
+      auto fpos = (outer>3) & (outer<7);
+      if ( ((inner<3) & (outer>3)) && bpos!=fpos) continue;
+      */
+
+        auto mez = hh[i].zGlobal();
+
+        // the inner hit has to lie in the z window of its layer pair
+        if (mez < TrackerTraits::minz[pairLayerId] || mez > TrackerTraits::maxz[pairLayerId])
+          continue;
+
+        if (doClusterCut && outer > pixelTopology::last_barrel_layer && cuts.clusterCut(acc, hh, i))
+          continue;
+
+        auto mep = hh[i].iphi();
+        auto mer = hh[i].rGlobal();
+
+        // all cuts: true if fails
+        // pt cut on the pair made of the inner hit and of the outer hit j (this j is
+        // the lambda parameter, not the loop index), given their phi distance
+        auto ptcut = [&](int j, int16_t idphi) {
+          auto r2t4 = minRadius2T4;
+          auto ri = mer;
+          auto ro = hh[j].rGlobal();
+          auto dphi = short2phi(idphi);
+          return dphi * dphi * (r2t4 - ri * ro) > (ro - ri) * (ro - ri);
+        };
+        // cut on the radial distance of the two hits and on the z0 threshold
+        auto z0cutoff = [&](int j) {
+          auto zo = hh[j].zGlobal();
+          auto ro = hh[j].rGlobal();
+          auto dr = ro - mer;
+          return dr > TrackerTraits::maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr;
+        };
+
+        auto iphicut = cuts.phiCuts[pairLayerId];
+
+        // first and last phi bin of the window around the inner hit; incr() moves to
+        // the next bin, wrapping around after the last one
+        auto kl = PhiBinner::bin(int16_t(mep - iphicut));
+        auto kh = PhiBinner::bin(int16_t(mep + iphicut));
+        auto incr = [](auto& k) { return k = (k + 1) % PhiBinner::nbins(); };
+
+#ifdef GPU_DEBUG
+        int tot = 0;
+        int nmin = 0;
+        int tooMany = 0;
+#endif
+
+        // khh is one past kh, so that the loop below includes the bin kh
+        auto khh = kh;
+        incr(khh);
+        for (auto kk = kl; kk != khh; incr(kk)) {
+#ifdef GPU_DEBUG
+          if (kk != kl && kk != kh)
+            nmin += phiBinner.size(kk + hoff);
+#endif
+          auto const* __restrict__ p = phiBinner.begin(kk + hoff);
+          auto const* __restrict__ e = phiBinner.end(kk + hoff);
+          auto const maxpIndex = e - p;
+
+          // Here we parallelize in X
+          uint32_t firstElementIdxX = firstElementIdxNoStrideX;
+          for (uint32_t pIndex = firstElementIdxX; pIndex < maxpIndex; pIndex += blockDimensionX) {
+            auto oi = p[pIndex];  // auto oi = __ldg(p); is not allowed since __ldg is device-only
+            ALPAKA_ASSERT_OFFLOAD(oi >= offsets[outer]);
+            ALPAKA_ASSERT_OFFLOAD(oi < offsets[outer + 1]);
+            auto mo = hh[oi].detectorIndex();
+
+            if (mo > pixelClustering::maxNumModules)
+              continue;  //    invalid
+
+            if (doZ0Cut && z0cutoff(oi))
+              continue;
+
+            auto mop = hh[oi].iphi();
+            // phi distance of the two hits, taking the smaller of the two directions
+            uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop)));
+
+            if (idphi > iphicut)
+              continue;
+
+            if (doClusterCut && cuts.zSizeCut(acc, hh, i, oi))
+              continue;
+
+            if (doPtCut && ptcut(oi, idphi))
+              continue;
+
+            // reserve a slot for the new cell; if the capacity is exceeded the
+            // reservation is undone and this X loop is abandoned
+            auto ind = alpaka::atomicAdd(acc, nCells, (uint32_t)1, alpaka::hierarchy::Blocks{});
+            if (ind >= maxNumOfDoublets) {
+              alpaka::atomicSub(acc, nCells, (uint32_t)1, alpaka::hierarchy::Blocks{});
+              break;
+            }  // move to SimpleVector??
+            cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, i, oi);
+            isOuterHitOfCell[oi].push_back(acc, ind);
+#ifdef GPU_DEBUG
+            if (isOuterHitOfCell[oi].full())
+              ++tooMany;
+            ++tot;
+#endif
+          }
+        }
+//      #endif
+#ifdef GPU_DEBUG
+        if (tooMany > 0 or tot > 0)
+          printf("OuterHitOfCell for %d in layer %d/%d, %d,%d %d, %d %.3f %.3f %s\n",
+                 i,
+                 inner,
+                 outer,
+                 nmin,
+                 tot,
+                 tooMany,
+                 iphicut,
+                 TrackerTraits::minz[pairLayerId],
+                 TrackerTraits::maxz[pairLayerId],
+                 tooMany > 0 ? "FULL!!" : "not full.");
+#endif
+      }  // loop in block...
+    }    // doubletsFromHisto
+  }      // namespace caPixelDoublets
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+#endif  // RecoPixelVertexing_PixelTriplets_alpaka_CAPixelDoubletsAlgos_h
diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAStructures.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAStructures.h
new file mode 100644
index 0000000000000..6ac7a90c724fc
--- /dev/null
+++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAStructures.h
@@ -0,0 +1,52 @@
+#ifndef RecoPixelVertexing_PixelTriplets_CAStructures_h
+#define RecoPixelVertexing_PixelTriplets_CAStructures_h
+
+#include "HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/VecArray.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
+
+namespace caStructures {  // container type aliases, all parameterised on TrackerTraits
+
+  template <typename TrackerTraits>
+  using CellNeighborsT =
+      cms::alpakatools::VecArray<typename TrackerTraits::cindex_type, TrackerTraits::maxCellNeighbors>;
+
+  template <typename TrackerTraits>
+  using CellTracksT = cms::alpakatools::VecArray<typename TrackerTraits::tindex_type, TrackerTraits::maxCellTracks>;
+
+  template <typename TrackerTraits>
+  using CellNeighborsVectorT = cms::alpakatools::SimpleVector<CellNeighborsT<TrackerTraits>>;
+
+  template <typename TrackerTraits>
+  using CellTracksVectorT = cms::alpakatools::SimpleVector<CellTracksT<TrackerTraits>>;
+
+  template <typename TrackerTraits>
+  using OuterHitOfCellContainerT = cms::alpakatools::VecArray<uint32_t, TrackerTraits::maxCellsPerHit>;
+
+  template <typename TrackerTraits>
+  using TupleMultiplicityT = cms::alpakatools::OneToManyAssocRandomAccess<typename TrackerTraits::tindex_type,
+                                                                          TrackerTraits::maxHitsOnTrack + 1,
+                                                                          TrackerTraits::maxNumberOfTuples>;
+
+  template <typename TrackerTraits>
+  using HitToTupleT =
+      cms::alpakatools::OneToManyAssocRandomAccess<typename TrackerTraits::tindex_type,
+                                                   TrackerTraits::maxNumberOfHits,
+                                                   TrackerTraits::maxHitsForContainers>;  // 3.5 should be enough
+
+  template <typename TrackerTraits>
+  using TuplesContainerT = cms::alpakatools::OneToManyAssocRandomAccess<typename TrackerTraits::hindex_type,
+                                                                        TrackerTraits::maxNumberOfTuples,
+                                                                        TrackerTraits::maxHitsForContainers>;
+
+  template <typename TrackerTraits>
+  struct OuterHitOfCellT {  // indexable view over an array of containers, with the index shifted by `offset`
+    OuterHitOfCellContainerT<TrackerTraits>* container;  // points at the entry for index == offset
+    int32_t offset;                                      // subtracted from the index given to operator[]
+    constexpr auto& operator[](int i) { return container[i - offset]; }
+    constexpr auto const& operator[](int i) const { return container[i - offset]; }
+  };
+
+}  // namespace caStructures
+
+#endif
diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/HelixFit.cc b/RecoTracker/PixelSeeding/plugins/alpaka/HelixFit.cc
new file mode 100644
index 0000000000000..078cbe8de45a4
--- /dev/null
+++ b/RecoTracker/PixelSeeding/plugins/alpaka/HelixFit.cc
@@ -0,0 +1,21 @@
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HelixFit.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  template <typename TrackerTraits>  // despite its name, allocate() only records its inputs; it allocates nothing
+  void HelixFit<TrackerTraits>::allocate(TupleMultiplicity const *tupleMultiplicity, OutputSoAView &helix_fit_results) {
+    tuples_ = &helix_fit_results.hitIndices();  // keep the address of the view's hit container
+    tupleMultiplicity_ = tupleMultiplicity;     // pointer stored as-is, the pointee is not copied
+    outputSoa_ = helix_fit_results;             // copy of the view object itself
+
+    ALPAKA_ASSERT_OFFLOAD(tuples_);  // sanity checks: both stored pointers are non-null
+    ALPAKA_ASSERT_OFFLOAD(tupleMultiplicity_);
+  }
+
+  template <typename TrackerTraits>
+  void HelixFit<TrackerTraits>::deallocate() {}  // no-op: nothing is released here
+
+  template class HelixFit<pixelTopology::Phase1>;
+  template class HelixFit<pixelTopology::Phase2>;
+  template class HelixFit<pixelTopology::HIonPhase1>;
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/HelixFit.h b/RecoTracker/PixelSeeding/plugins/alpaka/HelixFit.h
new file mode 100644
index 0000000000000..908124bb83081
--- /dev/null
+++ b/RecoTracker/PixelSeeding/plugins/alpaka/HelixFit.h
@@ -0,0 +1,93 @@
+#ifndef RecoPixelVertexing_PixelTriplets_HelixFit_h
+#define RecoPixelVertexing_PixelTriplets_HelixFit_h
+
+#include <alpaka/alpaka.hpp>
+#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h"
+#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h"
+#include "RecoTracker/PixelTrackFitting/interface/alpaka/FitResult.h"
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h"
+
+#include "CAStructures.h"
+namespace riemannFit {
+  // TODO: Can this be taken from TrackerTraits or somewhere else?
+  // Upper bound on the number of fits handled per batch; can be made smaller in case of memory issues.
+  constexpr uint32_t maxNumberOfConcurrentFits = 32 * 1024;
+  constexpr uint32_t stride = maxNumberOfConcurrentFits;  // element stride of the maps below: one slot per fit
+  using Matrix3x4d = Eigen::Matrix<double, 3, 4>;
+  using Map3x4d = Eigen::Map<Matrix3x4d, 0, Eigen::Stride<3 * stride, stride> >;
+  using Matrix6x4f = Eigen::Matrix<float, 6, 4>;
+  using Map6x4f = Eigen::Map<Matrix6x4f, 0, Eigen::Stride<6 * stride, stride> >;
+
+  // hits: 3 x N doubles per fit, one column per hit
+  template <int N>
+  using Matrix3xNd = Eigen::Matrix<double, 3, N>;
+  template <int N>
+  using Map3xNd = Eigen::Map<Matrix3xNd<N>, 0, Eigen::Stride<3 * stride, stride> >;
+  // errors: 6 x N floats per fit, one column per hit
+  template <int N>
+  using Matrix6xNf = Eigen::Matrix<float, 6, N>;
+  template <int N>
+  using Map6xNf = Eigen::Map<Matrix6xNf<N>, 0, Eigen::Stride<6 * stride, stride> >;
+  // fast fit: 4 doubles per fit, consecutive elements `stride` apart
+  using Map4d = Eigen::Map<Vector4d, 0, Eigen::InnerStride<stride> >;
+
+  template <auto Start, auto End, auto Inc, class F>  // compile-time bounded for loop over [Start, End), step Inc
+  constexpr void rolling_fits(F &&f) {
+    if constexpr (Start < End) {
+      f(std::integral_constant<decltype(Start), Start>());  // the current index is passed as a type, not a value
+      rolling_fits<Start + Inc, End, Inc>(f);               // recursion stops once Start >= End
+    }
+  }
+
+}  // namespace riemannFit
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+
+  template <typename TrackerTraits>  // host-side driver that launches the helix-fit kernels for one TrackerTraits
+  class HelixFit {
+  public:
+    using TrackingRecHitSoAs = TrackingRecHitSoA<TrackerTraits>;
+
+    using HitView = TrackingRecHitSoAView<TrackerTraits>;
+    using HitConstView = TrackingRecHitSoAConstView<TrackerTraits>;
+
+    using Tuples = typename reco::TrackSoA<TrackerTraits>::HitContainer;
+    using OutputSoAView = reco::TrackSoAView<TrackerTraits>;
+
+    using TupleMultiplicity = caStructures::TupleMultiplicityT<TrackerTraits>;
+
+    using ParamsOnDevice = pixelCPEforDevice::ParamsOnDeviceT<TrackerTraits>;
+
+    explicit HelixFit(float bf, bool fitNas4) : bField_(bf), fitNas4_(fitNas4) {}
+    ~HelixFit() { deallocate(); }
+
+    void setBField(double bField) { bField_ = bField; }  // note: the value is narrowed to float when stored
+    void launchRiemannKernels(const HitConstView &hv,
+                              ParamsOnDevice const *cpeParams,
+                              uint32_t nhits,
+                              uint32_t maxNumberOfTuples,
+                              Queue &queue);
+    void launchBrokenLineKernels(const HitConstView &hv,
+                                 ParamsOnDevice const *cpeParams,
+                                 uint32_t nhits,
+                                 uint32_t maxNumberOfTuples,
+                                 Queue &queue);
+
+    void allocate(TupleMultiplicity const *tupleMultiplicity, OutputSoAView &helix_fit_results);
+    void deallocate();
+
+  private:
+    static constexpr uint32_t maxNumberOfConcurrentFits_ = riemannFit::maxNumberOfConcurrentFits;
+
+    // forwarded
+    Tuples const *tuples_ = nullptr;                        // set by allocate()
+    TupleMultiplicity const *tupleMultiplicity_ = nullptr;  // set by allocate()
+    OutputSoAView outputSoa_;                               // set by allocate()
+    float bField_;
+
+    const bool fitNas4_;  // if true, the Riemann path fits 5-hit tuples with the 4-hit kernels
+  };
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+#endif  // RecoPixelVertexing_PixelTriplets_HelixFit_h
diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/RiemannFit.dev.cc b/RecoTracker/PixelSeeding/plugins/alpaka/RiemannFit.dev.cc
new file mode 100644
index 0000000000000..5aa202700580c
--- /dev/null
+++ b/RecoTracker/PixelSeeding/plugins/alpaka/RiemannFit.dev.cc
@@ -0,0 +1,401 @@
+//
+// Author: Felice Pantaleo, CERN
+//
+
+#include <alpaka/alpaka.hpp>
+#include <cstdint>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
+#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h"
+#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h"
+#include "RecoTracker/PixelTrackFitting/interface/alpaka/RiemannFit.h"
+#include "HelixFit.h"
+#include "CAStructures.h"
+
+template <typename TrackerTraits>
+using Tuples = typename reco::TrackSoA<TrackerTraits>::HitContainer;
+template <typename TrackerTraits>
+using OutputSoAView = reco::TrackSoAView<TrackerTraits>;
+template <typename TrackerTraits>
+using TupleMultiplicity = caStructures::TupleMultiplicityT<TrackerTraits>;
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  using namespace alpaka;
+  using namespace cms::alpakatools;
+
+  template <int N, typename TrackerTraits>  // N: number of hits used in each fit
+  class Kernel_FastFit {
+  public:
+    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+    ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                  Tuples<TrackerTraits> const *__restrict__ foundNtuplets,
+                                  TupleMultiplicity<TrackerTraits> const *__restrict__ tupleMultiplicity,
+                                  uint32_t nHits,
+                                  TrackingRecHitSoAConstView<TrackerTraits> hh,
+                                  pixelCPEforDevice::ParamsOnDeviceT<TrackerTraits> const *__restrict__ cpeParams,
+                                  double *__restrict__ phits,
+                                  float *__restrict__ phits_ge,
+                                  double *__restrict__ pfast_fit,
+                                  uint32_t offset) const {
+      constexpr uint32_t hitsInFit = N;
+
+      ALPAKA_ASSERT_OFFLOAD(hitsInFit <= nHits);
+
+      ALPAKA_ASSERT_OFFLOAD(pfast_fit);
+      ALPAKA_ASSERT_OFFLOAD(foundNtuplets);
+      ALPAKA_ASSERT_OFFLOAD(tupleMultiplicity);
+
+      // look in bin for this hit multiplicity
+
+#ifdef RIEMANN_DEBUG
+      const uint32_t threadIdx(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0u]);
+      if (cms::alpakatools::once_per_grid(acc))
+        printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit);
+#endif
+
+      const auto nt = riemannFit::maxNumberOfConcurrentFits;
+      for (auto local_idx : cms::alpakatools::elements_with_stride(acc, nt)) {
+        auto tuple_idx = local_idx + offset;  // position in the nHits bin; local_idx addresses the scratch buffers
+        if (tuple_idx >= tupleMultiplicity->size(nHits))
+          break;
+
+        // track id of this tuple, read from the multiplicity bin for nHits (one to one to helix)
+        auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx);
+        ALPAKA_ASSERT_OFFLOAD(static_cast<int>(tkid) < foundNtuplets->nOnes());
+
+        ALPAKA_ASSERT_OFFLOAD(foundNtuplets->size(tkid) == nHits);
+
+        riemannFit::Map3xNd<N> hits(phits + local_idx);
+        riemannFit::Map4d fast_fit(pfast_fit + local_idx);
+        riemannFit::Map6xNf<N> hits_ge(phits_ge + local_idx);
+
+        // Fill the strided hit and error matrices for this fit from the first N hits of the tuple
+        auto const *hitId = foundNtuplets->begin(tkid);
+        for (unsigned int i = 0; i < hitsInFit; ++i) {
+          auto hit = hitId[i];
+          float ge[6];
+          cpeParams->detParams(hh[hit].detectorIndex()).frame.toGlobal(hh[hit].xerrLocal(), 0, hh[hit].yerrLocal(), ge);
+
+          hits.col(i) << hh[hit].xGlobal(), hh[hit].yGlobal(), hh[hit].zGlobal();
+          hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5];
+        }
+        riemannFit::fastFit(acc, hits, fast_fit);
+
+        // NaN check: a NaN never compares equal to itself
+        ALPAKA_ASSERT_OFFLOAD(fast_fit(0) == fast_fit(0));
+        ALPAKA_ASSERT_OFFLOAD(fast_fit(1) == fast_fit(1));
+        ALPAKA_ASSERT_OFFLOAD(fast_fit(2) == fast_fit(2));
+        ALPAKA_ASSERT_OFFLOAD(fast_fit(3) == fast_fit(3));
+      }
+    }
+  };
+
+  template <int N, typename TrackerTraits>  // N: number of hits used in each fit
+  class Kernel_CircleFit {
+  public:
+    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+    ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                  TupleMultiplicity<TrackerTraits> const *__restrict__ tupleMultiplicity,
+                                  uint32_t nHits,
+                                  double bField,
+                                  double *__restrict__ phits,
+                                  float *__restrict__ phits_ge,
+                                  double *__restrict__ pfast_fit_input,
+                                  riemannFit::CircleFit *circle_fit,
+                                  uint32_t offset) const {
+      ALPAKA_ASSERT_OFFLOAD(circle_fit);
+      ALPAKA_ASSERT_OFFLOAD(N <= nHits);
+
+      // same indexing scheme as Kernel_FastFit: local_idx addresses the buffers, local_idx + offset the bin
+
+      // look in bin for this hit multiplicity
+      const auto nt = riemannFit::maxNumberOfConcurrentFits;
+      for (auto local_idx : cms::alpakatools::elements_with_stride(acc, nt)) {
+        auto tuple_idx = local_idx + offset;
+        if (tuple_idx >= tupleMultiplicity->size(nHits))
+          break;
+
+        riemannFit::Map3xNd<N> hits(phits + local_idx);
+        riemannFit::Map4d fast_fit(pfast_fit_input + local_idx);
+        riemannFit::Map6xNf<N> hits_ge(phits_ge + local_idx);
+
+        riemannFit::VectorNd<N> rad = (hits.block(0, 0, 2, N).colwise().norm());  // per-hit norm of rows 0 and 1
+
+        riemannFit::Matrix2Nd<N> hits_cov = riemannFit::Matrix2Nd<N>::Zero();
+        riemannFit::loadCovariance2D(acc, hits_ge, hits_cov);
+
+        circle_fit[local_idx] =
+            riemannFit::circleFit(acc, hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, bField, true);
+
+#ifdef RIEMANN_DEBUG
+//    auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx);
+//  printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", tkid,
+//         circle_fit[local_idx].par(0), circle_fit[local_idx].par(1), circle_fit[local_idx].par(2));
+#endif
+      }
+    }
+  };
+
+  template <int N, typename TrackerTraits>  // N: number of hits used in each fit
+  class Kernel_LineFit {
+  public:
+    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+    ALPAKA_FN_ACC void operator()(TAcc const &acc,
+                                  TupleMultiplicity<TrackerTraits> const *__restrict__ tupleMultiplicity,
+                                  uint32_t nHits,
+                                  double bField,
+                                  OutputSoAView<TrackerTraits> results_view,
+                                  double *__restrict__ phits,
+                                  float *__restrict__ phits_ge,
+                                  double *__restrict__ pfast_fit_input,
+                                  riemannFit::CircleFit *__restrict__ circle_fit,
+                                  uint32_t offset) const {
+      ALPAKA_ASSERT_OFFLOAD(circle_fit);
+      ALPAKA_ASSERT_OFFLOAD(N <= nHits);
+
+      // same indexing scheme as Kernel_FastFit: local_idx addresses the buffers, local_idx + offset the bin
+
+      // look in bin for this hit multiplicity
+      const auto nt = riemannFit::maxNumberOfConcurrentFits;
+      for (auto local_idx : cms::alpakatools::elements_with_stride(acc, nt)) {
+        auto tuple_idx = local_idx + offset;
+        if (tuple_idx >= tupleMultiplicity->size(nHits))
+          break;
+
+        // track id of this tuple, used below as the row index into results_view (one to one to helix)
+        int32_t tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx);
+
+        riemannFit::Map3xNd<N> hits(phits + local_idx);
+        riemannFit::Map4d fast_fit(pfast_fit_input + local_idx);
+        riemannFit::Map6xNf<N> hits_ge(phits_ge + local_idx);
+
+        auto const &line_fit = riemannFit::lineFit(acc, hits, hits_ge, circle_fit[local_idx], fast_fit, bField, true);
+
+        riemannFit::fromCircleToPerigee(acc, circle_fit[local_idx]);
+
+        TracksUtilities<TrackerTraits>::copyFromCircle(results_view,
+                                                       circle_fit[local_idx].par,
+                                                       circle_fit[local_idx].cov,
+                                                       line_fit.par,
+                                                       line_fit.cov,
+                                                       1.f / float(bField),
+                                                       tkid);
+        results_view[tkid].pt() = bField / std::abs(circle_fit[local_idx].par(2));
+        results_view[tkid].eta() = asinhf(line_fit.par(0));
+        results_view[tkid].chi2() = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5);  // summed chi2
+
+#ifdef RIEMANN_DEBUG
+        printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n",
+               N,
+               nHits,
+               tkid,
+               circle_fit[local_idx].par(0),
+               circle_fit[local_idx].par(1),
+               circle_fit[local_idx].par(2));
+        printf("kernelLineFit line.par(0,1): %d %f,%f\n", tkid, line_fit.par(0), line_fit.par(1));
+        printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n",
+               circle_fit[local_idx].chi2,
+               line_fit.chi2,
+               circle_fit[local_idx].cov(0, 0),
+               circle_fit[local_idx].cov(1, 1),
+               circle_fit[local_idx].cov(2, 2),
+               line_fit.cov(0, 0),
+               line_fit.cov(1, 1));
+#endif
+      }
+    }
+  };
+
+  template <typename TrackerTraits>  // queues fast/circle/line fits for 3-, 4- and 5-hit tuples, batch by batch
+  void HelixFit<TrackerTraits>::launchRiemannKernels(const TrackingRecHitSoAConstView<TrackerTraits> &hv,
+                                                     pixelCPEforDevice::ParamsOnDeviceT<TrackerTraits> const *cpeParams,
+                                                     uint32_t nhits,
+                                                     uint32_t maxNumberOfTuples,
+                                                     Queue &queue) {
+    assert(tuples_);  // allocate() must have been called first
+
+    auto blockSize = 64;
+    auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize;
+    const auto workDivTriplets = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+    const auto workDivQuadsPenta = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks / 4, blockSize);
+
+    //  Fit internals: the hit/error scratch buffers must hold the largest fit launched below (N = 5 hits)
+    auto hitsDevice = cms::alpakatools::make_device_buffer<double[]>(
+        queue, maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<5>) / sizeof(double));
+    auto hits_geDevice = cms::alpakatools::make_device_buffer<float[]>(
+        queue, maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<5>) / sizeof(float));
+    auto fast_fit_resultsDevice = cms::alpakatools::make_device_buffer<double[]>(
+        queue, maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double));
+    auto circle_fit_resultsDevice_holder =
+        cms::alpakatools::make_device_buffer<char[]>(queue, maxNumberOfConcurrentFits_ * sizeof(riemannFit::CircleFit));
+    riemannFit::CircleFit *circle_fit_resultsDevice_ =
+        (riemannFit::CircleFit *)(circle_fit_resultsDevice_holder.data());
+
+    for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) {
+      // triplets
+      alpaka::exec<Acc1D>(queue,
+                          workDivTriplets,
+                          Kernel_FastFit<3, TrackerTraits>{},
+                          tuples_,
+                          tupleMultiplicity_,
+                          3,
+                          hv,
+                          cpeParams,
+                          hitsDevice.data(),
+                          hits_geDevice.data(),
+                          fast_fit_resultsDevice.data(),
+                          offset);
+
+      alpaka::exec<Acc1D>(queue,
+                          workDivTriplets,
+                          Kernel_CircleFit<3, TrackerTraits>{},
+                          tupleMultiplicity_,
+                          3,
+                          bField_,
+                          hitsDevice.data(),
+                          hits_geDevice.data(),
+                          fast_fit_resultsDevice.data(),
+                          circle_fit_resultsDevice_,
+                          offset);
+
+      alpaka::exec<Acc1D>(queue,
+                          workDivTriplets,
+                          Kernel_LineFit<3, TrackerTraits>{},
+                          tupleMultiplicity_,
+                          3,
+                          bField_,
+                          outputSoa_,
+                          hitsDevice.data(),
+                          hits_geDevice.data(),
+                          fast_fit_resultsDevice.data(),
+                          circle_fit_resultsDevice_,
+                          offset);
+
+      // quads
+      alpaka::exec<Acc1D>(queue,
+                          workDivQuadsPenta,
+                          Kernel_FastFit<4, TrackerTraits>{},
+                          tuples_,
+                          tupleMultiplicity_,
+                          4,
+                          hv,
+                          cpeParams,
+                          hitsDevice.data(),
+                          hits_geDevice.data(),
+                          fast_fit_resultsDevice.data(),
+                          offset);
+
+      alpaka::exec<Acc1D>(queue,
+                          workDivQuadsPenta,
+                          Kernel_CircleFit<4, TrackerTraits>{},
+                          tupleMultiplicity_,
+                          4,
+                          bField_,
+                          hitsDevice.data(),
+                          hits_geDevice.data(),
+                          fast_fit_resultsDevice.data(),
+                          circle_fit_resultsDevice_,
+                          offset);
+
+      alpaka::exec<Acc1D>(queue,
+                          workDivQuadsPenta,
+                          Kernel_LineFit<4, TrackerTraits>{},
+                          tupleMultiplicity_,
+                          4,
+                          bField_,
+                          outputSoa_,
+                          hitsDevice.data(),
+                          hits_geDevice.data(),
+                          fast_fit_resultsDevice.data(),
+                          circle_fit_resultsDevice_,
+                          offset);
+
+      if (fitNas4_) {
+        // penta: fitNas4_ is set, so 5-hit tuples are fitted using only their first 4 hits
+        alpaka::exec<Acc1D>(queue,
+                            workDivQuadsPenta,
+                            Kernel_FastFit<4, TrackerTraits>{},
+                            tuples_,
+                            tupleMultiplicity_,
+                            5,
+                            hv,
+                            cpeParams,
+                            hitsDevice.data(),
+                            hits_geDevice.data(),
+                            fast_fit_resultsDevice.data(),
+                            offset);
+
+        alpaka::exec<Acc1D>(queue,
+                            workDivQuadsPenta,
+                            Kernel_CircleFit<4, TrackerTraits>{},
+                            tupleMultiplicity_,
+                            5,
+                            bField_,
+                            hitsDevice.data(),
+                            hits_geDevice.data(),
+                            fast_fit_resultsDevice.data(),
+                            circle_fit_resultsDevice_,
+                            offset);
+
+        alpaka::exec<Acc1D>(queue,
+                            workDivQuadsPenta,
+                            Kernel_LineFit<4, TrackerTraits>{},
+                            tupleMultiplicity_,
+                            5,
+                            bField_,
+                            outputSoa_,
+                            hitsDevice.data(),
+                            hits_geDevice.data(),
+                            fast_fit_resultsDevice.data(),
+                            circle_fit_resultsDevice_,
+                            offset);
+      } else {
+        // penta all 5: these kernels address 5 hit columns, hence the 5-hit sizing of the scratch buffers
+        alpaka::exec<Acc1D>(queue,
+                            workDivQuadsPenta,
+                            Kernel_FastFit<5, TrackerTraits>{},
+                            tuples_,
+                            tupleMultiplicity_,
+                            5,
+                            hv,
+                            cpeParams,
+                            hitsDevice.data(),
+                            hits_geDevice.data(),
+                            fast_fit_resultsDevice.data(),
+                            offset);
+
+        alpaka::exec<Acc1D>(queue,
+                            workDivQuadsPenta,
+                            Kernel_CircleFit<5, TrackerTraits>{},
+                            tupleMultiplicity_,
+                            5,
+                            bField_,
+                            hitsDevice.data(),
+                            hits_geDevice.data(),
+                            fast_fit_resultsDevice.data(),
+                            circle_fit_resultsDevice_,
+                            offset);
+
+        alpaka::exec<Acc1D>(queue,
+                            workDivQuadsPenta,
+                            Kernel_LineFit<5, TrackerTraits>{},
+                            tupleMultiplicity_,
+                            5,
+                            bField_,
+                            outputSoa_,
+                            hitsDevice.data(),
+                            hits_geDevice.data(),
+                            fast_fit_resultsDevice.data(),
+                            circle_fit_resultsDevice_,
+                            offset);
+      }
+    }
+  }
+
+  template class HelixFit<pixelTopology::Phase1>;
+  template class HelixFit<pixelTopology::Phase2>;
+  template class HelixFit<pixelTopology::HIonPhase1>;
+
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
diff --git a/RecoTracker/PixelSeeding/plugins/gpuPixelDoubletsAlgos.h b/RecoTracker/PixelSeeding/plugins/gpuPixelDoubletsAlgos.h
index b86ba09949416..583021081d534 100644
--- a/RecoTracker/PixelSeeding/plugins/gpuPixelDoubletsAlgos.h
+++ b/RecoTracker/PixelSeeding/plugins/gpuPixelDoubletsAlgos.h
@@ -9,15 +9,15 @@
 
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h"
 #include "DataFormats/Math/interface/approx_atan2.h"
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
 
-#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 #include "CAStructures.h"
 #include "GPUCACell.h"
 
-// #define GPU_DEBUG
-// #define NTUPLE_DEBUG
+//#define GPU_DEBUG
+//#define NTUPLE_DEBUG
 
 namespace gpuPixelDoublets {
 
@@ -287,8 +287,8 @@ namespace gpuPixelDoublets {
       }
 //      #endif
 #ifdef GPU_DEBUG
-      if (tooMany > 0)
-        printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d, %d %.3f %.3f\n",
+      if (tooMany > 0 || tot > 0)
+        printf("OuterHitOfCell for %d in layer %d/%d, %d,%d %d, %d %.3f %.3f %s\n",
                i,
                inner,
                outer,
@@ -297,7 +297,8 @@ namespace gpuPixelDoublets {
                tooMany,
                iphicut,
                TrackerTraits::minz[pairLayerId],
-               TrackerTraits::maxz[pairLayerId]);
+               TrackerTraits::maxz[pairLayerId],
+               tooMany > 0 ? "FULL!!" : "not full.");
 #endif
     }  // loop in block...
   }
diff --git a/RecoTracker/PixelSeeding/test/BuildFile.xml b/RecoTracker/PixelSeeding/test/BuildFile.xml
index 37e12c0ec6aed..74e7849e410e4 100644
--- a/RecoTracker/PixelSeeding/test/BuildFile.xml
+++ b/RecoTracker/PixelSeeding/test/BuildFile.xml
@@ -28,3 +28,10 @@
   <use name="eigen"/>
   <use name="boost"/>
 </bin>
+
+<bin file="alpaka/CAsizes_t.cpp">
+<flags ALPAKA_BACKENDS="1"/>  
+  <use name="alpaka"/>
+  <use name="eigen"/>
+  <use name="boost"/>
+</bin>
diff --git a/RecoTracker/PixelSeeding/test/alpaka/CAsizes_t.cpp b/RecoTracker/PixelSeeding/test/alpaka/CAsizes_t.cpp
new file mode 100644
index 0000000000000..770957d9a79c0
--- /dev/null
+++ b/RecoTracker/PixelSeeding/test/alpaka/CAsizes_t.cpp
@@ -0,0 +1,40 @@
+#include "RecoTracker/PixelSeeding/plugins/alpaka/CACell.h"
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+#include <typeinfo>
+#include <iostream>
+
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+template <typename T>  // prints the implementation-defined typeid name of T followed by sizeof(T)
+void print() {
+  std::cout << "size of " << typeid(T).name() << ' ' << sizeof(T) << std::endl;
+}
+
+int main() {
+  using namespace pixelTopology;
+  using namespace caStructures;
+  // sizes of the CA structures instantiated with the Phase1 traits
+  print<CACellT<Phase1>>();
+  print<CellNeighborsT<Phase1>>();
+  print<CellTracksT<Phase1>>();
+  print<OuterHitOfCellContainerT<Phase1>>();
+  print<TuplesContainerT<Phase1>>();
+  print<HitToTupleT<Phase1>>();
+  print<TupleMultiplicityT<Phase1>>();
+
+  print<CellNeighborsVectorT<Phase1>>();
+
+  // same list of structures, instantiated with the Phase2 traits
+
+  print<CACellT<Phase2>>();
+  print<CellNeighborsT<Phase2>>();
+  print<CellTracksT<Phase2>>();
+  print<OuterHitOfCellContainerT<Phase2>>();
+  print<TuplesContainerT<Phase2>>();
+  print<HitToTupleT<Phase2>>();
+  print<TupleMultiplicityT<Phase2>>();
+
+  print<CellNeighborsVectorT<Phase2>>();
+
+  return 0;
+}
diff --git a/RecoTracker/PixelTrackFitting/BuildFile.xml b/RecoTracker/PixelTrackFitting/BuildFile.xml
index b57493ad60503..c21f4634d0308 100644
--- a/RecoTracker/PixelTrackFitting/BuildFile.xml
+++ b/RecoTracker/PixelTrackFitting/BuildFile.xml
@@ -1,3 +1,4 @@
+<use name="alpaka"/>
 <use name="cuda"/>
 <use name="eigen"/>
 <use name="root"/>
@@ -13,6 +14,7 @@
 <use name="FWCore/MessageLogger"/>
 <use name="FWCore/ParameterSet"/>
 <use name="FWCore/Utilities"/>
+<use name="HeterogeneousCore/AlpakaInterface"/>
 <use name="Geometry/CommonDetUnit" source_only="1"/>
 <use name="Geometry/TrackerGeometryBuilder"/>
 <use name="MagneticField/Engine"/>
diff --git a/RecoTracker/PixelTrackFitting/interface/alpaka/BrokenLine.h b/RecoTracker/PixelTrackFitting/interface/alpaka/BrokenLine.h
new file mode 100644
index 0000000000000..9e656e2de18dc
--- /dev/null
+++ b/RecoTracker/PixelTrackFitting/interface/alpaka/BrokenLine.h
@@ -0,0 +1,634 @@
+#ifndef RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h
+#define RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h
+#include <alpaka/alpaka.hpp>
+#include <Eigen/Eigenvalues>
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace brokenline {
+    using namespace cms::alpakatools;
+    using namespace ::riemannFit;
+
+    //!< Karimäki's parameters: (phi, d, k=1/R)
+    /*!< covariance matrix: \n
+    |cov(phi,phi)|cov( d ,phi)|cov( k ,phi)| \n
+    |cov(phi, d )|cov( d , d )|cov( k , d )| \n
+    |cov(phi, k )|cov( d , k )|cov( k , k )| \n
+    as defined in Karimäki V., 1990, Effective circle fitting for particle trajectories, 
+    Nucl. Instr. and Meth. A305 (1991) 187.
+  */
+    using karimaki_circle_fit = riemannFit::CircleFit;
+
+    /*!
+    \brief data needed for the Broken Line fit procedure.
+  */
+    template <int n>
+    struct PreparedBrokenLineData {  // NOTE: circleFit() modifies radii, zInSZplane and varBeta in place
+      int qCharge;                          //!< particle charge (+1 or -1, set by prepareBrokenLineData)
+      riemannFit::Matrix2xNd<n> radii;      //!< xy data in the system in which the pre-fitted center is the origin
+      riemannFit::VectorNd<n> sTransverse;  //!< total distance traveled in the transverse plane
+                                            //   starting from the pre-fitted closest approach
+      riemannFit::VectorNd<n> sTotal;       //!< total distance traveled (three-dimensional)
+      riemannFit::VectorNd<n> zInSZplane;   //!< orthogonal coordinate to the pre-fitted line in the sz plane
+      riemannFit::VectorNd<n> varBeta;      //!< variances of the kink angles in the SZ plane
+    };
+
+    /*!
+    \brief Computes the Coulomb multiple scattering variance of the planar angle.
+    
+    \param length length of the track in the material.
+    \param bField magnetic field in GeV/cm/c.
+    \param radius radius of curvature (needed to evaluate p).
+    \param layer denotes which of the four layers of the detector is the endpoint of the 
+   *             multiple scattered track. For example, if Layer=3, then the particle has 
+   *             just gone through the material between the second and the third layer.
+    
+    \todo add another Layer variable to identify also the start point of the track, 
+   *      so if there are missing hits or multiple hits, the part of the detector that 
+   *      the particle has traversed can be exactly identified.
+    
+    \warning the formula used here assumes beta=1, and so neglects the dependence 
+   *         of theta_0 on the mass of the particle at fixed momentum.
+    
+    \return the variance of the planar angle ((theta_0)^2 /3).
+  */
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE double multScatt(
+        const TAcc& acc, const double& length, const double bField, const double radius, int layer, double slope) {
+      // p_T estimate (bField * radius), capped at 20 GeV; squared on the next line
+      auto pt2 = alpaka::math::min(acc, 20., bField * radius);
+      pt2 *= pt2;
+      // the same material budget is used for every layer, so `layer` is currently unused
+      constexpr double inv_X0 = 0.06 / 16.;  //!< inverse of radiation length of the material in cm
+      // traversed material in units of the radiation length: |length| / X0
+      const double xOverX0 = alpaka::math::abs(acc, length) * inv_X0;
+
+      //! number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned
+      constexpr double geometry_factor = 0.7;
+      constexpr double fact = geometry_factor * riemannFit::sqr(13.6 / 1000.);
+      return fact / (pt2 * (1. + riemannFit::sqr(slope))) * xOverX0 *
+             riemannFit::sqr(1. + 0.038 * alpaka::math::log(acc, xOverX0));
+    }
+
+    /*!
+    \brief Computes the 2D rotation matrix that transforms the line y=slope*x into the line y=0.
+    
+    \param slope tangent of the angle of rotation.
+    
+    \return 2D rotation matrix.
+  */
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE riemannFit::Matrix2d rotationMatrix(const TAcc& acc, double slope) {
+      // cosine and sine of the angle whose tangent is `slope`
+      const double cosine = 1. / alpaka::math::sqrt(acc, 1. + riemannFit::sqr(slope));
+      const double sine = slope * cosine;
+      riemannFit::Matrix2d rot;
+      rot << cosine, sine, -sine, cosine;  // filled row by row: (0,0), (0,1), (1,0), (1,1)
+      return rot;
+    }
+
+    /*!
+    \brief Changes the Karimäki parameters (and consequently their covariance matrix) under a 
+   *       translation of the coordinate system, such that the old origin has coordinates (x0,y0) 
+   *       in the new coordinate system. The formulas are taken from Karimäki V., 1990, Effective 
+   *       circle fitting for particle trajectories, Nucl. Instr. and Meth. A305 (1991) 187.
+    
+    \param circle circle fit in the old coordinate system. circle.par(0) is phi, circle.par(1) is d and circle.par(2) is rho. 
+    \param x0 x coordinate of the translation vector.
+    \param y0 y coordinate of the translation vector.
+    \param jacobian passed by reference in order to save stack.
+  */
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void translateKarimaki(
+        const TAcc& acc, karimaki_circle_fit& circle, double x0, double y0, riemannFit::Matrix3d& jacobian) {
+      // Read circle.par once; `scalar` is its element type with the reference stripped.
+      using scalar = typename std::remove_reference<decltype(circle.par(0))>::type;
+      scalar phi = circle.par(0);
+      scalar dee = circle.par(1);
+      scalar rho = circle.par(2);
+
+      // Avoid repeated trig. computations
+      scalar sinPhi = alpaka::math::sin(acc, phi);
+      scalar cosPhi = alpaka::math::cos(acc, phi);
+
+      // Intermediate computations for the circle parameters
+      scalar deltaPara = x0 * cosPhi + y0 * sinPhi;
+      scalar deltaOrth = x0 * sinPhi - y0 * cosPhi + dee;
+      scalar tempSmallU = 1 + rho * dee;
+      scalar tempC = -rho * y0 + tempSmallU * cosPhi;
+      scalar tempB = rho * x0 + tempSmallU * sinPhi;
+      scalar tempA = 2. * deltaOrth + rho * (riemannFit::sqr(deltaOrth) + riemannFit::sqr(deltaPara));
+      scalar tempU = alpaka::math::sqrt(acc, 1. + rho * tempA);
+
+      // Intermediate computations for the error matrix transform (jacobian is overwritten, filled row by row)
+      scalar xi = 1. / (riemannFit::sqr(tempB) + riemannFit::sqr(tempC));
+      scalar tempV = 1. + rho * deltaOrth;
+      scalar lambda = (0.5 * tempA) / (riemannFit::sqr(1. + tempU) * tempU);
+      scalar mu = 1. / (tempU * (1. + tempU)) + rho * lambda;
+      scalar zeta = riemannFit::sqr(deltaOrth) + riemannFit::sqr(deltaPara);
+      jacobian << xi * tempSmallU * tempV, -xi * riemannFit::sqr(rho) * deltaOrth, xi * deltaPara,
+          2. * mu * tempSmallU * deltaPara, 2. * mu * tempV, mu * zeta - lambda * tempA, 0, 0, 1.;
+
+      // translated circle parameters
+      // phi
+      circle.par(0) = alpaka::math::atan2(acc, tempB, tempC);
+      // d
+      circle.par(1) = tempA / (1 + tempU);
+      // rho after translation. It is invariant, so noop
+      // circle.par(2)= rho;
+
+      // translated error matrix: cov' = jacobian * cov * jacobian^T
+      circle.cov = jacobian * circle.cov * jacobian.transpose();
+    }
+
+    /*!
+    \brief Computes the data needed for the Broken Line fit procedure that are mainly common for the circle and the line fit.
+    
+    \param hits hits coordinates.
+    \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)).
+    \param bField magnetic field in GeV/cm/c.
+    \param results PreparedBrokenLineData to be filled (see description of PreparedBrokenLineData).
+  */
+    template <typename TAcc, typename M3xN, typename V4, int n>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void __attribute__((always_inline))
+    prepareBrokenLineData(const TAcc& acc,
+                          const M3xN& hits,
+                          const V4& fast_fit,
+                          const double bField,
+                          PreparedBrokenLineData<n>& results) {
+      riemannFit::Vector2d dVec;
+      riemannFit::Vector2d eVec;
+
+      int mId = 1;  // index of the hit used as middle point of the (first, mId, last) triplet
+
+      if constexpr (n > 3) {
+        riemannFit::Vector2d middle = 0.5 * (hits.block(0, n - 1, 2, 1) + hits.block(0, 0, 2, 1));
+        auto d1 = (hits.block(0, n / 2, 2, 1) - middle).squaredNorm();
+        auto d2 = (hits.block(0, n / 2 - 1, 2, 1) - middle).squaredNorm();
+        mId = d1 < d2 ? n / 2 : n / 2 - 1;
+      }
+
+      dVec = hits.block(0, mId, 2, 1) - hits.block(0, 0, 2, 1);
+      eVec = hits.block(0, n - 1, 2, 1) - hits.block(0, mId, 2, 1);
+      results.qCharge = riemannFit::cross2D(acc, dVec, eVec) > 0 ? -1 : 1;  // sign of (first->mid) x (mid->last)
+
+      const double slope = -results.qCharge / fast_fit(3);
+
+      riemannFit::Matrix2d rotMat = rotationMatrix(acc, slope);
+
+      // hit positions relative to the pre-fitted centre (radii) and transverse arc lengths (sTransverse)
+      results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * riemannFit::MatrixXd::Constant(1, n, 1);
+      eVec = -fast_fit(2) * fast_fit.head(2) / fast_fit.head(2).norm();
+      for (u_int i = 0; i < n; i++) {
+        dVec = results.radii.block(0, i, 2, 1);
+        results.sTransverse(i) =
+            results.qCharge * fast_fit(2) *
+            alpaka::math::atan2(
+                acc, riemannFit::cross2D(acc, dVec, eVec), dVec.dot(eVec));  // calculates the arc length
+      }
+      riemannFit::VectorNd<n> zVec = hits.block(2, 0, 1, n).transpose();
+
+      // rotate each (sTransverse, z) point by rotMat: row 0 becomes sTotal, row 1 becomes zInSZplane
+      riemannFit::Matrix2xNd<n> pointsSZ = riemannFit::Matrix2xNd<n>::Zero();
+      for (u_int i = 0; i < n; i++) {
+        pointsSZ(0, i) = results.sTransverse(i);
+        pointsSZ(1, i) = zVec(i);
+        pointsSZ.block(0, i, 2, 1) = rotMat * pointsSZ.block(0, i, 2, 1);
+      }
+      results.sTotal = pointsSZ.block(0, 0, 1, n).transpose();
+      results.zInSZplane = pointsSZ.block(1, 0, 1, n).transpose();
+
+      // varBeta: zero at both end hits; each interior hit sums multScatt over its two adjacent segments
+      results.varBeta(0) = results.varBeta(n - 1) = 0;
+      for (u_int i = 1; i < n - 1; i++) {
+        results.varBeta(i) =
+            multScatt(acc, results.sTotal(i + 1) - results.sTotal(i), bField, fast_fit(2), i + 2, slope) +
+            multScatt(acc, results.sTotal(i) - results.sTotal(i - 1), bField, fast_fit(2), i + 1, slope);
+      }
+    }
+
+    /*!
+    \brief Computes the n-by-n band matrix obtained minimizing the Broken Line's cost function w.r.t u. 
+   *       This is the whole matrix in the case of the line fit and the main n-by-n block in the case 
+   *       of the circle fit.
+    
+    \param weights weights of the first part of the cost function, the one with the measurements 
+   *         and not the angles (\sum_{i=1}^n w*(y_i-u_i)^2).
+    \param sTotal total distance traveled by the particle from the pre-fitted closest approach.
+    \param varBeta kink angles' variance.
+    
+    \return the n-by-n matrix of the linear system
+  */
+    template <typename TAcc, int n>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE riemannFit::MatrixNd<n> matrixC_u(const TAcc& acc,
+                                                                     const riemannFit::VectorNd<n>& weights,
+                                                                     const riemannFit::VectorNd<n>& sTotal,
+                                                                     const riemannFit::VectorNd<n>& varBeta) {
+      riemannFit::MatrixNd<n> c_uMat = riemannFit::MatrixNd<n>::Zero();  // only diagonal and upper band are filled
+      for (u_int i = 0; i < n; i++) {
+        c_uMat(i, i) = weights(i);
+        if (i > 1)
+          c_uMat(i, i) += 1. / (varBeta(i - 1) * riemannFit::sqr(sTotal(i) - sTotal(i - 1)));
+        if (i > 0 && i < n - 1)
+          c_uMat(i, i) +=
+              (1. / varBeta(i)) * riemannFit::sqr((sTotal(i + 1) - sTotal(i - 1)) /
+                                                  ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1))));
+        if (i < n - 2)
+          c_uMat(i, i) += 1. / (varBeta(i + 1) * riemannFit::sqr(sTotal(i + 1) - sTotal(i)));
+
+        if (i > 0 && i < n - 1)
+          c_uMat(i, i + 1) =
+              1. / (varBeta(i) * (sTotal(i + 1) - sTotal(i))) *
+              (-(sTotal(i + 1) - sTotal(i - 1)) / ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1))));
+        if (i < n - 2)
+          c_uMat(i, i + 1) +=
+              1. / (varBeta(i + 1) * (sTotal(i + 1) - sTotal(i))) *
+              (-(sTotal(i + 2) - sTotal(i)) / ((sTotal(i + 2) - sTotal(i + 1)) * (sTotal(i + 1) - sTotal(i))));
+
+        if (i < n - 2)
+          c_uMat(i, i + 2) = 1. / (varBeta(i + 1) * (sTotal(i + 2) - sTotal(i + 1)) * (sTotal(i + 1) - sTotal(i)));
+
+        c_uMat(i, i) *= 0.5;  // halved because the transpose is added below, which doubles the diagonal
+      }
+      return c_uMat + c_uMat.transpose();  // symmetrise: mirrors the upper band into the lower one
+    }
+
+    /*!
+    \brief A very fast helix fit.
+    
+    \param hits the measured hits.
+    
+    \param result filled with (X0,Y0,R,tan(theta)); the function itself returns void.
+    
+    \warning sign of theta is (intentionally, for now) mistaken for negative charges.
+  */
+
+    template <typename TAcc, typename M3xN, typename V4>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void fastFit(const TAcc& acc, const M3xN& hits, V4& result) {
+      constexpr uint32_t n = M3xN::ColsAtCompileTime;
+
+      int mId = 1;  // index of the hit used as middle point of the (first, mId, last) triplet
+      if constexpr (n > 3) {
+        riemannFit::Vector2d middle = 0.5 * (hits.block(0, n - 1, 2, 1) + hits.block(0, 0, 2, 1));
+        auto d1 = (hits.block(0, n / 2, 2, 1) - middle).squaredNorm();
+        auto d2 = (hits.block(0, n / 2 - 1, 2, 1) - middle).squaredNorm();
+        mId = d1 < d2 ? n / 2 : n / 2 - 1;
+      }
+
+      const riemannFit::Vector2d a = hits.block(0, mId, 2, 1) - hits.block(0, 0, 2, 1);
+      const riemannFit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, mId, 2, 1);
+      const riemannFit::Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1);
+
+      auto tmp = 0.5 / riemannFit::cross2D(acc, c, a);
+      result(0) = hits(0, 0) - (a(1) * c.squaredNorm() + c(1) * a.squaredNorm()) * tmp;
+      result(1) = hits(1, 0) + (a(0) * c.squaredNorm() + c(0) * a.squaredNorm()) * tmp;
+      // (X0, Y0) from the three hits; collinear hits make the cross product zero and are not guarded against
+
+      result(2) = alpaka::math::sqrt(acc, a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) /
+                  (2. * alpaka::math::abs(acc, riemannFit::cross2D(acc, b, a)));
+      // radius from R = abc / (4A), with A the area of the triangle of the three hits
+
+      const riemannFit::Vector2d d = hits.block(0, 0, 2, 1) - result.head(2);
+      const riemannFit::Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2);
+
+      // ds/dz slope between last and first point (dPhi: angle between them as seen from the fitted centre)
+      const auto dPhi = alpaka::math::atan2(acc, riemannFit::cross2D(acc, d, e), d.dot(e));
+      result(3) = result(2) * dPhi / (hits(2, n - 1) - hits(2, 0));
+    }
+
+    /*!
+    \brief Performs the Broken Line fit in the curved track case (that is, the fit 
+   *       parameters are the interceptions u and the curvature correction \Delta\kappa).
+    
+    \param hits hits coordinates.
+    \param hits_ge per-hit covariance, one column per hit; entries 0-2 are read as (xx, xy, yy).
+    \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)).
+    \param bField magnetic field in GeV/cm/c.
+    \param data PreparedBrokenLineData.
+    \param circle_results struct to be filled with the results in this form:
+    -par parameter of the line in this form: (phi, d, k); \n
+    -cov covariance matrix of the fitted parameter; \n
+    -chi2 value of the cost function in the minimum.
+    
+    \details The function implements the steps 2 and 3 of the Broken Line fit 
+   *         with the curvature correction.\n
+   * The step 2 is the least square fit, done by imposing the minimum constraint on 
+   * the cost function and solving the consequent linear system. It determines the 
+   * fitted parameters u and \Delta\kappa and their covariance matrix.
+   * The step 3 is the correction of the fast pre-fitted parameters for the innermost 
+   * part of the track. It is first done in a comfortable coordinate system (the one 
+   * in which the first hit is the origin) and then the parameters and their 
+   * covariance matrix are transformed to the original coordinate system.
+  */
+    template <typename TAcc, typename M3xN, typename M6xN, typename V4, int n>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void circleFit(const TAcc& acc,
+                                                  const M3xN& hits,
+                                                  const M6xN& hits_ge,
+                                                  const V4& fast_fit,
+                                                  const double bField,
+                                                  PreparedBrokenLineData<n>& data,
+                                                  karimaki_circle_fit& circle_results) {
+      circle_results.qCharge = data.qCharge;
+      auto& radii = data.radii;
+      const auto& sTransverse = data.sTransverse;
+      const auto& sTotal = data.sTotal;
+      auto& zInSZplane = data.zInSZplane;
+      auto& varBeta = data.varBeta;
+      const double slope = -circle_results.qCharge / fast_fit(3);
+      varBeta *= 1. + riemannFit::sqr(slope);  // the kink angles are projected! (rescales data.varBeta in place)
+
+      for (u_int i = 0; i < n; i++) {
+        zInSZplane(i) = radii.block(0, i, 2, 1).norm() - fast_fit(2);  // overwritten: radial offset from pre-fit R
+      }
+
+      riemannFit::Matrix2d vMat;           // covariance matrix
+      riemannFit::VectorNd<n> weightsVec;  // weights
+      riemannFit::Matrix2d rotMat;         // rotation matrix point by point
+      for (u_int i = 0; i < n; i++) {
+        vMat(0, 0) = hits_ge.col(i)[0];               // x errors
+        vMat(0, 1) = vMat(1, 0) = hits_ge.col(i)[1];  // cov_xy
+        vMat(1, 1) = hits_ge.col(i)[2];               // y errors
+        rotMat = rotationMatrix(acc, -radii(0, i) / radii(1, i));
+        weightsVec(i) =
+            1. / ((rotMat * vMat * rotMat.transpose())(1, 1));  // compute the orthogonal weight point by point
+      }
+
+      riemannFit::VectorNplusONEd<n> r_uVec;
+      r_uVec(n) = 0;
+      for (u_int i = 0; i < n; i++) {
+        r_uVec(i) = weightsVec(i) * zInSZplane(i);
+      }
+
+      riemannFit::MatrixNplusONEd<n> c_uMat;
+      c_uMat.block(0, 0, n, n) = matrixC_u(acc, weightsVec, sTransverse, varBeta);
+      c_uMat(n, n) = 0;
+      //add the border to the c_uMat matrix
+      for (u_int i = 0; i < n; i++) {
+        c_uMat(i, n) = 0;
+        if (i > 0 && i < n - 1) {
+          c_uMat(i, n) +=
+              -(sTransverse(i + 1) - sTransverse(i - 1)) * (sTransverse(i + 1) - sTransverse(i - 1)) /
+              (2. * varBeta(i) * (sTransverse(i + 1) - sTransverse(i)) * (sTransverse(i) - sTransverse(i - 1)));
+        }
+        if (i > 1) {
+          c_uMat(i, n) +=
+              (sTransverse(i) - sTransverse(i - 2)) / (2. * varBeta(i - 1) * (sTransverse(i) - sTransverse(i - 1)));
+        }
+        if (i < n - 2) {
+          c_uMat(i, n) +=
+              (sTransverse(i + 2) - sTransverse(i)) / (2. * varBeta(i + 1) * (sTransverse(i + 1) - sTransverse(i)));
+        }
+        c_uMat(n, i) = c_uMat(i, n);
+        if (i > 0 && i < n - 1)
+          c_uMat(n, n) += riemannFit::sqr(sTransverse(i + 1) - sTransverse(i - 1)) / (4. * varBeta(i));
+      }
+
+#ifdef CPP_DUMP
+      std::cout << "CU5\n" << c_uMat << std::endl;
+#endif
+      riemannFit::MatrixNplusONEd<n> iMat;
+      math::cholesky::invert(c_uMat, iMat);
+#ifdef CPP_DUMP
+      std::cout << "I5\n" << iMat << std::endl;
+#endif
+      riemannFit::VectorNplusONEd<n> uVec = iMat * r_uVec;  // obtain the fitted parameters by solving the linear system
+
+      // compute (phi, d_ca, k) in the system in which the midpoint of the first two corrected hits is the origin...
+
+      radii.block(0, 0, 2, 1) /= radii.block(0, 0, 2, 1).norm();  // data.radii columns 0 and 1 become unit vectors
+      radii.block(0, 1, 2, 1) /= radii.block(0, 1, 2, 1).norm();
+
+      riemannFit::Vector2d dVec = hits.block(0, 0, 2, 1) + (-zInSZplane(0) + uVec(0)) * radii.block(0, 0, 2, 1);
+      riemannFit::Vector2d eVec = hits.block(0, 1, 2, 1) + (-zInSZplane(1) + uVec(1)) * radii.block(0, 1, 2, 1);
+      auto eMinusd = eVec - dVec;
+      auto eMinusd2 = eMinusd.squaredNorm();
+      auto tmp1 = 1. / eMinusd2;
+      auto tmp2 = alpaka::math::sqrt(acc, riemannFit::sqr(fast_fit(2)) - 0.25 * eMinusd2);
+
+      circle_results.par << alpaka::math::atan2(acc, eMinusd(1), eMinusd(0)),
+          circle_results.qCharge * (tmp2 - fast_fit(2)), circle_results.qCharge * (1. / fast_fit(2) + uVec(n));
+
+      tmp2 = 1. / tmp2;
+
+      riemannFit::Matrix3d jacobian;
+      jacobian << (radii(1, 0) * eMinusd(0) - eMinusd(1) * radii(0, 0)) * tmp1,
+          (radii(1, 1) * eMinusd(0) - eMinusd(1) * radii(0, 1)) * tmp1, 0,
+          circle_results.qCharge * (eMinusd(0) * radii(0, 0) + eMinusd(1) * radii(1, 0)) * tmp2,
+          circle_results.qCharge * (eMinusd(0) * radii(0, 1) + eMinusd(1) * radii(1, 1)) * tmp2, 0, 0, 0,
+          circle_results.qCharge;
+
+      circle_results.cov << iMat(0, 0), iMat(0, 1), iMat(0, n), iMat(1, 0), iMat(1, 1), iMat(1, n), iMat(n, 0),
+          iMat(n, 1), iMat(n, n);
+
+      circle_results.cov = jacobian * circle_results.cov * jacobian.transpose();
+
+      //...Translate in the system in which the first corrected hit is the origin, adding the m.s. correction...
+
+      translateKarimaki(acc, circle_results, 0.5 * eMinusd(0), 0.5 * eMinusd(1), jacobian);
+      circle_results.cov(0, 0) +=
+          (1 + riemannFit::sqr(slope)) * multScatt(acc, sTotal(1) - sTotal(0), bField, fast_fit(2), 2, slope);
+
+      //...And translate back to the original system
+
+      translateKarimaki(acc, circle_results, dVec(0), dVec(1), jacobian);
+
+      // compute chi2
+      circle_results.chi2 = 0;
+      for (u_int i = 0; i < n; i++) {
+        circle_results.chi2 += weightsVec(i) * riemannFit::sqr(zInSZplane(i) - uVec(i));
+        if (i > 0 && i < n - 1)
+          circle_results.chi2 +=
+              riemannFit::sqr(uVec(i - 1) / (sTransverse(i) - sTransverse(i - 1)) -
+                              uVec(i) * (sTransverse(i + 1) - sTransverse(i - 1)) /
+                                  ((sTransverse(i + 1) - sTransverse(i)) * (sTransverse(i) - sTransverse(i - 1))) +
+                              uVec(i + 1) / (sTransverse(i + 1) - sTransverse(i)) +
+                              (sTransverse(i + 1) - sTransverse(i - 1)) * uVec(n) / 2) /
+              varBeta(i);
+      }
+    }
+
+    /*!
+    \brief Performs the Broken Line fit in the straight track case (that is, the fit parameters are only the interceptions u).
+    
+    \param hits_ge per-hit covariance, one column per hit, read as (xx, xy, yy, xz, yz, zz).
+    \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)).
+    \param bField magnetic field in GeV/cm/c.
+    \param data PreparedBrokenLineData.
+    \param line_results struct to be filled with the results in this form:
+    -par parameter of the line in this form: (cot(theta), Zip); \n
+    -cov covariance matrix of the fitted parameter; \n
+    -chi2 value of the cost function in the minimum.
+    
+    \details The function implements the steps 2 and 3 of the Broken Line fit without 
+   *        the curvature correction.\n
+   * The step 2 is the least square fit, done by imposing the minimum constraint 
+   * on the cost function and solving the consequent linear system. It determines 
+   * the fitted parameters u and their covariance matrix.
+   * The step 3 is the correction of the fast pre-fitted parameters for the innermost 
+   * part of the track. It is first done in a comfortable coordinate system (the one 
+   * in which the first hit is the origin) and then the parameters and their covariance 
+   * matrix are transformed to the original coordinate system.
+   */
+    template <typename TAcc, typename V4, typename M6xN, int n>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void lineFit(const TAcc& acc,
+                                                const M6xN& hits_ge,
+                                                const V4& fast_fit,
+                                                const double bField,
+                                                const PreparedBrokenLineData<n>& data,
+                                                riemannFit::LineFit& line_results) {
+      const auto& radii = data.radii;
+      const auto& sTotal = data.sTotal;
+      const auto& zInSZplane = data.zInSZplane;
+      const auto& varBeta = data.varBeta;
+
+      const double slope = -data.qCharge / fast_fit(3);
+      riemannFit::Matrix2d rotMat = rotationMatrix(acc, slope);
+
+      riemannFit::Matrix3d vMat = riemannFit::Matrix3d::Zero();  // covariance matrix XYZ
+      riemannFit::Matrix2x3d jacobXYZtosZ =
+          riemannFit::Matrix2x3d::Zero();  // jacobian for computation of the error on s (xyz -> sz)
+      riemannFit::VectorNd<n> weights = riemannFit::VectorNd<n>::Zero();
+      for (u_int i = 0; i < n; i++) {
+        vMat(0, 0) = hits_ge.col(i)[0];               // x errors
+        vMat(0, 1) = vMat(1, 0) = hits_ge.col(i)[1];  // cov_xy
+        vMat(0, 2) = vMat(2, 0) = hits_ge.col(i)[3];  // cov_xz
+        vMat(1, 1) = hits_ge.col(i)[2];               // y errors
+        vMat(2, 1) = vMat(1, 2) = hits_ge.col(i)[4];  // cov_yz
+        vMat(2, 2) = hits_ge.col(i)[5];               // z errors
+        auto tmp = 1. / radii.block(0, i, 2, 1).norm();
+        jacobXYZtosZ(0, 0) = radii(1, i) * tmp;
+        jacobXYZtosZ(0, 1) = -radii(0, i) * tmp;
+        jacobXYZtosZ(1, 2) = 1.;
+        weights(i) = 1. / ((rotMat * jacobXYZtosZ * vMat * jacobXYZtosZ.transpose() * rotMat.transpose())(
+                              1, 1));  // compute the orthogonal weight point by point
+      }
+
+      riemannFit::VectorNd<n> r_u;
+      for (u_int i = 0; i < n; i++) {
+        r_u(i) = weights(i) * zInSZplane(i);
+      }
+#ifdef CPP_DUMP
+      std::cout << "CU4\n" << matrixC_u(acc, weights, sTotal, varBeta) << std::endl;
+#endif
+      riemannFit::MatrixNd<n> iMat;
+      math::cholesky::invert(matrixC_u(acc, weights, sTotal, varBeta), iMat);
+#ifdef CPP_DUMP
+      std::cout << "I4\n" << iMat << std::endl;
+#endif
+
+      riemannFit::VectorNd<n> uVec = iMat * r_u;  // obtain the fitted parameters by solving the linear system
+
+      // line parameters in the system in which the first hit is the origin and with axis along SZ
+      line_results.par << (uVec(1) - uVec(0)) / (sTotal(1) - sTotal(0)), uVec(0);
+      auto idiff = 1. / (sTotal(1) - sTotal(0));
+      line_results.cov << (iMat(0, 0) - 2 * iMat(0, 1) + iMat(1, 1)) * riemannFit::sqr(idiff) +
+                              multScatt(acc, sTotal(1) - sTotal(0), bField, fast_fit(2), 2, slope),
+          (iMat(0, 1) - iMat(0, 0)) * idiff, (iMat(0, 1) - iMat(0, 0)) * idiff, iMat(0, 0);
+
+      // translate to the original SZ system
+      riemannFit::Matrix2d jacobian;
+      jacobian(0, 0) = 1.;
+      jacobian(0, 1) = 0;
+      jacobian(1, 0) = -sTotal(0);
+      jacobian(1, 1) = 1.;
+      line_results.par(1) += -line_results.par(0) * sTotal(0);
+      line_results.cov = jacobian * line_results.cov * jacobian.transpose();
+
+      // rotate to the original sz system
+      auto tmp = rotMat(0, 0) - line_results.par(0) * rotMat(0, 1);
+      jacobian(1, 1) = 1. / tmp;
+      jacobian(0, 0) = jacobian(1, 1) * jacobian(1, 1);
+      jacobian(0, 1) = 0;
+      jacobian(1, 0) = line_results.par(1) * rotMat(0, 1) * jacobian(0, 0);
+      line_results.par(1) = line_results.par(1) * jacobian(1, 1);
+      line_results.par(0) = (rotMat(0, 1) + line_results.par(0) * rotMat(0, 0)) * jacobian(1, 1);
+      line_results.cov = jacobian * line_results.cov * jacobian.transpose();
+
+      // compute chi2: measurement residuals plus, for interior hits, the kink term weighted by 1 / varBeta
+      line_results.chi2 = 0;
+      for (u_int i = 0; i < n; i++) {
+        line_results.chi2 += weights(i) * riemannFit::sqr(zInSZplane(i) - uVec(i));
+        if (i > 0 && i < n - 1)
+          line_results.chi2 += riemannFit::sqr(uVec(i - 1) / (sTotal(i) - sTotal(i - 1)) -
+                                               uVec(i) * (sTotal(i + 1) - sTotal(i - 1)) /
+                                                   ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1))) +
+                                               uVec(i + 1) / (sTotal(i + 1) - sTotal(i))) /
+                               varBeta(i);
+      }
+    }
+
+    /*!
+    \brief Helix fit in three steps:
+    -fast pre-fit (see fastFit() for further info); \n
+    -circle fit of the hits projected in the transverse plane by Broken Line algorithm (see circleFit() for further info); \n
+    -line fit of the hits projected on the (pre-fitted) cylinder surface by Broken Line algorithm (see lineFit() for further info); \n
+    Points must be passed ordered (from inner to outer layer).
+    
+    \param hits Matrix3xNd hits coordinates in this form: \n
+    |x1|x2|x3|...|xn| \n
+    |y1|y2|y3|...|yn| \n
+    |z1|z2|z3|...|zn|
+    \param hits_cov Matrix3Nd covariance matrix in this form (()->cov()): \n
+    |(x1,x1)|(x2,x1)|(x3,x1)|(x4,x1)|.|(y1,x1)|(y2,x1)|(y3,x1)|(y4,x1)|.|(z1,x1)|(z2,x1)|(z3,x1)|(z4,x1)| \n
+    |(x1,x2)|(x2,x2)|(x3,x2)|(x4,x2)|.|(y1,x2)|(y2,x2)|(y3,x2)|(y4,x2)|.|(z1,x2)|(z2,x2)|(z3,x2)|(z4,x2)| \n
+    |(x1,x3)|(x2,x3)|(x3,x3)|(x4,x3)|.|(y1,x3)|(y2,x3)|(y3,x3)|(y4,x3)|.|(z1,x3)|(z2,x3)|(z3,x3)|(z4,x3)| \n
+    |(x1,x4)|(x2,x4)|(x3,x4)|(x4,x4)|.|(y1,x4)|(y2,x4)|(y3,x4)|(y4,x4)|.|(z1,x4)|(z2,x4)|(z3,x4)|(z4,x4)| \n
+    .       .       .       .       . .       .       .       .       . .       .       .       .       . \n
+    |(x1,y1)|(x2,y1)|(x3,y1)|(x4,y1)|.|(y1,y1)|(y2,y1)|(y3,y1)|(y4,y1)|.|(z1,y1)|(z2,y1)|(z3,y1)|(z4,y1)| \n
+    |(x1,y2)|(x2,y2)|(x3,y2)|(x4,y2)|.|(y1,y2)|(y2,y2)|(y3,y2)|(y4,y2)|.|(z1,y2)|(z2,y2)|(z3,y2)|(z4,y2)| \n
+    |(x1,y3)|(x2,y3)|(x3,y3)|(x4,y3)|.|(y1,y3)|(y2,y3)|(y3,y3)|(y4,y3)|.|(z1,y3)|(z2,y3)|(z3,y3)|(z4,y3)| \n
+    |(x1,y4)|(x2,y4)|(x3,y4)|(x4,y4)|.|(y1,y4)|(y2,y4)|(y3,y4)|(y4,y4)|.|(z1,y4)|(z2,y4)|(z3,y4)|(z4,y4)| \n
+    .       .       .    .          . .       .       .       .       . .       .       .       .       . \n
+    |(x1,z1)|(x2,z1)|(x3,z1)|(x4,z1)|.|(y1,z1)|(y2,z1)|(y3,z1)|(y4,z1)|.|(z1,z1)|(z2,z1)|(z3,z1)|(z4,z1)| \n
+    |(x1,z2)|(x2,z2)|(x3,z2)|(x4,z2)|.|(y1,z2)|(y2,z2)|(y3,z2)|(y4,z2)|.|(z1,z2)|(z2,z2)|(z3,z2)|(z4,z2)| \n
+    |(x1,z3)|(x2,z3)|(x3,z3)|(x4,z3)|.|(y1,z3)|(y2,z3)|(y3,z3)|(y4,z3)|.|(z1,z3)|(z2,z3)|(z3,z3)|(z4,z3)| \n
+    |(x1,z4)|(x2,z4)|(x3,z4)|(x4,z4)|.|(y1,z4)|(y2,z4)|(y3,z4)|(y4,z4)|.|(z1,z4)|(z2,z4)|(z3,z4)|(z4,z4)|
+    \param bField magnetic field in the center of the detector in GeV/cm/c, in order to perform the p_t calculation.
+    
+    \warning see circleFit(), lineFit() and fastFit() warnings.
+    
+    \bug see circleFit(), lineFit() and fastFit() bugs.
+    
+    \return (via \p helix) (phi,Tip,p_t,cot(theta),Zip), their covariance matrix and the chi2's of the circle and line fits.
+  */
+
+    template <int n>
+    class helixFit {  // NOTE(review): hits_ge has 4 fixed columns but the fits read col(i) for i < n; n > 4 looks unsafe
+    public:
+      template <typename TAcc>
+      ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const TAcc& acc,
+                                                     const riemannFit::Matrix3xNd<n>* hits,
+                                                     const Eigen::Matrix<float, 6, 4>* hits_ge,
+                                                     const double bField,
+                                                     riemannFit::HelixFit* helix) const {
+        riemannFit::Vector4d fast_fit;
+        fastFit(acc, *hits, fast_fit);
+
+        PreparedBrokenLineData<n> data;
+        karimaki_circle_fit circle;
+        riemannFit::LineFit line;
+        riemannFit::Matrix3d jacobian;
+        // lineFit must run before circleFit: circleFit rescales data.varBeta and overwrites data.zInSZplane
+        prepareBrokenLineData(acc, *hits, fast_fit, bField, data);
+        lineFit(acc, *hits_ge, fast_fit, bField, data, line);
+        circleFit(acc, *hits, *hits_ge, fast_fit, bField, data, circle);
+
+        // the circle fit gives k, but here we want p_t, so let's change the parameter and the covariance matrix
+        jacobian << 1., 0, 0, 0, 1., 0, 0, 0,
+            -alpaka::math::abs(acc, circle.par(2)) * bField / (riemannFit::sqr(circle.par(2)) * circle.par(2));
+        circle.par(2) = bField / alpaka::math::abs(acc, circle.par(2));
+        circle.cov = jacobian * circle.cov * jacobian.transpose();
+
+        helix->par << circle.par, line.par;
+        helix->cov = riemannFit::MatrixXd::Zero(5, 5);  // off-diagonal circle/line blocks stay zero
+        helix->cov.block(0, 0, 3, 3) = circle.cov;
+        helix->cov.block(3, 3, 2, 2) = line.cov;
+        helix->qCharge = circle.qCharge;
+        helix->chi2_circle = circle.chi2;
+        helix->chi2_line = line.chi2;
+      }
+    };
+  }  // namespace brokenline
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+#endif  // RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h
diff --git a/RecoTracker/PixelTrackFitting/interface/alpaka/FitResult.h b/RecoTracker/PixelTrackFitting/interface/alpaka/FitResult.h
new file mode 100644
index 0000000000000..3daf271a5ca13
--- /dev/null
+++ b/RecoTracker/PixelTrackFitting/interface/alpaka/FitResult.h
@@ -0,0 +1,64 @@
+#ifndef RecoPixelVertexing_PixelTrackFitting_interface_FitResult_h
+#define RecoPixelVertexing_PixelTrackFitting_interface_FitResult_h
+
+#include <cmath>
+#include <cstdint>
+
+#include <Eigen/Core>
+#include <Eigen/Eigenvalues>
+
+namespace riemannFit {
+
+  // Fixed-size, double-precision Eigen vector and matrix aliases shared by the fit result structures below.
+  using Vector2d = Eigen::Vector2d;
+  using Vector3d = Eigen::Vector3d;
+  using Vector4d = Eigen::Vector4d;
+  using Vector5d = Eigen::Matrix<double, 5, 1>;
+  using Matrix2d = Eigen::Matrix2d;
+  using Matrix3d = Eigen::Matrix3d;
+  using Matrix4d = Eigen::Matrix4d;
+  using Matrix5d = Eigen::Matrix<double, 5, 5>;
+  using Matrix6d = Eigen::Matrix<double, 6, 6>;
+
+  // 3 x N matrix of doubles, with the number of columns N fixed at compile time.
+  template <int N>
+  using Matrix3xNd = Eigen::Matrix<double, 3, N>;  // used for inputs hits
+
+  // Result of a circle fit: three parameters, their 3x3 covariance, the charge and the chi2.
+  // NOTE(review): chi2 is stored as float here while LineFit::chi2 is a double - confirm this is intended.
+  struct CircleFit {
+    Vector3d par;  //!< parameter: (X0,Y0,R)
+    Matrix3d cov;
+    /*!< covariance matrix: \n
+      |cov(X0,X0)|cov(Y0,X0)|cov( R,X0)| \n
+      |cov(X0,Y0)|cov(Y0,Y0)|cov( R,Y0)| \n
+      |cov(X0, R)|cov(Y0, R)|cov( R, R)|
+    */
+    int32_t qCharge;  //!< particle charge
+    float chi2;       //!< chi2 of the circle fit
+  };
+
+  // Result of a line fit: two parameters, their 2x2 covariance and the chi2.
+  struct LineFit {
+    Vector2d par;  //!<(cotan(theta),Zip)
+    Matrix2d cov;
+    /*!<
+      |cov(c_t,c_t)|cov(Zip,c_t)| \n
+      |cov(c_t,Zip)|cov(Zip,Zip)|
+    */
+    double chi2;  //!< chi2 of the line fit
+  };
+
+  // Result of a helix fit: five parameters, their 5x5 covariance, the chi2 of the circle
+  // and line components, and the charge.
+  struct HelixFit {
+    Vector5d par;  //!<(phi,Tip,pt,cotan(theta),Zip)
+    Matrix5d cov;
+    /*!< ()->cov() \n
+      |(phi,phi)|(Tip,phi)|(p_t,phi)|(c_t,phi)|(Zip,phi)| \n
+      |(phi,Tip)|(Tip,Tip)|(p_t,Tip)|(c_t,Tip)|(Zip,Tip)| \n
+      |(phi,p_t)|(Tip,p_t)|(p_t,p_t)|(c_t,p_t)|(Zip,p_t)| \n
+      |(phi,c_t)|(Tip,c_t)|(p_t,c_t)|(c_t,c_t)|(Zip,c_t)| \n
+      |(phi,Zip)|(Tip,Zip)|(p_t,Zip)|(c_t,Zip)|(Zip,Zip)|
+    */
+    float chi2_circle;  //!< chi2 of the circle component of the fit
+    float chi2_line;    //!< chi2 of the line component of the fit
+    //    Vector4d fast_fit;
+    int32_t qCharge;  //!< particle charge
+  };                  // __attribute__((aligned(16)));
+
+}  // namespace riemannFit
+#endif
diff --git a/RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h b/RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h
new file mode 100644
index 0000000000000..5dfa609ad3905
--- /dev/null
+++ b/RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h
@@ -0,0 +1,253 @@
+#ifndef RecoPixelVertexing_PixelTrackFitting_alpaka_FitUtils_h
+#define RecoPixelVertexing_PixelTrackFitting_alpaka_FitUtils_h
+#include <alpaka/alpaka.hpp>
+#include "DataFormats/Math/interface/choleskyInversion.h"
+#include "RecoTracker/PixelTrackFitting/interface/alpaka/FitResult.h"
+namespace riemannFit {
+
+  constexpr double epsilon = 1.e-4;  //!< used in numerical derivative (J2 in Circle_fit())
+
+  // Dynamic-size, double-precision Eigen types.
+  using VectorXd = Eigen::VectorXd;
+  using MatrixXd = Eigen::MatrixXd;
+
+  // Fixed-size, double-precision Eigen types parametrised on a compile-time size N.
+  // The alias name spells out the shape: e.g. Matrix2Nd is (2N x 2N), Matrix2xNd is (2 x N).
+  template <int N>
+  using MatrixNd = Eigen::Matrix<double, N, N>;
+  template <int N>
+  using MatrixNplusONEd = Eigen::Matrix<double, N + 1, N + 1>;
+  template <int N>
+  using ArrayNd = Eigen::Array<double, N, N>;
+  template <int N>
+  using Matrix2Nd = Eigen::Matrix<double, 2 * N, 2 * N>;
+  template <int N>
+  using Matrix3Nd = Eigen::Matrix<double, 3 * N, 3 * N>;
+  template <int N>
+  using Matrix2xNd = Eigen::Matrix<double, 2, N>;
+  template <int N>
+  using Array2xNd = Eigen::Array<double, 2, N>;
+  template <int N>
+  using MatrixNx3d = Eigen::Matrix<double, N, 3>;
+  template <int N>
+  using MatrixNx5d = Eigen::Matrix<double, N, 5>;
+  template <int N>
+  using VectorNd = Eigen::Matrix<double, N, 1>;
+  template <int N>
+  using VectorNplusONEd = Eigen::Matrix<double, N + 1, 1>;
+  template <int N>
+  using Vector2Nd = Eigen::Matrix<double, 2 * N, 1>;
+  template <int N>
+  using Vector3Nd = Eigen::Matrix<double, 3 * N, 1>;
+  // (1 x N) row vector. The third template argument is the number of columns: the previous
+  // <double, 1, 1, N> spelling declared a 1x1 matrix and passed N as the Eigen "Options" flag.
+  template <int N>
+  using RowVectorNd = Eigen::Matrix<double, 1, N>;
+  template <int N>
+  using RowVector2Nd = Eigen::Matrix<double, 1, 2 * N>;
+
+  using Matrix2x3d = Eigen::Matrix<double, 2, 3>;
+
+  // Single-precision Eigen types.
+  using Matrix3f = Eigen::Matrix3f;
+  using Vector3f = Eigen::Vector3f;
+  using Vector4f = Eigen::Vector4f;
+  // NOTE(review): despite the "f" suffix this alias holds doubles; left unchanged because
+  // switching the scalar type could affect existing users - confirm which one is intended.
+  using Vector6f = Eigen::Matrix<double, 6, 1>;
+  // Convert track parameters and their covariance from the "perigee" representation
+  // to the CMSSW local-coordinate frame, whose plane is the perigee plane.
+  //   input  (ip, icov): (phi, Tip, q/pt, cotan(theta), Zip)
+  //   output (op, ocov): (q/p, dx/dz, dy/dz, x, z)
+  // The output covariance is icov propagated through the Jacobian J of the
+  // transformation: ocov = J * icov * J^T.
+  template <typename VI5, typename MI5, typename VO5, typename MO5>
+  inline void transformToPerigeePlane(VI5 const& ip, MI5 const& icov, VO5& op, MO5& ocov) {
+    // with c = cotan(theta): sin^2(theta) = 1 / (1 + c^2) and cos(theta) = c * sin(theta)
+    auto sin2T = 1. / (1. + ip(3) * ip(3));
+    auto sinT = std::sqrt(sin2T);
+    auto cosT = ip(3) * sinT;
+
+    op(0) = sinT * ip(2);  // q/p = sin(theta) * q/pt
+    op(1) = 0.;
+    op(2) = -ip(3);
+    op(3) = ip(1);
+    op(4) = -ip(4);
+
+    // Jacobian of the transformation; every entry not set below stays zero
+    Matrix5d jacobian = Matrix5d::Zero();
+
+    jacobian(0, 2) = sinT;
+    jacobian(0, 3) = -sin2T * cosT * ip(2);
+    jacobian(1, 0) = 1.;
+    jacobian(2, 3) = -1.;
+    jacobian(3, 1) = 1.;
+    jacobian(4, 4) = -1;
+
+    ocov = jacobian * icov * jacobian.transpose();
+  }
+
+}  // namespace riemannFit
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace riemannFit {
+    using namespace ::riemannFit;
+
+    /*!
+    \brief Debug helper: print every coefficient of a matrix-like object, one per line.
+    The body is compiled only when RFIT_DEBUG is defined; otherwise the call is a no-op.
+    \param acc alpaka accelerator (not used by the body).
+    \param m pointer to an object providing rows(), cols() and operator()(row, col).
+    \param prefix text printed at the beginning of every line.
+  */
+    template <typename TAcc, class C>
+    ALPAKA_FN_ACC void printIt(const TAcc& acc, C* m, const char* prefix = "") {
+#ifdef RFIT_DEBUG
+      // plain int indices match the %d conversions used in the format string
+      const int nRows = static_cast<int>(m->rows());
+      const int nCols = static_cast<int>(m->cols());
+      for (int r = 0; r < nRows; ++r) {
+        for (int c = 0; c < nCols; ++c) {
+          // %g expects a double, whatever the scalar type of the matrix is
+          printf("%s Matrix(%d,%d) = %g\n", prefix, r, c, static_cast<double>((*m)(r, c)));
+        }
+      }
+#endif
+    }
+
+    /*!
+    \brief Return the square of the argument, i.e. the argument multiplied by itself.
+  */
+    template <typename T>
+    constexpr auto sqr(const T a) -> T {
+      return a * a;
+    }
+
+    /*!
+    \brief Cross product of two 2D vectors, treated as 3D vectors with a null
+    z component; only the z component of the result is returned.
+    \param acc alpaka accelerator (not used by the body).
+    \param a first 2D vector in the product.
+    \param b second 2D vector in the product.
+    \return z component of the cross product, a.x * b.y - a.y * b.x.
+  */
+
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE double cross2D(const TAcc& acc, const Vector2d& a, const Vector2d& b) {
+      return a(0) * b(1) - a(1) * b(0);
+    }
+
+    /*!
+   *  Load the transverse (x,y) part of the hit errors, given in CMSSW format,
+   *  into the covariance matrix layout used by the fit.
+   *
+   *  Column i of ge holds the packed symmetric error matrix of hit i:
+   *    ge[] ==> [xx, xy, yy, xz, yz, zz]
+   *  i.e. the packed index maps to the matrix elements as
+   *    | 0  1  3 |
+   *    | 1  2  4 |
+   *    | 3  4  5 |
+   *  In hits_cov the x component of hit i sits at index i and the y component
+   *  at index i + N, N being the number of columns of ge. Only the (xx, xy, yy)
+   *  terms of each hit are written; all other elements are left untouched.
+   */
+    template <typename TAcc, typename M6xNf, typename M2Nd>
+    ALPAKA_FN_ACC void loadCovariance2D(const TAcc& acc, M6xNf const& ge, M2Nd& hits_cov) {
+      constexpr uint32_t hits_in_fit = M6xNf::ColsAtCompileTime;
+      for (uint32_t hit = 0; hit < hits_in_fit; ++hit) {
+        const uint32_t xIdx = hit;                // row/column of the x component of this hit
+        const uint32_t yIdx = hit + hits_in_fit;  // row/column of the y component of this hit
+        hits_cov(xIdx, xIdx) = ge(0, hit);        // xx
+        hits_cov(yIdx, yIdx) = ge(2, hit);        // yy
+        hits_cov(yIdx, xIdx) = ge(1, hit);        // xy ...
+        hits_cov(xIdx, yIdx) = hits_cov(yIdx, xIdx);  // ... mirrored to keep the matrix symmetric
+      }
+    }
+
+    /*!
+   *  Load the full (x,y,z) hit errors, given in CMSSW format, into the
+   *  covariance matrix layout used by the fit.
+   *
+   *  Column i of ge holds the packed symmetric error matrix of hit i:
+   *    ge[] ==> [xx, xy, yy, xz, yz, zz]
+   *  i.e. the packed index maps to the matrix elements as
+   *    | 0  1  3 |
+   *    | 1  2  4 |
+   *    | 3  4  5 |
+   *  In hits_cov the x, y and z components of hit i sit at indices i, i + N and
+   *  i + 2N respectively, N being the number of columns of ge. Only the terms
+   *  relating a hit to itself are written; all other elements are left untouched.
+   */
+    template <typename TAcc, typename M6xNf, typename M3xNd>
+    ALPAKA_FN_ACC void loadCovariance(const TAcc& acc, M6xNf const& ge, M3xNd& hits_cov) {
+      constexpr uint32_t hits_in_fit = M6xNf::ColsAtCompileTime;
+      for (uint32_t hit = 0; hit < hits_in_fit; ++hit) {
+        const uint32_t xIdx = hit;                    // row/column of the x component of this hit
+        const uint32_t yIdx = hit + hits_in_fit;      // row/column of the y component of this hit
+        const uint32_t zIdx = hit + 2 * hits_in_fit;  // row/column of the z component of this hit
+
+        // diagonal terms
+        hits_cov(xIdx, xIdx) = ge(0, hit);  // xx
+        hits_cov(yIdx, yIdx) = ge(2, hit);  // yy
+        hits_cov(zIdx, zIdx) = ge(5, hit);  // zz
+
+        // off-diagonal terms, mirrored to keep the matrix symmetric
+        hits_cov(yIdx, xIdx) = ge(1, hit);  // xy
+        hits_cov(xIdx, yIdx) = hits_cov(yIdx, xIdx);
+        hits_cov(zIdx, xIdx) = ge(3, hit);  // xz
+        hits_cov(xIdx, zIdx) = hits_cov(zIdx, xIdx);
+        hits_cov(zIdx, yIdx) = ge(4, hit);  // yz
+        hits_cov(yIdx, zIdx) = hits_cov(zIdx, yIdx);
+      }
+    }
+
+    /*!
+    \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,p_t) and
+    consequently covariance matrix.
+    \param acc alpaka accelerator, used for the math functions.
+    \param circle fit result holding the parameters (X0,Y0,R), the covariance
+    matrix to be transformed and the particle charge; parameters (and covariance,
+    if requested) are overwritten in place.
+    \param B magnetic field in Gev/cm/c unit.
+    \param error flag for errors computation: the covariance matrix is
+    transformed only when it is true.
+  */
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void par_uvrtopak(const TAcc& acc,
+                                                     CircleFit& circle,
+                                                     const double B,
+                                                     const bool error) {
+      Vector3d par_pak;
+      // temp0 = X0^2 + Y0^2, temp1 = sqrt(X0^2 + Y0^2)
+      const double temp0 = circle.par.head(2).squaredNorm();
+      const double temp1 = alpaka::math::sqrt(acc, temp0);
+      // new parameters: atan2(q * X0, -q * Y0), q * (sqrt(X0^2 + Y0^2) - R), R * B
+      par_pak << alpaka::math::atan2(acc, circle.qCharge * circle.par(0), -circle.qCharge * circle.par(1)),
+          circle.qCharge * (temp1 - circle.par(2)), circle.par(2) * B;
+      if (error) {
+        // NOTE(review): temp2 is divided by X0^2 and by X0 below, which evaluates to 0/0
+        // when X0 is exactly 0 - confirm that this cannot happen for valid fits.
+        const double temp2 = sqr(circle.par(0)) * 1. / temp0;
+        const double temp3 = 1. / temp1 * circle.qCharge;
+        // Jacobian of the new parameters with respect to (X0,Y0,R), filled row by row
+        Matrix3d j4Mat;
+        j4Mat << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0.,
+            circle.par(0) * temp3, circle.par(1) * temp3, -circle.qCharge, 0., 0., B;
+        circle.cov = j4Mat * circle.cov * j4Mat.transpose();
+      }
+      // the parameters are replaced only now, because the Jacobian above needs the original (X0,Y0,R)
+      circle.par = par_pak;
+    }
+
+    /*!
+    \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,q/R) and
+    consequently covariance matrix.
+    \param acc alpaka accelerator, used for the math functions.
+    \param circle fit result holding the parameters (X0,Y0,R), the covariance
+    matrix to be transformed and the particle charge; both parameters and
+    covariance are overwritten in place.
+  */
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void fromCircleToPerigee(const TAcc& acc, CircleFit& circle) {
+      Vector3d par_pak;
+      // temp0 = X0^2 + Y0^2, temp1 = sqrt(X0^2 + Y0^2)
+      const double temp0 = circle.par.head(2).squaredNorm();
+      const double temp1 = alpaka::math::sqrt(acc, temp0);
+      // new parameters: atan2(q * X0, -q * Y0), q * (sqrt(X0^2 + Y0^2) - R), q / R
+      par_pak << alpaka::math::atan2(acc, circle.qCharge * circle.par(0), -circle.qCharge * circle.par(1)),
+          circle.qCharge * (temp1 - circle.par(2)), circle.qCharge / circle.par(2);
+
+      // NOTE(review): temp2 is divided by X0^2 and by X0 below, which evaluates to 0/0
+      // when X0 is exactly 0 - confirm that this cannot happen for valid fits.
+      const double temp2 = sqr(circle.par(0)) * 1. / temp0;
+      const double temp3 = 1. / temp1 * circle.qCharge;
+      // Jacobian of the new parameters with respect to (X0,Y0,R), filled row by row
+      Matrix3d j4Mat;
+      j4Mat << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3,
+          circle.par(1) * temp3, -circle.qCharge, 0., 0., -circle.qCharge / (circle.par(2) * circle.par(2));
+      circle.cov = j4Mat * circle.cov * j4Mat.transpose();
+
+      // the parameters are replaced only now, because the Jacobian above needs the original (X0,Y0,R)
+      circle.par = par_pak;
+    }
+
+  }  // namespace riemannFit
+
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+#endif  // RecoPixelVertexing_PixelTrackFitting_alpaka_FitUtils_h
diff --git a/RecoTracker/PixelTrackFitting/interface/alpaka/RiemannFit.h b/RecoTracker/PixelTrackFitting/interface/alpaka/RiemannFit.h
new file mode 100644
index 0000000000000..8455a03e9f58f
--- /dev/null
+++ b/RecoTracker/PixelTrackFitting/interface/alpaka/RiemannFit.h
@@ -0,0 +1,1023 @@
+#ifndef RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h
+#define RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h
+#include <alpaka/alpaka.hpp>
+#include "RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+
+  namespace riemannFit {
+    using namespace ::riemannFit;
+    /*!  Compute the radiation lengths in the uniform material hypothesis.
+ *
+ * The Pixel detector, barrel and forward, is treated as a homogeneous
+ * cylinder of material, whose radiation length has been derived from the TDR
+ * plot showing that 16 cm correspond to 0.06 radiation lengths. Therefore
+ * one radiation length corresponds to 16cm/0.06 =~ 267 cm. The same number is
+ * used for every radiation length, in both the barrel and endcap regions.
+ *
+ * NB: neither angle corrections nor projections are applied inside this routine.
+ * It is therefore the responsibility of the caller to supply the proper
+ * lengths in input. These lengths are the path traveled by the particle along
+ * its trajectory, namely the so called S of the helix in 3D space.
+ *
+ * \param acc alpaka accelerator, used for the math functions.
+ * \param length_values vector of distances that will be translated into their
+ * radiation length equivalent. Each radiation length i is computed from the
+ * absolute difference between length i and the previous length i-1. The first
+ * length has no reference point (i.e. it has the dca) and is converted as is.
+ * \param rad_lengths output: incremental radiation lengths that correspond to
+ * each segment.
+ */
+
+    template <typename TAcc, typename VNd1, typename VNd2>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void computeRadLenUniformMaterial(const TAcc& acc,
+                                                                     const VNd1& length_values,
+                                                                     VNd2& rad_lengths) {
+      // inverse radiation length of the pixel detector in the uniform assumption,
+      // i.e. 0.06 radiation lengths every 16 cm
+      constexpr double invRadLength = 0.06 / 16.;
+      const uint nLengths = length_values.rows();
+      // the first segment has no predecessor: its full length is converted
+      rad_lengths(0) = length_values(0) * invRadLength;
+      for (uint curr = 1; curr < nLengths; ++curr) {
+        const auto segment = length_values(curr) - length_values(curr - 1);
+        rad_lengths(curr) = alpaka::math::abs(acc, segment) * invRadLength;
+      }
+    }
+
+    /*!
+    \brief Compute the covariance matrix along cartesian S-Z of points due to
+    multiple Coulomb scattering to be used in the line_fit, for the barrel
+    and forward cases.
+    The input covariance matrix is in the variables s-z, original and
+    unrotated.
+    The multiple scattering component is computed in the usual linear
+    approximation, using the 3D path which is computed as the square root of
+    the squared sum of the s and z components passed in.
+    Internally a rotation by theta is performed and the covariance matrix
+    returned is the one in the direction orthogonal to the rotated S3D axis,
+    i.e. along the rotated Z axis.
+    The choice of the rotation is not arbitrary, but derived from the fact that
+    putting the horizontal axis along the S3D direction allows the usage of the
+    ordinary least squares fitting techniques with the trivial parametrization y
+    = mx + q, avoiding the pathological case with m = +/- inf, that would
+    correspond to the case at eta = 0.
+    \param acc alpaka accelerator, used for the math functions.
+    \param cov_sz array of N 2x2 covariance matrices in (s,z), one per point.
+    \param fast_fit pre-fit result; elements 2 and 3 are used here.
+    \param s_arcs s coordinate of each point.
+    \param z_values z coordinate of each point.
+    \param theta not used by the body of this function.
+    \param bField magnetic field, used to compute p_t from fast_fit(2).
+    \param ret output N x N covariance matrix.
+    \note the deduced return type is void: the result is passed back through ret.
+ */
+
+    template <typename TAcc, typename V4, typename VNd1, typename VNd2, int N>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE auto scatterCovLine(const TAcc& acc,
+                                                       Matrix2d const* cov_sz,
+                                                       const V4& fast_fit,
+                                                       VNd1 const& s_arcs,
+                                                       VNd2 const& z_values,
+                                                       const double theta,
+                                                       const double bField,
+                                                       MatrixNd<N>& ret) {
+#ifdef RFIT_DEBUG
+      riemannFit::printIt(acc, &s_arcs, "Scatter_cov_line - s_arcs: ");
+#endif
+      constexpr uint n = N;
+      double p_t = alpaka::math::min(acc, 20., fast_fit(2) * bField);  // limit pt to avoid too small error!!!
+      double p_2 = p_t * p_t * (1. + 1. / sqr(fast_fit(3)));
+      VectorNd<N> rad_lengths_S;
+      // See documentation at http://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html
+      // Basically, to perform cwise operations on Matrices and Vectors, you need
+      // to transform them into Array-like objects.
+      VectorNd<N> s_values = s_arcs.array() * s_arcs.array() + z_values.array() * z_values.array();
+      s_values = s_values.array().sqrt();
+      computeRadLenUniformMaterial(acc, s_values, rad_lengths_S);
+      VectorNd<N> sig2_S;
+      sig2_S = .000225 / p_2 * (1. + 0.038 * rad_lengths_S.array().log()).abs2() * rad_lengths_S.array();
+#ifdef RFIT_DEBUG
+      riemannFit::printIt(acc, cov_sz, "Scatter_cov_line - cov_sz: ");
+#endif
+      // s components fill the upper-left n x n block, z components the lower-right one
+      Matrix2Nd<N> tmp = Matrix2Nd<N>::Zero();
+      for (uint k = 0; k < n; ++k) {
+        tmp(k, k) = cov_sz[k](0, 0);
+        tmp(k + n, k + n) = cov_sz[k](1, 1);
+        tmp(k, k + n) = tmp(k + n, k) = cov_sz[k](0, 1);
+      }
+      for (uint k = 0; k < n; ++k) {
+        for (uint l = k; l < n; ++l) {
+          // l >= k, hence min(k, l) == k: only the points before k contribute
+          for (uint i = 0; i < k; ++i) {
+            tmp(k + n, l + n) += alpaka::math::abs(acc, s_values(k) - s_values(i)) *
+                                 alpaka::math::abs(acc, s_values(l) - s_values(i)) * sig2_S(i);
+          }
+          tmp(l + n, k + n) = tmp(k + n, l + n);
+        }
+      }
+      // We are interested only in the errors orthogonal to the rotated s-axis
+      // which, in our formalism, are in the lower square matrix.
+#ifdef RFIT_DEBUG
+      riemannFit::printIt(acc, &tmp, "Scatter_cov_line - tmp: ");
+#endif
+      ret = tmp.block(n, n, n, n);
+    }
+
+    /*!
+    \brief Compute the covariance matrix (in radial coordinates) of points in
+    the transverse plane due to multiple Coulomb scattering.
+    \param acc alpaka accelerator, used for the math functions.
+    \param p2D 2D points in the transverse plane.
+    \param fast_fit fast_fit Vector4d result of the previous pre-fit
+    structured in this form:(X0, Y0, R, Tan(Theta))).
+    \param rad one value per point, presumably its transverse radius (to be
+    confirmed against the caller).
+    \param B magnetic field use to compute p
+    \return scatter_cov_rad errors due to multiple scattering.
+    \warning input points must be ordered radially from the detector center
+    (from inner layer to outer ones; points on the same layer must ordered too).
+    \details Only the tangential component is computed (the radial one is
+    negligible).
+ */
+    template <typename TAcc, typename M2xN, typename V4, int N>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE MatrixNd<N> scatter_cov_rad(
+        const TAcc& acc, const M2xN& p2D, const V4& fast_fit, VectorNd<N> const& rad, double B) {
+      constexpr uint n = N;
+      double p_t = alpaka::math::min(acc, 20., fast_fit(2) * B);  // limit pt to avoid too small error!!!
+      double p_2 = p_t * p_t * (1. + 1. / sqr(fast_fit(3)));
+      double theta = alpaka::math::atan(acc, fast_fit(3));
+      theta = theta < 0. ? theta + M_PI : theta;
+      VectorNd<N> s_values;
+      VectorNd<N> rad_lengths;
+      const Vector2d oVec(fast_fit(0), fast_fit(1));
+
+      // arc length of each point along the pre-fitted circle, measured from the origin
+      for (uint i = 0; i < n; ++i) {  // x
+        Vector2d pVec = p2D.block(0, i, 2, 1) - oVec;
+        const double cross = cross2D(acc, -oVec, pVec);
+        const double dot = (-oVec).dot(pVec);
+        const double tempAtan2 = alpaka::math::atan2(acc, cross, dot);
+        s_values(i) = alpaka::math::abs(acc, tempAtan2 * fast_fit(2));
+      }
+      computeRadLenUniformMaterial(
+          acc, s_values * alpaka::math::sqrt(acc, 1. + 1. / sqr(fast_fit(3))), rad_lengths);
+      // named differently from the function itself to avoid shadowing it
+      MatrixNd<N> scatterCov = MatrixNd<N>::Zero();
+      VectorNd<N> sig2 = (1. + 0.038 * rad_lengths.array().log()).abs2() * rad_lengths.array();
+      sig2 *= 0.000225 / (p_2 * sqr(alpaka::math::sin(acc, theta)));
+      for (uint k = 0; k < n; ++k) {
+        for (uint l = k; l < n; ++l) {
+          // l >= k, hence min(k, l) == k: only the points before k contribute
+          for (uint i = 0; i < k; ++i) {
+            scatterCov(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2(i);
+          }
+          scatterCov(l, k) = scatterCov(k, l);
+        }
+      }
+#ifdef RFIT_DEBUG
+      riemannFit::printIt(acc, &scatterCov, "Scatter_cov_rad - scatter_cov_rad: ");
+#endif
+      return scatterCov;
+    }
+
+    /*!
+    \brief Transform covariance matrix from radial (only tangential component)
+    to Cartesian coordinates (only transverse plane component).
+    \param acc alpaka accelerator, forwarded to the debug printouts.
+    \param p2D 2D points in the transverse plane.
+    \param cov_rad covariance matrix in radial coordinate.
+    \param rad one value per point; its inverse scales the point coordinates.
+    \return cov_cart covariance matrix in Cartesian coordinates, with the x
+    components in the first N rows/columns and the y components in the last N.
+*/
+
+    template <typename TAcc, typename M2xN, int N>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE Matrix2Nd<N> cov_radtocart(const TAcc& acc,
+                                                              const M2xN& p2D,
+                                                              const MatrixNd<N>& cov_rad,
+                                                              const VectorNd<N>& rad) {
+#ifdef RFIT_DEBUG
+      // %p expects a pointer to void
+      printf("Address of p2D: %p\n", static_cast<const void*>(&p2D));
+#endif
+      printIt(acc, &p2D, "cov_radtocart - p2D:");
+      constexpr uint n = N;
+      Matrix2Nd<N> cov_cart = Matrix2Nd<N>::Zero();
+      VectorNd<N> rad_inv = rad.cwiseInverse();
+      printIt(acc, &rad_inv, "cov_radtocart - rad_inv:");
+      for (uint i = 0; i < n; ++i) {
+        for (uint j = i; j < n; ++j) {
+          // fill the four (x,x), (y,y), (x,y), (y,x) blocks for the pair (i,j) ...
+          cov_cart(i, j) = cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(1, j) * rad_inv(j);
+          cov_cart(i + n, j + n) = cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(0, j) * rad_inv(j);
+          cov_cart(i, j + n) = -cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(0, j) * rad_inv(j);
+          cov_cart(i + n, j) = -cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(1, j) * rad_inv(j);
+          // ... and mirror them to keep the matrix symmetric
+          cov_cart(j, i) = cov_cart(i, j);
+          cov_cart(j + n, i + n) = cov_cart(i + n, j + n);
+          cov_cart(j + n, i) = cov_cart(i, j + n);
+          cov_cart(j, i + n) = cov_cart(i + n, j);
+        }
+      }
+      return cov_cart;
+    }
+
+    /*!
+    \brief Transform covariance matrix from Cartesian coordinates (only
+    transverse plane component) to radial coordinates (both radial and
+    tangential component but only diagonal terms, correlation between different
+    point are not managed).
+    \param acc alpaka accelerator (not used by the body).
+    \param p2D 2D points in transverse plane.
+    \param cov_cart covariance matrix in Cartesian coordinates.
+    \param rad one value per point; points with a value below 1.e-4 keep their
+    Cartesian (x,x) term.
+    \return cov_rad covariance matrix in radial coordinate.
+    \warning correlation between different point are not computed.
+*/
+    template <typename TAcc, typename M2xN, int N>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE VectorNd<N> cov_carttorad(const TAcc& acc,
+                                                             const M2xN& p2D,
+                                                             const Matrix2Nd<N>& cov_cart,
+                                                             const VectorNd<N>& rad) {
+      constexpr uint nHits = N;
+      const VectorNd<N> invRad2 = rad.cwiseInverse().array().square();
+      VectorNd<N> cov_rad;
+      for (uint hit = 0; hit < nHits; ++hit) {
+        //!< in case you have (0,0) to avoid dividing by 0 radius
+        if (rad(hit) < 1.e-4) {
+          cov_rad(hit) = cov_cart(hit, hit);
+          continue;
+        }
+        const uint yIdx = hit + nHits;  // index of the y component of this point
+        cov_rad(hit) =
+            invRad2(hit) * (cov_cart(hit, hit) * sqr(p2D(1, hit)) + cov_cart(yIdx, yIdx) * sqr(p2D(0, hit)) -
+                            2. * cov_cart(hit, yIdx) * p2D(0, hit) * p2D(1, hit));
+      }
+      return cov_rad;
+    }
+
+    /*!
+    \brief Transform covariance matrix from Cartesian coordinates (only
+    transverse plane component) to coordinates system orthogonal to the
+    pre-fitted circle in each point.
+    Further information in attached documentation.
+    \param acc alpaka accelerator, forwarded to cross2D.
+    \param p2D 2D points in transverse plane.
+    \param cov_cart covariance matrix in Cartesian coordinates.
+    \param fast_fit fast_fit Vector4d result of the previous pre-fit
+    structured in this form:(X0, Y0, R, tan(theta))).
+    \param rad one value per point; points with a value below 1.e-4 keep their
+    Cartesian (x,x) term.
+    \return cov_rad covariance matrix in the pre-fitted circle's
+    orthogonal system.
+*/
+    template <typename TAcc, typename M2xN, typename V4, int N>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE VectorNd<N> cov_carttorad_prefit(
+        const TAcc& acc, const M2xN& p2D, const Matrix2Nd<N>& cov_cart, V4& fast_fit, const VectorNd<N>& rad) {
+      constexpr uint nHits = N;
+      VectorNd<N> cov_rad;
+      for (uint hit = 0; hit < nHits; ++hit) {
+        //!< in case you have (0,0) to avoid dividing by 0 radius
+        if (rad(hit) < 1.e-4) {
+          cov_rad(hit) = cov_cart(hit, hit);  // TO FIX
+          continue;
+        }
+        // the point itself and the same point relative to the pre-fitted centre
+        Vector2d point = p2D.col(hit);
+        Vector2d fromCentre = p2D.col(hit) - fast_fit.head(2);
+        const double dotPart = point.dot(fromCentre);
+        const double crossPart = cross2D(acc, point, fromCentre);
+        const double tanC = -crossPart / dotPart;
+        const double tanC2 = sqr(tanC);
+        const uint yIdx = hit + nHits;  // index of the y component of this point
+        cov_rad(hit) = 1. / (1. + tanC2) *
+                       (cov_cart(hit, hit) + cov_cart(yIdx, yIdx) * tanC2 + 2 * cov_cart(hit, yIdx) * tanC);
+      }
+      return cov_rad;
+    }
+
+    /*!
+    \brief Compute the points' weights' vector for the circle fit when multiple
+    scattering is managed.
+    Further information in attached documentation.
+    \param acc alpaka accelerator (not used by the body).
+    \param cov_rad_inv covariance matrix inverse in radial coordinates
+    (or, better, pre-fitted circle's orthogonal system).
+    \return weight VectorNd points' weights' vector: element i is the sum of
+    column i of cov_rad_inv.
+    \bug I'm not sure this is the right way to compute the weights for non
+    diagonal cov matrix. Further investigation needed.
+*/
+
+    template <typename TAcc, int N>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE VectorNd<N> weightCircle(const TAcc& acc, const MatrixNd<N>& cov_rad_inv) {
+      // the column sums form a row vector: transpose it into the returned column vector
+      VectorNd<N> weights = cov_rad_inv.colwise().sum().transpose();
+      return weights;
+    }
+
+    /*!
+    \brief Find particle q considering the sign of cross product between
+    particles velocity (estimated by the first 2 hits) and the vector radius
+    between the first hit and the center of the fitted circle.
+    \param acc alpaka accelerator (not used by the body).
+    \param p2D 2D points in transverse plane.
+    \param par_uvr result of the circle fit in this form: (X0,Y0,R).
+    \return q int 1 or -1.
+*/
+    template <typename TAcc, typename M2xN>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE int32_t charge(const TAcc& acc, const M2xN& p2D, const Vector3d& par_uvr) {
+      // direction of flight, estimated from the first two hits
+      const auto velX = p2D(0, 1) - p2D(0, 0);
+      const auto velY = p2D(1, 1) - p2D(1, 0);
+      // vector from the first hit to the centre of the fitted circle
+      const auto toCentreX = par_uvr.x() - p2D(0, 0);
+      const auto toCentreY = par_uvr.y() - p2D(1, 0);
+      // the sign of the z component of their cross product gives the charge
+      if (velX * toCentreY - velY * toCentreX > 0) {
+        return -1;
+      }
+      return 1;
+    }
+
+    /*!
+    \brief Compute the eigenvector associated to the minimum eigenvalue.
+    \param acc alpaka accelerator (not used by the body).
+    \param A the Matrix you want to know eigenvector and eigenvalue.
+    \param chi2 the double where the chi2-related quantity will be stored.
+    \return the eigenvector associated to the minimum eigenvalue.
+    \warning double precision is needed for a correct assessment of chi2.
+    \details The minimum eigenvalue is related to chi2.
+    We exploit the fact that the matrix is symmetrical and small (2x2 for line
+    fit and 3x3 for circle fit), so the SelfAdjointEigenSolver from Eigen
+    library is used, with the computeDirect method (available only for 2x2
+    and 3x3 Matrix) which computes eigendecomposition of given matrix using a
+    fast closed-form algorithm.
+    For this optimization the matrix type must be known at compiling time.
+*/
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE Vector3d min_eigen3D(const TAcc& acc, const Matrix3d& A, double& chi2) {
+#ifdef RFIT_DEBUG
+      printf("min_eigen3D - enter\n");
+#endif
+      Eigen::SelfAdjointEigenSolver<Matrix3d> eigenSolver(3);
+      eigenSolver.computeDirect(A);
+      // minCoeff returns the smallest eigenvalue and reports its position
+      int smallest;
+      chi2 = eigenSolver.eigenvalues().minCoeff(&smallest);
+      const Vector3d minVector = eigenSolver.eigenvectors().col(smallest);
+#ifdef RFIT_DEBUG
+      printf("min_eigen3D - exit\n");
+#endif
+      return minVector;
+    }
+
+    /*!
+    \brief A faster version of min_eigen3D() where double precision is not
+    needed.
+    \param acc alpaka accelerator (not used by the body).
+    \param A the Matrix you want to know eigenvector and eigenvalue.
+    \return the eigenvector associated to the minimum eigenvalue.
+    \detail The computeDirect() method of SelfAdjointEigenSolver for 3x3 Matrix
+    indeed, uses trigonometric functions (it solves a third degree equation)
+    which speed up in single precision. The matrix is therefore converted to
+    float, decomposed, and the selected eigenvector converted back to double.
+*/
+
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE Vector3d min_eigen3D_fast(const TAcc& acc, const Matrix3d& A) {
+      Eigen::SelfAdjointEigenSolver<Matrix3f> floatSolver(3);
+      floatSolver.computeDirect(A.cast<float>());
+      // only the position of the smallest eigenvalue is needed, not its value
+      int smallest;
+      floatSolver.eigenvalues().minCoeff(&smallest);
+      return floatSolver.eigenvectors().col(smallest).cast<double>();
+    }
+
+    /*!
+    \brief 2D version of min_eigen3D().
+    \param acc alpaka accelerator (not used by the body).
+    \param aMat the Matrix you want to know eigenvector and eigenvalue.
+    \param chi2 the double where the chi2-related quantity will be stored
+    \return the eigenvector associated to the minimum eigenvalue.
+    \detail The computeDirect() method of SelfAdjointEigenSolver for 2x2 Matrix
+    does not use special math functions (just sqrt) therefore it doesn't speed up
+    significantly in single precision.
+*/
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE Vector2d min_eigen2D(const TAcc& acc, const Matrix2d& aMat, double& chi2) {
+      Eigen::SelfAdjointEigenSolver<Matrix2d> eigenSolver(2);
+      eigenSolver.computeDirect(aMat);
+      // minCoeff returns the smallest eigenvalue and reports its position
+      int smallest;
+      chi2 = eigenSolver.eigenvalues().minCoeff(&smallest);
+      return eigenSolver.eigenvectors().col(smallest);
+    }
+
+    /*!
+    \brief A very fast helix fit: it fits a circle by three points (first, middle
+    and last point) and a line by two points (first and last).
+    \param acc alpaka accelerator, used for the math functions and debug printouts.
+    \param hits points to be fitted, one column per point with rows (x, y, z).
+    \param result output, filled in this form: (X0,Y0,R,tan(theta)).
+    \warning points must be passed ordered (from internal layer to external) in
+    order to maximize accuracy and do not mistake tan(theta) sign.
+    \warning no protection is applied against aligned points (null denominator
+    in the circle centre) nor against first and last point sharing the same z.
+    \details This fast fit is used as pre-fit which is needed for:
+    - weights estimation and chi2 computation in line fit (fundamental);
+    - weights estimation and chi2 computation in circle fit (useful);
+    - computation of error due to multiple scattering.
+*/
+
+    template <typename TAcc, typename M3xN, typename V4>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void fastFit(const TAcc& acc, const M3xN& hits, V4& result) {
+      constexpr uint32_t N = M3xN::ColsAtCompileTime;
+      constexpr auto n = N;  // get the number of hits
+      printIt(acc, &hits, "Fast_fit - hits: ");
+
+      // CIRCLE FIT
+      // Make segments between middle-to-first(b) and last-to-first(c) hits
+      const Vector2d bVec = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1);
+      const Vector2d cVec = hits.block(0, n - 1, 2, 1) - hits.block(0, 0, 2, 1);
+      printIt(acc, &bVec, "Fast_fit - b: ");
+      printIt(acc, &cVec, "Fast_fit - c: ");
+      // Compute their lengths
+      auto b2 = bVec.squaredNorm();
+      auto c2 = cVec.squaredNorm();
+      // The algebra has been verified (MR). The usual approach has been followed:
+      // * use an orthogonal reference frame passing from the first point.
+      // * build the segments (chords)
+      // * build orthogonal lines through mid points
+      // * make a system and solve for X0 and Y0.
+      // * add the initial point
+      // alpaka::math::abs is used instead of an unqualified abs(), which may resolve
+      // to the integer overload depending on the headers that happen to be included
+      bool flip = alpaka::math::abs(acc, bVec.x()) < alpaka::math::abs(acc, bVec.y());
+      // swap the role of x and y so that the division below is by the larger component of b
+      auto bx = flip ? bVec.y() : bVec.x();
+      auto by = flip ? bVec.x() : bVec.y();
+      auto cx = flip ? cVec.y() : cVec.x();
+      auto cy = flip ? cVec.x() : cVec.y();
+      //!< in case b.x is 0 (2 hits with same x)
+      auto div = 2. * (cx * by - bx * cy);
+      // if aligned TO FIX
+      auto y0 = (cx * b2 - bx * c2) / div;
+      auto x0 = (0.5 * b2 - y0 * by) / bx;
+      result(0) = hits(0, 0) + (flip ? y0 : x0);
+      result(1) = hits(1, 0) + (flip ? x0 : y0);
+      result(2) = alpaka::math::sqrt(acc, sqr(x0) + sqr(y0));
+      printIt(acc, &result, "Fast_fit - result: ");
+
+      // LINE FIT
+      const Vector2d dVec = hits.block(0, 0, 2, 1) - result.head(2);
+      const Vector2d eVec = hits.block(0, n - 1, 2, 1) - result.head(2);
+      printIt(acc, &eVec, "Fast_fit - e: ");
+      printIt(acc, &dVec, "Fast_fit - d: ");
+      // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) )
+      auto dr = result(2) * alpaka::math::atan2(acc, cross2D(acc, dVec, eVec), dVec.dot(eVec));
+      // Simple difference in Z between last and first hit
+      auto dz = hits(2, n - 1) - hits(2, 0);
+
+      result(3) = (dr / dz);
+
+#ifdef RFIT_DEBUG
+      printf("Fast_fit: [%f, %f, %f, %f]\n", result(0), result(1), result(2), result(3));
+#endif
+    }
+
+    /*!
+    \brief Fit a generic number of 2D points with a circle using Riemann-Chernov
+    algorithm. Covariance matrix of fitted parameter is optionally computed.
+    Multiple scattering (currently only in barrel layer) is optionally handled.
+    \param hits2D 2D points to be fitted.
+    \param hits_cov2D covariance matrix of 2D points.
+    \param fast_fit pre-fit result in this form: (X0,Y0,R,tan(theta)).
+    (tan(theta) is not used).
+    \param rad per-hit vector forwarded to the covariance helpers (helixFit passes the transverse norm of each hit).
+    \param bField magnetic field
+    \param error flag for error computation.
+    \return circle circle_fit:
+    -par parameter of the fitted circle in this form (X0,Y0,R); \n
+    -cov covariance matrix of the fitted parameter (not initialized if
+    error = false); \n
+    -q charge of the particle; \n
+    -chi2.
+    \warning hits must be passed ordered from inner to outer layer (double hits
+    on the same layer must be ordered too) so that multiple scattering is
+    treated properly.
+    \warning Multiple scattering for barrel is still not tested.
+    \warning Multiple scattering for endcap hits is not handled (yet); this signature has
+    no scattering flag (scatter_cov_rad() is always applied), so beware when fitting endcap hits.
+    \bug for small pt (<0.3 GeV/c) chi2 could be slightly underestimated.
+    \bug further investigation needed for error propagation with multiple
+    scattering.
+*/
+    template <typename TAcc, typename M2xN, typename V4, int N>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE CircleFit circleFit(const TAcc& acc,
+                                                       const M2xN& hits2D,
+                                                       const Matrix2Nd<N>& hits_cov2D,
+                                                       const V4& fast_fit,
+                                                       const VectorNd<N>& rad,
+                                                       const double bField,
+                                                       const bool error) {
+#ifdef RFIT_DEBUG
+      printf("circle_fit - enter\n");
+#endif
+      // INITIALIZATION
+      Matrix2Nd<N> vMat = hits_cov2D;  // local copy: the scattering contribution is added to it below
+      constexpr uint n = N;
+      printIt(&hits2D, "circle_fit - hits2D:");
+      printIt(&hits_cov2D, "circle_fit - hits_cov2D:");
+
+#ifdef RFIT_DEBUG
+      printf("circle_fit - WEIGHT COMPUTATION\n");
+#endif
+      // WEIGHT COMPUTATION
+      VectorNd<N> weight;
+      MatrixNd<N> gMat;
+      double renorm;  // sum of the entries of the inverted covariance, used to normalise gMat
+      {
+        MatrixNd<N> cov_rad = cov_carttorad_prefit(acc, hits2D, vMat, fast_fit, rad).asDiagonal();
+        MatrixNd<N> scatterCovRadMat = scatter_cov_rad(acc, hits2D, fast_fit, rad, bField);
+        printIt(&scatterCovRadMat, "circle_fit - scatter_cov_rad:");
+        printIt(&hits2D, "circle_fit - hits2D bis:");
+#ifdef RFIT_DEBUG
+        printf("Address of hits2D: a) %p\n", &hits2D);
+#endif
+        vMat += cov_radtocart(acc, hits2D, scatterCovRadMat, rad);
+        printIt(&vMat, "circle_fit - V:");
+        cov_rad += scatterCovRadMat;
+        printIt(&cov_rad, "circle_fit - cov_rad:");
+        math::cholesky::invert(cov_rad, gMat);
+        // gMat = cov_rad.inverse();
+        renorm = gMat.sum();
+        gMat *= 1. / renorm;  // after this the entries of gMat sum to 1
+        weight = weightCircle(acc, gMat);
+      }
+      printIt(&weight, "circle_fit - weight:");
+
+      // SPACE TRANSFORMATION
+#ifdef RFIT_DEBUG
+      printf("circle_fit - SPACE TRANSFORMATION\n");
+#endif
+
+      // center
+#ifdef RFIT_DEBUG
+      printf("Address of hits2D: b) %p\n", &hits2D);
+#endif
+      const Vector2d hCentroid = hits2D.rowwise().mean();  // centroid
+      printIt(&hCentroid, "circle_fit - h_:");
+      Matrix3xNd<N> p3D;
+      p3D.block(0, 0, 2, n) = hits2D.colwise() - hCentroid;
+      printIt(&p3D, "circle_fit - p3D: a)");
+      Vector2Nd<N> mc;  // centered hits, used in error computation
+      mc << p3D.row(0).transpose(), p3D.row(1).transpose();
+      printIt(&mc, "circle_fit - mc(centered hits):");
+
+      // scale
+      const double tempQ = mc.squaredNorm();
+      const double tempS = sqrt(n * 1. / tempQ);  // scaling factor
+      p3D.block(0, 0, 2, n) *= tempS;
+
+      // project on paraboloid
+      p3D.row(2) = p3D.block(0, 0, 2, n).colwise().squaredNorm();
+      printIt(&p3D, "circle_fit - p3D: b)");
+
+#ifdef RFIT_DEBUG
+      printf("circle_fit - COST FUNCTION\n");
+#endif
+      // COST FUNCTION
+
+      // compute
+      Vector3d r0;
+      r0.noalias() = p3D * weight;  // center of gravity
+      const Matrix3xNd<N> xMat = p3D.colwise() - r0;
+      Matrix3d aMat = xMat * gMat * xMat.transpose();
+      printIt(&aMat, "circle_fit - A:");
+
+#ifdef RFIT_DEBUG
+      printf("circle_fit - MINIMIZE\n");
+#endif
+      // minimize
+      double chi2;
+      Vector3d vVec = min_eigen3D(acc, aMat, chi2);
+#ifdef RFIT_DEBUG
+      printf("circle_fit - AFTER MIN_EIGEN\n");
+#endif
+      printIt(&vVec, "v BEFORE INVERSION");
+      vVec *= (vVec(2) > 0) ? 1 : -1;  // TO FIX: should be N(3)>0
+      printIt(&vVec, "v AFTER INVERSION");
+      // This hack is needed to be able to run on GPU, where the automatic assignment
+      // to a double from the vector multiplication is not working.
+#ifdef RFIT_DEBUG
+      printf("circle_fit - AFTER MIN_EIGEN 1\n");
+#endif
+      Eigen::Matrix<double, 1, 1> cm;
+#ifdef RFIT_DEBUG
+      printf("circle_fit - AFTER MIN_EIGEN 2\n");
+#endif
+      cm = -vVec.transpose() * r0;
+#ifdef RFIT_DEBUG
+      printf("circle_fit - AFTER MIN_EIGEN 3\n");
+#endif
+      const double tempC = cm(0, 0);
+
+#ifdef RFIT_DEBUG
+      printf("circle_fit - COMPUTE CIRCLE PARAMETER\n");
+#endif
+      // COMPUTE CIRCLE PARAMETER
+
+      // auxiliary quantities
+      const double tempH = sqrt(1. - sqr(vVec(2)) - 4. * tempC * vVec(2));
+      const double v2x2_inv = 1. / (2. * vVec(2));
+      const double s_inv = 1. / tempS;
+      Vector3d par_uvr;  // used in error propagation
+      par_uvr << -vVec(0) * v2x2_inv, -vVec(1) * v2x2_inv, tempH * v2x2_inv;
+
+      CircleFit circle;
+      circle.par << par_uvr(0) * s_inv + hCentroid(0), par_uvr(1) * s_inv + hCentroid(1), par_uvr(2) * s_inv;
+      circle.qCharge = charge(acc, hits2D, circle.par);
+      circle.chi2 = abs(chi2) * renorm / sqr(2 * vVec(2) * par_uvr(2) * tempS);
+      printIt(&circle.par, "circle_fit - CIRCLE PARAMETERS:");
+      printIt(&circle.cov, "circle_fit - CIRCLE COVARIANCE:");
+#ifdef RFIT_DEBUG
+      printf("circle_fit - CIRCLE CHARGE: %d\n", circle.qCharge);
+#endif
+
+#ifdef RFIT_DEBUG
+      printf("circle_fit - ERROR PROPAGATION\n");
+#endif
+      // ERROR PROPAGATION
+      if (error) {
+#ifdef RFIT_DEBUG
+        printf("circle_fit - ERROR PRPAGATION ACTIVATED\n");
+#endif
+        ArrayNd<N> vcsMat[2][2];  // cov matrix of center & scaled points
+        MatrixNd<N> cMat[3][3];   // cov matrix of 3D transformed points
+#ifdef RFIT_DEBUG
+        printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n");
+#endif
+        {
+          Eigen::Matrix<double, 1, 1> cm;
+          Eigen::Matrix<double, 1, 1> cm2;
+          cm = mc.transpose() * vMat * mc;
+          const double tempC2 = cm(0, 0);
+          Matrix2Nd<N> tempVcsMat;
+          tempVcsMat.template triangularView<Eigen::Upper>() =
+              (sqr(tempS) * vMat + sqr(sqr(tempS)) * 1. / (4. * tempQ * n) *
+                                       (2. * vMat.squaredNorm() + 4. * tempC2) *  // mc.transpose() * V * mc) *
+                                       (mc * mc.transpose()));
+
+          printIt(&tempVcsMat, "circle_fit - Vcs:");
+          cMat[0][0] = tempVcsMat.block(0, 0, n, n).template selfadjointView<Eigen::Upper>();
+          vcsMat[0][1] = tempVcsMat.block(0, n, n, n);
+          cMat[1][1] = tempVcsMat.block(n, n, n, n).template selfadjointView<Eigen::Upper>();
+          vcsMat[1][0] = vcsMat[0][1].transpose();
+          printIt(&tempVcsMat, "circle_fit - Vcs:");
+        }
+
+        {
+          const ArrayNd<N> t0 = (VectorXd::Constant(n, 1.) * p3D.row(0));
+          const ArrayNd<N> t1 = (VectorXd::Constant(n, 1.) * p3D.row(1));
+          const ArrayNd<N> t00 = p3D.row(0).transpose() * p3D.row(0);
+          const ArrayNd<N> t01 = p3D.row(0).transpose() * p3D.row(1);
+          const ArrayNd<N> t11 = p3D.row(1).transpose() * p3D.row(1);
+          const ArrayNd<N> t10 = t01.transpose();
+          vcsMat[0][0] = cMat[0][0];
+          cMat[0][1] = vcsMat[0][1];
+          cMat[0][2] = 2. * (vcsMat[0][0] * t0 + vcsMat[0][1] * t1);
+          vcsMat[1][1] = cMat[1][1];
+          cMat[1][2] = 2. * (vcsMat[1][0] * t0 + vcsMat[1][1] * t1);
+          MatrixNd<N> tmp;
+          tmp.template triangularView<Eigen::Upper>() =
+              (2. * (vcsMat[0][0] * vcsMat[0][0] + vcsMat[0][0] * vcsMat[0][1] + vcsMat[1][1] * vcsMat[1][0] +
+                     vcsMat[1][1] * vcsMat[1][1]) +
+               4. * (vcsMat[0][0] * t00 + vcsMat[0][1] * t01 + vcsMat[1][0] * t10 + vcsMat[1][1] * t11))
+                  .matrix();
+          cMat[2][2] = tmp.template selfadjointView<Eigen::Upper>();
+        }
+        printIt(&cMat[0][0], "circle_fit - C[0][0]:");
+
+        Matrix3d c0Mat;  // cov matrix of center of gravity (r0.x,r0.y,r0.z)
+        for (uint i = 0; i < 3; ++i) {
+          for (uint j = i; j < 3; ++j) {
+            Eigen::Matrix<double, 1, 1> tmp;
+            tmp = weight.transpose() * cMat[i][j] * weight;
+            // Workaround to get things working in GPU
+            const double tempC = tmp(0, 0);
+            c0Mat(i, j) = tempC;  //weight.transpose() * C[i][j] * weight;
+            c0Mat(j, i) = c0Mat(i, j);
+          }
+        }
+        printIt(&c0Mat, "circle_fit - C0:");
+
+        const MatrixNd<N> wMat = weight * weight.transpose();
+        const MatrixNd<N> hMat = MatrixNd<N>::Identity().rowwise() - weight.transpose();
+        const MatrixNx3d<N> s_v = hMat * p3D.transpose();
+        printIt(&wMat, "circle_fit - W:");
+        printIt(&hMat, "circle_fit - H:");
+        printIt(&s_v, "circle_fit - s_v:");
+
+        MatrixNd<N> dMat[3][3];  // cov(s_v)
+        dMat[0][0] = (hMat * cMat[0][0] * hMat.transpose()).cwiseProduct(wMat);
+        dMat[0][1] = (hMat * cMat[0][1] * hMat.transpose()).cwiseProduct(wMat);
+        dMat[0][2] = (hMat * cMat[0][2] * hMat.transpose()).cwiseProduct(wMat);
+        dMat[1][1] = (hMat * cMat[1][1] * hMat.transpose()).cwiseProduct(wMat);
+        dMat[1][2] = (hMat * cMat[1][2] * hMat.transpose()).cwiseProduct(wMat);
+        dMat[2][2] = (hMat * cMat[2][2] * hMat.transpose()).cwiseProduct(wMat);
+        dMat[1][0] = dMat[0][1].transpose();
+        dMat[2][0] = dMat[0][2].transpose();
+        dMat[2][1] = dMat[1][2].transpose();
+        printIt(&dMat[0][0], "circle_fit - D_[0][0]:");
+
+        constexpr uint nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}};  // (row, col) of the 6 elements
+
+        Matrix6d eMat;  // cov matrix of the 6 independent elements of A
+        for (uint a = 0; a < 6; ++a) {
+          const uint i = nu[a][0], j = nu[a][1];
+          for (uint b = a; b < 6; ++b) {
+            const uint k = nu[b][0], l = nu[b][1];
+            VectorNd<N> t0(n);
+            VectorNd<N> t1(n);
+            if (l == k) {
+              t0 = 2. * dMat[j][l] * s_v.col(l);
+              if (i == j)
+                t1 = t0;
+              else
+                t1 = 2. * dMat[i][l] * s_v.col(l);
+            } else {
+              t0 = dMat[j][l] * s_v.col(k) + dMat[j][k] * s_v.col(l);
+              if (i == j)
+                t1 = t0;
+              else
+                t1 = dMat[i][l] * s_v.col(k) + dMat[i][k] * s_v.col(l);
+            }
+
+            if (i == j) {
+              Eigen::Matrix<double, 1, 1> cm;
+              cm = s_v.col(i).transpose() * (t0 + t1);
+              // Workaround to get things working in GPU
+              const double tempC = cm(0, 0);
+              eMat(a, b) = 0. + tempC;
+            } else {
+              Eigen::Matrix<double, 1, 1> cm;
+              cm = (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1);
+              // Workaround to get things working in GPU
+              const double tempC = cm(0, 0);
+              eMat(a, b) = 0. + tempC;  //(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1);
+            }
+            if (b != a)
+              eMat(b, a) = eMat(a, b);  // eMat is filled symmetrically
+          }
+        }
+        printIt(&eMat, "circle_fit - E:");
+
+        Eigen::Matrix<double, 3, 6> j2Mat;  // Jacobian of min_eigen() (numerically computed)
+        for (uint a = 0; a < 6; ++a) {
+          const uint i = nu[a][0], j = nu[a][1];
+          Matrix3d delta = Matrix3d::Zero();
+          delta(i, j) = delta(j, i) = abs(aMat(i, j) * epsilon);  // symmetric perturbation of one element of A
+          j2Mat.col(a) = min_eigen3D_fast(acc, aMat + delta);
+          const int sign = (j2Mat.col(a)(2) > 0) ? 1 : -1;  // same sign convention applied to vVec above
+          j2Mat.col(a) = (j2Mat.col(a) * sign - vVec) / delta(i, j);
+        }
+        printIt(&j2Mat, "circle_fit - J2:");
+
+        Matrix4d cvcMat;  // joint cov matrix of (v0,v1,v2,c)
+        {
+          Matrix3d t0 = j2Mat * eMat * j2Mat.transpose();
+          Vector3d t1 = -t0 * r0;
+          cvcMat.block(0, 0, 3, 3) = t0;
+          cvcMat.block(0, 3, 3, 1) = t1;
+          cvcMat.block(3, 0, 1, 3) = t1.transpose();
+          Eigen::Matrix<double, 1, 1> cm1;
+          Eigen::Matrix<double, 1, 1> cm3;
+          cm1 = (vVec.transpose() * c0Mat * vVec);
+          //      cm2 = (c0Mat.cwiseProduct(t0)).sum();
+          cm3 = (r0.transpose() * t0 * r0);
+          // Workaround to get things working in GPU
+          const double tempC = cm1(0, 0) + (c0Mat.cwiseProduct(t0)).sum() + cm3(0, 0);
+          cvcMat(3, 3) = tempC;
+          // (v.transpose() * c0Mat * v) + (c0Mat.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0);
+        }
+        printIt(&cvcMat, "circle_fit - Cvc:");
+
+        Eigen::Matrix<double, 3, 4> j3Mat;  // Jacobian (v0,v1,v2,c)->(X0,Y0,R)
+        {
+          const double t = 1. / tempH;
+          j3Mat << -v2x2_inv, 0, vVec(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, vVec(1) * sqr(v2x2_inv) * 2., 0,
+              vVec(0) * v2x2_inv * t, vVec(1) * v2x2_inv * t,
+              -tempH * sqr(v2x2_inv) * 2. - (2. * tempC + vVec(2)) * v2x2_inv * t, -t;
+        }
+        printIt(&j3Mat, "circle_fit - J3:");
+
+        const RowVector2Nd<N> Jq = mc.transpose() * tempS * 1. / n;  // var(q)
+        printIt(&Jq, "circle_fit - Jq:");
+
+        Matrix3d cov_uvr = j3Mat * cvcMat * j3Mat.transpose() * sqr(s_inv)  // cov(X0,Y0,R)
+                           + (par_uvr * par_uvr.transpose()) * (Jq * vMat * Jq.transpose());
+
+        circle.cov = cov_uvr;
+      }
+
+      printIt(&circle.cov, "Circle cov:");
+#ifdef RFIT_DEBUG
+      printf("circle_fit - exit\n");
+#endif
+      return circle;
+    }
+
+    /*!  \brief Perform an ordinary least square fit in the s-z plane to compute
+ * the parameters cotTheta and Zip.
+ *
+ * The fit is performed in the rotated S3D-Z' plane, following the formalism of
+ * Frodesen, Chapter 10, p. 259.
+ *
+ * The system has been rotated both to try to use the combined errors in s-z
+ * along Z', as errors in the Y direction, and to avoid the pathological case of
+ * degenerate lines with angular coefficient m = +/- inf.
+ *
+ * The rotation is using the information on the theta angle computed in the
+ * fast fit. The rotation is such that the S3D axis will be the X-direction,
+ * while the rotated Z-axis will be the Y-direction. This pretty much follows
+ * what is done in the same fit in the Broken Line approach.
+ */
+
+    template <typename TAcc, typename M3xN, typename M6xN, typename V4>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE LineFit lineFit(const TAcc& acc,
+                                                   const M3xN& hits,
+                                                   const M6xN& hits_ge,
+                                                   const CircleFit& circle,
+                                                   const V4& fast_fit,
+                                                   const double bField,
+                                                   const bool error) {
+      constexpr uint32_t N = M3xN::ColsAtCompileTime;
+      constexpr auto n = N;  // number of hits, taken from the compile-time column count
+      double theta = -circle.qCharge * atan(fast_fit(3));
+      theta = theta < 0. ? theta + M_PI : theta;  // negative angles are shifted by pi
+
+      // Prepare the Rotation Matrix to rotate the points
+      Eigen::Matrix<double, 2, 2> rot;
+      rot << sin(theta), cos(theta), -cos(theta), sin(theta);
+
+      // PROJECTION ON THE CYLINDER
+      //
+      // p2D will be:
+      // [s1, s2, s3, ..., sn]
+      // [z1, z2, z3, ..., zn]
+      // s values will be ordinary x-values
+      // z values will be ordinary y-values
+
+      Matrix2xNd<N> p2D = Matrix2xNd<N>::Zero();
+      Eigen::Matrix<double, 2, 6> jxMat;
+
+#ifdef RFIT_DEBUG
+      printf("Line_fit - B: %g\n", bField);
+      printIt(&hits, "Line_fit points: ");
+      printIt(&hits_ge, "Line_fit covs: ");
+      printIt(&rot, "Line_fit rot: ");
+#endif
+      // x & associated Jacobian
+      // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf
+      // Slide 11
+      // a ==> -o i.e. the origin of the circle in XY plane, negative
+      // b ==> p i.e. distances of the points wrt the origin of the circle.
+      const Vector2d oVec(circle.par(0), circle.par(1));
+
+      // associated Jacobian, used in weights and errors computation
+      Matrix6d covMat = Matrix6d::Zero();
+      Matrix2d cov_sz[N];  // one 2x2 covariance per hit, already rotated by rot
+      for (uint i = 0; i < n; ++i) {
+        Vector2d pVec = hits.block(0, i, 2, 1) - oVec;
+        const double cross = cross2D(acc, -oVec, pVec);
+        const double dot = (-oVec).dot(pVec);
+        // atan2(cross, dot) gives back the angle in the transverse plane so that the
+        // final equation reads: x_i = -q*R*theta (theta = angle returned by atan2)
+        const double tempQAtan2 = -circle.qCharge * atan2(cross, dot);
+        //    p2D.coeffRef(1, i) = atan2_ * circle.par(2);
+        p2D(0, i) = tempQAtan2 * circle.par(2);
+
+        // associated Jacobian, used in weights and errors- computation
+        const double temp0 = -circle.qCharge * circle.par(2) * 1. / (sqr(dot) + sqr(cross));
+        double d_X0 = 0., d_Y0 = 0., d_R = 0.;  // good approximation for big pt and eta
+        if (error) {
+          d_X0 = -temp0 * ((pVec(1) + oVec(1)) * dot - (pVec(0) - oVec(0)) * cross);
+          d_Y0 = temp0 * ((pVec(0) + oVec(0)) * dot - (oVec(1) - pVec(1)) * cross);
+          d_R = tempQAtan2;
+        }
+        const double d_x = temp0 * (oVec(1) * dot + oVec(0) * cross);
+        const double d_y = temp0 * (-oVec(0) * dot + oVec(1) * cross);
+        jxMat << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.;  // rows: s, z; columns: X0, Y0, R, x, y, z
+
+        covMat.block(0, 0, 3, 3) = circle.cov;
+        covMat(3, 3) = hits_ge.col(i)[0];                 // x errors
+        covMat(4, 4) = hits_ge.col(i)[2];                 // y errors
+        covMat(5, 5) = hits_ge.col(i)[5];                 // z errors
+        covMat(3, 4) = covMat(4, 3) = hits_ge.col(i)[1];  // cov_xy
+        covMat(3, 5) = covMat(5, 3) = hits_ge.col(i)[3];  // cov_xz
+        covMat(4, 5) = covMat(5, 4) = hits_ge.col(i)[4];  // cov_yz
+        Matrix2d tmp = jxMat * covMat * jxMat.transpose();
+        cov_sz[i].noalias() = rot * tmp * rot.transpose();
+      }
+      // Math of d_{X0,Y0,R,x,y} all verified by hand
+      p2D.row(1) = hits.row(2);
+
+      // The following matrix will contain errors orthogonal to the rotated S
+      // component only, with the Multiple Scattering properly treated!!
+      MatrixNd<N> cov_with_ms;
+      scatterCovLine(acc, cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, bField, cov_with_ms);
+#ifdef RFIT_DEBUG
+      printIt(cov_sz, "line_fit - cov_sz:");
+      printIt(&cov_with_ms, "line_fit - cov_with_ms: ");
+#endif
+
+      // Rotate Points with the shape [2, n]
+      Matrix2xNd<N> p2D_rot = rot * p2D;
+
+#ifdef RFIT_DEBUG
+      printf("Fast fit Tan(theta): %g\n", fast_fit(3));
+      printf("Rotation angle: %g\n", theta);
+      printIt(&rot, "Rotation Matrix:");
+      printIt(&p2D, "Original Hits(s,z):");
+      printIt(&p2D_rot, "Rotated hits(S3D, Z'):");
+      printIt(&rot, "Rotation Matrix:");
+#endif
+
+      // Build the A Matrix
+      Matrix2xNd<N> aMat;
+      aMat << MatrixXd::Ones(1, n), p2D_rot.row(0);  // rotated s values
+
+#ifdef RFIT_DEBUG
+      printIt(&aMat, "A Matrix:");
+#endif
+
+      // Build A^T V-1 A, where V-1 is the inverse of the covariance of only the Y components.
+      MatrixNd<N> vyInvMat;
+      math::cholesky::invert(cov_with_ms, vyInvMat);
+      // MatrixNd<N> vyInvMat = cov_with_ms.inverse();
+      Eigen::Matrix<double, 2, 2> covParamsMat = aMat * vyInvMat * aMat.transpose();
+      // Compute the Covariance Matrix of the fit parameters
+      math::cholesky::invert(covParamsMat, covParamsMat);
+
+      // Now Compute the Parameters in the form [2,1]
+      // The first component is q.
+      // The second component is m.
+      Eigen::Matrix<double, 2, 1> sol = covParamsMat * aMat * vyInvMat * p2D_rot.row(1).transpose();
+
+#ifdef RFIT_DEBUG
+      printIt(&sol, "Rotated solutions:");
+#endif
+
+      // We need now to transfer back the results in the original s-z plane
+      const auto sinTheta = sin(theta);
+      const auto cosTheta = cos(theta);
+      auto common_factor = 1. / (sinTheta - sol(1, 0) * cosTheta);
+      Eigen::Matrix<double, 2, 2> jMat;  // Jacobian used to propagate covParamsMat to (m, q)
+      jMat << 0., common_factor * common_factor, common_factor, sol(0, 0) * cosTheta * common_factor * common_factor;
+
+      double tempM = common_factor * (sol(1, 0) * sinTheta + cosTheta);
+      double tempQ = common_factor * sol(0, 0);
+      auto cov_mq = jMat * covParamsMat * jMat.transpose();
+
+      VectorNd<N> res = p2D_rot.row(1).transpose() - aMat.transpose() * sol;  // residuals in the rotated frame
+      double chi2 = res.transpose() * vyInvMat * res;
+
+      LineFit line;
+      line.par << tempM, tempQ;
+      line.cov << cov_mq;
+      line.chi2 = chi2;
+
+#ifdef RFIT_DEBUG
+      printf("Common_factor: %g\n", common_factor);
+      printIt(&jMat, "Jacobian:");
+      printIt(&sol, "Rotated solutions:");
+      printIt(&covParamsMat, "Cov_params:");
+      printIt(&cov_mq, "Rotated Covariance Matrix:");
+      printIt(&(line.par), "Real Parameters:");
+      printIt(&(line.cov), "Real Covariance Matrix:");
+      printf("Chi2: %g\n", chi2);
+#endif
+
+      return line;
+    }
+
+  }  // namespace riemannFit
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+
+namespace riemannFit {
+  /*!
+    \brief Helix fit in three steps:
+    -fast pre-fit (see fastFit() for further info); \n
+    -circle fit of hits projected in the transverse plane by Riemann-Chernov
+        algorithm (see circleFit() for further info); \n
+    -line fit of hits projected on cylinder surface by orthogonal distance
+        regression (see lineFit() for further info). \n
+    Points must be passed ordered (from inner to outer layer).
+    \param hits Matrix3xNd hits coordinates in this form: \n
+        |x0|x1|x2|...|xn| \n
+        |y0|y1|y2|...|yn| \n
+        |z0|z1|z2|...|zn|
+    \param hits_ge hits covariance; NOTE(review): described here as a full Matrix3Nd (()->cov()), confirm: \n
+   |(x0,x0)|(x1,x0)|(x2,x0)|.|(y0,x0)|(y1,x0)|(y2,x0)|.|(z0,x0)|(z1,x0)|(z2,x0)| \n
+   |(x0,x1)|(x1,x1)|(x2,x1)|.|(y0,x1)|(y1,x1)|(y2,x1)|.|(z0,x1)|(z1,x1)|(z2,x1)| \n
+   |(x0,x2)|(x1,x2)|(x2,x2)|.|(y0,x2)|(y1,x2)|(y2,x2)|.|(z0,x2)|(z1,x2)|(z2,x2)| \n
+       .       .       .    .    .       .       .    .    .       .       .     \n
+   |(x0,y0)|(x1,y0)|(x2,y0)|.|(y0,y0)|(y1,y0)|(y2,y0)|.|(z0,y0)|(z1,y0)|(z2,y0)| \n
+   |(x0,y1)|(x1,y1)|(x2,y1)|.|(y0,y1)|(y1,y1)|(y2,y1)|.|(z0,y1)|(z1,y1)|(z2,y1)| \n
+   |(x0,y2)|(x1,y2)|(x2,y2)|.|(y0,y2)|(y1,y2)|(y2,y2)|.|(z0,y2)|(z1,y2)|(z2,y2)| \n
+       .       .       .    .    .       .       .    .    .       .       .     \n
+   |(x0,z0)|(x1,z0)|(x2,z0)|.|(y0,z0)|(y1,z0)|(y2,z0)|.|(z0,z0)|(z1,z0)|(z2,z0)| \n
+   |(x0,z1)|(x1,z1)|(x2,z1)|.|(y0,z1)|(y1,z1)|(y2,z1)|.|(z0,z1)|(z1,z1)|(z2,z1)| \n
+   |(x0,z2)|(x1,z2)|(x2,z2)|.|(y0,z2)|(y1,z2)|(y2,z2)|.|(z0,z2)|(z1,z2)|(z2,z2)|
+   \param bField magnetic field in the center of the detector in GeV/cm/c
+   unit, in order to perform pt calculation.
+   \param error flag for error computation.
+   \param helix output: filled with the fitted parameters, the covariance (only
+   if error = true), the charge and the chi2 of the circle and line fits.
+   \warning see circleFit(), lineFit() and fastFit() warnings.
+   \bug see circleFit(), lineFit() and fastFit() bugs.
+*/
+
+  template <int N>
+  class helixFit {  // functor running the fast pre-fit, the circle fit and the line fit on N hits
+  public:
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const TAcc& acc,
+                                                   const Matrix3xNd<N>* hits,                  // hit coordinates
+                                                   const Eigen::Matrix<float, 6, N>* hits_ge,  // hit covariances
+                                                   const double bField,
+                                                   const bool error,  // if true, helix->cov is filled too
+                                                   HelixFit* helix) const {  // output of the fit
+      constexpr uint n = N;
+      // one entry per hit (it was hard-coded to 4 entries, which only matched N == 4)
+      VectorNd<N> rad = (hits->block(0, 0, 2, n).colwise().norm());  // transverse norm of each hit
+      // fastFit gives back (X0, Y0, R, tan(theta)) w/o errors, using only 3 points.
+      Vector4d fast_fit;
+      ALPAKA_ACCELERATOR_NAMESPACE::riemannFit::fastFit(acc, *hits, fast_fit);
+      riemannFit::Matrix2Nd<N> hits_cov = MatrixXd::Zero(2 * n, 2 * n);
+      ALPAKA_ACCELERATOR_NAMESPACE::riemannFit::loadCovariance2D(acc, *hits_ge, hits_cov);
+      CircleFit circle = ALPAKA_ACCELERATOR_NAMESPACE::riemannFit::circleFit(
+          acc, hits->block(0, 0, 2, n), hits_cov, fast_fit, rad, bField, error);
+      LineFit line =
+          ALPAKA_ACCELERATOR_NAMESPACE::riemannFit::lineFit(acc, *hits, *hits_ge, circle, fast_fit, bField, error);
+
+      ALPAKA_ACCELERATOR_NAMESPACE::riemannFit::par_uvrtopak(acc, circle, bField, error);
+
+      helix->par << circle.par, line.par;  // circle parameters first, then line parameters
+      if (error) {
+        helix->cov = MatrixXd::Zero(5, 5);
+        helix->cov.block(0, 0, 3, 3) = circle.cov;  // the circle/line cross terms are left at zero
+        helix->cov.block(3, 3, 2, 2) = line.cov;
+      }
+      helix->qCharge = circle.qCharge;
+      helix->chi2_circle = circle.chi2;
+      helix->chi2_line = line.chi2;
+    }
+  };
+}  // namespace riemannFit
+#endif  // RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h
diff --git a/RecoTracker/PixelTrackFitting/plugins/BuildFile.xml b/RecoTracker/PixelTrackFitting/plugins/BuildFile.xml
index d28dad5793a66..6c8c102293651 100644
--- a/RecoTracker/PixelTrackFitting/plugins/BuildFile.xml
+++ b/RecoTracker/PixelTrackFitting/plugins/BuildFile.xml
@@ -1,8 +1,10 @@
-<use name="cuda"/>
-<use name="CUDADataFormats/Track"/>
-<use name="Geometry/Records"/>
-<use name="HeterogeneousCore/CUDACore"/>
-<use name="RecoTracker/PixelTrackFitting"/>
 <library file="*.cc" name="RecoPixelVertexingPixelTrackFittingPlugins">
+  <use name="cuda"/>
+  <use name="CUDADataFormats/Track"/>
+  <use name="DataFormats/TrackSoA"/>
+  <use name="DataFormats/TrackingRecHit"/>
+  <use name="Geometry/Records"/>
+  <use name="RecoTracker/PixelTrackFitting"/>
+  <use name="HeterogeneousCore/CUDACore"/>
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/RecoTracker/PixelTrackFitting/plugins/PixelTrackDumpAlpaka.cc b/RecoTracker/PixelTrackFitting/plugins/PixelTrackDumpAlpaka.cc
new file mode 100644
index 0000000000000..c4f0b97dba8a9
--- /dev/null
+++ b/RecoTracker/PixelTrackFitting/plugins/PixelTrackDumpAlpaka.cc
@@ -0,0 +1,79 @@
+#include <Eigen/Core>  // needed here by soa layout
+
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/global/EDAnalyzer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+
+#include "DataFormats/VertexSoA/interface/ZVertexHost.h"
+#include "DataFormats/TrackSoA/interface/TracksHost.h"
+
+template <typename TrackerTraits>
+class PixelTrackDumpAlpakaT : public edm::global::EDAnalyzer<> {  // checks the host track/vertex SoAs
+public:
+  using TkSoAHost = TracksHost<TrackerTraits>;  // track SoA type for this tracker topology
+  using VertexSoAHost = ZVertexHost;            // vertex SoA type
+
+  explicit PixelTrackDumpAlpakaT(const edm::ParameterSet& iConfig);  // reads the two input tags
+  ~PixelTrackDumpAlpakaT() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);  // declares parameters and defaults
+
+private:
+  void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override;
+  edm::EDGetTokenT<TkSoAHost> tokenSoATrack_;       // token for the "pixelTrackSrc" product
+  edm::EDGetTokenT<VertexSoAHost> tokenSoAVertex_;  // token for the "pixelVertexSrc" product
+};
+
+template <typename TrackerTraits>
+PixelTrackDumpAlpakaT<TrackerTraits>::PixelTrackDumpAlpakaT(const edm::ParameterSet& iConfig)
+    // declare the consumed track and vertex products, using the labels given in the configuration
+    : tokenSoATrack_(consumes(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"))),
+      tokenSoAVertex_(consumes(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"))) {}
+
+template <typename TrackerTraits>
+void PixelTrackDumpAlpakaT<TrackerTraits>::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription psetDesc;  // the two input tags and their default labels
+  psetDesc.add<edm::InputTag>("pixelTrackSrc", edm::InputTag("pixelTracksAlpaka"));
+  psetDesc.add<edm::InputTag>("pixelVertexSrc", edm::InputTag("pixelVerticesAlpaka"));
+  descriptions.addWithDefaultLabel(psetDesc);
+}
+
+template <typename TrackerTraits>
+void PixelTrackDumpAlpakaT<TrackerTraits>::analyze(edm::StreamID streamID,
+                                                   edm::Event const& iEvent,
+                                                   const edm::EventSetup& iSetup) const {
+  auto const& tracks = iEvent.get(tokenSoATrack_);  // track SoA read from the event
+  assert(tracks.view().quality());                  // NOTE: asserts are compiled out when NDEBUG is defined
+  assert(tracks.view().chi2());
+  assert(tracks.view().nLayers());
+  assert(tracks.view().eta());
+  assert(tracks.view().pt());
+  assert(tracks.view().state());
+  assert(tracks.view().covariance());
+  assert(tracks.view().nTracks());  // NOTE(review): presumably a count - would this fire with no tracks? confirm
+
+  auto const& vertices = iEvent.get(tokenSoAVertex_);  // vertex SoA read from the event
+  assert(vertices.view().idv());
+  assert(vertices.view().zv());
+  assert(vertices.view().wv());
+  assert(vertices.view().chi2());
+  assert(vertices.view().ptv2());
+  assert(vertices.view().ndof());
+  assert(vertices.view().sortInd());
+  assert(vertices.view().nvFinal());  // NOTE(review): presumably a count - would this fire with no vertices? confirm
+}
+
+using PixelTrackDumpAlpakaPhase1 = PixelTrackDumpAlpakaT<pixelTopology::Phase1>;
+using PixelTrackDumpAlpakaPhase2 = PixelTrackDumpAlpakaT<pixelTopology::Phase2>;
+using PixelTrackDumpAlpakaHIonPhase1 = PixelTrackDumpAlpakaT<pixelTopology::HIonPhase1>;
+// One alias per pixel topology above; each one is passed to DEFINE_FWK_MODULE below.
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(PixelTrackDumpAlpakaPhase1);
+DEFINE_FWK_MODULE(PixelTrackDumpAlpakaPhase2);
+DEFINE_FWK_MODULE(PixelTrackDumpAlpakaHIonPhase1);
diff --git a/RecoTracker/PixelTrackFitting/plugins/PixelTrackProducerFromSoAAlpaka.cc b/RecoTracker/PixelTrackFitting/plugins/PixelTrackProducerFromSoAAlpaka.cc
new file mode 100644
index 0000000000000..48d9072dc2d71
--- /dev/null
+++ b/RecoTracker/PixelTrackFitting/plugins/PixelTrackProducerFromSoAAlpaka.cc
@@ -0,0 +1,264 @@
+#include <vector>
+
+#include "DataFormats/BeamSpot/interface/BeamSpot.h"
+#include "DataFormats/GeometrySurface/interface/Plane.h"
+#include "DataFormats/SiPixelClusterSoA/interface/ClusteringConstants.h"
+#include "DataFormats/TrackSoA/interface/TracksHost.h"
+#include "DataFormats/TrackReco/interface/Track.h"
+#include "DataFormats/TrackReco/interface/TrackExtra.h"
+#include "DataFormats/TrackReco/interface/TrackFwd.h"
+#include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
+#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h"
+#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+#include "Geometry/Records/interface/TrackerTopologyRcd.h"
+#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
+#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h"
+#include "TrackingTools/TrajectoryParametrization/interface/CurvilinearTrajectoryError.h"
+#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h"
+
+#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h"
+#include "RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h"
+
+#include "storeTracks.h"
+
+/**
+ * This class creates "legacy" reco::Track
+ * objects from the output of SoA CA.
+ */
+
+//#define GPU_DEBUG
+
+template <typename TrackerTraits>
+class PixelTrackProducerFromSoAAlpaka : public edm::global::EDProducer<> {
+  using TkSoAHost = TracksHost<TrackerTraits>;
+  using tracksHelpers = TracksUtilities<TrackerTraits>;
+  using HMSstorage = std::vector<uint32_t>;  // indexed by det index in produce(); gives the base offset of a module's hits
+
+public:
+  using IndToEdm = std::vector<uint32_t>;  // SoA track index -> index of the produced legacy track
+
+  explicit PixelTrackProducerFromSoAAlpaka(const edm::ParameterSet &iConfig);
+  ~PixelTrackProducerFromSoAAlpaka() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);
+
+private:
+  void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override;
+
+  // Event Data tokens
+  const edm::EDGetTokenT<reco::BeamSpot> tBeamSpot_;
+  const edm::EDGetTokenT<TkSoAHost> tokenTrack_;
+  const edm::EDGetTokenT<SiPixelRecHitCollectionNew> cpuHits_;  // legacy pixel rechits ("pixelRecHitLegacySrc")
+  const edm::EDGetTokenT<HMSstorage> hmsToken_;  // read from the same input tag as cpuHits_
+  // Event Setup tokens
+  const edm::ESGetToken<MagneticField, IdealMagneticFieldRecord> idealMagneticFieldToken_;
+  const edm::ESGetToken<TrackerTopology, TrackerTopologyRcd> ttTopoToken_;
+
+  int32_t const minNumberOfHits_;  // tracks with fewer hits are skipped in produce()
+  pixelTrack::Quality const minQuality_;  // tracks with a lower quality are skipped in produce()
+};
+
+template <typename TrackerTraits>
+PixelTrackProducerFromSoAAlpaka<TrackerTraits>::PixelTrackProducerFromSoAAlpaka(const edm::ParameterSet &iConfig)
+    : tBeamSpot_(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpot"))),
+      tokenTrack_(consumes(iConfig.getParameter<edm::InputTag>("trackSrc"))),
+      cpuHits_(consumes<SiPixelRecHitCollectionNew>(iConfig.getParameter<edm::InputTag>("pixelRecHitLegacySrc"))),
+      hmsToken_(consumes<HMSstorage>(iConfig.getParameter<edm::InputTag>("pixelRecHitLegacySrc"))),  // same tag as cpuHits_, different product type
+      idealMagneticFieldToken_(esConsumes()),
+      ttTopoToken_(esConsumes()),
+      minNumberOfHits_(iConfig.getParameter<int>("minNumberOfHits")),
+      minQuality_(pixelTrack::qualityByName(iConfig.getParameter<std::string>("minQuality"))) {
+  if (minQuality_ == pixelTrack::Quality::notQuality) {  // the configured name did not map to a known quality
+    throw cms::Exception("PixelTrackConfiguration")
+        << iConfig.getParameter<std::string>("minQuality") + " is not a pixelTrack::Quality";
+  }
+  if (minQuality_ < pixelTrack::Quality::dup) {  // qualities below "dup" are rejected at configuration time
+    throw cms::Exception("PixelTrackConfiguration")
+        << iConfig.getParameter<std::string>("minQuality") + " not supported";
+  }
+  produces<TrackingRecHitCollection>();
+  produces<reco::TrackExtraCollection>();
+  // TrackCollection refers to the TrackingRecHit and TrackExtra
+  // collections: its production must be declared after theirs to work
+  // around a rare race condition in framework scheduling
+  produces<reco::TrackCollection>();
+  produces<IndToEdm>();
+}
+
+template <typename TrackerTraits>
+void PixelTrackProducerFromSoAAlpaka<TrackerTraits>::fillDescriptions(edm::ConfigurationDescriptions &descriptions) {
+  edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("beamSpot", edm::InputTag("offlineBeamSpot"));
+  desc.add<edm::InputTag>("trackSrc", edm::InputTag("pixelTracksAlpaka"));
+  desc.add<edm::InputTag>("pixelRecHitLegacySrc", edm::InputTag("siPixelRecHitsPreSplittingLegacy"));  // used for both the rechits and the HMSstorage product
+  desc.add<int>("minNumberOfHits", 0);
+  desc.add<std::string>("minQuality", "loose");  // the constructor throws for unknown names and for qualities below "dup"
+  descriptions.addWithDefaultLabel(desc);
+}
+
+template <typename TrackerTraits>
+void PixelTrackProducerFromSoAAlpaka<TrackerTraits>::produce(edm::StreamID streamID,
+                                                             edm::Event &iEvent,
+                                                             const edm::EventSetup &iSetup) const {
+  // enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity };
+  reco::TrackBase::TrackQuality recoQuality[] = {reco::TrackBase::undefQuality,
+                                                 reco::TrackBase::undefQuality,
+                                                 reco::TrackBase::discarded,
+                                                 reco::TrackBase::loose,
+                                                 reco::TrackBase::tight,
+                                                 reco::TrackBase::tight,
+                                                 reco::TrackBase::highPurity};
+  assert(reco::TrackBase::highPurity == recoQuality[int(pixelTrack::Quality::highPurity)]);
+
+#ifdef GPU_DEBUG
+  std::cout << "Converting soa helix in reco tracks" << std::endl;
+#endif
+
+  auto indToEdmP = std::make_unique<IndToEdm>();
+  auto &indToEdm = *indToEdmP;
+
+  auto const &idealField = iSetup.getData(idealMagneticFieldToken_);
+
+  pixeltrackfitting::TracksWithRecHits tracks;
+
+  auto const &httopo = iSetup.getData(ttTopoToken_);
+
+  const auto &bsh = iEvent.get(tBeamSpot_);
+  GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0());
+
+  auto const &rechits = iEvent.get(cpuHits_);
+  std::vector<TrackingRecHit const *> hitmap;
+  auto const &rcs = rechits.data();
+  auto const nhits = rcs.size();
+
+  hitmap.resize(nhits, nullptr);  // hitmap[hitsModuleStart[det] + cluster originalId] -> legacy rechit, filled below
+
+  auto const &hitsModuleStart = iEvent.get(hmsToken_);
+
+  for (auto const &hit : rcs) {
+    auto const &thit = static_cast<BaseTrackerRecHit const &>(hit);
+    auto const detI = thit.det()->index();
+    auto const &clus = thit.firstClusterRef();
+    assert(clus.isPixel());
+    auto const idx = hitsModuleStart[detI] + clus.pixelCluster().originalId();
+    if (idx >= hitmap.size())
+      hitmap.resize(idx + 256, nullptr);  // only in case of hit overflow in one module
+
+    assert(nullptr == hitmap[idx]);  // each slot must be filled at most once
+    hitmap[idx] = &hit;
+  }
+
+  std::vector<const TrackingRecHit *> hits;
+  hits.reserve(5);
+
+  auto const &tsoa = iEvent.get(tokenTrack_);
+  auto const *quality = tsoa.view().quality();
+  auto const &hitIndices = tsoa.view().hitIndices();
+  auto nTracks = tsoa.view().nTracks();
+
+  tracks.reserve(nTracks);
+
+  int32_t nt = 0;  // number of legacy tracks created so far
+
+  // sort the track indices by decreasing pt
+  std::vector<int32_t> sortIdxs(nTracks);
+  std::iota(sortIdxs.begin(), sortIdxs.end(), 0);
+  std::sort(sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) {
+    return tsoa.view()[i1].pt() > tsoa.view()[i2].pt();
+  });
+
+  // store the index of the SoA: indToEdm[index_SoAtrack] -> index_edmTrack (if it exists)
+  indToEdm.resize(sortIdxs.size(), -1);  // -1 converts to the largest uint32_t and stays in place for skipped tracks
+  for (const auto &it : sortIdxs) {
+    auto nHits = tracksHelpers::nHits(tsoa.view(), it);
+    assert(nHits >= 3);
+    auto q = quality[it];
+
+    if (q < minQuality_)
+      continue;
+    if (nHits < minNumberOfHits_)  //move to nLayers?
+      continue;
+    indToEdm[it] = nt;
+    ++nt;
+
+    hits.resize(nHits);
+    auto b = hitIndices.begin(it);
+    for (int iHit = 0; iHit < nHits; ++iHit)
+      hits[iHit] = hitmap[*(b + iHit)];  // translate SoA hit indices into legacy rechit pointers
+
+    // mind: these values are with respect to the beamspot!
+
+    float chi2 = tsoa.view()[it].chi2();
+    float phi = tracksHelpers::phi(tsoa.view(), it);
+
+    riemannFit::Vector5d ipar, opar;
+    riemannFit::Matrix5d icov, ocov;
+    tracksHelpers::template copyToDense<riemannFit::Vector5d, riemannFit::Matrix5d>(tsoa.view(), ipar, icov, it);
+    riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov);
+
+    LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);
+    AlgebraicSymMatrix55 m;
+    for (int i = 0; i < 5; ++i)
+      for (int j = i; j < 5; ++j)
+        m(i, j) = ocov(i, j);  // copy the upper triangle of the 5x5 covariance
+
+    float sp = std::sin(phi);
+    float cp = std::cos(phi);
+    Surface::RotationType rot(sp, -cp, 0, 0, 0, -1.f, cp, sp, 0);
+
+    Plane impPointPlane(bs, rot);  // plane positioned at the beamspot, oriented from the track phi
+    GlobalTrajectoryParameters gp(
+        impPointPlane.toGlobal(lpar.position()), impPointPlane.toGlobal(lpar.momentum()), lpar.charge(), &idealField);
+    JacobianLocalToCurvilinear jl2c(impPointPlane, lpar, idealField);
+
+    AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(), m);
+
+    int ndof = 2 * hits.size() - 5;
+    chi2 = chi2 * ndof;  // presumably the SoA stores chi2/ndof while reco::Track takes the total - confirm
+    GlobalPoint vv = gp.position();
+    math::XYZPoint pos(vv.x(), vv.y(), vv.z());
+    GlobalVector pp = gp.momentum();
+    math::XYZVector mom(pp.x(), pp.y(), pp.z());
+
+    auto track = std::make_unique<reco::Track>(chi2, ndof, pos, mom, gp.charge(), CurvilinearTrajectoryError(mo));
+
+    // bad and edup not supported as fit not present or not reliable
+    auto tkq = recoQuality[int(q)];
+    track->setQuality(tkq);
+    // loose, tight and HP are inclusive
+    if (reco::TrackBase::highPurity == tkq) {
+      track->setQuality(reco::TrackBase::tight);
+      track->setQuality(reco::TrackBase::loose);
+    } else if (reco::TrackBase::tight == tkq) {
+      track->setQuality(reco::TrackBase::loose);
+    }
+    track->setQuality(tkq);  // NOTE(review): repeats the setQuality(tkq) call made before the inclusive flags - looks redundant, confirm
+    // filter???
+    tracks.emplace_back(track.release(), hits);
+  }
+#ifdef GPU_DEBUG
+  std::cout << "processed " << nt << " good tuples " << tracks.size() << " out of " << indToEdm.size() << std::endl;
+#endif
+  // store tracks
+  storeTracks(iEvent, tracks, httopo);
+  iEvent.put(std::move(indToEdmP));
+}
+
+using PixelTrackProducerFromSoAAlpakaPhase1 = PixelTrackProducerFromSoAAlpaka<pixelTopology::Phase1>;
+using PixelTrackProducerFromSoAAlpakaPhase2 = PixelTrackProducerFromSoAAlpaka<pixelTopology::Phase2>;
+using PixelTrackProducerFromSoAAlpakaHIonPhase1 = PixelTrackProducerFromSoAAlpaka<pixelTopology::HIonPhase1>;
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(PixelTrackProducerFromSoAAlpakaPhase1);
+DEFINE_FWK_MODULE(PixelTrackProducerFromSoAAlpakaPhase2);
+DEFINE_FWK_MODULE(PixelTrackProducerFromSoAAlpakaHIonPhase1);
diff --git a/RecoTracker/PixelTrackFitting/python/PixelTracks_cff.py b/RecoTracker/PixelTrackFitting/python/PixelTracks_cff.py
index 91eb380a33da9..046caa0b033f3 100644
--- a/RecoTracker/PixelTrackFitting/python/PixelTracks_cff.py
+++ b/RecoTracker/PixelTrackFitting/python/PixelTracks_cff.py
@@ -1,4 +1,5 @@
 import FWCore.ParameterSet.Config as cms
+from HeterogeneousCore.AlpakaCore.functions import *
 from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA
 
 from RecoLocalTracker.SiStripRecHitConverter.StripCPEfromTrackAngle_cfi import *
@@ -203,3 +204,42 @@
 (pixelNtupletFit & gpu & gpuValidationPixel).toModify(pixelTracksSoA.cpu,
     pixelRecHitSrc = "siPixelRecHitsPreSplittingSoA@cpu"
     )
+
+######################################################################
+
+### Alpaka Pixel Track Reco
+
+from Configuration.ProcessModifiers.alpaka_cff import alpaka
+
+# pixel tracks SoA producer on the device
+from RecoTracker.PixelSeeding.caHitNtupletAlpakaPhase1_cfi import caHitNtupletAlpakaPhase1 as _pixelTracksAlpakaPhase1
+from RecoTracker.PixelSeeding.caHitNtupletAlpakaPhase2_cfi import caHitNtupletAlpakaPhase2 as _pixelTracksAlpakaPhase2
+
+pixelTracksAlpaka = _pixelTracksAlpakaPhase1.clone()
+phase2_tracker.toReplaceWith(pixelTracksAlpaka,_pixelTracksAlpakaPhase2.clone())
+
+# pixel tracks SoA producer on the cpu, for validation
+pixelTracksAlpakaSerial = makeSerialClone(pixelTracksAlpaka,
+    pixelRecHitSrc = 'siPixelRecHitsPreSplittingAlpakaSerial'
+)
+
+# legacy pixel tracks from SoA
+from  RecoTracker.PixelTrackFitting.pixelTrackProducerFromSoAAlpakaPhase1_cfi import pixelTrackProducerFromSoAAlpakaPhase1 as _pixelTrackProducerFromSoAAlpakaPhase1
+from  RecoTracker.PixelTrackFitting.pixelTrackProducerFromSoAAlpakaPhase2_cfi import pixelTrackProducerFromSoAAlpakaPhase2 as _pixelTrackProducerFromSoAAlpakaPhase2
+
+(alpaka & ~phase2_tracker).toReplaceWith(pixelTracks, _pixelTrackProducerFromSoAAlpakaPhase1.clone(
+    pixelRecHitLegacySrc = "siPixelRecHitsPreSplitting",
+))
+
+(alpaka & phase2_tracker).toReplaceWith(pixelTracks, _pixelTrackProducerFromSoAAlpakaPhase2.clone(
+    pixelRecHitLegacySrc = "siPixelRecHitsPreSplitting",
+))
+
+alpaka.toReplaceWith(pixelTracksTask, cms.Task(
+    # Build the pixel ntuplets and the pixel tracks in SoA format with alpaka on the device
+    pixelTracksAlpaka,
+    # Build the pixel ntuplets and the pixel tracks in SoA format with alpaka on the cpu (if requested by the validation)
+    pixelTracksAlpakaSerial,
+    # Convert the pixel tracks from SoA to legacy format
+    pixelTracks)
+)
diff --git a/RecoTracker/PixelVertexFinding/BuildFile.xml b/RecoTracker/PixelVertexFinding/BuildFile.xml
index 6171a7a94824a..aebe052016d0d 100644
--- a/RecoTracker/PixelVertexFinding/BuildFile.xml
+++ b/RecoTracker/PixelVertexFinding/BuildFile.xml
@@ -1,3 +1,5 @@
+<use name="alpaka"/>
+<use name="rootmath"/>
 <use name="CommonTools/Clustering1D"/>
 <use name="DataFormats/GeometryCommonDetAlgo"/>
 <use name="DataFormats/Math"/>
@@ -5,7 +7,6 @@
 <use name="DataFormats/VertexReco"/>
 <use name="FWCore/MessageLogger"/>
 <use name="FWCore/Utilities"/>
-<use name="rootmath"/>
 <export>
   <lib name="1"/>
 </export>
diff --git a/RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h b/RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h
new file mode 100644
index 0000000000000..0948d88ef3acf
--- /dev/null
+++ b/RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h
@@ -0,0 +1,33 @@
+#ifndef RecoTracker_PixelVertexFinding_interface_PixelVertexWorkSpaceLayout_h
+#define RecoTracker_PixelVertexFinding_interface_PixelVertexWorkSpaceLayout_h
+
+#include <alpaka/alpaka.hpp>
+
+#include "DataFormats/SoATemplate/interface/SoALayout.h"
+
+// Intermediate data used in the vertex reco algos
+// For internal use only
+namespace vertexFinder {
+
+  GENERATE_SOA_LAYOUT(PixelVertexWSSoALayout,
+                      SOA_COLUMN(uint16_t, itrk),            // index of original track
+                      SOA_COLUMN(float, zt),                 // input track z at bs
+                      SOA_COLUMN(float, ezt2),               // input error^2 on the above
+                      SOA_COLUMN(float, ptt2),               // input pt^2 on the above
+                      SOA_COLUMN(uint8_t, izt),              // integerized z-position of input tracks
+                      SOA_COLUMN(int32_t, iv),               // vertex index for each associated track
+                      SOA_SCALAR(uint32_t, ntrks),           // number of "selected tracks"
+                      SOA_SCALAR(uint32_t, nvIntermediate))  // the number of vertices after splitting, pruning, etc.
+
+  using PixelVertexWorkSpaceSoALayout = PixelVertexWSSoALayout<>;
+  using PixelVertexWorkSpaceSoAView = PixelVertexWSSoALayout<>::View;
+  using PixelVertexWorkSpaceSoAConstView = PixelVertexWSSoALayout<>::ConstView;
+
+  ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void init(PixelVertexWorkSpaceSoAView& workspace_view) {  // reset both scalar counters
+    workspace_view.ntrks() = 0;
+    workspace_view.nvIntermediate() = 0;
+  }
+
+}  // namespace vertexFinder
+
+#endif  // RecoTracker_PixelVertexFinding_interface_PixelVertexWorkSpaceLayout_h
diff --git a/RecoTracker/PixelVertexFinding/plugins/BuildFile.xml b/RecoTracker/PixelVertexFinding/plugins/BuildFile.xml
index d330676889f26..2df520dffcf5b 100644
--- a/RecoTracker/PixelVertexFinding/plugins/BuildFile.xml
+++ b/RecoTracker/PixelVertexFinding/plugins/BuildFile.xml
@@ -1,5 +1,3 @@
-<use name="CUDADataFormats/Vertex"/>
-<use name="CUDADataFormats/Track"/>
 <use name="CommonTools/Clustering1D"/>
 <use name="DataFormats/BeamSpot"/>
 <use name="DataFormats/GeometryCommonDetAlgo"/>
@@ -17,18 +15,33 @@
 <use name="FWCore/Utilities"/>
 <use name="Geometry/Records"/>
 <use name="Geometry/TrackerGeometryBuilder"/>
-<use name="HeterogeneousCore/CUDACore"/>
-<use name="HeterogeneousCore/CUDAUtilities"/>
 <use name="RecoLocalTracker/ClusterParameterEstimator"/>
 <use name="RecoLocalTracker/Records"/>
 <use name="RecoTracker/PixelVertexFinding"/>
 <use name="SimDataFormats/PileupSummaryInfo"/>
+
 <iftool name="cuda-gcc-support">
   <use name="cuda"/>
   <set name="cuda_src" value="*.cu"/>
 <else/>
   <set name="cuda_src" value=""/>
 </iftool>
+
 <library file="*.cc ${cuda_src}" name="RecoPixelVertexingPixelVertexFindingPlugins">
+ <use name="CUDADataFormats/Vertex"/>
+ <use name="CUDADataFormats/Track"/>
+ <use name="HeterogeneousCore/CUDACore"/>
+ <use name="HeterogeneousCore/CUDAUtilities"/>
+ <flags EDM_PLUGIN="1"/>
+</library>
+
+<library file="alpaka/*.cc" name="RecoPixelVertexingPixelVertexFindingPluginsPortable">
+  <use name="alpaka"/>
+  <use name="DataFormats/Portable"/>
+  <use name="DataFormats/TrackSoA"/>
+  <use name="DataFormats/VertexSoA"/>
+  <use name="HeterogeneousCore/AlpakaCore"/>
+  <use name="HeterogeneousCore/AlpakaInterface"/>
+  <flags ALPAKA_BACKENDS="1"/>
   <flags EDM_PLUGIN="1"/>
 </library>
diff --git a/RecoTracker/PixelVertexFinding/plugins/PixelVertexProducerFromSoAAlpaka.cc b/RecoTracker/PixelVertexFinding/plugins/PixelVertexProducerFromSoAAlpaka.cc
new file mode 100644
index 0000000000000..6e542f7870c2e
--- /dev/null
+++ b/RecoTracker/PixelVertexFinding/plugins/PixelVertexProducerFromSoAAlpaka.cc
@@ -0,0 +1,175 @@
+#include "DataFormats/BeamSpot/interface/BeamSpot.h"
+#include "DataFormats/Common/interface/OrphanHandle.h"
+#include "DataFormats/TrackReco/interface/Track.h"
+#include "DataFormats/TrackReco/interface/TrackExtra.h"
+#include "DataFormats/TrackReco/interface/TrackFwd.h"
+#include "DataFormats/VertexReco/interface/Vertex.h"
+#include "DataFormats/VertexReco/interface/VertexFwd.h"
+#include "DataFormats/VertexSoA/interface/ZVertexHost.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "Geometry/Records/interface/TrackerTopologyRcd.h"
+#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
+
+class PixelVertexProducerFromSoAAlpaka : public edm::global::EDProducer<> {  // converts the vertex SoA into a reco::VertexCollection
+public:
+  using IndToEdm = std::vector<uint32_t>;  // SoA track index -> index in the legacy track collection
+
+  explicit PixelVertexProducerFromSoAAlpaka(const edm::ParameterSet &iConfig);
+  ~PixelVertexProducerFromSoAAlpaka() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);
+
+private:
+  void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override;
+
+  edm::EDGetTokenT<ZVertexHost> tokenVertex_;  // "src"
+  edm::EDGetTokenT<reco::BeamSpot> tokenBeamSpot_;  // "beamSpot"
+  edm::EDGetTokenT<reco::TrackCollection> tokenTracks_;  // "TrackCollection"
+  edm::EDGetTokenT<IndToEdm> tokenIndToEdm_;  // also "TrackCollection": same input tag, different product type
+};
+
+PixelVertexProducerFromSoAAlpaka::PixelVertexProducerFromSoAAlpaka(const edm::ParameterSet &conf)
+    : tokenVertex_(consumes(conf.getParameter<edm::InputTag>("src"))),
+      tokenBeamSpot_(consumes(conf.getParameter<edm::InputTag>("beamSpot"))),
+      tokenTracks_(consumes(conf.getParameter<edm::InputTag>("TrackCollection"))),
+      tokenIndToEdm_(consumes(conf.getParameter<edm::InputTag>("TrackCollection"))) {  // index map read from the same tag as the tracks
+  produces<reco::VertexCollection>();
+}
+
+void PixelVertexProducerFromSoAAlpaka::fillDescriptions(edm::ConfigurationDescriptions &descriptions) {
+  edm::ParameterSetDescription desc;
+
+  desc.add<edm::InputTag>("TrackCollection", edm::InputTag("pixelTracks"));  // provides both the tracks and the IndToEdm map
+  desc.add<edm::InputTag>("beamSpot", edm::InputTag("offlineBeamSpot"));
+  desc.add<edm::InputTag>("src", edm::InputTag("pixelVerticesAlpaka"));
+
+  descriptions.add("pixelVertexFromSoAAlpaka", desc);  // explicit label rather than addWithDefaultLabel
+}
+
+void PixelVertexProducerFromSoAAlpaka::produce(edm::StreamID streamID,
+                                               edm::Event &iEvent,
+                                               const edm::EventSetup &) const {
+  auto vertexes = std::make_unique<reco::VertexCollection>();  // legacy vertices built from the vertex SoA below
+
+  auto tracksHandle = iEvent.getHandle(tokenTracks_);
+  auto tracksSize = tracksHandle->size();
+  auto const &indToEdm = iEvent.get(tokenIndToEdm_);  // SoA track index -> index in the legacy track collection
+  auto bsHandle = iEvent.getHandle(tokenBeamSpot_);
+
+  float x0 = 0, y0 = 0, z0 = 0, dxdz = 0, dydz = 0;
+  std::vector<int32_t> itrk;  // SoA indices of the tracks attached to the vertex being converted
+  itrk.reserve(64);  // avoid first relocations
+  if (!bsHandle.isValid()) {
+    edm::LogWarning("PixelVertexProducer") << "No beamspot found. returning vertexes with (0,0,Z) ";
+  } else {
+    const reco::BeamSpot &bs = *bsHandle;
+    x0 = bs.x0();
+    y0 = bs.y0();
+    z0 = bs.z0();
+    dxdz = bs.dxdz();
+    dydz = bs.dydz();
+  }
+
+  auto const &soa = iEvent.get(tokenVertex_);
+
+  int nv = soa.view().nvFinal();
+
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+  std::cout << "converting " << nv << " vertices "
+            << " from " << indToEdm.size() << " tracks" << std::endl;
+#endif  // PIXVERTEX_DEBUG_PRODUCE
+
+  std::set<uint32_t> uind;  // for verifying index consistency
+  for (int j = nv - 1; j >= 0; --j) {
+    auto i = soa.view()[j].sortInd();  // on gpu sorted in ascending order....
+    assert(i < nv);
+    uind.insert(i);
+    assert(itrk.empty());
+    auto z = soa.view()[i].zv();
+    auto x = x0 + dxdz * z;  // vertex x,y follow the beamspot line evaluated at the vertex z
+    auto y = y0 + dydz * z;
+    z += z0;
+    reco::Vertex::Error err;
+    err(2, 2) = 1.f / soa.view()[i].wv();
+    err(2, 2) *= 2.;  // artificially inflate error
+    //Copy also the tracks (no intention to be efficient....)
+    for (auto k = 0U; k < indToEdm.size(); ++k) {
+      if (soa.view()[k].idv() == int16_t(i))
+        itrk.push_back(k);
+    }
+    auto nt = itrk.size();
+    if (nt == 0) {
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+      std::cout << "vertex " << i << " with no tracks..." << std::endl;
+#endif  // PIXVERTEX_DEBUG_PRODUCE
+      continue;
+    }
+    if (nt < 2) {
+      itrk.clear();
+      continue;
+    }  // remove outliers
+    (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.view()[i].chi2(), soa.view()[i].ndof(), nt);
+    auto &v = (*vertexes).back();
+    v.reserve(itrk.size());
+    for (auto it : itrk) {
+      assert(it < int(indToEdm.size()));
+      auto k = indToEdm[it];
+      if (k >= tracksSize) {  // valid indices are [0, tracksSize): k == tracksSize is already out of range
+        edm::LogWarning("PixelVertexProducer") << "oops track " << it << " does not exists on CPU " << k;
+        continue;
+      }
+      auto tk = reco::TrackRef(tracksHandle, k);
+      v.add(tk);
+    }
+    itrk.clear();
+  }
+
+  LogDebug("PixelVertexProducer") << ": Found " << vertexes->size() << " vertexes\n";
+  for (unsigned int i = 0; i < vertexes->size(); ++i) {
+    LogDebug("PixelVertexProducer") << "Vertex number " << i << " has " << (*vertexes)[i].tracksSize()
+                                    << " tracks with a position of " << (*vertexes)[i].z() << " +- "
+                                    << std::sqrt((*vertexes)[i].covariance(2, 2));
+  }
+
+  // legacy logic: when no vertex was built, fall back to a single vertex placed at the beamspot
+  if (vertexes->empty() && bsHandle.isValid()) {
+    const reco::BeamSpot &bs = *bsHandle;
+
+    GlobalError bse(bs.rotatedCovariance3D());
+    if ((bse.cxx() <= 0.) || (bse.cyy() <= 0.) || (bse.czz() <= 0.)) {  // unusable beamspot errors: use a large dummy covariance
+      AlgebraicSymMatrix33 we;
+      we(0, 0) = 10000;
+      we(1, 1) = 10000;
+      we(2, 2) = 10000;
+      vertexes->push_back(reco::Vertex(bs.position(), we, 0., 0., 0));
+
+      edm::LogInfo("PixelVertexProducer") << "No vertices found. Beamspot with invalid errors " << bse.matrix()
+                                          << "\nWill put Vertex derived from dummy-fake BeamSpot into Event.\n"
+                                          << (*vertexes)[0].x() << "\n"
+                                          << (*vertexes)[0].y() << "\n"
+                                          << (*vertexes)[0].z() << "\n";
+    } else {
+      vertexes->push_back(reco::Vertex(bs.position(), bs.rotatedCovariance3D(), 0., 0., 0));
+
+      edm::LogInfo("PixelVertexProducer") << "No vertices found. Will put Vertex derived from BeamSpot into Event:\n"
+                                          << (*vertexes)[0].x() << "\n"
+                                          << (*vertexes)[0].y() << "\n"
+                                          << (*vertexes)[0].z() << "\n";
+    }
+  } else if (vertexes->empty() && !bsHandle.isValid()) {
+    edm::LogWarning("PixelVertexProducer") << "No beamspot and no vertex found. No vertex returned.";
+  }
+
+  iEvent.put(std::move(vertexes));
+}
+
+DEFINE_FWK_MODULE(PixelVertexProducerFromSoAAlpaka);
diff --git a/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoADevice.h b/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoADevice.h
index 0c55cd97b070a..48848ff959554 100644
--- a/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoADevice.h
+++ b/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoADevice.h
@@ -8,7 +8,7 @@
 template <int32_t S>
 class PixelVertexWorkSpaceSoADevice : public cms::cuda::PortableDeviceCollection<PixelVertexWSSoALayout<>> {
 public:
-  PixelVertexWorkSpaceSoADevice() = default;
+  explicit PixelVertexWorkSpaceSoADevice() = default;
 
   // Constructor which specifies the SoA size and CUDA stream
   explicit PixelVertexWorkSpaceSoADevice(cudaStream_t stream)
diff --git a/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHost.h b/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHost.h
index 0e698933b0731..9df8cc4580a1f 100644
--- a/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHost.h
+++ b/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHost.h
@@ -9,6 +9,7 @@ template <int32_t S>
 class PixelVertexWorkSpaceSoAHost : public cms::cuda::PortableHostCollection<PixelVertexWSSoALayout<>> {
 public:
   explicit PixelVertexWorkSpaceSoAHost() : PortableHostCollection<PixelVertexWSSoALayout<>>(S) {}
+
   // Constructor which specifies the SoA size and CUDA stream
   explicit PixelVertexWorkSpaceSoAHost(cudaStream_t stream)
       : PortableHostCollection<PixelVertexWSSoALayout<>>(S, stream) {}
diff --git a/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHostAlpaka.h b/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHostAlpaka.h
new file mode 100644
index 0000000000000..33e163dbab784
--- /dev/null
+++ b/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHostAlpaka.h
@@ -0,0 +1,15 @@
+#ifndef RecoTracker_PixelVertexFinding_plugins_PixelVertexWorkSpaceSoAHostAlpaka_h
+#define RecoTracker_PixelVertexFinding_plugins_PixelVertexWorkSpaceSoAHostAlpaka_h
+
+#include <alpaka/alpaka.hpp>
+
+#include "DataFormats/Portable/interface/PortableHostCollection.h"
+#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h"
+
+namespace vertexFinder {
+
+  using PixelVertexWorkSpaceSoAHost = PortableHostCollection<PixelVertexWSSoALayout<>>;
+
+}  // namespace vertexFinder
+
+#endif  // RecoTracker_PixelVertexFinding_plugins_PixelVertexWorkSpaceSoAHostAlpaka_h
diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexProducerAlpaka.cc b/RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexProducerAlpaka.cc
new file mode 100644
index 0000000000000..d572a181ccf85
--- /dev/null
+++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexProducerAlpaka.cc
@@ -0,0 +1,110 @@
+#include <alpaka/alpaka.hpp>
+
+#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Utilities/interface/StreamID.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EDPutToken.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/ESGetToken.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/Event.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EventSetup.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/global/EDProducer.h"
+
+#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h"
+#include "DataFormats/TrackSoA/interface/TracksDevice.h"
+#include "DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h"
+#include "DataFormats/VertexSoA/interface/ZVertexDevice.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/MakerMacros.h"
+
+#include "vertexFinder.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+
+  using namespace cms::alpakatools;
+
+  template <typename TrackerTraits>
+  class PixelVertexProducerAlpaka : public global::EDProducer<> {
+    using TkSoADevice = TracksSoACollection<TrackerTraits>;
+    using Algo = vertexFinder::Producer<TrackerTraits>;
+
+  public:
+    explicit PixelVertexProducerAlpaka(const edm::ParameterSet& iConfig);
+    ~PixelVertexProducerAlpaka() override = default;
+
+    static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+  private:
+    void produce(edm::StreamID streamID, device::Event& iEvent, const device::EventSetup& iSetup) const override;
+
+    const Algo algo_;  // vertex finder, configured once in the constructor
+
+    // Tracking cuts before sending tracks to vertex algo
+    const float ptMin_;
+    const float ptMax_;
+
+    device::EDGetToken<TkSoADevice> tokenDeviceTrack_;
+    device::EDPutToken<ZVertexSoACollection> tokenDeviceVertex_;
+  };
+
+  template <typename TrackerTraits>
+  PixelVertexProducerAlpaka<TrackerTraits>::PixelVertexProducerAlpaka(const edm::ParameterSet& conf)
+      : algo_(conf.getParameter<bool>("oneKernel"),
+              conf.getParameter<bool>("useDensity"),
+              conf.getParameter<bool>("useDBSCAN"),
+              conf.getParameter<bool>("useIterative"),
+              conf.getParameter<bool>("doSplitting"),
+              conf.getParameter<int>("minT"),
+              conf.getParameter<double>("eps"),
+              conf.getParameter<double>("errmax"),
+              conf.getParameter<double>("chi2max")),
+        ptMin_(conf.getParameter<double>("PtMin")),  // 0.5 GeV
+        ptMax_(conf.getParameter<double>("PtMax")),  // 75. GeV
+        tokenDeviceTrack_(consumes(conf.getParameter<edm::InputTag>("pixelTrackSrc"))),
+        tokenDeviceVertex_(produces()) {}
+
+  template <typename TrackerTraits>
+  void PixelVertexProducerAlpaka<TrackerTraits>::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+    edm::ParameterSetDescription desc;
+
+    // Only one of these three algos can be used at once.
+    // Maybe this should become a Plugin Factory
+    desc.add<bool>("oneKernel", true);
+    desc.add<bool>("useDensity", true);
+    desc.add<bool>("useDBSCAN", false);
+    desc.add<bool>("useIterative", false);
+    desc.add<bool>("doSplitting", true);
+
+    desc.add<int>("minT", 2);          // min number of neighbours to be "core"
+    desc.add<double>("eps", 0.07);     // max absolute distance to cluster
+    desc.add<double>("errmax", 0.01);  // max error to be "seed"
+    desc.add<double>("chi2max", 9.);   // max normalized distance to cluster
+
+    desc.add<double>("PtMin", 0.5);
+    desc.add<double>("PtMax", 75.);
+    desc.add<edm::InputTag>("pixelTrackSrc", edm::InputTag("pixelTracksAlpaka"));
+
+    descriptions.addWithDefaultLabel(desc);
+  }
+
+  template <typename TrackerTraits>
+  void PixelVertexProducerAlpaka<TrackerTraits>::produce(edm::StreamID streamID,
+                                                         device::Event& iEvent,
+                                                         const device::EventSetup& iSetup) const {
+    auto const& hTracks = iEvent.get(tokenDeviceTrack_);  // NOTE: despite the "h" prefix, this is read through the TkSoADevice token
+
+    iEvent.emplace(tokenDeviceVertex_, algo_.makeAsync(iEvent.queue(), hTracks.view(), ptMin_, ptMax_));
+  }
+
+  using PixelVertexProducerAlpakaPhase1 = PixelVertexProducerAlpaka<pixelTopology::Phase1>;
+  using PixelVertexProducerAlpakaPhase2 = PixelVertexProducerAlpaka<pixelTopology::Phase2>;
+  using PixelVertexProducerAlpakaHIonPhase1 = PixelVertexProducerAlpaka<pixelTopology::HIonPhase1>;
+
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+
+DEFINE_FWK_ALPAKA_MODULE(PixelVertexProducerAlpakaPhase1);
+DEFINE_FWK_ALPAKA_MODULE(PixelVertexProducerAlpakaPhase2);
+DEFINE_FWK_ALPAKA_MODULE(PixelVertexProducerAlpakaHIonPhase1);
diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h
new file mode 100644
index 0000000000000..d0ec816b32aee
--- /dev/null
+++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h
@@ -0,0 +1,23 @@
+#ifndef RecoTracker_PixelVertexFinding_plugins_alpaka_PixelVertexWorkSpaceSoADeviceAlpaka_h
+#define RecoTracker_PixelVertexFinding_plugins_alpaka_PixelVertexWorkSpaceSoADeviceAlpaka_h
+
+#include <alpaka/alpaka.hpp>
+
+#include "DataFormats/Portable/interface/alpaka/PortableCollection.h"
+#include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h"
+#include "RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHostAlpaka.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+
+  namespace vertexFinder {
+    // Workspace aliases: a PortableCollection of the workspace layout, and the host type from ::vertexFinder.
+    using PixelVertexWorkSpaceSoADevice = PortableCollection<::vertexFinder::PixelVertexWSSoALayout<>>;
+    using PixelVertexWorkSpaceSoAHost = ::vertexFinder::PixelVertexWorkSpaceSoAHost;
+
+  }  // namespace vertexFinder
+
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+
+#endif  // RecoTracker_PixelVertexFinding_plugins_alpaka_PixelVertexWorkSpaceSoADeviceAlpaka_h
diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h
new file mode 100644
index 0000000000000..29cd537ac4aa7
--- /dev/null
+++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h
@@ -0,0 +1,248 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_alpaka_clusterTracksByDensity_h
+#define RecoPixelVertexing_PixelVertexFinding_alpaka_clusterTracksByDensity_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <alpaka/alpaka.hpp>
+#include "DataFormats/VertexSoA/interface/ZVertexSoA.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
+#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h"
+#include "vertexFinder.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace vertexFinder {
+    using VtxSoAView = ::reco::ZVertexSoAView;
+    using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView;
+    // Density-based clustering of tracks along z, based on the Rodriguez & Laio algorithm.
+    // This algo does not really scale as it works in a single block: enough for the <10K tracks we have.
+    // Each track is linked to its closest compatible neighbour with a higher neighbour count (ties broken
+    // by z); an unlinked track with at least minT neighbours starts a cluster, any other one is noise.
+    // On return iv holds the cluster index of each track (9997 for noise), nvFinal the number of clusters.
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void __attribute__((always_inline))
+    clusterTracksByDensity(const TAcc& acc,
+                           VtxSoAView& pdata,
+                           WsSoAView& pws,
+                           int minT,      // min number of neighbours to be "seed"
+                           float eps,     // max absolute distance to cluster
+                           float errmax,  // max error to be "seed"
+                           float chi2max  // max normalized distance to cluster
+    ) {
+      using namespace vertexFinder;
+      constexpr bool verbose = false;  // in principle the compiler should optimize out if false
+      const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+
+      if constexpr (verbose) {
+        if (cms::alpakatools::once_per_block(acc))
+          printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
+      }
+      auto er2mx = errmax * errmax;
+
+      auto& __restrict__ data = pdata;
+      auto& __restrict__ ws = pws;
+      auto nt = ws.ntrks();
+      float const* __restrict__ zt = ws.zt();
+      float const* __restrict__ ezt2 = ws.ezt2();
+
+      uint32_t& nvFinal = data.nvFinal();
+      uint32_t& nvIntermediate = ws.nvIntermediate();
+
+      uint8_t* __restrict__ izt = ws.izt();    // histogram bin of each track
+      int32_t* __restrict__ nn = data.ndof();  // used here as the per-track neighbour counter
+      int32_t* __restrict__ iv = ws.iv();      // per-track link to another track, finally the cluster index
+
+      ALPAKA_ASSERT_OFFLOAD(zt);
+      ALPAKA_ASSERT_OFFLOAD(ezt2);
+      ALPAKA_ASSERT_OFFLOAD(izt);
+      ALPAKA_ASSERT_OFFLOAD(nn);
+      ALPAKA_ASSERT_OFFLOAD(iv);
+
+      using Hist = cms::alpakatools::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+      auto& hist = alpaka::declareSharedVar<Hist, __COUNTER__>(acc);
+      auto& hws = alpaka::declareSharedVar<Hist::Counter[32], __COUNTER__>(acc);
+
+      for (auto j : cms::alpakatools::elements_with_stride(acc, Hist::totbins())) {
+        hist.off[j] = 0;
+      }
+      alpaka::syncBlockThreads(acc);
+
+      if constexpr (verbose) {
+        if (cms::alpakatools::once_per_block(acc))
+          printf("booked hist with %d bins, size %d for %d tracks\n", hist.totbins(), hist.capacity(), nt);
+      }
+      ALPAKA_ASSERT_OFFLOAD(static_cast<int>(nt) <= hist.capacity());
+
+      // fill hist  (bin shall be wider than "eps")
+      for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+        ALPAKA_ASSERT_OFFLOAD(i < ::zVertex::MAXTRACKS);
+        int iz = int(zt[i] * 10.);  // valid if eps<=0.1
+        // clamp to the int8_t range so that the shifted value below fits the uint8_t bin index
+        iz = std::clamp(iz, INT8_MIN, INT8_MAX);
+        izt[i] = iz - INT8_MIN;
+        ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN >= 0);
+        ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN < 256);
+        hist.count(acc, izt[i]);
+        iv[i] = i;
+        nn[i] = 0;
+      }
+      alpaka::syncBlockThreads(acc);
+      if (threadIdxLocal < 32)
+        hws[threadIdxLocal] = 0;  // used by prefix scan...
+      alpaka::syncBlockThreads(acc);
+      hist.finalize(acc, hws);
+      alpaka::syncBlockThreads(acc);
+      ALPAKA_ASSERT_OFFLOAD(hist.size() == nt);
+      for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+        hist.fill(acc, izt[i], uint16_t(i));
+      }
+      alpaka::syncBlockThreads(acc);
+      // count neighbours (only for tracks with a small enough error)
+      for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+        if (ezt2[i] > er2mx)
+          continue;
+        auto loop = [&](uint32_t j) {
+          if (i == j)
+            return;
+          auto dist = std::abs(zt[i] - zt[j]);
+          if (dist > eps)
+            return;
+          if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+            return;
+          nn[i]++;
+        };
+
+        cms::alpakatools::forEachInBins(hist, izt[i], 1, loop);
+      }
+      alpaka::syncBlockThreads(acc);
+
+      // find closest above me .... (we ignore the possibility of two j at same distance from i)
+      for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+        float mdist = eps;
+        auto loop = [&](uint32_t j) {
+          if (nn[j] < nn[i])
+            return;
+          if (nn[j] == nn[i] && zt[j] >= zt[i])
+            return;  // if equal use natural order...
+          auto dist = std::abs(zt[i] - zt[j]);
+          if (dist > mdist)
+            return;
+          if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+            return;  // (break natural order???)
+          mdist = dist;
+          iv[i] = j;  // assign to cluster (better be unique??)
+        };
+        cms::alpakatools::forEachInBins(hist, izt[i], 1, loop);
+      }
+      alpaka::syncBlockThreads(acc);
+
+#ifdef GPU_DEBUG
+      //  mini verification
+      for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+        if (iv[i] != int(i))
+          ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i));
+      }
+      alpaka::syncBlockThreads(acc);
+#endif
+
+      // consolidate graph (percolate index of seed)
+      for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+        auto m = iv[i];
+        while (m != iv[m])
+          m = iv[m];
+        iv[i] = m;
+      }
+
+#ifdef GPU_DEBUG
+      alpaka::syncBlockThreads(acc);
+      //  mini verification
+      for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+        if (iv[i] != int(i))
+          ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i));
+      }
+#endif
+
+#ifdef GPU_DEBUG
+      // and verify that we did not split any cluster...
+      for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+        auto minJ = i;
+        auto mdist = eps;
+        auto loop = [&](uint32_t j) {
+          if (nn[j] < nn[i])
+            return;
+          if (nn[j] == nn[i] && zt[j] >= zt[i])
+            return;  // if equal use natural order...
+          auto dist = std::abs(zt[i] - zt[j]);
+          if (dist > mdist)
+            return;
+          if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+            return;
+          mdist = dist;
+          minJ = j;
+        };
+        cms::alpakatools::forEachInBins(hist, izt[i], 1, loop);
+        // should belong to the same cluster...
+        ALPAKA_ASSERT_OFFLOAD(iv[i] == iv[minJ]);
+        ALPAKA_ASSERT_OFFLOAD(nn[i] <= nn[iv[i]]);
+      }
+      alpaka::syncBlockThreads(acc);
+#endif
+
+      auto& foundClusters = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
+      foundClusters = 0;
+      alpaka::syncBlockThreads(acc);
+
+      // find the number of different clusters, identified by a track with iv[i] == i and density larger than
+      // threshold; mark these tracks with a negative id.
+      for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+        if (iv[i] == int(i)) {
+          if (nn[i] >= minT) {
+            auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{});
+            iv[i] = -(old + 1);
+          } else {  // noise
+            iv[i] = -9998;
+          }
+        }
+      }
+      alpaka::syncBlockThreads(acc);
+
+      ALPAKA_ASSERT_OFFLOAD(foundClusters < ::zVertex::MAXVTX);
+
+      // propagate the negative id to all the tracks in the cluster.
+      for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+        if (iv[i] >= 0) {
+          // mark each track in a cluster with the same id as the first one
+          iv[i] = iv[iv[i]];
+        }
+      }
+      alpaka::syncBlockThreads(acc);
+
+      // adjust the cluster id to be a positive value starting from 0
+      for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+        iv[i] = -iv[i] - 1;
+      }
+
+      nvIntermediate = nvFinal = foundClusters;
+      if constexpr (verbose) {
+        if (cms::alpakatools::once_per_block(acc))
+          printf("found %d proto vertices\n", foundClusters);
+      }
+    }
+    class ClusterTracksByDensityKernel {  // callable wrapper: forwards its arguments to clusterTracksByDensity()
+    public:
+      template <typename TAcc>
+      ALPAKA_FN_ACC void operator()(const TAcc& acc,
+                                    VtxSoAView pdata,
+                                    WsSoAView pws,
+                                    int minT,      // min number of neighbours to be "seed"
+                                    float eps,     // max absolute distance to cluster
+                                    float errmax,  // max error to be "seed"
+                                    float chi2max  // max normalized distance to cluster
+      ) const {
+        clusterTracksByDensity(acc, pdata, pws, minT, eps, errmax, chi2max);
+      }
+    };
+  }  // namespace vertexFinder
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+#endif  // RecoPixelVertexing_PixelVertexFinding_alpaka_clusterTracksByDensity_h
diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h
new file mode 100644
index 0000000000000..46ae2ad80ecc9
--- /dev/null
+++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h
@@ -0,0 +1,255 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_gpuClusterTracksDBSCAN_h
+#define RecoPixelVertexing_PixelVertexFinding_gpuClusterTracksDBSCAN_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <alpaka/alpaka.hpp>
+#include "DataFormats/VertexSoA/interface/ZVertexSoA.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
+#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h"
+#include "vertexFinder.h"
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace vertexFinder {
+    using VtxSoAView = ::reco::ZVertexSoAView;
+    using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView;
+    // DBSCAN-style clustering of tracks along z (core tracks have at least minT neighbours within eps).
+    // This algo does not really scale as it works in a single block: enough for the <10K tracks we have.
+    class ClusterTracksDBSCAN {
+    public:
+      template <typename TAcc>
+      ALPAKA_FN_ACC void operator()(const TAcc& acc,
+                                    VtxSoAView pdata,
+                                    WsSoAView pws,
+                                    int minT,      // min number of neighbours to be "core"
+                                    float eps,     // max absolute distance to cluster
+                                    float errmax,  // max error to be "seed"
+                                    float chi2max  // max normalized distance to cluster
+      ) const {
+        constexpr bool verbose = false;  // in principle the compiler should optimize out if false
+        const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+        if constexpr (verbose) {
+          if (cms::alpakatools::once_per_block(acc))
+            printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
+        }
+        auto er2mx = errmax * errmax;
+
+        auto& __restrict__ data = pdata;
+        auto& __restrict__ ws = pws;
+        auto nt = ws.ntrks();
+        float const* __restrict__ zt = ws.zt();
+        float const* __restrict__ ezt2 = ws.ezt2();
+
+        uint32_t& nvFinal = data.nvFinal();
+        uint32_t& nvIntermediate = ws.nvIntermediate();
+
+        uint8_t* __restrict__ izt = ws.izt();    // histogram bin of each track
+        int32_t* __restrict__ nn = data.ndof();  // used here as the per-track neighbour counter
+        int32_t* __restrict__ iv = ws.iv();      // per-track link to another track, finally the cluster index
+
+        ALPAKA_ASSERT_OFFLOAD(zt);
+        ALPAKA_ASSERT_OFFLOAD(iv);
+        ALPAKA_ASSERT_OFFLOAD(nn);
+        ALPAKA_ASSERT_OFFLOAD(ezt2);
+
+        using Hist = cms::alpakatools::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+        auto& hist = alpaka::declareSharedVar<Hist, __COUNTER__>(acc);
+        auto& hws = alpaka::declareSharedVar<Hist::Counter[32], __COUNTER__>(acc);
+
+        for (auto j : cms::alpakatools::elements_with_stride(acc, Hist::totbins())) {
+          hist.off[j] = 0;
+        }
+        alpaka::syncBlockThreads(acc);
+
+        if constexpr (verbose) {
+          if (cms::alpakatools::once_per_block(acc))
+            printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
+        }
+
+        ALPAKA_ASSERT_OFFLOAD(static_cast<int>(nt) <= hist.capacity());
+
+        // fill hist  (bin shall be wider than "eps")
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          ALPAKA_ASSERT_OFFLOAD(i < ::zVertex::MAXTRACKS);
+          int iz = int(zt[i] * 10.);  // valid if eps<=0.1
+          iz = std::clamp(iz, INT8_MIN, INT8_MAX);
+          izt[i] = iz - INT8_MIN;
+          ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN >= 0);
+          ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN < 256);
+          hist.count(acc, izt[i]);
+          iv[i] = i;
+          nn[i] = 0;
+        }
+        alpaka::syncBlockThreads(acc);
+        if (threadIdxLocal < 32)
+          hws[threadIdxLocal] = 0;  // used by prefix scan...
+        alpaka::syncBlockThreads(acc);
+        hist.finalize(acc, hws);
+        alpaka::syncBlockThreads(acc);
+        ALPAKA_ASSERT_OFFLOAD(hist.size() == nt);
+        // store the track index as uint16_t, matching the last Hist template argument and the sibling kernels
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          hist.fill(acc, izt[i], uint16_t(i));
+        }
+        alpaka::syncBlockThreads(acc);
+
+        // count neighbours
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          if (ezt2[i] > er2mx)
+            continue;
+          auto loop = [&](uint32_t j) {
+            if (i == j)
+              return;
+            auto dist = std::abs(zt[i] - zt[j]);
+            if (dist > eps)
+              return;
+            //        if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
+            nn[i]++;
+          };
+
+          cms::alpakatools::forEachInBins(hist, izt[i], 1, loop);
+        }
+
+        alpaka::syncBlockThreads(acc);
+
+        // find NN with smaller z...
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          if (nn[i] < minT)
+            continue;  // DBSCAN core rule
+          float mz = zt[i];
+          auto loop = [&](uint32_t j) {
+            if (zt[j] >= mz)
+              return;
+            if (nn[j] < minT)
+              return;  // DBSCAN core rule
+            auto dist = std::abs(zt[i] - zt[j]);
+            if (dist > eps)
+              return;
+            //        if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
+            mz = zt[j];
+            iv[i] = j;  // assign to cluster (better be unique??)
+          };
+          cms::alpakatools::forEachInBins(hist, izt[i], 1, loop);
+        }
+
+        alpaka::syncBlockThreads(acc);
+
+#ifdef GPU_DEBUG
+        //  mini verification
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          if (iv[i] != int(i))
+            ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i));
+        }
+        alpaka::syncBlockThreads(acc);
+#endif
+
+        // consolidate graph (percolate index of seed)
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          auto m = iv[i];
+          while (m != iv[m])
+            m = iv[m];
+          iv[i] = m;
+        }
+
+        alpaka::syncBlockThreads(acc);
+
+#ifdef GPU_DEBUG
+        //  mini verification
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          if (iv[i] != int(i))
+            ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i));
+        }
+        alpaka::syncBlockThreads(acc);
+#endif
+
+#ifdef GPU_DEBUG
+        // and verify that we did not split any cluster...
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          if (nn[i] < minT)
+            continue;  // DBSCAN core rule
+          ALPAKA_ASSERT_OFFLOAD(zt[iv[i]] <= zt[i]);
+          auto loop = [&](uint32_t j) {
+            if (nn[j] < minT)
+              return;  // DBSCAN core rule
+            auto dist = std::abs(zt[i] - zt[j]);
+            if (dist > eps)
+              return;
+            //  if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return;
+            // they should belong to the same cluster, isn't it?
+            if (iv[i] != iv[j]) {
+              printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]);
+              printf("      %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]);
+            }
+            ALPAKA_ASSERT_OFFLOAD(iv[i] == iv[j]);
+          };
+          cms::alpakatools::forEachInBins(hist, izt[i], 1, loop);
+        }
+        alpaka::syncBlockThreads(acc);
+#endif
+
+        // collect edges (assign to closest cluster of closest point??? here to closest point)
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          //    if (nn[i]==0 || nn[i]>=minT) continue;    // DBSCAN edge rule
+          if (nn[i] >= minT)
+            continue;  // DBSCAN edge rule
+          float mdist = eps;
+          auto loop = [&](uint32_t j) {
+            if (nn[j] < minT)
+              return;  // DBSCAN core rule
+            auto dist = std::abs(zt[i] - zt[j]);
+            if (dist > mdist)
+              return;
+            if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+              return;  // needed?
+            mdist = dist;
+            iv[i] = iv[j];  // assign to cluster (better be unique??)
+          };
+          cms::alpakatools::forEachInBins(hist, izt[i], 1, loop);
+        }
+
+        auto& foundClusters = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
+        foundClusters = 0;
+        alpaka::syncBlockThreads(acc);
+
+        // find the number of different clusters, identified by a track with iv[i] == i;
+        // mark these tracks with a negative id.
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          if (iv[i] == int(i)) {
+            if (nn[i] >= minT) {
+              auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{});
+              iv[i] = -(old + 1);
+            } else {  // noise
+              iv[i] = -9998;
+            }
+          }
+        }
+        alpaka::syncBlockThreads(acc);
+
+        ALPAKA_ASSERT_OFFLOAD(foundClusters < ::zVertex::MAXVTX);
+
+        // propagate the negative id to all the tracks in the cluster.
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          if (iv[i] >= 0) {
+            // mark each track in a cluster with the same id as the first one
+            iv[i] = iv[iv[i]];
+          }
+        }
+        alpaka::syncBlockThreads(acc);
+
+        // adjust the cluster id to be a positive value starting from 0
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          iv[i] = -iv[i] - 1;
+        }
+
+        nvIntermediate = nvFinal = foundClusters;
+
+        if constexpr (verbose) {
+          if (cms::alpakatools::once_per_block(acc))
+            printf("found %d proto vertices\n", foundClusters);
+        }
+      }
+    };
+  }  // namespace vertexFinder
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+#endif  // RecoPixelVertexing_PixelVertexFinding_gpuClusterTracksDBSCAN_h
diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h
new file mode 100644
index 0000000000000..3fe0202121f80
--- /dev/null
+++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h
@@ -0,0 +1,230 @@
+#ifndef RecoTracker_PixelVertexFinding_clusterTracksIterativeAlpaka_h
+#define RecoTracker_PixelVertexFinding_clusterTracksIterativeAlpaka_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <alpaka/alpaka.hpp>
+
+#include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
+#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h"
+#include "vertexFinder.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace vertexFinder {
+
+    // Iterative clustering of tracks along z: core tracks are merged by repeated atomicMin on their index.
+    // This algo does not really scale as it works in a single block: enough for the <10K tracks we have.
+    class ClusterTracksIterative {
+    public:
+      template <typename TAcc>
+      ALPAKA_FN_ACC void operator()(const TAcc& acc,
+                                    VtxSoAView pdata,
+                                    WsSoAView pws,
+                                    int minT,      // min number of neighbours to be "core"
+                                    float eps,     // max absolute distance to cluster
+                                    float errmax,  // max error to be "seed"
+                                    float chi2max  // max normalized distance to cluster
+      ) const {
+        constexpr bool verbose = false;  // in principle the compiler should optimize out if false
+        const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+        if constexpr (verbose) {
+          if (cms::alpakatools::once_per_block(acc))
+            printf("params %d %f %f %f\n", minT, eps, errmax, chi2max);
+        }
+        auto er2mx = errmax * errmax;
+
+        auto& __restrict__ data = pdata;
+        auto& __restrict__ ws = pws;
+        auto nt = ws.ntrks();
+        float const* __restrict__ zt = ws.zt();
+        float const* __restrict__ ezt2 = ws.ezt2();
+
+        uint32_t& nvFinal = data.nvFinal();
+        uint32_t& nvIntermediate = ws.nvIntermediate();
+
+        uint8_t* __restrict__ izt = ws.izt();    // histogram bin of each track
+        int32_t* __restrict__ nn = data.ndof();  // used here as the per-track neighbour counter
+        int32_t* __restrict__ iv = ws.iv();      // per-track link to another track, finally the cluster index
+
+        ALPAKA_ASSERT_OFFLOAD(zt);
+        ALPAKA_ASSERT_OFFLOAD(nn);
+        ALPAKA_ASSERT_OFFLOAD(iv);
+        ALPAKA_ASSERT_OFFLOAD(ezt2);
+
+        using Hist = cms::alpakatools::HistoContainer<uint8_t, 256, 16000, 8, uint16_t>;
+        auto& hist = alpaka::declareSharedVar<Hist, __COUNTER__>(acc);
+        auto& hws = alpaka::declareSharedVar<Hist::Counter[32], __COUNTER__>(acc);
+
+        for (auto j : cms::alpakatools::elements_with_stride(acc, Hist::totbins())) {
+          hist.off[j] = 0;
+        }
+        alpaka::syncBlockThreads(acc);
+
+        if constexpr (verbose) {
+          if (cms::alpakatools::once_per_block(acc))
+            printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt);
+        }
+
+        ALPAKA_ASSERT_OFFLOAD(static_cast<int>(nt) <= hist.capacity());
+
+        // fill hist  (bin shall be wider than "eps")
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          ALPAKA_ASSERT_OFFLOAD(i < ::zVertex::MAXTRACKS);
+          int iz = int(zt[i] * 10.);  // valid if eps<=0.1
+          iz = std::clamp(iz, INT8_MIN, INT8_MAX);
+          izt[i] = iz - INT8_MIN;
+          ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN >= 0);
+          ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN < 256);
+          hist.count(acc, izt[i]);
+          iv[i] = i;
+          nn[i] = 0;
+        }
+        alpaka::syncBlockThreads(acc);
+
+        if (threadIdxLocal < 32)
+          hws[threadIdxLocal] = 0;  // used by prefix scan...
+        alpaka::syncBlockThreads(acc);
+
+        hist.finalize(acc, hws);
+        alpaka::syncBlockThreads(acc);
+
+        ALPAKA_ASSERT_OFFLOAD(hist.size() == nt);
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          hist.fill(acc, izt[i], uint16_t(i));
+        }
+        alpaka::syncBlockThreads(acc);
+
+        // count neighbours
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          if (ezt2[i] > er2mx)
+            continue;
+          auto loop = [&](uint32_t j) {
+            if (i == j)
+              return;
+            auto dist = std::abs(zt[i] - zt[j]);
+            if (dist > eps)
+              return;
+            if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+              return;
+            nn[i]++;
+          };
+
+          cms::alpakatools::forEachInBins(hist, izt[i], 1, loop);
+        }
+
+        // iteration counter, kept per thread: all threads leave the loop below together (block-wide OR)
+        int nloops = 0;
+
+        alpaka::syncBlockThreads(acc);
+
+        // cluster seeds only: even iterations merge neighbouring core tracks, odd ones consolidate the graph
+        bool more = true;
+        while (alpaka::syncBlockThreadsPredicate<alpaka::BlockOr>(acc, more)) {
+          if (1 == nloops % 2) {
+            for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+              auto m = iv[i];
+              while (m != iv[m])
+                m = iv[m];
+              iv[i] = m;
+            }
+          } else {
+            more = false;
+            for (auto k : cms::alpakatools::elements_with_stride(acc, hist.size())) {
+              auto p = hist.begin() + k;
+              auto i = (*p);
+              auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1));
+              if (nn[i] < minT)
+                continue;  // DBSCAN core rule
+              auto loop = [&](uint32_t j) {
+                ALPAKA_ASSERT_OFFLOAD(i != j);
+                if (nn[j] < minT)
+                  return;  // DBSCAN core rule
+                auto dist = std::abs(zt[i] - zt[j]);
+                if (dist > eps)
+                  return;
+                if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+                  return;
+                auto old = alpaka::atomicMin(acc, &iv[j], iv[i], alpaka::hierarchy::Blocks{});
+                if (old != iv[i]) {
+                  // end the loop only if no changes were applied
+                  more = true;
+                }
+                alpaka::atomicMin(acc, &iv[i], old, alpaka::hierarchy::Blocks{});
+              };
+              ++p;
+              for (; p < hist.end(be); ++p)
+                loop(*p);
+            }  // for i
+          }
+          // each thread advances its own counter, so no thread can race ahead on a shared one
+          ++nloops;
+        }  // while
+
+        // collect edges (assign to closest cluster of closest point??? here to closest point)
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          //    if (nn[i]==0 || nn[i]>=minT) continue;    // DBSCAN edge rule
+          if (nn[i] >= minT)
+            continue;  // DBSCAN edge rule
+          float mdist = eps;
+          auto loop = [&](int j) {
+            if (nn[j] < minT)
+              return;  // DBSCAN core rule
+            auto dist = std::abs(zt[i] - zt[j]);
+            if (dist > mdist)
+              return;
+            if (dist * dist > chi2max * (ezt2[i] + ezt2[j]))
+              return;  // needed?
+            mdist = dist;
+            iv[i] = iv[j];  // assign to cluster (better be unique??)
+          };
+          cms::alpakatools::forEachInBins(hist, izt[i], 1, loop);
+        }
+
+        auto& foundClusters = alpaka::declareSharedVar<unsigned int, __COUNTER__>(acc);
+        foundClusters = 0;
+        alpaka::syncBlockThreads(acc);
+
+        // find the number of different clusters, identified by a track with iv[i] == i;
+        // mark these tracks with a negative id.
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          if (iv[i] == int(i)) {
+            if (nn[i] >= minT) {
+              auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{});
+              iv[i] = -(old + 1);
+            } else {  // noise
+              iv[i] = -9998;
+            }
+          }
+        }
+        alpaka::syncBlockThreads(acc);
+
+        ALPAKA_ASSERT_OFFLOAD(foundClusters < ::zVertex::MAXVTX);
+
+        // propagate the negative id to all the tracks in the cluster.
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          if (iv[i] >= 0) {
+            // mark each track in a cluster with the same id as the first one
+            iv[i] = iv[iv[i]];
+          }
+        }
+        alpaka::syncBlockThreads(acc);
+
+        // adjust the cluster id to be a positive value starting from 0
+        for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+          iv[i] = -iv[i] - 1;
+        }
+
+        nvIntermediate = nvFinal = foundClusters;
+
+        if constexpr (verbose) {
+          if (cms::alpakatools::once_per_block(acc))
+            printf("found %d proto vertices\n", foundClusters);
+        }
+      }
+    };
+  }  // namespace vertexFinder
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+#endif  // RecoTracker_PixelVertexFinding_clusterTracksIterativeAlpaka_h
diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h
new file mode 100644
index 0000000000000..9ff4656b9718e
--- /dev/null
+++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h
@@ -0,0 +1,123 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_gpuFitVertices_h
+#define RecoPixelVertexing_PixelVertexFinding_gpuFitVertices_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <alpaka/alpaka.hpp>
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
+
+#include "vertexFinder.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace vertexFinder {
+    // Fit z position, weight and chi2 of each vertex from its tracks; tracks contributing more than chi2Max become outliers.
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void fitVertices(
+        const TAcc& acc,
+        VtxSoAView& pdata,
+        WsSoAView& pws,
+        float chi2Max  // for outlier rejection: maximum chi2 contribution allowed for a single track
+    ) {
+      constexpr bool verbose = false;  // in principle the compiler should optimize out if false
+
+      auto& __restrict__ data = pdata;
+      auto& __restrict__ ws = pws;
+      auto nt = ws.ntrks();
+      float const* __restrict__ zt = ws.zt();
+      float const* __restrict__ ezt2 = ws.ezt2();
+      float* __restrict__ zv = data.zv();
+      float* __restrict__ wv = data.wv();
+      float* __restrict__ chi2 = data.chi2();
+      uint32_t& nvFinal = data.nvFinal();
+      uint32_t& nvIntermediate = ws.nvIntermediate();
+      int32_t* __restrict__ nn = data.ndof();
+      int32_t* __restrict__ iv = ws.iv();
+
+      ALPAKA_ASSERT_OFFLOAD(nvFinal <= nvIntermediate);
+      nvFinal = nvIntermediate;
+      auto foundClusters = nvFinal;
+
+      // zero the per-vertex accumulators
+      for (auto i : cms::alpakatools::elements_with_stride(acc, foundClusters)) {
+        zv[i] = 0;
+        wv[i] = 0;
+        chi2[i] = 0;
+      }
+
+      // counter of the skipped tracks, only filled and printed when verbose is enabled
+      auto& noise = alpaka::declareSharedVar<int, __COUNTER__>(acc);
+
+      if constexpr (verbose) {
+        if (cms::alpakatools::once_per_block(acc))
+          noise = 0;
+      }
+      alpaka::syncBlockThreads(acc);
+
+      // compute cluster location as the weighted mean of the track z, with weights 1 / ezt2
+      for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+        if (iv[i] > 9990) {  // ids above 9990 flag tracks that do not take part in the fit (e.g. 9999 set below)
+          if constexpr (verbose)
+            alpaka::atomicAdd(acc, &noise, 1, alpaka::hierarchy::Threads{});
+          continue;
+        }
+        ALPAKA_ASSERT_OFFLOAD(iv[i] >= 0);
+        ALPAKA_ASSERT_OFFLOAD(iv[i] < int(foundClusters));
+        auto w = 1.f / ezt2[i];
+        alpaka::atomicAdd(acc, &zv[iv[i]], zt[i] * w, alpaka::hierarchy::Threads{});
+        alpaka::atomicAdd(acc, &wv[iv[i]], w, alpaka::hierarchy::Threads{});
+      }
+
+      alpaka::syncBlockThreads(acc);
+      // normalise the position by the summed weight and reset nn, which is reused to count the ndof
+      for (auto i : cms::alpakatools::elements_with_stride(acc, foundClusters)) {
+        ALPAKA_ASSERT_OFFLOAD(wv[i] > 0.f);
+        zv[i] /= wv[i];
+        nn[i] = -1;  // ndof
+      }
+      alpaka::syncBlockThreads(acc);
+
+      // compute chi2 and flag as outliers (id 9999) the tracks contributing more than chi2Max
+      for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+        if (iv[i] > 9990)
+          continue;
+
+        auto c2 = zv[iv[i]] - zt[i];
+        c2 *= c2 / ezt2[i];
+        if (c2 > chi2Max) {
+          iv[i] = 9999;
+          continue;
+        }
+        alpaka::atomicAdd(acc, &chi2[iv[i]], c2, alpaka::hierarchy::Blocks{});
+        alpaka::atomicAdd(acc, &nn[iv[i]], 1, alpaka::hierarchy::Blocks{});
+      }
+      alpaka::syncBlockThreads(acc);
+
+      for (auto i : cms::alpakatools::elements_with_stride(acc, foundClusters)) {
+        if (nn[i] > 0) {
+          wv[i] *= float(nn[i]) / chi2[i];  // rescale the weight by ndof / chi2
+        }
+      }
+      if constexpr (verbose) {
+        if (cms::alpakatools::once_per_block(acc)) {
+          printf("found %d proto clusters ", foundClusters);
+          printf("and %d noise\n", noise);
+        }
+      }
+    }
+
+    class FitVerticesKernel {  // kernel functor that forwards its arguments to fitVertices()
+    public:
+      template <typename TAcc>
+      ALPAKA_FN_ACC void operator()(const TAcc& acc,
+                                    VtxSoAView pdata,
+                                    WsSoAView pws,
+                                    float chi2Max  // for outlier rejection
+      ) const {
+        fitVertices(acc, pdata, pws, chi2Max);
+      }
+    };
+  }  // namespace vertexFinder
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+#endif  // RecoPixelVertexing_PixelVertexFinding_gpuFitVertices_h
diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h
new file mode 100644
index 0000000000000..2c6f0cb0597e4
--- /dev/null
+++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h
@@ -0,0 +1,80 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_sortByPt2_h
+#define RecoPixelVertexing_PixelVertexFinding_sortByPt2_h
+
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <alpaka/alpaka.hpp>
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/radixSort.h"
+#include "DataFormats/VertexSoA/interface/ZVertexSoA.h"
+#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h"
+
+#include "vertexFinder.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace vertexFinder {
+    using VtxSoAView = ::reco::ZVertexSoAView;
+    using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView;
+
+    // Sum the pt^2 of the tracks of each vertex into ptv2, then fill sortInd with the vertex indices sorted by ptv2.
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE void sortByPt2(const TAcc& acc, VtxSoAView& data, WsSoAView& ws) {
+      auto nt = ws.ntrks();
+      float const* __restrict__ ptt2 = ws.ptt2();
+      uint32_t const& nvFinal = data.nvFinal();
+      int32_t const* __restrict__ iv = ws.iv();
+      float* __restrict__ ptv2 = data.ptv2();
+      uint16_t* __restrict__ sortInd = data.sortInd();
+
+      // nothing to sort
+      if (nvFinal < 1)
+        return;
+
+      // fill indexing: store for each original track the id of the vertex it was assigned to
+      for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+        data.idv()[ws.itrk()[i]] = iv[i];
+      }
+
+      // can be done asynchronously at the end of previous event
+      for (auto i : cms::alpakatools::elements_with_stride(acc, nvFinal)) {
+        ptv2[i] = 0;
+      }
+      alpaka::syncBlockThreads(acc);
+
+      // accumulate the track pt^2 per vertex, skipping the tracks flagged with an id above 9990
+      for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) {
+        if (iv[i] <= 9990) {
+          alpaka::atomicAdd(acc, &ptv2[iv[i]], ptt2[i], alpaka::hierarchy::Blocks{});
+        }
+      }
+      alpaka::syncBlockThreads(acc);
+
+      const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+      if (1 == nvFinal) {
+        if (threadIdxLocal == 0)
+          sortInd[0] = 0;
+        return;
+      }
+
+      if constexpr (not cms::alpakatools::requires_single_thread_per_block_v<TAcc>) {
+        auto& sws = alpaka::declareSharedVar<uint16_t[1024], __COUNTER__>(acc);
+        // sort using only 16 bits; instantiate on TAcc (the type of acc) rather than on a hard-coded Acc1D
+        cms::alpakatools::radixSort<TAcc, float, 2>(acc, ptv2, sortInd, sws, nvFinal);
+      } else {
+        for (uint16_t i = 0; i < nvFinal; ++i)
+          sortInd[i] = i;
+        std::sort(sortInd, sortInd + nvFinal, [&](auto i, auto j) { return ptv2[i] < ptv2[j]; });
+      }
+    }
+
+    class SortByPt2Kernel {  // kernel functor that forwards its arguments to sortByPt2()
+    public:
+      template <typename TAcc>
+      ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, WsSoAView pws) const {
+        sortByPt2(acc, pdata, pws);
+      }
+    };
+  }  // namespace vertexFinder
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+#endif  // RecoPixelVertexing_PixelVertexFinding_sortByPt2_h
diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h
new file mode 100644
index 0000000000000..7d31a48a0f6f3
--- /dev/null
+++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h
@@ -0,0 +1,162 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_splitVertices_h
+#define RecoPixelVertexing_PixelVertexFinding_splitVertices_h
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <alpaka/alpaka.hpp>
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h"
+
+#include "vertexFinder.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace vertexFinder {
+    using VtxSoAView = ::reco::ZVertexSoAView;
+    using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView;
+    // Try to split in two each vertex with at least 4 ndof and chi2 >= maxChi2 * ndof; one block handles one vertex.
+    template <typename TAcc>
+    ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void splitVertices(const TAcc& acc,
+                                                                                     VtxSoAView& pdata,
+                                                                                     WsSoAView& pws,
+                                                                                     float maxChi2) {
+      constexpr bool verbose = false;  // in principle the compiler should optimize out if false
+      const uint32_t threadIdxLocal(alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0u]);
+      auto& __restrict__ data = pdata;
+      auto& __restrict__ ws = pws;
+      auto nt = ws.ntrks();
+      float const* __restrict__ zt = ws.zt();
+      float const* __restrict__ ezt2 = ws.ezt2();
+      float* __restrict__ zv = data.zv();
+      float* __restrict__ wv = data.wv();
+      float const* __restrict__ chi2 = data.chi2();
+      uint32_t& nvFinal = data.nvFinal();
+
+      int32_t const* __restrict__ nn = data.ndof();
+      int32_t* __restrict__ iv = ws.iv();
+
+      ALPAKA_ASSERT_OFFLOAD(zt);
+      ALPAKA_ASSERT_OFFLOAD(wv);
+      ALPAKA_ASSERT_OFFLOAD(chi2);
+      ALPAKA_ASSERT_OFFLOAD(nn);
+
+      constexpr uint32_t MAXTK = 512;
+
+      auto& it = alpaka::declareSharedVar<uint32_t[MAXTK], __COUNTER__>(acc);   // track index
+      auto& zz = alpaka::declareSharedVar<float[MAXTK], __COUNTER__>(acc);      // z pos
+      auto& newV = alpaka::declareSharedVar<uint8_t[MAXTK], __COUNTER__>(acc);  // 0 or 1
+      auto& ww = alpaka::declareSharedVar<float[MAXTK], __COUNTER__>(acc);      // z weight
+      auto& nq = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);          // number of tracks for this vertex
+
+      const uint32_t blockIdx(alpaka::getIdx<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+      const uint32_t gridDimension(alpaka::getWorkDiv<alpaka::Grid, alpaka::Blocks>(acc)[0u]);
+
+      // one vertex per block
+      for (auto kv = blockIdx; kv < nvFinal; kv += gridDimension) {
+        if (nn[kv] < 4)
+          continue;
+        if (chi2[kv] < maxChi2 * float(nn[kv]))
+          continue;
+
+        ALPAKA_ASSERT_OFFLOAD(nn[kv] < int32_t(MAXTK));
+
+        if ((uint32_t)nn[kv] >= MAXTK)
+          continue;  // too bad FIXME
+
+        nq = 0u;
+        alpaka::syncBlockThreads(acc);
+
+        // copy to local: z relative to the vertex, weight, original index and initial side of each track
+        for (auto k : cms::alpakatools::elements_with_stride(acc, nt)) {
+          if (iv[k] == int(kv)) {
+            auto old = alpaka::atomicInc(acc, &nq, MAXTK, alpaka::hierarchy::Threads{});
+            zz[old] = zt[k] - zv[kv];
+            newV[old] = zz[old] < 0 ? 0 : 1;
+            ww[old] = 1.f / ezt2[k];
+            it[old] = k;
+          }
+        }
+
+        // the new vertices
+        auto& znew = alpaka::declareSharedVar<float[2], __COUNTER__>(acc);
+        auto& wnew = alpaka::declareSharedVar<float[2], __COUNTER__>(acc);
+        alpaka::syncBlockThreads(acc);
+
+        ALPAKA_ASSERT_OFFLOAD(int(nq) == nn[kv] + 1);
+
+        int maxiter = 20;
+        // kt-min: reassign each track to the closer of the two candidates until stable, at most maxiter times
+        bool more = true;
+        while (alpaka::syncBlockThreadsPredicate<alpaka::BlockOr>(acc, more)) {
+          more = false;
+          if (0 == threadIdxLocal) {
+            znew[0] = 0;
+            znew[1] = 0;
+            wnew[0] = 0;
+            wnew[1] = 0;
+          }
+          alpaka::syncBlockThreads(acc);
+
+          for (auto k : cms::alpakatools::elements_with_stride(acc, nq)) {
+            auto i = newV[k];
+            alpaka::atomicAdd(acc, &znew[i], zz[k] * ww[k], alpaka::hierarchy::Threads{});
+            alpaka::atomicAdd(acc, &wnew[i], ww[k], alpaka::hierarchy::Threads{});
+          }
+          alpaka::syncBlockThreads(acc);
+
+          if (0 == threadIdxLocal) {
+            znew[0] /= wnew[0];
+            znew[1] /= wnew[1];
+          }
+          alpaka::syncBlockThreads(acc);
+
+          for (auto k : cms::alpakatools::elements_with_stride(acc, nq)) {
+            auto d0 = fabs(zz[k] - znew[0]);
+            auto d1 = fabs(zz[k] - znew[1]);
+            auto newer = d0 < d1 ? 0 : 1;
+            more |= newer != newV[k];
+            newV[k] = newer;
+          }
+          --maxiter;
+          if (maxiter <= 0)
+            more = false;
+        }
+
+        // avoid empty vertices
+        if (0 == wnew[0] || 0 == wnew[1])
+          continue;
+
+        // quality cut
+        auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]);
+
+        auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]);
+
+        if (verbose && 0 == threadIdxLocal)
+          printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]);
+
+        if (chi2Dist < 4)
+          continue;
+
+        // get a new global vertex
+        auto& igv = alpaka::declareSharedVar<uint32_t, __COUNTER__>(acc);
+        if (0 == threadIdxLocal)
+          igv = alpaka::atomicAdd(acc, &ws.nvIntermediate(), 1u, alpaka::hierarchy::Blocks{});
+        alpaka::syncBlockThreads(acc);
+        for (auto k : cms::alpakatools::elements_with_stride(acc, nq)) {
+          if (1 == newV[k])
+            iv[it[k]] = igv;
+        }
+        alpaka::syncBlockThreads(acc);  // every thread must be done with nq before the next iteration resets it
+      }  // loop on vertices
+    }
+
+    class SplitVerticesKernel {  // kernel functor that forwards its arguments to splitVertices()
+    public:
+      template <typename TAcc>
+      ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, WsSoAView pws, float maxChi2) const {
+        splitVertices(acc, pdata, pws, maxChi2);
+      }
+    };
+  }  // namespace vertexFinder
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+#endif  // RecoPixelVertexing_PixelVertexFinding_splitVertices_h
diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc
new file mode 100644
index 0000000000000..83bc8f0d84ec2
--- /dev/null
+++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc
@@ -0,0 +1,208 @@
+#include <alpaka/alpaka.hpp>
+#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h"
+
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/traits.h"
+#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h"
+#include "RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h"
+
+#include "vertexFinder.h"
+#include "vertexFinder.h"
+#include "clusterTracksDBSCAN.h"
+#include "clusterTracksIterative.h"
+#include "clusterTracksByDensity.h"
+#include "fitVertices.h"
+#include "sortByPt2.h"
+#include "splitVertices.h"
+
+#undef PIXVERTEX_DEBUG_PRODUCE
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace vertexFinder {
+    using namespace cms::alpakatools;
+    // reject outlier tracks that contribute more than this to the chi2 of the vertex fit
+    constexpr float maxChi2ForFirstFit = 50.f;
+    constexpr float maxChi2ForFinalFit = 5000.f;
+
+    // split vertices with a chi2/NDoF greater than this
+    constexpr float maxChi2ForSplit = 9.f;
+
+    // Kernel copying the tracks that pass the selection (not triplets, at least highPurity, pt >= ptMin) to the workspace.
+    template <typename TrackerTraits>
+    class LoadTracks {
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(const TAcc& acc,
+                                    reco::TrackSoAConstView<TrackerTraits> tracks_view,
+                                    VtxSoAView soa,
+                                    WsSoAView pws,
+                                    float ptMin,
+                                    float ptMax) const {
+        auto const* quality = tracks_view.quality();
+        using helper = TracksUtilities<TrackerTraits>;
+
+        for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.nTracks())) {
+          [[maybe_unused]] auto nHits = helper::nHits(tracks_view, idx);
+          ALPAKA_ASSERT_OFFLOAD(nHits >= 3);
+
+          // initialize soa: the vertex index of every track starts at -1
+          soa[idx].idv() = -1;
+
+          if (helper::isTriplet(tracks_view, idx))
+            continue;  // no triplets
+          if (quality[idx] < ::pixelTrack::Quality::highPurity)
+            continue;
+          auto pt = tracks_view[idx].pt();
+
+          if (pt < ptMin)
+            continue;
+
+          // clamp pt to ptMax
+          pt = std::min<float>(pt, ptMax);
+
+          auto& data = pws;
+          auto it = alpaka::atomicAdd(acc, &data.ntrks(), 1u, alpaka::hierarchy::Blocks{});  // next free workspace slot
+          data[it].itrk() = idx;
+          data[it].zt() = helper::zip(tracks_view, idx);
+          data[it].ezt2() = tracks_view[idx].covariance()(14);  // NOTE(review): presumably the z-z term, to be confirmed
+          data[it].ptt2() = pt * pt;
+        }
+      }
+    };
+// #define THREE_KERNELS
+#ifndef THREE_KERNELS
+    class VertexFinderOneKernel {  // density clustering, fit, optional split and refit, then sort, in a single kernel
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(const TAcc& acc,
+                                    VtxSoAView pdata,
+                                    WsSoAView pws,
+                                    bool doSplit,
+                                    int minT,      // min number of neighbours to be "seed"
+                                    float eps,     // max absolute distance to cluster
+                                    float errmax,  // max error to be "seed"
+                                    float chi2max  // max normalized distance to cluster
+      ) const {
+        clusterTracksByDensity(acc, pdata, pws, minT, eps, errmax, chi2max);
+        alpaka::syncBlockThreads(acc);
+        fitVertices(acc, pdata, pws, maxChi2ForFirstFit);
+        alpaka::syncBlockThreads(acc);
+        if (doSplit) {
+          splitVertices(acc, pdata, pws, maxChi2ForSplit);
+          alpaka::syncBlockThreads(acc);
+          fitVertices(acc, pdata, pws, maxChi2ForFinalFit);  // refit after the split, with the looser outlier cut
+          alpaka::syncBlockThreads(acc);
+        }
+        sortByPt2(acc, pdata, pws);
+      }
+    };
+#else
+    class VertexFinderKernel1 {  // first kernel of the THREE_KERNELS variant: density clustering and first fit
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(const TAcc& acc,
+                                    VtxSoAView pdata,
+                                    WsSoAView pws,
+                                    int minT,      // min number of neighbours to be "seed"
+                                    float eps,     // max absolute distance to cluster
+                                    float errmax,  // max error to be "seed"
+                                    float chi2max  // max normalized distance to cluster
+      ) const {
+        clusterTracksByDensity(acc, pdata, pws, minT, eps, errmax, chi2max);  // the device functions take acc first
+        alpaka::syncBlockThreads(acc);
+        fitVertices(acc, pdata, pws, maxChi2ForFirstFit);
+      }
+    };
+    class VertexFinderKernel2 {  // last kernel of the THREE_KERNELS variant: final fit and sorting
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, WsSoAView pws) const {
+        fitVertices(acc, pdata, pws, maxChi2ForFinalFit);  // the device functions take acc as first argument
+        alpaka::syncBlockThreads(acc);
+        sortByPt2(acc, pdata, pws);
+      }
+    };
+#endif
+
+    // Enqueue on `queue` the kernels that build the vertices from the selected tracks; return the collection they fill.
+    template <typename TrackerTraits>
+    ZVertexSoACollection Producer<TrackerTraits>::makeAsync(Queue& queue,
+                                                            const reco::TrackSoAConstView<TrackerTraits>& tracks_view,
+                                                            float ptMin,
+                                                            float ptMax) const {
+#ifdef PIXVERTEX_DEBUG_PRODUCE
+      std::cout << "producing Vertices on GPU" << std::endl;
+#endif  // PIXVERTEX_DEBUG_PRODUCE
+      ZVertexSoACollection vertices(queue);
+      auto soa = vertices.view();
+
+      auto ws_d = PixelVertexWorkSpaceSoADevice(::zVertex::MAXTRACKS, queue);
+
+      // Initialize
+      const auto initWorkDiv = cms::alpakatools::make_workdiv<Acc1D>(1, 1);
+      alpaka::exec<Acc1D>(queue, initWorkDiv, Init{}, soa, ws_d.view());
+
+      // Load Tracks; NOTE(review): "+ blockSize - 1" looks redundant if divide_up_by already rounds up, to be confirmed
+      const uint32_t blockSize = 128;
+      const uint32_t numberOfBlocks =
+          cms::alpakatools::divide_up_by(tracks_view.metadata().size() + blockSize - 1, blockSize);
+      const auto loadTracksWorkDiv = cms::alpakatools::make_workdiv<Acc1D>(numberOfBlocks, blockSize);
+      alpaka::exec<Acc1D>(
+          queue, loadTracksWorkDiv, LoadTracks<TrackerTraits>{}, tracks_view, soa, ws_d.view(), ptMin, ptMax);
+
+      // Running too many threads leads to problems when printf is enabled.
+      const auto finderSorterWorkDiv = cms::alpakatools::make_workdiv<Acc1D>(1, 1024 - 128);
+      const auto splitterFitterWorkDiv = cms::alpakatools::make_workdiv<Acc1D>(1024, 128);
+
+      if (oneKernel_) {
+        // implemented only for density clusters
+#ifndef THREE_KERNELS
+        alpaka::exec<Acc1D>(queue,
+                            finderSorterWorkDiv,
+                            VertexFinderOneKernel{},
+                            soa,
+                            ws_d.view(),
+                            doSplitting_,
+                            minT,
+                            eps,
+                            errmax,
+                            chi2max);
+#else
+        alpaka::exec<Acc1D>(
+            queue, finderSorterWorkDiv, VertexFinderKernel1{}, soa, ws_d.view(), minT, eps, errmax, chi2max);
+
+        // one block per vertex...
+        if (doSplitting_)
+          alpaka::exec<Acc1D>(queue, splitterFitterWorkDiv, SplitVerticesKernel{}, soa, ws_d.view(), maxChi2ForSplit);
+        alpaka::exec<Acc1D>(queue, finderSorterWorkDiv, VertexFinderKernel2{}, soa, ws_d.view());
+#endif
+      } else {  // five kernels
+        if (useDensity_) {
+          alpaka::exec<Acc1D>(
+              queue, finderSorterWorkDiv, ClusterTracksByDensityKernel{}, soa, ws_d.view(), minT, eps, errmax, chi2max);
+
+        } else if (useDBSCAN_) {
+          alpaka::exec<Acc1D>(
+              queue, finderSorterWorkDiv, ClusterTracksDBSCAN{}, soa, ws_d.view(), minT, eps, errmax, chi2max);
+        } else if (useIterative_) {
+          alpaka::exec<Acc1D>(
+              queue, finderSorterWorkDiv, ClusterTracksIterative{}, soa, ws_d.view(), minT, eps, errmax, chi2max);
+        }
+        alpaka::exec<Acc1D>(queue, finderSorterWorkDiv, FitVerticesKernel{}, soa, ws_d.view(), maxChi2ForFirstFit);
+
+        // one block per vertex...
+        if (doSplitting_) {
+          alpaka::exec<Acc1D>(queue, splitterFitterWorkDiv, SplitVerticesKernel{}, soa, ws_d.view(), maxChi2ForSplit);
+
+          alpaka::exec<Acc1D>(queue, finderSorterWorkDiv, FitVerticesKernel{}, soa, ws_d.view(), maxChi2ForFinalFit);
+        }
+        alpaka::exec<Acc1D>(queue, finderSorterWorkDiv, SortByPt2Kernel{}, soa, ws_d.view());
+      }
+
+      return vertices;
+    }
+
+    template class Producer<pixelTopology::Phase1>;
+    template class Producer<pixelTopology::Phase2>;
+    template class Producer<pixelTopology::HIonPhase1>;
+  }  // namespace vertexFinder
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.h
new file mode 100644
index 0000000000000..23e5db1e706c4
--- /dev/null
+++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.h
@@ -0,0 +1,76 @@
+#ifndef RecoPixelVertexing_PixelVertexFinding_vertexFinder_h
+#define RecoPixelVertexing_PixelVertexFinding_vertexFinder_h
+
+#include <cstddef>
+#include <cstdint>
+#include <alpaka/alpaka.hpp>
+#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h"
+#include "DataFormats/VertexSoA/interface/ZVertexHost.h"
+#include "DataFormats/VertexSoA/interface/ZVertexSoA.h"
+#include "DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h"
+#include "DataFormats/VertexSoA/interface/ZVertexDevice.h"
+
+#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h"
+#include "RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  namespace vertexFinder {
+    using namespace cms::alpakatools;
+    using VtxSoAView = ::reco::ZVertexSoAView;
+    using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView;
+
+    class Init {  // kernel functor resetting the number of vertices and the workspace before a new event is processed
+    public:
+      template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+      ALPAKA_FN_ACC void operator()(const TAcc &acc, VtxSoAView pdata, WsSoAView pws) const {
+        pdata.nvFinal() = 0;  // initialization
+        ::vertexFinder::init(pws);  // workspace initialization
+      }
+    };
+
+    template <typename TrackerTraits>
+    class Producer {  // holds the vertex finder configuration; makeAsync() launches the kernels accordingly
+      using TkSoAConstView = reco::TrackSoAConstView<TrackerTraits>;
+
+    public:
+      Producer(bool oneKernel,
+               bool useDensity,
+               bool useDBSCAN,
+               bool useIterative,
+               bool doSplitting,
+               int iminT,      // min number of neighbours to be "core"
+               float ieps,     // max absolute distance to cluster
+               float ierrmax,  // max error to be "seed"
+               float ichi2max  // max normalized distance to cluster
+               )
+          : oneKernel_(oneKernel && !(useDBSCAN || useIterative)),  // single kernel only without DBSCAN/iterative
+            useDensity_(useDensity),
+            useDBSCAN_(useDBSCAN),
+            useIterative_(useIterative),
+            doSplitting_(doSplitting),
+            minT(iminT),
+            eps(ieps),
+            errmax(ierrmax),
+            chi2max(ichi2max) {}
+
+      ~Producer() = default;
+
+      ZVertexSoACollection makeAsync(Queue &queue, const TkSoAConstView &tracks_view, float ptMin, float ptMax) const;
+
+    private:
+      const bool oneKernel_;     // run everything (cluster,fit,split,sort) in one kernel. Uses only density clusterizer
+      const bool useDensity_;    // use density clusterizer
+      const bool useDBSCAN_;     // use DBSCAN clusterizer
+      const bool useIterative_;  // use iterative clusterizer
+      const bool doSplitting_;   // run vertex splitting
+
+      int minT;       // min number of neighbours to be "core"
+      float eps;      // max absolute distance to cluster
+      float errmax;   // max error to be "seed"
+      float chi2max;  // max normalized distance to cluster
+    };
+
+  }  // namespace vertexFinder
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+#endif
diff --git a/RecoTracker/PixelVertexFinding/test/BuildFile.xml b/RecoTracker/PixelVertexFinding/test/BuildFile.xml
index 9343f00f9a027..d5d0142eca659 100644
--- a/RecoTracker/PixelVertexFinding/test/BuildFile.xml
+++ b/RecoTracker/PixelVertexFinding/test/BuildFile.xml
@@ -10,29 +10,31 @@
 <use name="TrackingTools/TransientTrack"/>
 
 <iftool name="cuda-gcc-support">
-<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderOneKernel_t">
-  <use name="cuda"/>
-  <flags CUDA_FLAGS="-g -DGPU_DEBUG -DONE_KERNEL"/>
-  <flags CXXFLAGS="-g"/>
-</bin>
 
-<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderByDensity_t">
-  <use name="cuda"/>
-  <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
-  <flags CXXFLAGS="-g"/>
-</bin>
+  <bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderOneKernel_t">
+    <use name="cuda"/>
+    <flags CUDA_FLAGS="-g -DGPU_DEBUG -DONE_KERNEL"/>
+    <flags CXXFLAGS="-g"/>
+  </bin>
 
-<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderDBSCAN_t">
-  <use name="cuda"/>
-  <flags CUDA_FLAGS="-g -DGPU_DEBUG -DUSE_DBSCAN"/>
-  <flags CXXFLAGS="-g"/>
-</bin>
+  <bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderByDensity_t">
+    <use name="cuda"/>
+    <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
+    <flags CXXFLAGS="-g"/>
+  </bin>
+
+  <bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderDBSCAN_t">
+    <use name="cuda"/>
+    <flags CUDA_FLAGS="-g -DGPU_DEBUG -DUSE_DBSCAN"/>
+    <flags CXXFLAGS="-g"/>
+  </bin>
+
+  <bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderIterative_t">
+    <use name="cuda"/>
+    <flags CUDA_FLAGS="-g -DGPU_DEBUG -DUSE_ITERATIVE"/>
+    <flags CXXFLAGS="-g"/>
+  </bin>
 
-<bin file="gpuVertexFinder_t.cu" name="gpuVertexFinderIterative_t">
-  <use name="cuda"/>
-  <flags CUDA_FLAGS="-g -DGPU_DEBUG -DUSE_ITERATIVE"/>
-  <flags CXXFLAGS="-g"/>
-</bin>
 </iftool>
 
 <bin file="cpuVertexFinder_t.cpp" name="cpuVertexFinderByDensity_t">
@@ -42,3 +44,31 @@
 <bin file="cpuVertexFinder_t.cpp" name="cpuVertexFinderIterative_t">
   <flags CXXFLAGS="-g -DGPU_DEBUG -DUSE_ITERATIVE"/>
 </bin>
+
+<bin file="alpaka/VertexFinder_t.cc alpaka/VertexFinder_t.dev.cc" name="deviceVertexFinderOneKernel_t">
+  <use name="alpaka"/>
+  <use name="HeterogeneousCore/AlpakaInterface"/>
+  <flags ALPAKA_BACKENDS="1"/>
+  <flags CXXFLAGS="-g -DGPU_DEBUG -DONE_KERNEL"/>
+</bin>
+
+<bin file="alpaka/VertexFinder_t.cc alpaka/VertexFinder_t.dev.cc" name="deviceVertexFinderByDensity_t">
+  <use name="alpaka"/>
+  <use name="HeterogeneousCore/AlpakaInterface"/>
+  <flags ALPAKA_BACKENDS="1"/>
+  <flags CXXFLAGS="-g -DGPU_DEBUG"/>
+</bin>
+
+<bin file="alpaka/VertexFinder_t.cc alpaka/VertexFinder_t.dev.cc" name="deviceVertexFinderDBSCAN_t">
+  <use name="alpaka"/>
+  <use name="HeterogeneousCore/AlpakaInterface"/>
+  <flags ALPAKA_BACKENDS="1"/>
+  <flags CXXFLAGS="-g -DGPU_DEBUG -DUSE_DBSCAN"/>
+</bin>
+
+<bin file="alpaka/VertexFinder_t.cc alpaka/VertexFinder_t.dev.cc" name="deviceVertexFinderIterative_t">
+  <use name="alpaka"/>
+  <use name="HeterogeneousCore/AlpakaInterface"/>
+  <flags ALPAKA_BACKENDS="1"/>
+  <flags CXXFLAGS="-g -DGPU_DEBUG -DUSE_ITERATIVE"/>
+</bin>
diff --git a/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.cc b/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.cc
new file mode 100644
index 0000000000000..c3a74676956f8
--- /dev/null
+++ b/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.cc
@@ -0,0 +1,33 @@
+#include <alpaka/alpaka.hpp>
+#include "HeterogeneousCore/AlpakaInterface/interface/devices.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/host.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+
+#include "DataFormats/VertexSoA/interface/ZVertexHost.h"
+#include "DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h"
+#include "DataFormats/VertexSoA/interface/ZVertexDevice.h"
+
+#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h"
+#include "RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHostAlpaka.h"
+#include "RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h"
+
+using namespace std;
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+
+  namespace vertexfinder_t {
+    void runKernels(Queue& queue);  // test body; presumably defined in VertexFinder_t.dev.cc, to be confirmed
+  }
+
+};  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+
+int main() {
+  auto const& devices = cms::alpakatools::devices<Platform>();
+  if (devices.empty())
+    return 1;  // no device available for this backend: fail cleanly instead of indexing an empty list
+  Queue queue(devices[0]);  // run the test on the first device of the platform
+  vertexfinder_t::runKernels(queue);
+  return 0;
+}
diff --git a/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc b/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc
new file mode 100644
index 0000000000000..e92d586dc1833
--- /dev/null
+++ b/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc
@@ -0,0 +1,295 @@
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <iostream>
+#include <random>
+#include <vector>
+#include <alpaka/alpaka.hpp>
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+// TrackUtilities only included in order to compile SoALayout with Eigen columns
+#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h"
+// The clustering kernel under test is chosen at compile time; clustering by density is the default.
+#ifdef USE_DBSCAN
+#include "RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h"
+#define CLUSTERIZE ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder::ClusterTracksDBSCAN
+#elif defined(USE_ITERATIVE)
+#include "RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h"
+#define CLUSTERIZE ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder::ClusterTracksIterative
+#else
+#include "RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h"
+#define CLUSTERIZE ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder::ClusterTracksByDensityKernel
+#endif
+
+#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h"
+#include "RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHostAlpaka.h"
+#include "RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h"
+
+#include "RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h"
+#include "RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h"
+#include "RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h"
+#include "RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  using namespace cms::alpakatools;
+
+  // Toy event generator: a Poisson number of vertices, each with a Poisson number of
+  // tracks smeared around the vertex z, followed by uniformly distributed noise tracks.
+  struct ClusterGenerator {
+    // nvert is the mean number of vertices per event, ntrack the mean number of
+    // tracks per vertex (both are the means of Poisson distributions).
+    explicit ClusterGenerator(float nvert, float ntrack)
+        : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(1.) {}
+
+    // Fill one event. The generated vertex positions go to vtxh.zv; the tracks (zt,
+    // ezt2, ptt2, and in iv the index of the generating vertex) go to pwsh, whose
+    // ntrks is set to the total number of tracks written, noise included.
+    // No bounds are checked: the columns are assumed to be large enough for the
+    // generated multiplicities -- TODO confirm against the sizes used by the caller.
+    void operator()(vertexFinder::PixelVertexWorkSpaceSoAHost& pwsh, ZVertexHost& vtxh) {
+      int nclus = clusGen(reng);
+      for (int zint = 0; zint < vtxh.view().metadata().size(); ++zint) {
+        vtxh.view().zv()[zint] = 3.5f * gauss(reng);
+      }
+
+      int aux = 0;  // index of the next free track slot
+      for (int iv = 0; iv < nclus; ++iv) {
+        auto nt = trackGen(reng);
+        pwsh.view().itrk()[iv] = nt;
+        for (int it = 0; it < nt; ++it) {
+          auto err = errgen(reng);  // reality is not flat....
+          pwsh.view().zt()[aux] = vtxh.view().zv()[iv] + err * gauss(reng);
+          pwsh.view().ezt2()[aux] = err * err;
+          pwsh.view().iv()[aux] = iv;
+          pwsh.view().ptt2()[aux] = (iv == 5 ? 1.f : 0.5f) + ptGen(reng);
+          pwsh.view().ptt2()[aux] *= pwsh.view().ptt2()[aux];
+          ++aux;
+        }
+      }
+      // add noise: append it after the vertex tracks (starting again from index 0
+      // would overwrite the first tracks generated above) and tag it with iv = 9999
+      auto nt = 2 * trackGen(reng);
+      for (int it = 0; it < nt; ++it, ++aux) {
+        auto err = 0.03f;
+        pwsh.view().zt()[aux] = rgen(reng);
+        pwsh.view().ezt2()[aux] = err * err;
+        pwsh.view().iv()[aux] = 9999;
+        pwsh.view().ptt2()[aux] = 0.5f + ptGen(reng);
+        pwsh.view().ptt2()[aux] *= pwsh.view().ptt2()[aux];
+      }
+      pwsh.view().ntrks() = aux;
+    }
+
+    std::mt19937 reng;                             // default-seeded engine: the sequence is reproducible
+    std::uniform_real_distribution<float> rgen;    // z of the noise tracks
+    std::uniform_real_distribution<float> errgen;  // z uncertainty of the vertex tracks
+    std::poisson_distribution<int> clusGen;        // number of vertices
+    std::poisson_distribution<int> trackGen;       // tracks per vertex (twice a draw gives the noise count)
+    std::normal_distribution<float> gauss;         // unit gaussian used for the z smearing
+    std::exponential_distribution<float> ptGen;    // random part of the value squared into ptt2
+  };
+
+  namespace vertexfinder_t {
+#ifdef ONE_KERNEL
+    // Runs clustering by density, fit, split, refit and sort inside a single kernel,
+    // synchronising the threads of the block after each step.
+    class VertexFinderOneKernel {
+    public:
+      template <typename TAcc>
+      ALPAKA_FN_ACC void operator()(const TAcc& acc,
+                                    vertexFinder::VtxSoAView pdata,
+                                    vertexFinder::WsSoAView pws,
+                                    int minT,      // min number of neighbours to be "seed"
+                                    float eps,     // max absolute distance to cluster
+                                    float errmax,  // max error to be "seed"
+                                    float chi2max  // max normalized distance to cluster,
+      ) const {
+        vertexFinder::clusterTracksByDensity(acc, pdata, pws, minT, eps, errmax, chi2max);
+        alpaka::syncBlockThreads(acc);
+        vertexFinder::fitVertices(acc, pdata, pws, 50.);
+        alpaka::syncBlockThreads(acc);
+        vertexFinder::splitVertices(acc, pdata, pws, 9.f);
+        alpaka::syncBlockThreads(acc);
+        vertexFinder::fitVertices(acc, pdata, pws, 5000.);
+        alpaka::syncBlockThreads(acc);
+        vertexFinder::sortByPt2(acc, pdata, pws);
+        alpaka::syncBlockThreads(acc);
+      }
+    };
+#endif
+
+    // Debug kernel: prints ntrks, nvFinal and nvIntermediate of the views it is given.
+    class Kernel_print {
+    public:
+      template <typename TAcc>
+      ALPAKA_FN_ACC void operator()(const TAcc& acc,
+                                    vertexFinder::VtxSoAView pdata,
+                                    vertexFinder::WsSoAView pws) const {
+        printf("nt,nv %d %d,%d\n", pws.ntrks(), pdata.nvFinal(), pws.nvIntermediate());
+      }
+    };
+
+    // Generate toy events for several mean vertex multiplicities and cut sets, run the
+    // clusterizer, fitter, splitter and sorter on `queue`, and print summary quantities
+    // after each stage. Nothing is asserted on the results.
+    void runKernels(Queue& queue) {
+      vertexFinder::PixelVertexWorkSpaceSoADevice ws_d(zVertex::MAXTRACKS, queue);
+      vertexFinder::PixelVertexWorkSpaceSoAHost ws_h(zVertex::MAXTRACKS, queue);
+      ZVertexHost vertices_h(queue);
+      ZVertexSoACollection vertices_d(queue);
+
+      // Normalise chi2 by ndof in the host copy of the vertices, then print `label`
+      // followed by the number of vertices and the smallest and largest chi2.
+      auto printChi2Range = [&vertices_h](const char* label) {
+        auto const nv = vertices_h.view().nvFinal();
+        for (auto j = 0U; j < nv; ++j)
+          if (vertices_h.view().ndof()[j] > 0)
+            vertices_h.view().chi2()[j] /= float(vertices_h.view().ndof()[j]);
+        auto mx = std::minmax_element(vertices_h.view().chi2(), vertices_h.view().chi2() + nv);
+        std::cout << label << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
+      };
+
+      float eps = 0.1f;
+      std::array<float, 3> par{{eps, 0.01f, 9.0f}};
+      for (int nav = 30; nav < 80; nav += 20) {
+        ClusterGenerator gen(nav, 10);
+
+        for (int i = 8; i < 20; ++i) {
+          auto kk = i / 4;  // M param
+
+          gen(ws_h, vertices_h);
+          auto workDiv1D = make_workdiv<Acc1D>(1, 1);
+          alpaka::exec<Acc1D>(queue, workDiv1D, vertexFinder::Init{}, vertices_d.view(), ws_d.view());
+          // std::cout << "v,t size " << ws_h.view().zt()[0] << ' ' << vertices_h.view().zv()[0] << std::endl;
+          alpaka::memcpy(queue, ws_d.buffer(), ws_h.buffer());
+          alpaka::wait(queue);
+
+          std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl;
+
+          if ((i % 4) == 0)
+            par = {{eps, 0.02f, 12.0f}};
+          if ((i % 4) == 1)
+            par = {{eps, 0.02f, 9.0f}};
+          if ((i % 4) == 2)
+            par = {{eps, 0.01f, 9.0f}};
+          if ((i % 4) == 3)
+            par = {{0.7f * eps, 0.01f, 9.0f}};
+
+          alpaka::exec<Acc1D>(queue, workDiv1D, Kernel_print{}, vertices_d.view(), ws_d.view());
+
+          auto workDivClusterizer = make_workdiv<Acc1D>(1, 512 + 256);
+#ifdef ONE_KERNEL
+          alpaka::exec<Acc1D>(queue,
+                              workDivClusterizer,
+                              VertexFinderOneKernel{},
+                              vertices_d.view(),
+                              ws_d.view(),
+                              kk,
+                              par[0],
+                              par[1],
+                              par[2]);
+#else
+          alpaka::exec<Acc1D>(
+              queue, workDivClusterizer, CLUSTERIZE{}, vertices_d.view(), ws_d.view(), kk, par[0], par[1], par[2]);
+#endif
+          alpaka::wait(queue);
+          alpaka::exec<Acc1D>(queue, workDiv1D, Kernel_print{}, vertices_d.view(), ws_d.view());
+          alpaka::wait(queue);
+
+          auto workDivFitter = make_workdiv<Acc1D>(1, 1024 - 256);
+
+          alpaka::exec<Acc1D>(
+              queue, workDivFitter, vertexFinder::FitVerticesKernel{}, vertices_d.view(), ws_d.view(), 50.f);
+
+          alpaka::memcpy(queue, vertices_h.buffer(), vertices_d.buffer());
+          alpaka::wait(queue);
+
+          if (vertices_h.view().nvFinal() == 0) {
+            std::cout << "NO VERTICES???" << std::endl;
+            continue;
+          }
+
+          printChi2Range("after fit nv, min max chi2 ");
+
+          alpaka::exec<Acc1D>(
+              queue, workDivFitter, vertexFinder::FitVerticesKernel{}, vertices_d.view(), ws_d.view(), 50.f);
+          alpaka::memcpy(queue, vertices_h.buffer(), vertices_d.buffer());
+          alpaka::wait(queue);
+
+          printChi2Range("before splitting nv, min max chi2 ");
+
+          auto workDivSplitter = make_workdiv<Acc1D>(1024, 64);
+
+          // one vertex per block!!!
+          alpaka::exec<Acc1D>(
+              queue, workDivSplitter, vertexFinder::SplitVerticesKernel{}, vertices_d.view(), ws_d.view(), 9.f);
+          alpaka::memcpy(queue, ws_h.buffer(), ws_d.buffer());
+          alpaka::wait(queue);
+          std::cout << "after split " << ws_h.view().nvIntermediate() << std::endl;
+
+          alpaka::exec<Acc1D>(
+              queue, workDivFitter, vertexFinder::FitVerticesKernel{}, vertices_d.view(), ws_d.view(), 5000.f);
+
+          auto workDivSorter = make_workdiv<Acc1D>(1, 256);
+          alpaka::exec<Acc1D>(queue, workDivSorter, vertexFinder::SortByPt2Kernel{}, vertices_d.view(), ws_d.view());
+          alpaka::memcpy(queue, vertices_h.buffer(), vertices_d.buffer());
+          alpaka::wait(queue);
+
+          if (vertices_h.view().nvFinal() == 0) {
+            std::cout << "NO VERTICES???" << std::endl;
+            continue;
+          }
+
+          printChi2Range("nv, min max chi2 ");
+
+          {
+            auto mx = std::minmax_element(vertices_h.view().wv(), vertices_h.view().wv() + vertices_h.view().nvFinal());
+            std::cout << "min max error " << 1. / std::sqrt(*mx.first) << ' ' << 1. / std::sqrt(*mx.second)
+                      << std::endl;
+          }
+
+          {
+            auto mx =
+                std::minmax_element(vertices_h.view().ptv2(), vertices_h.view().ptv2() + vertices_h.view().nvFinal());
+            std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl;
+            std::cout << "min max ptv2 " << vertices_h.view().ptv2()[vertices_h.view().sortInd()[0]] << ' '
+                      << vertices_h.view().ptv2()[vertices_h.view().sortInd()[vertices_h.view().nvFinal() - 1]]
+                      << " at " << vertices_h.view().sortInd()[0] << ' '
+                      << vertices_h.view().sortInd()[vertices_h.view().nvFinal() - 1] << std::endl;
+          }
+
+          // distance of each fitted vertex from the closest generated track; only the
+          // first ntrks entries of zt are filled, and std::vector replaces a non-standard VLA
+          auto const nv = vertices_h.view().nvFinal();
+          auto const ntracks = ws_h.view().ntrks();
+          std::vector<float> dd(nv);
+          for (auto kv = 0U; kv < nv; ++kv) {
+            auto zr = vertices_h.view().zv()[kv];
+            auto md = 500.0f;
+            for (auto it = 0U; it < ntracks; ++it) {
+              auto d = std::abs(zr - ws_h.view().zt()[it]);
+              md = std::min(d, md);
+            }
+            dd[kv] = md;
+          }
+          // never true for the current range of i: kept as a switch for debugging
+          if (i == 6) {
+            for (auto d : dd)
+              std::cout << d << ' ';
+            std::cout << std::endl;
+          }
+          auto mx = std::minmax_element(dd.begin(), dd.end());
+          float rms = 0;
+          for (auto d : dd)
+            rms += d * d;
+          // with a single vertex the (nv - 1) denominator would be zero
+          rms = nv > 1 ? std::sqrt(rms) / (nv - 1) : 0.f;
+          std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl;
+
+        }  // loop on events
+      }    // loop on average number of vertices
+    }
+  }  // namespace vertexfinder_t
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
diff --git a/RecoVertex/BeamSpotProducer/plugins/BuildFile.xml b/RecoVertex/BeamSpotProducer/plugins/BuildFile.xml
index dec839e2af6cc..318ef5848183d 100644
--- a/RecoVertex/BeamSpotProducer/plugins/BuildFile.xml
+++ b/RecoVertex/BeamSpotProducer/plugins/BuildFile.xml
@@ -12,33 +12,42 @@
 <library file="BeamSpotProducer.cc" name="BeamSpotProducer">
   <flags EDM_PLUGIN="1"/>
 </library>
+
 <library file="BeamSpotOnlineProducer.cc" name="BeamSpotOnlineProducer">
   <flags EDM_PLUGIN="1"/>
   <use name="DataFormats/L1GlobalTrigger"/>
   <use name="DataFormats/Scalers"/>
 </library>
+
 <library file="BeamSpotAnalyzer.cc" name="BeamSpotAnalyzer">
   <use name="clhep"/>
   <flags EDM_PLUGIN="1"/>
 </library>
+
 <library file="BeamSpotWrite2DB.cc" name="BeamSpotWrite2DB">
   <flags EDM_PLUGIN="1"/>
 </library>
+
 <library file="BeamSpotFakeConditions.cc" name="BeamSpotFakeConditions">
   <flags EDM_PLUGIN="1"/>
 </library>
+
 <library file="BeamSpotFromDB.cc" name="BeamSpotFromDB">
   <flags EDM_PLUGIN="1"/>
 </library>
+
 <library file="OnlineBeamSpotFromDB.cc" name="OnlineBeamSpotFromDB">
   <flags EDM_PLUGIN="1"/>
 </library>
+
 <library file="OnlineBeamSpotESProducer.cc" name="OnlineBeamSpotESProducer">
   <flags EDM_PLUGIN="1"/>
 </library>
+
 <library file="OfflineToTransientBeamSpotESProducer.cc" name="OfflineToTransientBeamSpotESProducer">
   <flags EDM_PLUGIN="1"/>
 </library>
+
 <iftool name="cuda">
   <library file="BeamSpotToCUDA.cc" name="BeamSpotToCUDA">
     <use name="cuda"/>
@@ -48,3 +57,11 @@
     <flags EDM_PLUGIN="1"/>
   </library>
 </iftool>
+
+<library file="alpaka/*.cc" name="BeamSpotDeviceProducer">
+  <use name="DataFormats/BeamSpot"/>
+  <use name="HeterogeneousCore/AlpakaCore"/>
+  <use name="HeterogeneousCore/AlpakaInterface"/>
+  <flags EDM_PLUGIN="1"/>
+  <flags ALPAKA_BACKENDS="1"/>
+</library>
diff --git a/RecoVertex/BeamSpotProducer/plugins/alpaka/BeamSpotDeviceProducer.cc b/RecoVertex/BeamSpotProducer/plugins/alpaka/BeamSpotDeviceProducer.cc
new file mode 100644
index 0000000000000..bd597164827fa
--- /dev/null
+++ b/RecoVertex/BeamSpotProducer/plugins/alpaka/BeamSpotDeviceProducer.cc
@@ -0,0 +1,70 @@
+#include <type_traits>
+#include <utility>
+
+#include "DataFormats/BeamSpot/interface/BeamSpot.h"
+#include "DataFormats/BeamSpot/interface/BeamSpotHost.h"
+#include "DataFormats/BeamSpot/interface/BeamSpotPOD.h"
+#include "DataFormats/BeamSpot/interface/alpaka/BeamSpotDevice.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EDPutToken.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/global/EDProducer.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+
+  // Copies the parameters of the legacy reco::BeamSpot read from `src` into a
+  // BeamSpotDevice product.
+  class BeamSpotDeviceProducer : public global::EDProducer<> {
+  public:
+    BeamSpotDeviceProducer(edm::ParameterSet const& config)
+        : legacyToken_{consumes(config.getParameter<edm::InputTag>("src"))}, deviceToken_{produces()} {}
+
+    // Read the beam spot from the event, stage its parameters in a host object and
+    // put the result into the event under the device token.
+    void produce(edm::StreamID, device::Event& event, device::EventSetup const& setup) const override {
+      reco::BeamSpot const& beamspot = event.get(legacyToken_);
+
+      // host-side copy of the beam spot parameters, built with the event queue
+      BeamSpotHost hostProduct{event.queue()};
+      hostProduct->x = beamspot.x0();
+      hostProduct->y = beamspot.y0();
+      hostProduct->z = beamspot.z0();
+      hostProduct->sigmaZ = beamspot.sigmaZ();
+      hostProduct->beamWidthX = beamspot.BeamWidthX();
+      hostProduct->beamWidthY = beamspot.BeamWidthY();
+      hostProduct->dxdz = beamspot.dxdz();
+      hostProduct->dydz = beamspot.dydz();
+      hostProduct->emittanceX = beamspot.emittanceX();
+      hostProduct->emittanceY = beamspot.emittanceY();
+      hostProduct->betaStar = beamspot.betaStar();
+
+      // On the CPU backend the host object is put into the event as it is; on the
+      // other backends its buffer is copied to a BeamSpotDevice through the event queue.
+      if constexpr (std::is_same_v<Device, alpaka::DevCpu>) {
+        event.emplace(deviceToken_, std::move(hostProduct));
+      } else {
+        BeamSpotDevice deviceProduct{event.queue()};
+        alpaka::memcpy(event.queue(), deviceProduct.buffer(), hostProduct.const_buffer());
+        event.emplace(deviceToken_, std::move(deviceProduct));
+      }
+    }
+
+    // `src`: input tag of the reco::BeamSpot to convert; the default is an empty tag.
+    static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+      edm::ParameterSetDescription desc;
+      desc.add("src", edm::InputTag{});
+      descriptions.addWithDefaultLabel(desc);
+    }
+
+  private:
+    const edm::EDGetTokenT<reco::BeamSpot> legacyToken_;
+    const device::EDPutToken<BeamSpotDevice> deviceToken_;
+  };
+
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/MakerMacros.h"
+DEFINE_FWK_ALPAKA_MODULE(BeamSpotDeviceProducer);
diff --git a/RecoVertex/BeamSpotProducer/python/BeamSpot_cff.py b/RecoVertex/BeamSpotProducer/python/BeamSpot_cff.py
index 7cc651af22106..5c17275c17274 100644
--- a/RecoVertex/BeamSpotProducer/python/BeamSpot_cff.py
+++ b/RecoVertex/BeamSpotProducer/python/BeamSpot_cff.py
@@ -2,6 +2,7 @@
 
 from RecoVertex.BeamSpotProducer.BeamSpot_cfi import *
 from RecoVertex.BeamSpotProducer.offlineBeamSpotToCUDA_cfi import offlineBeamSpotToCUDA
+from RecoVertex.BeamSpotProducer.beamSpotDeviceProducer_cfi import beamSpotDeviceProducer as _beamSpotDeviceProducer
 
 offlineBeamSpotTask = cms.Task(offlineBeamSpot)
 
@@ -9,3 +10,12 @@
 _offlineBeamSpotTask_gpu = offlineBeamSpotTask.copy()
 _offlineBeamSpotTask_gpu.add(offlineBeamSpotToCUDA)
 gpu.toReplaceWith(offlineBeamSpotTask, _offlineBeamSpotTask_gpu)
+
+# With the "alpaka" modifier, also schedule a module that converts the legacy
+# offlineBeamSpot into its alpaka device product.
+from Configuration.ProcessModifiers.alpaka_cff import alpaka
+
+offlineBeamSpotDevice = _beamSpotDeviceProducer.clone(src = cms.InputTag('offlineBeamSpot'))
+_offlineBeamSpotTask_alpaka = offlineBeamSpotTask.copy()
+_offlineBeamSpotTask_alpaka.add(offlineBeamSpotDevice)
+alpaka.toReplaceWith(offlineBeamSpotTask, _offlineBeamSpotTask_alpaka)