diff --git a/Configuration/ProcessModifiers/python/alpakaValidationPixel_cff.py b/Configuration/ProcessModifiers/python/alpakaValidationPixel_cff.py new file mode 100644 index 0000000000000..ebdb7d9e6981a --- /dev/null +++ b/Configuration/ProcessModifiers/python/alpakaValidationPixel_cff.py @@ -0,0 +1,6 @@ +import FWCore.ParameterSet.Config as cms + +# This modifier chain is for turning on DQM modules used for alpaka device/host validation for pixels + +alpakaValidationPixel = cms.Modifier() + diff --git a/Configuration/ProcessModifiers/python/alpakaValidation_cff.py b/Configuration/ProcessModifiers/python/alpakaValidation_cff.py new file mode 100644 index 0000000000000..3399bdda7c4df --- /dev/null +++ b/Configuration/ProcessModifiers/python/alpakaValidation_cff.py @@ -0,0 +1,11 @@ +import FWCore.ParameterSet.Config as cms + +from Configuration.ProcessModifiers.alpaka_cff import * +from Configuration.ProcessModifiers.alpakaValidationPixel_cff import * + +# This modifier chain is for turning on DQM modules used for alpaka device/host validation + +alpakaValidation = cms.ModifierChain( + alpaka, + alpakaValidationPixel +) diff --git a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py index ef7525a26b540..3f9a6ed96c9e5 100644 --- a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py +++ b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py @@ -894,6 +894,7 @@ def setup_(self, step, stepName, stepDict, k, properties): # - HLT on CPU # - Pixel-only reconstruction on CPU, with DQM and validation # - harvesting + upgradeWFs['PatatrackPixelOnlyCPU'] = PatatrackWorkflow( digi = { # the HLT menu is already set up for using GPUs if available and if the "gpu" modifier is enabled @@ -1513,6 +1514,53 @@ def setup_(self, step, stepName, stepDict, k, properties): offset = 0.597, ) + +# Alpaka workflows + +upgradeWFs['PatatrackPixelOnlyAlpaka'] = 
PatatrackWorkflow( + digi = { + '--procModifiers': 'alpaka' + }, + reco = { + '-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,VALIDATION:@pixelTrackingOnlyValidation,DQM:@pixelTrackingOnlyDQM', + '--procModifiers': 'alpaka' + }, + harvest = { + '-s': 'HARVESTING:@trackingOnlyValidation+@pixelTrackingOnlyDQM' + }, + suffix = 'Patatrack_PixelOnlyAlpaka', + offset = 0.402, +) + +upgradeWFs['PatatrackPixelOnlyAlpakaValidation'] = PatatrackWorkflow( + digi = { + '--procModifiers': 'alpaka' + }, + reco = { + '-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,VALIDATION:@pixelTrackingOnlyValidation,DQM:@pixelTrackingOnlyDQM', + '--procModifiers': 'alpakaValidation' + }, + harvest = { + '-s': 'HARVESTING:@trackingOnlyValidation+@pixelTrackingOnlyDQM' + }, + suffix = 'Patatrack_PixelOnlyAlpaka_Validation', + offset = 0.403, +) + +upgradeWFs['PatatrackPixelOnlyAlpakaProfiling'] = PatatrackWorkflow( + digi = { + '--procModifiers': 'alpaka' + }, + reco = { + '-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly', + '--procModifiers': 'alpaka', + '--customise' : 'RecoTracker/Configuration/customizePixelOnlyForProfiling.customizePixelOnlyForProfilingGPUOnly' + }, + harvest = None, + suffix = 'Patatrack_PixelOnlyAlpaka_Profiling', + offset = 0.404, +) + # end of Patatrack workflows class UpgradeWorkflow_ProdLike(UpgradeWorkflow): @@ -2718,7 +2766,7 @@ def condition(self, fragment, stepList, key, hasHarvest): }, '2022HI' : { 'Geom' : 'DB:Extended', - 'GT':'auto:phase1_2022_realistic_hi', + 'GT':'auto:phase1_2022_realistic_hi', 'HLTmenu': '@fake2', 'Era':'Run3_pp_on_PbPb', 'BeamSpot': 'DBrealistic', @@ -2726,7 +2774,7 @@ def condition(self, fragment, stepList, key, hasHarvest): }, '2022HIRP' : { 'Geom' : 'DB:Extended', - 'GT':'auto:phase1_2022_realistic_hi', + 'GT':'auto:phase1_2022_realistic_hi', 'HLTmenu': '@fake2', 'Era':'Run3_pp_on_PbPb_approxSiStripClusters', 'BeamSpot': 'DBrealistic', @@ -2734,7 +2782,7 @@ 
def condition(self, fragment, stepList, key, hasHarvest): }, '2023HI' : { 'Geom' : 'DB:Extended', - 'GT':'auto:phase1_2023_realistic_hi', + 'GT':'auto:phase1_2023_realistic_hi', 'HLTmenu': '@fake2', 'Era':'Run3_pp_on_PbPb', 'BeamSpot': 'DBrealistic', @@ -2742,7 +2790,7 @@ def condition(self, fragment, stepList, key, hasHarvest): }, '2023HIRP' : { 'Geom' : 'DB:Extended', - 'GT':'auto:phase1_2023_realistic_hi', + 'GT':'auto:phase1_2023_realistic_hi', 'HLTmenu': '@fake2', 'Era':'Run3_pp_on_PbPb_approxSiStripClusters', 'BeamSpot': 'DBrealistic', diff --git a/DQM/SiPixelHeterogeneous/plugins/BuildFile.xml b/DQM/SiPixelHeterogeneous/plugins/BuildFile.xml index 66adf1666762e..79925fdcb6cf8 100644 --- a/DQM/SiPixelHeterogeneous/plugins/BuildFile.xml +++ b/DQM/SiPixelHeterogeneous/plugins/BuildFile.xml @@ -5,8 +5,11 @@ + + + + - diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareRecHitsSoAAlpaka.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareRecHitsSoAAlpaka.cc new file mode 100644 index 0000000000000..474194ad72616 --- /dev/null +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareRecHitsSoAAlpaka.cc @@ -0,0 +1,244 @@ +#include "DQMServices/Core/interface/MonitorElement.h" +#include "DQMServices/Core/interface/DQMEDAnalyzer.h" +#include "DQMServices/Core/interface/DQMStore.h" +#include "DataFormats/Math/interface/approx_atan2.h" +#include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" +#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsHost.h" +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" 
+#include "Geometry/CommonDetUnit/interface/PixelGeomDetUnit.h" +#include "Geometry/CommonTopologies/interface/PixelTopology.h" +#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" +#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" + +template +class SiPixelCompareRecHitsSoAAlpaka : public DQMEDAnalyzer { +public: + using HitsOnHost = TrackingRecHitHost; + + explicit SiPixelCompareRecHitsSoAAlpaka(const edm::ParameterSet&); + ~SiPixelCompareRecHitsSoAAlpaka() override = default; + void dqmBeginRun(const edm::Run&, const edm::EventSetup&) override; + void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override; + void analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) override; + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + const edm::ESGetToken geomToken_; + const edm::ESGetToken topoToken_; + const edm::EDGetTokenT tokenSoAHitsHost_; //these two are both on Host but originally they have been + const edm::EDGetTokenT tokenSoAHitsDevice_; //produced on Host or on Device + const std::string topFolderName_; + const float mind2cut_; + static constexpr uint32_t invalidHit_ = std::numeric_limits::max(); + static constexpr float micron_ = 10000.; + const TrackerGeometry* tkGeom_ = nullptr; + const TrackerTopology* tTopo_ = nullptr; + MonitorElement* hnHits_; + MonitorElement* hBchargeL_[4]; // max 4 barrel hits + MonitorElement* hBsizexL_[4]; + MonitorElement* hBsizeyL_[4]; + MonitorElement* hBposxL_[4]; + MonitorElement* hBposyL_[4]; + MonitorElement* hFchargeD_[2][12]; // max 12 endcap disks + MonitorElement* hFsizexD_[2][12]; + MonitorElement* hFsizeyD_[2][12]; + MonitorElement* hFposxD_[2][12]; + MonitorElement* hFposyD_[2][12]; + //differences + MonitorElement* hBchargeDiff_; + MonitorElement* hFchargeDiff_; + MonitorElement* hBsizeXDiff_; + MonitorElement* hFsizeXDiff_; + MonitorElement* hBsizeYDiff_; + MonitorElement* 
hFsizeYDiff_; + MonitorElement* hBposXDiff_; + MonitorElement* hFposXDiff_; + MonitorElement* hBposYDiff_; + MonitorElement* hFposYDiff_; +}; + +// +// constructors +// +template +SiPixelCompareRecHitsSoAAlpaka::SiPixelCompareRecHitsSoAAlpaka(const edm::ParameterSet& iConfig) + : geomToken_(esConsumes()), + topoToken_(esConsumes()), + tokenSoAHitsHost_(consumes(iConfig.getParameter("pixelHitsSrcHost"))), + tokenSoAHitsDevice_(consumes(iConfig.getParameter("pixelHitsSrcDevice"))), + topFolderName_(iConfig.getParameter("topFolderName")), + mind2cut_(iConfig.getParameter("minD2cut")) {} + +// +// Begin Run +// +template +void SiPixelCompareRecHitsSoAAlpaka::dqmBeginRun(const edm::Run& iRun, const edm::EventSetup& iSetup) { + tkGeom_ = &iSetup.getData(geomToken_); + tTopo_ = &iSetup.getData(topoToken_); +} + +// +// -- Analyze +// +template +void SiPixelCompareRecHitsSoAAlpaka::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) { + const auto& rhsoaHandleHost = iEvent.getHandle(tokenSoAHitsHost_); + const auto& rhsoaHandleDevice = iEvent.getHandle(tokenSoAHitsDevice_); + if (not rhsoaHandleHost or not rhsoaHandleDevice) { + edm::LogWarning out("SiPixelCompareRecHitsSoAAlpaka"); + if (not rhsoaHandleHost) { + out << "reference (Host) rechits not found; "; + } + if (not rhsoaHandleDevice) { + out << "target (Device) rechits not found; "; + } + out << "the comparison will not run."; + return; + } + + auto const& rhsoaHost = *rhsoaHandleHost; + auto const& rhsoaDevice = *rhsoaHandleDevice; + + auto const& soa2dHost = rhsoaHost.const_view(); + auto const& soa2dDevice = rhsoaDevice.const_view(); + + uint32_t nHitsHost = soa2dHost.metadata().size(); + uint32_t nHitsDevice = soa2dDevice.metadata().size(); + + hnHits_->Fill(nHitsHost, nHitsDevice); + auto detIds = tkGeom_->detUnitIds(); + for (uint32_t i = 0; i < nHitsHost; i++) { + float minD = mind2cut_; + uint32_t matchedHit = invalidHit_; + uint16_t indHost = soa2dHost[i].detectorIndex(); + float xLocalHost = 
soa2dHost[i].xLocal(); + float yLocalHost = soa2dHost[i].yLocal(); + for (uint32_t j = 0; j < nHitsDevice; j++) { + if (soa2dDevice.detectorIndex(j) == indHost) { + float dx = xLocalHost - soa2dDevice[j].xLocal(); + float dy = yLocalHost - soa2dDevice[j].yLocal(); + float distance = dx * dx + dy * dy; + if (distance < minD) { + minD = distance; + matchedHit = j; + } + } + } + DetId id = detIds[indHost]; + /* Charges are held in signed integers: the (Host - Device) differences filled into the *chargeDiff_ histograms below must be able to go negative instead of wrapping around as unsigned values. */ + int32_t chargeHost = soa2dHost[i].chargeAndStatus().charge; + int16_t sizeXHost = std::ceil(float(std::abs(soa2dHost[i].clusterSizeX()) / 8.)); + int16_t sizeYHost = std::ceil(float(std::abs(soa2dHost[i].clusterSizeY()) / 8.)); + int32_t chargeDevice = 0; + int16_t sizeXDevice = -99; + int16_t sizeYDevice = -99; + float xLocalDevice = -999.; + float yLocalDevice = -999.; + if (matchedHit != invalidHit_) { + chargeDevice = soa2dDevice[matchedHit].chargeAndStatus().charge; + sizeXDevice = std::ceil(float(std::abs(soa2dDevice[matchedHit].clusterSizeX()) / 8.)); + sizeYDevice = std::ceil(float(std::abs(soa2dDevice[matchedHit].clusterSizeY()) / 8.)); + xLocalDevice = soa2dDevice[matchedHit].xLocal(); + yLocalDevice = soa2dDevice[matchedHit].yLocal(); + } + switch (id.subdetId()) { + case PixelSubdetector::PixelBarrel: + hBchargeL_[tTopo_->pxbLayer(id) - 1]->Fill(chargeHost, chargeDevice); + hBsizexL_[tTopo_->pxbLayer(id) - 1]->Fill(sizeXHost, sizeXDevice); + hBsizeyL_[tTopo_->pxbLayer(id) - 1]->Fill(sizeYHost, sizeYDevice); + hBposxL_[tTopo_->pxbLayer(id) - 1]->Fill(xLocalHost, xLocalDevice); + hBposyL_[tTopo_->pxbLayer(id) - 1]->Fill(yLocalHost, yLocalDevice); + hBchargeDiff_->Fill(chargeHost - chargeDevice); + hBsizeXDiff_->Fill(sizeXHost - sizeXDevice); + hBsizeYDiff_->Fill(sizeYHost - sizeYDevice); + hBposXDiff_->Fill(micron_ * (xLocalHost - xLocalDevice)); + hBposYDiff_->Fill(micron_ * (yLocalHost - yLocalDevice)); + break; + case PixelSubdetector::PixelEndcap: + hFchargeD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(chargeHost, chargeDevice); + 
hFsizexD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeXHost, sizeXDevice); + hFsizeyD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeYHost, sizeYDevice); + hFposxD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(xLocalHost, xLocalDevice); + hFposyD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(yLocalHost, yLocalDevice); + hFchargeDiff_->Fill(chargeHost - chargeDevice); + hFsizeXDiff_->Fill(sizeXHost - sizeXDevice); + hFsizeYDiff_->Fill(sizeYHost - sizeYDevice); + hFposXDiff_->Fill(micron_ * (xLocalHost - xLocalDevice)); + hFposYDiff_->Fill(micron_ * (yLocalHost - yLocalDevice)); + break; + } + } +} + +// +// -- Book Histograms +// +template +void SiPixelCompareRecHitsSoAAlpaka::bookHistograms(DQMStore::IBooker& iBook, + edm::Run const& iRun, + edm::EventSetup const& iSetup) { + iBook.cd(); + iBook.setCurrentFolder(topFolderName_); + + // clang-format off + //Global + hnHits_ = iBook.book2I("nHits", "HostvsDevice RecHits per event;#Host RecHits;#Device RecHits", 200, 0, 5000,200, 0, 5000); + //Barrel Layer + for(unsigned int il=0;ilnumberOfLayers(PixelSubdetector::PixelBarrel);il++){ + hBchargeL_[il] = iBook.book2I(Form("recHitsBLay%dCharge",il+1), Form("HostvsDevice RecHits Charge Barrel Layer%d;Host Charge;Device Charge",il+1), 250, 0, 100000, 250, 0, 100000); + hBsizexL_[il] = iBook.book2I(Form("recHitsBLay%dSizex",il+1), Form("HostvsDevice RecHits SizeX Barrel Layer%d;Host SizeX;Device SizeX",il+1), 30, 0, 30, 30, 0, 30); + hBsizeyL_[il] = iBook.book2I(Form("recHitsBLay%dSizey",il+1), Form("HostvsDevice RecHits SizeY Barrel Layer%d;Host SizeY;Device SizeY",il+1), 30, 0, 30, 30, 0, 30); + hBposxL_[il] = iBook.book2D(Form("recHitsBLay%dPosx",il+1), Form("HostvsDevice RecHits x-pos in Barrel Layer%d;Host pos x;Device pos x",il+1), 200, -5, 5, 200,-5,5); + hBposyL_[il] = iBook.book2D(Form("recHitsBLay%dPosy",il+1), Form("HostvsDevice RecHits y-pos in Barrel Layer%d;Host pos y;Device pos y",il+1), 200, -5, 5, 
200,-5,5); + } + //Endcaps + //Endcaps Disk + for(int is=0;is<2;is++){ + int sign=is==0? -1:1; + for(unsigned int id=0;idnumberOfLayers(PixelSubdetector::PixelEndcap);id++){ + hFchargeD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dCharge",id*sign+sign), Form("HostvsDevice RecHits Charge Endcaps Disk%+d;Host Charge;Device Charge",id*sign+sign), 250, 0, 100000, 250, 0, 100000); + hFsizexD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dSizex",id*sign+sign), Form("HostvsDevice RecHits SizeX Endcaps Disk%+d;Host SizeX;Device SizeX",id*sign+sign), 30, 0, 30, 30, 0, 30); + hFsizeyD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dSizey",id*sign+sign), Form("HostvsDevice RecHits SizeY Endcaps Disk%+d;Host SizeY;Device SizeY",id*sign+sign), 30, 0, 30, 30, 0, 30); + hFposxD_[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosx",id*sign+sign), Form("HostvsDevice RecHits x-pos Endcaps Disk%+d;Host pos x;Device pos x",id*sign+sign), 200, -5, 5, 200, -5, 5); + hFposyD_[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosy",id*sign+sign), Form("HostvsDevice RecHits y-pos Endcaps Disk%+d;Host pos y;Device pos y",id*sign+sign), 200, -5, 5, 200, -5, 5); + } + } + //1D differences + hBchargeDiff_ = iBook.book1D("rechitChargeDiffBpix","Charge differnce of rechits in BPix; rechit charge difference (Host - Device)", 101, -50.5, 50.5); + hFchargeDiff_ = iBook.book1D("rechitChargeDiffFpix","Charge differnce of rechits in FPix; rechit charge difference (Host - Device)", 101, -50.5, 50.5); + hBsizeXDiff_ = iBook.book1D("rechitsizeXDiffBpix","SizeX difference of rechits in BPix; rechit sizex difference (Host - Device)", 21, -10.5, 10.5); + hFsizeXDiff_ = iBook.book1D("rechitsizeXDiffFpix","SizeX difference of rechits in FPix; rechit sizex difference (Host - Device)", 21, -10.5, 10.5); + hBsizeYDiff_ = iBook.book1D("rechitsizeYDiffBpix","SizeY difference of rechits in BPix; rechit sizey difference (Host - Device)", 21, -10.5, 10.5); + hFsizeYDiff_ = iBook.book1D("rechitsizeYDiffFpix","SizeY difference of 
rechits in FPix; rechit sizey difference (Host - Device)", 21, -10.5, 10.5); + hBposXDiff_ = iBook.book1D("rechitsposXDiffBpix","x-position difference of rechits in BPix; rechit x-pos difference (Host - Device)", 1000, -10, 10); + hFposXDiff_ = iBook.book1D("rechitsposXDiffFpix","x-position difference of rechits in FPix; rechit x-pos difference (Host - Device)", 1000, -10, 10); + hBposYDiff_ = iBook.book1D("rechitsposYDiffBpix","y-position difference of rechits in BPix; rechit y-pos difference (Host - Device)", 1000, -10, 10); + hFposYDiff_ = iBook.book1D("rechitsposYDiffFpix","y-position difference of rechits in FPix; rechit y-pos difference (Host - Device)", 1000, -10, 10); +} + +template +void SiPixelCompareRecHitsSoAAlpaka::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + // monitorpixelRecHitsSoAAlpaka + edm::ParameterSetDescription desc; + desc.add("pixelHitsSrcHost", edm::InputTag("siPixelRecHitsPreSplittingAlpakaSerial")); + desc.add("pixelHitsSrcDevice", edm::InputTag("siPixelRecHitsPreSplittingAlpaka")); + desc.add("topFolderName", "SiPixelHeterogeneous/PixelRecHitsCompareDeviceVSHost"); + desc.add("minD2cut", 0.0001); + descriptions.addWithDefaultLabel(desc); +} + +using SiPixelPhase1CompareRecHitsSoAAlpaka = SiPixelCompareRecHitsSoAAlpaka; +using SiPixelPhase2CompareRecHitsSoAAlpaka = SiPixelCompareRecHitsSoAAlpaka; +using SiPixelHIonPhase1CompareRecHitsSoAAlpaka = SiPixelCompareRecHitsSoAAlpaka; + +#include "FWCore/Framework/interface/MakerMacros.h" +DEFINE_FWK_MODULE(SiPixelPhase1CompareRecHitsSoAAlpaka); +DEFINE_FWK_MODULE(SiPixelPhase2CompareRecHitsSoAAlpaka); +DEFINE_FWK_MODULE(SiPixelHIonPhase1CompareRecHitsSoAAlpaka); diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareTrackSoAAlpaka.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareTrackSoAAlpaka.cc new file mode 100644 index 0000000000000..65a6dc2802831 --- /dev/null +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareTrackSoAAlpaka.cc @@ -0,0 +1,308 @@ +// for 
string manipulations +#include +#include "DataFormats/Common/interface/Handle.h" +#include "DataFormats/Math/interface/deltaR.h" +#include "DataFormats/Math/interface/deltaPhi.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/Utilities/interface/InputTag.h" +// DQM Histograming +#include "DQMServices/Core/interface/MonitorElement.h" +#include "DQMServices/Core/interface/DQMEDAnalyzer.h" +#include "DQMServices/Core/interface/DQMStore.h" +// DataFormats +#include "DataFormats/TrackSoA/interface/TracksHost.h" +#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" + +namespace { + // same logic used for the MTV: + // cf https://github.com/cms-sw/cmssw/blob/master/Validation/RecoTrack/src/MTVHistoProducerAlgoForTracker.cc + typedef dqm::reco::DQMStore DQMStore; + + void setBinLog(TAxis* axis) { + int bins = axis->GetNbins(); + float from = axis->GetXmin(); + float to = axis->GetXmax(); + float width = (to - from) / bins; + std::vector new_bins(bins + 1, 0); + for (int i = 0; i <= bins; i++) { + new_bins[i] = TMath::Power(10, from + i * width); + } + axis->Set(bins, new_bins.data()); + } + + void setBinLogX(TH1* h) { + TAxis* axis = h->GetXaxis(); + setBinLog(axis); + } + void setBinLogY(TH1* h) { + TAxis* axis = h->GetYaxis(); + setBinLog(axis); + } + + template + dqm::reco::MonitorElement* make2DIfLog(DQMStore::IBooker& ibook, bool logx, bool logy, Args&&... 
args) { + auto h = std::make_unique(std::forward(args)...); + if (logx) + setBinLogX(h.get()); + if (logy) + setBinLogY(h.get()); + const auto& name = h->GetName(); + return ibook.book2I(name, h.release()); + } +} // namespace + +template +class SiPixelCompareTrackSoAAlpaka : public DQMEDAnalyzer { +public: + using PixelTrackSoA = TracksHost; + + explicit SiPixelCompareTrackSoAAlpaka(const edm::ParameterSet&); + ~SiPixelCompareTrackSoAAlpaka() override = default; + void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override; + void analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) override; + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + const edm::EDGetTokenT tokenSoATrackHost_; + const edm::EDGetTokenT tokenSoATrackDevice_; + const std::string topFolderName_; + const bool useQualityCut_; + const pixelTrack::Quality minQuality_; + const float dr2cut_; + MonitorElement* hnTracks_; + MonitorElement* hnLooseAndAboveTracks_; + MonitorElement* hnLooseAndAboveTracks_matched_; + MonitorElement* hnHits_; + MonitorElement* hnHitsVsPhi_; + MonitorElement* hnHitsVsEta_; + MonitorElement* hnLayers_; + MonitorElement* hnLayersVsPhi_; + MonitorElement* hnLayersVsEta_; + MonitorElement* hCharge_; + MonitorElement* hchi2_; + MonitorElement* hChi2VsPhi_; + MonitorElement* hChi2VsEta_; + MonitorElement* hpt_; + MonitorElement* hptLogLog_; + MonitorElement* heta_; + MonitorElement* hphi_; + MonitorElement* hz_; + MonitorElement* htip_; + MonitorElement* hquality_; + //1D differences + MonitorElement* hptdiffMatched_; + MonitorElement* hCurvdiffMatched_; + MonitorElement* hetadiffMatched_; + MonitorElement* hphidiffMatched_; + MonitorElement* hzdiffMatched_; + MonitorElement* htipdiffMatched_; + + //for matching eff vs region: derive the ratio at harvesting + MonitorElement* hpt_eta_tkAllHost_; + MonitorElement* hpt_eta_tkAllHostMatched_; + MonitorElement* hphi_z_tkAllHost_; + 
MonitorElement* hphi_z_tkAllHostMatched_; +}; + +// +// constructors +// + +template +SiPixelCompareTrackSoAAlpaka::SiPixelCompareTrackSoAAlpaka(const edm::ParameterSet& iConfig) + : tokenSoATrackHost_(consumes(iConfig.getParameter("pixelTrackSrcHost"))), + tokenSoATrackDevice_(consumes(iConfig.getParameter("pixelTrackSrcDevice"))), + topFolderName_(iConfig.getParameter("topFolderName")), + useQualityCut_(iConfig.getParameter("useQualityCut")), + minQuality_(pixelTrack::qualityByName(iConfig.getParameter("minQuality"))), + dr2cut_(iConfig.getParameter("deltaR2cut")) {} + +// +// -- Analyze +// +template +void SiPixelCompareTrackSoAAlpaka::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) { + using helper = TracksUtilities; + const auto& tsoaHandleHost = iEvent.getHandle(tokenSoATrackHost_); + const auto& tsoaHandleDevice = iEvent.getHandle(tokenSoATrackDevice_); + if (not tsoaHandleHost or not tsoaHandleDevice) { + edm::LogWarning out("SiPixelCompareTrackSoAAlpaka"); + if (not tsoaHandleHost) { + out << "reference (cpu) tracks not found; "; + } + if (not tsoaHandleDevice) { + out << "target (gpu) tracks not found; "; + } + out << "the comparison will not run."; + return; + } + + auto const& tsoaHost = *tsoaHandleHost; + auto const& tsoaDevice = *tsoaHandleDevice; + auto maxTracksHost = tsoaHost.view().metadata().size(); //this should be same for both? + auto maxTracksDevice = tsoaDevice.view().metadata().size(); //this should be same for both? + auto const* qualityHost = tsoaHost.view().quality(); + auto const* qualityDevice = tsoaDevice.view().quality(); + int32_t nTracksHost = 0; + int32_t nTracksDevice = 0; + int32_t nLooseAndAboveTracksHost = 0; + int32_t nLooseAndAboveTracksHost_matchedDevice = 0; + int32_t nLooseAndAboveTracksDevice = 0; + + //Loop over Device tracks and store the indices of the loose tracks. Whats happens if useQualityCut_ is false? 
+ std::vector looseTrkidxDevice; + for (int32_t jt = 0; jt < maxTracksDevice; ++jt) { + if (helper::nHits(tsoaDevice.view(), jt) == 0) + break; // this is a guard + if (!(tsoaDevice.view()[jt].pt() > 0.)) + continue; + nTracksDevice++; + if (useQualityCut_ && qualityDevice[jt] < minQuality_) + continue; + nLooseAndAboveTracksDevice++; + looseTrkidxDevice.emplace_back(jt); + } + + //Now loop over Host tracks//nested loop for loose gPU tracks + for (int32_t it = 0; it < maxTracksHost; ++it) { + int nHitsHost = helper::nHits(tsoaHost.view(), it); + + if (nHitsHost == 0) + break; // this is a guard + + float ptHost = tsoaHost.view()[it].pt(); + float etaHost = tsoaHost.view()[it].eta(); + float phiHost = helper::phi(tsoaHost.view(), it); + float zipHost = helper::zip(tsoaHost.view(), it); + float tipHost = helper::tip(tsoaHost.view(), it); + + if (!(ptHost > 0.)) + continue; + nTracksHost++; + if (useQualityCut_ && qualityHost[it] < minQuality_) + continue; + nLooseAndAboveTracksHost++; + //Now loop over loose Device trk and find the closest in DeltaR//do we need pt cut? 
+ const int32_t notFound = -1; + int32_t closestTkidx = notFound; + float mindr2 = dr2cut_; + + for (auto gid : looseTrkidxDevice) { + float etaDevice = tsoaDevice.view()[gid].eta(); + float phiDevice = helper::phi(tsoaDevice.view(), gid); + float dr2 = reco::deltaR2(etaHost, phiHost, etaDevice, phiDevice); + if (dr2 > dr2cut_) + continue; // this is arbitrary + if (mindr2 > dr2) { + mindr2 = dr2; + closestTkidx = gid; + } + } + + hpt_eta_tkAllHost_->Fill(etaHost, ptHost); //all Host tk + hphi_z_tkAllHost_->Fill(phiHost, zipHost); + if (closestTkidx == notFound) + continue; + nLooseAndAboveTracksHost_matchedDevice++; + + hchi2_->Fill(tsoaHost.view()[it].chi2(), tsoaDevice.view()[closestTkidx].chi2()); + hCharge_->Fill(reco::charge(tsoaHost.view(), it), reco::charge(tsoaDevice.view(), closestTkidx)); + hnHits_->Fill(helper::nHits(tsoaHost.view(), it), helper::nHits(tsoaDevice.view(), closestTkidx)); + hnLayers_->Fill(tsoaHost.view()[it].nLayers(), tsoaDevice.view()[closestTkidx].nLayers()); + hpt_->Fill(tsoaHost.view()[it].pt(), tsoaDevice.view()[closestTkidx].pt()); + hptLogLog_->Fill(tsoaHost.view()[it].pt(), tsoaDevice.view()[closestTkidx].pt()); + heta_->Fill(etaHost, tsoaDevice.view()[closestTkidx].eta()); + hphi_->Fill(phiHost, helper::phi(tsoaDevice.view(), closestTkidx)); + hz_->Fill(zipHost, helper::zip(tsoaDevice.view(), closestTkidx)); + htip_->Fill(tipHost, helper::tip(tsoaDevice.view(), closestTkidx)); + hptdiffMatched_->Fill(ptHost - tsoaDevice.view()[closestTkidx].pt()); + hCurvdiffMatched_->Fill((reco::charge(tsoaHost.view(), it) / tsoaHost.view()[it].pt()) - + (reco::charge(tsoaDevice.view(), closestTkidx) / tsoaDevice.view()[closestTkidx].pt())); + hetadiffMatched_->Fill(etaHost - tsoaDevice.view()[closestTkidx].eta()); + hphidiffMatched_->Fill(reco::deltaPhi(phiHost, helper::phi(tsoaDevice.view(), closestTkidx))); + hzdiffMatched_->Fill(zipHost - helper::zip(tsoaDevice.view(), closestTkidx)); + htipdiffMatched_->Fill(tipHost - 
helper::tip(tsoaDevice.view(), closestTkidx)); + hpt_eta_tkAllHostMatched_->Fill(etaHost, tsoaHost.view()[it].pt()); //matched to gpu + /* numerator of the phi-z matching efficiency: must be filled with the same (phi, z) pair as hphi_z_tkAllHost_, i.e. phiHost and not etaHost */ + hphi_z_tkAllHostMatched_->Fill(phiHost, zipHost); + } + hnTracks_->Fill(nTracksHost, nTracksDevice); + hnLooseAndAboveTracks_->Fill(nLooseAndAboveTracksHost, nLooseAndAboveTracksDevice); + hnLooseAndAboveTracks_matched_->Fill(nLooseAndAboveTracksHost, nLooseAndAboveTracksHost_matchedDevice); +} + +// +// -- Book Histograms +// +template +void SiPixelCompareTrackSoAAlpaka::bookHistograms(DQMStore::IBooker& iBook, + edm::Run const& iRun, + edm::EventSetup const& iSetup) { + iBook.cd(); + iBook.setCurrentFolder(topFolderName_); + + // clang-format off + std::string toRep = "Number of tracks"; + // FIXME: all the 2D correlation plots are quite heavy in terms of memory consumption, so a as soon as DQM supports THnSparse + // these should be moved to a less resource consuming format + hnTracks_ = iBook.book2I("nTracks", fmt::format("{} per event; Host; Device",toRep), 501, -0.5, 500.5, 501, -0.5, 500.5); + hnLooseAndAboveTracks_ = iBook.book2I("nLooseAndAboveTracks", fmt::format("{} (quality #geq loose) per event; Host; Device",toRep), 501, -0.5, 500.5, 501, -0.5, 500.5); + hnLooseAndAboveTracks_matched_ = iBook.book2I("nLooseAndAboveTracks_matched", fmt::format("{} (quality #geq loose) per event; Host; Device",toRep), 501, -0.5, 500.5, 501, -0.5, 500.5); + + toRep = "Number of all RecHits per track (quality #geq loose)"; + hnHits_ = iBook.book2I("nRecHits", fmt::format("{};Host;Device",toRep), 15, -0.5, 14.5, 15, -0.5, 14.5); + + toRep = "Number of all layers per track (quality #geq loose)"; + hnLayers_ = iBook.book2I("nLayers", fmt::format("{};Host;Device",toRep), 15, -0.5, 14.5, 15, -0.5, 14.5); + + toRep = "Track (quality #geq loose) #chi^{2}/ndof"; + hchi2_ = iBook.book2I("nChi2ndof", fmt::format("{};Host;Device",toRep), 40, 0., 20., 40, 0., 20.); + + toRep = "Track (quality #geq loose) charge"; + hCharge_ = 
iBook.book2I("charge",fmt::format("{};Host;Device",toRep),3, -1.5, 1.5, 3, -1.5, 1.5); + + hpt_ = iBook.book2I("pt", "Track (quality #geq loose) p_{T} [GeV];Host;Device", 200, 0., 200., 200, 0., 200.); + hptLogLog_ = make2DIfLog(iBook, true, true, "ptLogLog", "Track (quality #geq loose) p_{T} [GeV];Host;Device", 200, log10(0.5), log10(200.), 200, log10(0.5), log10(200.)); + heta_ = iBook.book2I("eta", "Track (quality #geq loose) #eta;Host;Device", 30, -3., 3., 30, -3., 3.); + hphi_ = iBook.book2I("phi", "Track (quality #geq loose) #phi;Host;Device", 30, -M_PI, M_PI, 30, -M_PI, M_PI); + hz_ = iBook.book2I("z", "Track (quality #geq loose) z [cm];Host;Device", 30, -30., 30., 30, -30., 30.); + htip_ = iBook.book2I("tip", "Track (quality #geq loose) TIP [cm];Host;Device", 100, -0.5, 0.5, 100, -0.5, 0.5); + //1D difference plots + hptdiffMatched_ = iBook.book1D("ptdiffmatched", " p_{T} diff [GeV] between matched tracks; #Delta p_{T} [GeV]", 60, -30., 30.); + hCurvdiffMatched_ = iBook.book1D("curvdiffmatched", "q/p_{T} diff [GeV] between matched tracks; #Delta q/p_{T} [GeV]", 60, -30., 30.); + hetadiffMatched_ = iBook.book1D("etadiffmatched", " #eta diff between matched tracks; #Delta #eta", 160, -0.04 ,0.04); + hphidiffMatched_ = iBook.book1D("phidiffmatched", " #phi diff between matched tracks; #Delta #phi", 160, -0.04 ,0.04); + hzdiffMatched_ = iBook.book1D("zdiffmatched", " z diff between matched tracks; #Delta z [cm]", 300, -1.5, 1.5); + htipdiffMatched_ = iBook.book1D("tipdiffmatched", " TIP diff between matched tracks; #Delta TIP [cm]", 300, -1.5, 1.5); + //2D plots for eff + hpt_eta_tkAllHost_ = iBook.book2I("ptetatrkAllHost", "Track (quality #geq loose) on Host; #eta; p_{T} [GeV];", 30, -M_PI, M_PI, 200, 0., 200.); + hpt_eta_tkAllHostMatched_ = iBook.book2I("ptetatrkAllHostmatched", "Track (quality #geq loose) on Host matched to Device track; #eta; p_{T} [GeV];", 30, -M_PI, M_PI, 200, 0., 200.); + + hphi_z_tkAllHost_ = iBook.book2I("phiztrkAllHost", "Track 
(quality #geq loose) on Host; #phi; z [cm];", 30, -M_PI, M_PI, 30, -30., 30.); + hphi_z_tkAllHostMatched_ = iBook.book2I("phiztrkAllHostmatched", "Track (quality #geq loose) on Host; #phi; z [cm];", 30, -M_PI, M_PI, 30, -30., 30.); + +} + +template +void SiPixelCompareTrackSoAAlpaka::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + // monitorpixelTrackSoA + edm::ParameterSetDescription desc; + desc.add("pixelTrackSrcHost", edm::InputTag("pixelTracksAlpakaSerial")); + desc.add("pixelTrackSrcDevice", edm::InputTag("pixelTracksAlpaka")); + desc.add("topFolderName", "SiPixelHeterogeneous/PixelTrackCompareDeviceVSHost"); + desc.add("useQualityCut", true); + desc.add("minQuality", "loose"); + desc.add("deltaR2cut", 0.04); + descriptions.addWithDefaultLabel(desc); +} + +using SiPixelPhase1CompareTrackSoAAlpaka = SiPixelCompareTrackSoAAlpaka; +using SiPixelPhase2CompareTrackSoAAlpaka = SiPixelCompareTrackSoAAlpaka; +using SiPixelHIonPhase1CompareTrackSoAAlpaka = SiPixelCompareTrackSoAAlpaka; + +DEFINE_FWK_MODULE(SiPixelPhase1CompareTrackSoAAlpaka); +DEFINE_FWK_MODULE(SiPixelPhase2CompareTrackSoAAlpaka); +DEFINE_FWK_MODULE(SiPixelHIonPhase1CompareTrackSoAAlpaka); diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoAAlpaka.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoAAlpaka.cc new file mode 100644 index 0000000000000..2eea6a980d9c5 --- /dev/null +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoAAlpaka.cc @@ -0,0 +1,186 @@ +// -*- C++ -*- +// Package: SiPixelCompareVertexSoAAlpaka +// Class: SiPixelCompareVertexSoAAlpaka +// +/**\class SiPixelCompareVertexSoAAlpaka SiPixelCompareVertexSoAAlpaka.cc +*/ +// +// Author: Suvankar Roy Chowdhury +// +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" 
+#include "FWCore/Utilities/interface/InputTag.h" +#include "DataFormats/Common/interface/Handle.h" +// DQM Histograming +#include "DQMServices/Core/interface/MonitorElement.h" +#include "DQMServices/Core/interface/DQMEDAnalyzer.h" +#include "DQMServices/Core/interface/DQMStore.h" +#include "DataFormats/VertexSoA/interface/ZVertexHost.h" +#include "DataFormats/BeamSpot/interface/BeamSpot.h" + +class SiPixelCompareVertexSoAAlpaka : public DQMEDAnalyzer { +public: + using IndToEdm = std::vector; + explicit SiPixelCompareVertexSoAAlpaka(const edm::ParameterSet&); + ~SiPixelCompareVertexSoAAlpaka() override = default; + void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override; + void analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) override; + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + const edm::EDGetTokenT tokenSoAVertexHost_; + const edm::EDGetTokenT tokenSoAVertexDevice_; + const edm::EDGetTokenT tokenBeamSpot_; + const std::string topFolderName_; + const float dzCut_; + MonitorElement* hnVertex_; + MonitorElement* hx_; + MonitorElement* hy_; + MonitorElement* hz_; + MonitorElement* hchi2_; + MonitorElement* hchi2oNdof_; + MonitorElement* hptv2_; + MonitorElement* hntrks_; + MonitorElement* hxdiff_; + MonitorElement* hydiff_; + MonitorElement* hzdiff_; +}; + +// +// constructors +// + +// Note tokenSoAVertexDevice_ contains data copied from device to host, hence is a HostCollection +SiPixelCompareVertexSoAAlpaka::SiPixelCompareVertexSoAAlpaka(const edm::ParameterSet& iConfig) + : tokenSoAVertexHost_(consumes(iConfig.getParameter("pixelVertexSrcHost"))), + tokenSoAVertexDevice_(consumes(iConfig.getParameter("pixelVertexSrcDevice"))), + tokenBeamSpot_(consumes(iConfig.getParameter("beamSpotSrc"))), + topFolderName_(iConfig.getParameter("topFolderName")), + dzCut_(iConfig.getParameter("dzCut")) {} + +// +// -- Analyze +// +void 
SiPixelCompareVertexSoAAlpaka::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) { + const auto& vsoaHandleHost = iEvent.getHandle(tokenSoAVertexHost_); + const auto& vsoaHandleDevice = iEvent.getHandle(tokenSoAVertexDevice_); + if (not vsoaHandleHost or not vsoaHandleDevice) { + edm::LogWarning out("SiPixelCompareVertexSoAAlpaka"); + if (not vsoaHandleHost) { + out << "reference (cpu) tracks not found; "; + } + if (not vsoaHandleDevice) { + out << "target (gpu) tracks not found; "; + } + out << "the comparison will not run."; + return; + } + + auto const& vsoaHost = *vsoaHandleHost; + int nVerticesHost = vsoaHost.view().nvFinal(); + auto const& vsoaDevice = *vsoaHandleDevice; + int nVerticesDevice = vsoaDevice.view().nvFinal(); + + auto bsHandle = iEvent.getHandle(tokenBeamSpot_); + float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; + if (!bsHandle.isValid()) { + edm::LogWarning("SiPixelCompareVertexSoAAlpaka") << "No beamspot found. returning vertexes with (0,0,Z) "; + } else { + const reco::BeamSpot& bs = *bsHandle; + x0 = bs.x0(); + y0 = bs.y0(); + z0 = bs.z0(); + dxdz = bs.dxdz(); + dydz = bs.dydz(); + } + + for (int ivc = 0; ivc < nVerticesHost; ivc++) { + auto sic = vsoaHost.view()[ivc].sortInd(); + auto zc = vsoaHost.view()[sic].zv(); + auto xc = x0 + dxdz * zc; + auto yc = y0 + dydz * zc; + zc += z0; + + auto ndofHost = vsoaHost.view()[sic].ndof(); + auto chi2Host = vsoaHost.view()[sic].chi2(); + + const int32_t notFound = -1; + int32_t closestVtxidx = notFound; + float mindz = dzCut_; + + for (int ivg = 0; ivg < nVerticesDevice; ivg++) { + auto sig = vsoaDevice.view()[ivg].sortInd(); + auto zgc = vsoaDevice.view()[sig].zv() + z0; + auto zDist = std::abs(zc - zgc); + //insert some matching condition + if (zDist > dzCut_) + continue; + if (mindz > zDist) { + mindz = zDist; + closestVtxidx = sig; + } + } + if (closestVtxidx == notFound) + continue; + + auto zg = vsoaDevice.view()[closestVtxidx].zv(); + auto xg = x0 + dxdz * zg; + auto yg = 
y0 + dydz * zg; + zg += z0; + auto ndofDevice = vsoaDevice.view()[closestVtxidx].ndof(); + auto chi2Device = vsoaDevice.view()[closestVtxidx].chi2(); + + hx_->Fill(xc - x0, xg - x0); + hy_->Fill(yc - y0, yg - y0); + hz_->Fill(zc, zg); + hxdiff_->Fill(xc - xg); + hydiff_->Fill(yc - yg); + hzdiff_->Fill(zc - zg); + hchi2_->Fill(chi2Host, chi2Device); + hchi2oNdof_->Fill(chi2Host / ndofHost, chi2Device / ndofDevice); + hptv2_->Fill(vsoaHost.view()[sic].ptv2(), vsoaDevice.view()[closestVtxidx].ptv2()); + hntrks_->Fill(ndofHost + 1, ndofDevice + 1); + } + hnVertex_->Fill(nVerticesHost, nVerticesDevice); +} + +// +// -- Book Histograms +// +void SiPixelCompareVertexSoAAlpaka::bookHistograms(DQMStore::IBooker& ibooker, + edm::Run const& iRun, + edm::EventSetup const& iSetup) { + ibooker.cd(); + ibooker.setCurrentFolder(topFolderName_); + + // FIXME: all the 2D correlation plots are quite heavy in terms of memory consumption, so a as soon as DQM supports either TH2I or THnSparse + // these should be moved to a less resource consuming format + hnVertex_ = ibooker.book2I("nVertex", "# of Vertices;Host;Device", 101, -0.5, 100.5, 101, -0.5, 100.5); + hx_ = ibooker.book2I("vx", "Vertez x - Beamspot x;Host;Device", 50, -0.1, 0.1, 50, -0.1, 0.1); + hy_ = ibooker.book2I("vy", "Vertez y - Beamspot y;Host;Device", 50, -0.1, 0.1, 50, -0.1, 0.1); + hz_ = ibooker.book2I("vz", "Vertez z;Host;Device", 30, -30., 30., 30, -30., 30.); + hchi2_ = ibooker.book2I("chi2", "Vertex chi-squared;Host;Device", 40, 0., 20., 40, 0., 20.); + hchi2oNdof_ = ibooker.book2I("chi2oNdof", "Vertex chi-squared/Ndof;Host;Device", 40, 0., 20., 40, 0., 20.); + hptv2_ = ibooker.book2I("ptsq", "Vertex #sum (p_{T})^{2};Host;Device", 200, 0., 200., 200, 0., 200.); + hntrks_ = ibooker.book2I("ntrk", "#tracks associated;Host;Device", 100, -0.5, 99.5, 100, -0.5, 99.5); + hntrks_ = ibooker.book2I("ntrk", "#tracks associated;Host;Device", 100, -0.5, 99.5, 100, -0.5, 99.5); + hxdiff_ = ibooker.book1D("vxdiff", ";Vertex x 
difference (Host - Device);#entries", 100, -0.001, 0.001); + hydiff_ = ibooker.book1D("vydiff", ";Vertex y difference (Host - Device);#entries", 100, -0.001, 0.001); + hzdiff_ = ibooker.book1D("vzdiff", ";Vertex z difference (Host - Device);#entries", 100, -2.5, 2.5); +} + +void SiPixelCompareVertexSoAAlpaka::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + // monitorpixelVertexSoA + edm::ParameterSetDescription desc; + desc.add("pixelVertexSrcHost", edm::InputTag("pixelVerticesAlpakaSerial")); + desc.add("pixelVertexSrcDevice", edm::InputTag("pixelVerticesAlpaka")); + desc.add("beamSpotSrc", edm::InputTag("offlineBeamSpot")); + desc.add("topFolderName", "SiPixelHeterogeneous/PixelVertexCompareSoADeviceVSHost"); + desc.add("dzCut", 1.); + descriptions.addWithDefaultLabel(desc); +} + +DEFINE_FWK_MODULE(SiPixelCompareVertexSoAAlpaka); diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorRecHitsSoAAlpaka.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorRecHitsSoAAlpaka.cc new file mode 100644 index 0000000000000..f4c8968fafb16 --- /dev/null +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorRecHitsSoAAlpaka.cc @@ -0,0 +1,198 @@ +#include "DQMServices/Core/interface/MonitorElement.h" +#include "DQMServices/Core/interface/DQMEDAnalyzer.h" +#include "DQMServices/Core/interface/DQMStore.h" +#include "DataFormats/Math/interface/approx_atan2.h" +#include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" +#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsHost.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include 
"Geometry/CommonDetUnit/interface/PixelGeomDetUnit.h"
+#include "Geometry/CommonTopologies/interface/PixelTopology.h"
+#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
+#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+
+// DQM analyzer that monitors a pixel RecHit SoA collection: global hit positions,
+// cluster charge and cluster sizes, overall and split per barrel layer / endcap disk.
+// NOTE(review): template parameter lists and template arguments look stripped in this
+// listing (bare `template`, `ESGetToken`, `EDGetTokenT`, `getParameter`) - confirm against the original.
+template
+class SiPixelMonitorRecHitsSoAAlpaka : public DQMEDAnalyzer {
+public:
+  using HitsOnHost = TrackingRecHitHost;
+
+  explicit SiPixelMonitorRecHitsSoAAlpaka(const edm::ParameterSet&);
+  ~SiPixelMonitorRecHitsSoAAlpaka() override = default;
+  void dqmBeginRun(const edm::Run&, const edm::EventSetup&) override;
+  void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override;
+  void analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  const edm::ESGetToken geomToken_;
+  const edm::ESGetToken topoToken_;
+  const edm::EDGetTokenT tokenSoAHits_;
+  const std::string topFolderName_;
+  // geometry and topology pointers, set in dqmBeginRun
+  const TrackerGeometry* tkGeom_ = nullptr;
+  const TrackerTopology* tTopo_ = nullptr;
+  // global (barrel + forward) histograms
+  MonitorElement* hnHits;
+  MonitorElement* hBFposZP;
+  MonitorElement* hBFposZR;
+  // barrel histograms
+  MonitorElement* hBposXY;
+  MonitorElement* hBposZP;
+  MonitorElement* hBcharge;
+  MonitorElement* hBsizex;
+  MonitorElement* hBsizey;
+  MonitorElement* hBposZPL[4];  // max 4 barrel hits
+  MonitorElement* hBchargeL[4];
+  MonitorElement* hBsizexL[4];
+  MonitorElement* hBsizeyL[4];
+  // endcap histograms
+  MonitorElement* hFposXY;
+  MonitorElement* hFposZP;
+  MonitorElement* hFcharge;
+  MonitorElement* hFsizex;
+  MonitorElement* hFsizey;
+  MonitorElement* hFposXYD[2][12];  // max 12 endcap disks
+  MonitorElement* hFchargeD[2][12];
+  MonitorElement* hFsizexD[2][12];
+  MonitorElement* hFsizeyD[2][12];
+};
+
+//
+// constructors
+//
+template
+SiPixelMonitorRecHitsSoAAlpaka::SiPixelMonitorRecHitsSoAAlpaka(const edm::ParameterSet& iConfig)
+    : geomToken_(esConsumes()),
+      topoToken_(esConsumes()),
+      tokenSoAHits_(consumes(iConfig.getParameter("pixelHitsSrc"))),
+      topFolderName_(iConfig.getParameter("TopFolderName")) {}
+
+//
+// Begin Run
+//
+// Caches the tracker geometry and topology from the event setup.
+template
+void SiPixelMonitorRecHitsSoAAlpaka::dqmBeginRun(const edm::Run& iRun, const edm::EventSetup& iSetup) {
+  tkGeom_ = &iSetup.getData(geomToken_);
+  tTopo_ = &iSetup.getData(topoToken_);
+}
+
+//
+// -- Analyze
+//
+// Fills the hit-level histograms for every hit in the SoA; returns early with a
+// warning when the collection is not available.
+template
+void SiPixelMonitorRecHitsSoAAlpaka::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) {
+  const auto& rhsoaHandle = iEvent.getHandle(tokenSoAHits_);
+  if (!rhsoaHandle.isValid()) {
+    edm::LogWarning("SiPixelMonitorRecHitsSoAAlpaka") << "No RecHits SoA found \n returning!";
+    return;
+  }
+  auto const& rhsoa = *rhsoaHandle;
+  auto const& soa2d = rhsoa.const_view();
+
+  uint32_t nHits_ = soa2d.metadata().size();
+  hnHits->Fill(nHits_);
+  auto detIds = tkGeom_->detUnitIds();
+  for (uint32_t i = 0; i < nHits_; i++) {
+    // the hit's detectorIndex() is used as an index into the geometry's det-unit id list
+    DetId id = detIds[soa2d[i].detectorIndex()];
+    float xG = soa2d[i].xGlobal();
+    float yG = soa2d[i].yGlobal();
+    float zG = soa2d[i].zGlobal();
+    float rG = soa2d[i].rGlobal();
+    float fphi = short2phi(soa2d[i].iphi());
+    uint32_t charge = soa2d[i].chargeAndStatus().charge;
+    // NOTE(review): the stored cluster sizes are divided by 8 and rounded up - presumably
+    // they are kept in units of 1/8 pixel; confirm against the RecHit SoA definition
+    int16_t sizeX = std::ceil(float(std::abs(soa2d[i].clusterSizeX()) / 8.));
+    int16_t sizeY = std::ceil(float(std::abs(soa2d[i].clusterSizeY()) / 8.));
+    hBFposZP->Fill(zG, fphi);
+    // r is given the sign of y for the Z-R map
+    int16_t ysign = yG >= 0 ? 1 : -1;
+    hBFposZR->Fill(zG, rG * ysign);
+    switch (id.subdetId()) {
+      case PixelSubdetector::PixelBarrel:
+        hBposXY->Fill(xG, yG);
+        hBposZP->Fill(zG, fphi);
+        hBcharge->Fill(charge);
+        hBsizex->Fill(sizeX);
+        hBsizey->Fill(sizeY);
+        // per-layer histograms are indexed by pxbLayer(id) - 1
+        hBposZPL[tTopo_->pxbLayer(id) - 1]->Fill(zG, fphi);
+        hBchargeL[tTopo_->pxbLayer(id) - 1]->Fill(charge);
+        hBsizexL[tTopo_->pxbLayer(id) - 1]->Fill(sizeX);
+        hBsizeyL[tTopo_->pxbLayer(id) - 1]->Fill(sizeY);
+        break;
+      case PixelSubdetector::PixelEndcap:
+        hFposXY->Fill(xG, yG);
+        hFposZP->Fill(zG, fphi);
+        hFcharge->Fill(charge);
+        hFsizex->Fill(sizeX);
+        hFsizey->Fill(sizeY);
+        // per-disk histograms are indexed by [pxfSide(id) - 1][pxfDisk(id) - 1]
+        hFposXYD[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(xG, yG);
+        hFchargeD[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(charge);
+        hFsizexD[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeX);
+        hFsizeyD[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeY);
+        break;
+    }
+  }
+}
+
+//
+// -- Book Histograms
+//
+// Books the global, barrel, per-layer, endcap and per-disk histograms in topFolderName_.
+template
+void SiPixelMonitorRecHitsSoAAlpaka::bookHistograms(DQMStore::IBooker& iBook,
+                                                   edm::Run const& iRun,
+                                                   edm::EventSetup const& iSetup) {
+  iBook.cd();
+  iBook.setCurrentFolder(topFolderName_);
+
+  // clang-format off
+  //Global
+  hnHits = iBook.book1D("nHits", "RecHits per event;RecHits;#events", 200, 0, 5000);
+  hBFposZP = iBook.book2D("recHitsGlobalPosZP", "RecHits position Global;Z;#phi", 1000, -60, 60, 200,-3.2,3.2);
+  hBFposZR = iBook.book2D("recHitsGlobalPosZR", "RecHits position Global;Z;R", 1000, -60, 60, 200,-20,20);
+  //Barrel
+  hBposXY = iBook.book2D("recHitsBarrelPosXY", "RecHits position Barrel;X;Y", 200, -20, 20, 200,-20,20);
+  hBposZP = iBook.book2D("recHitsBarrelPosZP", "RecHits position Barrel;Z;#phi", 300, -30, 30, 200,-3.2,3.2);
+  hBcharge = iBook.book1D("recHitsBarrelCharge", "RecHits Charge Barrel;Charge;#events", 250, 0, 100000);
+  hBsizex = iBook.book1D("recHitsBarrelSizex", "RecHits SizeX Barrel;SizeX;#events", 50, 0, 50);
+  hBsizey = iBook.book1D("recHitsBarrelSizey", "RecHits SizeY Barrel;SizeY;#events", 50, 0, 50);
+  //Barrel Layer
+  // NOTE(review): the loop condition looks truncated in this listing (presumably
+  // `il < tkGeom_->numberOfLayers(...)`) - confirm against the original file
+  for(unsigned int il=0;ilnumberOfLayers(PixelSubdetector::PixelBarrel);il++){
+    hBposZPL[il] = iBook.book2D(Form("recHitsBLay%dPosZP",il+1), Form("RecHits position Barrel Layer%d;Z;#phi",il+1), 300, -30, 30, 200,-3.2,3.2);
+    hBchargeL[il] = iBook.book1D(Form("recHitsBLay%dCharge",il+1), Form("RecHits Charge Barrel Layer%d;Charge;#events",il+1), 250, 0, 100000);
+    hBsizexL[il] = iBook.book1D(Form("recHitsBLay%dSizex",il+1), Form("RecHits SizeX Barrel Layer%d;SizeX;#events",il+1), 50, 0, 50);
+    hBsizeyL[il] = iBook.book1D(Form("recHitsBLay%dSizey",il+1), Form("RecHits SizeY Barrel Layer%d;SizeY;#events",il+1), 50, 0, 50);
+  }
+  //Endcaps
+  hFposXY = iBook.book2D("recHitsEndcapsPosXY", "RecHits position Endcaps;X;Y", 200, -20, 20, 200,-20, 20);
+  hFposZP = iBook.book2D("recHitsEndcapsPosZP", "RecHits position Endcaps;Z;#phi", 600, -60, 60, 200,-3.2,3.2);
+  hFcharge = iBook.book1D("recHitsEndcapsCharge", "RecHits Charge Endcaps;Charge;#events", 250, 0, 100000);
+  hFsizex = iBook.book1D("recHitsEndcapsSizex", "RecHits SizeX Endcaps;SizeX;#events", 50, 0, 50);
+  hFsizey = iBook.book1D("recHitsEndcapsSizey", "RecHits SizeY Endcaps;SizeY;#events", 50, 0, 50);
+  //Endcaps Disk
+  for(int is=0;is<2;is++){
+    // side index 0 gets negative disk labels, side index 1 positive ones
+    int sign=is==0? -1:1;
+    // NOTE(review): loop condition looks truncated here as well (presumably
+    // `id < tkGeom_->numberOfLayers(...)`) - confirm against the original file
+    for(unsigned int id=0;idnumberOfLayers(PixelSubdetector::PixelEndcap);id++){
+      hFposXYD[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosXY",id*sign+sign), Form("RecHits position Endcaps Disk%+d;X;Y",id*sign+sign), 200, -20, 20, 200,-20,20);
+      hFchargeD[is][id] = iBook.book1D(Form("recHitsFDisk%+dCharge",id*sign+sign), Form("RecHits Charge Endcaps Disk%+d;Charge;#events",id*sign+sign), 250, 0, 100000);
+      hFsizexD[is][id] = iBook.book1D(Form("recHitsFDisk%+dSizex",id*sign+sign), Form("RecHits SizeX Endcaps Disk%+d;SizeX;#events",id*sign+sign), 50, 0, 50);
+      hFsizeyD[is][id] = iBook.book1D(Form("recHitsFDisk%+dSizey",id*sign+sign), Form("RecHits SizeY Endcaps Disk%+d;SizeY;#events",id*sign+sign), 50, 0, 50);
+    }
+  }
+}
+
+// Declares the configuration parameters and their default values.
+template
+void SiPixelMonitorRecHitsSoAAlpaka::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  // monitorpixelRecHitsSoA
+  edm::ParameterSetDescription desc;
+  desc.add("pixelHitsSrc", edm::InputTag("siPixelRecHitsPreSplittingAlpaka"));
+  desc.add("TopFolderName", "SiPixelHeterogeneous/PixelRecHitsAlpaka");
+  descriptions.addWithDefaultLabel(desc);
+}
+
+// One plugin per instantiation (Phase-1, Phase-2, heavy-ion Phase-1).
+using SiPixelPhase1MonitorRecHitsSoAAlpaka = SiPixelMonitorRecHitsSoAAlpaka;
+using SiPixelPhase2MonitorRecHitsSoAAlpaka = SiPixelMonitorRecHitsSoAAlpaka;
+using SiPixelHIonPhase1MonitorRecHitsSoAAlpaka = SiPixelMonitorRecHitsSoAAlpaka;
+
+#include "FWCore/Framework/interface/MakerMacros.h"
+DEFINE_FWK_MODULE(SiPixelPhase1MonitorRecHitsSoAAlpaka);
+DEFINE_FWK_MODULE(SiPixelPhase2MonitorRecHitsSoAAlpaka);
+DEFINE_FWK_MODULE(SiPixelHIonPhase1MonitorRecHitsSoAAlpaka);
diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorTrackSoAAlpaka.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorTrackSoAAlpaka.cc new file mode 100644 index 0000000000000..fd98957ee8492 --- /dev/null +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorTrackSoAAlpaka.cc @@ -0,0 +1,197 @@ +// -*- C++ -*- +// Package: SiPixelMonitorTrackSoAAlpaka +// Class: SiPixelMonitorTrackSoAAlpaka +// +/**\class
SiPixelMonitorTrackSoAAlpaka SiPixelMonitorTrackSoAAlpaka.cc
+*/
+//
+// Author: Suvankar Roy Chowdhury
+//
+
+// for string manipulations
+// NOTE(review): the header name of the next #include is missing in this listing
+// (fmt::format is used below) - confirm against the original file
+#include
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+// DQM Histograming
+#include "DQMServices/Core/interface/MonitorElement.h"
+#include "DQMServices/Core/interface/DQMEDAnalyzer.h"
+#include "DQMServices/Core/interface/DQMStore.h"
+#include "DataFormats/TrackSoA/interface/TracksHost.h"
+
+// DQM analyzer that monitors a pixel-track SoA collection: track multiplicity and
+// quality for all tracks, and hits, layers, chi2 and kinematics for tracks passing
+// the optional quality cut.
+// NOTE(review): template parameter lists and template arguments look stripped in this
+// listing (bare `template`, `TracksHost`, `EDGetTokenT`, `getParameter`) - confirm against the original.
+template
+class SiPixelMonitorTrackSoAAlpaka : public DQMEDAnalyzer {
+public:
+  using PixelTrackHeterogeneous = TracksHost;
+  explicit SiPixelMonitorTrackSoAAlpaka(const edm::ParameterSet&);
+  ~SiPixelMonitorTrackSoAAlpaka() override = default;
+  void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override;
+  void analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  edm::EDGetTokenT tokenSoATrack_;
+  std::string topFolderName_;
+  bool useQualityCut_;              // when true, tracks below minQuality_ only enter hquality and nTracks
+  pixelTrack::Quality minQuality_;  // parsed from the "minQuality" string parameter
+  MonitorElement* hnTracks;
+  MonitorElement* hnLooseAndAboveTracks;
+  MonitorElement* hnHits;
+  MonitorElement* hnHitsVsPhi;
+  MonitorElement* hnHitsVsEta;
+  MonitorElement* hnLayers;
+  MonitorElement* hnLayersVsPhi;
+  MonitorElement* hnLayersVsEta;
+  MonitorElement* hchi2;
+  MonitorElement* hChi2VsPhi;
+  MonitorElement* hChi2VsEta;
+  MonitorElement* hpt;
+  MonitorElement* heta;
+  MonitorElement* hphi;
+  MonitorElement* hz;
+  MonitorElement* htip;
+  MonitorElement* hquality;
+};
+
+//
+// constructors
+//
+
+template
+SiPixelMonitorTrackSoAAlpaka::SiPixelMonitorTrackSoAAlpaka(const edm::ParameterSet& iConfig) {
+  tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc"));
+  topFolderName_ = iConfig.getParameter("topFolderName");  //"SiPixelHeterogeneous/PixelTrackSoA";
+  useQualityCut_ = iConfig.getParameter("useQualityCut");
+  minQuality_ = pixelTrack::qualityByName(iConfig.getParameter("minQuality"));
+}
+
+//
+// -- Analyze
+//
+// Loops over the track SoA, fills the quality histogram for every track with pt > 0
+// and the remaining histograms for tracks passing the quality cut.
+template
+void SiPixelMonitorTrackSoAAlpaka::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) {
+  const auto& tsoaHandle = iEvent.getHandle(tokenSoATrack_);
+  if (!tsoaHandle.isValid()) {
+    edm::LogWarning("SiPixelMonitorTrackSoAAlpaka") << "No Track SoA found \n returning!" << std::endl;
+    return;
+  }
+
+  auto const& tsoa = *tsoaHandle.product();
+  auto maxTracks = tsoa.view().metadata().size();
+  auto const* quality = tsoa.view().quality();
+  int32_t nTracks = 0;
+  int32_t nLooseAndAboveTracks = 0;
+
+  for (int32_t it = 0; it < maxTracks; ++it) {
+    auto nHits = tsoa.view().detIndices().size(it);
+    auto nLayers = tsoa.view()[it].nLayers();
+    // a track with no hits ends the loop
+    if (nHits == 0)
+      break;  // this is a guard
+    float pt = tsoa.view()[it].pt();
+    // skips non-positive pt (the negated comparison also rejects NaN)
+    if (!(pt > 0.))
+      continue;
+
+    // fill the quality for all tracks
+    pixelTrack::Quality qual = quality[it];
+    hquality->Fill(int(qual));
+    nTracks++;
+
+    if (useQualityCut_ && quality[it] < minQuality_)
+      continue;
+
+    // fill parameters only for quality >= loose
+
+    float chi2 = tsoa.view()[it].chi2();
+    float phi = tsoa.view()[it].state()(0);  //TODO: put these numbers in enum
+    float zip = tsoa.view()[it].state()(4);
+    float eta = tsoa.view()[it].eta();
+    float tip = tsoa.view()[it].state()(1);
+
+    hchi2->Fill(chi2);
+    hChi2VsPhi->Fill(phi, chi2);
+    hChi2VsEta->Fill(eta, chi2);
+    hnHits->Fill(nHits);
+    hnLayers->Fill(nLayers);
+    hnHitsVsPhi->Fill(phi, nHits);
+    hnHitsVsEta->Fill(eta, nHits);
+    hnLayersVsPhi->Fill(phi, nLayers);
+    hnLayersVsEta->Fill(eta, nLayers);
+    hpt->Fill(pt);
+    heta->Fill(eta);
+    hphi->Fill(phi);
+    hz->Fill(zip);
+    htip->Fill(tip);
+    nLooseAndAboveTracks++;
+  }
+  hnTracks->Fill(nTracks);
+  hnLooseAndAboveTracks->Fill(nLooseAndAboveTracks);
+}
+
+//
+// -- Book Histograms
+//
+// Books all track histograms in topFolderName_ and labels the quality bins.
+template
+void SiPixelMonitorTrackSoAAlpaka::bookHistograms(DQMStore::IBooker& iBook,
+                                                 edm::Run const& iRun,
+                                                 edm::EventSetup const& iSetup) {
+  iBook.cd();
+  iBook.setCurrentFolder(topFolderName_);
+
+  // clang-format off
+  // toRep holds the quantity description that is substituted into the titles below
+  std::string toRep = "Number of tracks";
+  hnTracks = iBook.book1D("nTracks", fmt::format(";{} per event;#events",toRep), 1001, -0.5, 1000.5);
+  hnLooseAndAboveTracks = iBook.book1D("nLooseAndAboveTracks", fmt::format(";{} (quality #geq loose) per event;#events",toRep), 1001, -0.5, 1000.5);
+
+  toRep = "Number of all RecHits per track (quality #geq loose)";
+  hnHits = iBook.book1D("nRecHits", fmt::format(";{};#tracks",toRep), 15, -0.5, 14.5);
+  hnHitsVsPhi = iBook.bookProfile("nHitsPerTrackVsPhi", fmt::format("{} vs track #phi;Track #phi;{}",toRep,toRep), 30, -M_PI, M_PI,0., 15.);
+  hnHitsVsEta = iBook.bookProfile("nHitsPerTrackVsEta", fmt::format("{} vs track #eta;Track #eta;{}",toRep,toRep), 30, -3., 3., 0., 15.);
+
+  toRep = "Number of all layers per track (quality #geq loose)";
+  hnLayers = iBook.book1D("nLayers", fmt::format(";{};#tracks",toRep), 15, -0.5, 14.5);
+  hnLayersVsPhi = iBook.bookProfile("nLayersPerTrackVsPhi", fmt::format("{} vs track #phi;Track #phi;{}",toRep,toRep), 30, -M_PI, M_PI,0., 15.);
+  hnLayersVsEta = iBook.bookProfile("nLayersPerTrackVsEta", fmt::format("{} vs track #eta;Track #eta;{}",toRep,toRep), 30, -3., 3., 0., 15.);
+
+  toRep = "Track (quality #geq loose) #chi^{2}/ndof";
+  hchi2 = iBook.book1D("nChi2ndof", fmt::format(";{};#tracks",toRep), 40, 0., 20.);
+  hChi2VsPhi = iBook.bookProfile("nChi2ndofVsPhi", fmt::format("{} vs track #phi;Track #phi;{}",toRep,toRep), 30, -M_PI, M_PI, 0., 20.);
+  hChi2VsEta = iBook.bookProfile("nChi2ndofVsEta", fmt::format("{} vs track #eta;Track #eta;{}",toRep,toRep), 30, -3., 3., 0., 20.);
+  // clang-format on
+
+  hpt = iBook.book1D("pt", ";Track (quality #geq loose) p_{T} [GeV];#tracks", 200, 0., 200.);
+  heta = iBook.book1D("eta", ";Track (quality #geq loose) #eta;#tracks", 30, -3., 3.);
+  hphi = iBook.book1D("phi", ";Track (quality #geq loose) #phi;#tracks", 30, -M_PI, M_PI);
+  hz = iBook.book1D("z", ";Track (quality #geq loose) z [cm];#tracks", 30, -30., 30.);
+  htip = iBook.book1D("tip", ";Track (quality #geq loose) TIP [cm];#tracks", 100, -0.5, 0.5);
+  hquality = iBook.book1D("quality", ";Track Quality;#tracks", 7, -0.5, 6.5);
+  // label the quality bins (bin numbering starts at 1) with the names in pixelTrack::qualityName
+  uint i = 1;
+  for (const auto& q : pixelTrack::qualityName) {
+    hquality->setBinLabel(i, q.data(), 1);
+    i++;
+  }
+}
+
+// Declares the configuration parameters and their default values.
+template
+void SiPixelMonitorTrackSoAAlpaka::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  // monitorpixelTrackSoA
+  edm::ParameterSetDescription desc;
+  desc.add("pixelTrackSrc", edm::InputTag("pixelTracksAlpaka"));
+  desc.add("topFolderName", "SiPixelHeterogeneous/PixelTrackAlpaka");
+  desc.add("useQualityCut", true);
+  desc.add("minQuality", "loose");
+  descriptions.addWithDefaultLabel(desc);
+}
+
+// One plugin per instantiation (Phase-1, Phase-2, heavy-ion Phase-1).
+using SiPixelPhase1MonitorTrackSoAAlpaka = SiPixelMonitorTrackSoAAlpaka;
+using SiPixelPhase2MonitorTrackSoAAlpaka = SiPixelMonitorTrackSoAAlpaka;
+using SiPixelHIonPhase1MonitorTrackSoAAlpaka = SiPixelMonitorTrackSoAAlpaka;
+
+DEFINE_FWK_MODULE(SiPixelPhase1MonitorTrackSoAAlpaka);
+DEFINE_FWK_MODULE(SiPixelPhase2MonitorTrackSoAAlpaka);
+DEFINE_FWK_MODULE(SiPixelHIonPhase1MonitorTrackSoAAlpaka);
diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoAAlpaka.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoAAlpaka.cc new file mode 100644 index 0000000000000..d3121f77bccb8 --- /dev/null +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoAAlpaka.cc @@ -0,0 +1,131 @@ +// -*- C++ -*- +///bookLayer +// Package: SiPixelMonitorVertexSoAAlpaka +// Class: SiPixelMonitorVertexSoAAlpaka +// +/**\class
SiPixelMonitorVertexSoAAlpaka SiPixelMonitorVertexSoAAlpaka.cc
+*/
+//
+// Author: Suvankar Roy Chowdhury
+//
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "DataFormats/Common/interface/Handle.h"
+// DQM Histograming
+#include "DQMServices/Core/interface/MonitorElement.h"
+#include "DQMServices/Core/interface/DQMEDAnalyzer.h"
+#include "DQMServices/Core/interface/DQMStore.h"
+#include "DataFormats/VertexSoA/interface/ZVertexHost.h"
+#include "DataFormats/BeamSpot/interface/BeamSpot.h"
+
+// DQM analyzer that monitors a single pixel-vertex SoA collection: vertex multiplicity,
+// position, chi2, chi2/ndof, sum pT^2 and number of associated tracks.
+// NOTE(review): template arguments (std::vector, EDGetTokenT, getParameter, desc.add)
+// look stripped in this listing - confirm against the original file.
+class SiPixelMonitorVertexSoAAlpaka : public DQMEDAnalyzer {
+public:
+  using IndToEdm = std::vector;
+  explicit SiPixelMonitorVertexSoAAlpaka(const edm::ParameterSet&);
+  ~SiPixelMonitorVertexSoAAlpaka() override = default;
+  void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override;
+  void analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) override;
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  const edm::EDGetTokenT tokenSoAVertex_;  // input: "pixelVertexSrc"
+  const edm::EDGetTokenT tokenBeamSpot_;   // input: "beamSpotSrc"
+  std::string topFolderName_;              // DQM folder the histograms are booked in
+  MonitorElement* hnVertex;
+  MonitorElement* hx;
+  MonitorElement* hy;
+  MonitorElement* hz;
+  MonitorElement* hchi2;
+  MonitorElement* hchi2oNdof;
+  MonitorElement* hptv2;
+  MonitorElement* hntrks;
+};
+
+//
+// constructors
+//
+
+SiPixelMonitorVertexSoAAlpaka::SiPixelMonitorVertexSoAAlpaka(const edm::ParameterSet& iConfig)
+    : tokenSoAVertex_(consumes(iConfig.getParameter("pixelVertexSrc"))),
+      tokenBeamSpot_(consumes(iConfig.getParameter("beamSpotSrc"))),
+      topFolderName_(iConfig.getParameter("topFolderName")) {}
+
+//
+// -- Analyze
+//
+// Fills the per-vertex histograms for all nvFinal() vertices; returns early with a
+// warning when the vertex collection is not available.
+void SiPixelMonitorVertexSoAAlpaka::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) {
+  const auto& vsoaHandle = iEvent.getHandle(tokenSoAVertex_);
+  if (!vsoaHandle.isValid()) {
+    edm::LogWarning("SiPixelMonitorVertexSoAAlpaka") << "No Vertex SoA found \n returning!" << std::endl;
+    return;
+  }
+
+  auto const& vsoa = *vsoaHandle;
+  int nVertices = vsoa.view().nvFinal();
+  // Beam spot parameters; all stay at zero when no beam spot is available.
+  auto bsHandle = iEvent.getHandle(tokenBeamSpot_);
+  float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.;
+  if (!bsHandle.isValid()) {
+    edm::LogWarning("SiPixelMonitorVertexSoAAlpaka") << "No beamspot found. returning vertexes with (0,0,Z) ";
+  } else {
+    const reco::BeamSpot& bs = *bsHandle;
+    x0 = bs.x0();
+    y0 = bs.y0();
+    z0 = bs.z0();
+    dxdz = bs.dxdz();
+    dydz = bs.dydz();
+  }
+
+  for (int iv = 0; iv < nVertices; iv++) {
+    // vertices are accessed through the sortInd() indirection
+    auto si = vsoa.view()[iv].sortInd();
+    auto z = vsoa.view()[si].zv();
+    // x and y are derived from the beam spot position and slopes at the vertex z
+    auto x = x0 + dxdz * z;
+    auto y = y0 + dydz * z;
+
+    z += z0;
+    hx->Fill(x);
+    hy->Fill(y);
+    hz->Fill(z);
+    auto ndof = vsoa.view()[si].ndof();
+    hchi2->Fill(vsoa.view()[si].chi2());
+    hchi2oNdof->Fill(vsoa.view()[si].chi2() / ndof);
+    hptv2->Fill(vsoa.view()[si].ptv2());
+    // the "#tracks associated" histogram is filled with ndof + 1
+    hntrks->Fill(ndof + 1);
+  }
+  hnVertex->Fill(nVertices);
+}
+
+//
+// -- Book Histograms
+//
+// Books the vertex histograms in topFolderName_.
+void SiPixelMonitorVertexSoAAlpaka::bookHistograms(DQMStore::IBooker& ibooker,
+                                                  edm::Run const& iRun,
+                                                  edm::EventSetup const& iSetup) {
+  //std::string top_folder = ""//
+  ibooker.cd();
+  ibooker.setCurrentFolder(topFolderName_);
+  hnVertex = ibooker.book1D("nVertex", ";# of Vertices;#entries", 101, -0.5, 100.5);
+  hx = ibooker.book1D("vx", ";Vertex x;#entries", 10, -5., 5.);
+  hy = ibooker.book1D("vy", ";Vertex y;#entries", 10, -5., 5.);
+  hz = ibooker.book1D("vz", ";Vertex z;#entries", 30, -30., 30);
+  hchi2 = ibooker.book1D("chi2", ";Vertex chi-squared;#entries", 40, 0., 20.);
+  hchi2oNdof = ibooker.book1D("chi2oNdof", ";Vertex chi-squared/Ndof;#entries", 40, 0., 20.);
+  hptv2 = ibooker.book1D("ptsq", ";Vertex #sum (p_{T})^{2};#entries", 200, 0., 200.);
+  hntrks = ibooker.book1D("ntrk", ";#tracks associated;#entries", 100, -0.5, 99.5);
+}
+
+// Declares the configuration parameters and their default values.
+void SiPixelMonitorVertexSoAAlpaka::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  // monitorpixelVertexSoA
+  edm::ParameterSetDescription desc;
+  desc.add("pixelVertexSrc", edm::InputTag("pixelVerticesAlpaka"));
+  desc.add("beamSpotSrc", edm::InputTag("offlineBeamSpot"));
+  desc.add("topFolderName", "SiPixelHeterogeneous/PixelVertexAlpaka");
+  descriptions.addWithDefaultLabel(desc);
+}
+
+DEFINE_FWK_MODULE(SiPixelMonitorVertexSoAAlpaka);
diff --git a/DQM/SiPixelHeterogeneous/python/SiPixelHeterogenousDQM_FirstStep_cff.py b/DQM/SiPixelHeterogeneous/python/SiPixelHeterogenousDQM_FirstStep_cff.py index dfb83708c95cf..95245a3fea968 100644 --- a/DQM/SiPixelHeterogeneous/python/SiPixelHeterogenousDQM_FirstStep_cff.py +++ b/DQM/SiPixelHeterogeneous/python/SiPixelHeterogenousDQM_FirstStep_cff.py @@ -7,20 +7,35 @@ from DQM.SiPixelHeterogeneous.siPixelPhase2MonitorTrackSoA_cfi import * from DQM.SiPixelHeterogeneous.siPixelHIonPhase1MonitorTrackSoA_cfi import * from DQM.SiPixelHeterogeneous.siPixelMonitorVertexSoA_cfi import * +# Alpaka Modules +from Configuration.ProcessModifiers.alpaka_cff import alpaka +from DQM.SiPixelHeterogeneous.siPixelPhase1MonitorRecHitsSoAAlpaka_cfi import * +from DQM.SiPixelHeterogeneous.siPixelPhase2MonitorRecHitsSoAAlpaka_cfi import * +from DQM.SiPixelHeterogeneous.siPixelHIonPhase1MonitorRecHitsSoAAlpaka_cfi import * +from DQM.SiPixelHeterogeneous.siPixelPhase1MonitorTrackSoAAlpaka_cfi import * +from DQM.SiPixelHeterogeneous.siPixelPhase2MonitorTrackSoAAlpaka_cfi import * +from DQM.SiPixelHeterogeneous.siPixelHIonPhase1MonitorTrackSoAAlpaka_cfi import * +from DQM.SiPixelHeterogeneous.siPixelMonitorVertexSoAAlpaka_cfi import * # Run-3 sequence monitorpixelSoASource = cms.Sequence(siPixelPhase1MonitorRecHitsSoA *
siPixelPhase1MonitorTrackSoA * siPixelMonitorVertexSoA) - +# Run-3 Alpaka sequence +monitorpixelSoASourceAlpaka = cms.Sequence(siPixelPhase1MonitorRecHitsSoAAlpaka * siPixelPhase1MonitorTrackSoAAlpaka * siPixelMonitorVertexSoAAlpaka) +alpaka.toReplaceWith(monitorpixelSoASource, monitorpixelSoASourceAlpaka) # Phase-2 sequence from Configuration.Eras.Modifier_phase2_tracker_cff import phase2_tracker _monitorpixelSoARecHitsSource = cms.Sequence(siPixelPhase2MonitorRecHitsSoA * siPixelPhase2MonitorTrackSoA * siPixelMonitorVertexSoA) -phase2_tracker.toReplaceWith(monitorpixelSoASource, _monitorpixelSoARecHitsSource) +(phase2_tracker & ~alpaka).toReplaceWith(monitorpixelSoASource, _monitorpixelSoARecHitsSource) +_monitorpixelSoARecHitsSourceAlpaka = cms.Sequence(siPixelPhase2MonitorRecHitsSoAAlpaka * siPixelPhase2MonitorTrackSoAAlpaka * siPixelMonitorVertexSoAAlpaka) +(phase2_tracker & alpaka).toReplaceWith(monitorpixelSoASource, _monitorpixelSoARecHitsSourceAlpaka) # HIon Phase 1 sequence from Configuration.ProcessModifiers.pp_on_AA_cff import pp_on_AA _monitorpixelSoARecHitsSourceHIon = cms.Sequence(siPixelHIonPhase1MonitorRecHitsSoA * siPixelHIonPhase1MonitorTrackSoA * siPixelMonitorVertexSoA) (pp_on_AA & ~phase2_tracker).toReplaceWith(monitorpixelSoASource, _monitorpixelSoARecHitsSourceHIon) +_monitorpixelSoARecHitsSourceHIonAlpaka = cms.Sequence(siPixelHIonPhase1MonitorRecHitsSoAAlpaka * siPixelHIonPhase1MonitorTrackSoAAlpaka * siPixelMonitorVertexSoAAlpaka) +(pp_on_AA & ~phase2_tracker & alpaka).toReplaceWith(monitorpixelSoASource, _monitorpixelSoARecHitsSourceHIonAlpaka) #Define the sequence for GPU vs CPU validation #This should run:- individual monitor for the 2 collections + comparison module @@ -33,6 +48,14 @@ from DQM.SiPixelHeterogeneous.siPixelCompareVertexSoA_cfi import * from DQM.SiPixelHeterogeneous.siPixelPhase1RawDataErrorComparator_cfi import * from DQM.SiPixelPhase1Common.SiPixelPhase1RawData_cfi import * +#Alpaka +from 
DQM.SiPixelHeterogeneous.siPixelPhase1CompareRecHitsSoAAlpaka_cfi import * +from DQM.SiPixelHeterogeneous.siPixelPhase2CompareRecHitsSoAAlpaka_cfi import * +from DQM.SiPixelHeterogeneous.siPixelHIonPhase1CompareRecHitsSoAAlpaka_cfi import * +from DQM.SiPixelHeterogeneous.siPixelPhase1CompareTrackSoAAlpaka_cfi import * +from DQM.SiPixelHeterogeneous.siPixelPhase2CompareTrackSoAAlpaka_cfi import * +from DQM.SiPixelHeterogeneous.siPixelHIonPhase1CompareTrackSoAAlpaka_cfi import * +from DQM.SiPixelHeterogeneous.siPixelCompareVertexSoAAlpaka_cfi import * # digi errors SiPixelPhase1RawDataConfForCPU = copy.deepcopy(SiPixelPhase1RawDataConf) @@ -126,6 +149,43 @@ topFolderName = 'SiPixelHeterogeneous/PixelVertexSoAGPU', ) +### Alpaka + +# PixelRecHits: monitor of CPUSerial product (Alpaka backend: 'serial_sync') +siPixelRecHitsSoAMonitorSerial = siPixelPhase1MonitorRecHitsSoAAlpaka.clone( + pixelHitsSrc = cms.InputTag( 'siPixelRecHitsPreSplittingAlpakaSerial' ), + TopFolderName = cms.string( 'SiPixelHeterogeneous/PixelRecHitsSerial' ) +) + +# PixelRecHits: monitor of Device product (Alpaka backend: '') +siPixelRecHitsSoAMonitorDevice = siPixelPhase1MonitorRecHitsSoAAlpaka.clone( + pixelHitsSrc = cms.InputTag( 'siPixelRecHitsPreSplittingAlpaka' ), + TopFolderName = cms.string( 'SiPixelHeterogeneous/PixelRecHitsDevice' ) +) + +# PixelTracks: monitor of CPUSerial product (Alpaka backend: 'serial_sync') +siPixelTrackSoAMonitorSerial = siPixelPhase1MonitorTrackSoAAlpaka.clone( + pixelTrackSrc = cms.InputTag('pixelTracksAlpakaSerial'), + topFolderName = cms.string('SiPixelHeterogeneous/PixelTrackSerial') +) + +# PixelTracks: monitor of Device product (Alpaka backend: '') +siPixelTrackSoAMonitorDevice = siPixelPhase1MonitorTrackSoAAlpaka.clone( + pixelTrackSrc = cms.InputTag('pixelTracksAlpaka'), + topFolderName = cms.string('SiPixelHeterogeneous/PixelTrackDevice') +) + +# PixelVertices: monitor of CPUSerial product (Alpaka backend: 'serial_sync') 
+siPixelVertexSoAMonitorSerial = siPixelMonitorVertexSoAAlpaka.clone( + pixelVertexSrc = cms.InputTag("pixelVerticesAlpakaSerial"), + topFolderName = cms.string('SiPixelHeterogeneous/PixelVertexSerial') +) + +siPixelVertexSoAMonitorDevice = siPixelMonitorVertexSoAAlpaka.clone( + pixelVertexSrc = cms.InputTag("pixelVerticesAlpaka"), + topFolderName = cms.string('SiPixelHeterogeneous/PixelVertexDevice') +) + # Run-3 sequence monitorpixelSoACompareSource = cms.Sequence(siPixelPhase1MonitorRawDataACPU * siPixelPhase1MonitorRawDataAGPU * @@ -139,6 +199,17 @@ siPixelMonitorVertexSoAGPU * siPixelCompareVertexSoA * siPixelPhase1RawDataErrorComparator) +# and the Alpaka version +monitorpixelSoACompareSourceAlpaka = cms.Sequence( + siPixelRecHitsSoAMonitorSerial * + siPixelRecHitsSoAMonitorDevice * + siPixelPhase1CompareRecHitsSoAAlpaka * + siPixelTrackSoAMonitorSerial * + siPixelTrackSoAMonitorDevice * + siPixelPhase1CompareTrackSoAAlpaka * + siPixelVertexSoAMonitorSerial * + siPixelVertexSoAMonitorDevice * + siPixelCompareVertexSoAAlpaka ) # Phase-2 sequence _monitorpixelSoACompareSource = cms.Sequence(siPixelPhase2MonitorRecHitsSoACPU * @@ -166,3 +237,6 @@ from Configuration.ProcessModifiers.gpuValidationPixel_cff import gpuValidationPixel gpuValidationPixel.toReplaceWith(monitorpixelSoASource, monitorpixelSoACompareSource) + +from Configuration.ProcessModifiers.alpakaValidationPixel_cff import alpakaValidationPixel +(alpakaValidationPixel & ~gpuValidationPixel).toReplaceWith(monitorpixelSoASource, monitorpixelSoACompareSourceAlpaka) diff --git a/DataFormats/TrackSoA/BuildFile.xml b/DataFormats/TrackSoA/BuildFile.xml new file mode 100644 index 0000000000000..ac764cf5b95ff --- /dev/null +++ b/DataFormats/TrackSoA/BuildFile.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/DataFormats/TrackSoA/README.md b/DataFormats/TrackSoA/README.md new file mode 100644 index 0000000000000..433dfb0d656c7 --- /dev/null +++ b/DataFormats/TrackSoA/README.md @@ -0,0 +1,60 @@ +# 
TrackSoA Data Formats + +`DataFormat`s meant to be used on Host (CPU) or Device (GPU) for +storing information about `TrackSoA`s created during the Pixel-local Reconstruction +chain. It stores data in an SoA manner. + +The host format is inheriting from `DataFormats/Portable/interface/PortableHostCollection.h`, +while the device format is inheriting from `DataFormats/Portable/interface/PortableDeviceCollection.h`. + +Both formats use the same SoA Layout (`TrackSoA::Layout`) which is generated +via the `GENERATE_SOA_LAYOUT` macro in the `TracksSoA.h` file. + +## Notes + +- `hitIndices` and `detIndices`, instances of `HitContainer`, have been added into the +layout as `SOA_SCALAR`s, meaning that they manage their own data independently from the SoA +`Layout`. This could be improved in the future, if `HitContainer` (aka a `OneToManyAssoc` of fixed size) +is replaced, but there don't seem to be any conflicts in including it in the `Layout` like this. +- Host and Device classes should **not** be created via inheritance, as they're done here, +but via composition. See [this discussion](https://github.com/cms-sw/cmssw/pull/40465#discussion_r1066039309). + +## TracksHost + +The version of the data format to be used for storing `TrackSoA` data on the CPU. +Instances of this class are to be used for: + +- Having a place to copy data to host from device, via `alpaka::memcpy`, or +- Running host-side algorithms using data stored in an SoA manner. + +## TracksDevice + +The version of the data format to be used for storing `TrackSoA` data on the GPU. + +Instances of `TracksDevice` are to be created on host and be +used on device only. To do so, the instance's `view()` method is to be called +to pass a `View` to any kernel launched. Accessing data from the `view()` is not +possible on the host side. 
+ +## TracksSoACollection + +Depending on the Alpaka accelerator back-end enabled, `TracksSoACollection` is an alias to either the Host or Device SoA: + +```cpp +template + using TracksSoACollection = std::conditional_t, + TracksHost, + TracksDevice>; +``` + +## Utilities + +`alpaka/TrackUtilities.h` contains a collection of methods which were originally +defined as class methods inside either `TrackSoAHeterogeneousT` or `TrajectoryStateSoAT` +which have been adapted to operate on `View` instances, so that they are callable +from within `__global__` kernels, on both CPU and GPU. + +## Use case + +See `test/alpaka/TrackSoAHeterogeneous_test.cc` for a simple example of instantiation, +processing and copying from device to host. diff --git a/DataFormats/TrackSoA/interface/TrackDefinitions.h b/DataFormats/TrackSoA/interface/TrackDefinitions.h new file mode 100644 index 0000000000000..6bd36b5bd3cd1 --- /dev/null +++ b/DataFormats/TrackSoA/interface/TrackDefinitions.h @@ -0,0 +1,32 @@ +#ifndef DataFormats_Track_interface_TrackDefinitions_h +#define DataFormats_Track_interface_TrackDefinitions_h +#include +#include +#include + +namespace pixelTrack { + + enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality }; + constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)}; + constexpr std::string_view qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"}; + inline Quality qualityByName(std::string_view name) { + auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName; + auto ret = static_cast(qp); + + if (ret == pixelTrack::Quality::notQuality) + throw std::invalid_argument(std::string(name) + " is not a pixelTrack::Quality!"); + + return ret; + } + +#ifdef GPU_SMALL_EVENTS + // kept for testing and debugging + constexpr uint32_t maxNumber() { return 2 * 1024; } +#else + // tested on MC events with 55-75 pileup events + constexpr uint32_t maxNumber() { return 32 * 1024; } 
+#endif + +} // namespace pixelTrack + +#endif diff --git a/DataFormats/TrackSoA/interface/TracksDevice.h b/DataFormats/TrackSoA/interface/TracksDevice.h new file mode 100644 index 0000000000000..6ef28014bab63 --- /dev/null +++ b/DataFormats/TrackSoA/interface/TracksDevice.h @@ -0,0 +1,38 @@ +#ifndef DataFormats_Track_interface_TracksDevice_h +#define DataFormats_Track_interface_TracksDevice_h + +#include +#include +#include "DataFormats/TrackSoA/interface/TracksSoA.h" +#include "DataFormats/TrackSoA/interface/TrackDefinitions.h" +#include "DataFormats/Portable/interface/PortableDeviceCollection.h" + +// TODO: The class is created via inheritance of the PortableCollection. +// This is generally discouraged, and should be done via composition. +// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306 +template +class TracksDevice : public PortableDeviceCollection, TDev> { +public: + static constexpr int32_t S = TrackerTraits::maxNumberOfTuples; //TODO: this could be made configurable at runtime + TracksDevice() = default; // necessary for ROOT dictionaries + + using PortableDeviceCollection, TDev>::view; + using PortableDeviceCollection, TDev>::const_view; + using PortableDeviceCollection, TDev>::buffer; + + // Constructor which specifies the SoA size + template + explicit TracksDevice(TQueue& queue) + : PortableDeviceCollection, TDev>(S, queue) {} +}; + +namespace pixelTrack { + + template + using TracksDevicePhase1 = TracksDevice; + template + using TracksDevicePhase2 = TracksDevice; + +} // namespace pixelTrack + +#endif // DataFormats_Track_TracksDevice_H diff --git a/DataFormats/TrackSoA/interface/TracksHost.h b/DataFormats/TrackSoA/interface/TracksHost.h new file mode 100644 index 0000000000000..a8f459eac066c --- /dev/null +++ b/DataFormats/TrackSoA/interface/TracksHost.h @@ -0,0 +1,42 @@ +#ifndef DataFormats_Track_TracksHost_H +#define DataFormats_Track_TracksHost_H + +#include +#include +#include 
"Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "DataFormats/TrackSoA/interface/TracksSoA.h" +#include "DataFormats/TrackSoA/interface/TrackDefinitions.h" +#include "DataFormats/Portable/interface/PortableHostCollection.h" + +// TODO: The class is created via inheritance of the PortableHostCollection. +// This is generally discouraged, and should be done via composition. +// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306 +template +class TracksHost : public PortableHostCollection> { +public: + static constexpr int32_t S = TrackerTraits::maxNumberOfTuples; //TODO: this could be made configurable at runtime + TracksHost() = default; // Needed for the dictionary; not sure if line above is needed anymore + + using PortableHostCollection>::view; + using PortableHostCollection>::const_view; + using PortableHostCollection>::buffer; + + // Constructor which specifies the SoA size + template + explicit TracksHost(TQueue& queue) + : PortableHostCollection>(S, queue) {} + + // Constructor which specifies the DevHost + explicit TracksHost(alpaka_common::DevHost const& host) + : PortableHostCollection>(S, host) {} +}; + +namespace pixelTrack { + + using TracksHostPhase1 = TracksHost; + using TracksHostPhase2 = TracksHost; + using TracksHostHIonPhase1 = TracksHost; + +} // namespace pixelTrack + +#endif // DataFormats_Track_TracksHost_H diff --git a/DataFormats/TrackSoA/interface/TracksSoA.h b/DataFormats/TrackSoA/interface/TracksSoA.h new file mode 100644 index 0000000000000..bc3a8c4be9cb5 --- /dev/null +++ b/DataFormats/TrackSoA/interface/TracksSoA.h @@ -0,0 +1,56 @@ +#ifndef DataFormats_Track_interface_TrackLayout_h +#define DataFormats_Track_interface_TrackLayout_h + +#include +#include "HeterogeneousCore/AlpakaInterface/interface/OneToManyAssoc.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" +#include 
"DataFormats/TrackSoA/interface/TrackDefinitions.h" + +namespace reco { + + template + struct TrackSoA { + static constexpr int32_t S = TrackerTraits::maxNumberOfTuples; + static constexpr int32_t H = TrackerTraits::avgHitsPerTrack; + // Aliases in order to not confuse the GENERATE_SOA_LAYOUT + // macro with weird colons and angled brackets. + using Vector5f = Eigen::Matrix; + using Vector15f = Eigen::Matrix; + using Quality = pixelTrack::Quality; + + using hindex_type = uint32_t; + + using HitContainer = cms::alpakatools::OneToManyAssocSequential; + + GENERATE_SOA_LAYOUT(Layout, + SOA_COLUMN(Quality, quality), + SOA_COLUMN(float, chi2), + SOA_COLUMN(int8_t, nLayers), + SOA_COLUMN(float, eta), + SOA_COLUMN(float, pt), + SOA_EIGEN_COLUMN(Vector5f, state), + SOA_EIGEN_COLUMN(Vector15f, covariance), + SOA_SCALAR(int, nTracks), + SOA_SCALAR(HitContainer, hitIndices), + SOA_SCALAR(HitContainer, detIndices)) + }; + + template + using TrackLayout = typename reco::TrackSoA::template Layout<>; + template + using TrackSoAView = typename reco::TrackSoA::template Layout<>::View; + template + using TrackSoAConstView = typename reco::TrackSoA::template Layout<>::ConstView; + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr float charge(const TrackSoAConstView &tracks, + int32_t i) { + //was: std::copysign(1.f, tracks[i].state()(2)). 
Will be constexpr with C++23 + float v = tracks[i].state()(2); + return float((0.0f < v) - (v < 0.0f)); + } + +} // namespace reco + +#endif diff --git a/DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h b/DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h new file mode 100644 index 0000000000000..8affb29845779 --- /dev/null +++ b/DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h @@ -0,0 +1,197 @@ +#ifndef DataFormats_Track_interface_alpaka_TrackUtilities_h +#define DataFormats_Track_interface_alpaka_TrackUtilities_h + +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "DataFormats/TrackSoA/interface/TrackDefinitions.h" +#include "DataFormats/TrackSoA/interface/TracksSoA.h" + +// Methods that operate on View and ConstView of the TrackSoA, and cannot be class methods. +template +struct TracksUtilities { + using TrackSoAView = typename reco::TrackSoA::template Layout<>::View; + using TrackSoAConstView = typename reco::TrackSoA::template Layout<>::ConstView; + using hindex_type = typename reco::TrackSoA::hindex_type; + + // State at the Beam spot + // phi,tip,1/pt,cotan(theta),zip + /* ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr float charge(const TrackSoAConstView &tracks, int32_t i) { + //was: std::copysign(1.f, tracks[i].state()(2)). 
Will be constexpr with C++23 + float v = tracks[i].state()(2); + return float((0.0f < v) - (v < 0.0f)); + } +*/ + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr float phi(const TrackSoAConstView &tracks, int32_t i) { + return tracks[i].state()(0); + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr float tip(const TrackSoAConstView &tracks, int32_t i) { + return tracks[i].state()(1); + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr float zip(const TrackSoAConstView &tracks, int32_t i) { + return tracks[i].state()(4); + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr bool isTriplet(const TrackSoAConstView &tracks, int i) { + return tracks[i].nLayers() == 3; + } + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr void copyFromCircle( + TrackSoAView &tracks, V3 const &cp, M3 const &ccov, V2 const &lp, M2 const &lcov, float b, int32_t i) { + tracks[i].state() << cp.template cast(), lp.template cast(); + + tracks[i].state()(2) = tracks[i].state()(2) * b; + auto cov = tracks[i].covariance(); + cov(0) = ccov(0, 0); + cov(1) = ccov(0, 1); + cov(2) = b * float(ccov(0, 2)); + cov(4) = cov(3) = 0; + cov(5) = ccov(1, 1); + cov(6) = b * float(ccov(1, 2)); + cov(8) = cov(7) = 0; + cov(9) = b * b * float(ccov(2, 2)); + cov(11) = cov(10) = 0; + cov(12) = lcov(0, 0); + cov(13) = lcov(0, 1); + cov(14) = lcov(1, 1); + } + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr void copyFromDense(TrackSoAView &tracks, + V5 const &v, + M5 const &cov, + int32_t i) { + tracks[i].state() = v.template cast(); + for (int j = 0, ind = 0; j < 5; ++j) + for (auto k = j; k < 5; ++k) + tracks[i].covariance()(ind++) = cov(j, k); + } + + template + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr void copyToDense(const TrackSoAConstView &tracks, + V5 &v, + M5 &cov, + int32_t i) { + v = tracks[i].state().template cast(); + for (int j = 0, ind = 0; j < 5; ++j) { + cov(j, j) = tracks[i].covariance()(ind++); + for (auto k = j + 1; k < 
5; ++k) + cov(k, j) = cov(j, k) = tracks[i].covariance()(ind++); + } + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr int computeNumberOfLayers(const TrackSoAConstView &tracks, + int32_t i) { + auto pdet = tracks.detIndices().begin(i); + int nl = 1; + auto ol = pixelTopology::getLayer(*pdet); + for (; pdet < tracks.detIndices().end(i); ++pdet) { + auto il = pixelTopology::getLayer(*pdet); + if (il != ol) + ++nl; + ol = il; + } + return nl; + } + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE static constexpr int nHits(const TrackSoAConstView &tracks, int i) { + return tracks.detIndices().size(i); + } +}; + +namespace pixelTrack { + + template + struct QualityCutsT {}; + + template + struct QualityCutsT> { + using TrackSoAView = typename reco::TrackSoA::template Layout<>::View; + using TrackSoAConstView = typename reco::TrackSoA::template Layout<>::ConstView; + using tracksHelper = TracksUtilities; + float chi2Coeff[4]; + float chi2MaxPt; // GeV + float chi2Scale; + + struct Region { + float maxTip; // cm + float minPt; // GeV + float maxZip; // cm + }; + + Region triplet; + Region quadruplet; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool isHP(const TrackSoAConstView &tracks, int nHits, int it) const { + // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) + // default cuts: + // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm + // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm + // (see CAHitNtupletGeneratorGPU.cc) + auto const ®ion = (nHits > 3) ? 
quadruplet : triplet; + return (std::abs(tracksHelper::tip(tracks, it)) < region.maxTip) and (tracks.pt(it) > region.minPt) and + (std::abs(tracksHelper::zip(tracks, it)) < region.maxZip); + } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool strictCut(const TrackSoAConstView &tracks, int it) const { + auto roughLog = [](float x) { + // max diff [0.5,12] at 1.25 0.16143 + // average diff 0.0662998 + union IF { + uint32_t i; + float f; + }; + IF z; + z.f = x; + uint32_t lsb = 1 < 21; + z.i += lsb; + z.i >>= 21; + auto f = z.i & 3; + int ex = int(z.i >> 2) - 127; + + // log2(1+0.25*f) + // averaged over bins + const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f}; + return float(ex) + frac[f]; + }; + + float pt = std::min(tracks.pt(it), chi2MaxPt); + float chi2Cut = chi2Scale * (chi2Coeff[0] + roughLog(pt) * chi2Coeff[1]); + if (tracks.chi2(it) >= chi2Cut) { +#ifdef NTUPLE_FIT_DEBUG + printf("Bad chi2 %d pt %f eta %f chi2 %f\n", it, tracks.pt(it), tracks.eta(it), tracks.chi2(it)); +#endif + return true; + } + return false; + } + }; + + template + struct QualityCutsT> { + using TrackSoAView = typename reco::TrackSoA::template Layout<>::View; + using TrackSoAConstView = typename reco::TrackSoA::template Layout<>::ConstView; + using tracksHelper = TracksUtilities; + + float maxChi2; + float minPt; + float maxTip; + float maxZip; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool isHP(const TrackSoAConstView &tracks, int nHits, int it) const { + return (std::abs(tracksHelper::tip(tracks, it)) < maxTip) and (tracks.pt(it) > minPt) and + (std::abs(tracksHelper::zip(tracks, it)) < maxZip); + } + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool strictCut(const TrackSoAConstView &tracks, int it) const { + return tracks.chi2(it) >= maxChi2; + } + }; + +} // namespace pixelTrack + +// TODO: Should those be placed in the ALPAKA_ACCELERATOR_NAMESPACE +template struct TracksUtilities; +template struct TracksUtilities; + +#endif diff --git a/DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h 
b/DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h new file mode 100644 index 0000000000000..62e9f69e34636 --- /dev/null +++ b/DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h @@ -0,0 +1,52 @@ +#ifndef DataFormats_Track_interface_alpaka_TracksSoACollection_h +#define DataFormats_Track_interface_alpaka_TracksSoACollection_h + +#include +#include +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "DataFormats/Portable/interface/alpaka/PortableCollection.h" +#include "DataFormats/TrackSoA/interface/TracksSoA.h" +#include "DataFormats/TrackSoA/interface/TrackDefinitions.h" +#include "DataFormats/TrackSoA/interface/TracksHost.h" +#include "DataFormats/TrackSoA/interface/TracksDevice.h" +#include "HeterogeneousCore/AlpakaInterface/interface/CopyToHost.h" + +// TODO: The class is created via inheritance of the PortableCollection. +// This is generally discouraged, and should be done via composition. +// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306 + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + template + using TracksSoACollection = std::conditional_t, + TracksHost, + TracksDevice>; + + //Classes definition for Phase1/Phase2/HIonPhase1, to make the classes_def lighter. Not actually used in the code. 
+ namespace pixelTrack { + using TracksSoACollectionPhase1 = TracksSoACollection; + using TracksSoACollectionPhase2 = TracksSoACollection; + using TracksSoACollectionHIonPhase1 = TracksSoACollection; + } // namespace pixelTrack +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +namespace cms::alpakatools { + template + struct CopyToHost> { + template + static auto copyAsync(TQueue& queue, TracksDevice const& deviceData) { + ::TracksHost hostData(queue); + alpaka::memcpy(queue, hostData.buffer(), deviceData.buffer()); +#ifdef GPU_DEBUG + printf("TracksSoACollection: I'm copying to host.\n"); +#endif + return hostData; + } + }; +} // namespace cms::alpakatools + +ASSERT_DEVICE_MATCHES_HOST_COLLECTION(pixelTrack::TracksSoACollectionPhase1, pixelTrack::TracksHostPhase1); +ASSERT_DEVICE_MATCHES_HOST_COLLECTION(pixelTrack::TracksSoACollectionPhase2, pixelTrack::TracksHostPhase2); +ASSERT_DEVICE_MATCHES_HOST_COLLECTION(pixelTrack::TracksSoACollectionHIonPhase1, pixelTrack::TracksHostHIonPhase1); + +#endif // DataFormats_Track_interface_alpaka_TracksSoACollection_h diff --git a/DataFormats/TrackSoA/src/alpaka/classes_cuda.h b/DataFormats/TrackSoA/src/alpaka/classes_cuda.h new file mode 100644 index 0000000000000..4783184611401 --- /dev/null +++ b/DataFormats/TrackSoA/src/alpaka/classes_cuda.h @@ -0,0 +1,14 @@ + +#ifndef DataFormats_TrackSoA_src_alpaka_classes_cuda_h +#define DataFormats_TrackSoA_src_alpaka_classes_cuda_h + +#include "DataFormats/Common/interface/DeviceProduct.h" +#include "DataFormats/Common/interface/Wrapper.h" +#include "DataFormats/TrackSoA/interface/TracksSoA.h" +#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h" +#include "DataFormats/TrackSoA/interface/TracksDevice.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" + +using namespace reco; + +#endif // DataFormats_TrackSoA_src_alpaka_classes_cuda_h diff --git a/DataFormats/TrackSoA/src/alpaka/classes_cuda_def.xml b/DataFormats/TrackSoA/src/alpaka/classes_cuda_def.xml 
new file mode 100644 index 0000000000000..c04ca173c49f9 --- /dev/null +++ b/DataFormats/TrackSoA/src/alpaka/classes_cuda_def.xml @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/DataFormats/TrackSoA/src/alpaka/classes_rocm.h b/DataFormats/TrackSoA/src/alpaka/classes_rocm.h new file mode 100644 index 0000000000000..38143a6058c36 --- /dev/null +++ b/DataFormats/TrackSoA/src/alpaka/classes_rocm.h @@ -0,0 +1,14 @@ + +#ifndef DataFormats_TrackSoA_src_alpaka_classes_rocm_h +#define DataFormats_TrackSoA_src_alpaka_classes_rocm_h + +#include "DataFormats/Common/interface/DeviceProduct.h" +#include "DataFormats/Common/interface/Wrapper.h" +#include "DataFormats/TrackSoA/interface/TracksSoA.h" +#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h" +#include "DataFormats/TrackSoA/interface/TracksDevice.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" + +using namespace reco; + +#endif // DataFormats_TrackSoA_src_alpaka_classes_rocm_h diff --git a/DataFormats/TrackSoA/src/alpaka/classes_rocm_def.xml b/DataFormats/TrackSoA/src/alpaka/classes_rocm_def.xml new file mode 100644 index 0000000000000..b7e40aedead42 --- /dev/null +++ b/DataFormats/TrackSoA/src/alpaka/classes_rocm_def.xml @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/DataFormats/TrackSoA/src/classes.cc b/DataFormats/TrackSoA/src/classes.cc new file mode 100644 index 0000000000000..97e00cc5b5638 --- /dev/null +++ b/DataFormats/TrackSoA/src/classes.cc @@ -0,0 +1,9 @@ +#include "DataFormats/Portable/interface/PortableHostCollectionReadRules.h" +#include "DataFormats/TrackSoA/interface/TracksSoA.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + +using namespace reco; + +SET_PORTABLEHOSTCOLLECTION_READ_RULES(PortableHostCollection>); +SET_PORTABLEHOSTCOLLECTION_READ_RULES(PortableHostCollection>); +// SET_PORTABLEHOSTCOLLECTION_READ_RULES(PortableHostCollection>); //TODO: For the moment we live without HIons diff --git a/DataFormats/TrackSoA/src/classes.h 
b/DataFormats/TrackSoA/src/classes.h new file mode 100644 index 0000000000000..43d40e5f8f3ac --- /dev/null +++ b/DataFormats/TrackSoA/src/classes.h @@ -0,0 +1,11 @@ +#ifndef DataFormats_TrackSoA_src_classes_h +#define DataFormats_TrackSoA_src_classes_h + +#include "DataFormats/Common/interface/Wrapper.h" +#include "DataFormats/TrackSoA/interface/TracksSoA.h" +#include "DataFormats/TrackSoA/interface/TracksHost.h" + +using namespace pixelTopology; +using namespace reco; + +#endif // DataFormats_TrackSoA_src_classes_h diff --git a/DataFormats/TrackSoA/src/classes_def.xml b/DataFormats/TrackSoA/src/classes_def.xml new file mode 100644 index 0000000000000..fd8fc0781ee25 --- /dev/null +++ b/DataFormats/TrackSoA/src/classes_def.xml @@ -0,0 +1,34 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/DataFormats/TrackSoA/test/BuildFile.xml b/DataFormats/TrackSoA/test/BuildFile.xml new file mode 100644 index 0000000000000..ce2b273d90577 --- /dev/null +++ b/DataFormats/TrackSoA/test/BuildFile.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.cc b/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.cc new file mode 100644 index 0000000000000..f4af0688ca1bf --- /dev/null +++ b/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.cc @@ -0,0 +1,82 @@ +/** + Simple test for the pixelTrack::TrackSoA data structure + which inherits from PortableDeviceCollection. + + Creates an instance of the class (automatically allocates + memory on device), passes the view of the SoA data to + the CUDA kernels which: + - Fill the SoA with data. + - Verify that the data written is correct. + + Then, the SoA data are copied back to Host, where + a temporary host-side view (tmp_view) is created using + the same Layout to access the data on host and print it. 
+ */ + +#include +#include +#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h" +#include "DataFormats/TrackSoA/interface/TracksDevice.h" +#include "DataFormats/TrackSoA/interface/TracksHost.h" +#include "HeterogeneousCore/AlpakaInterface/interface/devices.h" +#include "HeterogeneousCore/AlpakaInterface/interface/host.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + +using namespace std; +using namespace reco; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; +using namespace ALPAKA_ACCELERATOR_NAMESPACE::pixelTrack; + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace testTrackSoA { + + template + void runKernels(TrackSoAView tracks_view, Queue& queue); + } +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +int main() { + const auto host = cms::alpakatools::host(); + const auto device = cms::alpakatools::devices()[0]; + Queue queue(device); + + // Inner scope to deallocate memory before destroying the stream + { + // Instantiate tracks on device. PortableDeviceCollection allocates + // SoA on device automatically. + TracksSoACollection tracks_d(queue); + testTrackSoA::runKernels(tracks_d.view(), queue); + + // Instantate tracks on host. This is where the data will be + // copied to from device. 
+ TracksHost tracks_h(queue); + + std::cout << tracks_h.view().metadata().size() << std::endl; + alpaka::memcpy(queue, tracks_h.buffer(), tracks_d.const_buffer()); + alpaka::wait(queue); + + // Print results + std::cout << "pt" + << "\t" + << "eta" + << "\t" + << "chi2" + << "\t" + << "quality" + << "\t" + << "nLayers" + << "\t" + << "hitIndices off" << std::endl; + + for (int i = 0; i < 10; ++i) { + std::cout << tracks_h.view()[i].pt() << "\t" << tracks_h.view()[i].eta() << "\t" << tracks_h.view()[i].chi2() + << "\t" << (int)tracks_h.view()[i].quality() << "\t" << (int)tracks_h.view()[i].nLayers() << "\t" + << tracks_h.view().hitIndices().off[i] << std::endl; + } + } + + return 0; +} diff --git a/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.dev.cc b/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.dev.cc new file mode 100644 index 0000000000000..2c2d0961eb106 --- /dev/null +++ b/DataFormats/TrackSoA/test/alpaka/TrackSoAHeterogeneous_test.dev.cc @@ -0,0 +1,74 @@ +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "DataFormats/TrackSoA/interface/TrackDefinitions.h" +#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h" +#include "DataFormats/TrackSoA/interface/TracksDevice.h" +#include "DataFormats/TrackSoA/interface/TracksHost.h" + +using namespace reco; + +using Quality = pixelTrack::Quality; +namespace ALPAKA_ACCELERATOR_NAMESPACE { + using namespace cms::alpakatools; + namespace testTrackSoA { + + // Kernel which fills the TrackSoAView with data + // to test writing to it + template + class TestFillKernel { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, TrackSoAView tracks_view) const { + if (cms::alpakatools::once_per_grid(acc)) { + tracks_view.nTracks() = 420; + } + + for (int32_t j : elements_with_stride(acc, tracks_view.metadata().size())) { + tracks_view[j].pt() = (float)j; + tracks_view[j].eta() = (float)j; + tracks_view[j].chi2() = (float)j; + 
tracks_view[j].quality() = (Quality)(j % 256); + tracks_view[j].nLayers() = j % 128; + tracks_view.hitIndices().off[j] = j; + } + } + }; + + // Kernel which reads from the TrackSoAView to verify + // that it was written correctly from the fill kernel + template + class TestVerifyKernel { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, TrackSoAConstView tracks_view) const { + if (cms::alpakatools::once_per_grid(acc)) { + ALPAKA_ASSERT_OFFLOAD(tracks_view.nTracks() == 420); + } + for (int32_t j : elements_with_stride(acc, tracks_view.nTracks())) { + assert(abs(tracks_view[j].pt() - (float)j) < .0001); + assert(abs(tracks_view[j].eta() - (float)j) < .0001); + assert(abs(tracks_view[j].chi2() - (float)j) < .0001); + assert(tracks_view[j].quality() == (Quality)(j % 256)); + assert(tracks_view[j].nLayers() == j % 128); + assert(tracks_view.hitIndices().off[j] == uint32_t(j)); + } + } + }; + + // Host function which invokes the two kernels above + template + void runKernels(TrackSoAView tracks_view, Queue& queue) { + uint32_t items = 64; + uint32_t groups = divide_up_by(tracks_view.metadata().size(), items); + auto workDiv = make_workdiv(groups, items); + alpaka::exec(queue, workDiv, TestFillKernel{}, tracks_view); + alpaka::exec(queue, + workDiv, + TestVerifyKernel{}, + tracks_view); //TODO: wait for some PR that solves this and then check it!!! 
+ } + + template void runKernels(TrackSoAView tracks_view, Queue& queue); + template void runKernels(TrackSoAView tracks_view, Queue& queue); + + } // namespace testTrackSoA +} // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/DataFormats/VertexSoA/BuildFile.xml b/DataFormats/VertexSoA/BuildFile.xml new file mode 100644 index 0000000000000..af53fc68f5a45 --- /dev/null +++ b/DataFormats/VertexSoA/BuildFile.xml @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/DataFormats/VertexSoA/README.md b/DataFormats/VertexSoA/README.md new file mode 100644 index 0000000000000..54172eda14281 --- /dev/null +++ b/DataFormats/VertexSoA/README.md @@ -0,0 +1,45 @@ +# Vertex Portable Data Formats + +`DataFormat`s meant to be used on Host (CPU) or Device (GPU) for +storing information about vertices created during the Pixel-local Reconstruction +chain. It stores data in an SoA manner. It contains the data that was previously +contained in the deprecated `ZVertexSoA` class. + +The host format is inheriting from `DataFormats/Portable/interface/PortableHostCollection.h`, +while the device format is inheriting from `DataFormats/Portable/interface/PortableDeviceCollection.h`. + +Both formats use the same SoA Layout (`ZVertexLayout`) which is generated +via the `GENERATE_SOA_LAYOUT` macro in the `ZVertexUtilities.h` file. + +## Notes + +- Initially, `ZVertexSoA` had distinct array sizes for each attribute (e.g. `zv` was `MAXVTX` elements +long, `ndof` was `MAXTRACKS` elements long). All columns are now of uniform `MAXTRACKS` size, +meaning that there will be some wasted space (approx. 190kB). +- Host and Device classes should **not** be created via inheritance, as they're done here, +but via composition. See [this discussion](https://github.com/cms-sw/cmssw/pull/40465#discussion_r1066039309). + +## ZVertexHost + +The version of the data format to be used for storing vertex data on the CPU. 
+Instances of this class are to be used for: + +- Having a place to copy data to host from device, via `cudaMemcpy`, or +- Running host-side algorithms using data stored in an SoA manner. + +## ZVertexHeterogeneousDevice + +The version of the data format to be used for storing vertex data on the GPU. + +Instances of `ZVertexHeterogeneousDevice` are to be created on host and be +used on device only. To do so, the instance's `view()` method is to be called +to pass a `View` to any kernel launched. Accessing data from the `view()` is not +possible on the host side. + +## Utilities + +Apart from `ZVertexLayout`, `ZVertexUtilities.h` also contains +a collection of methods which were originally +defined as class methods inside the `ZVertexSoA` class +which have been adapted to operate on `View` instances, so that they are callable +from within `__global__` kernels, on both CPU and GPU. diff --git a/DataFormats/VertexSoA/interface/ZVertexDefinitions.h b/DataFormats/VertexSoA/interface/ZVertexDefinitions.h new file mode 100644 index 0000000000000..028668d1ff52a --- /dev/null +++ b/DataFormats/VertexSoA/interface/ZVertexDefinitions.h @@ -0,0 +1,13 @@ +#ifndef DataFormats_VertexSoA_ZVertexDefinitions_h +#define DataFormats_VertexSoA_ZVertexDefinitions_h + +#include + +namespace zVertex { + + constexpr uint32_t MAXTRACKS = 32 * 1024; + constexpr uint32_t MAXVTX = 1024; + +} // namespace zVertex + +#endif diff --git a/DataFormats/VertexSoA/interface/ZVertexDevice.h b/DataFormats/VertexSoA/interface/ZVertexDevice.h new file mode 100644 index 0000000000000..8d120ae190f3c --- /dev/null +++ b/DataFormats/VertexSoA/interface/ZVertexDevice.h @@ -0,0 +1,26 @@ +#ifndef DataFormats_VertexSoA_interface_ZVertexDevice_h +#define DataFormats_VertexSoA_interface_ZVertexDevice_h + +#include + +#include +#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" +#include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h" +#include "DataFormats/VertexSoA/interface/ZVertexHost.h" +#include
"DataFormats/Portable/interface/PortableDeviceCollection.h" + +template +class ZVertexDeviceSoA : public PortableDeviceCollection, TDev> { +public: + ZVertexDeviceSoA() = default; // necessary for ROOT dictionaries + + // Constructor which specifies the SoA size + template + explicit ZVertexDeviceSoA(TQueue queue) : PortableDeviceCollection, TDev>(S, queue) {} +}; + +using namespace ::zVertex; +template +using ZVertexDevice = ZVertexDeviceSoA; + +#endif // DataFormats_VertexSoA_interface_ZVertexDevice_h diff --git a/DataFormats/VertexSoA/interface/ZVertexHost.h b/DataFormats/VertexSoA/interface/ZVertexHost.h new file mode 100644 index 0000000000000..2d72b83bfe385 --- /dev/null +++ b/DataFormats/VertexSoA/interface/ZVertexHost.h @@ -0,0 +1,29 @@ +#ifndef DataFormats_VertexSoA_ZVertexHost_H +#define DataFormats_VertexSoA_ZVertexHost_H + +#include + +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" +#include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h" +#include "DataFormats/Portable/interface/PortableHostCollection.h" + +template +class ZVertexHostSoA : public PortableHostCollection { +public: + ZVertexHostSoA() = default; + + // Constructor which specifies the queue + template + explicit ZVertexHostSoA(TQueue queue) : PortableHostCollection(S, queue) {} + + // Constructor which specifies the DevHost + explicit ZVertexHostSoA(alpaka_common::DevHost const& host) : PortableHostCollection(S, host) {} +}; + +//using namespace ::zVertex; +using ZVertexHost = ZVertexHostSoA; + +#endif // DataFormats_VertexSoA_ZVertexHost_H diff --git a/DataFormats/VertexSoA/interface/ZVertexSoA.h b/DataFormats/VertexSoA/interface/ZVertexSoA.h new file mode 100644 index 0000000000000..045603618acd7 --- /dev/null +++ b/DataFormats/VertexSoA/interface/ZVertexSoA.h @@ -0,0 +1,31 @@ +#ifndef DataFormats_VertexSoA_interface_ZVertexSoA_h +#define DataFormats_VertexSoA_interface_ZVertexSoA_h + +#include + 
+#include + +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +namespace reco { + + GENERATE_SOA_LAYOUT(ZVertexLayout, + SOA_COLUMN(int16_t, idv), + SOA_COLUMN(float, zv), + SOA_COLUMN(float, wv), + SOA_COLUMN(float, chi2), + SOA_COLUMN(float, ptv2), + SOA_COLUMN(int32_t, ndof), + SOA_COLUMN(uint16_t, sortInd), + SOA_SCALAR(uint32_t, nvFinal)) + + // Common types for both Host and Device code + using ZVertexSoA = ZVertexLayout<>; + using ZVertexSoAView = ZVertexSoA::View; + using ZVertexSoAConstView = ZVertexSoA::ConstView; + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void init(ZVertexSoAView &vertices) { vertices.nvFinal() = 0; } + +} // namespace reco + +#endif // DataFormats_VertexSoA_interface_ZVertexSoA_h diff --git a/DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h b/DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h new file mode 100644 index 0000000000000..636a07e2bd978 --- /dev/null +++ b/DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h @@ -0,0 +1,39 @@ +#ifndef DataFormats_VertexSoA_interface_ZVertexSoACollection_h +#define DataFormats_VertexSoA_interface_ZVertexSoACollection_h + +#include + +#include +#include "DataFormats/Portable/interface/alpaka/PortableCollection.h" +#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" +#include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h" +#include "DataFormats/VertexSoA/interface/ZVertexHost.h" +#include "DataFormats/VertexSoA/interface/ZVertexDevice.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/CopyToHost.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + using ZVertexSoACollection = + std::conditional_t, ZVertexHost, ZVertexDevice>; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +namespace cms::alpakatools { + template + struct CopyToHost> { + template + static auto copyAsync(TQueue& queue, ZVertexDevice const& deviceData) { + ZVertexHost hostData(queue); + alpaka::memcpy(queue, 
hostData.buffer(), deviceData.buffer()); +#ifdef GPU_DEBUG + printf("ZVertexSoACollection: I'm copying to host.\n"); +#endif + return hostData; + } + }; +} // namespace cms::alpakatools + +ASSERT_DEVICE_MATCHES_HOST_COLLECTION(ZVertexSoACollection, ZVertexHost); + +#endif // DataFormats_VertexSoA_interface_ZVertexSoACollection_h diff --git a/DataFormats/VertexSoA/src/alpaka/classes_cuda.h b/DataFormats/VertexSoA/src/alpaka/classes_cuda.h new file mode 100644 index 0000000000000..e76f6ca1365c1 --- /dev/null +++ b/DataFormats/VertexSoA/src/alpaka/classes_cuda.h @@ -0,0 +1,10 @@ +#ifndef DataFormats_VertexSoA_src_alpaka_classes_cuda_h +#define DataFormats_VertexSoA_src_alpaka_classes_cuda_h + +#include "DataFormats/Common/interface/DeviceProduct.h" +#include "DataFormats/Common/interface/Wrapper.h" +#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" +#include "DataFormats/VertexSoA/interface//ZVertexDevice.h" +#include "DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h" + +#endif // DataFormats_VertexSoA_src_alpaka_classes_cuda_h diff --git a/DataFormats/VertexSoA/src/alpaka/classes_cuda_def.xml b/DataFormats/VertexSoA/src/alpaka/classes_cuda_def.xml new file mode 100644 index 0000000000000..606937a5bd3e5 --- /dev/null +++ b/DataFormats/VertexSoA/src/alpaka/classes_cuda_def.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/DataFormats/VertexSoA/src/alpaka/classes_rocm.h b/DataFormats/VertexSoA/src/alpaka/classes_rocm.h new file mode 100644 index 0000000000000..f5ea845c028b1 --- /dev/null +++ b/DataFormats/VertexSoA/src/alpaka/classes_rocm.h @@ -0,0 +1,9 @@ +#ifndef DataFormats_VertexSoA_src_alpaka_classes_rocm_h +#define DataFormats_VertexSoA_src_alpaka_classes_rocm_h + +#include "DataFormats/Common/interface/DeviceProduct.h" +#include "DataFormats/Common/interface/Wrapper.h" +#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" +#include "DataFormats/VertexSoA/interface//ZVertexDevice.h" +#include 
"DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h" +#endif // DataFormats_VertexSoA_src_alpaka_classes_rocm_h diff --git a/DataFormats/VertexSoA/src/alpaka/classes_rocm_def.xml b/DataFormats/VertexSoA/src/alpaka/classes_rocm_def.xml new file mode 100644 index 0000000000000..94deb6fff7d61 --- /dev/null +++ b/DataFormats/VertexSoA/src/alpaka/classes_rocm_def.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/DataFormats/VertexSoA/src/classes.cc b/DataFormats/VertexSoA/src/classes.cc new file mode 100644 index 0000000000000..edffb6e08a9e5 --- /dev/null +++ b/DataFormats/VertexSoA/src/classes.cc @@ -0,0 +1,4 @@ +#include "DataFormats/Portable/interface/PortableHostCollectionReadRules.h" +#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" + +SET_PORTABLEHOSTCOLLECTION_READ_RULES(PortableHostCollection); diff --git a/DataFormats/VertexSoA/src/classes.h b/DataFormats/VertexSoA/src/classes.h new file mode 100644 index 0000000000000..883182c01dcf9 --- /dev/null +++ b/DataFormats/VertexSoA/src/classes.h @@ -0,0 +1,8 @@ +#ifndef DataFormats_VertexSoA_src_classes_h +#define DataFormats_VertexSoA_src_classes_h + +#include "DataFormats/Common/interface/Wrapper.h" +#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" +#include "DataFormats/VertexSoA/interface/ZVertexHost.h" + +#endif // DataFormats_VertexSoA_src_classes_h diff --git a/DataFormats/VertexSoA/src/classes_def.xml b/DataFormats/VertexSoA/src/classes_def.xml new file mode 100644 index 0000000000000..820d28ecc3493 --- /dev/null +++ b/DataFormats/VertexSoA/src/classes_def.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/DataFormats/VertexSoA/test/BuildFile.xml b/DataFormats/VertexSoA/test/BuildFile.xml new file mode 100644 index 0000000000000..49dee4babd8a1 --- /dev/null +++ b/DataFormats/VertexSoA/test/BuildFile.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.cc b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.cc new file mode 100644 index 
0000000000000..0c0c8e8591df9 --- /dev/null +++ b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.cc @@ -0,0 +1,82 @@ +/** + Simple test for the reco::ZVertexSoA data structure + which inherits from Portable{Host}Collection. + + Creates an instance of the class (automatically allocates + memory on device), passes the view of the SoA data to + the kernels which: + - Fill the SoA with data. + - Verify that the data written is correct. + + Then, the SoA data are copied back to Host, where + a temporary host-side view (tmp_view) is created using + the same Layout to access the data on host and print it. + */ + +#include +#include +#include "DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h" +#include "DataFormats/VertexSoA/interface/ZVertexDevice.h" +#include "DataFormats/VertexSoA/interface/ZVertexHost.h" +#include "HeterogeneousCore/AlpakaInterface/interface/devices.h" +#include "HeterogeneousCore/AlpakaInterface/interface/host.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +using namespace std; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; +using namespace reco; + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace testZVertexSoAT { + void runKernels(ZVertexSoAView zvertex_view, Queue& queue); + } +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +int main() { + const auto host = cms::alpakatools::host(); + const auto device = cms::alpakatools::devices()[0]; + Queue queue(device); + + // Inner scope to deallocate memory before destroying the stream + { + // Instantiate vertices on device. PortableCollection allocates + // SoA on device automatically. + ZVertexSoACollection zvertex_d(queue); + testZVertexSoAT::runKernels(zvertex_d.view(), queue); + + // Instantate vertices on host. This is where the data will be + // copied to from device. 
+ ZVertexHost zvertex_h(queue); + std::cout << zvertex_h.view().metadata().size() << std::endl; + alpaka::memcpy(queue, zvertex_h.buffer(), zvertex_d.const_buffer()); + alpaka::wait(queue); + + // Print results + std::cout << "idv" + << "\t" + << "zv" + << "\t" + << "wv" + << "\t" + << "chi2" + << "\t" + << "ptv2" + << "\t" + << "ndof" + << "\t" + << "sortInd" + << "\t" + << "nvFinal" << std::endl; + + for (int i = 0; i < 10; ++i) { + std::cout << (int)zvertex_h.view()[i].idv() << "\t" << zvertex_h.view()[i].zv() << "\t" + << zvertex_h.view()[i].wv() << "\t" << zvertex_h.view()[i].chi2() << "\t" << zvertex_h.view()[i].ptv2() + << "\t" << (int)zvertex_h.view()[i].ndof() << "\t" << (int)zvertex_h.view()[i].sortInd() << "\t" + << (int)zvertex_h.view().nvFinal() << std::endl; + } + } + + return 0; +} diff --git a/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.dev.cc b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.dev.cc new file mode 100644 index 0000000000000..1b22159a53b88 --- /dev/null +++ b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.dev.cc @@ -0,0 +1,62 @@ +#include "DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h" +#include "DataFormats/VertexSoA/interface/ZVertexDevice.h" +#include "DataFormats/VertexSoA/interface/ZVertexHost.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" // Check if this is really needed; code doesn't compile without it + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + using namespace alpaka; + using namespace cms::alpakatools; + + namespace testZVertexSoAT { + + class TestFillKernel { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, reco::ZVertexSoAView zvertex_view) const { + if (cms::alpakatools::once_per_grid(acc)) { + zvertex_view.nvFinal() = 420; + } + + for (int32_t j : elements_with_stride(acc, zvertex_view.metadata().size())) { + zvertex_view[j].idv() = (int16_t)j; + zvertex_view[j].zv() = (float)j; + zvertex_view[j].wv() = (float)j; + zvertex_view[j].chi2() = 
(float)j; + zvertex_view[j].ptv2() = (float)j; + zvertex_view[j].ndof() = (int32_t)j; + zvertex_view[j].sortInd() = (uint16_t)j; + } + } + }; + + class TestVerifyKernel { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, reco::ZVertexSoAView zvertex_view) const { + if (cms::alpakatools::once_per_grid(acc)) { + ALPAKA_ASSERT_OFFLOAD(zvertex_view.nvFinal() == 420); + } + + for (int32_t j : elements_with_stride(acc, zvertex_view.nvFinal())) { + assert(zvertex_view[j].idv() == j); + assert(zvertex_view[j].zv() - (float)j < 0.0001); + assert(zvertex_view[j].wv() - (float)j < 0.0001); + assert(zvertex_view[j].chi2() - (float)j < 0.0001); + assert(zvertex_view[j].ptv2() - (float)j < 0.0001); + assert(zvertex_view[j].ndof() == j); + assert(zvertex_view[j].sortInd() == uint32_t(j)); + } + } + }; + + void runKernels(reco::ZVertexSoAView zvertex_view, Queue& queue) { + uint32_t items = 64; + uint32_t groups = divide_up_by(zvertex_view.metadata().size(), items); + auto workDiv = make_workdiv(groups, items); + alpaka::exec(queue, workDiv, TestFillKernel{}, zvertex_view); + alpaka::exec(queue, workDiv, TestVerifyKernel{}, zvertex_view); + } + + } // namespace testZVertexSoAT + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/HLTrigger/Configuration/python/customizeHLTforPatatrack.py b/HLTrigger/Configuration/python/customizeHLTforPatatrack.py new file mode 100644 index 0000000000000..f96716d82ae2d --- /dev/null +++ b/HLTrigger/Configuration/python/customizeHLTforPatatrack.py @@ -0,0 +1,485 @@ +import FWCore.ParameterSet.Config as cms + +def customizeHLTforDQMGPUvsCPUPixel(process): + '''Ad-hoc changes to test HLT config containing only DQM_PixelReconstruction_v and DQMGPUvsCPU stream + only up to the Pixel Local Reconstruction + ''' + dqmPixelRecoPathName = None + for pathName in process.paths_(): + if pathName.startswith('DQM_PixelReconstruction_v'): + dqmPixelRecoPathName = pathName + break + + if dqmPixelRecoPathName == None: + return process + 
+ process.hltPixelConsumerGPU.eventProducts = [ + 'hltSiPixelClusters', + 'hltSiPixelClustersLegacyFormat', + 'hltSiPixelDigiErrorsLegacyFormat', + 'hltSiPixelRecHits', + 'hltSiPixelRecHitsLegacyFormat', + 'hltPixelTracks', + 'hltPixelTracksLegacyFormat', + 'hltPixelVertices', + 'hltPixelVerticesLegacyFormat', + ] + + process.hltPixelConsumerCPU.eventProducts = [] + for foo in process.hltPixelConsumerGPU.eventProducts: + process.hltPixelConsumerCPU.eventProducts += [foo+'CPUSerial'] + + # modify EventContent of DQMGPUvsCPU stream + if hasattr(process, 'hltOutputDQMGPUvsCPU'): + process.hltOutputDQMGPUvsCPU.outputCommands = [ + 'drop *', + 'keep *Cluster*_hltSiPixelClustersLegacyFormat_*_*', + 'keep *Cluster*_hltSiPixelClustersLegacyFormatCPUSerial_*_*', + 'keep *_hltSiPixelDigiErrorsLegacyFormat_*_*', + 'keep *_hltSiPixelDigiErrorsLegacyFormatCPUSerial_*_*', + 'keep *RecHit*_hltSiPixelRecHitsLegacyFormat_*_*', + 'keep *RecHit*_hltSiPixelRecHitsLegacyFormatCPUSerial_*_*', + 'keep *_hltPixelTracksLegacyFormat_*_*', + 'keep *_hltPixelTracksLegacyFormatCPUSerial_*_*', + 'keep *_hltPixelVerticesLegacyFormat_*_*', + 'keep *_hltPixelVerticesLegacyFormatCPUSerial_*_*', + ] + + # PixelRecHits: monitor of CPUSerial product (Alpaka backend: 'serial_sync') + process.hltSiPixelRecHitsSoAMonitorCPU = cms.EDProducer('SiPixelPhase1MonitorRecHitsSoAAlpaka', + pixelHitsSrc = cms.InputTag( 'hltSiPixelRecHitsCPUSerial' ), + TopFolderName = cms.string( 'SiPixelHeterogeneous/PixelRecHitsCPU' ) + ) + + # PixelRecHits: monitor of GPU product (Alpaka backend: '') + process.hltSiPixelRecHitsSoAMonitorGPU = cms.EDProducer('SiPixelPhase1MonitorRecHitsSoAAlpaka', + pixelHitsSrc = cms.InputTag( 'hltSiPixelRecHits' ), + TopFolderName = cms.string( 'SiPixelHeterogeneous/PixelRecHitsGPU' ) + ) + + # PixelRecHits: 'GPUvsCPU' comparisons + process.hltSiPixelRecHitsSoACompareGPUvsCPU = cms.EDProducer('SiPixelPhase1CompareRecHitsSoAAlpaka', + pixelHitsSrcCPU = cms.InputTag( 
'hltSiPixelRecHitsCPUSerial' ), + pixelHitsSrcGPU = cms.InputTag( 'hltSiPixelRecHits' ), + topFolderName = cms.string( 'SiPixelHeterogeneous/PixelRecHitsCompareGPUvsCPU' ), + minD2cut = cms.double( 1.0E-4 ) + ) + + process.hltSiPixelTrackSoAMonitorCPU = cms.EDProducer("SiPixelPhase1MonitorTrackSoAAlpaka", + mightGet = cms.optional.untracked.vstring, + minQuality = cms.string('loose'), + pixelTrackSrc = cms.InputTag('hltPixelTracksCPUSerial'), + topFolderName = cms.string('SiPixelHeterogeneous/PixelTrackCPU'), + useQualityCut = cms.bool(True) + ) + + process.hltSiPixelTrackSoAMonitorGPU = cms.EDProducer("SiPixelPhase1MonitorTrackSoAAlpaka", + mightGet = cms.optional.untracked.vstring, + minQuality = cms.string('loose'), + pixelTrackSrc = cms.InputTag('hltPixelTracks'), + topFolderName = cms.string('SiPixelHeterogeneous/PixelTrackGPU'), + useQualityCut = cms.bool(True) + ) + + process.hltSiPixelTrackSoACompareGPUvsCPU = cms.EDProducer("SiPixelPhase1CompareTrackSoAAlpaka", + deltaR2cut = cms.double(0.04), + mightGet = cms.optional.untracked.vstring, + minQuality = cms.string('loose'), + pixelTrackSrcCPU = cms.InputTag("hltPixelTracksCPUSerial"), + pixelTrackSrcGPU = cms.InputTag("hltPixelTracks"), + topFolderName = cms.string('SiPixelHeterogeneous/PixelTrackCompareGPUvsCPU'), + useQualityCut = cms.bool(True) + ) + + process.hltSiPixelVertexSoAMonitorCPU = cms.EDProducer("SiPixelMonitorVertexSoAAlpaka", + beamSpotSrc = cms.InputTag("hltOnlineBeamSpot"), + mightGet = cms.optional.untracked.vstring, + pixelVertexSrc = cms.InputTag("hltPixelVerticesCPUSerial"), + topFolderName = cms.string('SiPixelHeterogeneous/PixelVertexCPU') + ) + + process.hltSiPixelVertexSoAMonitorGPU = cms.EDProducer("SiPixelMonitorVertexSoAAlpaka", + beamSpotSrc = cms.InputTag("hltOnlineBeamSpot"), + mightGet = cms.optional.untracked.vstring, + pixelVertexSrc = cms.InputTag("hltPixelVertices"), + topFolderName = cms.string('SiPixelHeterogeneous/PixelVertexGPU') + ) + + 
process.hltSiPixelVertexSoACompareGPUvsCPU = cms.EDProducer("SiPixelCompareVertexSoAAlpaka", + beamSpotSrc = cms.InputTag("hltOnlineBeamSpot"), + dzCut = cms.double(1), + mightGet = cms.optional.untracked.vstring, + pixelVertexSrcCPU = cms.InputTag("hltPixelVerticesCPUSerial"), + pixelVertexSrcGPU = cms.InputTag("hltPixelVertices"), + topFolderName = cms.string('SiPixelHeterogeneous/PixelVertexCompareGPUvsCPU') + ) + + process.HLTDQMPixelReconstruction = cms.Sequence( + process.hltSiPixelRecHitsSoAMonitorCPU + + process.hltSiPixelRecHitsSoAMonitorGPU + + process.hltSiPixelRecHitsSoACompareGPUvsCPU + + process.hltSiPixelTrackSoAMonitorCPU + + process.hltSiPixelTrackSoAMonitorGPU + + process.hltSiPixelTrackSoACompareGPUvsCPU + + process.hltSiPixelVertexSoAMonitorCPU + + process.hltSiPixelVertexSoAMonitorGPU + + process.hltSiPixelVertexSoACompareGPUvsCPU + ) + + # Add CPUSerial sequences to DQM_PixelReconstruction_v Path + dqmPixelRecoPath = getattr(process, dqmPixelRecoPathName) + try: + dqmPixelRecoPathIndex = dqmPixelRecoPath.index(process.HLTRecopixelvertexingSequence) + 1 + for cpuSeqName in [ + 'HLTDoLocalPixelCPUSerialSequence', + 'HLTRecopixelvertexingCPUSerialSequence', + ]: + dqmPixelRecoPath.insert(dqmPixelRecoPathIndex, getattr(process, cpuSeqName)) + dqmPixelRecoPathIndex += 1 + except: + dqmPixelRecoPathIndex = None + + return process + +def customizeHLTforAlpakaPixelRecoLocal(process): + '''Customisation to introduce the Local Pixel Reconstruction in Alpaka + ''' + process.hltESPSiPixelCablingSoA = cms.ESProducer('SiPixelCablingSoAESProducer@alpaka', + CablingMapLabel = cms.string(''), + UseQualityInfo = cms.bool(False), + appendToDataLabel = cms.string(''), + alpaka = cms.untracked.PSet( + backend = cms.untracked.string('') + ) + ) + + process.hltESPSiPixelGainCalibrationForHLTSoA = cms.ESProducer('SiPixelGainCalibrationForHLTSoAESProducer@alpaka', + appendToDataLabel = cms.string(''), + alpaka = cms.untracked.PSet( + backend = cms.untracked.string('') 
+ ) + ) + + process.hltESPPixelCPEFastParamsPhase1 = cms.ESProducer('PixelCPEFastParamsESProducerAlpakaPhase1@alpaka', + appendToDataLabel = cms.string(''), + alpaka = cms.untracked.PSet( + backend = cms.untracked.string('') + ) + ) + + ### + + # alpaka EDProducer + # consumes + # - reco::BeamSpot + # produces + # - BeamSpotDeviceProduct + process.hltOnlineBeamSpotDevice = cms.EDProducer('BeamSpotDeviceProducer@alpaka', + src = cms.InputTag('hltOnlineBeamSpot'), + alpaka = cms.untracked.PSet( + backend = cms.untracked.string('') + ) + ) + + # alpaka EDProducer + # consumes + # - FEDRawDataCollection + # produces (* optional) + # - SiPixelClustersSoA + # - SiPixelDigisSoACollection + # - SiPixelDigiErrorsSoACollection * + # - SiPixelFormatterErrors * + process.hltSiPixelClusters = cms.EDProducer('SiPixelRawToClusterPhase1@alpaka', + mightGet = cms.optional.untracked.vstring, + IncludeErrors = cms.bool(True), + UseQualityInfo = cms.bool(False), + clusterThreshold_layer1 = cms.int32(4000), + clusterThreshold_otherLayers = cms.int32(4000), + VCaltoElectronGain = cms.double(1), # all gains=1, pedestals=0 + VCaltoElectronGain_L1 = cms.double(1), + VCaltoElectronOffset = cms.double(0), + VCaltoElectronOffset_L1 = cms.double(0), + InputLabel = cms.InputTag('rawDataCollector'), + Regions = cms.PSet( + inputs = cms.optional.VInputTag, + deltaPhi = cms.optional.vdouble, + maxZ = cms.optional.vdouble, + beamSpot = cms.optional.InputTag + ), + CablingMapLabel = cms.string(''), + # autoselect the alpaka backend + alpaka = cms.untracked.PSet( + backend = cms.untracked.string('') + ) + ) + + process.hltSiPixelClustersLegacyFormat = cms.EDProducer('SiPixelDigisClustersFromSoAAlpakaPhase1', + src = cms.InputTag('hltSiPixelClusters'), + clusterThreshold_layer1 = cms.int32(4000), + clusterThreshold_otherLayers = cms.int32(4000), + produceDigis = cms.bool(False), + storeDigis = cms.bool(False) + ) + + process.hltSiPixelClustersCache = cms.EDProducer('SiPixelClusterShapeCacheProducer', 
+ src = cms.InputTag( 'hltSiPixelClustersLegacyFormat' ), + onDemand = cms.bool( False ) + ) + + # legacy EDProducer + # consumes + # - SiPixelDigiErrorsHost + # - SiPixelFormatterErrors + # produces + # - edm::DetSetVector + # - DetIdCollection + # - DetIdCollection, 'UserErrorModules' + # - edmNew::DetSetVector + process.hltSiPixelDigiErrorsLegacyFormat = cms.EDProducer('SiPixelDigiErrorsFromSoAAlpaka', + digiErrorSoASrc = cms.InputTag('hltSiPixelClusters'), + fmtErrorsSoASrc = cms.InputTag('hltSiPixelClusters'), + CablingMapLabel = cms.string(''), + UsePhase1 = cms.bool(True), + ErrorList = cms.vint32(29), + UserErrorList = cms.vint32(40) + ) + + # alpaka EDProducer + # consumes + # - BeamSpotDeviceProduct + # - SiPixelClustersSoA + # - SiPixelDigisCollection + # produces + # - TrackingRecHitAlpakaCollection + process.hltSiPixelRecHits = cms.EDProducer('SiPixelRecHitAlpakaPhase1@alpaka', + beamSpot = cms.InputTag('hltOnlineBeamSpotDevice'), + src = cms.InputTag('hltSiPixelClusters'), + CPE = cms.string('PixelCPEFastParams'), + mightGet = cms.optional.untracked.vstring, + # autoselect the alpaka backend + alpaka = cms.untracked.PSet( + backend = cms.untracked.string('') + ) + ) + + process.hltSiPixelRecHitsLegacyFormat = cms.EDProducer('SiPixelRecHitFromSoAAlpakaPhase1', + pixelRecHitSrc = cms.InputTag('hltSiPixelRecHits'), + src = cms.InputTag('hltSiPixelClustersLegacyFormat'), + ) + + ### + ### Task: Pixel Local Reconstruction + ### + process.HLTDoLocalPixelTask = cms.ConditionalTask( + process.hltOnlineBeamSpotDevice, + process.hltSiPixelClusters, + process.hltSiPixelClustersLegacyFormat, # was: hltSiPixelClusters + process.hltSiPixelClustersCache, # really needed ?? 
def customizeHLTforAlpakaPixelRecoTracking(process):
    """Customisation to introduce the Pixel-Track Reconstruction in Alpaka.

    Attaches to `process`:
      - hltPixelTracks / hltPixelTracksCPUSerial: the alpaka ntuplet producer,
        once with automatic backend selection and once cloned onto the
        'serial_sync' backend reading the CPUSerial rec-hits;
      - hltPixelTracksLegacyFormat / ...CPUSerial: converters fed by the two
        producers above;
      - HLTRecoPixelTracksTask, HLTRecoPixelTracksCPUSerialTask and
        HLTRecoPixelTracksCPUSerialSequence grouping those modules.

    Returns the same `process` object that was passed in.
    """

    # Nested parameter sets, built up-front to keep the producer call readable.
    qualityCuts = cms.PSet(
        chi2MaxPt = cms.double(10),
        chi2Coeff = cms.vdouble(0.9, 1.8),
        chi2Scale = cms.double(8),
        tripletMinPt = cms.double(0.5),
        tripletMaxTip = cms.double(0.3),
        tripletMaxZip = cms.double(12),
        quadrupletMinPt = cms.double(0.3),
        quadrupletMaxTip = cms.double(0.5),
        quadrupletMaxZip = cms.double(12),
    )
    # autoselect the alpaka backend
    autoBackend = cms.untracked.PSet(backend = cms.untracked.string(""))

    # alpaka EDProducer
    # consumes
    #  - TrackingRecHitsSoACollection
    # produces
    #  - TkSoADevice
    process.hltPixelTracks = cms.EDProducer(
        "CAHitNtupletAlpakaPhase1@alpaka",
        pixelRecHitSrc = cms.InputTag("hltSiPixelRecHits"),
        CPE = cms.string("PixelCPEFastParams"),
        ptmin = cms.double(0.89999997615814209),
        CAThetaCutBarrel = cms.double(0.0020000000949949026),
        CAThetaCutForward = cms.double(0.0030000000260770321),
        hardCurvCut = cms.double(0.032840722495894911),
        dcaCutInnerTriplet = cms.double(0.15000000596046448),
        dcaCutOuterTriplet = cms.double(0.25),
        earlyFishbone = cms.bool(True),
        lateFishbone = cms.bool(False),
        fillStatistics = cms.bool(False),
        minHitsPerNtuplet = cms.uint32(3),
        maxNumberOfDoublets = cms.uint32(524288),
        minHitsForSharingCut = cms.uint32(10),
        fitNas4 = cms.bool(False),
        doClusterCut = cms.bool(True),
        doZ0Cut = cms.bool(True),
        doPtCut = cms.bool(True),
        useRiemannFit = cms.bool(False),
        doSharedHitCut = cms.bool(True),
        dupPassThrough = cms.bool(False),
        useSimpleTripletCleaner = cms.bool(True),
        idealConditions = cms.bool(False),
        includeJumpingForwardDoublets = cms.bool(True),
        trackQualityCuts = qualityCuts,
        alpaka = autoBackend,
    )

    # Same producer, pinned to the serial backend and fed by the CPUSerial hits.
    process.hltPixelTracksCPUSerial = process.hltPixelTracks.clone(
        pixelRecHitSrc = "hltSiPixelRecHitsCPUSerial",
        alpaka = {"backend": "serial_sync"},
    )

    # Converter reading the SoA tracks and the legacy-format rec-hits.
    process.hltPixelTracksLegacyFormat = cms.EDProducer(
        "PixelTrackProducerFromSoAAlpakaPhase1",
        beamSpot = cms.InputTag("hltOnlineBeamSpot"),
        minNumberOfHits = cms.int32(0),
        minQuality = cms.string("loose"),
        pixelRecHitLegacySrc = cms.InputTag("hltSiPixelRecHitsLegacyFormat"),
        trackSrc = cms.InputTag("hltPixelTracks"),
    )

    process.hltPixelTracksLegacyFormatCPUSerial = process.hltPixelTracksLegacyFormat.clone(
        pixelRecHitLegacySrc = cms.InputTag("hltSiPixelRecHitsLegacyFormatCPUSerial"),
        trackSrc = cms.InputTag("hltPixelTracksCPUSerial"),
    )

    # Group the default and the CPUSerial chains into their own tasks.
    process.HLTRecoPixelTracksTask = cms.ConditionalTask(
        process.hltPixelTracks,
        process.hltPixelTracksLegacyFormat,
    )
    process.HLTRecoPixelTracksCPUSerialTask = cms.ConditionalTask(
        process.hltPixelTracksCPUSerial,
        process.hltPixelTracksLegacyFormatCPUSerial,
    )

    process.HLTRecoPixelTracksCPUSerialSequence = cms.Sequence(process.HLTRecoPixelTracksCPUSerialTask)

    return process
def makeSerialClone(module, **kwargs):
    """Return a clone of an alpaka-based module bound to the serial_sync backend.

    The module type string is read from `module._TypedParameterizable__type`
    and must either end in '@alpaka' (automatic backend selection) or start
    with one of the explicit backend prefixes 'alpaka_serial_sync::',
    'alpaka_cuda_async::' or 'alpaka_rocm_async::'.

    Parameters:
        module: the module to clone; it is not modified.
        **kwargs: forwarded unchanged to `module.clone()`.

    Returns:
        The clone, with its type rewritten to 'alpaka_serial_sync::<base>' and
        its `alpaka` parameter removed if the clone has one.

    Raises:
        TypeError: if the module type matches none of the alpaka forms above.
    """
    # Do not call this local `type`: that would shadow the builtin.
    module_type = module._TypedParameterizable__type
    backend_prefixes = (
        'alpaka_serial_sync::',  # explicit serial_sync backend
        'alpaka_cuda_async::',   # explicit cuda_async backend
        'alpaka_rocm_async::',   # explicit rocm_async backend
    )

    if module_type.endswith('@alpaka'):
        # alpaka module with automatic backend selection
        base = module_type.removesuffix('@alpaka')
    else:
        prefix = next((p for p in backend_prefixes if module_type.startswith(p)), None)
        if prefix is None:
            # non-alpaka module
            raise TypeError('%s is not an alpaka-based module, and cannot be used with makeSerialClone()' % str(module))
        base = module_type.removeprefix(prefix)

    serial = module.clone(**kwargs)
    serial._TypedParameterizable__type = 'alpaka_serial_sync::' + base
    # The backend is now fixed by the type name, so the `alpaka` parameter
    # carried over from the original module is dropped.
    if 'alpaka' in serial.parameterNames_():
        del serial.alpaka
    return serial
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoAAlpaka.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoAAlpaka.cc index ad05ad3ff60c9..423951f4cb74f 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoAAlpaka.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoAAlpaka.cc @@ -21,8 +21,9 @@ // local include(s) #include "PixelClusterizerBase.h" -// #define EDM_ML_DEBUG -// #define GPU_DEBUG +//#define EDM_ML_DEBUG +//#define GPU_DEBUG + template class SiPixelDigisClustersFromSoAAlpaka : public edm::global::EDProducer<> { public: diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 56718b4bdae14..452b0e2097071 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -33,7 +33,7 @@ #include "gpuClusterChargeCut.h" #include "gpuClustering.h" -// #define GPU_DEBUG +//#define GPU_DEBUG namespace pixelgpudetails { diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 06b30da68c8cd..fe9cc260a5853 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -18,7 +18,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelClusterThresholds.h" -// #define GPU_DEBUG +//#define GPU_DEBUG struct SiPixelROCsStatusAndMapping; class SiPixelGainForHLTonGPU; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h index 
ff885b5bad07f..d1f5509052468 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/CalibPixel.h @@ -18,7 +18,7 @@ #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelClusterThresholds.h" -// #define GPU_DEBUG +//#define GPU_DEBUG namespace calibPixel { using namespace cms::alpakatools; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/ClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/ClusterChargeCut.h index c149707e41d9a..4056090517aee 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/ClusterChargeCut.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/ClusterChargeCut.h @@ -10,7 +10,7 @@ #include "HeterogeneousCore/AlpakaInterface/interface/prefixScan.h" #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelClusterThresholds.h" -// #define GPU_DEBUG +//#define GPU_DEBUG namespace pixelClustering { diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h index 616ccbd3eb8c7..7da68c7b2f5da 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/PixelClustering.h @@ -5,15 +5,16 @@ #include #include #include + #include -#include "HeterogeneousCore/AlpakaInterface/interface/config.h" -#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" #include "DataFormats/SiPixelClusterSoA/interface/ClusteringConstants.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" #include "HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" -// #define GPU_DEBUG +//#define GPU_DEBUG namespace 
ALPAKA_ACCELERATOR_NAMESPACE { @@ -140,7 +141,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // find the index of the first pixel not belonging to this module (or invalid) auto& msize = alpaka::declareSharedVar(acc); - const uint32_t blockIdx(alpaka::getIdx(acc)[0u]); + const uint32_t blockIdx = alpaka::getIdx(acc)[0u]; if (blockIdx >= clus_view[0].moduleStart()) return; @@ -274,11 +275,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { ALPAKA_ASSERT_OFFLOAD((hist.size() / blockDimension) <= maxiter); // NB: can be tuned. - constexpr uint32_t threadDimension = cms::alpakatools::requires_single_thread_per_block_v ? 1 : 256; + constexpr uint32_t threadDimension = cms::alpakatools::requires_single_thread_per_block_v ? 256 : 1; #ifndef NDEBUG - [[maybe_unused]] const uint32_t runTimeThreadDimension( - alpaka::getWorkDiv(acc)[0u]); + [[maybe_unused]] const uint32_t runTimeThreadDimension = + alpaka::getWorkDiv(acc)[0u]; ALPAKA_ASSERT_OFFLOAD(runTimeThreadDimension <= threadDimension); #endif diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc index 3e7caf8b2b3a4..597aaa70987f4 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/alpaka/SiPixelRawToClusterKernel.dev.cc @@ -29,7 +29,7 @@ #include "PixelClustering.h" #include "SiPixelRawToClusterKernel.h" -// #define GPU_DEBUG +//#define GPU_DEBUG namespace ALPAKA_ACCELERATOR_NAMESPACE { namespace pixelDetails { diff --git a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py index 8d78599d07d9c..a6dd2bea80e2a 100644 --- a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py +++ b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py @@ -1,4 +1,5 @@ 
import FWCore.ParameterSet.Config as cms +from HeterogeneousCore.AlpakaCore.functions import * from Configuration.Eras.Modifier_run3_common_cff import run3_common from Configuration.ProcessModifiers.gpu_cff import gpu from Configuration.ProcessModifiers.alpaka_cff import alpaka @@ -130,11 +131,7 @@ def _addProcessCalibTrackerAlpakaES(process): )) # reconstruct the pixel digis and clusters with alpaka on the cpu, for validation -siPixelClustersPreSplittingAlpakaSerial = siPixelClustersPreSplittingAlpaka.clone( - #alpaka = dict( backend = '*' ) - alpaka = None -) -siPixelClustersPreSplittingAlpakaSerial._TypedParameterizable__type = 'alpaka_serial_sync' + siPixelClustersPreSplittingAlpaka._TypedParameterizable__type.removesuffix('@alpaka') +siPixelClustersPreSplittingAlpakaSerial = makeSerialClone(siPixelClustersPreSplittingAlpaka) from RecoLocalTracker.SiPixelClusterizer.siPixelDigisClustersFromSoAAlpakaPhase1_cfi import siPixelDigisClustersFromSoAAlpakaPhase1 as _siPixelDigisClustersFromSoAAlpakaPhase1 from RecoLocalTracker.SiPixelClusterizer.siPixelDigisClustersFromSoAAlpakaPhase2_cfi import siPixelDigisClustersFromSoAAlpakaPhase2 as _siPixelDigisClustersFromSoAAlpakaPhase2 diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index 61442ea9d2b8c..b1e5e1c3c90e9 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -12,7 +12,8 @@ #include "PixelRecHitGPUKernel.h" #include "gpuPixelRecHits.h" -// #define GPU_DEBUG + +//#define GPU_DEBUG namespace { template diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h index 25cc724cd4c4a..407a18be04fa9 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h @@ -10,7 
+10,9 @@ #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + //#define GPU_DEBUG + namespace pixelgpudetails { template diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSoAAlpaka.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSoAAlpaka.cc index 9881aeab46bab..a76ff6af49ac9 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSoAAlpaka.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromSoAAlpaka.cc @@ -9,7 +9,6 @@ #include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/MakerMacros.h" #include "FWCore/Framework/interface/global/EDProducer.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" @@ -181,7 +180,10 @@ void SiPixelRecHitFromSoAAlpaka::produce(edm::StreamID streamID, } using SiPixelRecHitFromSoAAlpakaPhase1 = SiPixelRecHitFromSoAAlpaka; -DEFINE_FWK_MODULE(SiPixelRecHitFromSoAAlpakaPhase1); - using SiPixelRecHitFromSoAAlpakaPhase2 = SiPixelRecHitFromSoAAlpaka; +using SiPixelRecHitFromSoAAlpakaHIonPhase1 = SiPixelRecHitFromSoAAlpaka; + +#include "FWCore/Framework/interface/MakerMacros.h" +DEFINE_FWK_MODULE(SiPixelRecHitFromSoAAlpakaPhase1); DEFINE_FWK_MODULE(SiPixelRecHitFromSoAAlpakaPhase2); +DEFINE_FWK_MODULE(SiPixelRecHitFromSoAAlpakaHIonPhase1); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHits.h index 220a91b85ced3..45587034b572b 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/alpaka/PixelRecHits.h @@ -19,7 +19,8 @@ #include 
"HeterogeneousCore/AlpakaInterface/interface/workdivision.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h" -//#define GPU_DEBUG 1 +//#define GPU_DEBUG + namespace ALPAKA_ACCELERATOR_NAMESPACE { namespace pixelRecHits { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index 94ae258cc16fb..55c556bd63048 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -7,13 +7,14 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -//#define GPU_DEBUG 1 +//#define GPU_DEBUG + namespace gpuPixelRecHits { template diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py index e6b2c9832600c..7e8910a8e0918 100644 --- a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py +++ b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py @@ -1,4 +1,5 @@ import FWCore.ParameterSet.Config as cms +from HeterogeneousCore.AlpakaCore.functions import * from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA from Configuration.ProcessModifiers.gpu_cff import gpu from Configuration.ProcessModifiers.alpaka_cff import alpaka @@ -139,12 +140,9 @@ )) # Hit SoA producer on the cpu, for validation -siPixelRecHitsPreSplittingAlpakaSerial = siPixelRecHitsPreSplittingAlpaka.clone( - src = 
"siPixelClustersPreSplittingAlpakaSerial", - #alpaka = dict( backend = '*' ) - alpaka = None +siPixelRecHitsPreSplittingAlpakaSerial = makeSerialClone(siPixelRecHitsPreSplittingAlpaka, + src = "siPixelClustersPreSplittingAlpakaSerial" ) -siPixelRecHitsPreSplittingAlpakaSerial._TypedParameterizable__type = 'alpaka_serial_sync' + siPixelRecHitsPreSplittingAlpaka._TypedParameterizable__type.removesuffix('@alpaka') from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromSoAAlpakaPhase1_cfi import siPixelRecHitFromSoAAlpakaPhase1 as _siPixelRecHitFromSoAAlpakaPhase1 from RecoLocalTracker.SiPixelRecHits.siPixelRecHitFromSoAAlpakaPhase2_cfi import siPixelRecHitFromSoAAlpakaPhase2 as _siPixelRecHitFromSoAAlpakaPhase2 diff --git a/RecoTracker/Configuration/python/RecoPixelVertexing_cff.py b/RecoTracker/Configuration/python/RecoPixelVertexing_cff.py index c08a0987d3f59..f5ba3ad7df1da 100644 --- a/RecoTracker/Configuration/python/RecoPixelVertexing_cff.py +++ b/RecoTracker/Configuration/python/RecoPixelVertexing_cff.py @@ -1,4 +1,5 @@ import FWCore.ParameterSet.Config as cms +from HeterogeneousCore.AlpakaCore.functions import * from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA from RecoTracker.PixelTrackFitting.PixelTracks_cff import * @@ -98,6 +99,31 @@ pixelVerticesTask.copy() )) +## pixel vertex reconstruction with Alpaka + +# pixel vertex SoA producer with alpaka on the device +from RecoTracker.PixelVertexFinding.pixelVertexProducerAlpakaPhase1_cfi import pixelVertexProducerAlpakaPhase1 as _pixelVerticesAlpakaPhase1 +from RecoTracker.PixelVertexFinding.pixelVertexProducerAlpakaPhase2_cfi import pixelVertexProducerAlpakaPhase2 as _pixelVerticesAlpakaPhase2 +pixelVerticesAlpaka = _pixelVerticesAlpakaPhase1.clone() +phase2_tracker.toReplaceWith(pixelVerticesAlpaka,_pixelVerticesAlpakaPhase2.clone()) + +from RecoTracker.PixelVertexFinding.pixelVertexFromSoAAlpaka_cfi import pixelVertexFromSoAAlpaka as _pixelVertexFromSoAAlpaka 
+alpaka.toReplaceWith(pixelVertices, _pixelVertexFromSoAAlpaka.clone()) + +# pixel vertex SoA producer with alpaka on the cpu, for validation +pixelVerticesAlpakaSerial = makeSerialClone(pixelVerticesAlpaka, + pixelTrackSrc = 'pixelTracksAlpakaSerial' +) + +alpaka.toReplaceWith(pixelVerticesTask, cms.Task( + # Build the pixel vertices in SoA format with alpaka on the device + pixelVerticesAlpaka, + # Build the pixel vertices in SoA format with alpaka on the cpu (if requested by the validation) + pixelVerticesAlpakaSerial, + # Convert the pixel vertices from SoA format (on the host) to the legacy format + pixelVertices +)) + # Tasks and Sequences recopixelvertexingTask = cms.Task( pixelTracksTask, diff --git a/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py b/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py index 3d121a8736f8e..55a02f83f913c 100644 --- a/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py +++ b/RecoTracker/Configuration/python/customizePixelOnlyForProfiling.py @@ -3,10 +3,12 @@ # Customise the Pixel-only reconstruction to run on GPU # # Run the unpacker, clustering, ntuplets, track fit and vertex reconstruction on GPU. +# CUDA and Alpaka co-living here for the moment + def customizePixelOnlyForProfilingGPUOnly(process): process.consumer = cms.EDAnalyzer("GenericConsumer", - eventProducts = cms.untracked.vstring('pixelTracksCUDA', 'pixelVerticesCUDA') + eventProducts = cms.untracked.vstring('pixelTracksCUDA', 'pixelVerticesCUDA', '*DeviceProduct_pixelTracksAlpaka_*_*', '*DeviceProduct_pixelVerticesAlpaka_*_*') ) process.consume_step = cms.EndPath(process.consumer) @@ -25,10 +27,8 @@ def customizePixelOnlyForProfilingGPUOnly(process): # tracks and vertices on the CPU in SoA format, without conversion to legacy format. def customizePixelOnlyForProfilingGPUWithHostCopy(process): - #? 
process.siPixelRecHitSoAFromLegacy.convertToLegacy = False - process.consumer = cms.EDAnalyzer("GenericConsumer", - eventProducts = cms.untracked.vstring('pixelTracksSoA', 'pixelVerticesSoA') + eventProducts = cms.untracked.vstring('pixelTracksSoA', 'pixelVerticesSoA', 'pixelTracksAlpaka', 'pixelVerticesAlpaka') ) process.consume_step = cms.EndPath(process.consumer) diff --git a/RecoTracker/PixelSeeding/plugins/BuildFile.xml b/RecoTracker/PixelSeeding/plugins/BuildFile.xml index 82b80e1c55b66..f9863a6a8c292 100644 --- a/RecoTracker/PixelSeeding/plugins/BuildFile.xml +++ b/RecoTracker/PixelSeeding/plugins/BuildFile.xml @@ -1,21 +1,36 @@ - - - + - + + + + + + + + + + + + + + + + + + diff --git a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cu b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cu index efb2a2e17715c..6e07126e9e428 100644 --- a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.cu @@ -1,8 +1,9 @@ -#include "RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsImpl.h" #include -// #define NTUPLE_DEBUG -// #define GPU_DEBUG +#include "RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsImpl.h" + +//#define GPU_DEBUG +//#define NTUPLE_DEBUG template void CAHitNtupletGeneratorKernelsGPU::launchKernels(const HitsConstView &hh, diff --git a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.h b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.h index 0865fa5cbc46a..250aef21c1d6a 100644 --- a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernels.h @@ -1,18 +1,17 @@ #ifndef RecoTracker_PixelSeeding_plugins_CAHitNtupletGeneratorKernels_h #define RecoTracker_PixelSeeding_plugins_CAHitNtupletGeneratorKernels_h -// #define GPU_DEBUG +//#define GPU_DEBUG +//#define DUMP_GPU_TK_TUPLES -#include "GPUCACell.h" -#include "gpuPixelDoublets.h" 
- -#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" -// #define DUMP_GPU_TK_TUPLES +#include "GPUCACell.h" +#include "gpuPixelDoublets.h" namespace caHitNtupletGenerator { diff --git a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsAlloc.cc b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsAlloc.cc index 6acff4abbd531..64148d5f5ba81 100644 --- a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsAlloc.cc +++ b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsAlloc.cc @@ -2,7 +2,8 @@ #include "CAHitNtupletGeneratorKernels.h" -// #define GPU_DEBUG +//#define GPU_DEBUG + template #ifdef __CUDACC__ void CAHitNtupletGeneratorKernelsGPU::allocateOnGPU(int32_t nHits, cudaStream_t stream) { diff --git a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsImpl.h b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsImpl.h index 540c0b92f9015..57e4ea6f9441f 100644 --- a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -2,8 +2,8 @@ // Original Author: Felice Pantaleo, CERN // -// #define NTUPLE_DEBUG -// #define GPU_DEBUG +//#define NTUPLE_DEBUG +//#define GPU_DEBUG #include #include @@ -11,15 +11,14 @@ #include +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include 
"HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" - -#include "CAStructures.h" #include "CAHitNtupletGeneratorKernels.h" +#include "CAStructures.h" #include "GPUCACell.h" #include "gpuFishbone.h" #include "gpuPixelDoublets.h" diff --git a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorOnGPU.cc index faf0bae6fb0a9..5100cf734142c 100644 --- a/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoTracker/PixelSeeding/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -2,8 +2,8 @@ // Original Author: Felice Pantaleo, CERN // -// #define GPU_DEBUG -// #define DUMP_GPU_TK_TUPLES +//#define GPU_DEBUG +//#define DUMP_GPU_TK_TUPLES #include #include diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/BrokenLineFit.dev.cc b/RecoTracker/PixelSeeding/plugins/alpaka/BrokenLineFit.dev.cc new file mode 100644 index 0000000000000..a21fed668b54c --- /dev/null +++ b/RecoTracker/PixelSeeding/plugins/alpaka/BrokenLineFit.dev.cc @@ -0,0 +1,412 @@ +// +// Author: Felice Pantaleo, CERN +// + +//#define BROKENLINE_DEBUG +//#define BL_DUMP_HITS +#include +#include + +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" +#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h" +#include "RecoTracker/PixelTrackFitting/interface/alpaka/BrokenLine.h" + +#include "HelixFit.h" + +template +using Tuples = typename reco::TrackSoA::HitContainer; +template +using OutputSoAView = reco::TrackSoAView; +template +using TupleMultiplicity = caStructures::TupleMultiplicityT; + +// 
#define BL_DUMP_HITS + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + template + class Kernel_BLFastFit { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + Tuples const *__restrict__ foundNtuplets, + TupleMultiplicity const *__restrict__ tupleMultiplicity, + TrackingRecHitSoAConstView hh, + pixelCPEforDevice::ParamsOnDeviceT const *__restrict__ cpeParams, + typename TrackerTraits::tindex_type *__restrict__ ptkids, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t nHitsL, + uint32_t nHitsH, + int32_t offset) const { + constexpr uint32_t hitsInFit = N; + constexpr auto invalidTkId = std::numeric_limits::max(); + + ALPAKA_ASSERT_OFFLOAD(hitsInFit <= nHitsL); + ALPAKA_ASSERT_OFFLOAD(nHitsL <= nHitsH); + ALPAKA_ASSERT_OFFLOAD(phits); + ALPAKA_ASSERT_OFFLOAD(pfast_fit); + ALPAKA_ASSERT_OFFLOAD(foundNtuplets); + ALPAKA_ASSERT_OFFLOAD(tupleMultiplicity); + + // look in bin for this hit multiplicity + int totTK = tupleMultiplicity->end(nHitsH) - tupleMultiplicity->begin(nHitsL); + ALPAKA_ASSERT_OFFLOAD(totTK <= int(tupleMultiplicity->size())); + ALPAKA_ASSERT_OFFLOAD(totTK >= 0); + +#ifdef BROKENLINE_DEBUG + const uint32_t threadIdx(alpaka::getIdx(acc)[0u]); + if (cms::alpakatools::once_per_grid(acc)) { + printf("%d total Ntuple\n", tupleMultiplicity->size()); + printf("%d Ntuple of size %d/%d for %d hits to fit\n", totTK, nHitsL, nHitsH, hitsInFit); + } +#endif + const auto nt = riemannFit::maxNumberOfConcurrentFits; + for (auto local_idx : cms::alpakatools::elements_with_stride(acc, nt)) { + auto tuple_idx = local_idx + offset; + if ((int)tuple_idx >= totTK) { + ptkids[local_idx] = invalidTkId; + break; + } + // get it from the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHitsL) + tuple_idx); + ALPAKA_ASSERT_OFFLOAD(static_cast(tkid) < foundNtuplets->nOnes()); + + ptkids[local_idx] = tkid; + + auto nHits = foundNtuplets->size(tkid); + + 
ALPAKA_ASSERT_OFFLOAD(nHits >= nHitsL); + ALPAKA_ASSERT_OFFLOAD(nHits <= nHitsH); + + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); + +#ifdef BL_DUMP_HITS + auto &&done = alpaka::declareSharedVar(acc); + done = 0; + alpaka::syncBlockThreads(acc); + bool dump = + (foundNtuplets->size(tkid) == 5 && 0 == alpaka::atomicAdd(acc, &done, 1, alpaka::hierarchy::Blocks{})); +#endif + + // Prepare data structure + auto const *hitId = foundNtuplets->begin(tkid); + + // #define YERR_FROM_DC +#ifdef YERR_FROM_DC + // try to compute more precise error in y + auto dx = hh[hitId[hitsInFit - 1]].xGlobal() - hh[hitId[0]].xGlobal(); + auto dy = hh[hitId[hitsInFit - 1]].yGlobal() - hh[hitId[0]].yGlobal(); + auto dz = hh[hitId[hitsInFit - 1]].zGlobal() - hh[hitId[0]].zGlobal(); + float ux, uy, uz; +#endif + + float incr = std::max(1.f, float(nHits) / float(hitsInFit)); + float n = 0; + for (uint32_t i = 0; i < hitsInFit; ++i) { + int j = int(n + 0.5f); // round + if (hitsInFit - 1 == i) + j = nHits - 1; // force last hit to ensure max lever arm. 
+ ALPAKA_ASSERT_OFFLOAD(j < int(nHits)); + n += incr; + auto hit = hitId[j]; + float ge[6]; + +#ifdef YERR_FROM_DC + auto const &dp = cpeParams->detParams(hh.detectorIndex(hit)); + auto status = hh[hit].chargeAndStatus().status; + int qbin = CPEFastParametrisation::kGenErrorQBins - 1 - status.qBin; + ALPAKA_ASSERT_OFFLOAD(qbin >= 0 && qbin < 5); + bool nok = (status.isBigY | status.isOneY); + // compute cotanbeta and use it to recompute error + dp.frame.rotation().multiply(dx, dy, dz, ux, uy, uz); + auto cb = std::abs(uy / uz); + int bin = + int(cb * (float(phase1PixelTopology::pixelThickess) / float(phase1PixelTopology::pixelPitchY)) * 8.f) - 4; + int low_value = 0; + int high_value = CPEFastParametrisation::kNumErrorBins - 1; + // return estimated bin value truncated to [0, 15] + bin = std::clamp(bin, low_value, high_value); + float yerr = dp.sigmay[bin] * 1.e-4f; // toCM + yerr *= dp.yfact[qbin]; // inflate + yerr *= yerr; + yerr += dp.apeYY; + yerr = nok ? hh[hit].yerrLocal() : yerr; + dp.frame.toGlobal(hh[hit].xerrLocal(), 0, yerr, ge); +#else + cpeParams->detParams(hh[hit].detectorIndex()).frame.toGlobal(hh[hit].xerrLocal(), 0, hh[hit].yerrLocal(), ge); +#endif + +#ifdef BL_DUMP_HITS + bool dump = foundNtuplets->size(tkid) == 5; + if (dump) { + printf("Track id %d %d Hit %d on %d\nGlobal: hits.col(%d) << %f,%f,%f\n", + local_idx, + tkid, + hit, + hh[hit].detectorIndex(), + i, + hh[hit].xGlobal(), + hh[hit].yGlobal(), + hh[hit].zGlobal()); + printf("Error: hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", i, ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]); + } +#endif + + hits.col(i) << hh[hit].xGlobal(), hh[hit].yGlobal(), hh[hit].zGlobal(); + hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; + } + brokenline::fastFit(acc, hits, fast_fit); + + // no NaN here.... 
+ ALPAKA_ASSERT_OFFLOAD(fast_fit(0) == fast_fit(0)); + ALPAKA_ASSERT_OFFLOAD(fast_fit(1) == fast_fit(1)); + ALPAKA_ASSERT_OFFLOAD(fast_fit(2) == fast_fit(2)); + ALPAKA_ASSERT_OFFLOAD(fast_fit(3) == fast_fit(3)); + } + } + }; + + template + struct Kernel_BLFit { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TupleMultiplicity const *__restrict__ tupleMultiplicity, + double bField, + OutputSoAView results_view, + typename TrackerTraits::tindex_type const *__restrict__ ptkids, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit) const { + ALPAKA_ASSERT_OFFLOAD(results_view.pt()); + ALPAKA_ASSERT_OFFLOAD(results_view.eta()); + ALPAKA_ASSERT_OFFLOAD(results_view.chi2()); + ALPAKA_ASSERT_OFFLOAD(pfast_fit); + constexpr auto invalidTkId = std::numeric_limits::max(); + + // same as above... + // look in bin for this hit multiplicity + const auto nt = riemannFit::maxNumberOfConcurrentFits; + for (auto local_idx : cms::alpakatools::elements_with_stride(acc, nt)) { + if (invalidTkId == ptkids[local_idx]) + break; + auto tkid = ptkids[local_idx]; + + ALPAKA_ASSERT_OFFLOAD(tkid < TrackerTraits::maxNumberOfTuples); + + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); + + brokenline::PreparedBrokenLineData data; + + brokenline::karimaki_circle_fit circle; + riemannFit::LineFit line; + + brokenline::prepareBrokenLineData(acc, hits, fast_fit, bField, data); + brokenline::lineFit(acc, hits_ge, fast_fit, bField, data, line); + brokenline::circleFit(acc, hits, hits_ge, fast_fit, bField, data, circle); + + TracksUtilities::copyFromCircle( + results_view, circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); + results_view[tkid].pt() = float(bField) / float(std::abs(circle.par(2))); + results_view[tkid].eta() = alpaka::math::asinh(acc, line.par(0)); + results_view[tkid].chi2() = (circle.chi2 + 
line.chi2) / (2 * N - 5); + +#ifdef BROKENLINE_DEBUG + if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) + printf("kernelBLFit failed! %f/%f\n", circle.chi2, line.chi2); + printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", + N, + N, + tkid, + circle.par(0), + circle.par(1), + circle.par(2)); + printf("kernelBLHits line.par(0,1): %d %f,%f\n", tkid, line.par(0), line.par(1)); + printf("kernelBLHits chi2 cov %f/%f %e,%e,%e,%e,%e\n", + circle.chi2, + line.chi2, + circle.cov(0, 0), + circle.cov(1, 1), + circle.cov(2, 2), + line.cov(0, 0), + line.cov(1, 1)); +#endif + } + } + }; + + template + void HelixFit::launchBrokenLineKernels( + const TrackingRecHitSoAConstView &hv, + pixelCPEforDevice::ParamsOnDeviceT const *cpeParams, + uint32_t hitsInFit, + uint32_t maxNumberOfTuples, + Queue &queue) { + ALPAKA_ASSERT_OFFLOAD(tuples_); + + uint32_t blockSize = 64; + uint32_t numberOfBlocks = cms::alpakatools::divide_up_by(maxNumberOfConcurrentFits_, blockSize); + const WorkDiv1D workDivTriplets = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + const WorkDiv1D workDivQuadsPenta = cms::alpakatools::make_workdiv(numberOfBlocks / 4, blockSize); + + // Fit internals + auto tkidDevice = + cms::alpakatools::make_device_buffer(queue, maxNumberOfConcurrentFits_); + auto hitsDevice = cms::alpakatools::make_device_buffer( + queue, maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double)); + auto hits_geDevice = cms::alpakatools::make_device_buffer( + queue, maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<6>) / sizeof(float)); + auto fast_fit_resultsDevice = cms::alpakatools::make_device_buffer( + queue, maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double)); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // fit triplets + + alpaka::exec(queue, + workDivTriplets, + Kernel_BLFastFit<3, TrackerTraits>{}, + tuples_, + tupleMultiplicity_, + hv, + 
cpeParams, + tkidDevice.data(), + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data(), + 3, + 3, + offset); + + alpaka::exec(queue, + workDivTriplets, + Kernel_BLFit<3, TrackerTraits>{}, + tupleMultiplicity_, + bField_, + outputSoa_, + tkidDevice.data(), + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data()); + + if (fitNas4_) { + // fit all as 4 + riemannFit::rolling_fits<4, TrackerTraits::maxHitsOnTrack, 1>([this, + &hv, + &cpeParams, + &tkidDevice, + &hitsDevice, + &hits_geDevice, + &fast_fit_resultsDevice, + &offset, + &queue, + &workDivQuadsPenta](auto i) { + alpaka::exec(queue, + workDivQuadsPenta, + Kernel_BLFastFit<4, TrackerTraits>{}, + tuples_, + tupleMultiplicity_, + hv, + cpeParams, + tkidDevice.data(), + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data(), + 4, + 4, + offset); + + alpaka::exec(queue, + workDivQuadsPenta, + Kernel_BLFit<4, TrackerTraits>{}, + tupleMultiplicity_, + bField_, + outputSoa_, + tkidDevice.data(), + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data()); + }); + + } else { + riemannFit::rolling_fits<4, TrackerTraits::maxHitsOnTrackForFullFit, 1>([this, + &hv, + &cpeParams, + &tkidDevice, + &hitsDevice, + &hits_geDevice, + &fast_fit_resultsDevice, + &offset, + &queue, + &workDivQuadsPenta](auto i) { + alpaka::exec(queue, + workDivQuadsPenta, + Kernel_BLFastFit{}, + tuples_, + tupleMultiplicity_, + hv, + cpeParams, + tkidDevice.data(), + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data(), + i, + i, + offset); + + alpaka::exec(queue, + workDivQuadsPenta, + Kernel_BLFit{}, + tupleMultiplicity_, + bField_, + outputSoa_, + tkidDevice.data(), + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data()); + }); + + static_assert(TrackerTraits::maxHitsOnTrackForFullFit < TrackerTraits::maxHitsOnTrack); + + //Fit all the rest using the maximum from previous call + alpaka::exec(queue, + workDivQuadsPenta, + 
Kernel_BLFastFit{}, + tuples_, + tupleMultiplicity_, + hv, + cpeParams, + tkidDevice.data(), + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data(), + TrackerTraits::maxHitsOnTrackForFullFit, + TrackerTraits::maxHitsOnTrack - 1, + offset); + + alpaka::exec(queue, + workDivQuadsPenta, + Kernel_BLFit{}, + tupleMultiplicity_, + bField_, + outputSoa_, + tkidDevice.data(), + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data()); + } + + } // loop on concurrent fits + } + + template class HelixFit; + template class HelixFit; + template class HelixFit; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CACell.h b/RecoTracker/PixelSeeding/plugins/alpaka/CACell.h new file mode 100644 index 0000000000000..d0142f78415ae --- /dev/null +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CACell.h @@ -0,0 +1,391 @@ +#ifndef RecoPixelVertexing_PixelTriplets_CACellT_h +#define RecoPixelVertexing_PixelTriplets_CACellT_h + +// +// Author: Felice Pantaleo, CERN +// + +// #define ONLY_TRIPLETS_IN_HOLE + +#include + +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" +#include "HeterogeneousCore/AlpakaInterface/interface/VecArray.h" +#include "HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h" +#include "RecoTracker/PixelSeeding/interface/CircleEq.h" +#include "DataFormats/TrackSoA/interface/TrackDefinitions.h" +#include "DataFormats/TrackSoA/interface/TracksSoA.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "CAStructures.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + template + class CACellT { + public: + using PtrAsInt = unsigned long long; + + static constexpr auto maxCellsPerHit = TrackerTraits::maxCellsPerHit; + using OuterHitOfCellContainer = caStructures::OuterHitOfCellContainerT; + using OuterHitOfCell = caStructures::OuterHitOfCellT; + using CellNeighbors = caStructures::CellNeighborsT; + using CellTracks = 
caStructures::CellTracksT; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + using CellTracksVector = caStructures::CellTracksVectorT; + + using HitsConstView = TrackingRecHitSoAConstView; + using hindex_type = typename TrackerTraits::hindex_type; + using tindex_type = typename TrackerTraits::tindex_type; + static constexpr auto invalidHitId = std::numeric_limits::max(); + + using TmpTuple = cms::alpakatools::VecArray; + + using HitContainer = typename reco::TrackSoA::HitContainer; + using Quality = ::pixelTrack::Quality; + static constexpr auto bad = ::pixelTrack::Quality::bad; + + enum class StatusBit : uint16_t { kUsed = 1, kInTrack = 2, kKilled = 1 << 15 }; + + CACellT() = default; + + ALPAKA_FN_ACC ALPAKA_FN_INLINE void init(CellNeighborsVector& cellNeighbors, + CellTracksVector& cellTracks, + const HitsConstView& hh, + int layerPairId, + hindex_type innerHitId, + hindex_type outerHitId) { + theInnerHitId = innerHitId; + theOuterHitId = outerHitId; + theLayerPairId_ = layerPairId; + theStatus_ = 0; + theFishboneId = invalidHitId; + + // optimization that depends on access pattern + theInnerZ = hh[innerHitId].zGlobal(); + theInnerR = hh[innerHitId].rGlobal(); + + // link to default empty + theOuterNeighbors = &cellNeighbors[0]; + theTracks = &cellTracks[0]; + assert(outerNeighbors().empty()); + assert(tracks().empty()); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) int addOuterNeighbor( + const TAcc& acc, typename TrackerTraits::cindex_type t, CellNeighborsVector& cellNeighbors) { + // use smart cache + if (outerNeighbors().empty()) { + auto i = cellNeighbors.extend(acc); // maybe wasted.... 
+ if (i > 0) { + cellNeighbors[i].reset(); + alpaka::mem_fence(acc, alpaka::memory_scope::Grid{}); +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + theOuterNeighbors = &cellNeighbors[i]; +#else + auto zero = (PtrAsInt)(&cellNeighbors[0]); + alpaka::atomicCas(acc, + (PtrAsInt*)(&theOuterNeighbors), + zero, + (PtrAsInt)(&cellNeighbors[i]), + alpaka::hierarchy::Blocks{}); // if fails we cannot give "i" back... +#endif + } else + return -1; + } + alpaka::mem_fence(acc, alpaka::memory_scope::Grid{}); + return outerNeighbors().push_back(acc, t); + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) int addTrack(TAcc const& acc, + tindex_type t, + CellTracksVector& cellTracks) { + if (tracks().empty()) { + auto i = cellTracks.extend(acc); // maybe wasted.... + if (i > 0) { + cellTracks[i].reset(); + alpaka::mem_fence(acc, alpaka::memory_scope::Grid{}); +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + theTracks = &cellTracks[i]; +#else + auto zero = (PtrAsInt)(&cellTracks[0]); + alpaka::atomicCas(acc, + (PtrAsInt*)(&theTracks), + zero, + (PtrAsInt)(&cellTracks[i]), + alpaka::hierarchy::Blocks{}); // if fails we cannot give "i" back... 
+ +#endif + } else + return -1; + } + alpaka::mem_fence(acc, alpaka::memory_scope::Grid{}); + return tracks().push_back(acc, t); + } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE CellTracks& tracks() { return *theTracks; } + ALPAKA_FN_ACC ALPAKA_FN_INLINE CellTracks const& tracks() const { return *theTracks; } + ALPAKA_FN_ACC ALPAKA_FN_INLINE CellNeighbors& outerNeighbors() { return *theOuterNeighbors; } + ALPAKA_FN_ACC ALPAKA_FN_INLINE CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; } + ALPAKA_FN_ACC ALPAKA_FN_INLINE float inner_x(const HitsConstView& hh) const { return hh[theInnerHitId].xGlobal(); } + ALPAKA_FN_ACC ALPAKA_FN_INLINE float outer_x(const HitsConstView& hh) const { return hh[theOuterHitId].xGlobal(); } + ALPAKA_FN_ACC ALPAKA_FN_INLINE float inner_y(const HitsConstView& hh) const { return hh[theInnerHitId].yGlobal(); } + ALPAKA_FN_ACC ALPAKA_FN_INLINE float outer_y(const HitsConstView& hh) const { return hh[theOuterHitId].yGlobal(); } + ALPAKA_FN_ACC ALPAKA_FN_INLINE float inner_z(const HitsConstView& hh) const { return theInnerZ; } + ALPAKA_FN_ACC ALPAKA_FN_INLINE float outer_z(const HitsConstView& hh) const { return hh[theOuterHitId].zGlobal(); } + ALPAKA_FN_ACC ALPAKA_FN_INLINE float inner_r(const HitsConstView& hh) const { return theInnerR; } + ALPAKA_FN_ACC ALPAKA_FN_INLINE float outer_r(const HitsConstView& hh) const { return hh[theOuterHitId].rGlobal(); } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE auto inner_iphi(const HitsConstView& hh) const { return hh[theInnerHitId].iphi(); } + ALPAKA_FN_ACC ALPAKA_FN_INLINE auto outer_iphi(const HitsConstView& hh) const { return hh[theOuterHitId].iphi(); } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE float inner_detIndex(const HitsConstView& hh) const { + return hh[theInnerHitId].detectorIndex(); + } + ALPAKA_FN_ACC ALPAKA_FN_INLINE float outer_detIndex(const HitsConstView& hh) const { + return hh[theOuterHitId].detectorIndex(); + } + + constexpr unsigned int inner_hit_id() const { return theInnerHitId; } + 
constexpr unsigned int outer_hit_id() const { return theOuterHitId; } + + ALPAKA_FN_ACC void print_cell() const { + printf("printing cell: on layerPair: %d, innerHitId: %d, outerHitId: %d \n", + theLayerPairId_, + theInnerHitId, + theOuterHitId); + } + + ALPAKA_FN_ACC bool check_alignment(const HitsConstView& hh, + CACellT const& otherCell, + const float ptmin, + const float hardCurvCut, + const float caThetaCutBarrel, + const float caThetaCutForward, + const float dcaCutInnerTriplet, + const float dcaCutOuterTriplet) const { + // detIndex of the layerStart for the Phase1 Pixel Detector: + // [BPX1, BPX2, BPX3, BPX4, FP1, FP2, FP3, FN1, FN2, FN3, LAST_VALID] + // [ 0, 96, 320, 672, 1184, 1296, 1408, 1520, 1632, 1744, 1856] + auto ri = inner_r(hh); + auto zi = inner_z(hh); + + auto ro = outer_r(hh); + auto zo = outer_z(hh); + + auto r1 = otherCell.inner_r(hh); + auto z1 = otherCell.inner_z(hh); + auto isBarrel = otherCell.outer_detIndex(hh) < TrackerTraits::last_barrel_detIndex; + // TODO tune CA cuts below (theta and dca) + bool aligned = areAlignedRZ(r1, z1, ri, zi, ro, zo, ptmin, isBarrel ? caThetaCutBarrel : caThetaCutForward); + return (aligned && dcaCut(hh, + otherCell, + otherCell.inner_detIndex(hh) < TrackerTraits::last_bpix1_detIndex ? 
dcaCutInnerTriplet + : dcaCutOuterTriplet, + hardCurvCut)); + } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) static bool areAlignedRZ( + float r1, float z1, float ri, float zi, float ro, float zo, const float ptmin, const float thetaCut) { + float radius_diff = std::abs(r1 - ro); + float distance_13_squared = radius_diff * radius_diff + (z1 - zo) * (z1 - zo); + + float pMin = ptmin * std::sqrt(distance_13_squared); // this needs to be divided by + // radius_diff later + + float tan_12_13_half_mul_distance_13_squared = fabs(z1 * (ri - ro) + zi * (ro - r1) + zo * (r1 - ri)); + return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff; + } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool dcaCut(const HitsConstView& hh, + CACellT const& otherCell, + const float region_origin_radius_plus_tolerance, + const float maxCurv) const { + auto x1 = otherCell.inner_x(hh); + auto y1 = otherCell.inner_y(hh); + + auto x2 = inner_x(hh); + auto y2 = inner_y(hh); + + auto x3 = outer_x(hh); + auto y3 = outer_y(hh); + + CircleEq eq(x1, y1, x2, y2, x3, y3); + + if (eq.curvature() > maxCurv) + return false; + + return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); + } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) static bool dcaCutH( + float x1, + float y1, + float x2, + float y2, + float x3, + float y3, + const float region_origin_radius_plus_tolerance, + const float maxCurv) { + CircleEq eq(x1, y1, x2, y2, x3, y3); + + if (eq.curvature() > maxCurv) + return false; + + return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); + } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool hole0(const HitsConstView& hh, CACellT const& innerCell) const { + using namespace phase1PixelTopology; + + int p = innerCell.inner_iphi(hh); + if (p < 0) + p += std::numeric_limits::max(); + p = (max_ladder_bpx0 * p) / std::numeric_limits::max(); + p %= max_ladder_bpx0; + 
auto il = first_ladder_bpx0 + p; + auto r0 = hh.averageGeometry().ladderR[il]; + auto ri = innerCell.inner_r(hh); + auto zi = innerCell.inner_z(hh); + auto ro = outer_r(hh); + auto zo = outer_z(hh); + auto z0 = zi + (r0 - ri) * (zo - zi) / (ro - ri); + auto z_in_ladder = std::abs(z0 - hh.averageGeometry().ladderZ[il]); + auto z_in_module = z_in_ladder - module_length_bpx0 * int(z_in_ladder / module_length_bpx0); + auto gap = z_in_module < module_tolerance_bpx0 || z_in_module > (module_length_bpx0 - module_tolerance_bpx0); + return gap; + } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool hole4(const HitsConstView& hh, CACellT const& innerCell) const { + using namespace phase1PixelTopology; + + int p = outer_iphi(hh); + if (p < 0) + p += std::numeric_limits::max(); + p = (max_ladder_bpx4 * p) / std::numeric_limits::max(); + p %= max_ladder_bpx4; + auto il = first_ladder_bpx4 + p; + auto r4 = hh.averageGeometry().ladderR[il]; + auto ri = innerCell.inner_r(hh); + auto zi = innerCell.inner_z(hh); + auto ro = outer_r(hh); + auto zo = outer_z(hh); + auto z4 = zo + (r4 - ro) * (zo - zi) / (ro - ri); + auto z_in_ladder = std::abs(z4 - hh.averageGeometry().ladderZ[il]); + auto z_in_module = z_in_ladder - module_length_bpx4 * int(z_in_ladder / module_length_bpx4); + auto gap = z_in_module < module_tolerance_bpx4 || z_in_module > (module_length_bpx4 - module_tolerance_bpx4); + auto holeP = z4 > hh.averageGeometry().ladderMaxZ[il] && z4 < hh.averageGeometry().endCapZ[0]; + auto holeN = z4 < hh.averageGeometry().ladderMinZ[il] && z4 > hh.averageGeometry().endCapZ[1]; + return gap || holeP || holeN; + } + + // trying to free the track building process from hardcoded layers, leaving + // the visit of the graph based on the neighborhood connections between cells. 
+ template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void find_ntuplets(TAcc const& acc, + const HitsConstView& hh, + CACellT* __restrict__ cells, + CellTracksVector& cellTracks, + HitContainer& foundNtuplets, + cms::alpakatools::AtomicPairCounter& apc, + Quality* __restrict__ quality, + TmpTuple& tmpNtuplet, + const unsigned int minHitsPerNtuplet, + bool startAt0) const { + // the building process for a track ends if: + // it has no right neighbor + // it has no compatible neighbor + // the ntuplets is then saved if the number of hits it contains is greater + // than a threshold + + if constexpr (DEPTH <= 0) { + printf("ERROR: CACellT::find_ntuplets reached full depth!\n"); + ALPAKA_ASSERT_OFFLOAD(false); + } else { + auto doubletId = this - cells; + tmpNtuplet.push_back_unsafe(doubletId); + ALPAKA_ASSERT_OFFLOAD(tmpNtuplet.size() <= int(TrackerTraits::maxHitsOnTrack - 3)); + + bool last = true; + for (unsigned int otherCell : outerNeighbors()) { + if (cells[otherCell].isKilled()) + continue; // killed by earlyFishbone + last = false; + cells[otherCell].template find_ntuplets( + acc, hh, cells, cellTracks, foundNtuplets, apc, quality, tmpNtuplet, minHitsPerNtuplet, startAt0); + } + if (last) { // if long enough save... 
+ if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) { +#ifdef ONLY_TRIPLETS_IN_HOLE + // triplets accepted only pointing to the hole + if (tmpNtuplet.size() >= 3 || (startAt0 && hole4(hh, cells[tmpNtuplet[0]])) || + ((!startAt0) && hole0(hh, cells[tmpNtuplet[0]]))) +#endif + { + hindex_type hits[TrackerTraits::maxDepth + 2]; + auto nh = 0U; + constexpr int maxFB = 2; // for the time being let's limit this + int nfb = 0; + for (auto c : tmpNtuplet) { + hits[nh++] = cells[c].theInnerHitId; + if (nfb < maxFB && cells[c].hasFishbone()) { + ++nfb; + hits[nh++] = cells[c].theFishboneId; // Fishbone hit is always outer than inner hit + } + } + assert(nh < TrackerTraits::maxHitsOnTrack); + hits[nh] = theOuterHitId; + auto it = foundNtuplets.bulkFill(acc, apc, hits, nh + 1); + if (it >= 0) { // if negative is overflow.... + for (auto c : tmpNtuplet) + cells[c].addTrack(acc, it, cellTracks); + quality[it] = bad; // initialize to bad + } + } + } + } + tmpNtuplet.pop_back(); + assert(tmpNtuplet.size() < int(TrackerTraits::maxHitsOnTrack - 1)); + } + } + + // Cell status management + ALPAKA_FN_ACC ALPAKA_FN_INLINE void kill() { theStatus_ |= uint16_t(StatusBit::kKilled); } + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool isKilled() const { return theStatus_ & uint16_t(StatusBit::kKilled); } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE int16_t layerPairId() const { return theLayerPairId_; } + + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool unused() const { return 0 == (uint16_t(StatusBit::kUsed) & theStatus_); } + ALPAKA_FN_ACC ALPAKA_FN_INLINE void setStatusBits(StatusBit mask) { theStatus_ |= uint16_t(mask); } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void setFishbone(TAcc const& acc, hindex_type id, float z, const HitsConstView& hh) { + // make it deterministic: use the farther apart (in z) + auto old = theFishboneId; + while (old != + alpaka::atomicCas( + acc, + &theFishboneId, + old, + (invalidHitId == old || std::abs(z - theInnerZ) > std::abs(hh[old].zGlobal() - theInnerZ)) ? 
id : old, + alpaka::hierarchy::Blocks{})) + old = theFishboneId; + } + ALPAKA_FN_ACC ALPAKA_FN_INLINE auto fishboneId() const { return theFishboneId; } + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool hasFishbone() const { return theFishboneId != invalidHitId; } + + private: + CellNeighbors* theOuterNeighbors; + CellTracks* theTracks; + + int16_t theLayerPairId_; + uint16_t theStatus_; // tbd + + float theInnerZ; + float theInnerR; + hindex_type theInnerHitId; + hindex_type theOuterHitId; + hindex_type theFishboneId; + }; +} // namespace ALPAKA_ACCELERATOR_NAMESPACE +#endif // RecoPixelVertexing_PixelTriplets_CACellT_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h new file mode 100644 index 0000000000000..343e0cf9ad005 --- /dev/null +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAFishbone.h @@ -0,0 +1,148 @@ +#ifndef RecoPixelVertexing_PixelTriplets_alpaka_CAFishbone_h +#define RecoPixelVertexing_PixelTriplets_alpaka_CAFishbone_h + +#include +#include +#include +#include +#include + +#include +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" +#include "HeterogeneousCore/AlpakaInterface/interface/VecArray.h" +#include "DataFormats/Math/interface/approx_atan2.h" + +#include "CACell.h" +#include "CAStructures.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace caPixelDoublets { + + template + using CellNeighbors = caStructures::CellNeighborsT; + template + using CellTracks = caStructures::CellTracksT; + template + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + template + using CellTracksVector = caStructures::CellTracksVectorT; + template + using OuterHitOfCell = caStructures::OuterHitOfCellT; + template + using HitsConstView = typename CACellT::HitsConstView; + + template + class CAFishbone { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, + HitsConstView hh, + 
CACellT* cells, + uint32_t const* __restrict__ nCells, + OuterHitOfCell const* isOuterHitOfCellWrap, + int32_t nHits, + bool checkTrack) const { + if (nHits <= isOuterHitOfCellWrap->offset) + return; + constexpr auto maxCellsPerHit = CACellT::maxCellsPerHit; + + auto const isOuterHitOfCell = isOuterHitOfCellWrap->container; + + // x runs faster... + + float x[maxCellsPerHit], y[maxCellsPerHit], z[maxCellsPerHit], n[maxCellsPerHit]; + uint16_t d[maxCellsPerHit]; + uint32_t cc[maxCellsPerHit]; + uint8_t l[maxCellsPerHit]; + const uint32_t dimIndexY = 0u; + const uint32_t dimIndexX = 1u; + const uint32_t blockDimensionX(alpaka::getWorkDiv(acc)[dimIndexX]); + const auto& [firstElementIdxNoStrideX, endElementIdxNoStrideX] = + cms::alpakatools::element_index_range_in_block(acc, 0u, dimIndexX); + + // Outermost loop on Y + const uint32_t gridDimensionY(alpaka::getWorkDiv(acc)[dimIndexY]); + const auto& [firstElementIdxNoStrideY, endElementIdxNoStrideY] = + cms::alpakatools::element_index_range_in_grid(acc, 0u, dimIndexY); + uint32_t firstElementIdxY = firstElementIdxNoStrideY; + uint32_t endElementIdxY = endElementIdxNoStrideY; + + for (uint32_t idy = firstElementIdxY, nt = nHits; idy < nt; ++idy) { + if (not cms::alpakatools::next_valid_element_index_strided( + idy, firstElementIdxY, endElementIdxY, gridDimensionY, nt)) + break; + + auto const& vc = isOuterHitOfCell[idy]; + auto s = vc.size(); + if (s < 2) + continue; + + auto const& c0 = cells[vc[0]]; + auto xo = c0.outer_x(hh); + auto yo = c0.outer_y(hh); + auto zo = c0.outer_z(hh); + auto sg = 0; + for (int32_t ic = 0; ic < s; ++ic) { + auto& ci = cells[vc[ic]]; + if (ci.unused()) + continue; // for triplets equivalent to next + if (checkTrack && ci.tracks().empty()) + continue; + cc[sg] = vc[ic]; + d[sg] = ci.inner_detIndex(hh); + l[sg] = ci.layerPairId(); + x[sg] = ci.inner_x(hh) - xo; + y[sg] = ci.inner_y(hh) - yo; + z[sg] = ci.inner_z(hh) - zo; + n[sg] = x[sg] * x[sg] + y[sg] * y[sg] + z[sg] * z[sg]; + ++sg; + } + 
if (sg < 2) + continue; + // here we parallelize in X + uint32_t firstElementIdxX = firstElementIdxNoStrideX; + uint32_t endElementIdxX = endElementIdxNoStrideX; + for (uint32_t ic = firstElementIdxX; (int)ic < sg - 1; ++ic) { + if (not cms::alpakatools::next_valid_element_index_strided( + ic, firstElementIdxX, endElementIdxX, blockDimensionX, sg - 1)) + break; + + auto& ci = cells[cc[ic]]; + for (auto jc = ic + 1; (int)jc < sg; ++jc) { + auto& cj = cells[cc[jc]]; + // must be different detectors (in the same layer) + // if (d[ic]==d[jc]) continue; + // || l[ic]!=l[jc]) continue; + auto cos12 = x[ic] * x[jc] + y[ic] * y[jc] + z[ic] * z[jc]; + + if (d[ic] != d[jc] && cos12 * cos12 >= 0.99999f * (n[ic] * n[jc])) { + // aligned: kill farthest (prefer consecutive layers) + // if same layer prefer farthest (longer lever arm) and make space for intermediate hit + bool sameLayer = l[ic] == l[jc]; + if (n[ic] > n[jc]) { + if (sameLayer) { + cj.kill(); // closest + ci.setFishbone(acc, cj.inner_hit_id(), cj.inner_z(hh), hh); + } else { + ci.kill(); // farthest + // break; // removed to improve reproducibility. keep it for reference and tests + } + } else { + if (!sameLayer) { + cj.kill(); // farthest + } else { + ci.kill(); // closest + cj.setFishbone(acc, ci.inner_hit_id(), ci.inner_z(hh), hh); + // break; // removed to improve reproducibility. 
keep it for reference and tests + } + } + } + } //cj + } // ci + } // hits + } + }; + } // namespace caPixelDoublets +} // namespace ALPAKA_ACCELERATOR_NAMESPACE +#endif // RecoPixelVertexing_PixelTriplets_alpaka_CAFishbone_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtuplet.cc b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtuplet.cc new file mode 100644 index 0000000000000..c16aed2e0b1e8 --- /dev/null +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtuplet.cc @@ -0,0 +1,95 @@ +#include + +#include "DataFormats/TrackSoA/interface/TracksHost.h" +#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h" +#include "DataFormats/TrackSoA/interface/TracksDevice.h" +#include "DataFormats/TrackingRecHitSoA/interface/alpaka/TrackingRecHitsSoACollection.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/ESGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "FWCore/Utilities/interface/RunningAverage.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EDGetToken.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EDPutToken.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/Event.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EventSetup.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/stream/EDProducer.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" +#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" +#include "RecoLocalTracker/Records/interface/PixelCPEFastParamsRecord.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/alpaka/PixelCPEFastParamsCollection.h" + 
+#include "CAHitNtupletGenerator.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + template + class CAHitNtupletAlpaka : public stream::EDProducer<> { + using HitsConstView = TrackingRecHitSoAConstView; + using HitsOnDevice = TrackingRecHitsSoACollection; + using HitsOnHost = TrackingRecHitHost; + + using TkSoAHost = TracksHost; + using TkSoADevice = TracksSoACollection; + + using Algo = CAHitNtupletGenerator; + + public: + explicit CAHitNtupletAlpaka(const edm::ParameterSet& iConfig); + ~CAHitNtupletAlpaka() override = default; + void produce(device::Event& iEvent, const device::EventSetup& es) override; + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + private: + const edm::ESGetToken tokenField_; + const device::ESGetToken, PixelCPEFastParamsRecord> cpeToken_; + const device::EDGetToken tokenHit_; + const device::EDPutToken tokenTrack_; + + Algo deviceAlgo_; + }; + + template + CAHitNtupletAlpaka::CAHitNtupletAlpaka(const edm::ParameterSet& iConfig) + : tokenField_(esConsumes()), + cpeToken_(esConsumes(edm::ESInputTag("", iConfig.getParameter("CPE")))), + tokenHit_(consumes(iConfig.getParameter("pixelRecHitSrc"))), + tokenTrack_(produces()), + deviceAlgo_(iConfig) {} + + template + void CAHitNtupletAlpaka::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.add("pixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplittingAlpaka")); + + std::string cpe = "PixelCPEFastParams"; + cpe += TrackerTraits::nameModifier; + desc.add("CPE", cpe); + + Algo::fillPSetDescription(desc); + descriptions.addWithDefaultLabel(desc); + } + + template + void CAHitNtupletAlpaka::produce(device::Event& iEvent, const device::EventSetup& es) { + auto bf = 1. 
/ es.getData(tokenField_).inverseBzAtOriginInGeV(); + + auto& fcpe = es.getData(cpeToken_); + + auto const& hits = iEvent.get(tokenHit_); + + iEvent.emplace(tokenTrack_, deviceAlgo_.makeTuplesAsync(hits, fcpe.const_buffer().data(), bf, iEvent.queue())); + } + + using CAHitNtupletAlpakaPhase1 = CAHitNtupletAlpaka; + using CAHitNtupletAlpakaPhase2 = CAHitNtupletAlpaka; +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/MakerMacros.h" + +DEFINE_FWK_ALPAKA_MODULE(CAHitNtupletAlpakaPhase1); +DEFINE_FWK_ALPAKA_MODULE(CAHitNtupletAlpakaPhase2); diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.cc b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.cc new file mode 100644 index 0000000000000..8f898872a66f4 --- /dev/null +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.cc @@ -0,0 +1,329 @@ +// +// Original Author: Felice Pantaleo, CERN +// + +//#define GPU_DEBUG +//#define DUMP_GPU_TK_TUPLES + +#include +#include +#include +#include + +#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h" +#include "DataFormats/TrackSoA/interface/TracksDevice.h" +#include "DataFormats/TrackSoA/interface/TracksHost.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/Exception.h" + +#include "CAHitNtupletGenerator.h" +#include "CAHitNtupletGeneratorKernels.h" +#include "CAPixelDoublets.h" +#include "CAPixelDoubletsAlgos.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace { + + using namespace caHitNtupletGenerator; + using namespace caPixelDoublets; + using namespace pixelTopology; + using namespace pixelTrack; + + template + T sqr(T x) { + return x * x; + } + + //Common Params + void fillDescriptionsCommon(edm::ParameterSetDescription& desc) { + // 87 cm/GeV = 1/(3.8T * 0.3) + // take less than radius given by the hardPtCut and reject everything 
below + // auto hardCurvCut = 1.f/(0.35 * 87.f); + desc.add("ptmin", 0.9f)->setComment("Cut on minimum pt"); + desc.add("CAThetaCutBarrel", 0.002f)->setComment("Cut on RZ alignement for Barrel"); + desc.add("CAThetaCutForward", 0.003f)->setComment("Cut on RZ alignment for Forward"); + desc.add("hardCurvCut", 1.f / (0.35 * 87.f)) + ->setComment("Cut on minimum curvature, used in DCA ntuplet selection"); + desc.add("dcaCutInnerTriplet", 0.15f)->setComment("Cut on origin radius when the inner hit is on BPix1"); + desc.add("dcaCutOuterTriplet", 0.25f)->setComment("Cut on origin radius when the outer hit is on BPix1"); + desc.add("earlyFishbone", true); + desc.add("lateFishbone", false); + desc.add("fillStatistics", false); + desc.add("minHitsPerNtuplet", 4); + desc.add("minHitsForSharingCut", 10) + ->setComment("Maximum number of hits in a tuple to clean also if the shared hit is on bpx1"); + + desc.add("fitNas4", false)->setComment("fit only 4 hits out of N"); + desc.add("doClusterCut", true); + desc.add("doZ0Cut", true); + desc.add("doPtCut", true); + desc.add("useRiemannFit", false)->setComment("true for Riemann, false for BrokenLine"); + desc.add("doSharedHitCut", true)->setComment("Sharing hit nTuples cleaning"); + desc.add("dupPassThrough", false)->setComment("Do not reject duplicate"); + desc.add("useSimpleTripletCleaner", true)->setComment("use alternate implementation"); + } + + AlgoParams makeCommonParams(edm::ParameterSet const& cfg) { + return AlgoParams({cfg.getParameter("minHitsForSharingCut"), + cfg.getParameter("useRiemannFit"), + cfg.getParameter("fitNas4"), + cfg.getParameter("includeJumpingForwardDoublets"), + cfg.getParameter("earlyFishbone"), + cfg.getParameter("lateFishbone"), + cfg.getParameter("fillStatistics"), + cfg.getParameter("doSharedHitCut"), + cfg.getParameter("dupPassThrough"), + cfg.getParameter("useSimpleTripletCleaner")}); + } + + //This is needed to have the partial specialization for isPhase1Topology/isPhase2Topology + template + 
struct TopologyCuts {}; + + template + struct TopologyCuts> { + static constexpr CAParamsT makeCACuts(edm::ParameterSet const& cfg) { + return CAParamsT{{cfg.getParameter("maxNumberOfDoublets"), + cfg.getParameter("minHitsPerNtuplet"), + (float)cfg.getParameter("ptmin"), + (float)cfg.getParameter("CAThetaCutBarrel"), + (float)cfg.getParameter("CAThetaCutForward"), + (float)cfg.getParameter("hardCurvCut"), + (float)cfg.getParameter("dcaCutInnerTriplet"), + (float)cfg.getParameter("dcaCutOuterTriplet")}}; + }; + + static constexpr ::pixelTrack::QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { + auto coeff = pset.getParameter>("chi2Coeff"); + auto ptMax = pset.getParameter("chi2MaxPt"); + + coeff[1] = (coeff[1] - coeff[0]) / log2(ptMax); + return ::pixelTrack::QualityCutsT{// polynomial coefficients for the pT-dependent chi2 cut + {(float)coeff[0], (float)coeff[1], 0.f, 0.f}, + // max pT used to determine the chi2 cut + (float)ptMax, + // chi2 scale factor: 8 for broken line fit, ?? 
for Riemann fit + (float)pset.getParameter("chi2Scale"), + // regional cuts for triplets + {(float)pset.getParameter("tripletMaxTip"), + (float)pset.getParameter("tripletMinPt"), + (float)pset.getParameter("tripletMaxZip")}, + // regional cuts for quadruplets + {(float)pset.getParameter("quadrupletMaxTip"), + (float)pset.getParameter("quadrupletMinPt"), + (float)pset.getParameter("quadrupletMaxZip")}}; + } + }; + + template + struct TopologyCuts> { + static constexpr CAParamsT makeCACuts(edm::ParameterSet const& cfg) { + return CAParamsT{{cfg.getParameter("maxNumberOfDoublets"), + cfg.getParameter("minHitsPerNtuplet"), + (float)cfg.getParameter("ptmin"), + (float)cfg.getParameter("CAThetaCutBarrel"), + (float)cfg.getParameter("CAThetaCutForward"), + (float)cfg.getParameter("hardCurvCut"), + (float)cfg.getParameter("dcaCutInnerTriplet"), + (float)cfg.getParameter("dcaCutOuterTriplet")}, + {(bool)cfg.getParameter("includeFarForwards")}}; + } + + static constexpr ::pixelTrack::QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { + return ::pixelTrack::QualityCutsT{ + static_cast(pset.getParameter("maxChi2")), + static_cast(pset.getParameter("minPt")), + static_cast(pset.getParameter("maxTip")), + static_cast(pset.getParameter("maxZip")), + }; + } + }; + + //Cell Cuts, as they are the cuts have the same logic for Phase2 and Phase1 + //keeping them separate would allow further differentiation in the future + //moving them to TopologyCuts and using the same syntax + template + CellCutsT makeCellCuts(edm::ParameterSet const& cfg) { + return CellCutsT{cfg.getParameter("doClusterCut"), + cfg.getParameter("doZ0Cut"), + cfg.getParameter("doPtCut"), + cfg.getParameter("idealConditions"), + (float)cfg.getParameter("cellZ0Cut"), + (float)cfg.getParameter("cellPtCut"), + cfg.getParameter>("phiCuts")}; + } + + } // namespace + + using namespace std; + + template + CAHitNtupletGenerator::CAHitNtupletGenerator(const edm::ParameterSet& cfg) + : 
m_params(makeCommonParams(cfg), + makeCellCuts(cfg), + TopologyCuts::makeQualityCuts(cfg.getParameterSet("trackQualityCuts")), + TopologyCuts::makeCACuts(cfg)) { +#ifdef DUMP_GPU_TK_TUPLES + printf("TK: %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s\n", + "tid", + "qual", + "nh", + "nl", + "charge", + "pt", + "eta", + "phi", + "tip", + "zip", + "chi2", + "h1", + "h2", + "h3", + "h4", + "h5", + "hn"); +#endif + } + + template + void CAHitNtupletGenerator::fillPSetDescription(edm::ParameterSetDescription& desc) { + static_assert(sizeof(TrackerTraits) == 0, + "Note: this fillPSetDescription is a dummy one. Please specialise it for the correct version of " + "CAHitNtupletGenerator."); + } + + template <> + void CAHitNtupletGenerator::fillPSetDescription(edm::ParameterSetDescription& desc) { + fillDescriptionsCommon(desc); + + desc.add("maxNumberOfDoublets", pixelTopology::Phase1::maxNumberOfDoublets); + desc.add("idealConditions", true); + desc.add("includeJumpingForwardDoublets", false); + desc.add("cellZ0Cut", 12.0); + desc.add("cellPtCut", 0.5); + + edm::ParameterSetDescription trackQualityCuts; + trackQualityCuts.add("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut"); + trackQualityCuts.add>("chi2Coeff", {0.9, 1.8})->setComment("chi2 at 1GeV and at ptMax above"); + trackQualityCuts.add("chi2Scale", 8.) + ->setComment( + "Factor to multiply the pT-dependent chi2 cut (currently: 8 for the broken line fit, ?? 
for the Riemann " + "fit)"); + trackQualityCuts.add("tripletMinPt", 0.5)->setComment("Min pT for triplets, in GeV"); + trackQualityCuts.add("tripletMaxTip", 0.3)->setComment("Max |Tip| for triplets, in cm"); + trackQualityCuts.add("tripletMaxZip", 12.)->setComment("Max |Zip| for triplets, in cm"); + trackQualityCuts.add("quadrupletMinPt", 0.3)->setComment("Min pT for quadruplets, in GeV"); + trackQualityCuts.add("quadrupletMaxTip", 0.5)->setComment("Max |Tip| for quadruplets, in cm"); + trackQualityCuts.add("quadrupletMaxZip", 12.)->setComment("Max |Zip| for quadruplets, in cm"); + desc.add("trackQualityCuts", trackQualityCuts) + ->setComment( + "Quality cuts based on the results of the track fit:\n - apply a pT-dependent chi2 cut;\n - apply " + "\"region " + "cuts\" based on the fit results (pT, Tip, Zip)."); + + desc.add>( + "phiCuts", + std::vector(std::begin(phase1PixelTopology::phicuts), std::end(phase1PixelTopology::phicuts))) + ->setComment("Cuts in phi for cells"); + } + + template <> + void CAHitNtupletGenerator::fillPSetDescription(edm::ParameterSetDescription& desc) { + fillDescriptionsCommon(desc); + + desc.add("maxNumberOfDoublets", pixelTopology::HIonPhase1::maxNumberOfDoublets); + desc.add("idealConditions", false); + desc.add("includeJumpingForwardDoublets", false); + desc.add("cellZ0Cut", 10.0); + desc.add("cellPtCut", 0.0); + + edm::ParameterSetDescription trackQualityCuts; + trackQualityCuts.add("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut"); + trackQualityCuts.add>("chi2Coeff", {0.9, 1.8})->setComment("chi2 at 1GeV and at ptMax above"); + trackQualityCuts.add("chi2Scale", 8.) + ->setComment( + "Factor to multiply the pT-dependent chi2 cut (currently: 8 for the broken line fit, ?? 
for the Riemann " + "fit)"); + trackQualityCuts.add("tripletMinPt", 0.0)->setComment("Min pT for triplets, in GeV"); + trackQualityCuts.add("tripletMaxTip", 0.1)->setComment("Max |Tip| for triplets, in cm"); + trackQualityCuts.add("tripletMaxZip", 6.)->setComment("Max |Zip| for triplets, in cm"); + trackQualityCuts.add("quadrupletMinPt", 0.0)->setComment("Min pT for quadruplets, in GeV"); + trackQualityCuts.add("quadrupletMaxTip", 0.5)->setComment("Max |Tip| for quadruplets, in cm"); + trackQualityCuts.add("quadrupletMaxZip", 6.)->setComment("Max |Zip| for quadruplets, in cm"); + + desc.add("trackQualityCuts", trackQualityCuts) + ->setComment( + "Quality cuts based on the results of the track fit:\n - apply a pT-dependent chi2 cut;\n - apply " + "\"region " + "cuts\" based on the fit results (pT, Tip, Zip)."); + + desc.add>( + "phiCuts", + std::vector(std::begin(phase1PixelTopology::phicuts), std::end(phase1PixelTopology::phicuts))) + ->setComment("Cuts in phi for cells"); + } + + template <> + void CAHitNtupletGenerator::fillPSetDescription(edm::ParameterSetDescription& desc) { + fillDescriptionsCommon(desc); + + desc.add("maxNumberOfDoublets", pixelTopology::Phase2::maxNumberOfDoublets); + desc.add("idealConditions", false); + desc.add("includeFarForwards", true); + desc.add("includeJumpingForwardDoublets", true); + desc.add("cellZ0Cut", 7.5); + desc.add("cellPtCut", 0.85); + + edm::ParameterSetDescription trackQualityCuts; + trackQualityCuts.add("maxChi2", 5.)->setComment("Max normalized chi2"); + trackQualityCuts.add("minPt", 0.5)->setComment("Min pT in GeV"); + trackQualityCuts.add("maxTip", 0.3)->setComment("Max |Tip| in cm"); + trackQualityCuts.add("maxZip", 12.)->setComment("Max |Zip|, in cm"); + desc.add("trackQualityCuts", trackQualityCuts) + ->setComment( + "Quality cuts based on the results of the track fit:\n - apply cuts based on the fit results (pT, Tip, " + "Zip)."); + + desc.add>( + "phiCuts", + std::vector(std::begin(phase2PixelTopology::phicuts), 
std::end(phase2PixelTopology::phicuts))) + ->setComment("Cuts in phi for cells"); + } + + template + TracksSoACollection CAHitNtupletGenerator::makeTuplesAsync( + HitsOnDevice const& hits_d, ParamsOnDevice const* cpeParams, float bfield, Queue& queue) const { + using HelixFit = HelixFit; + using TrackSoA = TracksSoACollection; + using GPUKernels = CAHitNtupletGeneratorKernels; + + TrackSoA tracks(queue); + + GPUKernels kernels(m_params, hits_d.view().metadata().size(), queue); + + kernels.buildDoublets(hits_d.view(), queue); + kernels.launchKernels(hits_d.view(), tracks.view(), queue); + + HelixFit fitter(bfield, m_params.fitNas4_); + fitter.allocate(kernels.tupleMultiplicity(), tracks.view()); + if (m_params.useRiemannFit_) { + fitter.launchRiemannKernels( + hits_d.view(), cpeParams, hits_d.view().metadata().size(), TrackerTraits::maxNumberOfQuadruplets, queue); + } else { + fitter.launchBrokenLineKernels( + hits_d.view(), cpeParams, hits_d.view().metadata().size(), TrackerTraits::maxNumberOfQuadruplets, queue); + } + kernels.classifyTuples(hits_d.view(), tracks.view(), queue); +#ifdef GPU_DEBUG + alpaka::wait(queue); + std::cout << "finished building pixel tracks on GPU" << std::endl; +#endif + + return tracks; + } + + template class CAHitNtupletGenerator; + template class CAHitNtupletGenerator; + template class CAHitNtupletGenerator; +} // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.h new file mode 100644 index 0000000000000..826b92d4a195a --- /dev/null +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGenerator.h @@ -0,0 +1,86 @@ +#ifndef RecoPixelVertexing_PixelTriplets_Alpaka_CAHitNtupletGenerator_h +#define RecoPixelVertexing_PixelTriplets_Alpaka_CAHitNtupletGenerator_h + +#include + +#include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" +#include "DataFormats/TrackSoA/interface/TrackDefinitions.h" 
+#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h" +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" +#include "DataFormats/TrackingRecHitSoA/interface/alpaka/TrackingRecHitsSoACollection.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h" + +#include "CAHitNtupletGeneratorKernels.h" +#include "CACell.h" +#include "HelixFit.h" + +namespace edm { + class ParameterSetDescription; +} // namespace edm + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + template + class CAHitNtupletGenerator { + public: + using HitsView = TrackingRecHitSoAView; + using HitsConstView = TrackingRecHitSoAConstView; + using HitsOnDevice = TrackingRecHitsSoACollection; + using HitsOnHost = TrackingRecHitHost; + using hindex_type = typename TrackingRecHitSoA::hindex_type; + + using HitToTuple = caStructures::HitToTupleT; + using TupleMultiplicity = caStructures::TupleMultiplicityT; + using OuterHitOfCell = caStructures::OuterHitOfCellT; + + using CACell = CACellT; + using TkSoAHost = TracksHost; + using TkSoADevice = TracksSoACollection; + using HitContainer = typename reco::TrackSoA::HitContainer; + using Tuple = HitContainer; + + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + using CellTracksVector = caStructures::CellTracksVectorT; + + using Quality = ::pixelTrack::Quality; + + using QualityCuts = ::pixelTrack::QualityCutsT; + using Params = caHitNtupletGenerator::ParamsT; + using Counters = caHitNtupletGenerator::Counters; + + using ParamsOnDevice = pixelCPEforDevice::ParamsOnDeviceT; + + public: + CAHitNtupletGenerator(const edm::ParameterSet& cfg); + + static void fillPSetDescription(edm::ParameterSetDescription& desc); + + // NOTE: beginJob and endJob were meant to be used + // to fill the statistics. 
This is still not implemented in Alpaka + // since we are missing the begin/endJob functionality for the Alpaka + // producers. + // + // void beginJob(); + // void endJob(); + + TkSoADevice makeTuplesAsync(HitsOnDevice const& hits_d, + ParamsOnDevice const* cpeParams, + float bfield, + Queue& queue) const; + + private: + void buildDoublets(const HitsConstView& hh, Queue& queue) const; + + void hitNtuplets(const HitsConstView& hh, const edm::EventSetup& es, bool useRiemannFit, Queue& queue); + + void launchKernels(const HitsConstView& hh, bool useRiemannFit, Queue& queue) const; + + Params m_params; + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#endif // RecoPixelVertexing_PixelTriplets_Alpaka_CAHitNtupletGenerator_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.dev.cc b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.dev.cc new file mode 100644 index 0000000000000..44e3295bdb606 --- /dev/null +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.dev.cc @@ -0,0 +1,538 @@ +#include +#include "HeterogeneousCore/AlpakaInterface/interface/devices.h" +#include "HeterogeneousCore/AlpakaInterface/interface/host.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" +#include "CAHitNtupletGeneratorKernels.h" +#include "CAHitNtupletGeneratorKernelsImpl.h" +#ifdef DUMP_GPU_TK_TUPLES +#include +#endif + +//#define GPU_DEBUG +//#define NTUPLE_DEBUG + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + template + CAHitNtupletGeneratorKernels::CAHitNtupletGeneratorKernels(Params const &params, + uint32_t nhits, + Queue &queue) + : m_params(params), + ////////////////////////////////////////////////////////// + // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) + 
////////////////////////////////////////////////////////// + counters_{cms::alpakatools::make_device_buffer(queue)}, + + // workspace + device_hitToTuple_{cms::alpakatools::make_device_buffer(queue)}, + device_tupleMultiplicity_{cms::alpakatools::make_device_buffer(queue)}, + + // NB: In legacy, device_theCells_ and device_isOuterHitOfCell_ were allocated inside buildDoublets + device_theCells_{ + cms::alpakatools::make_device_buffer(queue, m_params.caParams_.maxNumberOfDoublets_)}, + // in principle we can use "nhits" to heuristically dimension the workspace... + device_isOuterHitOfCell_{ + cms::alpakatools::make_device_buffer(queue, std::max(1u, nhits))}, + isOuterHitOfCell_{cms::alpakatools::make_device_buffer(queue)}, + + device_theCellNeighbors_{cms::alpakatools::make_device_buffer(queue)}, + device_theCellTracks_{cms::alpakatools::make_device_buffer(queue)}, + // NB: In legacy, cellStorage_ was allocated inside buildDoublets + cellStorage_{cms::alpakatools::make_device_buffer( + queue, + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors) + + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellTracks))}, + device_cellCuts_{cms::alpakatools::make_device_buffer(queue)}, + device_theCellNeighborsContainer_{reinterpret_cast(cellStorage_.data())}, + device_theCellTracksContainer_{reinterpret_cast( + cellStorage_.data() + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors))}, + + // NB: In legacy, device_storage_ was allocated inside allocateOnGPU + device_storage_{ + cms::alpakatools::make_device_buffer(queue, 3u)}, + device_hitTuple_apc_{reinterpret_cast(device_storage_.data())}, + device_hitToTuple_apc_{reinterpret_cast(device_storage_.data() + 1)}, + device_nCells_{cms::alpakatools::make_device_view(alpaka::getDev(queue), + *reinterpret_cast(device_storage_.data() + 2))} { + alpaka::memset(queue, counters_, 0); + alpaka::memset(queue, device_nCells_, 0); + alpaka::memset(queue, cellStorage_, 0); + + auto cellCuts_h = 
cms::alpakatools::make_host_view(m_params.cellCuts_); + alpaka::memcpy(queue, device_cellCuts_, cellCuts_h); + + [[maybe_unused]] TupleMultiplicity *tupleMultiplicityDeviceData = device_tupleMultiplicity_.data(); + [[maybe_unused]] HitToTuple *hitToTupleDeviceData = device_hitToTuple_.data(); + using TM = cms::alpakatools::OneToManyAssocRandomAccess; + TM *tm = device_tupleMultiplicity_.data(); + TM::template launchZero(tm, queue); + TupleMultiplicity::template launchZero(tupleMultiplicityDeviceData, queue); + HitToTuple::template launchZero(hitToTupleDeviceData, queue); + } + + template + void CAHitNtupletGeneratorKernels::launchKernels(const HitsConstView &hh, + TkSoAView &tracks_view, + Queue &queue) { + using namespace caPixelDoublets; + using namespace caHitNtupletGeneratorKernels; + + // zero tuples + HitContainer::template launchZero(&(tracks_view.hitIndices()), queue); + + int32_t nhits = hh.metadata().size(); + +#ifdef NTUPLE_DEBUG + std::cout << "start tuple building. N hits " << nhits << std::endl; + if (nhits < 2) + std::cout << "too few hits " << nhits << std::endl; +#endif + + // + // applying conbinatoric cleaning such as fishbone at this stage is too expensive + // + + const auto nthTot = 64; + const auto stride = 4; + auto blockSize = nthTot / stride; + auto numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize); + const auto rescale = numberOfBlocks / 65536; + blockSize *= (rescale + 1); + numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize); + assert(numberOfBlocks < 65536); + assert(blockSize > 0 && 0 == blockSize % 16); + const Vec2D blks{numberOfBlocks, 1u}; + const Vec2D thrs{blockSize, stride}; + const auto kernelConnectWorkDiv = cms::alpakatools::make_workdiv(blks, thrs); + + alpaka::exec(queue, + kernelConnectWorkDiv, + Kernel_connect{}, + this->device_hitTuple_apc_, + this->device_hitToTuple_apc_, // needed only to be reset, ready 
for next kernel + hh, + this->device_theCells_.data(), + this->device_nCells_.data(), + this->device_theCellNeighbors_.data(), + this->isOuterHitOfCell_.data(), + this->m_params.caParams_); + + // do not run the fishbone if there are hits only in BPIX1 + if (this->m_params.earlyFishbone_) { + const auto nthTot = 128; + const auto stride = 16; + const auto blockSize = nthTot / stride; + const auto numberOfBlocks = cms::alpakatools::divide_up_by(nhits, blockSize); + const Vec2D blks{numberOfBlocks, 1u}; + const Vec2D thrs{blockSize, stride}; + const auto fishboneWorkDiv = cms::alpakatools::make_workdiv(blks, thrs); + alpaka::exec(queue, + fishboneWorkDiv, + CAFishbone{}, + hh, + this->device_theCells_.data(), + this->device_nCells_.data(), + this->isOuterHitOfCell_.data(), + nhits, + false); + } + blockSize = 64; + numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize); + auto workDiv1D = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + alpaka::exec(queue, + workDiv1D, + Kernel_find_ntuplets{}, + hh, + tracks_view, + this->device_theCells_.data(), + this->device_nCells_.data(), + this->device_theCellTracks_.data(), + this->device_hitTuple_apc_, + this->m_params.caParams_); +#ifdef GPU_DEBUG + alpaka::wait(queue); +#endif + + if (this->m_params.doStats_) + alpaka::exec(queue, + workDiv1D, + Kernel_mark_used{}, + this->device_theCells_.data(), + this->device_nCells_.data()); + +#ifdef GPU_DEBUG + alpaka::wait(queue); +#endif + + blockSize = 128; + numberOfBlocks = cms::alpakatools::divide_up_by(HitContainer{}.totOnes(), blockSize); + workDiv1D = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + + alpaka::exec( + queue, workDiv1D, typename HitContainer::finalizeBulk{}, this->device_hitTuple_apc_, &tracks_view.hitIndices()); + +#ifdef GPU_DEBUG + alpaka::wait(queue); +#endif + + alpaka::exec(queue, workDiv1D, Kernel_fillHitDetIndices{}, tracks_view, hh); + +#ifdef GPU_DEBUG + alpaka::wait(queue); 
+#endif + alpaka::exec(queue, workDiv1D, Kernel_fillNLayers{}, tracks_view, this->device_hitTuple_apc_); + +#ifdef GPU_DEBUG + alpaka::wait(queue); +#endif + + // remove duplicates (tracks that share a doublet) + numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize); + workDiv1D = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + + alpaka::exec(queue, + workDiv1D, + Kernel_earlyDuplicateRemover{}, + this->device_theCells_.data(), + this->device_nCells_.data(), + tracks_view, + this->m_params.dupPassThrough_); +#ifdef GPU_DEBUG + alpaka::wait(queue); +#endif + + blockSize = 128; + numberOfBlocks = cms::alpakatools::divide_up_by(3 * TrackerTraits::maxNumberOfTuples / 4, blockSize); + workDiv1D = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + + alpaka::exec(queue, + workDiv1D, + Kernel_countMultiplicity{}, + tracks_view, + this->device_tupleMultiplicity_.data()); + TupleMultiplicity::template launchFinalize(this->device_tupleMultiplicity_.data(), queue); + + workDiv1D = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + alpaka::exec( + queue, workDiv1D, Kernel_fillMultiplicity{}, tracks_view, this->device_tupleMultiplicity_.data()); +#ifdef GPU_DEBUG + alpaka::wait(queue); +#endif + // do not run the fishbone if there are hits only in BPIX1 + if (this->m_params.lateFishbone_) { + const auto nthTot = 128; + const auto stride = 16; + const auto blockSize = nthTot / stride; + const auto numberOfBlocks = cms::alpakatools::divide_up_by(nhits, blockSize); + const Vec2D blks{numberOfBlocks, 1u}; + const Vec2D thrs{blockSize, stride}; + const auto workDiv2D = cms::alpakatools::make_workdiv(blks, thrs); + + alpaka::exec(queue, + workDiv2D, + CAFishbone{}, + hh, + this->device_theCells_.data(), + this->device_nCells_.data(), + this->isOuterHitOfCell_.data(), + nhits, + true); + } + +#ifdef GPU_DEBUG + alpaka::wait(queue); +#endif + } + + template + void 
CAHitNtupletGeneratorKernels::buildDoublets(const HitsConstView &hh, Queue &queue) { + auto nhits = hh.metadata().size(); + + using namespace caPixelDoublets; + + using CACell = CACellT; + using OuterHitOfCell = typename CACell::OuterHitOfCell; + using CellNeighbors = typename CACell::CellNeighbors; + using CellTracks = typename CACell::CellTracks; + using OuterHitOfCellContainer = typename CACell::OuterHitOfCellContainer; + +#ifdef NTUPLE_DEBUG + std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; +#endif + +#ifdef GPU_DEBUG + alpaka::wait(queue); +#endif + + // in principle we can use "nhits" to heuristically dimension the workspace... + ALPAKA_ASSERT_OFFLOAD(this->device_isOuterHitOfCell_.data()); + + alpaka::exec( + queue, + cms::alpakatools::make_workdiv(1, 1), + [] ALPAKA_FN_ACC(Acc1D const &acc, + OuterHitOfCell *isOuterHitOfCell, + OuterHitOfCellContainer *container, + int32_t const *offset) { + // this code runs on the device + isOuterHitOfCell->container = container; + isOuterHitOfCell->offset = *offset; + }, + this->isOuterHitOfCell_.data(), + this->device_isOuterHitOfCell_.data(), + &hh.offsetBPIX2()); + + { + int threadsPerBlock = 128; + // at least one block! 
+ int blocks = std::max(1u, cms::alpakatools::divide_up_by(nhits, threadsPerBlock)); + const auto workDiv1D = cms::alpakatools::make_workdiv(blocks, threadsPerBlock); + + alpaka::exec(queue, + workDiv1D, + InitDoublets{}, + this->isOuterHitOfCell_.data(), + nhits, + this->device_theCellNeighbors_.data(), + this->device_theCellNeighborsContainer_, + this->device_theCellTracks_.data(), + this->device_theCellTracksContainer_); + } + +#ifdef GPU_DEBUG + alpaka::wait(queue); +#endif + + if (0 == nhits) + return; // protect against empty events + + // take all layer pairs into account + auto nActualPairs = this->m_params.nPairs(); + + const int stride = 4; + const int threadsPerBlock = TrackerTraits::getDoubletsFromHistoMaxBlockSize / stride; + int blocks = (4 * nhits + threadsPerBlock - 1) / threadsPerBlock; + const Vec2D blks{blocks, 1u}; + const Vec2D thrs{threadsPerBlock, stride}; + const auto workDiv2D = cms::alpakatools::make_workdiv(blks, thrs); + + alpaka::exec(queue, + workDiv2D, + GetDoubletsFromHisto{}, + this->device_theCells_.data(), + this->device_nCells_.data(), + this->device_theCellNeighbors_.data(), + this->device_theCellTracks_.data(), + hh, + this->isOuterHitOfCell_.data(), + nActualPairs, + this->m_params.caParams_.maxNumberOfDoublets_, + this->m_params.cellCuts_); + +#ifdef GPU_DEBUG + alpaka::wait(queue); +#endif + } + + template + void CAHitNtupletGeneratorKernels::classifyTuples(const HitsConstView &hh, + TkSoAView &tracks_view, + Queue &queue) { + using namespace caHitNtupletGeneratorKernels; + + uint32_t nhits = hh.metadata().size(); + + auto blockSize = 64; + + // classify tracks based on kinematics + auto numberOfBlocks = cms::alpakatools::divide_up_by(3 * TrackerTraits::maxNumberOfQuadruplets / 4, blockSize); + auto workDiv1D = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + alpaka::exec( + queue, workDiv1D, Kernel_classifyTracks{}, tracks_view, this->m_params.qualityCuts_); + + if (this->m_params.lateFishbone_) { + // apply 
fishbone cleaning to good tracks + numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize); + workDiv1D = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + alpaka::exec(queue, + workDiv1D, + Kernel_fishboneCleaner{}, + this->device_theCells_.data(), + this->device_nCells_.data(), + tracks_view); + } + + // mark duplicates (tracks that share a doublet) + numberOfBlocks = cms::alpakatools::divide_up_by(3 * m_params.caParams_.maxNumberOfDoublets_ / 4, blockSize); + workDiv1D = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + alpaka::exec(queue, + workDiv1D, + Kernel_fastDuplicateRemover{}, + this->device_theCells_.data(), + this->device_nCells_.data(), + tracks_view, + this->m_params.dupPassThrough_); +#ifdef GPU_DEBUG + alpaka::wait(queue); +#endif + + if (this->m_params.doSharedHitCut_ || this->m_params.doStats_) { + // fill hit->track "map" + numberOfBlocks = cms::alpakatools::divide_up_by(3 * TrackerTraits::maxNumberOfQuadruplets / 4, blockSize); + workDiv1D = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + alpaka::exec(queue, + workDiv1D, + Kernel_countHitInTracks{}, + tracks_view, + this->device_hitToTuple_.data()); //CHECK + + HitToTuple::template launchFinalize(this->device_hitToTuple_.data(), queue); + alpaka::exec( + queue, workDiv1D, Kernel_fillHitInTracks{}, tracks_view, this->device_hitToTuple_.data()); +#ifdef GPU_DEBUG + alpaka::wait(queue); +#endif + } + + if (this->m_params.doSharedHitCut_) { + // mark duplicates (tracks that share at least one hit) + numberOfBlocks = cms::alpakatools::divide_up_by(3 * TrackerTraits::maxNumberOfQuadruplets / 4, + blockSize); // TODO: Check if correct + workDiv1D = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + alpaka::exec(queue, + workDiv1D, + Kernel_rejectDuplicate{}, + tracks_view, + this->m_params.minHitsForSharingCut_, + this->m_params.dupPassThrough_, + this->device_hitToTuple_.data()); + + alpaka::exec(queue, + 
workDiv1D, + Kernel_sharedHitCleaner{}, + hh, + tracks_view, + this->m_params.minHitsForSharingCut_, + this->m_params.dupPassThrough_, + this->device_hitToTuple_.data()); + + if (this->m_params.useSimpleTripletCleaner_) { + // (typename HitToTuple{}::capacity(), + numberOfBlocks = cms::alpakatools::divide_up_by(HitToTuple{}.capacity(), blockSize); + workDiv1D = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + alpaka::exec(queue, + workDiv1D, + Kernel_simpleTripletCleaner{}, + tracks_view, + this->m_params.minHitsForSharingCut_, + this->m_params.dupPassThrough_, + this->device_hitToTuple_.data()); + } else { + numberOfBlocks = cms::alpakatools::divide_up_by(HitToTuple{}.capacity(), blockSize); + workDiv1D = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + alpaka::exec(queue, + workDiv1D, + Kernel_tripletCleaner{}, + tracks_view, + this->m_params.minHitsForSharingCut_, + this->m_params.dupPassThrough_, + this->device_hitToTuple_.data()); + } +#ifdef GPU_DEBUG + alpaka::wait(queue); +#endif + } + + if (this->m_params.doStats_) { + numberOfBlocks = + cms::alpakatools::divide_up_by(std::max(nhits, m_params.caParams_.maxNumberOfDoublets_), blockSize); + workDiv1D = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + + alpaka::exec(queue, + workDiv1D, + Kernel_checkOverflows{}, + tracks_view, + this->device_tupleMultiplicity_.data(), + this->device_hitToTuple_.data(), + this->device_hitTuple_apc_, + this->device_theCells_.data(), + this->device_nCells_.data(), + this->device_theCellNeighbors_.data(), + this->device_theCellTracks_.data(), + this->isOuterHitOfCell_.data(), + nhits, + this->m_params.caParams_.maxNumberOfDoublets_, + this->counters_.data()); + } + + if (this->m_params.doStats_) { + // counters (add flag???) 
+ + numberOfBlocks = cms::alpakatools::divide_up_by(HitToTuple{}.capacity(), blockSize); + workDiv1D = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + alpaka::exec(queue, + workDiv1D, + Kernel_doStatsForHitInTracks{}, + this->device_hitToTuple_.data(), + this->counters_.data()); + + numberOfBlocks = cms::alpakatools::divide_up_by(3 * TrackerTraits::maxNumberOfQuadruplets / 4, blockSize); + workDiv1D = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + alpaka::exec( + queue, workDiv1D, Kernel_doStatsForTracks{}, tracks_view, this->counters_.data()); + } +#ifdef GPU_DEBUG + alpaka::wait(queue); +#endif + +#ifdef DUMP_GPU_TK_TUPLES + static std::atomic iev(0); + static std::mutex lock; + workDiv1D = cms::alpakatools::make_workdiv(1u, 32u); + { + std::lock_guard guard(lock); + ++iev; + for (int k = 0; k < 20000; k += 500) { + alpaka::exec(queue, + workDiv1D, + Kernel_print_found_ntuplets{}, + hh, + tracks_view, + this->device_hitToTuple_.data(), + k, + k + 500, + iev); + alpaka::wait(queue); + } + alpaka::exec(queue, + workDiv1D, + Kernel_print_found_ntuplets{}, + hh, + tracks_view, + this->device_hitToTuple_.data(), + 20000, + 1000000, + iev); + + alpaka::wait(queue); + } +#endif + } + // This will make sense when we will be able to run this once per job in Alpaka + /* +template +void CAHitNtupletGeneratorKernels::printCounters() { + auto workDiv1D = cms::alpakatools::make_workdiv(1,1); + alpaka::exec(queue_,workDiv1D,Kernel_printCounters{},this->counters_.data()); +} +*/ + template class CAHitNtupletGeneratorKernels; + template class CAHitNtupletGeneratorKernels; + template class CAHitNtupletGeneratorKernels; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.h new file mode 100644 index 0000000000000..d55be09e6e497 --- /dev/null +++ 
b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernels.h @@ -0,0 +1,273 @@ +#ifndef RecoPixelVertexing_PixelTriplets_CAHitNtupletGeneratorKernels_h +#define RecoPixelVertexing_PixelTriplets_CAHitNtupletGeneratorKernels_h + +//#define GPU_DEBUG +//#define DUMP_GPU_TK_TUPLES + +#include + +#include + +#include "DataFormats/TrackSoA/interface/TrackDefinitions.h" +#include "DataFormats/TrackSoA/interface/TracksHost.h" +#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" +#include "HeterogeneousCore/AlpakaInterface/interface/AtomicPairCounter.h" +#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" + +#include "CACell.h" +#include "CAPixelDoublets.h" +#include "CAStructures.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace caHitNtupletGenerator { + + //Configuration params common to all topologies, for the algorithms + struct AlgoParams { + const uint32_t minHitsForSharingCut_; + const bool useRiemannFit_; + const bool fitNas4_; + const bool includeJumpingForwardDoublets_; + const bool earlyFishbone_; + const bool lateFishbone_; + const bool doStats_; + const bool doSharedHitCut_; + const bool dupPassThrough_; + const bool useSimpleTripletCleaner_; + }; + + //CAParams + struct CACommon { + const uint32_t maxNumberOfDoublets_; + const uint32_t minHitsPerNtuplet_; + const float ptmin_; + const float CAThetaCutBarrel_; + const float CAThetaCutForward_; + const float hardCurvCut_; + const float dcaCutInnerTriplet_; + const float dcaCutOuterTriplet_; + }; + + template + struct CAParamsT : public CACommon { + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool startingLayerPair(int16_t pid) const { return false; }; + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool startAt0(int16_t pid) const { return false; }; + }; + + template + struct CAParamsT> 
: public CACommon { + /// Is is a starting layer pair? + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool startingLayerPair(int16_t pid) const { + return minHitsPerNtuplet_ > 3 ? pid < 3 : pid < 8 || pid > 12; + } + + /// Is this a pair with inner == 0? + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool startAt0(int16_t pid) const { + assert((pixelTopology::Phase1::layerPairs[pid * 2] == 0) == + (pid < 3 || pid == 13 || pid == 15 || pid == 16)); // to be 100% sure it's working, may be removed + return pixelTopology::Phase1::layerPairs[pid * 2] == 0; + } + }; + + template + struct CAParamsT> : public CACommon { + const bool includeFarForwards_; + /// Is is a starting layer pair? + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool startingLayerPair(int16_t pid) const { + return pid < 33; // in principle one could remove 5,6,7 23, 28 and 29 + } + + /// Is this a pair with inner == 0 + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool startAt0(int16_t pid) const { + assert((pixelTopology::Phase2::layerPairs[pid * 2] == 0) == ((pid < 3) | (pid >= 23 && pid < 28))); + return pixelTopology::Phase2::layerPairs[pid * 2] == 0; + } + }; + + //Full list of params = algo params + ca params + cell params + quality cuts + //Generic template + template + struct ParamsT : public AlgoParams { + // one should define the params for its own pixelTopology + // not defining anything here + inline uint32_t nPairs() const { return 0; } + }; + + template + struct ParamsT> : public AlgoParams { + using TT = TrackerTraits; + using QualityCuts = ::pixelTrack::QualityCutsT; //track quality cuts + using CellCuts = caPixelDoublets::CellCutsT; //cell building cuts + using CAParams = CAParamsT; //params to be used on device + + ParamsT(AlgoParams const& commonCuts, + CellCuts const& cellCuts, + QualityCuts const& cutsCuts, + CAParams const& caParams) + : AlgoParams(commonCuts), cellCuts_(cellCuts), qualityCuts_(cutsCuts), caParams_(caParams) {} + + const CellCuts cellCuts_; + const QualityCuts qualityCuts_{// polynomial coefficients for the 
pT-dependent chi2 cut + {0.68177776, 0.74609577, -0.08035491, 0.00315399}, + // max pT used to determine the chi2 cut + 10., + // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit + 30., + // regional cuts for triplets + { + 0.3, // |Tip| < 0.3 cm + 0.5, // pT > 0.5 GeV + 12.0 // |Zip| < 12.0 cm + }, + // regional cuts for quadruplets + { + 0.5, // |Tip| < 0.5 cm + 0.3, // pT > 0.3 GeV + 12.0 // |Zip| < 12.0 cm + }}; + const CAParams caParams_; + /// Compute the number of pairs + inline uint32_t nPairs() const { + // take all layer pairs into account + uint32_t nActualPairs = TT::nPairs; + if (not includeJumpingForwardDoublets_) { + // exclude forward "jumping" layer pairs + nActualPairs = TT::nPairsForTriplets; + } + if (caParams_.minHitsPerNtuplet_ > 3) { + // for quadruplets, exclude all "jumping" layer pairs + nActualPairs = TT::nPairsForQuadruplets; + } + + return nActualPairs; + } + + }; // Params Phase1 + + template + struct ParamsT> : public AlgoParams { + using TT = TrackerTraits; + using QualityCuts = ::pixelTrack::QualityCutsT; + using CellCuts = caPixelDoublets::CellCutsT; + using CAParams = CAParamsT; + + ParamsT(AlgoParams const& commonCuts, + CellCuts const& cellCuts, + QualityCuts const& qualityCuts, + CAParams const& caParams) + : AlgoParams(commonCuts), cellCuts_(cellCuts), qualityCuts_(qualityCuts), caParams_(caParams) {} + + // quality cuts + const CellCuts cellCuts_; + const QualityCuts qualityCuts_{5.0f, /*chi2*/ 0.9f, /* pT in Gev*/ 0.4f, /*zip in cm*/ 12.0f /*tip in cm*/}; + const CAParams caParams_; + + inline uint32_t nPairs() const { + // take all layer pairs into account + uint32_t nActualPairs = TT::nPairsMinimal; + if (caParams_.includeFarForwards_) { + // considera far forwards (> 11 & > 23) + nActualPairs = TT::nPairsFarForwards; + } + if (includeJumpingForwardDoublets_) { + // include jumping forwards + nActualPairs = TT::nPairs; + } + + return nActualPairs; + } + + }; // Params Phase1 + + // counters + struct Counters { 
+ unsigned long long nEvents; + unsigned long long nHits; + unsigned long long nCells; + unsigned long long nTuples; + unsigned long long nFitTracks; + unsigned long long nLooseTracks; + unsigned long long nGoodTracks; + unsigned long long nUsedHits; + unsigned long long nDupHits; + unsigned long long nFishCells; + unsigned long long nKilledCells; + unsigned long long nEmptyCells; + unsigned long long nZeroTrackCells; + }; + + using Quality = ::pixelTrack::Quality; + + } // namespace caHitNtupletGenerator + + template + class CAHitNtupletGeneratorKernels { + public: + using TrackerTraits = TTTraits; + using QualityCuts = ::pixelTrack::QualityCutsT; + using CellCuts = caPixelDoublets::CellCutsT; + using Params = caHitNtupletGenerator::ParamsT; + using CAParams = caHitNtupletGenerator::CAParamsT; + using Counters = caHitNtupletGenerator::Counters; + + using HitsView = TrackingRecHitSoAView; + using HitsConstView = TrackingRecHitSoAConstView; + using TkSoAView = reco::TrackSoAView; + + using HitToTuple = caStructures::template HitToTupleT; + using TupleMultiplicity = caStructures::template TupleMultiplicityT; + struct Testttt { + TupleMultiplicity tm; + }; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + using CellNeighbors = caStructures::CellNeighborsT; + using CellTracksVector = caStructures::CellTracksVectorT; + using CellTracks = caStructures::CellTracksT; + using OuterHitOfCellContainer = caStructures::OuterHitOfCellContainerT; + using OuterHitOfCell = caStructures::OuterHitOfCellT; + + using CACell = CACellT; + + using Quality = ::pixelTrack::Quality; + using HitContainer = typename reco::TrackSoA::HitContainer; + + CAHitNtupletGeneratorKernels(Params const& params, uint32_t nhits, Queue& queue); + ~CAHitNtupletGeneratorKernels() = default; + + TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.data(); } + + void launchKernels(const HitsConstView& hh, TkSoAView& track_view, Queue& queue); + + void 
classifyTuples(const HitsConstView& hh, TkSoAView& track_view, Queue& queue); + + void buildDoublets(const HitsConstView& hh, Queue& queue); + + static void printCounters(); + + private: + // params + Params const& m_params; + cms::alpakatools::device_buffer counters_; + + // workspace + cms::alpakatools::device_buffer device_hitToTuple_; + cms::alpakatools::device_buffer device_tupleMultiplicity_; + cms::alpakatools::device_buffer device_theCells_; + cms::alpakatools::device_buffer device_isOuterHitOfCell_; + cms::alpakatools::device_buffer isOuterHitOfCell_; + cms::alpakatools::device_buffer device_theCellNeighbors_; + cms::alpakatools::device_buffer device_theCellTracks_; + cms::alpakatools::device_buffer cellStorage_; + cms::alpakatools::device_buffer device_cellCuts_; + CellNeighbors* device_theCellNeighborsContainer_; + CellTracks* device_theCellTracksContainer_; + cms::alpakatools::device_buffer device_storage_; + cms::alpakatools::AtomicPairCounter* device_hitTuple_apc_; + cms::alpakatools::AtomicPairCounter* device_hitToTuple_apc_; + cms::alpakatools::device_view device_nCells_; + }; +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h new file mode 100644 index 0000000000000..dd6ba7c5cf51e --- /dev/null +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAHitNtupletGeneratorKernelsImpl.h @@ -0,0 +1,1048 @@ +// +// Original Author: Felice Pantaleo, CERN +// + +//#define GPU_DEBUG +//#define NTUPLE_DEBUG + +#include +#include +#include +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h" 
+#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" + +#include "CAStructures.h" +#include "CAHitNtupletGeneratorKernels.h" +#include "CACell.h" +#include "CAFishbone.h" +#include "CAPixelDoublets.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace caHitNtupletGeneratorKernels { + + constexpr uint32_t tkNotFound = std::numeric_limits::max(); + constexpr float maxScore = std::numeric_limits::max(); + constexpr float nSigma2 = 25.f; + + //all of these below are mostly to avoid brining around the relative namespace + + template + using HitToTuple = caStructures::HitToTupleT; + + template + using TupleMultiplicity = caStructures::TupleMultiplicityT; + + template + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + + template + using CellTracksVector = caStructures::CellTracksVectorT; + + template + using OuterHitOfCell = caStructures::OuterHitOfCellT; + + using Quality = ::pixelTrack::Quality; + + template + using TkSoAView = reco::TrackSoAView; + + template + using HitContainer = typename reco::TrackSoA::HitContainer; + + template + using HitsConstView = typename CACellT::HitsConstView; + + template + using QualityCuts = ::pixelTrack::QualityCutsT; + + template + using CAParams = caHitNtupletGenerator::CAParamsT; + + using Counters = caHitNtupletGenerator::Counters; + + template + class Kernel_checkOverflows { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + TupleMultiplicity const *tupleMultiplicity, + HitToTuple const *hitToTuple, + cms::alpakatools::AtomicPairCounter *apc, + CACellT const *__restrict__ cells, + uint32_t const *__restrict__ nCells, + CellNeighborsVector const *cellNeighbors, + CellTracksVector const *cellTracks, + OuterHitOfCell const *isOuterHitOfCell, + int32_t nHits, + uint32_t maxNumberOfDoublets, + Counters *counters) const { + auto &c = *counters; + // counters once per event + if 
(cms::alpakatools::once_per_grid(acc)) { + alpaka::atomicAdd(acc, &c.nEvents, 1ull, alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd(acc, &c.nHits, static_cast(nHits), alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd(acc, &c.nCells, static_cast(*nCells), alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd( + acc, &c.nTuples, static_cast(apc->get().first), alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd(acc, + &c.nFitTracks, + static_cast(tupleMultiplicity->size()), + alpaka::hierarchy::Blocks{}); + } + +#ifdef NTUPLE_DEBUGS + if (cms::alpakatools::once_per_grid(acc)) { + printf("number of found cells %d \n found tuples %d with total hits %d out of %d\n", + *nCells, + apc->get().first, + apc->get().second, + nHits); + if (apc->get().first < TrackerTraits::maxNumberOfQuadruplets) { + ALPAKA_ASSERT_OFFLOAD(tracks_view.hitIndices().size(apc->get().first) == 0); + ALPAKA_ASSERT_OFFLOAD(tracks_view.hitIndices().size() == apc->get().second); + } + } + const auto ntNbins = foundNtuplets->nbins(); + + for (auto idx : cms::alpakatools::elements_with_stride(acc, ntBins)) { + if (tracks_view.hitIndices().size(idx) > TrackerTraits::maxHitsOnTrack) // current real limit + printf("ERROR %d, %d\n", idx, tracks_view.hitIndices().size(idx)); + ALPAKA_ASSERT_OFFLOAD(ftracks_view.hitIndices().size(idx) <= TrackerTraits::maxHitsOnTrack); + for (auto ih = tracks_view.hitIndices().begin(idx); ih != tracks_view.hitIndices().end(idx); ++ih) + ALPAKA_ASSERT_OFFLOAD(int(*ih) < nHits); + } +#endif + + if (cms::alpakatools::once_per_grid(acc)) { + if (apc->get().first >= TrackerTraits::maxNumberOfQuadruplets) + printf("Tuples overflow\n"); + if (*nCells >= maxNumberOfDoublets) + printf("Cells overflow\n"); + if (cellNeighbors && cellNeighbors->full()) + printf("cellNeighbors overflow %d %d \n", cellNeighbors->capacity(), cellNeighbors->size()); + if (cellTracks && cellTracks->full()) + printf("cellTracks overflow\n"); + if (int(hitToTuple->nOnes()) < nHits) + printf("ERROR hitToTuple 
overflow %d %d\n", hitToTuple->nOnes(), nHits); +#ifdef GPU_DEBUG + printf("size of cellNeighbors %d \n cellTracks %d \n hitToTuple %d \n", + cellNeighbors->size(), + cellTracks->size(), + hitToTuple->size()); +#endif + } + + const auto ntNCells = (*nCells); + for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) { + auto const &thisCell = cells[idx]; + if (thisCell.hasFishbone() && !thisCell.isKilled()) + alpaka::atomicAdd(acc, &c.nFishCells, 1ull, alpaka::hierarchy::Blocks{}); + if (thisCell.outerNeighbors().full()) //++tooManyNeighbors[thisCell.theLayerPairId]; + printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.layerPairId()); + if (thisCell.tracks().full()) //++tooManyTracks[thisCell.theLayerPairId]; + printf("Tracks overflow %d in %d\n", idx, thisCell.layerPairId()); + if (thisCell.isKilled()) + alpaka::atomicAdd(acc, &c.nKilledCells, 1ull, alpaka::hierarchy::Blocks{}); + if (!thisCell.unused()) + alpaka::atomicAdd(acc, &c.nEmptyCells, 1ull, alpaka::hierarchy::Blocks{}); + if ((0 == hitToTuple->size(thisCell.inner_hit_id())) && (0 == hitToTuple->size(thisCell.outer_hit_id()))) + alpaka::atomicAdd(acc, &c.nZeroTrackCells, 1ull, alpaka::hierarchy::Blocks{}); + } + + for (auto idx : cms::alpakatools::elements_with_stride(acc, nHits)) + if ((*isOuterHitOfCell).container[idx].full()) // ++tooManyOuterHitOfCell; + printf("OuterHitOfCell overflow %d\n", idx); + } + }; + + template + class Kernel_fishboneCleaner { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + CACellT const *cells, + uint32_t const *__restrict__ nCells, + TkSoAView tracks_view) const { + constexpr auto reject = Quality::dup; + const auto ntNCells = (*nCells); + + for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) { + auto const &thisCell = cells[idx]; + if (!thisCell.isKilled()) + continue; + + for (auto it : thisCell.tracks()) + tracks_view[it].quality() = reject; + } + } + }; + // remove shorter tracks if sharing a cell + // 
It does not seem to affect efficiency in any way! + template + class Kernel_earlyDuplicateRemover { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + CACellT const *cells, + uint32_t const *__restrict__ nCells, + TkSoAView tracks_view, + bool dupPassThrough) const { + // quality to mark rejected + constexpr auto reject = Quality::edup; /// cannot be loose + ALPAKA_ASSERT_OFFLOAD(nCells); + const auto ntNCells = (*nCells); + + for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) { + auto const &thisCell = cells[idx]; + + if (thisCell.tracks().size() < 2) + continue; + + int8_t maxNl = 0; + + // find maxNl + for (auto it : thisCell.tracks()) { + auto nl = tracks_view[it].nLayers(); + maxNl = std::max(nl, maxNl); + } + + // if (maxNl<4) continue; + // quad pass through (leave it her for tests) + // maxNl = std::min(4, maxNl); + + for (auto it : thisCell.tracks()) { + if (tracks_view[it].nLayers() < maxNl) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant + } + } + } + }; + + // assume the above (so, short tracks already removed) + template + class Kernel_fastDuplicateRemover { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + CACellT const *__restrict__ cells, + uint32_t const *__restrict__ nCells, + TkSoAView tracks_view, + bool dupPassThrough) const { + // quality to mark rejected + auto const reject = dupPassThrough ? 
Quality::loose : Quality::dup; + constexpr auto loose = Quality::loose; + + ALPAKA_ASSERT_OFFLOAD(nCells); + const auto ntNCells = (*nCells); + + for (auto idx : cms::alpakatools::elements_with_stride(acc, ntNCells)) { + auto const &thisCell = cells[idx]; + if (thisCell.tracks().size() < 2) + continue; + + float mc = maxScore; + uint16_t im = tkNotFound; + + auto score = [&](auto it) { return std::abs(TracksUtilities::tip(tracks_view, it)); }; + + // full crazy combinatorics + int ntr = thisCell.tracks().size(); + for (int i = 0; i < ntr - 1; ++i) { + auto it = thisCell.tracks()[i]; + auto qi = tracks_view[it].quality(); + if (qi <= reject) + continue; + auto opi = tracks_view[it].state()(2); + auto e2opi = tracks_view[it].covariance()(9); + auto cti = tracks_view[it].state()(3); + auto e2cti = tracks_view[it].covariance()(12); + for (auto j = i + 1; j < ntr; ++j) { + auto jt = thisCell.tracks()[j]; + auto qj = tracks_view[jt].quality(); + if (qj <= reject) + continue; + auto opj = tracks_view[jt].state()(2); + auto ctj = tracks_view[jt].state()(3); + auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); + if ((cti - ctj) * (cti - ctj) > dct) + continue; + auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); + if ((opi - opj) * (opi - opj) > dop) + continue; + if ((qj < qi) || (qj == qi && score(it) < score(jt))) + tracks_view[jt].quality() = reject; + else { + tracks_view[it].quality() = reject; + break; + } + } + } + + // find maxQual + auto maxQual = reject; // no duplicate! 
+ for (auto it : thisCell.tracks()) { + if (tracks_view[it].quality() > maxQual) + maxQual = tracks_view[it].quality(); + } + + if (maxQual <= loose) + continue; + + // find min score + for (auto it : thisCell.tracks()) { + if (tracks_view[it].quality() == maxQual && score(it) < mc) { + mc = score(it); + im = it; + } + } + + if (tkNotFound == im) + continue; + + // mark all other duplicates (not yet, keep it loose) + for (auto it : thisCell.tracks()) { + if (tracks_view[it].quality() > loose && it != im) + tracks_view[it].quality() = loose; //no race: simple assignment of the same constant + } + } + } + }; + + template + class Kernel_connect { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + cms::alpakatools::AtomicPairCounter *apc1, + cms::alpakatools::AtomicPairCounter *apc2, // just to zero them + HitsConstView hh, + CACellT *cells, + uint32_t *nCells, + CellNeighborsVector *cellNeighbors, + OuterHitOfCell const *isOuterHitOfCell, + CAParams params) const { + using Cell = CACellT; + + const uint32_t dimIndexY = 0u; + const uint32_t dimIndexX = 1u; + const uint32_t threadIdxY(alpaka::getIdx(acc)[dimIndexY]); + const uint32_t threadIdxLocalX(alpaka::getIdx(acc)[dimIndexX]); + + if (0 == (threadIdxY + threadIdxLocalX)) { + (*apc1) = 0; + (*apc2) = 0; + } // ready for next kernel + + constexpr uint32_t last_bpix1_detIndex = TrackerTraits::last_bpix1_detIndex; + constexpr uint32_t last_barrel_detIndex = TrackerTraits::last_barrel_detIndex; + + cms::alpakatools::for_each_element_in_grid_strided( + acc, + (*nCells), + 0u, + [&](uint32_t idx) { + auto cellIndex = idx; + auto &thisCell = cells[idx]; + auto innerHitId = thisCell.inner_hit_id(); + if (int(innerHitId) >= isOuterHitOfCell->offset) { + uint32_t numberOfPossibleNeighbors = (*isOuterHitOfCell)[innerHitId].size(); + auto vi = (*isOuterHitOfCell)[innerHitId].data(); + + auto ri = thisCell.inner_r(hh); + auto zi = thisCell.inner_z(hh); + + auto ro = thisCell.outer_r(hh); + auto zo = 
thisCell.outer_z(hh); + auto isBarrel = thisCell.inner_detIndex(hh) < last_barrel_detIndex; + + cms::alpakatools::for_each_element_in_block_strided( + acc, + numberOfPossibleNeighbors, + 0u, + [&](uint32_t j) { + auto otherCell = (vi[j]); + auto &oc = cells[otherCell]; + auto r1 = oc.inner_r(hh); + auto z1 = oc.inner_z(hh); + bool aligned = Cell::areAlignedRZ( + r1, + z1, + ri, + zi, + ro, + zo, + params.ptmin_, + isBarrel ? params.CAThetaCutBarrel_ + : params.CAThetaCutForward_); // 2.f*thetaCut); // FIXME tune cuts + if (aligned && + thisCell.dcaCut(hh, + oc, + oc.inner_detIndex(hh) < last_bpix1_detIndex ? params.dcaCutInnerTriplet_ + : params.dcaCutOuterTriplet_, + params.hardCurvCut_)) { // FIXME tune cuts + oc.addOuterNeighbor(acc, cellIndex, *cellNeighbors); + thisCell.setStatusBits(Cell::StatusBit::kUsed); + oc.setStatusBits(Cell::StatusBit::kUsed); + } + }, + dimIndexX); // loop on inner cells + } + }, + dimIndexY); // loop on outer cells + } + }; + template + class Kernel_find_ntuplets { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + HitsConstView hh, + TkSoAView tracks_view, + CACellT *__restrict__ cells, + uint32_t const *nCells, + CellTracksVector *cellTracks, + cms::alpakatools::AtomicPairCounter *apc, + CAParams params) const { + // recursive: not obvious to widen + + using Cell = CACellT; + +#ifdef GPU_DEBUG + if (cms::alpakatools::once_per_grid(acc)) + printf("starting producing ntuplets from %d cells \n", *nCells); +#endif + + for (auto idx : cms::alpakatools::elements_with_stride(acc, (*nCells))) { + auto const &thisCell = cells[idx]; + + if (thisCell.isKilled()) + continue; // cut by earlyFishbone + + // we require at least three hits... 
+ + if (thisCell.outerNeighbors().empty()) + continue; + + auto pid = thisCell.layerPairId(); + bool doit = params.startingLayerPair(pid); + + constexpr uint32_t maxDepth = TrackerTraits::maxDepth; + + if (doit) { + typename Cell::TmpTuple stack; + stack.reset(); + bool bpix1Start = params.startAt0(pid); + thisCell.template find_ntuplets(acc, + hh, + cells, + *cellTracks, + tracks_view.hitIndices(), + *apc, + tracks_view.quality(), + stack, + params.minHitsPerNtuplet_, + bpix1Start); + ALPAKA_ASSERT_OFFLOAD(stack.empty()); + } + } + } + }; + + template + class Kernel_mark_used { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + CACellT *__restrict__ cells, + uint32_t const *nCells) const { + using Cell = CACellT; + for (auto idx : cms::alpakatools::elements_with_stride(acc, (*nCells))) { + auto &thisCell = cells[idx]; + if (!thisCell.tracks().empty()) + thisCell.setStatusBits(Cell::StatusBit::kInTrack); + } + } + }; + + template + class Kernel_countMultiplicity { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + TupleMultiplicity *tupleMultiplicity) const { + for (auto it : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + auto nhits = tracks_view.hitIndices().size(it); + if (nhits < 3) + continue; + if (tracks_view[it].quality() == Quality::edup) + continue; + ALPAKA_ASSERT_OFFLOAD(tracks_view[it].quality() == Quality::bad); + if (nhits > TrackerTraits::maxHitsOnTrack) // current limit + printf("wrong mult %d %d\n", it, nhits); + ALPAKA_ASSERT_OFFLOAD(nhits <= TrackerTraits::maxHitsOnTrack); + tupleMultiplicity->count(acc, nhits); + } + } + }; + + template + class Kernel_fillMultiplicity { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + TupleMultiplicity *tupleMultiplicity) const { + for (auto it : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + auto nhits = 
tracks_view.hitIndices().size(it); + if (nhits < 3) + continue; + if (tracks_view[it].quality() == Quality::edup) + continue; + ALPAKA_ASSERT_OFFLOAD(tracks_view[it].quality() == Quality::bad); + if (nhits > TrackerTraits::maxHitsOnTrack) + printf("wrong mult %d %d\n", it, nhits); + ALPAKA_ASSERT_OFFLOAD(nhits <= TrackerTraits::maxHitsOnTrack); + tupleMultiplicity->fill(acc, nhits, it); + } + } + }; + + template + class Kernel_classifyTracks { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + QualityCuts cuts) const { + for (auto it : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + auto nhits = tracks_view.hitIndices().size(it); + if (nhits == 0) + break; // guard + + // if duplicate: not even fit + if (tracks_view[it].quality() == Quality::edup) + continue; + + ALPAKA_ASSERT_OFFLOAD(tracks_view[it].quality() == Quality::bad); + + // mark doublets as bad + if (nhits < 3) + continue; + + // if the fit has any invalid parameters, mark it as bad + bool isNaN = false; + for (int i = 0; i < 5; ++i) { + isNaN |= std::isnan(tracks_view[it].state()(i)); + } + if (isNaN) { +#ifdef NTUPLE_DEBUG + printf("NaN in fit %d size %d chi2 %f\n", it, tracks_view.hitIndices().size(it), tracks_view[it].chi2()); +#endif + continue; + } + + tracks_view[it].quality() = Quality::strict; + + if (cuts.strictCut(tracks_view, it)) + continue; + + tracks_view[it].quality() = Quality::tight; + + if (cuts.isHP(tracks_view, nhits, it)) + tracks_view[it].quality() = Quality::highPurity; + } + } + }; + + template + class Kernel_doStatsForTracks { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, TkSoAView tracks_view, Counters *counters) const { + for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + if (tracks_view.hitIndices().size(idx) == 0) + break; //guard + if (tracks_view[idx].quality() < Quality::loose) + continue; + alpaka::atomicAdd(acc, 
&(counters->nLooseTracks), 1ull, alpaka::hierarchy::Blocks{}); + if (tracks_view[idx].quality() < Quality::strict) + continue; + alpaka::atomicAdd(acc, &(counters->nGoodTracks), 1ull, alpaka::hierarchy::Blocks{}); + } + } + }; + + template + class Kernel_countHitInTracks { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + HitToTuple *hitToTuple) const { + for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + if (tracks_view.hitIndices().size(idx) == 0) + break; // guard + for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) + hitToTuple->count(acc, *h); + } + } + }; + + template + class Kernel_fillHitInTracks { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + HitToTuple *hitToTuple) const { + for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + if (tracks_view.hitIndices().size(idx) == 0) + break; // guard + for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) + hitToTuple->fill(acc, *h, idx); + } + } + }; + + template + class Kernel_fillHitDetIndices { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + HitsConstView hh) const { + // copy offsets + for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nOnes())) { + tracks_view.detIndices().off[idx] = tracks_view.hitIndices().off[idx]; + } + // fill hit indices + for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().size())) { + ALPAKA_ASSERT_OFFLOAD(tracks_view.hitIndices().content[idx] < (uint32_t)hh.metadata().size()); + tracks_view.detIndices().content[idx] = hh[tracks_view.hitIndices().content[idx]].detectorIndex(); + } + } + }; + + template + class Kernel_fillNLayers { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc 
const &acc, + TkSoAView tracks_view, + cms::alpakatools::AtomicPairCounter *apc) const { + // clamp the number of tracks to the capacity of the SoA + auto ntracks = std::min(apc->get().first, tracks_view.metadata().size() - 1); + + if (cms::alpakatools::once_per_grid(acc)) + tracks_view.nTracks() = ntracks; + for (auto idx : cms::alpakatools::elements_with_stride(acc, ntracks)) { + ALPAKA_ASSERT_OFFLOAD(TracksUtilities::nHits(tracks_view, idx) >= 3); + tracks_view[idx].nLayers() = TracksUtilities::computeNumberOfLayers(tracks_view, idx); + } + } + }; + + template + class Kernel_doStatsForHitInTracks { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + HitToTuple const *__restrict__ hitToTuple, + Counters *counters) const { + auto &c = *counters; + for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple->nOnes())) { + if (hitToTuple->size(idx) == 0) + continue; // SHALL NOT BE break + alpaka::atomicAdd(acc, &c.nUsedHits, 1ull, alpaka::hierarchy::Blocks{}); + if (hitToTuple->size(idx) > 1) + alpaka::atomicAdd(acc, &c.nDupHits, 1ull, alpaka::hierarchy::Blocks{}); + } + } + }; + + template + class Kernel_countSharedHit { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + int *__restrict__ nshared, + HitContainer const *__restrict__ ptuples, + Quality const *__restrict__ quality, + HitToTuple const *__restrict__ phitToTuple) const { + constexpr auto loose = Quality::loose; + + auto &hitToTuple = *phitToTuple; + auto const &foundNtuplets = *ptuples; + for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple->nbins())) { + if (hitToTuple.size(idx) < 2) + continue; + + int nt = 0; + + // count "good" tracks + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + if (quality[*it] < loose) + continue; + ++nt; + } + + if (nt < 2) + continue; + + // now mark each track triplet as sharing a hit + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + if 
(foundNtuplets.size(*it) > 3) + continue; + alpaka::atomicAdd(acc, &nshared[*it], 1ull, alpaka::hierarchy::Blocks{}); + } + + } // hit loop + } + }; + + template + class Kernel_markSharedHit { + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + int const *__restrict__ nshared, + HitContainer const *__restrict__ tuples, + Quality *__restrict__ quality, + bool dupPassThrough) const { + // constexpr auto bad = Quality::bad; + constexpr auto dup = Quality::dup; + constexpr auto loose = Quality::loose; + // constexpr auto strict = Quality::strict; + + // quality to mark rejected + auto const reject = dupPassThrough ? loose : dup; + for (auto idx : cms::alpakatools::elements_with_stride(acc, tuples->nbins())) { + if (tuples->size(idx) == 0) + break; //guard + if (quality[idx] <= reject) + continue; + if (nshared[idx] > 2) + quality[idx] = reject; + } + } + }; + + // mostly for very forward triplets..... + template + class Kernel_rejectDuplicate { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + uint16_t nmin, + bool dupPassThrough, + HitToTuple const *__restrict__ phitToTuple) const { + // quality to mark rejected + auto const reject = dupPassThrough ? 
Quality::loose : Quality::dup; + + auto &hitToTuple = *phitToTuple; + + for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) { + if (hitToTuple.size(idx) < 2) + continue; + + auto score = [&](auto it, auto nl) { return std::abs(TracksUtilities::tip(tracks_view, it)); }; + + // full combinatorics + for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) { + auto const it = *ip; + auto qi = tracks_view[it].quality(); + if (qi <= reject) + continue; + auto opi = tracks_view[it].state()(2); + auto e2opi = tracks_view[it].covariance()(9); + auto cti = tracks_view[it].state()(3); + auto e2cti = tracks_view[it].covariance()(12); + auto nli = tracks_view[it].nLayers(); + for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) { + auto const jt = *jp; + auto qj = tracks_view[jt].quality(); + if (qj <= reject) + continue; + auto opj = tracks_view[jt].state()(2); + auto ctj = tracks_view[jt].state()(3); + auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); + if ((cti - ctj) * (cti - ctj) > dct) + continue; + auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); + if ((opi - opj) * (opi - opj) > dop) + continue; + auto nlj = tracks_view[jt].nLayers(); + if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj))))) + tracks_view[jt].quality() = reject; + else { + tracks_view[it].quality() = reject; + break; + } + } + } + } + } + }; + + template + class Kernel_sharedHitCleaner { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + HitsConstView hh, + TkSoAView tracks_view, + int nmin, + bool dupPassThrough, + HitToTuple const *__restrict__ phitToTuple) const { + // quality to mark rejected + auto const reject = dupPassThrough ? 
Quality::loose : Quality::dup; + // quality of longest track + auto const longTqual = Quality::highPurity; + + auto &hitToTuple = *phitToTuple; + + uint32_t l1end = hh.hitsLayerStart()[1]; + + for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) { + if (hitToTuple.size(idx) < 2) + continue; + + int8_t maxNl = 0; + + // find maxNl + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + if (tracks_view[*it].quality() < longTqual) + continue; + // if (tracks_view[*it].nHits()==3) continue; + auto nl = tracks_view[*it].nLayers(); + maxNl = std::max(nl, maxNl); + } + + if (maxNl < 4) + continue; + + // quad pass through (leave for tests) + // maxNl = std::min(4, maxNl); + + // kill all tracks shorter than maxHl (only triplets??? + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + auto nl = tracks_view[*it].nLayers(); + + //checking if shared hit is on bpix1 and if the tuple is short enough + if (idx < l1end and nl > nmin) + continue; + + if (nl < maxNl && tracks_view[*it].quality() > reject) + tracks_view[*it].quality() = reject; + } + } + } + }; + template + class Kernel_tripletCleaner { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + uint16_t nmin, + bool dupPassThrough, + HitToTuple const *__restrict__ phitToTuple) const { + // quality to mark rejected + auto const reject = Quality::loose; + /// min quality of good + auto const good = Quality::strict; + + auto &hitToTuple = *phitToTuple; + + for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) { + if (hitToTuple.size(idx) < 2) + continue; + + float mc = maxScore; + uint16_t im = tkNotFound; + bool onlyTriplets = true; + + // check if only triplets + for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { + if (tracks_view[*it].quality() <= good) + continue; + onlyTriplets &= TracksUtilities::isTriplet(tracks_view, *it); + if (!onlyTriplets) + break; 
+ } + + // only triplets + if (!onlyTriplets) + continue; + + // for triplets choose best tip! (should we first find best quality???) + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (tracks_view[it].quality() >= good && + std::abs(TracksUtilities::tip(tracks_view, it)) < mc) { + mc = std::abs(TracksUtilities::tip(tracks_view, it)); + im = it; + } + } + + if (tkNotFound == im) + continue; + + // mark worse ambiguities + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (tracks_view[it].quality() > reject && it != im) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant + } + + } // loop over hits + } + }; + + template + class Kernel_simpleTripletCleaner { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TkSoAView tracks_view, + uint16_t nmin, + bool dupPassThrough, + HitToTuple const *__restrict__ phitToTuple) const { + // quality to mark rejected + auto const reject = Quality::loose; + /// min quality of good + auto const good = Quality::loose; + + auto &hitToTuple = *phitToTuple; + + for (auto idx : cms::alpakatools::elements_with_stride(acc, hitToTuple.nOnes())) { + if (hitToTuple.size(idx) < 2) + continue; + + float mc = maxScore; + uint16_t im = tkNotFound; + + // choose best tip! (should we first find best quality???) 
+ for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (tracks_view[it].quality() >= good && + std::abs(TracksUtilities::tip(tracks_view, it)) < mc) { + mc = std::abs(TracksUtilities::tip(tracks_view, it)); + im = it; + } + } + + if (tkNotFound == im) + continue; + + // mark worse ambiguities + for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { + auto const it = *ip; + if (tracks_view[it].quality() > reject && TracksUtilities::isTriplet(tracks_view, it) && + it != im) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant + } + + } // loop over hits + } + }; + + template + class Kernel_print_found_ntuplets { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + HitsConstView hh, + TkSoAView tracks_view, + HitToTuple const *__restrict__ phitToTuple, + int32_t firstPrint, + int32_t lastPrint, + int iev) const { + constexpr auto loose = Quality::loose; + + for (auto i : cms::alpakatools::elements_with_stride(acc, tracks_view.hitIndices().nbins())) { + auto nh = tracks_view.hitIndices().size(i); + if (nh < 3) + continue; + if (tracks_view[i].quality() < loose) + continue; + printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n", + 10000 * iev + i, + int(tracks_view[i].quality()), + nh, + tracks_view[i].nLayers(), + reco::charge(tracks_view, i), + //TracksUtilities::charge(tracks_view, i), + tracks_view[i].pt(), + tracks_view[i].eta(), + TracksUtilities::phi(tracks_view, i), + TracksUtilities::tip(tracks_view, i), + TracksUtilities::zip(tracks_view, i), + tracks_view[i].chi2(), + hh[*tracks_view.hitIndices().begin(i)].zGlobal(), + hh[*(tracks_view.hitIndices().begin(i) + 1)].zGlobal(), + hh[*(tracks_view.hitIndices().begin(i) + 2)].zGlobal(), + nh > 3 ? hh[int(*(tracks_view.hitIndices().begin(i) + 3))].zGlobal() : 0, + nh > 4 ? hh[int(*(tracks_view.hitIndices().begin(i) + 4))].zGlobal() : 0, + nh > 5 ? 
hh[int(*(tracks_view.hitIndices().begin(i) + 5))].zGlobal() : 0, + nh > 6 ? hh[int(*(tracks_view.hitIndices().begin(i) + nh - 1))].zGlobal() : 0); + } + } + }; + + class Kernel_printCounters { // debug kernel: prints the counters raw, then normalised per event / per cell + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, Counters const *counters) const { + auto const &c = *counters; + printf( + "||Counters | nEvents | nHits | nCells | nTuples | nFitTracks | nLooseTracks | nGoodTracks | " + "nUsedHits " + "| " + "nDupHits | " + "nFishCells | " + "nKilledCells | " + "nUsedCells | nZeroTrackCells ||\n"); + printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n", + c.nEvents, + c.nHits, + c.nCells, + c.nTuples, + c.nFitTracks, + c.nLooseTracks, + c.nGoodTracks, + c.nUsedHits, + c.nDupHits, + c.nFishCells, + c.nKilledCells, + c.nEmptyCells, // printed in the nUsedCells header column + c.nZeroTrackCells); + printf( + "Counters Norm %lld || %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.1f| %.3f| %.3f| " + "%.3f| " + "%.3f||\n", + c.nEvents, + c.nHits / double(c.nEvents), + c.nCells / double(c.nEvents), + c.nTuples / double(c.nEvents), + c.nFitTracks / double(c.nEvents), + c.nLooseTracks / double(c.nEvents), + c.nGoodTracks / double(c.nEvents), + c.nUsedHits / double(c.nEvents), + c.nDupHits / double(c.nEvents), + c.nFishCells / double(c.nCells), + c.nKilledCells / double(c.nCells), + c.nEmptyCells / double(c.nCells), + c.nZeroTrackCells / double(c.nCells)); + } + }; + } // namespace caHitNtupletGeneratorKernels +} // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h new file mode 100644 index 0000000000000..0b5ab0a985163 --- /dev/null +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoublets.h @@ -0,0 +1,71 @@ +#ifndef RecoPixelVertexing_PixelTriplets_alpaka_CAPixelDoublets_h +#define RecoPixelVertexing_PixelTriplets_alpaka_CAPixelDoublets_h + +#include + +#include
"HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" +#include "CAPixelDoubletsAlgos.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + using namespace alpaka; + using namespace cms::alpakatools; + namespace caPixelDoublets { + + template + class InitDoublets { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const& acc, + OuterHitOfCell* isOuterHitOfCell, + int nHits, + CellNeighborsVector* cellNeighbors, + CellNeighbors* cellNeighborsContainer, + CellTracksVector* cellTracks, + CellTracks* cellTracksContainer) const { + ALPAKA_ASSERT_OFFLOAD((*isOuterHitOfCell).container); + + for (auto i : cms::alpakatools::elements_with_stride(acc, nHits)) + (*isOuterHitOfCell).container[i].reset(); + + if (cms::alpakatools::once_per_grid(acc)) { + cellNeighbors->construct(TrackerTraits::maxNumOfActiveDoublets, cellNeighborsContainer); + cellTracks->construct(TrackerTraits::maxNumOfActiveDoublets, cellTracksContainer); + [[maybe_unused]] auto i = cellNeighbors->extend(acc); + ALPAKA_ASSERT_OFFLOAD(0 == i); + (*cellNeighbors)[0].reset(); + i = cellTracks->extend(acc); + ALPAKA_ASSERT_OFFLOAD(0 == i); + (*cellTracks)[0].reset(); + } + } + }; + + // Not used for the moment, see below. 
+ //constexpr auto getDoubletsFromHistoMaxBlockSize = 64; // for both x and y + //constexpr auto getDoubletsFromHistoMinBlocksPerMP = 16; + + template + class GetDoubletsFromHisto { // kernel functor: forwards its arguments to doubletsFromHisto() + public: + template >> + // #ifdef __CUDACC__ + // __launch_bounds__(getDoubletsFromHistoMaxBlockSize, getDoubletsFromHistoMinBlocksPerMP) // TODO: Alpakify + // #endif + ALPAKA_FN_ACC void operator()(TAcc const& acc, + CACellT* cells, + uint32_t* nCells, + CellNeighborsVector* cellNeighbors, + CellTracksVector* cellTracks, + HitsConstView hh, + OuterHitOfCell* isOuterHitOfCell, + uint32_t nActualPairs, + const uint32_t maxNumOfDoublets, + CellCutsT cuts) const { + doubletsFromHisto( + acc, nActualPairs, maxNumOfDoublets, cells, nCells, cellNeighbors, cellTracks, hh, *isOuterHitOfCell, cuts); + } + }; + } // namespace caPixelDoublets +} // namespace ALPAKA_ACCELERATOR_NAMESPACE +#endif // RecoPixelVertexing_PixelTriplets_alpaka_CAPixelDoublets_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoubletsAlgos.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoubletsAlgos.h new file mode 100644 index 0000000000000..29ce8d7d76e3c --- /dev/null +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAPixelDoubletsAlgos.h @@ -0,0 +1,333 @@ +#ifndef RecoPixelVertexing_PixelTriplets_alpaka_CAPixelDoubletsAlgos_h +#define RecoPixelVertexing_PixelTriplets_alpaka_CAPixelDoubletsAlgos_h + +#include +#include +#include +#include +#include + +#include + +#include "DataFormats/Math/interface/approx_atan2.h" +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/AlpakaInterface/interface/VecArray.h" +#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +#include "CACell.h" +#include "CAStructures.h" + +//#define GPU_DEBUG +//#define NTUPLE_DEBUG + +namespace ALPAKA_ACCELERATOR_NAMESPACE { +
namespace caPixelDoublets { + using namespace cms::alpakatools; + + template + using CellNeighbors = caStructures::CellNeighborsT; + template + using CellTracks = caStructures::CellTracksT; + template + using CellNeighborsVector = caStructures::CellNeighborsVectorT; + template + using CellTracksVector = caStructures::CellTracksVectorT; + template + using OuterHitOfCell = caStructures::OuterHitOfCellT; + template + using HitsConstView = typename CACellT::HitsConstView; + + template + struct CellCutsT { + using H = HitsConstView; + using T = TrackerTraits; + + CellCutsT() = default; + + CellCutsT(const bool doClusterCut, + const bool doZ0Cut, + const bool doPtCut, + const bool idealConditions, + const float z0Cut, + const float ptCut, + const std::vector& phiCutsV) + : doClusterCut_(doClusterCut), + doZ0Cut_(doZ0Cut), + doPtCut_(doPtCut), + idealConditions_(idealConditions), + z0Cut_(z0Cut), + ptCut_(ptCut) { + assert(phiCutsV.size() == TrackerTraits::nPairs); + std::copy(phiCutsV.begin(), phiCutsV.end(), &phiCuts[0]); + } + + bool doClusterCut_; + bool doZ0Cut_; + bool doPtCut_; + bool idealConditions_; //this is actually not used by phase2 + + float z0Cut_; //FIXME: check if could be const now + float ptCut_; + + int phiCuts[T::nPairs]; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool __attribute__((always_inline)) + zSizeCut(const TAcc& acc, H hh, int i, int o) const { + const uint32_t mi = hh[i].detectorIndex(); + + bool innerB1 = mi < T::last_bpix1_detIndex; + bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2; + auto mes = (!innerB1) || isOuterLadder ? 
hh[i].clusterSizeY() : -1; + + if (mes < 0) + return false; + + const uint32_t mo = hh[o].detectorIndex(); + auto so = hh[o].clusterSizeY(); + + auto dz = hh[i].zGlobal() - hh[o].zGlobal(); + auto dr = hh[i].rGlobal() - hh[o].rGlobal(); + + auto innerBarrel = mi < T::last_barrel_detIndex; + auto onlyBarrel = mo < T::last_barrel_detIndex; + + if (not innerBarrel and not onlyBarrel) + return false; + auto dy = innerB1 ? T::maxDYsize12 : T::maxDYsize; + + return onlyBarrel ? so > 0 && std::abs(so - mes) > dy + : innerBarrel && std::abs(mes - int(std::abs(dz / dr) * T::dzdrFact + 0.5f)) > T::maxDYPred; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool __attribute__((always_inline)) + clusterCut(const TAcc& acc, H hh, uint32_t i) const { + const uint32_t mi = hh[i].detectorIndex(); + bool innerB1orB2 = mi < T::last_bpix2_detIndex; + + if (!innerB1orB2) + return false; + + bool innerB1 = mi < T::last_bpix1_detIndex; + bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2; + auto mes = (!innerB1) || isOuterLadder ? 
hh[i].clusterSizeY() : -1; + + if (innerB1) // B1 + if (mes > 0 && mes < T::minYsizeB1) + return true; // only long cluster (5*8) + bool innerB2 = (mi >= T::last_bpix1_detIndex) && (mi < T::last_bpix2_detIndex); //FIXME number + if (innerB2) // B2 and F1 + if (mes > 0 && mes < T::minYsizeB2) + return true; + + return false; + } + }; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void __attribute__((always_inline)) + doubletsFromHisto(const TAcc& acc, + uint32_t nPairs, + const uint32_t maxNumOfDoublets, + CACellT* cells, + uint32_t* nCells, + CellNeighborsVector* cellNeighbors, + CellTracksVector* cellTracks, + HitsConstView hh, + OuterHitOfCell isOuterHitOfCell, + CellCutsT const& cuts) { // ysize cuts (z in the barrel) times 8 + // these are used if doClusterCut is true + + const bool doClusterCut = cuts.doClusterCut_; + const bool doZ0Cut = cuts.doZ0Cut_; + const bool doPtCut = cuts.doPtCut_; + + const float z0cut = cuts.z0Cut_; // cm + const float hardPtCut = cuts.ptCut_; // GeV + // cm (1 GeV track has 1 GeV/c / (e * 3.8T) ~ 87 cm radius in a 3.8T field) + const float minRadius = hardPtCut * 87.78f; + const float minRadius2T4 = 4.f * minRadius * minRadius; + + using PhiBinner = typename TrackingRecHitSoA::PhiBinner; + + auto const& __restrict__ phiBinner = hh.phiBinner(); + uint32_t const* __restrict__ offsets = hh.hitsLayerStart().data(); + ALPAKA_ASSERT_OFFLOAD(offsets); + + auto layerSize = [=](uint8_t li) { return offsets[li + 1] - offsets[li]; }; + + // nPairsMax to be optimized later (originally was 64). + // If it should much be bigger, consider using a block-wide parallel prefix scan, + // e.g. 
see https://nvlabs.github.io/cub/classcub_1_1_warp_scan.html + auto& innerLayerCumulativeSize = alpaka::declareSharedVar(acc); + auto& ntot = alpaka::declareSharedVar(acc); + + constexpr uint32_t dimIndexY = 0u; + constexpr uint32_t dimIndexX = 1u; + const uint32_t threadIdxLocalY(alpaka::getIdx(acc)[dimIndexY]); + const uint32_t threadIdxLocalX(alpaka::getIdx(acc)[dimIndexX]); + + if (threadIdxLocalY == 0 && threadIdxLocalX == 0) { + innerLayerCumulativeSize[0] = layerSize(TrackerTraits::layerPairs[0]); + for (uint32_t i = 1; i < nPairs; ++i) { + innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i - 1] + layerSize(TrackerTraits::layerPairs[2 * i]); + } + ntot = innerLayerCumulativeSize[nPairs - 1]; + } + alpaka::syncBlockThreads(acc); + + // x runs faster + const uint32_t blockDimensionX(alpaka::getWorkDiv(acc)[dimIndexX]); + const auto& [firstElementIdxNoStrideX, endElementIdxNoStrideX] = + cms::alpakatools::element_index_range_in_block(acc, 0u, dimIndexX); + + uint32_t pairLayerId = 0; // cannot go backward + + // Outermost loop on Y + const uint32_t gridDimensionY(alpaka::getWorkDiv(acc)[dimIndexY]); + const auto& [firstElementIdxNoStrideY, endElementIdxNoStrideY] = + cms::alpakatools::element_index_range_in_grid(acc, 0u, dimIndexY); + uint32_t firstElementIdxY = firstElementIdxNoStrideY; + + for (uint32_t j = firstElementIdxY; j < ntot; j += gridDimensionY) { + while (j >= innerLayerCumulativeSize[pairLayerId++]) + ; + --pairLayerId; // move to lower_bound ?? + + ALPAKA_ASSERT_OFFLOAD(pairLayerId < nPairs); + ALPAKA_ASSERT_OFFLOAD(j < innerLayerCumulativeSize[pairLayerId]); + ALPAKA_ASSERT_OFFLOAD(0 == pairLayerId || j >= innerLayerCumulativeSize[pairLayerId - 1]); + + uint8_t inner = TrackerTraits::layerPairs[2 * pairLayerId]; + uint8_t outer = TrackerTraits::layerPairs[2 * pairLayerId + 1]; + ALPAKA_ASSERT_OFFLOAD(outer > inner); + + auto hoff = PhiBinner::histOff(outer); + auto i = (0 == pairLayerId) ? 
j : j - innerLayerCumulativeSize[pairLayerId - 1]; + i += offsets[inner]; + + ALPAKA_ASSERT_OFFLOAD(i >= offsets[inner]); + ALPAKA_ASSERT_OFFLOAD(i < offsets[inner + 1]); + + // found hit corresponding to our cuda thread, now do the job + if (hh[i].detectorIndex() > pixelClustering::maxNumModules) + continue; // invalid + + /* maybe clever, not effective when zoCut is on + auto bpos = (mi%8)/4; // if barrel is 1 for z>0 + auto fpos = (outer>3) & (outer<7); + if ( ((inner<3) & (outer>3)) && bpos!=fpos) continue; + */ + + auto mez = hh[i].zGlobal(); + + if (mez < TrackerTraits::minz[pairLayerId] || mez > TrackerTraits::maxz[pairLayerId]) + continue; + + if (doClusterCut && outer > pixelTopology::last_barrel_layer && cuts.clusterCut(acc, hh, i)) + continue; + + auto mep = hh[i].iphi(); + auto mer = hh[i].rGlobal(); + + // all cuts: true if fails + auto ptcut = [&](int j, int16_t idphi) { + auto r2t4 = minRadius2T4; + auto ri = mer; + auto ro = hh[j].rGlobal(); + auto dphi = short2phi(idphi); + return dphi * dphi * (r2t4 - ri * ro) > (ro - ri) * (ro - ri); + }; + auto z0cutoff = [&](int j) { + auto zo = hh[j].zGlobal(); + auto ro = hh[j].rGlobal(); + auto dr = ro - mer; + return dr > TrackerTraits::maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr; + }; + + auto iphicut = cuts.phiCuts[pairLayerId]; + + auto kl = PhiBinner::bin(int16_t(mep - iphicut)); + auto kh = PhiBinner::bin(int16_t(mep + iphicut)); + auto incr = [](auto& k) { return k = (k + 1) % PhiBinner::nbins(); }; + +#ifdef GPU_DEBUG + int tot = 0; + int nmin = 0; + int tooMany = 0; +#endif + + auto khh = kh; + incr(khh); + for (auto kk = kl; kk != khh; incr(kk)) { +#ifdef GPU_DEBUG + if (kk != kl && kk != kh) + nmin += phiBinner.size(kk + hoff); +#endif + auto const* __restrict__ p = phiBinner.begin(kk + hoff); + auto const* __restrict__ e = phiBinner.end(kk + hoff); + auto const maxpIndex = e - p; + + // Here we parallelize in X + uint32_t firstElementIdxX = 
firstElementIdxNoStrideX; + for (uint32_t pIndex = firstElementIdxX; pIndex < maxpIndex; pIndex += blockDimensionX) { + auto oi = p[pIndex]; // auto oi = __ldg(p); is not allowed since __ldg is device-only + ALPAKA_ASSERT_OFFLOAD(oi >= offsets[outer]); + ALPAKA_ASSERT_OFFLOAD(oi < offsets[outer + 1]); + auto mo = hh[oi].detectorIndex(); + + if (mo > pixelClustering::maxNumModules) + continue; // invalid + + if (doZ0Cut && z0cutoff(oi)) + continue; + + auto mop = hh[oi].iphi(); + uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop))); + + if (idphi > iphicut) + continue; + + if (doClusterCut && cuts.zSizeCut(acc, hh, i, oi)) + continue; + + if (doPtCut && ptcut(oi, idphi)) + continue; + + auto ind = alpaka::atomicAdd(acc, nCells, (uint32_t)1, alpaka::hierarchy::Blocks{}); + if (ind >= maxNumOfDoublets) { + alpaka::atomicSub(acc, nCells, (uint32_t)1, alpaka::hierarchy::Blocks{}); + break; + } // move to SimpleVector?? + cells[ind].init(*cellNeighbors, *cellTracks, hh, pairLayerId, i, oi); + isOuterHitOfCell[oi].push_back(acc, ind); +#ifdef GPU_DEBUG + if (isOuterHitOfCell[oi].full()) + ++tooMany; + ++tot; +#endif + } + } +// #endif +#ifdef GPU_DEBUG + if (tooMany > 0 or tot > 0) + printf("OuterHitOfCell for %d in layer %d/%d, %d,%d %d, %d %.3f %.3f %s\n", + i, + inner, + outer, + nmin, + tot, + tooMany, + iphicut, + TrackerTraits::minz[pairLayerId], + TrackerTraits::maxz[pairLayerId], + tooMany > 0 ? "FULL!!" : "not full."); +#endif + } // loop in block... 
+ } // doubletsFromHisto + } // namespace caPixelDoublets +} // namespace ALPAKA_ACCELERATOR_NAMESPACE +#endif // RecoPixelVertexing_PixelTriplets_alpaka_CAPixelDoubletsAlgos_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/CAStructures.h b/RecoTracker/PixelSeeding/plugins/alpaka/CAStructures.h new file mode 100644 index 0000000000000..6ac7a90c724fc --- /dev/null +++ b/RecoTracker/PixelSeeding/plugins/alpaka/CAStructures.h @@ -0,0 +1,52 @@ +#ifndef RecoPixelVertexing_PixelTriplets_CAStructures_h +#define RecoPixelVertexing_PixelTriplets_CAStructures_h + +#include "HeterogeneousCore/AlpakaInterface/interface/SimpleVector.h" +#include "HeterogeneousCore/AlpakaInterface/interface/VecArray.h" +#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" + +namespace caStructures { + + template + using CellNeighborsT = + cms::alpakatools::VecArray; + + template + using CellTracksT = cms::alpakatools::VecArray; + + template + using CellNeighborsVectorT = cms::alpakatools::SimpleVector>; + + template + using CellTracksVectorT = cms::alpakatools::SimpleVector>; + + template + using OuterHitOfCellContainerT = cms::alpakatools::VecArray; + + template + using TupleMultiplicityT = cms::alpakatools::OneToManyAssocRandomAccess; + + template + using HitToTupleT = + cms::alpakatools::OneToManyAssocRandomAccess; // 3.5 should be enough + + template + using TuplesContainerT = cms::alpakatools::OneToManyAssocRandomAccess; + + template + struct OuterHitOfCellT { // operator[] indexes container at (i - offset) + OuterHitOfCellContainerT* container; + int32_t offset; + constexpr auto& operator[](int i) { return container[i - offset]; } + constexpr auto const& operator[](int i) const { return container[i - offset]; } + }; + +} // namespace caStructures + +#endif // RecoPixelVertexing_PixelTriplets_CAStructures_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/HelixFit.cc b/RecoTracker/PixelSeeding/plugins/alpaka/HelixFit.cc new file mode 100644 index 0000000000000..078cbe8de45a4 --- /dev/null +++ b/RecoTracker/PixelSeeding/plugins/alpaka/HelixFit.cc @@ -0,0
+1,21 @@ +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HelixFit.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + template + void HelixFit::allocate(TupleMultiplicity const *tupleMultiplicity, OutputSoAView &helix_fit_results) { + tuples_ = &helix_fit_results.hitIndices(); + tupleMultiplicity_ = tupleMultiplicity; + outputSoa_ = helix_fit_results; + + ALPAKA_ASSERT_OFFLOAD(tuples_); + ALPAKA_ASSERT_OFFLOAD(tupleMultiplicity_); + } + + template + void HelixFit::deallocate() {} + + template class HelixFit; + template class HelixFit; + template class HelixFit; +} // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/HelixFit.h b/RecoTracker/PixelSeeding/plugins/alpaka/HelixFit.h new file mode 100644 index 0000000000000..908124bb83081 --- /dev/null +++ b/RecoTracker/PixelSeeding/plugins/alpaka/HelixFit.h @@ -0,0 +1,93 @@ +#ifndef RecoPixelVertexing_PixelTriplets_HelixFit_h +#define RecoPixelVertexing_PixelTriplets_HelixFit_h + +#include +#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" +#include "RecoTracker/PixelTrackFitting/interface/alpaka/FitResult.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h" + +#include "CAStructures.h" +namespace riemannFit { + // TODO: Can this be taken from TrackerTraits or somewhere else? 
+ // in case of memory issue can be made smaller + constexpr uint32_t maxNumberOfConcurrentFits = 32 * 1024; + constexpr uint32_t stride = maxNumberOfConcurrentFits; + using Matrix3x4d = Eigen::Matrix; + using Map3x4d = Eigen::Map >; + using Matrix6x4f = Eigen::Matrix; + using Map6x4f = Eigen::Map >; + + // hits + template + using Matrix3xNd = Eigen::Matrix; + template + using Map3xNd = Eigen::Map, 0, Eigen::Stride<3 * stride, stride> >; + // errors + template + using Matrix6xNf = Eigen::Matrix; + template + using Map6xNf = Eigen::Map, 0, Eigen::Stride<6 * stride, stride> >; + // fast fit + using Map4d = Eigen::Map >; + + template //a compile-time bounded for loop + constexpr void rolling_fits(F &&f) { + if constexpr (Start < End) { + f(std::integral_constant()); + rolling_fits(f); + } + } + +} // namespace riemannFit + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + template + class HelixFit { + public: + using TrackingRecHitSoAs = TrackingRecHitSoA; + + using HitView = TrackingRecHitSoAView; + using HitConstView = TrackingRecHitSoAConstView; + + using Tuples = typename reco::TrackSoA::HitContainer; + using OutputSoAView = reco::TrackSoAView; + + using TupleMultiplicity = caStructures::TupleMultiplicityT; + + using ParamsOnDevice = pixelCPEforDevice::ParamsOnDeviceT; + + explicit HelixFit(float bf, bool fitNas4) : bField_(bf), fitNas4_(fitNas4) {} + ~HelixFit() { deallocate(); } + + void setBField(double bField) { bField_ = bField; } + void launchRiemannKernels(const HitConstView &hv, + ParamsOnDevice const *cpeParams, + uint32_t nhits, + uint32_t maxNumberOfTuples, + Queue &queue); + void launchBrokenLineKernels(const HitConstView &hv, + ParamsOnDevice const *cpeParams, + uint32_t nhits, + uint32_t maxNumberOfTuples, + Queue &queue); + + void allocate(TupleMultiplicity const *tupleMultiplicity, OutputSoAView &helix_fit_results); + void deallocate(); + + private: + static constexpr uint32_t maxNumberOfConcurrentFits_ = riemannFit::maxNumberOfConcurrentFits; + + // 
forwarded + Tuples const *tuples_ = nullptr; + TupleMultiplicity const *tupleMultiplicity_ = nullptr; + OutputSoAView outputSoa_; + float bField_; + + const bool fitNas4_; + }; +} // namespace ALPAKA_ACCELERATOR_NAMESPACE +#endif // RecoPixelVertexing_PixelTriplets_HelixFit_h diff --git a/RecoTracker/PixelSeeding/plugins/alpaka/RiemannFit.dev.cc b/RecoTracker/PixelSeeding/plugins/alpaka/RiemannFit.dev.cc new file mode 100644 index 0000000000000..5aa202700580c --- /dev/null +++ b/RecoTracker/PixelSeeding/plugins/alpaka/RiemannFit.dev.cc @@ -0,0 +1,401 @@ +// +// Author: Felice Pantaleo, CERN +// + +#include +#include + +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" +#include "DataFormats/TrackingRecHitSoA/interface/TrackingRecHitsSoA.h" +#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforDevice.h" +#include "RecoTracker/PixelTrackFitting/interface/alpaka/RiemannFit.h" +#include "HelixFit.h" +#include "CAStructures.h" + +template +using Tuples = typename reco::TrackSoA::HitContainer; +template +using OutputSoAView = reco::TrackSoAView; +template +using TupleMultiplicity = caStructures::TupleMultiplicityT; + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + using namespace alpaka; + using namespace cms::alpakatools; + + template + class Kernel_FastFit { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + Tuples const *__restrict__ foundNtuplets, + TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + TrackingRecHitSoAConstView hh, + pixelCPEforDevice::ParamsOnDeviceT const *__restrict__ cpeParams, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit, + uint32_t offset) const { + constexpr uint32_t hitsInFit = N; + + ALPAKA_ASSERT_OFFLOAD(hitsInFit <= nHits); + + ALPAKA_ASSERT_OFFLOAD(pfast_fit); +
ALPAKA_ASSERT_OFFLOAD(foundNtuplets); + ALPAKA_ASSERT_OFFLOAD(tupleMultiplicity); + + // look in bin for this hit multiplicity + +#ifdef RIEMANN_DEBUG + const uint32_t threadIdx(alpaka::getIdx(acc)[0u]); + if (cms::alpakatools::once_per_grid(acc)) + printf("%d Ntuple of size %d for %d hits to fit\n", tupleMultiplicity->size(nHits), nHits, hitsInFit); +#endif + + const auto nt = riemannFit::maxNumberOfConcurrentFits; + for (auto local_idx : cms::alpakatools::elements_with_stride(acc, nt)) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; + + // get it from the ntuple container (one to one to helix) + auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + ALPAKA_ASSERT_OFFLOAD(static_cast(tkid) < foundNtuplets->nOnes()); + + ALPAKA_ASSERT_OFFLOAD(foundNtuplets->size(tkid) == nHits); + + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); + + // Prepare data structure + auto const *hitId = foundNtuplets->begin(tkid); + for (unsigned int i = 0; i < hitsInFit; ++i) { + auto hit = hitId[i]; + float ge[6]; + cpeParams->detParams(hh[hit].detectorIndex()).frame.toGlobal(hh[hit].xerrLocal(), 0, hh[hit].yerrLocal(), ge); + + hits.col(i) << hh[hit].xGlobal(), hh[hit].yGlobal(), hh[hit].zGlobal(); + hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; + } + riemannFit::fastFit(acc, hits, fast_fit); + + // no NaN here.... 
+ ALPAKA_ASSERT_OFFLOAD(fast_fit(0) == fast_fit(0)); + ALPAKA_ASSERT_OFFLOAD(fast_fit(1) == fast_fit(1)); + ALPAKA_ASSERT_OFFLOAD(fast_fit(2) == fast_fit(2)); + ALPAKA_ASSERT_OFFLOAD(fast_fit(3) == fast_fit(3)); + } + } + }; + + template + class Kernel_CircleFit { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + double bField, + double *__restrict__ phits, + float *__restrict__ phits_ge, + double *__restrict__ pfast_fit_input, + riemannFit::CircleFit *circle_fit, + uint32_t offset) const { + ALPAKA_ASSERT_OFFLOAD(circle_fit); + ALPAKA_ASSERT_OFFLOAD(N <= nHits); + + // same as above... + + // look in bin for this hit multiplicity + const auto nt = riemannFit::maxNumberOfConcurrentFits; + for (auto local_idx : cms::alpakatools::elements_with_stride(acc, nt)) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; + + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit_input + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); + + riemannFit::VectorNd rad = (hits.block(0, 0, 2, N).colwise().norm()); + + riemannFit::Matrix2Nd hits_cov = riemannFit::Matrix2Nd::Zero(); + riemannFit::loadCovariance2D(acc, hits_ge, hits_cov); + + circle_fit[local_idx] = + riemannFit::circleFit(acc, hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, bField, true); + +#ifdef RIEMANN_DEBUG +// auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); +// printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", tkid, +// circle_fit[local_idx].par(0), circle_fit[local_idx].par(1), circle_fit[local_idx].par(2)); +#endif + } + } + }; + + template + class Kernel_LineFit { + public: + template >> + ALPAKA_FN_ACC void operator()(TAcc const &acc, + TupleMultiplicity const *__restrict__ tupleMultiplicity, + uint32_t nHits, + double bField, + OutputSoAView results_view, + double *__restrict__ phits, + float 
*__restrict__ phits_ge, + double *__restrict__ pfast_fit_input, + riemannFit::CircleFit *__restrict__ circle_fit, + uint32_t offset) const { + ALPAKA_ASSERT_OFFLOAD(circle_fit); + ALPAKA_ASSERT_OFFLOAD(N <= nHits); + + // same as above... + + // look in bin for this hit multiplicity + const auto nt = riemannFit::maxNumberOfConcurrentFits; + for (auto local_idx : cms::alpakatools::elements_with_stride(acc, nt)) { + auto tuple_idx = local_idx + offset; + if (tuple_idx >= tupleMultiplicity->size(nHits)) + break; + + // get it for the ntuple container (one to one to helix) + int32_t tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + + riemannFit::Map3xNd hits(phits + local_idx); + riemannFit::Map4d fast_fit(pfast_fit_input + local_idx); + riemannFit::Map6xNf hits_ge(phits_ge + local_idx); + + auto const &line_fit = riemannFit::lineFit(acc, hits, hits_ge, circle_fit[local_idx], fast_fit, bField, true); + + riemannFit::fromCircleToPerigee(acc, circle_fit[local_idx]); + + TracksUtilities::copyFromCircle(results_view, + circle_fit[local_idx].par, + circle_fit[local_idx].cov, + line_fit.par, + line_fit.cov, + 1.f / float(bField), + tkid); + results_view[tkid].pt() = bField / std::abs(circle_fit[local_idx].par(2)); + results_view[tkid].eta() = asinhf(line_fit.par(0)); + results_view[tkid].chi2() = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); + +#ifdef RIEMANN_DEBUG + printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", + N, + nHits, + tkid, + circle_fit[local_idx].par(0), + circle_fit[local_idx].par(1), + circle_fit[local_idx].par(2)); + printf("kernelLineFit line.par(0,1): %d %f,%f\n", tkid, line_fit.par(0), line_fit.par(1)); + printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n", + circle_fit[local_idx].chi2, + line_fit.chi2, + circle_fit[local_idx].cov(0, 0), + circle_fit[local_idx].cov(1, 1), + circle_fit[local_idx].cov(2, 2), + line_fit.cov(0, 0), + line_fit.cov(1, 1)); +#endif + } + } + }; + + template + void 
HelixFit::launchRiemannKernels(const TrackingRecHitSoAConstView &hv, + pixelCPEforDevice::ParamsOnDeviceT const *cpeParams, + uint32_t nhits, + uint32_t maxNumberOfTuples, + Queue &queue) { + assert(tuples_); + + auto blockSize = 64; + auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; + const auto workDivTriplets = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + const auto workDivQuadsPenta = cms::alpakatools::make_workdiv(numberOfBlocks / 4, blockSize); + + // Fit internals: hit buffers are sized for the largest fit launched below (5 hits), as an N-hit fit addresses 3*N (hits) and 6*N (errors) strided slots per fit + auto hitsDevice = cms::alpakatools::make_device_buffer( + queue, maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<5>) / sizeof(double)); + auto hits_geDevice = cms::alpakatools::make_device_buffer( + queue, maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<5>) / sizeof(float)); + auto fast_fit_resultsDevice = cms::alpakatools::make_device_buffer( + queue, maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double)); + auto circle_fit_resultsDevice_holder = + cms::alpakatools::make_device_buffer(queue, maxNumberOfConcurrentFits_ * sizeof(riemannFit::CircleFit)); + riemannFit::CircleFit *circle_fit_resultsDevice_ = + (riemannFit::CircleFit *)(circle_fit_resultsDevice_holder.data()); + + for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { + // triplets + alpaka::exec(queue, + workDivTriplets, + Kernel_FastFit<3, TrackerTraits>{}, + tuples_, + tupleMultiplicity_, + 3, + hv, + cpeParams, + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data(), + offset); + + alpaka::exec(queue, + workDivTriplets, + Kernel_CircleFit<3, TrackerTraits>{}, + tupleMultiplicity_, + 3, + bField_, + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data(), + circle_fit_resultsDevice_, + offset); + + alpaka::exec(queue, + workDivTriplets, + Kernel_LineFit<3, TrackerTraits>{}, + tupleMultiplicity_, + 3, + bField_, + outputSoa_, + hitsDevice.data(), + hits_geDevice.data(),
+ fast_fit_resultsDevice.data(), + circle_fit_resultsDevice_, + offset); + + // quads + alpaka::exec(queue, + workDivQuadsPenta, + Kernel_FastFit<4, TrackerTraits>{}, + tuples_, + tupleMultiplicity_, + 4, + hv, + cpeParams, + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data(), + offset); + + alpaka::exec(queue, + workDivQuadsPenta, + Kernel_CircleFit<4, TrackerTraits>{}, + tupleMultiplicity_, + 4, + bField_, + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data(), + circle_fit_resultsDevice_, + offset); + + alpaka::exec(queue, + workDivQuadsPenta, + Kernel_LineFit<4, TrackerTraits>{}, + tupleMultiplicity_, + 4, + bField_, + outputSoa_, + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data(), + circle_fit_resultsDevice_, + offset); + + if (fitNas4_) { + // penta + alpaka::exec(queue, + workDivQuadsPenta, + Kernel_FastFit<4, TrackerTraits>{}, + tuples_, + tupleMultiplicity_, + 5, + hv, + cpeParams, + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data(), + offset); + + alpaka::exec(queue, + workDivQuadsPenta, + Kernel_CircleFit<4, TrackerTraits>{}, + tupleMultiplicity_, + 5, + bField_, + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data(), + circle_fit_resultsDevice_, + offset); + + alpaka::exec(queue, + workDivQuadsPenta, + Kernel_LineFit<4, TrackerTraits>{}, + tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data(), + circle_fit_resultsDevice_, + offset); + } else { + // penta all 5 + alpaka::exec(queue, + workDivQuadsPenta, + Kernel_FastFit<5, TrackerTraits>{}, + tuples_, + tupleMultiplicity_, + 5, + hv, + cpeParams, + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data(), + offset); + + alpaka::exec(queue, + workDivQuadsPenta, + Kernel_CircleFit<5, TrackerTraits>{}, + tupleMultiplicity_, + 5, + bField_, + hitsDevice.data(), + hits_geDevice.data(), + 
fast_fit_resultsDevice.data(), + circle_fit_resultsDevice_, + offset); + + alpaka::exec(queue, + workDivQuadsPenta, + Kernel_LineFit<5, TrackerTraits>{}, + tupleMultiplicity_, + 5, + bField_, + outputSoa_, + hitsDevice.data(), + hits_geDevice.data(), + fast_fit_resultsDevice.data(), + circle_fit_resultsDevice_, + offset); + } + } + } + + template class HelixFit; + template class HelixFit; + template class HelixFit; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/RecoTracker/PixelSeeding/plugins/gpuPixelDoubletsAlgos.h b/RecoTracker/PixelSeeding/plugins/gpuPixelDoubletsAlgos.h index b86ba09949416..583021081d534 100644 --- a/RecoTracker/PixelSeeding/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoTracker/PixelSeeding/plugins/gpuPixelDoubletsAlgos.h @@ -9,15 +9,15 @@ #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "DataFormats/Math/interface/approx_atan2.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "CAStructures.h" #include "GPUCACell.h" -// #define GPU_DEBUG -// #define NTUPLE_DEBUG +//#define GPU_DEBUG +//#define NTUPLE_DEBUG namespace gpuPixelDoublets { @@ -287,8 +287,8 @@ namespace gpuPixelDoublets { } // #endif #ifdef GPU_DEBUG - if (tooMany > 0) - printf("OuterHitOfCell full for %d in layer %d/%d, %d,%d %d, %d %.3f %.3f\n", + if (tooMany > 0 || tot > 0) + printf("OuterHitOfCell for %d in layer %d/%d, %d,%d %d, %d %.3f %.3f %s\n", i, inner, outer, @@ -297,7 +297,8 @@ namespace gpuPixelDoublets { tooMany, iphicut, TrackerTraits::minz[pairLayerId], - TrackerTraits::maxz[pairLayerId]); + TrackerTraits::maxz[pairLayerId], + tooMany > 0 ? "FULL!!" : "not full."); #endif } // loop in block... 
} diff --git a/RecoTracker/PixelSeeding/test/BuildFile.xml b/RecoTracker/PixelSeeding/test/BuildFile.xml index 37e12c0ec6aed..74e7849e410e4 100644 --- a/RecoTracker/PixelSeeding/test/BuildFile.xml +++ b/RecoTracker/PixelSeeding/test/BuildFile.xml @@ -28,3 +28,10 @@ + + + + + + + diff --git a/RecoTracker/PixelSeeding/test/alpaka/CAsizes_t.cpp b/RecoTracker/PixelSeeding/test/alpaka/CAsizes_t.cpp new file mode 100644 index 0000000000000..770957d9a79c0 --- /dev/null +++ b/RecoTracker/PixelSeeding/test/alpaka/CAsizes_t.cpp @@ -0,0 +1,40 @@ +#include "RecoTracker/PixelSeeding/plugins/alpaka/CACell.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include +#include + +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +template +void print() { + std::cout << "size of " << typeid(T).name() << ' ' << sizeof(T) << std::endl; +} + +int main() { + using namespace pixelTopology; + using namespace caStructures; + //for Phase-I + print>(); + print>(); + print>(); + print>(); + print>(); + print>(); + print>(); + + print>(); + + //for Phase-II + + print>(); + print>(); + print>(); + print>(); + print>(); + print>(); + print>(); + + print>(); + + return 0; +} diff --git a/RecoTracker/PixelTrackFitting/BuildFile.xml b/RecoTracker/PixelTrackFitting/BuildFile.xml index b57493ad60503..c21f4634d0308 100644 --- a/RecoTracker/PixelTrackFitting/BuildFile.xml +++ b/RecoTracker/PixelTrackFitting/BuildFile.xml @@ -1,3 +1,4 @@ + @@ -13,6 +14,7 @@ + diff --git a/RecoTracker/PixelTrackFitting/interface/alpaka/BrokenLine.h b/RecoTracker/PixelTrackFitting/interface/alpaka/BrokenLine.h new file mode 100644 index 0000000000000..9e656e2de18dc --- /dev/null +++ b/RecoTracker/PixelTrackFitting/interface/alpaka/BrokenLine.h @@ -0,0 +1,634 @@ +#ifndef RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h +#define RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h +#include +#include +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include 
"RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace brokenline { + using namespace cms::alpakatools; + using namespace ::riemannFit; + + //!< Karimäki's parameters: (phi, d, k=1/R) + /*!< covariance matrix: \n + |cov(phi,phi)|cov( d ,phi)|cov( k ,phi)| \n + |cov(phi, d )|cov( d , d )|cov( k , d )| \n + |cov(phi, k )|cov( d , k )|cov( k , k )| \n + as defined in Karimäki V., 1990, Effective circle fitting for particle trajectories, + Nucl. Instr. and Meth. A305 (1991) 187. + */ + using karimaki_circle_fit = riemannFit::CircleFit; + + /*! + \brief data needed for the Broken Line fit procedure. + */ + template + struct PreparedBrokenLineData { + int qCharge; //!< particle charge + riemannFit::Matrix2xNd radii; //!< xy data in the system in which the pre-fitted center is the origin + riemannFit::VectorNd sTransverse; //!< total distance traveled in the transverse plane + // starting from the pre-fitted closest approach + riemannFit::VectorNd sTotal; //!< total distance traveled (three-dimensional) + riemannFit::VectorNd zInSZplane; //!< orthogonal coordinate to the pre-fitted line in the sz plane + riemannFit::VectorNd varBeta; //!< kink angles in the SZ plane + }; + + /*! + \brief Computes the Coulomb multiple scattering variance of the planar angle. + + \param length length of the track in the material. + \param bField magnetic field in Gev/cm/c. + \param radius radius of curvature (needed to evaluate p). + \param layer denotes which of the four layers of the detector is the endpoint of the + * multiple scattered track. For example, if Layer=3, then the particle has + * just gone through the material between the second and the third layer. + + \todo add another Layer variable to identify also the start point of the track, + * so if there are missing hits or multiple hits, the part of the detector that + * the particle has traversed can be exactly identified. 
+ + \warning the formula used here assumes beta=1, and so neglects the dependence + * of theta_0 on the mass of the particle at fixed momentum. + + \return the variance of the planar angle ((theta_0)^2 /3). + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE double multScatt( + const TAcc& acc, const double& length, const double bField, const double radius, int layer, double slope) { + // limit R to 20GeV... + auto pt2 = alpaka::math::min(acc, 20., bField * radius); + pt2 *= pt2; + constexpr double inv_X0 = 0.06 / 16.; //!< inverse of radiation length of the material in cm + //if(Layer==1) XXI_0=0.06/16.; + // else XXI_0=0.06/16.; + //XX_0*=1; + + //! number between 1/3 (uniform material) and 1 (thin scatterer) to be manually tuned + constexpr double geometry_factor = 0.7; + constexpr double fact = geometry_factor * riemannFit::sqr(13.6 / 1000.); + return fact / (pt2 * (1. + riemannFit::sqr(slope))) * (alpaka::math::abs(acc, length) * inv_X0) * + riemannFit::sqr(1. + 0.038 * log(alpaka::math::abs(acc, length) * inv_X0)); + } + + /*! + \brief Computes the 2D rotation matrix that transforms the line y=slope*x into the line y=0. + + \param slope tangent of the angle of rotation. + + \return 2D rotation matrix. + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE riemannFit::Matrix2d rotationMatrix(const TAcc& acc, double slope) { + riemannFit::Matrix2d rot; + rot(0, 0) = 1. / alpaka::math::sqrt(acc, 1. + riemannFit::sqr(slope)); + rot(0, 1) = slope * rot(0, 0); + rot(1, 0) = -rot(0, 1); + rot(1, 1) = rot(0, 0); + return rot; + } + + /*! + \brief Changes the Karimäki parameters (and consequently their covariance matrix) under a + * translation of the coordinate system, such that the old origin has coordinates (x0,y0) + * in the new coordinate system. The formulas are taken from Karimäki V., 1990, Effective + * circle fitting for particle trajectories, Nucl. Instr. and Meth. A305 (1991) 187. + + \param circle circle fit in the old coordinate system. 
circle.par(0) is phi, circle.par(1) is d and circle.par(2) is rho. + \param x0 x coordinate of the translation vector. + \param y0 y coordinate of the translation vector. + \param jacobian passed by reference in order to save stack. + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void translateKarimaki( + const TAcc& acc, karimaki_circle_fit& circle, double x0, double y0, riemannFit::Matrix3d& jacobian) { + // Avoid multiple access to the circle.par vector. + using scalar = typename std::remove_reference::type; + scalar phi = circle.par(0); + scalar dee = circle.par(1); + scalar rho = circle.par(2); + + // Avoid repeated trig. computations + scalar sinPhi = alpaka::math::sin(acc, phi); + scalar cosPhi = alpaka::math::cos(acc, phi); + + // Intermediate computations for the circle parameters + scalar deltaPara = x0 * cosPhi + y0 * sinPhi; + scalar deltaOrth = x0 * sinPhi - y0 * cosPhi + dee; + scalar tempSmallU = 1 + rho * dee; + scalar tempC = -rho * y0 + tempSmallU * cosPhi; + scalar tempB = rho * x0 + tempSmallU * sinPhi; + scalar tempA = 2. * deltaOrth + rho * (riemannFit::sqr(deltaOrth) + riemannFit::sqr(deltaPara)); + scalar tempU = alpaka::math::sqrt(acc, 1. + rho * tempA); + + // Intermediate computations for the error matrix transform + scalar xi = 1. / (riemannFit::sqr(tempB) + riemannFit::sqr(tempC)); + scalar tempV = 1. + rho * deltaOrth; + scalar lambda = (0.5 * tempA) / (riemannFit::sqr(1. + tempU) * tempU); + scalar mu = 1. / (tempU * (1. + tempU)) + rho * lambda; + scalar zeta = riemannFit::sqr(deltaOrth) + riemannFit::sqr(deltaPara); + jacobian << xi * tempSmallU * tempV, -xi * riemannFit::sqr(rho) * deltaOrth, xi * deltaPara, + 2. * mu * tempSmallU * deltaPara, 2. * mu * tempV, mu * zeta - lambda * tempA, 0, 0, 1.; + + // translated circle parameters + // phi + circle.par(0) = alpaka::math::atan2(acc, tempB, tempC); + // d + circle.par(1) = tempA / (1 + tempU); + // rho after translation. 
It is invariant, so noop + // circle.par(2)= rho; + + // translated error matrix + circle.cov = jacobian * circle.cov * jacobian.transpose(); + } + + /*! + \brief Computes the data needed for the Broken Line fit procedure that are mainly common for the circle and the line fit. + + \param hits hits coordinates. + \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). + \param bField magnetic field in Gev/cm/c. + \param results PreparedBrokenLineData to be filled (see description of PreparedBrokenLineData). + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void __attribute__((always_inline)) + prepareBrokenLineData(const TAcc& acc, + const M3xN& hits, + const V4& fast_fit, + const double bField, + PreparedBrokenLineData& results) { + riemannFit::Vector2d dVec; + riemannFit::Vector2d eVec; + + int mId = 1; + + if constexpr (n > 3) { + riemannFit::Vector2d middle = 0.5 * (hits.block(0, n - 1, 2, 1) + hits.block(0, 0, 2, 1)); + auto d1 = (hits.block(0, n / 2, 2, 1) - middle).squaredNorm(); + auto d2 = (hits.block(0, n / 2 - 1, 2, 1) - middle).squaredNorm(); + mId = d1 < d2 ? n / 2 : n / 2 - 1; + } + + dVec = hits.block(0, mId, 2, 1) - hits.block(0, 0, 2, 1); + eVec = hits.block(0, n - 1, 2, 1) - hits.block(0, mId, 2, 1); + results.qCharge = riemannFit::cross2D(acc, dVec, eVec) > 0 ? 
-1 : 1; + + const double slope = -results.qCharge / fast_fit(3); + + riemannFit::Matrix2d rotMat = rotationMatrix(acc, slope); + + // calculate radii and s + results.radii = hits.block(0, 0, 2, n) - fast_fit.head(2) * riemannFit::MatrixXd::Constant(1, n, 1); + eVec = -fast_fit(2) * fast_fit.head(2) / fast_fit.head(2).norm(); + for (u_int i = 0; i < n; i++) { + dVec = results.radii.block(0, i, 2, 1); + results.sTransverse(i) = + results.qCharge * fast_fit(2) * + alpaka::math::atan2( + acc, riemannFit::cross2D(acc, dVec, eVec), dVec.dot(eVec)); // calculates the arc length + } + riemannFit::VectorNd zVec = hits.block(2, 0, 1, n).transpose(); + + //calculate sTotal and zVec + riemannFit::Matrix2xNd pointsSZ = riemannFit::Matrix2xNd::Zero(); + for (u_int i = 0; i < n; i++) { + pointsSZ(0, i) = results.sTransverse(i); + pointsSZ(1, i) = zVec(i); + pointsSZ.block(0, i, 2, 1) = rotMat * pointsSZ.block(0, i, 2, 1); + } + results.sTotal = pointsSZ.block(0, 0, 1, n).transpose(); + results.zInSZplane = pointsSZ.block(1, 0, 1, n).transpose(); + + //calculate varBeta + results.varBeta(0) = results.varBeta(n - 1) = 0; + for (u_int i = 1; i < n - 1; i++) { + results.varBeta(i) = + multScatt(acc, results.sTotal(i + 1) - results.sTotal(i), bField, fast_fit(2), i + 2, slope) + + multScatt(acc, results.sTotal(i) - results.sTotal(i - 1), bField, fast_fit(2), i + 1, slope); + } + } + + /*! + \brief Computes the n-by-n band matrix obtained minimizing the Broken Line's cost function w.r.t u. + * This is the whole matrix in the case of the line fit and the main n-by-n block in the case + * of the circle fit. + + \param weights weights of the first part of the cost function, the one with the measurements + * and not the angles (\sum_{i=1}^n w*(y_i-u_i)^2). + \param sTotal total distance traveled by the particle from the pre-fitted closest approach. + \param varBeta kink angles' variance. 
+ + \return the n-by-n matrix of the linear system + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE riemannFit::MatrixNd matrixC_u(const TAcc& acc, + const riemannFit::VectorNd& weights, + const riemannFit::VectorNd& sTotal, + const riemannFit::VectorNd& varBeta) { + riemannFit::MatrixNd c_uMat = riemannFit::MatrixNd::Zero(); + for (u_int i = 0; i < n; i++) { + c_uMat(i, i) = weights(i); + if (i > 1) + c_uMat(i, i) += 1. / (varBeta(i - 1) * riemannFit::sqr(sTotal(i) - sTotal(i - 1))); + if (i > 0 && i < n - 1) + c_uMat(i, i) += + (1. / varBeta(i)) * riemannFit::sqr((sTotal(i + 1) - sTotal(i - 1)) / + ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1)))); + if (i < n - 2) + c_uMat(i, i) += 1. / (varBeta(i + 1) * riemannFit::sqr(sTotal(i + 1) - sTotal(i))); + + if (i > 0 && i < n - 1) + c_uMat(i, i + 1) = + 1. / (varBeta(i) * (sTotal(i + 1) - sTotal(i))) * + (-(sTotal(i + 1) - sTotal(i - 1)) / ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1)))); + if (i < n - 2) + c_uMat(i, i + 1) += + 1. / (varBeta(i + 1) * (sTotal(i + 1) - sTotal(i))) * + (-(sTotal(i + 2) - sTotal(i)) / ((sTotal(i + 2) - sTotal(i + 1)) * (sTotal(i + 1) - sTotal(i)))); + + if (i < n - 2) + c_uMat(i, i + 2) = 1. / (varBeta(i + 1) * (sTotal(i + 2) - sTotal(i + 1)) * (sTotal(i + 1) - sTotal(i))); + + c_uMat(i, i) *= 0.5; + } + return c_uMat + c_uMat.transpose(); + } + + /*! + \brief A very fast helix fit. + + \param hits the measured hits. + + \return (X0,Y0,R,tan(theta)). + + \warning sign of theta is (intentionally, for now) mistaken for negative charges. 
+ */ + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fastFit(const TAcc& acc, const M3xN& hits, V4& result) { + constexpr uint32_t n = M3xN::ColsAtCompileTime; + + int mId = 1; + + if constexpr (n > 3) { + riemannFit::Vector2d middle = 0.5 * (hits.block(0, n - 1, 2, 1) + hits.block(0, 0, 2, 1)); + auto d1 = (hits.block(0, n / 2, 2, 1) - middle).squaredNorm(); + auto d2 = (hits.block(0, n / 2 - 1, 2, 1) - middle).squaredNorm(); + mId = d1 < d2 ? n / 2 : n / 2 - 1; + } + + const riemannFit::Vector2d a = hits.block(0, mId, 2, 1) - hits.block(0, 0, 2, 1); + const riemannFit::Vector2d b = hits.block(0, n - 1, 2, 1) - hits.block(0, mId, 2, 1); + const riemannFit::Vector2d c = hits.block(0, 0, 2, 1) - hits.block(0, n - 1, 2, 1); + + auto tmp = 0.5 / riemannFit::cross2D(acc, c, a); + result(0) = hits(0, 0) - (a(1) * c.squaredNorm() + c(1) * a.squaredNorm()) * tmp; + result(1) = hits(1, 0) + (a(0) * c.squaredNorm() + c(0) * a.squaredNorm()) * tmp; + // check Wikipedia for these formulas + + result(2) = alpaka::math::sqrt(acc, a.squaredNorm() * b.squaredNorm() * c.squaredNorm()) / + (2. * alpaka::math::abs(acc, riemannFit::cross2D(acc, b, a))); + // Using Math Olympiad's formula R=abc/(4A) + + const riemannFit::Vector2d d = hits.block(0, 0, 2, 1) - result.head(2); + const riemannFit::Vector2d e = hits.block(0, n - 1, 2, 1) - result.head(2); + + result(3) = result(2) * atan2(riemannFit::cross2D(acc, d, e), d.dot(e)) / (hits(2, n - 1) - hits(2, 0)); + // ds/dz slope between last and first point + } + + /*! + \brief Performs the Broken Line fit in the curved track case (that is, the fit + * parameters are the interceptions u and the curvature correction \Delta\kappa). + + \param hits hits coordinates. + \param hits_cov hits covariance matrix. + \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). + \param bField magnetic field in Gev/cm/c. + \param data PreparedBrokenLineData. 
+ \param circle_results struct to be filled with the results in this form: + -par parameter of the line in this form: (phi, d, k); \n + -cov covariance matrix of the fitted parameter; \n + -chi2 value of the cost function in the minimum. + + \details The function implements the steps 2 and 3 of the Broken Line fit + * with the curvature correction.\n + * The step 2 is the least square fit, done by imposing the minimum constraint on + * the cost function and solving the consequent linear system. It determines the + * fitted parameters u and \Delta\kappa and their covariance matrix. + * The step 3 is the correction of the fast pre-fitted parameters for the innermost + * part of the track. It is first done in a comfortable coordinate system (the one + * in which the first hit is the origin) and then the parameters and their + * covariance matrix are transformed to the original coordinate system. + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void circleFit(const TAcc& acc, + const M3xN& hits, + const M6xN& hits_ge, + const V4& fast_fit, + const double bField, + PreparedBrokenLineData& data, + karimaki_circle_fit& circle_results) { + circle_results.qCharge = data.qCharge; + auto& radii = data.radii; + const auto& sTransverse = data.sTransverse; + const auto& sTotal = data.sTotal; + auto& zInSZplane = data.zInSZplane; + auto& varBeta = data.varBeta; + const double slope = -circle_results.qCharge / fast_fit(3); + varBeta *= 1. + riemannFit::sqr(slope); // the kink angles are projected! 
+ + for (u_int i = 0; i < n; i++) { + zInSZplane(i) = radii.block(0, i, 2, 1).norm() - fast_fit(2); + } + + riemannFit::Matrix2d vMat; // covariance matrix + riemannFit::VectorNd weightsVec; // weights + riemannFit::Matrix2d rotMat; // rotation matrix point by point + for (u_int i = 0; i < n; i++) { + vMat(0, 0) = hits_ge.col(i)[0]; // x errors + vMat(0, 1) = vMat(1, 0) = hits_ge.col(i)[1]; // cov_xy + vMat(1, 1) = hits_ge.col(i)[2]; // y errors + rotMat = rotationMatrix(acc, -radii(0, i) / radii(1, i)); + weightsVec(i) = + 1. / ((rotMat * vMat * rotMat.transpose())(1, 1)); // compute the orthogonal weight point by point + } + + riemannFit::VectorNplusONEd r_uVec; + r_uVec(n) = 0; + for (u_int i = 0; i < n; i++) { + r_uVec(i) = weightsVec(i) * zInSZplane(i); + } + + riemannFit::MatrixNplusONEd c_uMat; + c_uMat.block(0, 0, n, n) = matrixC_u(acc, weightsVec, sTransverse, varBeta); + c_uMat(n, n) = 0; + //add the border to the c_uMat matrix + for (u_int i = 0; i < n; i++) { + c_uMat(i, n) = 0; + if (i > 0 && i < n - 1) { + c_uMat(i, n) += + -(sTransverse(i + 1) - sTransverse(i - 1)) * (sTransverse(i + 1) - sTransverse(i - 1)) / + (2. * varBeta(i) * (sTransverse(i + 1) - sTransverse(i)) * (sTransverse(i) - sTransverse(i - 1))); + } + if (i > 1) { + c_uMat(i, n) += + (sTransverse(i) - sTransverse(i - 2)) / (2. * varBeta(i - 1) * (sTransverse(i) - sTransverse(i - 1))); + } + if (i < n - 2) { + c_uMat(i, n) += + (sTransverse(i + 2) - sTransverse(i)) / (2. * varBeta(i + 1) * (sTransverse(i + 1) - sTransverse(i))); + } + c_uMat(n, i) = c_uMat(i, n); + if (i > 0 && i < n - 1) + c_uMat(n, n) += riemannFit::sqr(sTransverse(i + 1) - sTransverse(i - 1)) / (4. 
* varBeta(i)); + } + +#ifdef CPP_DUMP + std::cout << "CU5\n" << c_uMat << std::endl; +#endif + riemannFit::MatrixNplusONEd iMat; + math::cholesky::invert(c_uMat, iMat); +#ifdef CPP_DUMP + std::cout << "I5\n" << iMat << std::endl; +#endif + riemannFit::VectorNplusONEd uVec = iMat * r_uVec; // obtain the fitted parameters by solving the linear system + + // compute (phi, d_ca, k) in the system in which the midpoint of the first two corrected hits is the origin... + + radii.block(0, 0, 2, 1) /= radii.block(0, 0, 2, 1).norm(); + radii.block(0, 1, 2, 1) /= radii.block(0, 1, 2, 1).norm(); + + riemannFit::Vector2d dVec = hits.block(0, 0, 2, 1) + (-zInSZplane(0) + uVec(0)) * radii.block(0, 0, 2, 1); + riemannFit::Vector2d eVec = hits.block(0, 1, 2, 1) + (-zInSZplane(1) + uVec(1)) * radii.block(0, 1, 2, 1); + auto eMinusd = eVec - dVec; + auto eMinusd2 = eMinusd.squaredNorm(); + auto tmp1 = 1. / eMinusd2; + auto tmp2 = alpaka::math::sqrt(acc, riemannFit::sqr(fast_fit(2)) - 0.25 * eMinusd2); + + circle_results.par << atan2(eMinusd(1), eMinusd(0)), circle_results.qCharge * (tmp2 - fast_fit(2)), + circle_results.qCharge * (1. / fast_fit(2) + uVec(n)); + + tmp2 = 1. / tmp2; + + riemannFit::Matrix3d jacobian; + jacobian << (radii(1, 0) * eMinusd(0) - eMinusd(1) * radii(0, 0)) * tmp1, + (radii(1, 1) * eMinusd(0) - eMinusd(1) * radii(0, 1)) * tmp1, 0, + circle_results.qCharge * (eMinusd(0) * radii(0, 0) + eMinusd(1) * radii(1, 0)) * tmp2, + circle_results.qCharge * (eMinusd(0) * radii(0, 1) + eMinusd(1) * radii(1, 1)) * tmp2, 0, 0, 0, + circle_results.qCharge; + + circle_results.cov << iMat(0, 0), iMat(0, 1), iMat(0, n), iMat(1, 0), iMat(1, 1), iMat(1, n), iMat(n, 0), + iMat(n, 1), iMat(n, n); + + circle_results.cov = jacobian * circle_results.cov * jacobian.transpose(); + + //...Translate in the system in which the first corrected hit is the origin, adding the m.s. correction... 
+ + translateKarimaki(acc, circle_results, 0.5 * eMinusd(0), 0.5 * eMinusd(1), jacobian); + circle_results.cov(0, 0) += + (1 + riemannFit::sqr(slope)) * multScatt(acc, sTotal(1) - sTotal(0), bField, fast_fit(2), 2, slope); + + //...And translate back to the original system + + translateKarimaki(acc, circle_results, dVec(0), dVec(1), jacobian); + + // compute chi2 + circle_results.chi2 = 0; + for (u_int i = 0; i < n; i++) { + circle_results.chi2 += weightsVec(i) * riemannFit::sqr(zInSZplane(i) - uVec(i)); + if (i > 0 && i < n - 1) + circle_results.chi2 += + riemannFit::sqr(uVec(i - 1) / (sTransverse(i) - sTransverse(i - 1)) - + uVec(i) * (sTransverse(i + 1) - sTransverse(i - 1)) / + ((sTransverse(i + 1) - sTransverse(i)) * (sTransverse(i) - sTransverse(i - 1))) + + uVec(i + 1) / (sTransverse(i + 1) - sTransverse(i)) + + (sTransverse(i + 1) - sTransverse(i - 1)) * uVec(n) / 2) / + varBeta(i); + } + } + + /*! + \brief Performs the Broken Line fit in the straight track case (that is, the fit parameters are only the interceptions u). + + \param hits hits coordinates. + \param fast_fit pre-fit result in the form (X0,Y0,R,tan(theta)). + \param bField magnetic field in Gev/cm/c. + \param data PreparedBrokenLineData. + \param line_results struct to be filled with the results in this form: + -par parameter of the line in this form: (cot(theta), Zip); \n + -cov covariance matrix of the fitted parameter; \n + -chi2 value of the cost function in the minimum. + + \details The function implements the steps 2 and 3 of the Broken Line fit without + * the curvature correction.\n + * The step 2 is the least square fit, done by imposing the minimum constraint + * on the cost function and solving the consequent linear system. It determines + * the fitted parameters u and their covariance matrix. + * The step 3 is the correction of the fast pre-fitted parameters for the innermost + * part of the track. 
It is first done in a comfortable coordinate system (the one + * in which the first hit is the origin) and then the parameters and their covariance + * matrix are transformed to the original coordinate system. + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void lineFit(const TAcc& acc, + const M6xN& hits_ge, + const V4& fast_fit, + const double bField, + const PreparedBrokenLineData& data, + riemannFit::LineFit& line_results) { + const auto& radii = data.radii; + const auto& sTotal = data.sTotal; + const auto& zInSZplane = data.zInSZplane; + const auto& varBeta = data.varBeta; + + const double slope = -data.qCharge / fast_fit(3); + riemannFit::Matrix2d rotMat = rotationMatrix(acc, slope); + + riemannFit::Matrix3d vMat = riemannFit::Matrix3d::Zero(); // covariance matrix XYZ + riemannFit::Matrix2x3d jacobXYZtosZ = + riemannFit::Matrix2x3d::Zero(); // jacobian for computation of the error on s (xyz -> sz) + riemannFit::VectorNd weights = riemannFit::VectorNd::Zero(); + for (u_int i = 0; i < n; i++) { + vMat(0, 0) = hits_ge.col(i)[0]; // x errors + vMat(0, 1) = vMat(1, 0) = hits_ge.col(i)[1]; // cov_xy + vMat(0, 2) = vMat(2, 0) = hits_ge.col(i)[3]; // cov_xz + vMat(1, 1) = hits_ge.col(i)[2]; // y errors + vMat(2, 1) = vMat(1, 2) = hits_ge.col(i)[4]; // cov_yz + vMat(2, 2) = hits_ge.col(i)[5]; // z errors + auto tmp = 1. / radii.block(0, i, 2, 1).norm(); + jacobXYZtosZ(0, 0) = radii(1, i) * tmp; + jacobXYZtosZ(0, 1) = -radii(0, i) * tmp; + jacobXYZtosZ(1, 2) = 1.; + weights(i) = 1. 
/ ((rotMat * jacobXYZtosZ * vMat * jacobXYZtosZ.transpose() * rotMat.transpose())( + 1, 1)); // compute the orthogonal weight point by point + } + + riemannFit::VectorNd r_u; + for (u_int i = 0; i < n; i++) { + r_u(i) = weights(i) * zInSZplane(i); + } +#ifdef CPP_DUMP + std::cout << "CU4\n" << matrixC_u(w, sTotal, varBeta) << std::endl; +#endif + riemannFit::MatrixNd iMat; + math::cholesky::invert(matrixC_u(acc, weights, sTotal, varBeta), iMat); +#ifdef CPP_DUMP + std::cout << "I4\n" << iMat << std::endl; +#endif + + riemannFit::VectorNd uVec = iMat * r_u; // obtain the fitted parameters by solving the linear system + + // line parameters in the system in which the first hit is the origin and with axis along SZ + line_results.par << (uVec(1) - uVec(0)) / (sTotal(1) - sTotal(0)), uVec(0); + auto idiff = 1. / (sTotal(1) - sTotal(0)); + line_results.cov << (iMat(0, 0) - 2 * iMat(0, 1) + iMat(1, 1)) * riemannFit::sqr(idiff) + + multScatt(acc, sTotal(1) - sTotal(0), bField, fast_fit(2), 2, slope), + (iMat(0, 1) - iMat(0, 0)) * idiff, (iMat(0, 1) - iMat(0, 0)) * idiff, iMat(0, 0); + + // translate to the original SZ system + riemannFit::Matrix2d jacobian; + jacobian(0, 0) = 1.; + jacobian(0, 1) = 0; + jacobian(1, 0) = -sTotal(0); + jacobian(1, 1) = 1.; + line_results.par(1) += -line_results.par(0) * sTotal(0); + line_results.cov = jacobian * line_results.cov * jacobian.transpose(); + + // rotate to the original sz system + auto tmp = rotMat(0, 0) - line_results.par(0) * rotMat(0, 1); + jacobian(1, 1) = 1. 
/ tmp; + jacobian(0, 0) = jacobian(1, 1) * jacobian(1, 1); + jacobian(0, 1) = 0; + jacobian(1, 0) = line_results.par(1) * rotMat(0, 1) * jacobian(0, 0); + line_results.par(1) = line_results.par(1) * jacobian(1, 1); + line_results.par(0) = (rotMat(0, 1) + line_results.par(0) * rotMat(0, 0)) * jacobian(1, 1); + line_results.cov = jacobian * line_results.cov * jacobian.transpose(); + + // compute chi2 + line_results.chi2 = 0; + for (u_int i = 0; i < n; i++) { + line_results.chi2 += weights(i) * riemannFit::sqr(zInSZplane(i) - uVec(i)); + if (i > 0 && i < n - 1) + line_results.chi2 += riemannFit::sqr(uVec(i - 1) / (sTotal(i) - sTotal(i - 1)) - + uVec(i) * (sTotal(i + 1) - sTotal(i - 1)) / + ((sTotal(i + 1) - sTotal(i)) * (sTotal(i) - sTotal(i - 1))) + + uVec(i + 1) / (sTotal(i + 1) - sTotal(i))) / + varBeta(i); + } + } + + /*! + \brief Helix fit by three step: + -fast pre-fit (see Fast_fit() for further info); \n + -circle fit of the hits projected in the transverse plane by Broken Line algorithm (see BL_Circle_fit() for further info); \n + -line fit of the hits projected on the (pre-fitted) cilinder surface by Broken Line algorithm (see BL_Line_fit() for further info); \n + Points must be passed ordered (from inner to outer layer). + + \param hits Matrix3xNd hits coordinates in this form: \n + |x1|x2|x3|...|xn| \n + |y1|y2|y3|...|yn| \n + |z1|z2|z3|...|zn| + \param hits_cov Matrix3Nd covariance matrix in this form (()->cov()): \n + |(x1,x1)|(x2,x1)|(x3,x1)|(x4,x1)|.|(y1,x1)|(y2,x1)|(y3,x1)|(y4,x1)|.|(z1,x1)|(z2,x1)|(z3,x1)|(z4,x1)| \n + |(x1,x2)|(x2,x2)|(x3,x2)|(x4,x2)|.|(y1,x2)|(y2,x2)|(y3,x2)|(y4,x2)|.|(z1,x2)|(z2,x2)|(z3,x2)|(z4,x2)| \n + |(x1,x3)|(x2,x3)|(x3,x3)|(x4,x3)|.|(y1,x3)|(y2,x3)|(y3,x3)|(y4,x3)|.|(z1,x3)|(z2,x3)|(z3,x3)|(z4,x3)| \n + |(x1,x4)|(x2,x4)|(x3,x4)|(x4,x4)|.|(y1,x4)|(y2,x4)|(y3,x4)|(y4,x4)|.|(z1,x4)|(z2,x4)|(z3,x4)|(z4,x4)| \n + . . . . . . . . . . . . . . . 
\n + |(x1,y1)|(x2,y1)|(x3,y1)|(x4,y1)|.|(y1,y1)|(y2,y1)|(y3,x1)|(y4,y1)|.|(z1,y1)|(z2,y1)|(z3,y1)|(z4,y1)| \n + |(x1,y2)|(x2,y2)|(x3,y2)|(x4,y2)|.|(y1,y2)|(y2,y2)|(y3,x2)|(y4,y2)|.|(z1,y2)|(z2,y2)|(z3,y2)|(z4,y2)| \n + |(x1,y3)|(x2,y3)|(x3,y3)|(x4,y3)|.|(y1,y3)|(y2,y3)|(y3,x3)|(y4,y3)|.|(z1,y3)|(z2,y3)|(z3,y3)|(z4,y3)| \n + |(x1,y4)|(x2,y4)|(x3,y4)|(x4,y4)|.|(y1,y4)|(y2,y4)|(y3,x4)|(y4,y4)|.|(z1,y4)|(z2,y4)|(z3,y4)|(z4,y4)| \n + . . . . . . . . . . . . . . . \n + |(x1,z1)|(x2,z1)|(x3,z1)|(x4,z1)|.|(y1,z1)|(y2,z1)|(y3,z1)|(y4,z1)|.|(z1,z1)|(z2,z1)|(z3,z1)|(z4,z1)| \n + |(x1,z2)|(x2,z2)|(x3,z2)|(x4,z2)|.|(y1,z2)|(y2,z2)|(y3,z2)|(y4,z2)|.|(z1,z2)|(z2,z2)|(z3,z2)|(z4,z2)| \n + |(x1,z3)|(x2,z3)|(x3,z3)|(x4,z3)|.|(y1,z3)|(y2,z3)|(y3,z3)|(y4,z3)|.|(z1,z3)|(z2,z3)|(z3,z3)|(z4,z3)| \n + |(x1,z4)|(x2,z4)|(x3,z4)|(x4,z4)|.|(y1,z4)|(y2,z4)|(y3,z4)|(y4,z4)|.|(z1,z4)|(z2,z4)|(z3,z4)|(z4,z4)| + \param bField magnetic field in the center of the detector in Gev/cm/c, in order to perform the p_t calculation. + + \warning see BL_Circle_fit(), BL_Line_fit() and Fast_fit() warnings. + + \bug see BL_Circle_fit(), BL_Line_fit() and Fast_fit() bugs. + + \return (phi,Tip,p_t,cot(theta)),Zip), their covariance matrix and the chi2's of the circle and line fits. 
+ */ + + template + class helixFit { + public: + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const TAcc& acc, + const riemannFit::Matrix3xNd* hits, + const Eigen::Matrix* hits_ge, + const double bField, + riemannFit::HelixFit* helix) const { + riemannFit::Vector4d fast_fit; + fastFit(acc, *hits, fast_fit); + + PreparedBrokenLineData data; + karimaki_circle_fit circle; + riemannFit::LineFit line; + riemannFit::Matrix3d jacobian; + + prepareBrokenLineData(acc, *hits, fast_fit, bField, data); + lineFit(acc, *hits_ge, fast_fit, bField, data, line); + circleFit(acc, *hits, *hits_ge, fast_fit, bField, data, circle); + + // the circle fit gives k, but here we want p_t, so let's change the parameter and the covariance matrix + jacobian << 1., 0, 0, 0, 1., 0, 0, 0, + -alpaka::math::abs(acc, circle.par(2)) * bField / (riemannFit::sqr(circle.par(2)) * circle.par(2)); + circle.par(2) = bField / alpaka::math::abs(acc, circle.par(2)); + circle.cov = jacobian * circle.cov * jacobian.transpose(); + + helix->par << circle.par, line.par; + helix->cov = riemannFit::MatrixXd::Zero(5, 5); + helix->cov.block(0, 0, 3, 3) = circle.cov; + helix->cov.block(3, 3, 2, 2) = line.cov; + helix->qCharge = circle.qCharge; + helix->chi2_circle = circle.chi2; + helix->chi2_line = line.chi2; + } + }; + } // namespace brokenline +} // namespace ALPAKA_ACCELERATOR_NAMESPACE +#endif // RecoPixelVertexing_PixelTrackFitting_interface_BrokenLine_h diff --git a/RecoTracker/PixelTrackFitting/interface/alpaka/FitResult.h b/RecoTracker/PixelTrackFitting/interface/alpaka/FitResult.h new file mode 100644 index 0000000000000..3daf271a5ca13 --- /dev/null +++ b/RecoTracker/PixelTrackFitting/interface/alpaka/FitResult.h @@ -0,0 +1,64 @@ +#ifndef RecoPixelVertexing_PixelTrackFitting_interface_FitResult_h +#define RecoPixelVertexing_PixelTrackFitting_interface_FitResult_h + +#include +#include + +#include +#include + +namespace riemannFit { + + using Vector2d = Eigen::Vector2d; + using Vector3d = 
Eigen::Vector3d; + using Vector4d = Eigen::Vector4d; + using Vector5d = Eigen::Matrix; + using Matrix2d = Eigen::Matrix2d; + using Matrix3d = Eigen::Matrix3d; + using Matrix4d = Eigen::Matrix4d; + using Matrix5d = Eigen::Matrix; + using Matrix6d = Eigen::Matrix; + + template + using Matrix3xNd = Eigen::Matrix; // used for the input hits + + struct CircleFit { + Vector3d par; //!< parameters: (X0,Y0,R) + Matrix3d cov; + /*!< covariance matrix: \n + |cov(X0,X0)|cov(Y0,X0)|cov( R,X0)| \n + |cov(X0,Y0)|cov(Y0,Y0)|cov( R,Y0)| \n + |cov(X0, R)|cov(Y0, R)|cov( R, R)| + */ + int32_t qCharge; //!< particle charge + float chi2; + }; + + struct LineFit { + Vector2d par; //!< parameters: (cotan(theta),Zip) + Matrix2d cov; + /*!< covariance matrix: \n + |cov(c_t,c_t)|cov(Zip,c_t)| \n + |cov(c_t,Zip)|cov(Zip,Zip)| + */ + double chi2; + }; + + struct HelixFit { + Vector5d par; //!< parameters: (phi,Tip,pt,cotan(theta),Zip) + Matrix5d cov; + /*!< covariance matrix: \n + |(phi,phi)|(Tip,phi)|(p_t,phi)|(c_t,phi)|(Zip,phi)| \n + |(phi,Tip)|(Tip,Tip)|(p_t,Tip)|(c_t,Tip)|(Zip,Tip)| \n + |(phi,p_t)|(Tip,p_t)|(p_t,p_t)|(c_t,p_t)|(Zip,p_t)| \n + |(phi,c_t)|(Tip,c_t)|(p_t,c_t)|(c_t,c_t)|(Zip,c_t)| \n + |(phi,Zip)|(Tip,Zip)|(p_t,Zip)|(c_t,Zip)|(Zip,Zip)| + */ + float chi2_circle; + float chi2_line; + // Vector4d fast_fit; + int32_t qCharge; //!< particle charge + }; // __attribute__((aligned(16))); + +} // namespace riemannFit +#endif diff --git a/RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h b/RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h new file mode 100644 index 0000000000000..5dfa609ad3905 --- /dev/null +++ b/RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h @@ -0,0 +1,253 @@ +#ifndef RecoPixelVertexing_PixelTrackFitting_alpaka_FitUtils_h +#define RecoPixelVertexing_PixelTrackFitting_alpaka_FitUtils_h +#include +#include "DataFormats/Math/interface/choleskyInversion.h" +#include "RecoTracker/PixelTrackFitting/interface/alpaka/FitResult.h" +namespace riemannFit { + + constexpr double epsilon = 1.e-4; //!< used in 
numerical derivative (J2 in Circle_fit()) + + using VectorXd = Eigen::VectorXd; + using MatrixXd = Eigen::MatrixXd; + template + using MatrixNd = Eigen::Matrix; + template + using MatrixNplusONEd = Eigen::Matrix; + template + using ArrayNd = Eigen::Array; + template + using Matrix2Nd = Eigen::Matrix; + template + using Matrix3Nd = Eigen::Matrix; + template + using Matrix2xNd = Eigen::Matrix; + template + using Array2xNd = Eigen::Array; + template + using MatrixNx3d = Eigen::Matrix; + template + using MatrixNx5d = Eigen::Matrix; + template + using VectorNd = Eigen::Matrix; + template + using VectorNplusONEd = Eigen::Matrix; + template + using Vector2Nd = Eigen::Matrix; + template + using Vector3Nd = Eigen::Matrix; + template + using RowVectorNd = Eigen::Matrix; + template + using RowVector2Nd = Eigen::Matrix; + + using Matrix2x3d = Eigen::Matrix; + + using Matrix3f = Eigen::Matrix3f; + using Vector3f = Eigen::Vector3f; + using Vector4f = Eigen::Vector4f; + using Vector6f = Eigen::Matrix; + // transformation from the "perigee" to the cmssw localcoord frame + // the plane of the latter is the perigee plane... + // from (phi,Tip,q/pt,cotan(theta),Zip) + // to q/p,dx/dz,dy/dz,x,z + template + inline void transformToPerigeePlane(VI5 const& ip, MI5 const& icov, VO5& op, MO5& ocov) { + auto sinTheta2 = 1. / (1. 
+ ip(3) * ip(3)); + auto sinTheta = std::sqrt(sinTheta2); + auto cosTheta = ip(3) * sinTheta; + + op(0) = sinTheta * ip(2); + op(1) = 0.; + op(2) = -ip(3); + op(3) = ip(1); + op(4) = -ip(4); + + Matrix5d jMat = Matrix5d::Zero(); + + jMat(0, 2) = sinTheta; + jMat(0, 3) = -sinTheta2 * cosTheta * ip(2); + jMat(1, 0) = 1.; + jMat(2, 3) = -1.; + jMat(3, 1) = 1.; + jMat(4, 4) = -1; + + ocov = jMat * icov * jMat.transpose(); + } + +} // namespace riemannFit + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace riemannFit { + using namespace ::riemannFit; + + /*! + \brief Debug helper: print every element of the matrix-like object pointed to by m, one per line, each prefixed by prefix. + The body is compiled only when RFIT_DEBUG is defined; otherwise the call does nothing. + */ + template + ALPAKA_FN_ACC void printIt(const TAcc& acc, C* m, const char* prefix = "") { +#ifdef RFIT_DEBUG + for (uint r = 0; r < m->rows(); ++r) { + for (uint c = 0; c < m->cols(); ++c) { + printf("%s Matrix(%d,%d) = %g\n", prefix, r, c, (*m)(r, c)); + } + } +#endif + } + + /*! + \brief Return the square of the argument. + */ + template + constexpr T sqr(const T a) { + return a * a; + } + + /*! + \brief Compute cross product of two 2D vectors (assuming z component 0), + returning z component of the result. + \param a first 2D vector in the product. + \param b second 2D vector in the product. + \return z component of the cross product. + */ + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE double cross2D(const TAcc& acc, const Vector2d& a, const Vector2d& b) { + return a.x() * b.y() - a.y() * b.x(); + } + + /*! 
+ * load error in CMSSW format to our formalism + * + */ + template + ALPAKA_FN_ACC void loadCovariance2D(const TAcc& acc, M6xNf const& ge, M2Nd& hits_cov) { + // Index numerology: + // i: index of the hits/point (0,..,3) + // j: index of space component (x,y,z) + // l: index of space components (x,y,z) + // ge is always in sync with the index i and is formatted as: + // ge[] ==> [xx, xy, yy, xz, yz, zz] + // in (j,l) notation, we have: + // ge[] ==> [(0,0), (0,1), (1,1), (0,2), (1,2), (2,2)] + // so the index ge_idx corresponds to the matrix elements: + // | 0 1 3 | + // | 1 2 4 | + // | 3 4 5 | + constexpr uint32_t hits_in_fit = M6xNf::ColsAtCompileTime; + for (uint32_t i = 0; i < hits_in_fit; ++i) { + { + constexpr uint32_t ge_idx = 0, j = 0, l = 0; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 2, j = 1, l = 1; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 1, j = 1, l = 0; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = + ge.col(i)[ge_idx]; + } + } + } + + template + ALPAKA_FN_ACC void loadCovariance(const TAcc& acc, M6xNf const& ge, M3xNd& hits_cov) { + // Index numerology: + // i: index of the hits/point (0,..,3) + // j: index of space component (x,y,z) + // l: index of space components (x,y,z) + // ge is always in sync with the index i and is formatted as: + // ge[] ==> [xx, xy, yy, xz, yz, zz] + // in (j,l) notation, we have: + // ge[] ==> [(0,0), (0,1), (1,1), (0,2), (1,2), (2,2)] + // so the index ge_idx corresponds to the matrix elements: + // | 0 1 3 | + // | 1 2 4 | + // | 3 4 5 | + constexpr uint32_t hits_in_fit = M6xNf::ColsAtCompileTime; + for (uint32_t i = 0; i < hits_in_fit; ++i) { + { + constexpr uint32_t ge_idx = 0, j = 0, l = 0; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 2, j = 1, l = 1; + 
hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 5, j = 2, l = 2; + hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 1, j = 1, l = 0; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = + ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 3, j = 2, l = 0; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = + ge.col(i)[ge_idx]; + } + { + constexpr uint32_t ge_idx = 4, j = 2, l = 1; + hits_cov(i + l * hits_in_fit, i + j * hits_in_fit) = hits_cov(i + j * hits_in_fit, i + l * hits_in_fit) = + ge.col(i)[ge_idx]; + } + } + } + + /*! + \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,p_t) and + consequently covariance matrix. + \param circle parameters (X0,Y0,R), covariance matrix to + be transformed and particle charge; updated in place. + \param B magnetic field in GeV/cm/c unit. + \param error if true, the covariance matrix is transformed as well. + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void par_uvrtopak(const TAcc& acc, + CircleFit& circle, + const double B, + const bool error) { + Vector3d par_pak; + const double temp0 = circle.par.head(2).squaredNorm(); + const double temp1 = alpaka::math::sqrt(acc, temp0); + par_pak << alpaka::math::atan2(acc, circle.qCharge * circle.par(0), -circle.qCharge * circle.par(1)), + circle.qCharge * (temp1 - circle.par(2)), circle.par(2) * B; + if (error) { + const double temp2 = sqr(circle.par(0)) * 1. / temp0; + const double temp3 = 1. / temp1 * circle.qCharge; + Matrix3d j4Mat; + j4Mat << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., + circle.par(0) * temp3, circle.par(1) * temp3, -circle.qCharge, 0., 0., B; + circle.cov = j4Mat * circle.cov * j4Mat.transpose(); + } + circle.par = par_pak; + } + + /*! 
+ \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,q/R) and + consequently covariance matrix. + \param circle parameters (X0,Y0,R), covariance matrix to + be transformed and particle charge; updated in place. + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fromCircleToPerigee(const TAcc& acc, CircleFit& circle) { + Vector3d par_pak; + const double temp0 = circle.par.head(2).squaredNorm(); + const double temp1 = alpaka::math::sqrt(acc, temp0); + par_pak << alpaka::math::atan2(acc, circle.qCharge * circle.par(0), -circle.qCharge * circle.par(1)), + circle.qCharge * (temp1 - circle.par(2)), circle.qCharge / circle.par(2); + + const double temp2 = sqr(circle.par(0)) * 1. / temp0; + const double temp3 = 1. / temp1 * circle.qCharge; + Matrix3d j4Mat; + j4Mat << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3, + circle.par(1) * temp3, -circle.qCharge, 0., 0., -circle.qCharge / (circle.par(2) * circle.par(2)); + circle.cov = j4Mat * circle.cov * j4Mat.transpose(); + + circle.par = par_pak; + } + + } // namespace riemannFit + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE +#endif // RecoPixelVertexing_PixelTrackFitting_alpaka_FitUtils_h diff --git a/RecoTracker/PixelTrackFitting/interface/alpaka/RiemannFit.h b/RecoTracker/PixelTrackFitting/interface/alpaka/RiemannFit.h new file mode 100644 index 0000000000000..8455a03e9f58f --- /dev/null +++ b/RecoTracker/PixelTrackFitting/interface/alpaka/RiemannFit.h @@ -0,0 +1,1023 @@ +#ifndef RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h +#define RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h +#include +#include "RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + namespace riemannFit { + using namespace ::riemannFit; + /*! 
Compute the Radiation length in the uniform hypothesis + * + * The Pixel detector, barrel and forward, is considered as a homogeneous + * cylinder of material, whose radiation length has been derived from the TDR + * plot that shows that 16cm correspond to 0.06 radiation lengths. Therefore + * one radiation length corresponds to 16cm/0.06 =~ 267 cm. All radiation + * lengths are computed using this unique number, in both regions, barrel and + * endcap. + * + * NB: no angle corrections nor projections are computed inside this routine. + * It is therefore the responsibility of the caller to supply the proper + * lengths in input. These lengths are the path traveled by the particle along + * its trajectory, namely the so called S of the helix in 3D space. + * + * \param length_values vector of incremental distances that will be translated + * into radiation length equivalent. Each radiation length i is computed + * incrementally with respect to the previous length i-1. The first length has + * no reference point (i.e. it has the dca). + * + * \param rad_lengths output: incremental radiation lengths that correspond to each segment. + */ + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void computeRadLenUniformMaterial(const TAcc& acc, + const VNd1& length_values, + VNd2& rad_lengths) { + // Radiation length of the pixel detector in the uniform assumption, with + // 0.06 rad_len at 16 cm + constexpr double xx_0_inv = 0.06 / 16.; + uint n = length_values.rows(); + rad_lengths(0) = length_values(0) * xx_0_inv; + for (uint j = 1; j < n; ++j) { + rad_lengths(j) = alpaka::math::abs(acc, length_values(j) - length_values(j - 1)) * xx_0_inv; + } + } + + /*! + \brief Compute the covariance matrix along cartesian S-Z of points due to + multiple Coulomb scattering to be used in the line_fit, for the barrel + and forward cases. + The input covariance matrix is in the variables s-z, original and + unrotated. 
+ The multiple scattering component is computed in the usual linear + approximation, using the 3D path which is computed as the squared root of + the squared sum of the s and z components passed in. + Internally a rotation by theta is performed and the covariance matrix + returned is the one in the direction orthogonal to the rotated S3D axis, + i.e. along the rotated Z axis. + The choice of the rotation is not arbitrary, but derived from the fact that + putting the horizontal axis along the S3D direction allows the usage of the + ordinary least squared fitting techiques with the trivial parametrization y + = mx + q, avoiding the patological case with m = +/- inf, that would + correspond to the case at eta = 0. + */ + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE auto scatterCovLine(const TAcc& acc, + Matrix2d const* cov_sz, + const V4& fast_fit, + VNd1 const& s_arcs, + VNd2 const& z_values, + const double theta, + const double bField, + MatrixNd& ret) { +#ifdef RFIT_DEBUG + riemannFit::printIt(&s_arcs, "Scatter_cov_line - s_arcs: "); +#endif + constexpr uint n = N; + double p_t = alpaka::math::min(acc, 20., fast_fit(2) * bField); // limit pt to avoid too small error!!! + double p_2 = p_t * p_t * (1. + 1. / sqr(fast_fit(3))); + VectorNd rad_lengths_S; + // See documentation at http://eigen.tuxfamily.org/dox/group__TutorialArrayClass.html + // Basically, to perform cwise operations on Matrices and Vectors, you need + // to transform them into Array-like objects. + VectorNd s_values = s_arcs.array() * s_arcs.array() + z_values.array() * z_values.array(); + s_values = s_values.array().sqrt(); + computeRadLenUniformMaterial(acc, s_values, rad_lengths_S); + VectorNd sig2_S; + sig2_S = .000225 / p_2 * (1. 
+ 0.038 * rad_lengths_S.array().log()).abs2() * rad_lengths_S.array(); +#ifdef RFIT_DEBUG + riemannFit::printIt(cov_sz, "Scatter_cov_line - cov_sz: "); +#endif + Matrix2Nd tmp = Matrix2Nd::Zero(); + for (uint k = 0; k < n; ++k) { + tmp(k, k) = cov_sz[k](0, 0); + tmp(k + n, k + n) = cov_sz[k](1, 1); + tmp(k, k + n) = tmp(k + n, k) = cov_sz[k](0, 1); + } + for (uint k = 0; k < n; ++k) { + for (uint l = k; l < n; ++l) { + for (uint i = 0; i < uint(alpaka::math::min(acc, k, l)); ++i) { + tmp(k + n, l + n) += alpaka::math::abs(acc, s_values(k) - s_values(i)) * + alpaka::math::abs(acc, s_values(l) - s_values(i)) * sig2_S(i); + } + tmp(l + n, k + n) = tmp(k + n, l + n); + } + } + // We are interested only in the errors orthogonal to the rotated s-axis + // which, in our formalism, are in the lower square matrix. +#ifdef RFIT_DEBUG + riemannFit::printIt(&tmp, "Scatter_cov_line - tmp: "); +#endif + ret = tmp.block(n, n, n, n); + } + + /*! + \brief Compute the covariance matrix (in radial coordinates) of points in + the transverse plane due to multiple Coulomb scattering. + \param p2D 2D points in the transverse plane. + \param fast_fit fast_fit Vector4d result of the previous pre-fit + structured in this form:(X0, Y0, R, Tan(Theta))). + \param B magnetic field use to compute p + \return scatter_cov_rad errors due to multiple scattering. + \warning input points must be ordered radially from the detector center + (from inner layer to outer ones; points on the same layer must ordered too). + \details Only the tangential component is computed (the radial one is + negligible). + */ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE MatrixNd scatter_cov_rad( + const TAcc& acc, const M2xN& p2D, const V4& fast_fit, VectorNd const& rad, double B) { + constexpr uint n = N; + double p_t = alpaka::math::min(acc, 20., fast_fit(2) * B); // limit pt to avoid too small error!!! + double p_2 = p_t * p_t * (1. + 1. / sqr(fast_fit(3))); + double theta = atan(fast_fit(3)); + theta = theta < 0. ? 
theta + M_PI : theta; + VectorNd s_values; + VectorNd rad_lengths; + const Vector2d oVec(fast_fit(0), fast_fit(1)); + + // associated Jacobian, used in weights and errors computation + for (uint i = 0; i < n; ++i) { // x + Vector2d pVec = p2D.block(0, i, 2, 1) - oVec; + const double cross = cross2D(acc, -oVec, pVec); + const double dot = (-oVec).dot(pVec); + const double tempAtan2 = atan2(cross, dot); + s_values(i) = alpaka::math::abs(acc, tempAtan2 * fast_fit(2)); + } + computeRadLenUniformMaterial(acc, s_values * sqrt(1. + 1. / sqr(fast_fit(3))), rad_lengths); + MatrixNd scatter_cov_rad = MatrixNd::Zero(); + VectorNd sig2 = (1. + 0.038 * rad_lengths.array().log()).abs2() * rad_lengths.array(); + sig2 *= 0.000225 / (p_2 * sqr(sin(theta))); + for (uint k = 0; k < n; ++k) { + for (uint l = k; l < n; ++l) { + for (uint i = 0; i < uint(alpaka::math::min(acc, k, l)); ++i) { + scatter_cov_rad(k, l) += (rad(k) - rad(i)) * (rad(l) - rad(i)) * sig2(i); + } + scatter_cov_rad(l, k) = scatter_cov_rad(k, l); + } + } +#ifdef RFIT_DEBUG + riemannFit::printIt(&scatter_cov_rad, "Scatter_cov_rad - scatter_cov_rad: "); +#endif + return scatter_cov_rad; + } + + /*! + \brief Transform covariance matrix from radial (only tangential component) + to Cartesian coordinates (only transverse plane component). + \param p2D 2D points in the transverse plane. + \param cov_rad covariance matrix in radial coordinate. + \return cov_cart covariance matrix in Cartesian coordinates. 
+*/ + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE Matrix2Nd cov_radtocart(const TAcc& acc, + const M2xN& p2D, + const MatrixNd& cov_rad, + const VectorNd& rad) { +#ifdef RFIT_DEBUG + printf("Address of p2D: %p\n", &p2D); +#endif + printIt(acc, &p2D, "cov_radtocart - p2D:"); + constexpr uint n = N; + Matrix2Nd cov_cart = Matrix2Nd::Zero(); + VectorNd rad_inv = rad.cwiseInverse(); + printIt(acc, &rad_inv, "cov_radtocart - rad_inv:"); + for (uint i = 0; i < n; ++i) { + for (uint j = i; j < n; ++j) { + cov_cart(i, j) = cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); + cov_cart(i + n, j + n) = cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); + cov_cart(i, j + n) = -cov_rad(i, j) * p2D(1, i) * rad_inv(i) * p2D(0, j) * rad_inv(j); + cov_cart(i + n, j) = -cov_rad(i, j) * p2D(0, i) * rad_inv(i) * p2D(1, j) * rad_inv(j); + cov_cart(j, i) = cov_cart(i, j); + cov_cart(j + n, i + n) = cov_cart(i + n, j + n); + cov_cart(j + n, i) = cov_cart(i, j + n); + cov_cart(j, i + n) = cov_cart(i + n, j); + } + } + return cov_cart; + } + + /*! + \brief Transform covariance matrix from Cartesian coordinates (only + transverse plane component) to radial coordinates (both radial and + tangential component but only diagonal terms, correlations between different + points are not managed). + \param p2D 2D points in transverse plane. + \param cov_cart covariance matrix in Cartesian coordinates. + \return cov_rad covariance matrix in radial coordinates. + \warning correlations between different points are not computed. 
+*/ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE VectorNd cov_carttorad(const TAcc& acc, + const M2xN& p2D, + const Matrix2Nd& cov_cart, + const VectorNd& rad) { + constexpr uint n = N; + VectorNd cov_rad; + const VectorNd rad_inv2 = rad.cwiseInverse().array().square(); + for (uint i = 0; i < n; ++i) { + //!< in case you have (0,0) to avoid dividing by 0 radius + if (rad(i) < 1.e-4) + cov_rad(i) = cov_cart(i, i); + else { + cov_rad(i) = rad_inv2(i) * (cov_cart(i, i) * sqr(p2D(1, i)) + cov_cart(i + n, i + n) * sqr(p2D(0, i)) - + 2. * cov_cart(i, i + n) * p2D(0, i) * p2D(1, i)); + } + } + return cov_rad; + } + + /*! + \brief Transform covariance matrix from Cartesian coordinates (only + transverse plane component) to the coordinate system orthogonal to the + pre-fitted circle in each point. + Further information in attached documentation. + \param p2D 2D points in transverse plane. + \param cov_cart covariance matrix in Cartesian coordinates. + \param fast_fit fast_fit Vector4d result of the previous pre-fit + structured in this form:(X0, Y0, R, tan(theta)). + \return cov_rad covariance matrix in the pre-fitted circle's + orthogonal system. +*/ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE VectorNd cov_carttorad_prefit( + const TAcc& acc, const M2xN& p2D, const Matrix2Nd& cov_cart, V4& fast_fit, const VectorNd& rad) { + constexpr uint n = N; + VectorNd cov_rad; + for (uint i = 0; i < n; ++i) { + //!< in case you have (0,0) to avoid dividing by 0 radius + if (rad(i) < 1.e-4) + cov_rad(i) = cov_cart(i, i); // TO FIX + else { + Vector2d a = p2D.col(i); + Vector2d b = p2D.col(i) - fast_fit.head(2); + const double x2 = a.dot(b); + const double y2 = cross2D(acc, a, b); + const double tan_c = -y2 / x2; + const double tan_c2 = sqr(tan_c); + cov_rad(i) = + 1. / (1. + tan_c2) * (cov_cart(i, i) + cov_cart(i + n, i + n) * tan_c2 + 2 * cov_cart(i, i + n) * tan_c); + } + } + return cov_rad; + } + + /*! 
+ \brief Compute the points' weights' vector for the circle fit when multiple + scattering is managed. + Further information in attached documentation. + \param cov_rad_inv covariance matrix inverse in radial coordinates + (or, better, pre-fitted circle's orthogonal system). + \return weight VectorNd points' weights' vector. + \bug I'm not sure this is the right way to compute the weights for non + diagonal cov matrix. Further investigation needed. +*/ + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE VectorNd weightCircle(const TAcc& acc, const MatrixNd& cov_rad_inv) { + return cov_rad_inv.colwise().sum().transpose(); + } + + /*! + \brief Find particle q considering the sign of cross product between + particles velocity (estimated by the first 2 hits) and the vector radius + between the first hit and the center of the fitted circle. + \param p2D 2D points in transverse plane. + \param par_uvr result of the circle fit in this form: (X0,Y0,R). + \return q int 1 or -1. +*/ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE int32_t charge(const TAcc& acc, const M2xN& p2D, const Vector3d& par_uvr) { + return ((p2D(0, 1) - p2D(0, 0)) * (par_uvr.y() - p2D(1, 0)) - + (p2D(1, 1) - p2D(1, 0)) * (par_uvr.x() - p2D(0, 0)) > + 0) + ? -1 + : 1; + } + + /*! + \brief Compute the eigenvector associated to the minimum eigenvalue. + \param A the Matrix you want to know eigenvector and eigenvalue. + \param chi2 the double where the chi2-related quantity will be stored. + \return the eigenvector associated to the minimum eigenvalue. + \warning double precision is needed for a correct assessment of chi2. + \details The minimum eigenvalue is related to chi2. + We exploit the fact that the matrix is symmetrical and small (2x2 for line + fit and 3x3 for circle fit), so the SelfAdjointEigenSolver from Eigen + library is used, with the computeDirect method (available only for 2x2 + and 3x3 Matrix) which computes eigendecomposition of given matrix using a + fast closed-form algorithm. 
+ For this optimization the matrix type must be known at compile time. +*/ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE Vector3d min_eigen3D(const TAcc& acc, const Matrix3d& A, double& chi2) { +#ifdef RFIT_DEBUG + printf("min_eigen3D - enter\n"); +#endif + Eigen::SelfAdjointEigenSolver solver(3); + solver.computeDirect(A); + int min_index; + chi2 = solver.eigenvalues().minCoeff(&min_index); +#ifdef RFIT_DEBUG + printf("min_eigen3D - exit\n"); +#endif + return solver.eigenvectors().col(min_index); + } + + /*! + \brief A faster version of min_eigen3D() where double precision is not + needed. + \param A the Matrix you want to know eigenvector and eigenvalue. + \note unlike min_eigen3D(), no chi2-related quantity is returned. + \return the eigenvector associated to the minimum eigenvalue. + \details The computeDirect() method of SelfAdjointEigenSolver for 3x3 Matrix + uses trigonometric functions (it solves a third degree equation) which + are faster in single precision. +*/ + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE Vector3d min_eigen3D_fast(const TAcc& acc, const Matrix3d& A) { + Eigen::SelfAdjointEigenSolver solver(3); + solver.computeDirect(A.cast()); + int min_index; + solver.eigenvalues().minCoeff(&min_index); + return solver.eigenvectors().col(min_index).cast(); + } + + /*! + \brief 2D version of min_eigen3D(). + \param aMat the Matrix you want to know eigenvector and eigenvalue. + \param chi2 the double where the chi2-related quantity will be stored + \return the eigenvector associated to the minimum eigenvalue. + \details The computeDirect() method of SelfAdjointEigenSolver for 2x2 Matrix + does not use special math functions (just sqrt) therefore it doesn't speed up + significantly in single precision. 
+*/ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE Vector2d min_eigen2D(const TAcc& acc, const Matrix2d& aMat, double& chi2) { + Eigen::SelfAdjointEigenSolver solver(2); + solver.computeDirect(aMat); + int min_index; + chi2 = solver.eigenvalues().minCoeff(&min_index); + return solver.eigenvectors().col(min_index); + } + + /*! + \brief A very fast helix fit: it fits a circle by three points (first, middle + and last point) and a line by two points (first and last). + \param hits points to be fitted + \return result in this form: (X0,Y0,R,tan(theta)). + \warning points must be passed ordered (from internal layer to external) in + order to maximize accuracy and do not mistake tan(theta) sign. + \details This fast fit is used as pre-fit which is needed for: + - weights estimation and chi2 computation in line fit (fundamental); + - weights estimation and chi2 computation in circle fit (useful); + - computation of error due to multiple scattering. +*/ + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void fastFit(const TAcc& acc, const M3xN& hits, V4& result) { + constexpr uint32_t N = M3xN::ColsAtCompileTime; + constexpr auto n = N; // get the number of hits + printIt(&hits, "Fast_fit - hits: "); + + // CIRCLE FIT + // Make segments between middle-to-first(b) and last-to-first(c) hits + const Vector2d bVec = hits.block(0, n / 2, 2, 1) - hits.block(0, 0, 2, 1); + const Vector2d cVec = hits.block(0, n - 1, 2, 1) - hits.block(0, 0, 2, 1); + printIt(&bVec, "Fast_fit - b: "); + printIt(&cVec, "Fast_fit - c: "); + // Compute their lengths + auto b2 = bVec.squaredNorm(); + auto c2 = cVec.squaredNorm(); + // The algebra has been verified (MR). The usual approach has been followed: + // * use an orthogonal reference frame passing from the first point. + // * build the segments (chords) + // * build orthogonal lines through mid points + // * make a system and solve for X0 and Y0. + // * add the initial point + bool flip = abs(bVec.x()) < abs(bVec.y()); + auto bx = flip ? 
bVec.y() : bVec.x(); + auto by = flip ? bVec.x() : bVec.y(); + auto cx = flip ? cVec.y() : cVec.x(); + auto cy = flip ? cVec.x() : cVec.y(); + //!< in case b.x is 0 (2 hits with same x) + auto div = 2. * (cx * by - bx * cy); + // if aligned TO FIX + auto y0 = (cx * b2 - bx * c2) / div; + auto x0 = (0.5 * b2 - y0 * by) / bx; + result(0) = hits(0, 0) + (flip ? y0 : x0); + result(1) = hits(1, 0) + (flip ? x0 : y0); + result(2) = sqrt(sqr(x0) + sqr(y0)); + printIt(&result, "Fast_fit - result: "); + + // LINE FIT + const Vector2d dVec = hits.block(0, 0, 2, 1) - result.head(2); + const Vector2d eVec = hits.block(0, n - 1, 2, 1) - result.head(2); + printIt(&eVec, "Fast_fit - e: "); + printIt(&dVec, "Fast_fit - d: "); + // Compute the arc-length between first and last point: L = R * theta = R * atan (tan (Theta) ) + auto dr = result(2) * atan2(cross2D(acc, dVec, eVec), dVec.dot(eVec)); + // Simple difference in Z between last and first hit + auto dz = hits(2, n - 1) - hits(2, 0); + + result(3) = (dr / dz); + +#ifdef RFIT_DEBUG + printf("Fast_fit: [%f, %f, %f, %f]\n", result(0), result(1), result(2), result(3)); +#endif + } + + /*! + \brief Fit a generic number of 2D points with a circle using Riemann-Chernov + algorithm. Covariance matrix of fitted parameter is optionally computed. + Multiple scattering (currently only in barrel layer) is optionally handled. + \param hits2D 2D points to be fitted. + \param hits_cov2D covariance matrix of 2D points. + \param fast_fit pre-fit result in this form: (X0,Y0,R,tan(theta)). + (tan(theta) is not used). + \param bField magnetic field + \param error flag for error computation. + \param scattering flag for multiple scattering + \return circle circle_fit: + -par parameter of the fitted circle in this form (X0,Y0,R); \n + -cov covariance matrix of the fitted parameter (not initialized if + error = false); \n + -q charge of the particle; \n + -chi2. 
+ \warning hits must be passed ordered from inner to outer layer (double hits + on the same layer must be ordered too) so that multiple scattering is + treated properly. + \warning Multiple scattering for barrel is still not tested. + \warning Multiple scattering for endcap hits is not handled (yet). Do not + fit endcap hits with scattering = true ! + \bug for small pt (<0.3 Gev/c) chi2 could be slightly underestimated. + \bug further investigation needed for error propagation with multiple + scattering. +*/ + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE CircleFit circleFit(const TAcc& acc, + const M2xN& hits2D, + const Matrix2Nd& hits_cov2D, + const V4& fast_fit, + const VectorNd& rad, + const double bField, + const bool error) { +#ifdef RFIT_DEBUG + printf("circle_fit - enter\n"); +#endif + // INITIALIZATION + Matrix2Nd vMat = hits_cov2D; + constexpr uint n = N; + printIt(&hits2D, "circle_fit - hits2D:"); + printIt(&hits_cov2D, "circle_fit - hits_cov2D:"); + +#ifdef RFIT_DEBUG + printf("circle_fit - WEIGHT COMPUTATION\n"); +#endif + // WEIGHT COMPUTATION + VectorNd weight; + MatrixNd gMat; + double renorm; + { + MatrixNd cov_rad = cov_carttorad_prefit(acc, hits2D, vMat, fast_fit, rad).asDiagonal(); + MatrixNd scatterCovRadMat = scatter_cov_rad(acc, hits2D, fast_fit, rad, bField); + printIt(&scatterCovRadMat, "circle_fit - scatter_cov_rad:"); + printIt(&hits2D, "circle_fit - hits2D bis:"); +#ifdef RFIT_DEBUG + printf("Address of hits2D: a) %p\n", &hits2D); +#endif + vMat += cov_radtocart(acc, hits2D, scatterCovRadMat, rad); + printIt(&vMat, "circle_fit - V:"); + cov_rad += scatterCovRadMat; + printIt(&cov_rad, "circle_fit - cov_rad:"); + math::cholesky::invert(cov_rad, gMat); + // gMat = cov_rad.inverse(); + renorm = gMat.sum(); + gMat *= 1. 
/ renorm; + weight = weightCircle(acc, gMat); + } + printIt(&weight, "circle_fit - weight:"); + + // SPACE TRANSFORMATION +#ifdef RFIT_DEBUG + printf("circle_fit - SPACE TRANSFORMATION\n"); +#endif + + // center +#ifdef RFIT_DEBUG + printf("Address of hits2D: b) %p\n", &hits2D); +#endif + const Vector2d hCentroid = hits2D.rowwise().mean(); // centroid + printIt(&hCentroid, "circle_fit - h_:"); + Matrix3xNd p3D; + p3D.block(0, 0, 2, n) = hits2D.colwise() - hCentroid; + printIt(&p3D, "circle_fit - p3D: a)"); + Vector2Nd mc; // centered hits, used in error computation + mc << p3D.row(0).transpose(), p3D.row(1).transpose(); + printIt(&mc, "circle_fit - mc(centered hits):"); + + // scale + const double tempQ = mc.squaredNorm(); + const double tempS = sqrt(n * 1. / tempQ); // scaling factor + p3D.block(0, 0, 2, n) *= tempS; + + // project on paraboloid + p3D.row(2) = p3D.block(0, 0, 2, n).colwise().squaredNorm(); + printIt(&p3D, "circle_fit - p3D: b)"); + +#ifdef RFIT_DEBUG + printf("circle_fit - COST FUNCTION\n"); +#endif + // COST FUNCTION + + // compute + Vector3d r0; + r0.noalias() = p3D * weight; // center of gravity + const Matrix3xNd xMat = p3D.colwise() - r0; + Matrix3d aMat = xMat * gMat * xMat.transpose(); + printIt(&aMat, "circle_fit - A:"); + +#ifdef RFIT_DEBUG + printf("circle_fit - MINIMIZE\n"); +#endif + // minimize + double chi2; + Vector3d vVec = min_eigen3D(acc, aMat, chi2); +#ifdef RFIT_DEBUG + printf("circle_fit - AFTER MIN_EIGEN\n"); +#endif + printIt(&vVec, "v BEFORE INVERSION"); + vVec *= (vVec(2) > 0) ? 1 : -1; // TO FIX dovrebbe essere N(3)>0 + printIt(&vVec, "v AFTER INVERSION"); + // This hack to be able to run on GPU where the automatic assignment to a + // double from the vector multiplication is not working. 
+#ifdef RFIT_DEBUG + printf("circle_fit - AFTER MIN_EIGEN 1\n"); +#endif + Eigen::Matrix cm; +#ifdef RFIT_DEBUG + printf("circle_fit - AFTER MIN_EIGEN 2\n"); +#endif + cm = -vVec.transpose() * r0; +#ifdef RFIT_DEBUG + printf("circle_fit - AFTER MIN_EIGEN 3\n"); +#endif + const double tempC = cm(0, 0); + +#ifdef RFIT_DEBUG + printf("circle_fit - COMPUTE CIRCLE PARAMETER\n"); +#endif + // COMPUTE CIRCLE PARAMETER + + // auxiliary quantities + const double tempH = sqrt(1. - sqr(vVec(2)) - 4. * tempC * vVec(2)); + const double v2x2_inv = 1. / (2. * vVec(2)); + const double s_inv = 1. / tempS; + Vector3d par_uvr; // used in error propagation + par_uvr << -vVec(0) * v2x2_inv, -vVec(1) * v2x2_inv, tempH * v2x2_inv; + + CircleFit circle; + circle.par << par_uvr(0) * s_inv + hCentroid(0), par_uvr(1) * s_inv + hCentroid(1), par_uvr(2) * s_inv; + circle.qCharge = charge(acc, hits2D, circle.par); + circle.chi2 = abs(chi2) * renorm / sqr(2 * vVec(2) * par_uvr(2) * tempS); + printIt(&circle.par, "circle_fit - CIRCLE PARAMETERS:"); + printIt(&circle.cov, "circle_fit - CIRCLE COVARIANCE:"); +#ifdef RFIT_DEBUG + printf("circle_fit - CIRCLE CHARGE: %d\n", circle.qCharge); +#endif + +#ifdef RFIT_DEBUG + printf("circle_fit - ERROR PROPAGATION\n"); +#endif + // ERROR PROPAGATION + if (error) { +#ifdef RFIT_DEBUG + printf("circle_fit - ERROR PRPAGATION ACTIVATED\n"); +#endif + ArrayNd vcsMat[2][2]; // cov matrix of center & scaled points + MatrixNd cMat[3][3]; // cov matrix of 3D transformed points +#ifdef RFIT_DEBUG + printf("circle_fit - ERROR PRPAGATION ACTIVATED 2\n"); +#endif + { + Eigen::Matrix cm; + Eigen::Matrix cm2; + cm = mc.transpose() * vMat * mc; + const double tempC2 = cm(0, 0); + Matrix2Nd tempVcsMat; + tempVcsMat.template triangularView() = + (sqr(tempS) * vMat + sqr(sqr(tempS)) * 1. / (4. * tempQ * n) * + (2. * vMat.squaredNorm() + 4. 
* tempC2) * // mc.transpose() * V * mc) * + (mc * mc.transpose())); + + printIt(&tempVcsMat, "circle_fit - Vcs:"); + cMat[0][0] = tempVcsMat.block(0, 0, n, n).template selfadjointView(); + vcsMat[0][1] = tempVcsMat.block(0, n, n, n); + cMat[1][1] = tempVcsMat.block(n, n, n, n).template selfadjointView(); + vcsMat[1][0] = vcsMat[0][1].transpose(); + printIt(&tempVcsMat, "circle_fit - Vcs:"); + } + + { + const ArrayNd t0 = (VectorXd::Constant(n, 1.) * p3D.row(0)); + const ArrayNd t1 = (VectorXd::Constant(n, 1.) * p3D.row(1)); + const ArrayNd t00 = p3D.row(0).transpose() * p3D.row(0); + const ArrayNd t01 = p3D.row(0).transpose() * p3D.row(1); + const ArrayNd t11 = p3D.row(1).transpose() * p3D.row(1); + const ArrayNd t10 = t01.transpose(); + vcsMat[0][0] = cMat[0][0]; + cMat[0][1] = vcsMat[0][1]; + cMat[0][2] = 2. * (vcsMat[0][0] * t0 + vcsMat[0][1] * t1); + vcsMat[1][1] = cMat[1][1]; + cMat[1][2] = 2. * (vcsMat[1][0] * t0 + vcsMat[1][1] * t1); + MatrixNd tmp; + tmp.template triangularView() = + (2. * (vcsMat[0][0] * vcsMat[0][0] + vcsMat[0][0] * vcsMat[0][1] + vcsMat[1][1] * vcsMat[1][0] + + vcsMat[1][1] * vcsMat[1][1]) + + 4. 
* (vcsMat[0][0] * t00 + vcsMat[0][1] * t01 + vcsMat[1][0] * t10 + vcsMat[1][1] * t11)) + .matrix(); + cMat[2][2] = tmp.template selfadjointView(); + } + printIt(&cMat[0][0], "circle_fit - C[0][0]:"); + + Matrix3d c0Mat; // cov matrix of center of gravity (r0.x,r0.y,r0.z) + for (uint i = 0; i < 3; ++i) { + for (uint j = i; j < 3; ++j) { + Eigen::Matrix tmp; + tmp = weight.transpose() * cMat[i][j] * weight; + // Workaround to get things working in GPU + const double tempC = tmp(0, 0); + c0Mat(i, j) = tempC; //weight.transpose() * C[i][j] * weight; + c0Mat(j, i) = c0Mat(i, j); + } + } + printIt(&c0Mat, "circle_fit - C0:"); + + const MatrixNd wMat = weight * weight.transpose(); + const MatrixNd hMat = MatrixNd::Identity().rowwise() - weight.transpose(); + const MatrixNx3d s_v = hMat * p3D.transpose(); + printIt(&wMat, "circle_fit - W:"); + printIt(&hMat, "circle_fit - H:"); + printIt(&s_v, "circle_fit - s_v:"); + + MatrixNd dMat[3][3]; // cov(s_v) + dMat[0][0] = (hMat * cMat[0][0] * hMat.transpose()).cwiseProduct(wMat); + dMat[0][1] = (hMat * cMat[0][1] * hMat.transpose()).cwiseProduct(wMat); + dMat[0][2] = (hMat * cMat[0][2] * hMat.transpose()).cwiseProduct(wMat); + dMat[1][1] = (hMat * cMat[1][1] * hMat.transpose()).cwiseProduct(wMat); + dMat[1][2] = (hMat * cMat[1][2] * hMat.transpose()).cwiseProduct(wMat); + dMat[2][2] = (hMat * cMat[2][2] * hMat.transpose()).cwiseProduct(wMat); + dMat[1][0] = dMat[0][1].transpose(); + dMat[2][0] = dMat[0][2].transpose(); + dMat[2][1] = dMat[1][2].transpose(); + printIt(&dMat[0][0], "circle_fit - D_[0][0]:"); + + constexpr uint nu[6][2] = {{0, 0}, {0, 1}, {0, 2}, {1, 1}, {1, 2}, {2, 2}}; + + Matrix6d eMat; // cov matrix of the 6 independent elements of A + for (uint a = 0; a < 6; ++a) { + const uint i = nu[a][0], j = nu[a][1]; + for (uint b = a; b < 6; ++b) { + const uint k = nu[b][0], l = nu[b][1]; + VectorNd t0(n); + VectorNd t1(n); + if (l == k) { + t0 = 2. * dMat[j][l] * s_v.col(l); + if (i == j) + t1 = t0; + else + t1 = 2. 
* dMat[i][l] * s_v.col(l); + } else { + t0 = dMat[j][l] * s_v.col(k) + dMat[j][k] * s_v.col(l); + if (i == j) + t1 = t0; + else + t1 = dMat[i][l] * s_v.col(k) + dMat[i][k] * s_v.col(l); + } + + if (i == j) { + Eigen::Matrix cm; + cm = s_v.col(i).transpose() * (t0 + t1); + // Workaround to get things working in GPU + const double tempC = cm(0, 0); + eMat(a, b) = 0. + tempC; + } else { + Eigen::Matrix cm; + cm = (s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + // Workaround to get things working in GPU + const double tempC = cm(0, 0); + eMat(a, b) = 0. + tempC; //(s_v.col(i).transpose() * t0) + (s_v.col(j).transpose() * t1); + } + if (b != a) + eMat(b, a) = eMat(a, b); + } + } + printIt(&eMat, "circle_fit - E:"); + + Eigen::Matrix j2Mat; // Jacobian of min_eigen() (numerically computed) + for (uint a = 0; a < 6; ++a) { + const uint i = nu[a][0], j = nu[a][1]; + Matrix3d delta = Matrix3d::Zero(); + delta(i, j) = delta(j, i) = abs(aMat(i, j) * epsilon); + j2Mat.col(a) = min_eigen3D_fast(acc, aMat + delta); + const int sign = (j2Mat.col(a)(2) > 0) ? 1 : -1; + j2Mat.col(a) = (j2Mat.col(a) * sign - vVec) / delta(i, j); + } + printIt(&j2Mat, "circle_fit - J2:"); + + Matrix4d cvcMat; // joint cov matrix of (v0,v1,v2,c) + { + Matrix3d t0 = j2Mat * eMat * j2Mat.transpose(); + Vector3d t1 = -t0 * r0; + cvcMat.block(0, 0, 3, 3) = t0; + cvcMat.block(0, 3, 3, 1) = t1; + cvcMat.block(3, 0, 1, 3) = t1.transpose(); + Eigen::Matrix cm1; + Eigen::Matrix cm3; + cm1 = (vVec.transpose() * c0Mat * vVec); + // cm2 = (c0Mat.cwiseProduct(t0)).sum(); + cm3 = (r0.transpose() * t0 * r0); + // Workaround to get things working in GPU + const double tempC = cm1(0, 0) + (c0Mat.cwiseProduct(t0)).sum() + cm3(0, 0); + cvcMat(3, 3) = tempC; + // (v.transpose() * c0Mat * v) + (c0Mat.cwiseProduct(t0)).sum() + (r0.transpose() * t0 * r0); + } + printIt(&cvcMat, "circle_fit - Cvc:"); + + Eigen::Matrix j3Mat; // Jacobian (v0,v1,v2,c)->(X0,Y0,R) + { + const double t = 1. 
/ tempH; + j3Mat << -v2x2_inv, 0, vVec(0) * sqr(v2x2_inv) * 2., 0, 0, -v2x2_inv, vVec(1) * sqr(v2x2_inv) * 2., 0, + vVec(0) * v2x2_inv * t, vVec(1) * v2x2_inv * t, + -tempH * sqr(v2x2_inv) * 2. - (2. * tempC + vVec(2)) * v2x2_inv * t, -t; + } + printIt(&j3Mat, "circle_fit - J3:"); + + const RowVector2Nd Jq = mc.transpose() * tempS * 1. / n; // var(q) + printIt(&Jq, "circle_fit - Jq:"); + + Matrix3d cov_uvr = j3Mat * cvcMat * j3Mat.transpose() * sqr(s_inv) // cov(X0,Y0,R) + + (par_uvr * par_uvr.transpose()) * (Jq * vMat * Jq.transpose()); + + circle.cov = cov_uvr; + } + + printIt(&circle.cov, "Circle cov:"); +#ifdef RFIT_DEBUG + printf("circle_fit - exit\n"); +#endif + return circle; + } + + /*! \brief Perform an ordinary least squares fit in the s-z plane to compute + * the parameters cotTheta and Zip. + * + * The fit is performed in the rotated S3D-Z' plane, following the formalism of + * Frodesen, Chapter 10, p. 259. + * + * The system has been rotated to both try to use the combined errors in s-z + * along Z', as errors in the Y direction and to avoid the pathological case of + * degenerate lines with angular coefficient m = +/- inf. + * + * The rotation is using the information on the theta angle computed in the + * fast fit. The rotation is such that the S3D axis will be the X-direction, + * while the rotated Z-axis will be the Y-direction. This pretty much follows + * what is done in the same fit in the Broken Line approach. + */ + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE LineFit lineFit(const TAcc& acc, + const M3xN& hits, + const M6xN& hits_ge, + const CircleFit& circle, + const V4& fast_fit, + const double bField, + const bool error) { + constexpr uint32_t N = M3xN::ColsAtCompileTime; + constexpr auto n = N; + double theta = -circle.qCharge * atan(fast_fit(3)); + theta = theta < 0. ? 
theta + M_PI : theta; + + // Prepare the Rotation Matrix to rotate the points + Eigen::Matrix rot; + rot << sin(theta), cos(theta), -cos(theta), sin(theta); + + // PROJECTION ON THE CYLINDER + // + // p2D will be: + // [s1, s2, s3, ..., sn] + // [z1, z2, z3, ..., zn] + // s values will be ordinary x-values + // z values will be ordinary y-values + + Matrix2xNd p2D = Matrix2xNd::Zero(); + Eigen::Matrix jxMat; + +#ifdef RFIT_DEBUG + printf("Line_fit - B: %g\n", bField); + printIt(&hits, "Line_fit points: "); + printIt(&hits_ge, "Line_fit covs: "); + printIt(&rot, "Line_fit rot: "); +#endif + // x & associated Jacobian + // cfr https://indico.cern.ch/event/663159/contributions/2707659/attachments/1517175/2368189/Riemann_fit.pdf + // Slide 11 + // a ==> -o i.e. the origin of the circle in XY plane, negative + // b ==> p i.e. distances of the points wrt the origin of the circle. + const Vector2d oVec(circle.par(0), circle.par(1)); + + // associated Jacobian, used in weights and errors computation + Matrix6d covMat = Matrix6d::Zero(); + Matrix2d cov_sz[N]; + for (uint i = 0; i < n; ++i) { + Vector2d pVec = hits.block(0, i, 2, 1) - oVec; + const double cross = cross2D(acc, -oVec, pVec); + const double dot = (-oVec).dot(pVec); + // atan2(cross, dot) gives back the angle in the transverse plane so that the + // final equation reads: x_i = -q*R*theta (theta = angle returned by atan2) + const double tempQAtan2 = -circle.qCharge * atan2(cross, dot); + // p2D.coeffRef(1, i) = atan2_ * circle.par(2); + p2D(0, i) = tempQAtan2 * circle.par(2); + + // associated Jacobian, used in weights and errors computation + const double temp0 = -circle.qCharge * circle.par(2) * 1. 
/ (sqr(dot) + sqr(cross)); + double d_X0 = 0., d_Y0 = 0., d_R = 0.; // good approximation for big pt and eta + if (error) { + d_X0 = -temp0 * ((pVec(1) + oVec(1)) * dot - (pVec(0) - oVec(0)) * cross); + d_Y0 = temp0 * ((pVec(0) + oVec(0)) * dot - (oVec(1) - pVec(1)) * cross); + d_R = tempQAtan2; + } + const double d_x = temp0 * (oVec(1) * dot + oVec(0) * cross); + const double d_y = temp0 * (-oVec(0) * dot + oVec(1) * cross); + jxMat << d_X0, d_Y0, d_R, d_x, d_y, 0., 0., 0., 0., 0., 0., 1.; + + covMat.block(0, 0, 3, 3) = circle.cov; + covMat(3, 3) = hits_ge.col(i)[0]; // x errors + covMat(4, 4) = hits_ge.col(i)[2]; // y errors + covMat(5, 5) = hits_ge.col(i)[5]; // z errors + covMat(3, 4) = covMat(4, 3) = hits_ge.col(i)[1]; // cov_xy + covMat(3, 5) = covMat(5, 3) = hits_ge.col(i)[3]; // cov_xz + covMat(4, 5) = covMat(5, 4) = hits_ge.col(i)[4]; // cov_yz + Matrix2d tmp = jxMat * covMat * jxMat.transpose(); + cov_sz[i].noalias() = rot * tmp * rot.transpose(); + } + // Math of d_{X0,Y0,R,x,y} all verified by hand + p2D.row(1) = hits.row(2); + + // The following matrix will contain errors orthogonal to the rotated S + // component only, with the Multiple Scattering properly treated!! 
+ MatrixNd cov_with_ms; + scatterCovLine(acc, cov_sz, fast_fit, p2D.row(0), p2D.row(1), theta, bField, cov_with_ms); +#ifdef RFIT_DEBUG + printIt(cov_sz, "line_fit - cov_sz:"); + printIt(&cov_with_ms, "line_fit - cov_with_ms: "); +#endif + + // Rotate Points with the shape [2, n] + Matrix2xNd p2D_rot = rot * p2D; + +#ifdef RFIT_DEBUG + printf("Fast fit Tan(theta): %g\n", fast_fit(3)); + printf("Rotation angle: %g\n", theta); + printIt(&rot, "Rotation Matrix:"); + printIt(&p2D, "Original Hits(s,z):"); + printIt(&p2D_rot, "Rotated hits(S3D, Z'):"); + printIt(&rot, "Rotation Matrix:"); +#endif + + // Build the A Matrix + Matrix2xNd aMat; + aMat << MatrixXd::Ones(1, n), p2D_rot.row(0); // rotated s values + +#ifdef RFIT_DEBUG + printIt(&aMat, "A Matrix:"); +#endif + + // Build A^T V-1 A, where V-1 is the covariance of only the Y components. + MatrixNd vyInvMat; + math::cholesky::invert(cov_with_ms, vyInvMat); + // MatrixNd vyInvMat = cov_with_ms.inverse(); + Eigen::Matrix covParamsMat = aMat * vyInvMat * aMat.transpose(); + // Compute the Covariance Matrix of the fit parameters + math::cholesky::invert(covParamsMat, covParamsMat); + + // Now Compute the Parameters in the form [2,1] + // The first component is q. + // The second component is m. + Eigen::Matrix sol = covParamsMat * aMat * vyInvMat * p2D_rot.row(1).transpose(); + +#ifdef RFIT_DEBUG + printIt(&sol, "Rotated solutions:"); +#endif + + // We need now to transfer back the results in the original s-z plane + const auto sinTheta = sin(theta); + const auto cosTheta = cos(theta); + auto common_factor = 1. 
/ (sinTheta - sol(1, 0) * cosTheta); + Eigen::Matrix jMat; + jMat << 0., common_factor * common_factor, common_factor, sol(0, 0) * cosTheta * common_factor * common_factor; + + double tempM = common_factor * (sol(1, 0) * sinTheta + cosTheta); + double tempQ = common_factor * sol(0, 0); + auto cov_mq = jMat * covParamsMat * jMat.transpose(); + + VectorNd res = p2D_rot.row(1).transpose() - aMat.transpose() * sol; + double chi2 = res.transpose() * vyInvMat * res; + + LineFit line; + line.par << tempM, tempQ; + line.cov << cov_mq; + line.chi2 = chi2; + +#ifdef RFIT_DEBUG + printf("Common_factor: %g\n", common_factor); + printIt(&jMat, "Jacobian:"); + printIt(&sol, "Rotated solutions:"); + printIt(&covParamsMat, "Cov_params:"); + printIt(&cov_mq, "Rotated Covariance Matrix:"); + printIt(&(line.par), "Real Parameters:"); + printIt(&(line.cov), "Real Covariance Matrix:"); + printf("Chi2: %g\n", chi2); +#endif + + return line; + } + + } // namespace riemannFit +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +namespace riemannFit { + /*! + \brief Helix fit by three step: + -fast pre-fit (see Fast_fit() for further info); \n + -circle fit of hits projected in the transverse plane by Riemann-Chernov + algorithm (see Circle_fit() for further info); \n + -line fit of hits projected on cylinder surface by orthogonal distance + regression (see Line_fit for further info). \n + Points must be passed ordered (from inner to outer layer). + \param hits Matrix3xNd hits coordinates in this form: \n + |x0|x1|x2|...|xn| \n + |y0|y1|y2|...|yn| \n + |z0|z1|z2|...|zn| + \param hits_cov Matrix3Nd covariance matrix in this form (()->cov()): \n + |(x0,x0)|(x1,x0)|(x2,x0)|.|(y0,x0)|(y1,x0)|(y2,x0)|.|(z0,x0)|(z1,x0)|(z2,x0)| \n + |(x0,x1)|(x1,x1)|(x2,x1)|.|(y0,x1)|(y1,x1)|(y2,x1)|.|(z0,x1)|(z1,x1)|(z2,x1)| \n + |(x0,x2)|(x1,x2)|(x2,x2)|.|(y0,x2)|(y1,x2)|(y2,x2)|.|(z0,x2)|(z1,x2)|(z2,x2)| \n + . . . . . . . . . . . 
\n + |(x0,y0)|(x1,y0)|(x2,y0)|.|(y0,y0)|(y1,y0)|(y2,y0)|.|(z0,y0)|(z1,y0)|(z2,y0)| \n + |(x0,y1)|(x1,y1)|(x2,y1)|.|(y0,y1)|(y1,y1)|(y2,y1)|.|(z0,y1)|(z1,y1)|(z2,y1)| \n + |(x0,y2)|(x1,y2)|(x2,y2)|.|(y0,y2)|(y1,y2)|(y2,y2)|.|(z0,y2)|(z1,y2)|(z2,y2)| \n + . . . . . . . . . . . \n + |(x0,z0)|(x1,z0)|(x2,z0)|.|(y0,z0)|(y1,z0)|(y2,z0)|.|(z0,z0)|(z1,z0)|(z2,z0)| \n + |(x0,z1)|(x1,z1)|(x2,z1)|.|(y0,z1)|(y1,z1)|(y2,z1)|.|(z0,z1)|(z1,z1)|(z2,z1)| \n + |(x0,z2)|(x1,z2)|(x2,z2)|.|(y0,z2)|(y1,z2)|(y2,z2)|.|(z0,z2)|(z1,z2)|(z2,z2)| + \param bField magnetic field in the center of the detector in GeV/cm/c + unit, in order to perform pt calculation. + \param error flag for error computation. + \param scattering flag for multiple scattering treatment. + (see Circle_fit() documentation for further info). + \warning see Circle_fit(), Line_fit() and Fast_fit() warnings. + \bug see Circle_fit(), Line_fit() and Fast_fit() bugs. +*/ + + template + class helixFit { + public: + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void operator()(const TAcc& acc, + const Matrix3xNd* hits, + const Eigen::Matrix* hits_ge, + const double bField, + const bool error, + HelixFit* helix) const { + constexpr uint n = N; + VectorNd<4> rad = (hits->block(0, 0, 2, n).colwise().norm()); + + // Fast_fit gives back (X0, Y0, R, theta) w/o errors, using only 3 points. 
+ Vector4d fast_fit; + ALPAKA_ACCELERATOR_NAMESPACE::riemannFit::fastFit(acc, *hits, fast_fit); + riemannFit::Matrix2Nd hits_cov = MatrixXd::Zero(2 * n, 2 * n); + ALPAKA_ACCELERATOR_NAMESPACE::riemannFit::loadCovariance2D(acc, *hits_ge, hits_cov); + CircleFit circle = ALPAKA_ACCELERATOR_NAMESPACE::riemannFit::circleFit( + acc, hits->block(0, 0, 2, n), hits_cov, fast_fit, rad, bField, error); + LineFit line = + ALPAKA_ACCELERATOR_NAMESPACE::riemannFit::lineFit(acc, *hits, *hits_ge, circle, fast_fit, bField, error); + + ALPAKA_ACCELERATOR_NAMESPACE::riemannFit::par_uvrtopak(acc, circle, bField, error); + + helix->par << circle.par, line.par; + if (error) { + helix->cov = MatrixXd::Zero(5, 5); + helix->cov.block(0, 0, 3, 3) = circle.cov; + helix->cov.block(3, 3, 2, 2) = line.cov; + } + helix->qCharge = circle.qCharge; + helix->chi2_circle = circle.chi2; + helix->chi2_line = line.chi2; + } + }; +} // namespace riemannFit +#endif // RecoPixelVertexing_PixelTrackFitting_interface_RiemannFit_h diff --git a/RecoTracker/PixelTrackFitting/plugins/BuildFile.xml b/RecoTracker/PixelTrackFitting/plugins/BuildFile.xml index d28dad5793a66..6c8c102293651 100644 --- a/RecoTracker/PixelTrackFitting/plugins/BuildFile.xml +++ b/RecoTracker/PixelTrackFitting/plugins/BuildFile.xml @@ -1,8 +1,10 @@ - - - - - + + + + + + + diff --git a/RecoTracker/PixelTrackFitting/plugins/PixelTrackDumpAlpaka.cc b/RecoTracker/PixelTrackFitting/plugins/PixelTrackDumpAlpaka.cc new file mode 100644 index 0000000000000..c4f0b97dba8a9 --- /dev/null +++ b/RecoTracker/PixelTrackFitting/plugins/PixelTrackDumpAlpaka.cc @@ -0,0 +1,79 @@ +#include // needed here by soa layout + +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/global/EDAnalyzer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include 
"FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" + +#include "DataFormats/VertexSoA/interface/ZVertexHost.h" +#include "DataFormats/TrackSoA/interface/TracksHost.h" + +template +class PixelTrackDumpAlpakaT : public edm::global::EDAnalyzer<> { +public: + using TkSoAHost = TracksHost; + using VertexSoAHost = ZVertexHost; + + explicit PixelTrackDumpAlpakaT(const edm::ParameterSet& iConfig); + ~PixelTrackDumpAlpakaT() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; + edm::EDGetTokenT tokenSoATrack_; + edm::EDGetTokenT tokenSoAVertex_; +}; + +template +PixelTrackDumpAlpakaT::PixelTrackDumpAlpakaT(const edm::ParameterSet& iConfig) { + tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); +} + +template +void PixelTrackDumpAlpakaT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("pixelTrackSrc", edm::InputTag("pixelTracksAlpaka")); + desc.add("pixelVertexSrc", edm::InputTag("pixelVerticesAlpaka")); + descriptions.addWithDefaultLabel(desc); +} + +template +void PixelTrackDumpAlpakaT::analyze(edm::StreamID streamID, + edm::Event const& iEvent, + const edm::EventSetup& iSetup) const { + auto const& tracks = iEvent.get(tokenSoATrack_); + assert(tracks.view().quality()); + assert(tracks.view().chi2()); + assert(tracks.view().nLayers()); + assert(tracks.view().eta()); + assert(tracks.view().pt()); + assert(tracks.view().state()); + assert(tracks.view().covariance()); + assert(tracks.view().nTracks()); + + auto const& vertices = iEvent.get(tokenSoAVertex_); + assert(vertices.view().idv()); + 
assert(vertices.view().zv()); + assert(vertices.view().wv()); + assert(vertices.view().chi2()); + assert(vertices.view().ptv2()); + assert(vertices.view().ndof()); + assert(vertices.view().sortInd()); + assert(vertices.view().nvFinal()); +} + +using PixelTrackDumpAlpakaPhase1 = PixelTrackDumpAlpakaT; +using PixelTrackDumpAlpakaPhase2 = PixelTrackDumpAlpakaT; +using PixelTrackDumpAlpakaHIonPhase1 = PixelTrackDumpAlpakaT; + +#include "FWCore/Framework/interface/MakerMacros.h" +DEFINE_FWK_MODULE(PixelTrackDumpAlpakaPhase1); +DEFINE_FWK_MODULE(PixelTrackDumpAlpakaPhase2); +DEFINE_FWK_MODULE(PixelTrackDumpAlpakaHIonPhase1); diff --git a/RecoTracker/PixelTrackFitting/plugins/PixelTrackProducerFromSoAAlpaka.cc b/RecoTracker/PixelTrackFitting/plugins/PixelTrackProducerFromSoAAlpaka.cc new file mode 100644 index 0000000000000..48d9072dc2d71 --- /dev/null +++ b/RecoTracker/PixelTrackFitting/plugins/PixelTrackProducerFromSoAAlpaka.cc @@ -0,0 +1,264 @@ +#include + +#include "DataFormats/BeamSpot/interface/BeamSpot.h" +#include "DataFormats/GeometrySurface/interface/Plane.h" +#include "DataFormats/SiPixelClusterSoA/interface/ClusteringConstants.h" +#include "DataFormats/TrackSoA/interface/TracksHost.h" +#include "DataFormats/TrackReco/interface/Track.h" +#include "DataFormats/TrackReco/interface/TrackExtra.h" +#include "DataFormats/TrackReco/interface/TrackFwd.h" +#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" +#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h" +#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include 
"FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "Geometry/Records/interface/TrackerTopologyRcd.h" +#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" +#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h" +#include "TrackingTools/TrajectoryParametrization/interface/CurvilinearTrajectoryError.h" +#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h" + +#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" +#include "RecoTracker/PixelTrackFitting/interface/alpaka/FitUtils.h" + +#include "storeTracks.h" + +/** + * This class creates "legacy" reco::Track + * objects from the output of SoA CA. + */ + +//#define GPU_DEBUG + +template +class PixelTrackProducerFromSoAAlpaka : public edm::global::EDProducer<> { + using TkSoAHost = TracksHost; + using tracksHelpers = TracksUtilities; + using HMSstorage = std::vector; + +public: + using IndToEdm = std::vector; + + explicit PixelTrackProducerFromSoAAlpaka(const edm::ParameterSet &iConfig); + ~PixelTrackProducerFromSoAAlpaka() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); + +private: + void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override; + + // Event Data tokens + const edm::EDGetTokenT tBeamSpot_; + const edm::EDGetTokenT tokenTrack_; + const edm::EDGetTokenT cpuHits_; + const edm::EDGetTokenT hmsToken_; + // Event Setup tokens + const edm::ESGetToken idealMagneticFieldToken_; + const edm::ESGetToken ttTopoToken_; + + int32_t const minNumberOfHits_; + pixelTrack::Quality const minQuality_; +}; + +template +PixelTrackProducerFromSoAAlpaka::PixelTrackProducerFromSoAAlpaka(const edm::ParameterSet &iConfig) + : 
tBeamSpot_(consumes(iConfig.getParameter("beamSpot"))), + tokenTrack_(consumes(iConfig.getParameter("trackSrc"))), + cpuHits_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), + hmsToken_(consumes(iConfig.getParameter("pixelRecHitLegacySrc"))), + idealMagneticFieldToken_(esConsumes()), + ttTopoToken_(esConsumes()), + minNumberOfHits_(iConfig.getParameter("minNumberOfHits")), + minQuality_(pixelTrack::qualityByName(iConfig.getParameter("minQuality"))) { + if (minQuality_ == pixelTrack::Quality::notQuality) { + throw cms::Exception("PixelTrackConfiguration") + << iConfig.getParameter("minQuality") + " is not a pixelTrack::Quality"; + } + if (minQuality_ < pixelTrack::Quality::dup) { + throw cms::Exception("PixelTrackConfiguration") + << iConfig.getParameter("minQuality") + " not supported"; + } + produces(); + produces(); + // TrackCollection refers to TrackingRechit and TrackExtra + // collections, need to declare its production after them to work + // around a rare race condition in framework scheduling + produces(); + produces(); +} + +template +void PixelTrackProducerFromSoAAlpaka::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { + edm::ParameterSetDescription desc; + desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); + desc.add("trackSrc", edm::InputTag("pixelTracksAlpaka")); + desc.add("pixelRecHitLegacySrc", edm::InputTag("siPixelRecHitsPreSplittingLegacy")); + desc.add("minNumberOfHits", 0); + desc.add("minQuality", "loose"); + descriptions.addWithDefaultLabel(desc); +} + +template +void PixelTrackProducerFromSoAAlpaka::produce(edm::StreamID streamID, + edm::Event &iEvent, + const edm::EventSetup &iSetup) const { + // enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity }; + reco::TrackBase::TrackQuality recoQuality[] = {reco::TrackBase::undefQuality, + reco::TrackBase::undefQuality, + reco::TrackBase::discarded, + reco::TrackBase::loose, + reco::TrackBase::tight, + reco::TrackBase::tight, + 
reco::TrackBase::highPurity}; + assert(reco::TrackBase::highPurity == recoQuality[int(pixelTrack::Quality::highPurity)]); + +#ifdef GPU_DEBUG + std::cout << "Converting soa helix in reco tracks" << std::endl; +#endif + + auto indToEdmP = std::make_unique(); + auto &indToEdm = *indToEdmP; + + auto const &idealField = iSetup.getData(idealMagneticFieldToken_); + + pixeltrackfitting::TracksWithRecHits tracks; + + auto const &httopo = iSetup.getData(ttTopoToken_); + + const auto &bsh = iEvent.get(tBeamSpot_); + GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0()); + + auto const &rechits = iEvent.get(cpuHits_); + std::vector hitmap; + auto const &rcs = rechits.data(); + auto const nhits = rcs.size(); + + hitmap.resize(nhits, nullptr); + + auto const &hitsModuleStart = iEvent.get(hmsToken_); + + for (auto const &hit : rcs) { + auto const &thit = static_cast(hit); + auto const detI = thit.det()->index(); + auto const &clus = thit.firstClusterRef(); + assert(clus.isPixel()); + auto const idx = hitsModuleStart[detI] + clus.pixelCluster().originalId(); + if (idx >= hitmap.size()) + hitmap.resize(idx + 256, nullptr); // only in case of hit overflow in one module + + assert(nullptr == hitmap[idx]); + hitmap[idx] = &hit; + } + + std::vector hits; + hits.reserve(5); + + auto const &tsoa = iEvent.get(tokenTrack_); + auto const *quality = tsoa.view().quality(); + auto const &hitIndices = tsoa.view().hitIndices(); + auto nTracks = tsoa.view().nTracks(); + + tracks.reserve(nTracks); + + int32_t nt = 0; + + //sort index by pt + std::vector sortIdxs(nTracks); + std::iota(sortIdxs.begin(), sortIdxs.end(), 0); + std::sort(sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) { + return tsoa.view()[i1].pt() > tsoa.view()[i2].pt(); + }); + + //store the index of the SoA: indToEdm[index_SoAtrack] -> index_edmTrack (if it exists) + indToEdm.resize(sortIdxs.size(), -1); + for (const auto &it : sortIdxs) { + auto nHits = tracksHelpers::nHits(tsoa.view(), it); + assert(nHits >= 3); 
+ auto q = quality[it]; + + if (q < minQuality_) + continue; + if (nHits < minNumberOfHits_) //move to nLayers? + continue; + indToEdm[it] = nt; + ++nt; + + hits.resize(nHits); + auto b = hitIndices.begin(it); + for (int iHit = 0; iHit < nHits; ++iHit) + hits[iHit] = hitmap[*(b + iHit)]; + + // mind: this values are respect the beamspot! + + float chi2 = tsoa.view()[it].chi2(); + float phi = tracksHelpers::phi(tsoa.view(), it); + + riemannFit::Vector5d ipar, opar; + riemannFit::Matrix5d icov, ocov; + tracksHelpers::template copyToDense(tsoa.view(), ipar, icov, it); + riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); + + LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); + AlgebraicSymMatrix55 m; + for (int i = 0; i < 5; ++i) + for (int j = i; j < 5; ++j) + m(i, j) = ocov(i, j); + + float sp = std::sin(phi); + float cp = std::cos(phi); + Surface::RotationType rot(sp, -cp, 0, 0, 0, -1.f, cp, sp, 0); + + Plane impPointPlane(bs, rot); + GlobalTrajectoryParameters gp( + impPointPlane.toGlobal(lpar.position()), impPointPlane.toGlobal(lpar.momentum()), lpar.charge(), &idealField); + JacobianLocalToCurvilinear jl2c(impPointPlane, lpar, idealField); + + AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(), m); + + int ndof = 2 * hits.size() - 5; + chi2 = chi2 * ndof; + GlobalPoint vv = gp.position(); + math::XYZPoint pos(vv.x(), vv.y(), vv.z()); + GlobalVector pp = gp.momentum(); + math::XYZVector mom(pp.x(), pp.y(), pp.z()); + + auto track = std::make_unique(chi2, ndof, pos, mom, gp.charge(), CurvilinearTrajectoryError(mo)); + + // bad and edup not supported as fit not present or not reliable + auto tkq = recoQuality[int(q)]; + track->setQuality(tkq); + // loose,tight and HP are inclusive + if (reco::TrackBase::highPurity == tkq) { + track->setQuality(reco::TrackBase::tight); + track->setQuality(reco::TrackBase::loose); + } else if (reco::TrackBase::tight == tkq) { + track->setQuality(reco::TrackBase::loose); + } + 
track->setQuality(tkq); + // filter??? + tracks.emplace_back(track.release(), hits); + } +#ifdef GPU_DEBUG + std::cout << "processed " << nt << " good tuples " << tracks.size() << " out of " << indToEdm.size() << std::endl; +#endif + // store tracks + storeTracks(iEvent, tracks, httopo); + iEvent.put(std::move(indToEdmP)); +} + +using PixelTrackProducerFromSoAAlpakaPhase1 = PixelTrackProducerFromSoAAlpaka; +using PixelTrackProducerFromSoAAlpakaPhase2 = PixelTrackProducerFromSoAAlpaka; +using PixelTrackProducerFromSoAAlpakaHIonPhase1 = PixelTrackProducerFromSoAAlpaka; + +#include "FWCore/Framework/interface/MakerMacros.h" +DEFINE_FWK_MODULE(PixelTrackProducerFromSoAAlpakaPhase1); +DEFINE_FWK_MODULE(PixelTrackProducerFromSoAAlpakaPhase2); +DEFINE_FWK_MODULE(PixelTrackProducerFromSoAAlpakaHIonPhase1); diff --git a/RecoTracker/PixelTrackFitting/python/PixelTracks_cff.py b/RecoTracker/PixelTrackFitting/python/PixelTracks_cff.py index 91eb380a33da9..046caa0b033f3 100644 --- a/RecoTracker/PixelTrackFitting/python/PixelTracks_cff.py +++ b/RecoTracker/PixelTrackFitting/python/PixelTracks_cff.py @@ -1,4 +1,5 @@ import FWCore.ParameterSet.Config as cms +from HeterogeneousCore.AlpakaCore.functions import * from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA from RecoLocalTracker.SiStripRecHitConverter.StripCPEfromTrackAngle_cfi import * @@ -203,3 +204,42 @@ (pixelNtupletFit & gpu & gpuValidationPixel).toModify(pixelTracksSoA.cpu, pixelRecHitSrc = "siPixelRecHitsPreSplittingSoA@cpu" ) + +###################################################################### + +### Alpaka Pixel Track Reco + +from Configuration.ProcessModifiers.alpaka_cff import alpaka + +# pixel tracks SoA producer on the device +from RecoTracker.PixelSeeding.caHitNtupletAlpakaPhase1_cfi import caHitNtupletAlpakaPhase1 as _pixelTracksAlpakaPhase1 +from RecoTracker.PixelSeeding.caHitNtupletAlpakaPhase2_cfi import caHitNtupletAlpakaPhase2 as _pixelTracksAlpakaPhase2 + +pixelTracksAlpaka = 
_pixelTracksAlpakaPhase1.clone() +phase2_tracker.toReplaceWith(pixelTracksAlpaka,_pixelTracksAlpakaPhase2.clone()) + +# pixel tracks SoA producer on the cpu, for validation +pixelTracksAlpakaSerial = makeSerialClone(pixelTracksAlpaka, + pixelRecHitSrc = 'siPixelRecHitsPreSplittingAlpakaSerial' +) + +# legacy pixel tracks from SoA +from RecoTracker.PixelTrackFitting.pixelTrackProducerFromSoAAlpakaPhase1_cfi import pixelTrackProducerFromSoAAlpakaPhase1 as _pixelTrackProducerFromSoAAlpakaPhase1 +from RecoTracker.PixelTrackFitting.pixelTrackProducerFromSoAAlpakaPhase2_cfi import pixelTrackProducerFromSoAAlpakaPhase2 as _pixelTrackProducerFromSoAAlpakaPhase2 + +(alpaka & ~phase2_tracker).toReplaceWith(pixelTracks, _pixelTrackProducerFromSoAAlpakaPhase1.clone( + pixelRecHitLegacySrc = "siPixelRecHitsPreSplitting", +)) + +(alpaka & phase2_tracker).toReplaceWith(pixelTracks, _pixelTrackProducerFromSoAAlpakaPhase2.clone( + pixelRecHitLegacySrc = "siPixelRecHitsPreSplitting", +)) + +alpaka.toReplaceWith(pixelTracksTask, cms.Task( + # Build the pixel ntuplets and the pixel tracks in SoA format with alpaka on the device + pixelTracksAlpaka, + # Build the pixel ntuplets and the pixel tracks in SoA format with alpaka on the cpu (if requested by the validation) + pixelTracksAlpakaSerial, + # Convert the pixel tracks from SoA to legacy format + pixelTracks) +) diff --git a/RecoTracker/PixelVertexFinding/BuildFile.xml b/RecoTracker/PixelVertexFinding/BuildFile.xml index 6171a7a94824a..aebe052016d0d 100644 --- a/RecoTracker/PixelVertexFinding/BuildFile.xml +++ b/RecoTracker/PixelVertexFinding/BuildFile.xml @@ -1,3 +1,5 @@ + + @@ -5,7 +7,6 @@ - diff --git a/RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h b/RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h new file mode 100644 index 0000000000000..0948d88ef3acf --- /dev/null +++ b/RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h @@ -0,0 +1,33 @@ +#ifndef 
RecoTracker_PixelVertexFinding_interface_PixelVertexWorkSpaceLayout_h +#define RecoTracker_PixelVertexFinding_interface_PixelVertexWorkSpaceLayout_h + +#include + +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +// Intermediate data used in the vertex reco algos +// For internal use only +namespace vertexFinder { + + GENERATE_SOA_LAYOUT(PixelVertexWSSoALayout, + SOA_COLUMN(uint16_t, itrk), // index of original track + SOA_COLUMN(float, zt), // input track z at bs + SOA_COLUMN(float, ezt2), // input error^2 on the above + SOA_COLUMN(float, ptt2), // input pt^2 on the above + SOA_COLUMN(uint8_t, izt), // interized z-position of input tracks + SOA_COLUMN(int32_t, iv), // vertex index for each associated track + SOA_SCALAR(uint32_t, ntrks), // number of "selected tracks" + SOA_SCALAR(uint32_t, nvIntermediate)) // the number of vertices after splitting pruning etc. + + using PixelVertexWorkSpaceSoALayout = PixelVertexWSSoALayout<>; + using PixelVertexWorkSpaceSoAView = PixelVertexWSSoALayout<>::View; + using PixelVertexWorkSpaceSoAConstView = PixelVertexWSSoALayout<>::ConstView; + + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void init(PixelVertexWorkSpaceSoAView& workspace_view) { + workspace_view.ntrks() = 0; + workspace_view.nvIntermediate() = 0; + } + +} // namespace vertexFinder + +#endif // RecoTracker_PixelVertexFinding_interface_PixelVertexWorkSpaceLayout_h diff --git a/RecoTracker/PixelVertexFinding/plugins/BuildFile.xml b/RecoTracker/PixelVertexFinding/plugins/BuildFile.xml index d330676889f26..2df520dffcf5b 100644 --- a/RecoTracker/PixelVertexFinding/plugins/BuildFile.xml +++ b/RecoTracker/PixelVertexFinding/plugins/BuildFile.xml @@ -1,5 +1,3 @@ - - @@ -17,18 +15,33 @@ - - + + + + + + + + + + + + + + + + + diff --git a/RecoTracker/PixelVertexFinding/plugins/PixelVertexProducerFromSoAAlpaka.cc b/RecoTracker/PixelVertexFinding/plugins/PixelVertexProducerFromSoAAlpaka.cc new file mode 100644 index 0000000000000..6e542f7870c2e --- /dev/null +++ 
b/RecoTracker/PixelVertexFinding/plugins/PixelVertexProducerFromSoAAlpaka.cc @@ -0,0 +1,175 @@ +#include "DataFormats/BeamSpot/interface/BeamSpot.h" +#include "DataFormats/Common/interface/OrphanHandle.h" +#include "DataFormats/TrackReco/interface/Track.h" +#include "DataFormats/TrackReco/interface/TrackExtra.h" +#include "DataFormats/TrackReco/interface/TrackFwd.h" +#include "DataFormats/VertexReco/interface/Vertex.h" +#include "DataFormats/VertexReco/interface/VertexFwd.h" +#include "DataFormats/VertexSoA/interface/ZVertexHost.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/PluginManager/interface/ModuleDef.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "Geometry/Records/interface/TrackerTopologyRcd.h" +#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" + +class PixelVertexProducerFromSoAAlpaka : public edm::global::EDProducer<> { +public: + using IndToEdm = std::vector; + + explicit PixelVertexProducerFromSoAAlpaka(const edm::ParameterSet &iConfig); + ~PixelVertexProducerFromSoAAlpaka() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); + +private: + void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override; + + edm::EDGetTokenT tokenVertex_; + edm::EDGetTokenT tokenBeamSpot_; + edm::EDGetTokenT tokenTracks_; + edm::EDGetTokenT tokenIndToEdm_; +}; + +PixelVertexProducerFromSoAAlpaka::PixelVertexProducerFromSoAAlpaka(const edm::ParameterSet &conf) + : tokenVertex_(consumes(conf.getParameter("src"))), + 
tokenBeamSpot_(consumes(conf.getParameter("beamSpot"))), + tokenTracks_(consumes(conf.getParameter("TrackCollection"))), + tokenIndToEdm_(consumes(conf.getParameter("TrackCollection"))) { + produces(); +} + +void PixelVertexProducerFromSoAAlpaka::fillDescriptions(edm::ConfigurationDescriptions &descriptions) { + edm::ParameterSetDescription desc; + + desc.add("TrackCollection", edm::InputTag("pixelTracks")); + desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); + desc.add("src", edm::InputTag("pixelVerticesAlpaka")); + + descriptions.add("pixelVertexFromSoAAlpaka", desc); +} + +void PixelVertexProducerFromSoAAlpaka::produce(edm::StreamID streamID, + edm::Event &iEvent, + const edm::EventSetup &) const { + auto vertexes = std::make_unique(); + + auto tracksHandle = iEvent.getHandle(tokenTracks_); + auto tracksSize = tracksHandle->size(); + auto const &indToEdm = iEvent.get(tokenIndToEdm_); + auto bsHandle = iEvent.getHandle(tokenBeamSpot_); + + float x0 = 0, y0 = 0, z0 = 0, dxdz = 0, dydz = 0; + std::vector itrk; + itrk.reserve(64); // avoid first relocations + if (!bsHandle.isValid()) { + edm::LogWarning("PixelVertexProducer") << "No beamspot found. returning vertexes with (0,0,Z) "; + } else { + const reco::BeamSpot &bs = *bsHandle; + x0 = bs.x0(); + y0 = bs.y0(); + z0 = bs.z0(); + dxdz = bs.dxdz(); + dydz = bs.dydz(); + } + + auto const &soa = iEvent.get(tokenVertex_); + + int nv = soa.view().nvFinal(); + +#ifdef PIXVERTEX_DEBUG_PRODUCE + std::cout << "converting " << nv << " vertices " + << " from " << indToEdm.size() << " tracks" << std::endl; +#endif // PIXVERTEX_DEBUG_PRODUCE + + std::set uind; // for verifing index consistency + for (int j = nv - 1; j >= 0; --j) { + auto i = soa.view()[j].sortInd(); // on gpu sorted in ascending order.... 
+ assert(i < nv); + uind.insert(i); + assert(itrk.empty()); + auto z = soa.view()[i].zv(); + auto x = x0 + dxdz * z; + auto y = y0 + dydz * z; + z += z0; + reco::Vertex::Error err; + err(2, 2) = 1.f / soa.view()[i].wv(); + err(2, 2) *= 2.; // artifically inflate error + //Copy also the tracks (no intention to be efficient....) + for (auto k = 0U; k < indToEdm.size(); ++k) { + if (soa.view()[k].idv() == int16_t(i)) + itrk.push_back(k); + } + auto nt = itrk.size(); + if (nt == 0) { +#ifdef PIXVERTEX_DEBUG_PRODUCE + std::cout << "vertex " << i << " with no tracks..." << std::endl; +#endif // PIXVERTEX_DEBUG_PRODUCE + continue; + } + if (nt < 2) { + itrk.clear(); + continue; + } // remove outliers + (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.view()[i].chi2(), soa.view()[i].ndof(), nt); + auto &v = (*vertexes).back(); + v.reserve(itrk.size()); + for (auto it : itrk) { + assert(it < int(indToEdm.size())); + auto k = indToEdm[it]; + if (k > tracksSize) { + edm::LogWarning("PixelVertexProducer") << "oops track " << it << " does not exists on CPU " << k; + continue; + } + auto tk = reco::TrackRef(tracksHandle, k); + v.add(tk); + } + itrk.clear(); + } + + LogDebug("PixelVertexProducer") << ": Found " << vertexes->size() << " vertexes\n"; + for (unsigned int i = 0; i < vertexes->size(); ++i) { + LogDebug("PixelVertexProducer") << "Vertex number " << i << " has " << (*vertexes)[i].tracksSize() + << " tracks with a position of " << (*vertexes)[i].z() << " +- " + << std::sqrt((*vertexes)[i].covariance(2, 2)); + } + + // legacy logic.... + if (vertexes->empty() && bsHandle.isValid()) { + const reco::BeamSpot &bs = *bsHandle; + + GlobalError bse(bs.rotatedCovariance3D()); + if ((bse.cxx() <= 0.) || (bse.cyy() <= 0.) || (bse.czz() <= 0.)) { + AlgebraicSymMatrix33 we; + we(0, 0) = 10000; + we(1, 1) = 10000; + we(2, 2) = 10000; + vertexes->push_back(reco::Vertex(bs.position(), we, 0., 0., 0)); + + edm::LogInfo("PixelVertexProducer") << "No vertices found. 
Beamspot with invalid errors " << bse.matrix() + << "\nWill put Vertex derived from dummy-fake BeamSpot into Event.\n" + << (*vertexes)[0].x() << "\n" + << (*vertexes)[0].y() << "\n" + << (*vertexes)[0].z() << "\n"; + } else { + vertexes->push_back(reco::Vertex(bs.position(), bs.rotatedCovariance3D(), 0., 0., 0)); + + edm::LogInfo("PixelVertexProducer") << "No vertices found. Will put Vertex derived from BeamSpot into Event:\n" + << (*vertexes)[0].x() << "\n" + << (*vertexes)[0].y() << "\n" + << (*vertexes)[0].z() << "\n"; + } + } else if (vertexes->empty() && !bsHandle.isValid()) { + edm::LogWarning("PixelVertexProducer") << "No beamspot and no vertex found. No vertex returned."; + } + + iEvent.put(std::move(vertexes)); +} + +DEFINE_FWK_MODULE(PixelVertexProducerFromSoAAlpaka); diff --git a/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoADevice.h b/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoADevice.h index 0c55cd97b070a..48848ff959554 100644 --- a/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoADevice.h +++ b/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoADevice.h @@ -8,7 +8,7 @@ template class PixelVertexWorkSpaceSoADevice : public cms::cuda::PortableDeviceCollection> { public: - PixelVertexWorkSpaceSoADevice() = default; + explicit PixelVertexWorkSpaceSoADevice() = default; // Constructor which specifies the SoA size and CUDA stream explicit PixelVertexWorkSpaceSoADevice(cudaStream_t stream) diff --git a/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHost.h b/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHost.h index 0e698933b0731..9df8cc4580a1f 100644 --- a/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHost.h +++ b/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHost.h @@ -9,6 +9,7 @@ template class PixelVertexWorkSpaceSoAHost : public cms::cuda::PortableHostCollection> { public: explicit PixelVertexWorkSpaceSoAHost() : 
PortableHostCollection>(S) {} + // Constructor which specifies the SoA size and CUDA stream explicit PixelVertexWorkSpaceSoAHost(cudaStream_t stream) : PortableHostCollection>(S, stream) {} diff --git a/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHostAlpaka.h b/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHostAlpaka.h new file mode 100644 index 0000000000000..33e163dbab784 --- /dev/null +++ b/RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHostAlpaka.h @@ -0,0 +1,15 @@ +#ifndef RecoTracker_PixelVertexFinding_plugins_PixelVertexWorkSpaceSoAHostAlpaka_h +#define RecoTracker_PixelVertexFinding_plugins_PixelVertexWorkSpaceSoAHostAlpaka_h + +#include + +#include "DataFormats/Portable/interface/PortableHostCollection.h" +#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" + +namespace vertexFinder { + + using PixelVertexWorkSpaceSoAHost = PortableHostCollection>; + +} // namespace vertexFinder + +#endif // RecoTracker_PixelVertexFinding_plugins_PixelVertexWorkSpaceSoAHostAlpaka_h diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexProducerAlpaka.cc b/RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexProducerAlpaka.cc new file mode 100644 index 0000000000000..d572a181ccf85 --- /dev/null +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexProducerAlpaka.cc @@ -0,0 +1,110 @@ +#include + +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Utilities/interface/StreamID.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EDPutToken.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/ESGetToken.h" +#include 
"HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/Event.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EventSetup.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/global/EDProducer.h" + +#include "DataFormats/TrackSoA/interface/alpaka/TracksSoACollection.h" +#include "DataFormats/TrackSoA/interface/TracksDevice.h" +#include "DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h" +#include "DataFormats/VertexSoA/interface/ZVertexDevice.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/MakerMacros.h" + +#include "vertexFinder.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + using namespace cms::alpakatools; + + template + class PixelVertexProducerAlpaka : public global::EDProducer<> { + using TkSoADevice = TracksSoACollection; + using Algo = vertexFinder::Producer; + + public: + explicit PixelVertexProducerAlpaka(const edm::ParameterSet& iConfig); + ~PixelVertexProducerAlpaka() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + private: + void produce(edm::StreamID streamID, device::Event& iEvent, const device::EventSetup& iSetup) const override; + + const Algo algo_; + + // Tracking cuts before sending tracks to vertex algo + const float ptMin_; + const float ptMax_; + + device::EDGetToken tokenDeviceTrack_; + device::EDPutToken tokenDeviceVertex_; + }; + + template + PixelVertexProducerAlpaka::PixelVertexProducerAlpaka(const edm::ParameterSet& conf) + : algo_(conf.getParameter("oneKernel"), + conf.getParameter("useDensity"), + conf.getParameter("useDBSCAN"), + conf.getParameter("useIterative"), + conf.getParameter("doSplitting"), + conf.getParameter("minT"), + conf.getParameter("eps"), + conf.getParameter("errmax"), + conf.getParameter("chi2max")), + ptMin_(conf.getParameter("PtMin")), // 0.5 GeV + ptMax_(conf.getParameter("PtMax")), // 75. 
Onsumes + tokenDeviceTrack_(consumes(conf.getParameter("pixelTrackSrc"))), + tokenDeviceVertex_(produces()) {} + + template + void PixelVertexProducerAlpaka::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + // Only one of these three algos can be used at once. + // Maybe this should become a Plugin Factory + desc.add("oneKernel", true); + desc.add("useDensity", true); + desc.add("useDBSCAN", false); + desc.add("useIterative", false); + desc.add("doSplitting", true); + + desc.add("minT", 2); // min number of neighbours to be "core" + desc.add("eps", 0.07); // max absolute distance to cluster + desc.add("errmax", 0.01); // max error to be "seed" + desc.add("chi2max", 9.); // max normalized distance to cluster + + desc.add("PtMin", 0.5); + desc.add("PtMax", 75.); + desc.add("pixelTrackSrc", edm::InputTag("pixelTracksAlpaka")); + + descriptions.addWithDefaultLabel(desc); + } + + template + void PixelVertexProducerAlpaka::produce(edm::StreamID streamID, + device::Event& iEvent, + const device::EventSetup& iSetup) const { + auto const& hTracks = iEvent.get(tokenDeviceTrack_); + + iEvent.emplace(tokenDeviceVertex_, algo_.makeAsync(iEvent.queue(), hTracks.view(), ptMin_, ptMax_)); + } + + using PixelVertexProducerAlpakaPhase1 = PixelVertexProducerAlpaka; + using PixelVertexProducerAlpakaPhase2 = PixelVertexProducerAlpaka; + using PixelVertexProducerAlpakaHIonPhase1 = PixelVertexProducerAlpaka; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +DEFINE_FWK_ALPAKA_MODULE(PixelVertexProducerAlpakaPhase1); +DEFINE_FWK_ALPAKA_MODULE(PixelVertexProducerAlpakaPhase2); +DEFINE_FWK_ALPAKA_MODULE(PixelVertexProducerAlpakaHIonPhase1); diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h new file mode 100644 index 0000000000000..d0ec816b32aee --- /dev/null +++ 
b/RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h @@ -0,0 +1,23 @@ +#ifndef RecoTracker_PixelVertexFinding_plugins_alpaka_PixelVertexWorkSpaceSoADeviceAlpaka_h +#define RecoTracker_PixelVertexFinding_plugins_alpaka_PixelVertexWorkSpaceSoADeviceAlpaka_h + +#include + +#include "DataFormats/Portable/interface/alpaka/PortableCollection.h" +#include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" +#include "RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHostAlpaka.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + namespace vertexFinder { + + using PixelVertexWorkSpaceSoADevice = PortableCollection<::vertexFinder::PixelVertexWSSoALayout<>>; + using PixelVertexWorkSpaceSoAHost = ::vertexFinder::PixelVertexWorkSpaceSoAHost; + + } // namespace vertexFinder + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#endif // RecoTracker_PixelVertexFinding_plugins_alpaka_PixelVertexWorkSpaceSoADeviceAlpaka_h diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h new file mode 100644 index 0000000000000..29cd537ac4aa7 --- /dev/null +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h @@ -0,0 +1,248 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_alpaka_clusterTracksByDensity_h +#define RecoPixelVertexing_PixelVertexFinding_alpaka_clusterTracksByDensity_h + +#include +#include +#include +#include +#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" +#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" +#include "vertexFinder.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace 
vertexFinder { + using VtxSoAView = ::reco::ZVertexSoAView; + using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; + // this algo does not really scale as it works in a single block... + // enough for <10K tracks we have + // + // based on Rodrighez&Laio algo + // + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void __attribute__((always_inline)) + clusterTracksByDensity(const TAcc& acc, + VtxSoAView& pdata, + WsSoAView& pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) { + using namespace vertexFinder; + constexpr bool verbose = false; // in principle the compiler should optmize out if false + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); + } + auto er2mx = errmax * errmax; + + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); + + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); + + ALPAKA_ASSERT_OFFLOAD(zt); + ALPAKA_ASSERT_OFFLOAD(ezt2); + ALPAKA_ASSERT_OFFLOAD(izt); + ALPAKA_ASSERT_OFFLOAD(nn); + ALPAKA_ASSERT_OFFLOAD(iv); + + using Hist = cms::alpakatools::HistoContainer; + auto& hist = alpaka::declareSharedVar(acc); + auto& hws = alpaka::declareSharedVar(acc); + + for (auto j : cms::alpakatools::elements_with_stride(acc, Hist::totbins())) { + hist.off[j] = 0; + } + alpaka::syncBlockThreads(acc); + + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("booked hist with %d bins, size %d for %d tracks\n", hist.totbins(), hist.capacity(), nt); + } + 
ALPAKA_ASSERT_OFFLOAD(static_cast(nt) <= hist.capacity()); + + // fill hist (bin shall be wider than "eps") + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + ALPAKA_ASSERT_OFFLOAD(i < ::zVertex::MAXTRACKS); + int iz = int(zt[i] * 10.); // valid if eps<=0.1 + // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only + iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); + izt[i] = iz - INT8_MIN; + ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN >= 0); + ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN < 256); + hist.count(acc, izt[i]); + iv[i] = i; + nn[i] = 0; + } + alpaka::syncBlockThreads(acc); + if (threadIdxLocal < 32) + hws[threadIdxLocal] = 0; // used by prefix scan... + alpaka::syncBlockThreads(acc); + hist.finalize(acc, hws); + alpaka::syncBlockThreads(acc); + ALPAKA_ASSERT_OFFLOAD(hist.size() == nt); + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + hist.fill(acc, izt[i], uint16_t(i)); + } + alpaka::syncBlockThreads(acc); + // count neighbours + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (ezt2[i] > er2mx) + continue; + auto loop = [&](uint32_t j) { + if (i == j) + return; + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + nn[i]++; + }; + + cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + } + alpaka::syncBlockThreads(acc); + + // find closest above me .... (we ignore the possibility of two j at same distance from i) + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + float mdist = eps; + auto loop = [&](uint32_t j) { + if (nn[j] < nn[i]) + return; + if (nn[j] == nn[i] && zt[j] >= zt[i]) + return; // if equal use natural order... + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; // (break natural order???) + mdist = dist; + iv[i] = j; // assign to cluster (better be unique??) 
+ }; + cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + } + alpaka::syncBlockThreads(acc); + +#ifdef GPU_DEBUG + // mini verification + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] != int(i)) + ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); + } + alpaka::syncBlockThreads(acc); +#endif + + // consolidate graph (percolate index of seed) + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + auto m = iv[i]; + while (m != iv[m]) + m = iv[m]; + iv[i] = m; + } + +#ifdef GPU_DEBUG + alpaka::syncBlockThreads(acc); + // mini verification + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] != int(i)) + ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); + } +#endif + +#ifdef GPU_DEBUG + // and verify that we did not spit any cluster... + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + auto minJ = i; + auto mdist = eps; + auto loop = [&](uint32_t j) { + if (nn[j] < nn[i]) + return; + if (nn[j] == nn[i] && zt[j] >= zt[i]) + return; // if equal use natural order... + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + mdist = dist; + minJ = j; + }; + cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + // should belong to the same cluster... + ALPAKA_ASSERT_OFFLOAD(iv[i] == iv[minJ]); + ALPAKA_ASSERT_OFFLOAD(nn[i] <= nn[iv[i]]); + } + alpaka::syncBlockThreads(acc); +#endif + + auto& foundClusters = alpaka::declareSharedVar(acc); + foundClusters = 0; + alpaka::syncBlockThreads(acc); + + // find the number of different clusters, identified by a tracks with clus[i] == i and density larger than threshold; + // mark these tracks with a negative id. 
+ for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] == int(i)) { + if (nn[i] >= minT) { + auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{}); + iv[i] = -(old + 1); + } else { // noise + iv[i] = -9998; + } + } + } + alpaka::syncBlockThreads(acc); + + ALPAKA_ASSERT_OFFLOAD(foundClusters < ::zVertex::MAXVTX); + + // propagate the negative id to all the tracks in the cluster. + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] >= 0) { + // mark each track in a cluster with the same id as the first one + iv[i] = iv[iv[i]]; + } + } + alpaka::syncBlockThreads(acc); + + // adjust the cluster id to be a positive value starting from 0 + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + iv[i] = -iv[i] - 1; + } + + nvIntermediate = nvFinal = foundClusters; + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("found %d proto vertices\n", foundClusters); + } + } + class ClusterTracksByDensityKernel { + public: + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + VtxSoAView pdata, + WsSoAView pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) const { + clusterTracksByDensity(acc, pdata, pws, minT, eps, errmax, chi2max); + } + }; + } // namespace vertexFinder +} // namespace ALPAKA_ACCELERATOR_NAMESPACE +#endif // RecoPixelVertexing_PixelVertexFinding_alpaka_clusterTracksByDensity_h diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h new file mode 100644 index 0000000000000..46ae2ad80ecc9 --- /dev/null +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h @@ -0,0 +1,255 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_gpuClusterTracksDBSCAN_h +#define 
RecoPixelVertexing_PixelVertexFinding_gpuClusterTracksDBSCAN_h + +#include +#include +#include +#include +#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" +#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" +#include "vertexFinder.h" +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace vertexFinder { + using VtxSoAView = ::reco::ZVertexSoAView; + using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; + // this algo does not really scale as it works in a single block... + // enough for <10K tracks we have + class ClusterTracksDBSCAN { + public: + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + VtxSoAView pdata, + WsSoAView pws, + int minT, // min number of neighbours to be "core" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) const { + constexpr bool verbose = false; // in principle the compiler should optmize out if false + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); + } + auto er2mx = errmax * errmax; + + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); + + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); + + ALPAKA_ASSERT_OFFLOAD(zt); + ALPAKA_ASSERT_OFFLOAD(iv); + ALPAKA_ASSERT_OFFLOAD(nn); + ALPAKA_ASSERT_OFFLOAD(ezt2); + + using Hist = cms::alpakatools::HistoContainer; + auto& hist = alpaka::declareSharedVar(acc); + auto& hws = 
alpaka::declareSharedVar(acc); + + for (auto j : cms::alpakatools::elements_with_stride(acc, Hist::totbins())) { + hist.off[j] = 0; + } + alpaka::syncBlockThreads(acc); + + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); + } + + ALPAKA_ASSERT_OFFLOAD(static_cast(nt) <= hist.capacity()); + + // fill hist (bin shall be wider than "eps") + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + ALPAKA_ASSERT_OFFLOAD(i < ::zVertex::MAXTRACKS); + int iz = int(zt[i] * 10.); // valid if eps<=0.1 + iz = std::clamp(iz, INT8_MIN, INT8_MAX); + izt[i] = iz - INT8_MIN; + ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN >= 0); + ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN < 256); + hist.count(acc, izt[i]); + iv[i] = i; + nn[i] = 0; + } + alpaka::syncBlockThreads(acc); + if (threadIdxLocal < 32) + hws[threadIdxLocal] = 0; // used by prefix scan... + alpaka::syncBlockThreads(acc); + hist.finalize(acc, hws); + alpaka::syncBlockThreads(acc); + ALPAKA_ASSERT_OFFLOAD(hist.size() == nt); + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + hist.fill(acc, izt[i], uint32_t(i)); + } + alpaka::syncBlockThreads(acc); + + // count neighbours + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (ezt2[i] > er2mx) + continue; + auto loop = [&](uint32_t j) { + if (i == j) + return; + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; + nn[i]++; + }; + + cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + } + + alpaka::syncBlockThreads(acc); + + // find NN with smaller z... 
+ for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (nn[i] < minT) + continue; // DBSCAN core rule + float mz = zt[i]; + auto loop = [&](uint32_t j) { + if (zt[j] >= mz) + return; + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; + mz = zt[j]; + iv[i] = j; // assign to cluster (better be unique??) + }; + cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + } + + alpaka::syncBlockThreads(acc); + +#ifdef GPU_DEBUG + // mini verification + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] != int(i)) + ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); + } + alpaka::syncBlockThreads(acc); +#endif + + // consolidate graph (percolate index of seed) + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + auto m = iv[i]; + while (m != iv[m]) + m = iv[m]; + iv[i] = m; + } + + alpaka::syncBlockThreads(acc); + +#ifdef GPU_DEBUG + // mini verification + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] != int(i)) + ALPAKA_ASSERT_OFFLOAD(iv[iv[i]] != int(i)); + } + alpaka::syncBlockThreads(acc); +#endif + +#ifdef GPU_DEBUG + // and verify that we did not spit any cluster... + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (nn[i] < minT) + continue; // DBSCAN core rule + ALPAKA_ASSERT_OFFLOAD(zt[iv[i]] <= zt[i]); + auto loop = [&](uint32_t j) { + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; + // they should belong to the same cluster, isn't it? 
+ if (iv[i] != iv[j]) { + printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]); + printf(" %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]); + ; + } + ALPAKA_ASSERT_OFFLOAD(iv[i] == iv[j]); + }; + cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + } + alpaka::syncBlockThreads(acc); +#endif + + // collect edges (assign to closest cluster of closest point??? here to closest point) + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule + if (nn[i] >= minT) + continue; // DBSCAN edge rule + float mdist = eps; + auto loop = [&](uint32_t j) { + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; // needed? + mdist = dist; + iv[i] = iv[j]; // assign to cluster (better be unique??) + }; + cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + } + + auto& foundClusters = alpaka::declareSharedVar(acc); + foundClusters = 0; + alpaka::syncBlockThreads(acc); + + // find the number of different clusters, identified by a tracks with clus[i] == i; + // mark these tracks with a negative id. + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] == int(i)) { + if (nn[i] >= minT) { + auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{}); + iv[i] = -(old + 1); + } else { // noise + iv[i] = -9998; + } + } + } + alpaka::syncBlockThreads(acc); + + ALPAKA_ASSERT_OFFLOAD(foundClusters < ::zVertex::MAXVTX); + + // propagate the negative id to all the tracks in the cluster. 
+ for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] >= 0) { + // mark each track in a cluster with the same id as the first one + iv[i] = iv[iv[i]]; + } + } + alpaka::syncBlockThreads(acc); + + // adjust the cluster id to be a positive value starting from 0 + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + iv[i] = -iv[i] - 1; + } + + nvIntermediate = nvFinal = foundClusters; + + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("found %d proto vertices\n", foundClusters); + } + } + }; + } // namespace vertexFinder +} // namespace ALPAKA_ACCELERATOR_NAMESPACE +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuClusterTracksDBSCAN_h diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h new file mode 100644 index 0000000000000..3fe0202121f80 --- /dev/null +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h @@ -0,0 +1,230 @@ +#ifndef RecoTracker_PixelVertexFinding_clusterTracksIterativeAlpaka_h +#define RecoTracker_PixelVertexFinding_clusterTracksIterativeAlpaka_h + +#include +#include +#include +#include + +#include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h" +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" +#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" +#include "vertexFinder.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace vertexFinder { + + // this algo does not really scale as it works in a single block... 
+ // enough for <10K tracks we have + class ClusterTracksIterative { + public: + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + VtxSoAView pdata, + WsSoAView pws, + int minT, // min number of neighbours to be "core" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) const { + constexpr bool verbose = false; // in principle the compiler should optmize out if false + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); + } + auto er2mx = errmax * errmax; + + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); + + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); + + ALPAKA_ASSERT_OFFLOAD(zt); + ALPAKA_ASSERT_OFFLOAD(nn); + ALPAKA_ASSERT_OFFLOAD(iv); + ALPAKA_ASSERT_OFFLOAD(ezt2); + + using Hist = cms::alpakatools::HistoContainer; + auto& hist = alpaka::declareSharedVar(acc); + auto& hws = alpaka::declareSharedVar(acc); + + for (auto j : cms::alpakatools::elements_with_stride(acc, Hist::totbins())) { + hist.off[j] = 0; + } + alpaka::syncBlockThreads(acc); + + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); + } + + ALPAKA_ASSERT_OFFLOAD(static_cast(nt) <= hist.capacity()); + + // fill hist (bin shall be wider than "eps") + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + ALPAKA_ASSERT_OFFLOAD(i < ::zVertex::MAXTRACKS); + int iz = int(zt[i] * 10.); // valid if eps<=0.1 + iz = std::clamp(iz, INT8_MIN, INT8_MAX); + 
izt[i] = iz - INT8_MIN; + ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN >= 0); + ALPAKA_ASSERT_OFFLOAD(iz - INT8_MIN < 256); + hist.count(acc, izt[i]); + iv[i] = i; + nn[i] = 0; + } + alpaka::syncBlockThreads(acc); + + if (threadIdxLocal < 32) + hws[threadIdxLocal] = 0; // used by prefix scan... + alpaka::syncBlockThreads(acc); + + hist.finalize(acc, hws); + alpaka::syncBlockThreads(acc); + + ALPAKA_ASSERT_OFFLOAD(hist.size() == nt); + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + hist.fill(acc, izt[i], uint16_t(i)); + } + alpaka::syncBlockThreads(acc); + + // count neighbours + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (ezt2[i] > er2mx) + continue; + auto loop = [&](uint32_t j) { + if (i == j) + return; + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + nn[i]++; + }; + + cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + } + + auto& nloops = alpaka::declareSharedVar(acc); + nloops = 0; + + alpaka::syncBlockThreads(acc); + + // cluster seeds only + bool more = true; + while (alpaka::syncBlockThreadsPredicate(acc, more)) { + if (1 == nloops % 2) { + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + auto m = iv[i]; + while (m != iv[m]) + m = iv[m]; + iv[i] = m; + } + } else { + more = false; + for (auto k : cms::alpakatools::elements_with_stride(acc, hist.size())) { + auto p = hist.begin() + k; + auto i = (*p); + auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1)); + if (nn[i] < minT) + continue; // DBSCAN core rule + auto loop = [&](uint32_t j) { + ALPAKA_ASSERT_OFFLOAD(i != j); + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + auto old = alpaka::atomicMin(acc, &iv[j], iv[i], alpaka::hierarchy::Blocks{}); + if (old != iv[i]) { + // end the loop only if no changes were applied + more = 
true; + } + alpaka::atomicMin(acc, &iv[i], old, alpaka::hierarchy::Blocks{}); + }; + ++p; + for (; p < hist.end(be); ++p) + loop(*p); + } // for i + } + if (threadIdxLocal == 0) + ++nloops; + } // while + + // collect edges (assign to closest cluster of closest point??? here to closest point) + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule + if (nn[i] >= minT) + continue; // DBSCAN edge rule + float mdist = eps; + auto loop = [&](int j) { + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; // needed? + mdist = dist; + iv[i] = iv[j]; // assign to cluster (better be unique??) + }; + cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + } + + auto& foundClusters = alpaka::declareSharedVar(acc); + foundClusters = 0; + alpaka::syncBlockThreads(acc); + + // find the number of different clusters, identified by a tracks with clus[i] == i; + // mark these tracks with a negative id. + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] == int(i)) { + if (nn[i] >= minT) { + auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{}); + iv[i] = -(old + 1); + } else { // noise + iv[i] = -9998; + } + } + } + alpaka::syncBlockThreads(acc); + + ALPAKA_ASSERT_OFFLOAD(foundClusters < ::zVertex::MAXVTX); + + // propagate the negative id to all the tracks in the cluster. 
+ for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] >= 0) { + // mark each track in a cluster with the same id as the first one + iv[i] = iv[iv[i]]; + } + } + alpaka::syncBlockThreads(acc); + + // adjust the cluster id to be a positive value starting from 0 + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + iv[i] = -iv[i] - 1; + } + + nvIntermediate = nvFinal = foundClusters; + + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("found %d proto vertices\n", foundClusters); + } + } + }; + } // namespace vertexFinder +} // namespace ALPAKA_ACCELERATOR_NAMESPACE +#endif // RecoTracker_PixelVertexFinding_plugins_clusterTracksIterativeAlpaka_h diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h new file mode 100644 index 0000000000000..9ff4656b9718e --- /dev/null +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h @@ -0,0 +1,123 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_gpuFitVertices_h +#define RecoPixelVertexing_PixelVertexFinding_gpuFitVertices_h + +#include +#include +#include +#include +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" + +#include "vertexFinder.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace vertexFinder { + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void fitVertices( + const TAcc& acc, + VtxSoAView& pdata, + WsSoAView& pws, + float chi2Max // for outlier rejection + ) { + constexpr bool verbose = false; // in principle the compiler should optmize out if false + + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + float* __restrict__ zv = data.zv(); + float* __restrict__ wv = data.wv(); + float* __restrict__ chi2 = 
data.chi2(); + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); + + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); + + ALPAKA_ASSERT_OFFLOAD(nvFinal <= nvIntermediate); + nvFinal = nvIntermediate; + auto foundClusters = nvFinal; + + // zero + for (auto i : cms::alpakatools::elements_with_stride(acc, foundClusters)) { + zv[i] = 0; + wv[i] = 0; + chi2[i] = 0; + } + + // only for test + auto& noise = alpaka::declareSharedVar(acc); + + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + noise = 0; + } + alpaka::syncBlockThreads(acc); + + // compute cluster location + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] > 9990) { + if constexpr (verbose) + alpaka::atomicAdd(acc, &noise, 1, alpaka::hierarchy::Threads{}); + continue; + } + ALPAKA_ASSERT_OFFLOAD(iv[i] >= 0); + ALPAKA_ASSERT_OFFLOAD(iv[i] < int(foundClusters)); + auto w = 1.f / ezt2[i]; + alpaka::atomicAdd(acc, &zv[iv[i]], zt[i] * w, alpaka::hierarchy::Threads{}); + alpaka::atomicAdd(acc, &wv[iv[i]], w, alpaka::hierarchy::Threads{}); + } + + alpaka::syncBlockThreads(acc); + // reuse nn + for (auto i : cms::alpakatools::elements_with_stride(acc, foundClusters)) { + ALPAKA_ASSERT_OFFLOAD(wv[i] > 0.f); + zv[i] /= wv[i]; + nn[i] = -1; // ndof + } + alpaka::syncBlockThreads(acc); + + // compute chi2 + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] > 9990) + continue; + + auto c2 = zv[iv[i]] - zt[i]; + c2 *= c2 / ezt2[i]; + if (c2 > chi2Max) { + iv[i] = 9999; + continue; + } + alpaka::atomicAdd(acc, &chi2[iv[i]], c2, alpaka::hierarchy::Blocks{}); + alpaka::atomicAdd(acc, &nn[iv[i]], 1, alpaka::hierarchy::Blocks{}); + } + alpaka::syncBlockThreads(acc); + + for (auto i : cms::alpakatools::elements_with_stride(acc, foundClusters)) { + if (nn[i] > 0) { + wv[i] *= float(nn[i]) / chi2[i]; + } + } + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) { + 
printf("found %d proto clusters ", foundClusters); + printf("and %d noise\n", noise); + } + } + } + + class FitVerticesKernel { + public: + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + VtxSoAView pdata, + WsSoAView pws, + float chi2Max // for outlier rejection + ) const { + fitVertices(acc, pdata, pws, chi2Max); + } + }; + } // namespace vertexFinder +} // namespace ALPAKA_ACCELERATOR_NAMESPACE +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_gpuFitVertices_h diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h new file mode 100644 index 0000000000000..2c6f0cb0597e4 --- /dev/null +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h @@ -0,0 +1,80 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_sortByPt2_h +#define RecoPixelVertexing_PixelVertexFinding_sortByPt2_h + +#include +#include +#include +#include +#include +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" +#include "HeterogeneousCore/AlpakaInterface/interface/radixSort.h" +#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" +#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" + +#include "vertexFinder.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace vertexFinder { + using VtxSoAView = ::reco::ZVertexSoAView; + using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE void sortByPt2(const TAcc& acc, VtxSoAView& data, WsSoAView& ws) { + auto nt = ws.ntrks(); + float const* __restrict__ ptt2 = ws.ptt2(); + uint32_t const& nvFinal = data.nvFinal(); + + int32_t const* __restrict__ iv = ws.iv(); + float* __restrict__ ptv2 = data.ptv2(); + uint16_t* __restrict__ sortInd = data.sortInd(); + + if (nvFinal < 1) + return; + + // fill indexing + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + 
data.idv()[ws.itrk()[i]] = iv[i]; + }; + + // can be done asynchronously at the end of previous event + for (auto i : cms::alpakatools::elements_with_stride(acc, nvFinal)) { + ptv2[i] = 0; + }; + alpaka::syncBlockThreads(acc); + + for (auto i : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[i] <= 9990) { + alpaka::atomicAdd(acc, &ptv2[iv[i]], ptt2[i], alpaka::hierarchy::Blocks{}); + } + }; + alpaka::syncBlockThreads(acc); + + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + if (1 == nvFinal) { + if (threadIdxLocal == 0) + sortInd[0] = 0; + return; + } + + if constexpr (not cms::alpakatools::requires_single_thread_per_block_v) { + auto& sws = alpaka::declareSharedVar(acc); + // sort using only 16 bits + cms::alpakatools::radixSort(acc, ptv2, sortInd, sws, nvFinal); + } else { + for (uint16_t i = 0; i < nvFinal; ++i) + sortInd[i] = i; + std::sort(sortInd, sortInd + nvFinal, [&](auto i, auto j) { return ptv2[i] < ptv2[j]; }); + } + } + + class SortByPt2Kernel { + public: + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, WsSoAView pws) const { + sortByPt2(acc, pdata, pws); + } + }; + } // namespace vertexFinder +} // namespace ALPAKA_ACCELERATOR_NAMESPACE +#endif // RecoPixelVertexing_PixelVertexFinding_sortByPt2_h diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h new file mode 100644 index 0000000000000..7d31a48a0f6f3 --- /dev/null +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h @@ -0,0 +1,162 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_splitVertices_h +#define RecoPixelVertexing_PixelVertexFinding_splitVertices_h + +#include +#include +#include +#include +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" + +#include "vertexFinder.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace vertexFinder { + using 
VtxSoAView = ::reco::ZVertexSoAView; + using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void splitVertices(const TAcc& acc, + VtxSoAView& pdata, + WsSoAView& pws, + float maxChi2) { + constexpr bool verbose = false; // in principle the compiler should optmize out if false + const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + float* __restrict__ zv = data.zv(); + float* __restrict__ wv = data.wv(); + float const* __restrict__ chi2 = data.chi2(); + uint32_t& nvFinal = data.nvFinal(); + + int32_t const* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); + + ALPAKA_ASSERT_OFFLOAD(zt); + ALPAKA_ASSERT_OFFLOAD(wv); + ALPAKA_ASSERT_OFFLOAD(chi2); + ALPAKA_ASSERT_OFFLOAD(nn); + + constexpr uint32_t MAXTK = 512; + + auto& it = alpaka::declareSharedVar(acc); // track index + auto& zz = alpaka::declareSharedVar(acc); // z pos + auto& newV = alpaka::declareSharedVar(acc); // 0 or 1 + auto& ww = alpaka::declareSharedVar(acc); // z weight + auto& nq = alpaka::declareSharedVar(acc); // number of track for this vertex + + const uint32_t blockIdx(alpaka::getIdx(acc)[0u]); + const uint32_t gridDimension(alpaka::getWorkDiv(acc)[0u]); + + // one vertex per block + for (auto kv = blockIdx; kv < nvFinal; kv += gridDimension) { + if (nn[kv] < 4) + continue; + if (chi2[kv] < maxChi2 * float(nn[kv])) + continue; + + ALPAKA_ASSERT_OFFLOAD(nn[kv] < int32_t(MAXTK)); + + if ((uint32_t)nn[kv] >= MAXTK) + continue; // too bad FIXME + + nq = 0u; + alpaka::syncBlockThreads(acc); + + // copy to local + for (auto k : cms::alpakatools::elements_with_stride(acc, nt)) { + if (iv[k] == int(kv)) { + auto old = alpaka::atomicInc(acc, &nq, MAXTK, alpaka::hierarchy::Threads{}); + zz[old] = zt[k] - zv[kv]; + newV[old] = 
zz[old] < 0 ? 0 : 1; + ww[old] = 1.f / ezt2[k]; + it[old] = k; + } + } + + // the new vertices + auto& znew = alpaka::declareSharedVar(acc); + auto& wnew = alpaka::declareSharedVar(acc); + alpaka::syncBlockThreads(acc); + + ALPAKA_ASSERT_OFFLOAD(int(nq) == nn[kv] + 1); + + int maxiter = 20; + // kt-min.... + bool more = true; + while (alpaka::syncBlockThreadsPredicate(acc, more)) { + more = false; + if (0 == threadIdxLocal) { + znew[0] = 0; + znew[1] = 0; + wnew[0] = 0; + wnew[1] = 0; + } + alpaka::syncBlockThreads(acc); + + for (auto k : cms::alpakatools::elements_with_stride(acc, nq)) { + auto i = newV[k]; + alpaka::atomicAdd(acc, &znew[i], zz[k] * ww[k], alpaka::hierarchy::Threads{}); + alpaka::atomicAdd(acc, &wnew[i], ww[k], alpaka::hierarchy::Threads{}); + } + alpaka::syncBlockThreads(acc); + + if (0 == threadIdxLocal) { + znew[0] /= wnew[0]; + znew[1] /= wnew[1]; + } + alpaka::syncBlockThreads(acc); + + for (auto k : cms::alpakatools::elements_with_stride(acc, nq)) { + auto d0 = fabs(zz[k] - znew[0]); + auto d1 = fabs(zz[k] - znew[1]); + auto newer = d0 < d1 ? 
0 : 1; + more |= newer != newV[k]; + newV[k] = newer; + } + --maxiter; + if (maxiter <= 0) + more = false; + } + + // avoid empty vertices + if (0 == wnew[0] || 0 == wnew[1]) + continue; + + // quality cut + auto dist2 = (znew[0] - znew[1]) * (znew[0] - znew[1]); + + auto chi2Dist = dist2 / (1.f / wnew[0] + 1.f / wnew[1]); + + if (verbose && 0 == threadIdxLocal) + printf("inter %d %f %f\n", 20 - maxiter, chi2Dist, dist2 * wv[kv]); + + if (chi2Dist < 4) + continue; + + // get a new global vertex + auto& igv = alpaka::declareSharedVar(acc); + if (0 == threadIdxLocal) + igv = alpaka::atomicAdd(acc, &ws.nvIntermediate(), 1u, alpaka::hierarchy::Blocks{}); + alpaka::syncBlockThreads(acc); + for (auto k : cms::alpakatools::elements_with_stride(acc, nq)) { + if (1 == newV[k]) + iv[it[k]] = igv; + } + + } // loop on vertices + } + + class SplitVerticesKernel { + public: + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, WsSoAView pws, float maxChi2) const { + splitVertices(acc, pdata, pws, maxChi2); + } + }; + } // namespace vertexFinder +} // namespace ALPAKA_ACCELERATOR_NAMESPACE +#endif // RecoPixelVertexing_PixelVertexFinding_plugins_splitVertices.h diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc new file mode 100644 index 0000000000000..83bc8f0d84ec2 --- /dev/null +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc @@ -0,0 +1,208 @@ +#include +#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" + +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/traits.h" +#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" +#include "RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h" + +#include "vertexFinder.h" +#include "vertexFinder.h" +#include "clusterTracksDBSCAN.h" +#include 
"clusterTracksIterative.h" +#include "clusterTracksByDensity.h" +#include "fitVertices.h" +#include "sortByPt2.h" +#include "splitVertices.h" + +#undef PIXVERTEX_DEBUG_PRODUCE +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace vertexFinder { + using namespace cms::alpakatools; + // reject outlier tracks that contribute more than this to the chi2 of the vertex fit + constexpr float maxChi2ForFirstFit = 50.f; + constexpr float maxChi2ForFinalFit = 5000.f; + + // split vertices with a chi2/NDoF greater than this + constexpr float maxChi2ForSplit = 9.f; + + template + class LoadTracks { + public: + template >> + ALPAKA_FN_ACC void operator()(const TAcc& acc, + reco::TrackSoAConstView tracks_view, + VtxSoAView soa, + WsSoAView pws, + float ptMin, + float ptMax) const { + auto const* quality = tracks_view.quality(); + using helper = TracksUtilities; + + for (auto idx : cms::alpakatools::elements_with_stride(acc, tracks_view.nTracks())) { + [[maybe_unused]] auto nHits = helper::nHits(tracks_view, idx); + ALPAKA_ASSERT_OFFLOAD(nHits >= 3); + + // initialize soa... 
+ soa[idx].idv() = -1; + + if (helper::isTriplet(tracks_view, idx)) + continue; // no triplets + if (quality[idx] < ::pixelTrack::Quality::highPurity) + continue; + + auto pt = tracks_view[idx].pt(); + + if (pt < ptMin) + continue; + + // clamp pt + pt = std::min(pt, ptMax); + + auto& data = pws; + auto it = alpaka::atomicAdd(acc, &data.ntrks(), 1u, alpaka::hierarchy::Blocks{}); + data[it].itrk() = idx; + data[it].zt() = helper::zip(tracks_view, idx); + data[it].ezt2() = tracks_view[idx].covariance()(14); + data[it].ptt2() = pt * pt; + } + } + }; +// #define THREE_KERNELS +#ifndef THREE_KERNELS + class VertexFinderOneKernel { + public: + template >> + ALPAKA_FN_ACC void operator()(const TAcc& acc, + VtxSoAView pdata, + WsSoAView pws, + bool doSplit, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, + ) const { + clusterTracksByDensity(acc, pdata, pws, minT, eps, errmax, chi2max); + alpaka::syncBlockThreads(acc); + fitVertices(acc, pdata, pws, maxChi2ForFirstFit); + alpaka::syncBlockThreads(acc); + if (doSplit) { + splitVertices(acc, pdata, pws, maxChi2ForSplit); + alpaka::syncBlockThreads(acc); + fitVertices(acc, pdata, pws, maxChi2ForFinalFit); + alpaka::syncBlockThreads(acc); + } + sortByPt2(acc, pdata, pws); + } + }; +#else + class VertexFinderKernel1 { + public: + template >> + ALPAKA_FN_ACC void operator()(const TAcc& acc, + VtxSoAView pdata, + WsSoAView pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, + ) const { + clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + alpaka::syncBlockThreads(acc); + fitVertices(pdata, pws, maxChi2ForFirstFit); + } + }; + class VertexFinderKernel2 { + public: + template >> + ALPAKA_FN_ACC void operator()(const TAcc& 
acc, VtxSoAView pdata, WsSoAView pws) const { + fitVertices(pdata, pws, maxChi2ForFinalFit); + alpaka::syncBlockThreads(acc); + sortByPt2(pdata, pws); + } + }; +#endif + + template + ZVertexSoACollection Producer::makeAsync(Queue& queue, + const reco::TrackSoAConstView& tracks_view, + float ptMin, + float ptMax) const { +#ifdef PIXVERTEX_DEBUG_PRODUCE + std::cout << "producing Vertices on GPU" << std::endl; +#endif // PIXVERTEX_DEBUG_PRODUCE + ZVertexSoACollection vertices(queue); + + auto soa = vertices.view(); + + auto ws_d = PixelVertexWorkSpaceSoADevice(::zVertex::MAXTRACKS, queue); + + // Initialize + const auto initWorkDiv = cms::alpakatools::make_workdiv(1, 1); + alpaka::exec(queue, initWorkDiv, Init{}, soa, ws_d.view()); + + // Load Tracks + const uint32_t blockSize = 128; + const uint32_t numberOfBlocks = + cms::alpakatools::divide_up_by(tracks_view.metadata().size() + blockSize - 1, blockSize); + const auto loadTracksWorkDiv = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); + alpaka::exec( + queue, loadTracksWorkDiv, LoadTracks{}, tracks_view, soa, ws_d.view(), ptMin, ptMax); + + // Running too many thread lead to problems when printf is enabled. + const auto finderSorterWorkDiv = cms::alpakatools::make_workdiv(1, 1024 - 128); + const auto splitterFitterWorkDiv = cms::alpakatools::make_workdiv(1024, 128); + + if (oneKernel_) { + // implemented only for density clustesrs +#ifndef THREE_KERNELS + alpaka::exec(queue, + finderSorterWorkDiv, + VertexFinderOneKernel{}, + soa, + ws_d.view(), + doSplitting_, + minT, + eps, + errmax, + chi2max); +#else + alpaka::exec( + queue, finderSorterWorkDiv, VertexFinderOneKernel{}, soa, ws_d.view(), minT, eps, errmax, chi2max); + + // one block per vertex... 
+ if (doSplitting_) + alpaka::exec(queue, splitterFitterWorkDiv, SplitVerticesKernel{}, soa, ws_d.view(), maxChi2ForSplit); + alpaka::exec(queue, finderSorterWorkDiv{}, soa, ws_d.view()); +#endif + } else { // five kernels + if (useDensity_) { + alpaka::exec( + queue, finderSorterWorkDiv, ClusterTracksByDensityKernel{}, soa, ws_d.view(), minT, eps, errmax, chi2max); + + } else if (useDBSCAN_) { + alpaka::exec( + queue, finderSorterWorkDiv, ClusterTracksDBSCAN{}, soa, ws_d.view(), minT, eps, errmax, chi2max); + } else if (useIterative_) { + alpaka::exec( + queue, finderSorterWorkDiv, ClusterTracksIterative{}, soa, ws_d.view(), minT, eps, errmax, chi2max); + } + alpaka::exec(queue, finderSorterWorkDiv, FitVerticesKernel{}, soa, ws_d.view(), maxChi2ForFirstFit); + + // one block per vertex... + if (doSplitting_) { + alpaka::exec(queue, splitterFitterWorkDiv, SplitVerticesKernel{}, soa, ws_d.view(), maxChi2ForSplit); + + alpaka::exec(queue, finderSorterWorkDiv, FitVerticesKernel{}, soa, ws_d.view(), maxChi2ForFinalFit); + } + alpaka::exec(queue, finderSorterWorkDiv, SortByPt2Kernel{}, soa, ws_d.view()); + } + + return vertices; + } + + template class Producer; + template class Producer; + template class Producer; + } // namespace vertexFinder +} // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.h b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.h new file mode 100644 index 0000000000000..23e5db1e706c4 --- /dev/null +++ b/RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.h @@ -0,0 +1,76 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_vertexFinder_h +#define RecoPixelVertexing_PixelVertexFinding_vertexFinder_h + +#include +#include +#include +#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" +#include "DataFormats/VertexSoA/interface/ZVertexHost.h" +#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" +#include 
"DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h" +#include "DataFormats/VertexSoA/interface/ZVertexDevice.h" + +#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" +#include "RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace vertexFinder { + using namespace cms::alpakatools; + using VtxSoAView = ::reco::ZVertexSoAView; + using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; + + class Init { + public: + template >> + ALPAKA_FN_ACC void operator()(const TAcc &acc, VtxSoAView pdata, WsSoAView pws) const { + pdata.nvFinal() = 0; // initialization + ::vertexFinder::init(pws); + } + }; + + template + class Producer { + using TkSoAConstView = reco::TrackSoAConstView; + + public: + Producer(bool oneKernel, + bool useDensity, + bool useDBSCAN, + bool useIterative, + bool doSplitting, + int iminT, // min number of neighbours to be "core" + float ieps, // max absolute distance to cluster + float ierrmax, // max error to be "seed" + float ichi2max // max normalized distance to cluster + ) + : oneKernel_(oneKernel && !(useDBSCAN || useIterative)), + useDensity_(useDensity), + useDBSCAN_(useDBSCAN), + useIterative_(useIterative), + doSplitting_(doSplitting), + minT(iminT), + eps(ieps), + errmax(ierrmax), + chi2max(ichi2max) {} + + ~Producer() = default; + + ZVertexSoACollection makeAsync(Queue &queue, const TkSoAConstView &tracks_view, float ptMin, float ptMax) const; + + private: + const bool oneKernel_; // run everything (cluster,fit,split,sort) in one kernel. 
Uses only density clusterizer + const bool useDensity_; // use density clusterizer + const bool useDBSCAN_; // use DBScan clusterizer + const bool useIterative_; // use iterative clusterizer + const bool doSplitting_; //run vertex splitting + + int minT; // min number of neighbours to be "core" + float eps; // max absolute distance to cluster + float errmax; // max error to be "seed" + float chi2max; // max normalized distance to cluster + }; + + } // namespace vertexFinder +} // namespace ALPAKA_ACCELERATOR_NAMESPACE +#endif diff --git a/RecoTracker/PixelVertexFinding/test/BuildFile.xml b/RecoTracker/PixelVertexFinding/test/BuildFile.xml index 9343f00f9a027..d5d0142eca659 100644 --- a/RecoTracker/PixelVertexFinding/test/BuildFile.xml +++ b/RecoTracker/PixelVertexFinding/test/BuildFile.xml @@ -10,29 +10,31 @@ - - - - - - - - - - + + + + + - - - - - + + + + + + + + + + + + + + + + + - - - - - @@ -42,3 +44,31 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.cc b/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.cc new file mode 100644 index 0000000000000..c3a74676956f8 --- /dev/null +++ b/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.cc @@ -0,0 +1,33 @@ +#include +#include "HeterogeneousCore/AlpakaInterface/interface/devices.h" +#include "HeterogeneousCore/AlpakaInterface/interface/host.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" + +#include "DataFormats/VertexSoA/interface/ZVertexHost.h" +#include "DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h" +#include "DataFormats/VertexSoA/interface/ZVertexDevice.h" + +#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" +#include "RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHostAlpaka.h" +#include 
"RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h" + +using namespace std; +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + namespace vertexfinder_t { + void runKernels(Queue& queue); + } + +}; // namespace ALPAKA_ACCELERATOR_NAMESPACE + +int main() { + const auto host = cms::alpakatools::host(); + const auto device = cms::alpakatools::devices()[0]; + Queue queue(device); + + vertexfinder_t::runKernels(queue); + return 0; +} diff --git a/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc b/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc new file mode 100644 index 0000000000000..e92d586dc1833 --- /dev/null +++ b/RecoTracker/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc @@ -0,0 +1,282 @@ +#include +#include +#include +#include +#include +#include +#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +// TrackUtilities only included in order to compile SoALayout with Eigen columns +#include "DataFormats/TrackSoA/interface/alpaka/TrackUtilities.h" +#ifdef USE_DBSCAN +#include "RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h" +#define CLUSTERIZE ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder::ClusterTracksDBSCAN +#elif USE_ITERATIVE +#include "RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h" +#define CLUSTERIZE ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder::ClusterTracksIterative +#else +#include "RecoTracker/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h" +#define CLUSTERIZE ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder::ClusterTracksByDensityKernel +#endif + +#include "RecoTracker/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" +#include "RecoTracker/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHostAlpaka.h" +#include "RecoTracker/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h" + 
+#include "RecoTracker/PixelVertexFinding/plugins/alpaka/fitVertices.h" +#include "RecoTracker/PixelVertexFinding/plugins/alpaka/sortByPt2.h" +#include "RecoTracker/PixelVertexFinding/plugins/alpaka/splitVertices.h" +#include "RecoTracker/PixelVertexFinding/plugins/alpaka/vertexFinder.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + using namespace cms::alpakatools; + + struct ClusterGenerator { + explicit ClusterGenerator(float nvert, float ntrack) + : rgen(-13., 13), errgen(0.005, 0.025), clusGen(nvert), trackGen(ntrack), gauss(0., 1.), ptGen(1.) {} + + void operator()(vertexFinder::PixelVertexWorkSpaceSoAHost& pwsh, ZVertexHost& vtxh) { + int nclus = clusGen(reng); + for (int zint = 0; zint < vtxh.view().metadata().size(); ++zint) { + vtxh.view().zv()[zint] = 3.5f * gauss(reng); + } + + int aux = 0; + for (int iv = 0; iv < nclus; ++iv) { + auto nt = trackGen(reng); + pwsh.view().itrk()[iv] = nt; + for (int it = 0; it < nt; ++it) { + auto err = errgen(reng); // reality is not flat.... + pwsh.view().zt()[aux] = vtxh.view().zv()[iv] + err * gauss(reng); + pwsh.view().ezt2()[aux] = err * err; + pwsh.view().iv()[aux] = iv; + pwsh.view().ptt2()[aux] = (iv == 5 ? 
1.f : 0.5f) + ptGen(reng); + pwsh.view().ptt2()[aux] *= pwsh.view().ptt2()[aux]; + ++aux; + } + } + pwsh.view().ntrks() = aux; + // add noise + auto nt = 2 * trackGen(reng); + for (int it = 0; it < nt; ++it) { + auto err = 0.03f; + pwsh.view().zt()[it] = rgen(reng); + pwsh.view().ezt2()[it] = err * err; + pwsh.view().iv()[it] = 9999; + pwsh.view().ptt2()[it] = 0.5f + ptGen(reng); + pwsh.view().ptt2()[it] *= pwsh.view().ptt2()[it]; + } + } + + std::mt19937 reng; + std::uniform_real_distribution rgen; + std::uniform_real_distribution errgen; + std::poisson_distribution clusGen; + std::poisson_distribution trackGen; + std::normal_distribution gauss; + std::exponential_distribution ptGen; + }; + + namespace vertexfinder_t { +#ifdef ONE_KERNEL + class VertexFinderOneKernel { + public: + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + vertexFinder::VtxSoAView pdata, + vertexFinder::WsSoAView pws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster, + ) const { + vertexFinder::clusterTracksByDensity(acc, pdata, pws, minT, eps, errmax, chi2max); + alpaka::syncBlockThreads(acc); + vertexFinder::fitVertices(acc, pdata, pws, 50.); + alpaka::syncBlockThreads(acc); + vertexFinder::splitVertices(acc, pdata, pws, 9.f); + alpaka::syncBlockThreads(acc); + vertexFinder::fitVertices(acc, pdata, pws, 5000.); + alpaka::syncBlockThreads(acc); + vertexFinder::sortByPt2(acc, pdata, pws); + alpaka::syncBlockThreads(acc); + } + }; +#endif + + class Kernel_print { + public: + template + ALPAKA_FN_ACC void operator()(const TAcc& acc, + vertexFinder::VtxSoAView pdata, + vertexFinder::WsSoAView pws) const { + printf("nt,nv %d %d,%d\n", pws.ntrks(), pdata.nvFinal(), pws.nvIntermediate()); + } + }; + + void runKernels(Queue& queue) { + vertexFinder::PixelVertexWorkSpaceSoADevice ws_d(zVertex::MAXTRACKS, queue); + 
vertexFinder::PixelVertexWorkSpaceSoAHost ws_h(zVertex::MAXTRACKS, queue); + ZVertexHost vertices_h(queue); + ZVertexSoACollection vertices_d(queue); + + float eps = 0.1f; + std::array par{{eps, 0.01f, 9.0f}}; + for (int nav = 30; nav < 80; nav += 20) { + ClusterGenerator gen(nav, 10); + + for (int i = 8; i < 20; ++i) { + auto kk = i / 4; // M param + + gen(ws_h, vertices_h); + auto workDiv1D = make_workdiv(1, 1); + alpaka::exec(queue, workDiv1D, vertexFinder::Init{}, vertices_d.view(), ws_d.view()); + // std::cout << "v,t size " << ws_h.view().zt()[0] << ' ' << vertices_h.view().zv()[0] << std::endl; + alpaka::memcpy(queue, ws_d.buffer(), ws_h.buffer()); + alpaka::wait(queue); + + std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl; + + if ((i % 4) == 0) + par = {{eps, 0.02f, 12.0f}}; + if ((i % 4) == 1) + par = {{eps, 0.02f, 9.0f}}; + if ((i % 4) == 2) + par = {{eps, 0.01f, 9.0f}}; + if ((i % 4) == 3) + par = {{0.7f * eps, 0.01f, 9.0f}}; + + alpaka::exec(queue, workDiv1D, Kernel_print{}, vertices_d.view(), ws_d.view()); + + auto workDivClusterizer = make_workdiv(1, 512 + 256); +#ifdef ONE_KERNEL + alpaka::exec(queue, + workDivClusterizer, + VertexFinderOneKernel{}, + vertices_d.view(), + ws_d.view(), + kk, + par[0], + par[1], + par[2]); +#else + alpaka::exec( + queue, workDivClusterizer, CLUSTERIZE{}, vertices_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); +#endif + alpaka::wait(queue); + alpaka::exec(queue, workDiv1D, Kernel_print{}, vertices_d.view(), ws_d.view()); + alpaka::wait(queue); + + auto workDivFitter = make_workdiv(1, 1024 - 256); + + alpaka::exec( + queue, workDivFitter, vertexFinder::FitVerticesKernel{}, vertices_d.view(), ws_d.view(), 50.f); + + alpaka::memcpy(queue, vertices_h.buffer(), vertices_d.buffer()); + alpaka::wait(queue); + + if (vertices_h.view().nvFinal() == 0) { + std::cout << "NO VERTICES???" 
<< std::endl; + continue; + } + + for (auto j = 0U; j < vertices_h.view().nvFinal(); ++j) + if (vertices_h.view().ndof()[j] > 0) + vertices_h.view().chi2()[j] /= float(vertices_h.view().ndof()[j]); + { + auto mx = + std::minmax_element(vertices_h.view().chi2(), vertices_h.view().chi2() + vertices_h.view().nvFinal()); + std::cout << "after fit nv, min max chi2 " << vertices_h.view().nvFinal() << " " << *mx.first << ' ' + << *mx.second << std::endl; + } + + alpaka::exec( + queue, workDivFitter, vertexFinder::FitVerticesKernel{}, vertices_d.view(), ws_d.view(), 50.f); + alpaka::memcpy(queue, vertices_h.buffer(), vertices_d.buffer()); + alpaka::wait(queue); + + for (auto j = 0U; j < vertices_h.view().nvFinal(); ++j) + if (vertices_h.view().ndof()[j] > 0) + vertices_h.view().chi2()[j] /= float(vertices_h.view().ndof()[j]); + { + auto mx = + std::minmax_element(vertices_h.view().chi2(), vertices_h.view().chi2() + vertices_h.view().nvFinal()); + std::cout << "before splitting nv, min max chi2 " << vertices_h.view().nvFinal() << " " << *mx.first << ' ' + << *mx.second << std::endl; + } + + auto workDivSplitter = make_workdiv(1024, 64); + + // one vertex per block!!! + alpaka::exec( + queue, workDivSplitter, vertexFinder::SplitVerticesKernel{}, vertices_d.view(), ws_d.view(), 9.f); + alpaka::memcpy(queue, ws_h.buffer(), ws_d.buffer()); + alpaka::wait(queue); + std::cout << "after split " << ws_h.view().nvIntermediate() << std::endl; + + alpaka::exec( + queue, workDivFitter, vertexFinder::FitVerticesKernel{}, vertices_d.view(), ws_d.view(), 5000.f); + + auto workDivSorter = make_workdiv(1, 256); + alpaka::exec(queue, workDivSorter, vertexFinder::SortByPt2Kernel{}, vertices_d.view(), ws_d.view()); + alpaka::memcpy(queue, vertices_h.buffer(), vertices_d.buffer()); + alpaka::wait(queue); + + if (vertices_h.view().nvFinal() == 0) { + std::cout << "NO VERTICES???" 
<< std::endl; + continue; + } + + for (auto j = 0U; j < vertices_h.view().nvFinal(); ++j) + if (vertices_h.view().ndof()[j] > 0) + vertices_h.view().chi2()[j] /= float(vertices_h.view().ndof()[j]); + { + auto mx = + std::minmax_element(vertices_h.view().chi2(), vertices_h.view().chi2() + vertices_h.view().nvFinal()); + std::cout << "nv, min max chi2 " << vertices_h.view().nvFinal() << " " << *mx.first << ' ' << *mx.second + << std::endl; + } + + { + auto mx = std::minmax_element(vertices_h.view().wv(), vertices_h.view().wv() + vertices_h.view().nvFinal()); + std::cout << "min max error " << 1. / std::sqrt(*mx.first) << ' ' << 1. / std::sqrt(*mx.second) + << std::endl; + } + + { + auto mx = + std::minmax_element(vertices_h.view().ptv2(), vertices_h.view().ptv2() + vertices_h.view().nvFinal()); + std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl; + std::cout << "min max ptv2 " << vertices_h.view().ptv2()[vertices_h.view().sortInd()[0]] << ' ' + << vertices_h.view().ptv2()[vertices_h.view().sortInd()[vertices_h.view().nvFinal() - 1]] + << " at " << vertices_h.view().sortInd()[0] << ' ' + << vertices_h.view().sortInd()[vertices_h.view().nvFinal() - 1] << std::endl; + } + + float dd[vertices_h.view().nvFinal()]; + for (auto kv = 0U; kv < vertices_h.view().nvFinal(); ++kv) { + auto zr = vertices_h.view().zv()[kv]; + auto md = 500.0f; + for (int zint = 0; zint < ws_h.view().metadata().size(); ++zint) { + auto d = std::abs(zr - ws_h.view().zt()[zint]); + md = std::min(d, md); + } + dd[kv] = md; + } + if (i == 6) { + for (auto d : dd) + std::cout << d << ' '; + std::cout << std::endl; + } + auto mx = std::minmax_element(dd, dd + vertices_h.view().nvFinal()); + float rms = 0; + for (auto d : dd) + rms += d * d; + rms = std::sqrt(rms) / (vertices_h.view().nvFinal() - 1); + std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl; + + } // loop on events + } // lopp on ave vert + } + } // namespace vertexfinder_t +} // 
namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/RecoVertex/BeamSpotProducer/plugins/BuildFile.xml b/RecoVertex/BeamSpotProducer/plugins/BuildFile.xml index dec839e2af6cc..318ef5848183d 100644 --- a/RecoVertex/BeamSpotProducer/plugins/BuildFile.xml +++ b/RecoVertex/BeamSpotProducer/plugins/BuildFile.xml @@ -12,33 +12,42 @@ + + + + + + + + + @@ -48,3 +57,11 @@ + + + + + + + + diff --git a/RecoVertex/BeamSpotProducer/plugins/alpaka/BeamSpotDeviceProducer.cc b/RecoVertex/BeamSpotProducer/plugins/alpaka/BeamSpotDeviceProducer.cc new file mode 100644 index 0000000000000..bd597164827fa --- /dev/null +++ b/RecoVertex/BeamSpotProducer/plugins/alpaka/BeamSpotDeviceProducer.cc @@ -0,0 +1,59 @@ +#include "DataFormats/BeamSpot/interface/BeamSpot.h" +#include "DataFormats/BeamSpot/interface/BeamSpotHost.h" +#include "DataFormats/BeamSpot/interface/BeamSpotPOD.h" +#include "DataFormats/BeamSpot/interface/alpaka/BeamSpotDevice.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EDPutToken.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/global/EDProducer.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + + class BeamSpotDeviceProducer : public global::EDProducer<> { + public: + BeamSpotDeviceProducer(edm::ParameterSet const& config) + : legacyToken_{consumes(config.getParameter("src"))}, deviceToken_{produces()} {} + + void produce(edm::StreamID, device::Event& event, device::EventSetup const& setup) const override { + reco::BeamSpot const& beamspot = event.get(legacyToken_); + + BeamSpotHost hostProduct{event.queue()}; + hostProduct->x = beamspot.x0(); + hostProduct->y = beamspot.y0(); + hostProduct->z = beamspot.z0(); + hostProduct->sigmaZ = 
beamspot.sigmaZ(); + hostProduct->beamWidthX = beamspot.BeamWidthX(); + hostProduct->beamWidthY = beamspot.BeamWidthY(); + hostProduct->dxdz = beamspot.dxdz(); + hostProduct->dydz = beamspot.dydz(); + hostProduct->emittanceX = beamspot.emittanceX(); + hostProduct->emittanceY = beamspot.emittanceY(); + hostProduct->betaStar = beamspot.betaStar(); + + if constexpr (std::is_same_v) { + event.emplace(deviceToken_, std::move(hostProduct)); + } else { + BeamSpotDevice deviceProduct{event.queue()}; + alpaka::memcpy(event.queue(), deviceProduct.buffer(), hostProduct.const_buffer()); + event.emplace(deviceToken_, std::move(deviceProduct)); + } + } + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag{}); + descriptions.addWithDefaultLabel(desc); + } + + private: + const edm::EDGetTokenT legacyToken_; + const device::EDPutToken deviceToken_; + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/MakerMacros.h" +DEFINE_FWK_ALPAKA_MODULE(BeamSpotDeviceProducer); diff --git a/RecoVertex/BeamSpotProducer/python/BeamSpot_cff.py b/RecoVertex/BeamSpotProducer/python/BeamSpot_cff.py index 7cc651af22106..5c17275c17274 100644 --- a/RecoVertex/BeamSpotProducer/python/BeamSpot_cff.py +++ b/RecoVertex/BeamSpotProducer/python/BeamSpot_cff.py @@ -2,6 +2,7 @@ from RecoVertex.BeamSpotProducer.BeamSpot_cfi import * from RecoVertex.BeamSpotProducer.offlineBeamSpotToCUDA_cfi import offlineBeamSpotToCUDA +from RecoVertex.BeamSpotProducer.beamSpotDeviceProducer_cfi import beamSpotDeviceProducer as _beamSpotDeviceProducer offlineBeamSpotTask = cms.Task(offlineBeamSpot) @@ -9,3 +10,9 @@ _offlineBeamSpotTask_gpu = offlineBeamSpotTask.copy() _offlineBeamSpotTask_gpu.add(offlineBeamSpotToCUDA) gpu.toReplaceWith(offlineBeamSpotTask, _offlineBeamSpotTask_gpu) + +from Configuration.ProcessModifiers.alpaka_cff import alpaka +_offlineBeamSpotTask_alpaka = 
offlineBeamSpotTask.copy()  # right-hand side of the `_offlineBeamSpotTask_alpaka =` assignment that starts on the previous line
# Clone of the beam-spot device producer, configured to read the 'offlineBeamSpot' product.
offlineBeamSpotDevice = _beamSpotDeviceProducer.clone(src = cms.InputTag('offlineBeamSpot'))
# Add the clone to the copied task; the original offlineBeamSpotTask is not modified here.
_offlineBeamSpotTask_alpaka.add(offlineBeamSpotDevice)
# Swap the extended copy in for offlineBeamSpotTask through the 'alpaka' process modifier.
alpaka.toReplaceWith(offlineBeamSpotTask, _offlineBeamSpotTask_alpaka)