diff --git a/RecoTracker/LST/interface/LSTOutput.h b/RecoTracker/LST/interface/LSTOutput.h index a337f107e35ba..5be4f645d3416 100644 --- a/RecoTracker/LST/interface/LSTOutput.h +++ b/RecoTracker/LST/interface/LSTOutput.h @@ -7,17 +7,14 @@ class LSTOutput { public: LSTOutput() = default; - LSTOutput(std::vector> const& hitIdx, - std::vector const& len, - std::vector const& seedIdx, - std::vector const& trackCandidateType) { - hitIdx_ = std::move(hitIdx); - len_ = std::move(len); - seedIdx_ = std::move(seedIdx); - trackCandidateType_ = std::move(trackCandidateType); - } - - ~LSTOutput() = default; + LSTOutput(std::vector> hitIdx, /* sink parameters: non-const by-value so std::move really moves */ + std::vector len, + std::vector seedIdx, + std::vector trackCandidateType) + : hitIdx_(std::move(hitIdx)), + len_(std::move(len)), + seedIdx_(std::move(seedIdx)), + trackCandidateType_(std::move(trackCandidateType)) {} enum LSTTCType { T5 = 4, pT3 = 5, pT5 = 7, pLS = 8 }; diff --git a/RecoTracker/LST/interface/LSTPhase2OTHitsInput.h b/RecoTracker/LST/interface/LSTPhase2OTHitsInput.h index 40b265db3edb7..00fd77846c4c3 100644 --- a/RecoTracker/LST/interface/LSTPhase2OTHitsInput.h +++ b/RecoTracker/LST/interface/LSTPhase2OTHitsInput.h @@ -9,19 +9,12 @@ class LSTPhase2OTHitsInput { public: LSTPhase2OTHitsInput() = default; - LSTPhase2OTHitsInput(std::vector const& detId, - std::vector const& x, - std::vector const& y, - std::vector const& z, - std::vector const& hits) { - detId_ = std::move(detId); - x_ = std::move(x); - y_ = std::move(y); - z_ = std::move(z); - hits_ = std::move(hits); - } - - ~LSTPhase2OTHitsInput() = default; + LSTPhase2OTHitsInput(std::vector detId, /* sink parameters: non-const by-value so std::move really moves */ + std::vector x, + std::vector y, + std::vector z, + std::vector hits) + : detId_(std::move(detId)), x_(std::move(x)), y_(std::move(y)), z_(std::move(z)), hits_(std::move(hits)) {} std::vector const& detId() const { return detId_; } std::vector const& x() const { return x_; } diff --git 
a/RecoTracker/LST/interface/LSTPixelSeedInput.h b/RecoTracker/LST/interface/LSTPixelSeedInput.h index 2fb6a244a5648..18d3768b2e0fc 100644 --- a/RecoTracker/LST/interface/LSTPixelSeedInput.h +++ b/RecoTracker/LST/interface/LSTPixelSeedInput.h @@ -7,39 +7,36 @@ class LSTPixelSeedInput { public: LSTPixelSeedInput() = default; - LSTPixelSeedInput(std::vector const& px, - std::vector const& py, - std::vector const& pz, - std::vector const& dxy, - std::vector const& dz, - std::vector const& ptErr, - std::vector const& etaErr, - std::vector const& stateTrajGlbX, - std::vector const& stateTrajGlbY, - std::vector const& stateTrajGlbZ, - std::vector const& stateTrajGlbPx, - std::vector const& stateTrajGlbPy, - std::vector const& stateTrajGlbPz, - std::vector const& q, - std::vector> const& hitIdx) { - px_ = std::move(px); - py_ = std::move(py); - pz_ = std::move(pz); - dxy_ = std::move(dxy); - dz_ = std::move(dz); - ptErr_ = std::move(ptErr); - etaErr_ = std::move(etaErr); - stateTrajGlbX_ = std::move(stateTrajGlbX); - stateTrajGlbY_ = std::move(stateTrajGlbY); - stateTrajGlbZ_ = std::move(stateTrajGlbZ); - stateTrajGlbPx_ = std::move(stateTrajGlbPx); - stateTrajGlbPy_ = std::move(stateTrajGlbPy); - stateTrajGlbPz_ = std::move(stateTrajGlbPz); - q_ = std::move(q); - hitIdx_ = std::move(hitIdx); - } - - ~LSTPixelSeedInput() = default; + LSTPixelSeedInput(std::vector px, /* sink parameters: non-const by-value so std::move really moves */ + std::vector py, + std::vector pz, + std::vector dxy, + std::vector dz, + std::vector ptErr, + std::vector etaErr, + std::vector stateTrajGlbX, + std::vector stateTrajGlbY, + std::vector stateTrajGlbZ, + std::vector stateTrajGlbPx, + std::vector stateTrajGlbPy, + std::vector stateTrajGlbPz, + std::vector q, + std::vector> hitIdx) + : px_(std::move(px)), + py_(std::move(py)), + pz_(std::move(pz)), + dxy_(std::move(dxy)), + dz_(std::move(dz)), + ptErr_(std::move(ptErr)), + etaErr_(std::move(etaErr)), + 
stateTrajGlbX_(std::move(stateTrajGlbX)), + stateTrajGlbY_(std::move(stateTrajGlbY)), + stateTrajGlbZ_(std::move(stateTrajGlbZ)), + stateTrajGlbPx_(std::move(stateTrajGlbPx)), + stateTrajGlbPy_(std::move(stateTrajGlbPy)), + stateTrajGlbPz_(std::move(stateTrajGlbPz)), + q_(std::move(q)), + hitIdx_(std::move(hitIdx)) {} std::vector const& px() const { return px_; } std::vector const& py() const { return py_; } diff --git a/RecoTracker/LST/plugins/alpaka/LSTModulesDevESProducer.cc b/RecoTracker/LST/plugins/alpaka/LSTModulesDevESProducer.cc index 908eb796e581b..46c99993c5ed9 100644 --- a/RecoTracker/LST/plugins/alpaka/LSTModulesDevESProducer.cc +++ b/RecoTracker/LST/plugins/alpaka/LSTModulesDevESProducer.cc @@ -9,7 +9,7 @@ // LST includes #include "RecoTracker/LSTCore/interface/Module.h" -#include "RecoTracker/LSTCore/interface/LST.h" +#include "RecoTracker/LSTCore/interface/alpaka/LST.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { @@ -22,8 +22,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { descriptions.addWithDefaultLabel(desc); } - std::unique_ptr> produce(TrackerRecoGeometryRecord const& iRecord) { - return lst::loadAndFillESHost(); + std::unique_ptr<::lst::LSTESData> produce(TrackerRecoGeometryRecord const& iRecord) { + return ::lst::loadAndFillESHost(); } }; diff --git a/RecoTracker/LST/plugins/alpaka/LSTProducer.cc b/RecoTracker/LST/plugins/alpaka/LSTProducer.cc index 18bd7c25a9aec..e92ff549dffd1 100644 --- a/RecoTracker/LST/plugins/alpaka/LSTProducer.cc +++ b/RecoTracker/LST/plugins/alpaka/LSTProducer.cc @@ -19,7 +19,7 @@ #include "RecoTracker/Record/interface/TrackerRecoGeometryRecord.h" -#include "RecoTracker/LSTCore/interface/LST.h" +#include "RecoTracker/LSTCore/interface/alpaka/LST.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { @@ -87,11 +87,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { private: edm::EDGetTokenT lstPixelSeedInputToken_; edm::EDGetTokenT lstPhase2OTHitsInputToken_; - device::ESGetToken, TrackerRecoGeometryRecord> lstESToken_; + 
device::ESGetToken<::lst::LSTESData, TrackerRecoGeometryRecord> lstESToken_; const bool verbose_, nopLSDupClean_, tcpLSTriplets_; edm::EDPutTokenT lstOutputToken_; - lst::LST lst_; + lst::LST lst_; }; } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/RecoTracker/LSTCore/interface/Constants.h b/RecoTracker/LSTCore/interface/Constants.h index 725cf5f46b224..c0c342b6ad8a0 100644 --- a/RecoTracker/LSTCore/interface/Constants.h +++ b/RecoTracker/LSTCore/interface/Constants.h @@ -14,23 +14,21 @@ namespace lst { using Buf = alpaka::Buf; // Allocation wrapper function to make integration of the caching allocator easier and reduce code boilerplate. - template - ALPAKA_FN_HOST ALPAKA_FN_INLINE Buf, T> allocBufWrapper(TAcc const& devAccIn, - TSize nElements, - TQueue queue) { + template + ALPAKA_FN_HOST ALPAKA_FN_INLINE Buf allocBufWrapper(TDev const& dev, TSize nElements, TQueue queue) { #ifdef CACHE_ALLOC return cms::alpakatools::allocCachedBuf( - devAccIn, queue, alpaka_common::Vec1D(static_cast(nElements))); + dev, queue, alpaka_common::Vec1D(static_cast(nElements))); #else - return alpaka::allocBuf(devAccIn, + return alpaka::allocBuf(dev, alpaka_common::Vec1D(static_cast(nElements))); #endif } // Second allocation wrapper function when queue is not given. Reduces code boilerplate. 
- template - ALPAKA_FN_HOST ALPAKA_FN_INLINE Buf, T> allocBufWrapper(TAcc const& devAccIn, TSize nElements) { - return alpaka::allocBuf(devAccIn, + template + ALPAKA_FN_HOST ALPAKA_FN_INLINE Buf allocBufWrapper(TDev const& dev, TSize nElements) { + return alpaka::allocBuf(dev, alpaka_common::Vec1D(static_cast(nElements))); } diff --git a/RecoTracker/LSTCore/interface/EndcapGeometry.h b/RecoTracker/LSTCore/interface/EndcapGeometry.h index 555955d83941c..1a84d89abf90d 100644 --- a/RecoTracker/LSTCore/interface/EndcapGeometry.h +++ b/RecoTracker/LSTCore/interface/EndcapGeometry.h @@ -23,7 +23,6 @@ namespace lst { EndcapGeometry() = default; EndcapGeometry(std::string const& filename); - ~EndcapGeometry() = default; void load(std::string const&); void fillGeoMapArraysExplicit(); diff --git a/RecoTracker/LSTCore/interface/EndcapGeometryBuffer.h b/RecoTracker/LSTCore/interface/EndcapGeometryBuffer.h index 6a787a5ed95eb..2c6df9ab2773c 100644 --- a/RecoTracker/LSTCore/interface/EndcapGeometryBuffer.h +++ b/RecoTracker/LSTCore/interface/EndcapGeometryBuffer.h @@ -19,8 +19,8 @@ namespace lst { template void setData(TBuff const& buf) { - geoMapDetId = alpaka::getPtrNative(buf.geoMapDetId_buf); - geoMapPhi = alpaka::getPtrNative(buf.geoMapPhi_buf); + geoMapDetId = buf.geoMapDetId_buf.data(); + geoMapPhi = buf.geoMapPhi_buf.data(); } }; diff --git a/RecoTracker/LSTCore/interface/LST.h b/RecoTracker/LSTCore/interface/LST.h deleted file mode 100644 index ac23bd09a7ecf..0000000000000 --- a/RecoTracker/LSTCore/interface/LST.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef RecoTracker_LSTCore_interface_LST_h -#define RecoTracker_LSTCore_interface_LST_h - -#include "RecoTracker/LSTCore/interface/Constants.h" -#include "RecoTracker/LSTCore/interface/LSTESData.h" - -#include -#include -#include - -namespace lst { - template - class Event; - - template - class LST; - - template - class LST { - public: - LST() = default; - - template - void run(TQueue& queue, - bool verbose, - LSTESData> 
const* deviceESData, - std::vector const& see_px, - std::vector const& see_py, - std::vector const& see_pz, - std::vector const& see_dxy, - std::vector const& see_dz, - std::vector const& see_ptErr, - std::vector const& see_etaErr, - std::vector const& see_stateTrajGlbX, - std::vector const& see_stateTrajGlbY, - std::vector const& see_stateTrajGlbZ, - std::vector const& see_stateTrajGlbPx, - std::vector const& see_stateTrajGlbPy, - std::vector const& see_stateTrajGlbPz, - std::vector const& see_q, - std::vector> const& see_hitIdx, - std::vector const& ph2_detId, - std::vector const& ph2_x, - std::vector const& ph2_y, - std::vector const& ph2_z, - bool no_pls_dupclean, - bool tc_pls_triplets); - std::vector> const& hits() const { return out_tc_hitIdxs_; } - std::vector const& len() const { return out_tc_len_; } - std::vector const& seedIdx() const { return out_tc_seedIdx_; } - std::vector const& trackCandidateType() const { return out_tc_trackCandidateType_; } - - private: - void prepareInput(std::vector const& see_px, - std::vector const& see_py, - std::vector const& see_pz, - std::vector const& see_dxy, - std::vector const& see_dz, - std::vector const& see_ptErr, - std::vector const& see_etaErr, - std::vector const& see_stateTrajGlbX, - std::vector const& see_stateTrajGlbY, - std::vector const& see_stateTrajGlbZ, - std::vector const& see_stateTrajGlbPx, - std::vector const& see_stateTrajGlbPy, - std::vector const& see_stateTrajGlbPz, - std::vector const& see_q, - std::vector> const& see_hitIdx, - std::vector const& ph2_detId, - std::vector const& ph2_x, - std::vector const& ph2_y, - std::vector const& ph2_z); - - void getOutput(lst::Event& event); - std::vector getHitIdxs(short trackCandidateType, - unsigned int TCIdx, - unsigned int const* TCHitIndices, - unsigned int const* hitIndices); - - // Input and output vectors - std::vector in_trkX_; - std::vector in_trkY_; - std::vector in_trkZ_; - std::vector in_hitId_; - std::vector in_hitIdxs_; - std::vector 
in_hitIndices_vec0_; - std::vector in_hitIndices_vec1_; - std::vector in_hitIndices_vec2_; - std::vector in_hitIndices_vec3_; - std::vector in_deltaPhi_vec_; - std::vector in_ptIn_vec_; - std::vector in_ptErr_vec_; - std::vector in_px_vec_; - std::vector in_py_vec_; - std::vector in_pz_vec_; - std::vector in_eta_vec_; - std::vector in_etaErr_vec_; - std::vector in_phi_vec_; - std::vector in_charge_vec_; - std::vector in_seedIdx_vec_; - std::vector in_superbin_vec_; - std::vector in_pixelType_vec_; - std::vector in_isQuad_vec_; - std::vector> out_tc_hitIdxs_; - std::vector out_tc_len_; - std::vector out_tc_seedIdx_; - std::vector out_tc_trackCandidateType_; - }; - -} // namespace lst - -#endif diff --git a/RecoTracker/LSTCore/interface/LSTESData.h b/RecoTracker/LSTCore/interface/LSTESData.h index 833770e631d2d..9f51be48f28b6 100644 --- a/RecoTracker/LSTCore/interface/LSTESData.h +++ b/RecoTracker/LSTCore/interface/LSTESData.h @@ -19,16 +19,16 @@ namespace lst { uint16_t nLowerModules; unsigned int nPixels; unsigned int nEndCapMap; - std::shared_ptr> modulesBuffers; - std::shared_ptr> endcapGeometryBuffers; + ModulesBuffer modulesBuffers; + EndcapGeometryBuffer endcapGeometryBuffers; std::shared_ptr pixelMapping; LSTESData(uint16_t const& nModulesIn, uint16_t const& nLowerModulesIn, unsigned int const& nPixelsIn, unsigned int const& nEndCapMapIn, - std::shared_ptr> const& modulesBuffersIn, - std::shared_ptr> const& endcapGeometryBuffersIn, + ModulesBuffer const& modulesBuffersIn, + EndcapGeometryBuffer const& endcapGeometryBuffersIn, std::shared_ptr const& pixelMappingIn) : nModules(nModulesIn), nLowerModules(nLowerModulesIn), @@ -49,19 +49,19 @@ namespace cms::alpakatools { template static lst::LSTESData> copyAsync(TQueue& queue, lst::LSTESData const& srcData) { - auto deviceModulesBuffers = std::make_shared>>( - alpaka::getDev(queue), srcData.nModules, srcData.nPixels); - deviceModulesBuffers->copyFromSrc(queue, *srcData.modulesBuffers); + auto deviceModulesBuffers 
= + lst::ModulesBuffer>(alpaka::getDev(queue), srcData.nModules, srcData.nPixels); + deviceModulesBuffers.copyFromSrc(queue, srcData.modulesBuffers); auto deviceEndcapGeometryBuffers = - std::make_shared>>(alpaka::getDev(queue), srcData.nEndCapMap); - deviceEndcapGeometryBuffers->copyFromSrc(queue, *srcData.endcapGeometryBuffers); + lst::EndcapGeometryBuffer>(alpaka::getDev(queue), srcData.nEndCapMap); + deviceEndcapGeometryBuffers.copyFromSrc(queue, srcData.endcapGeometryBuffers); return lst::LSTESData>(srcData.nModules, srcData.nLowerModules, srcData.nPixels, srcData.nEndCapMap, - deviceModulesBuffers, - deviceEndcapGeometryBuffers, + std::move(deviceModulesBuffers), + std::move(deviceEndcapGeometryBuffers), srcData.pixelMapping); } }; diff --git a/RecoTracker/LSTCore/interface/Module.h b/RecoTracker/LSTCore/interface/Module.h index d45415f800a4f..7266ebd7bc49b 100644 --- a/RecoTracker/LSTCore/interface/Module.h +++ b/RecoTracker/LSTCore/interface/Module.h @@ -72,44 +72,44 @@ namespace lst { } else { return false; } - }; + } static bool parseIsLower(bool isInvertedx, unsigned int detId) { return (isInvertedx) ? !(detId & 1) : (detId & 1); - }; + } static unsigned int parsePartnerModuleId(unsigned int detId, bool isLowerx, bool isInvertedx) { return isLowerx ? (isInvertedx ? detId - 1 : detId + 1) : (isInvertedx ? 
detId + 1 : detId - 1); - }; + } template void setData(TBuff const& buf) { - detIds = alpaka::getPtrNative(buf.detIds_buf); - moduleMap = alpaka::getPtrNative(buf.moduleMap_buf); - mapdetId = alpaka::getPtrNative(buf.mapdetId_buf); - mapIdx = alpaka::getPtrNative(buf.mapIdx_buf); - nConnectedModules = alpaka::getPtrNative(buf.nConnectedModules_buf); - drdzs = alpaka::getPtrNative(buf.drdzs_buf); - dxdys = alpaka::getPtrNative(buf.dxdys_buf); - nModules = alpaka::getPtrNative(buf.nModules_buf); - nLowerModules = alpaka::getPtrNative(buf.nLowerModules_buf); - partnerModuleIndices = alpaka::getPtrNative(buf.partnerModuleIndices_buf); - - layers = alpaka::getPtrNative(buf.layers_buf); - rings = alpaka::getPtrNative(buf.rings_buf); - modules = alpaka::getPtrNative(buf.modules_buf); - rods = alpaka::getPtrNative(buf.rods_buf); - subdets = alpaka::getPtrNative(buf.subdets_buf); - sides = alpaka::getPtrNative(buf.sides_buf); - eta = alpaka::getPtrNative(buf.eta_buf); - r = alpaka::getPtrNative(buf.r_buf); - isInverted = alpaka::getPtrNative(buf.isInverted_buf); - isLower = alpaka::getPtrNative(buf.isLower_buf); - isAnchor = alpaka::getPtrNative(buf.isAnchor_buf); - moduleType = alpaka::getPtrNative(buf.moduleType_buf); - moduleLayerType = alpaka::getPtrNative(buf.moduleLayerType_buf); - lstLayers = alpaka::getPtrNative(buf.lstLayers_buf); - connectedPixels = alpaka::getPtrNative(buf.connectedPixels_buf); + detIds = buf.detIds_buf.data(); + moduleMap = buf.moduleMap_buf.data(); + mapdetId = buf.mapdetId_buf.data(); + mapIdx = buf.mapIdx_buf.data(); + nConnectedModules = buf.nConnectedModules_buf.data(); + drdzs = buf.drdzs_buf.data(); + dxdys = buf.dxdys_buf.data(); + nModules = buf.nModules_buf.data(); + nLowerModules = buf.nLowerModules_buf.data(); + partnerModuleIndices = buf.partnerModuleIndices_buf.data(); + + layers = buf.layers_buf.data(); + rings = buf.rings_buf.data(); + modules = buf.modules_buf.data(); + rods = buf.rods_buf.data(); + subdets = 
buf.subdets_buf.data(); + sides = buf.sides_buf.data(); + eta = buf.eta_buf.data(); + r = buf.r_buf.data(); + isInverted = buf.isInverted_buf.data(); + isLower = buf.isLower_buf.data(); + isAnchor = buf.isAnchor_buf.data(); + moduleType = buf.moduleType_buf.data(); + moduleLayerType = buf.moduleLayerType_buf.data(); + lstLayers = buf.lstLayers_buf.data(); + connectedPixels = buf.connectedPixels_buf.data(); } }; @@ -212,7 +212,6 @@ namespace lst { alpaka::memcpy(queue, lstLayers_buf, src.lstLayers_buf); alpaka::memcpy(queue, connectedPixels_buf, src.connectedPixels_buf); } - alpaka::wait(queue); } template diff --git a/RecoTracker/LSTCore/interface/ModuleConnectionMap.h b/RecoTracker/LSTCore/interface/ModuleConnectionMap.h index b3a931345b3a5..1d4445d3b423e 100644 --- a/RecoTracker/LSTCore/interface/ModuleConnectionMap.h +++ b/RecoTracker/LSTCore/interface/ModuleConnectionMap.h @@ -16,7 +16,6 @@ namespace lst { public: ModuleConnectionMap(); ModuleConnectionMap(std::string const& filename); - ~ModuleConnectionMap(); void load(std::string const&); void add(std::string const&); diff --git a/RecoTracker/LSTCore/interface/TiltedGeometry.h b/RecoTracker/LSTCore/interface/TiltedGeometry.h index b70a1d95a357b..420000dd38aa0 100644 --- a/RecoTracker/LSTCore/interface/TiltedGeometry.h +++ b/RecoTracker/LSTCore/interface/TiltedGeometry.h @@ -18,7 +18,6 @@ namespace lst { public: TiltedGeometry() = default; TiltedGeometry(std::string const& filename); - ~TiltedGeometry() = default; void load(std::string const&); diff --git a/RecoTracker/LSTCore/interface/alpaka/Constants.h b/RecoTracker/LSTCore/interface/alpaka/Constants.h index e2ebd979a59a3..9fed7760c721a 100644 --- a/RecoTracker/LSTCore/interface/alpaka/Constants.h +++ b/RecoTracker/LSTCore/interface/alpaka/Constants.h @@ -3,104 +3,124 @@ #include "RecoTracker/LSTCore/interface/Constants.h" -#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED +#if defined ALPAKA_ACC_GPU_CUDA_ENABLED #include +#elif defined ALPAKA_ACC_GPU_HIP_ENABLED 
+#include #endif -namespace lst { - - using namespace ALPAKA_ACCELERATOR_NAMESPACE; +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace lst { + + // Re-export some useful things from the main namespace + using ::lst::allocBufWrapper; + using ::lst::Buf; + using ::lst::max_blocks; + using ::lst::max_connected_modules; + using ::lst::n_max_nonpixel_track_candidates; + using ::lst::n_max_pixel_md_per_modules; + using ::lst::n_max_pixel_quintuplets; + using ::lst::n_max_pixel_segments_per_module; + using ::lst::n_max_pixel_track_candidates; + using ::lst::n_max_pixel_triplets; + using ::lst::Params_LS; + using ::lst::Params_pLS; + using ::lst::Params_pT3; + using ::lst::Params_pT5; + using ::lst::Params_T3; + using ::lst::Params_T5; + using ::lst::size_superbins; // Half precision wrapper functions. #if defined(FP16_Base) #define __F2H __float2half #define __H2F __half2float - typedef __half float FPX; + typedef __half float FPX; #else #define __F2H #define __H2F - typedef float FPX; + typedef float FPX; #endif - Vec3D constexpr elementsPerThread(Vec3D::all(static_cast(1))); + Vec3D constexpr elementsPerThread(Vec3D::all(static_cast(1))); // Needed for files that are compiled by g++ to not throw an error. // uint4 is defined only for CUDA, so we will have to revisit this soon when running on other backends. #if !defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !defined(ALPAKA_ACC_GPU_HIP_ENABLED) - struct uint4 { - unsigned int x; - unsigned int y; - unsigned int z; - unsigned int w; - }; -#endif - - // Adjust grid and block sizes based on backend configuration - template - ALPAKA_FN_HOST ALPAKA_FN_INLINE WorkDiv3D createWorkDiv(const Vec& blocksPerGrid, - const Vec& threadsPerBlock, - const Vec& elementsPerThreadArg) { - Vec adjustedBlocks = blocksPerGrid; - Vec adjustedThreads = threadsPerBlock; - - // Serial execution, so all launch parameters set to 1. 
-#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) - adjustedBlocks = Vec::all(static_cast(1)); - adjustedThreads = Vec::all(static_cast(1)); -#endif - - // Threads enabled, set number of blocks to 1. -#if defined(ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED) - adjustedBlocks = Vec::all(static_cast(1)); + struct uint4 { + unsigned int x; + unsigned int y; + unsigned int z; + unsigned int w; + }; #endif - return WorkDiv3D(adjustedBlocks, adjustedThreads, elementsPerThreadArg); - } - - // The constants below are usually used in functions like alpaka::math::min(), - // expecting a reference (T const&) in the arguments. Hence, - // ALPAKA_STATIC_ACC_MEM_GLOBAL needs to be used in addition to constexpr. - - // 15 MeV constant from the approximate Bethe-Bloch formula - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kMulsInGeV = 0.015; - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kMiniMulsPtScaleBarrel[6] = { - 0.0052, 0.0038, 0.0034, 0.0034, 0.0032, 0.0034}; - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kMiniMulsPtScaleEndcap[5] = {0.006, 0.006, 0.006, 0.006, 0.006}; - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kMiniRminMeanBarrel[6] = { - 25.007152356, 37.2186993757, 52.3104270826, 68.6658656666, 85.9770373007, 108.301772384}; - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kMiniRminMeanEndcap[5] = { - 130.992832231, 154.813883559, 185.352604327, 221.635123002, 265.022076742}; - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float k2Rinv1GeVf = (2.99792458e-3 * 3.8) / 2; - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kR1GeVf = 1. 
/ (2.99792458e-3 * 3.8); - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kSinAlphaMax = 0.95; - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float ptCut = PT_CUT; - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kDeltaZLum = 15.0; - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kPixelPSZpitch = 0.15; - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kStripPSZpitch = 2.4; - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kStrip2SZpitch = 5.0; - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWidth2S = 0.009; - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWidthPS = 0.01; - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kPt_betaMax = 7.0; - // Since C++ can't represent infinity, lst_INF = 123456789 was used to represent infinity in the data table - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float lst_INF = 123456789.0; - - namespace t5dnn { - - // Working points matching LST fake rate (43.9%) or signal acceptance (82.0%) - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kLSTWp1 = 0.3418833f; // 94.0% TPR, 43.9% FPR - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kLSTWp2 = 0.6177366f; // 82.0% TPR, 20.0% FPR - // Other working points - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp70 = 0.7776195f; // 70.0% TPR, 10.0% FPR - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp75 = 0.7181118f; // 75.0% TPR, 13.5% FPR - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp80 = 0.6492643f; // 80.0% TPR, 17.9% FPR - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp85 = 0.5655319f; // 85.0% TPR, 23.8% FPR - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp90 = 0.4592205f; // 90.0% TPR, 32.6% FPR - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp95 = 0.3073708f; // 95.0% TPR, 47.7% FPR - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp97p5 = 0.2001348f; // 97.5% TPR, 61.2% FPR - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp99 = 0.1120605f; // 99.0% TPR, 75.9% FPR - ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp99p9 = 0.0218196f; // 99.9% TPR, 95.4% FPR - - } // namespace t5dnn - -} //namespace lst + // 
Adjust grid and block sizes based on backend configuration + template > + ALPAKA_FN_HOST ALPAKA_FN_INLINE WorkDiv createWorkDiv(const Vec& blocksPerGrid, + const Vec& threadsPerBlock, + const Vec& elementsPerThreadArg) { + Vec adjustedBlocks = blocksPerGrid; + Vec adjustedThreads = threadsPerBlock; + + // special overrides for CPU/host cases + if constexpr (std::is_same_v) { + adjustedBlocks = Vec::all(static_cast(1)); + + if constexpr (alpaka::accMatchesTags) { + // Serial execution, set threads to 1 as well + adjustedThreads = Vec::all(static_cast(1)); // probably redundant + } + } + + return WorkDiv(adjustedBlocks, adjustedThreads, elementsPerThreadArg); + } + + // The constants below are usually used in functions like alpaka::math::min(), + // expecting a reference (T const&) in the arguments. Hence, + // ALPAKA_STATIC_ACC_MEM_GLOBAL needs to be used in addition to constexpr. + + // 15 MeV constant from the approximate Bethe-Bloch formula + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kMulsInGeV = 0.015; + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kMiniMulsPtScaleBarrel[6] = { + 0.0052, 0.0038, 0.0034, 0.0034, 0.0032, 0.0034}; + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kMiniMulsPtScaleEndcap[5] = {0.006, 0.006, 0.006, 0.006, 0.006}; + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kMiniRminMeanBarrel[6] = { + 25.007152356, 37.2186993757, 52.3104270826, 68.6658656666, 85.9770373007, 108.301772384}; + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kMiniRminMeanEndcap[5] = { + 130.992832231, 154.813883559, 185.352604327, 221.635123002, 265.022076742}; + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float k2Rinv1GeVf = (2.99792458e-3 * 3.8) / 2; + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kR1GeVf = 1. 
/ (2.99792458e-3 * 3.8); + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kSinAlphaMax = 0.95; + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float ptCut = PT_CUT; + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kDeltaZLum = 15.0; + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kPixelPSZpitch = 0.15; + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kStripPSZpitch = 2.4; + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kStrip2SZpitch = 5.0; + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWidth2S = 0.009; + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWidthPS = 0.01; + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kPt_betaMax = 7.0; + // Since C++ can't represent infinity, lst_INF = 123456789 was used to represent infinity in the data table + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float lst_INF = 123456789.0; + + namespace t5dnn { + + // Working points matching LST fake rate (43.9%) or signal acceptance (82.0%) + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kLSTWp1 = 0.3418833f; // 94.0% TPR, 43.9% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kLSTWp2 = 0.6177366f; // 82.0% TPR, 20.0% FPR + // Other working points + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp70 = 0.7776195f; // 70.0% TPR, 10.0% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp75 = 0.7181118f; // 75.0% TPR, 13.5% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp80 = 0.6492643f; // 80.0% TPR, 17.9% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp85 = 0.5655319f; // 85.0% TPR, 23.8% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp90 = 0.4592205f; // 90.0% TPR, 32.6% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp95 = 0.3073708f; // 95.0% TPR, 47.7% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp97p5 = 0.2001348f; // 97.5% TPR, 61.2% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp99 = 0.1120605f; // 99.0% TPR, 75.9% FPR + ALPAKA_STATIC_ACC_MEM_GLOBAL constexpr float kWp99p9 = 0.0218196f; // 99.9% TPR, 95.4% FPR + + } // namespace t5dnn + + } //namespace lst +} 
//namespace ALPAKA_ACCELERATOR_NAMESPACE #endif diff --git a/RecoTracker/LSTCore/interface/alpaka/LST.h b/RecoTracker/LSTCore/interface/alpaka/LST.h new file mode 100644 index 0000000000000..0e4c64d2535df --- /dev/null +++ b/RecoTracker/LSTCore/interface/alpaka/LST.h @@ -0,0 +1,110 @@ +#ifndef RecoTracker_LSTCore_interface_alpaka_LST_h +#define RecoTracker_LSTCore_interface_alpaka_LST_h + +#include "RecoTracker/LSTCore/interface/Constants.h" +#include "RecoTracker/LSTCore/interface/LSTESData.h" + +#include +#include +#include + +using ::lst::LSTESData; + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace lst { + class Event; + + class LST { + public: + LST() = default; + + void run(Queue& queue, + bool verbose, + LSTESData const* deviceESData, + std::vector const& see_px, + std::vector const& see_py, + std::vector const& see_pz, + std::vector const& see_dxy, + std::vector const& see_dz, + std::vector const& see_ptErr, + std::vector const& see_etaErr, + std::vector const& see_stateTrajGlbX, + std::vector const& see_stateTrajGlbY, + std::vector const& see_stateTrajGlbZ, + std::vector const& see_stateTrajGlbPx, + std::vector const& see_stateTrajGlbPy, + std::vector const& see_stateTrajGlbPz, + std::vector const& see_q, + std::vector> const& see_hitIdx, + std::vector const& ph2_detId, + std::vector const& ph2_x, + std::vector const& ph2_y, + std::vector const& ph2_z, + bool no_pls_dupclean, + bool tc_pls_triplets); + std::vector> const& hits() const { return out_tc_hitIdxs_; } + std::vector const& len() const { return out_tc_len_; } + std::vector const& seedIdx() const { return out_tc_seedIdx_; } + std::vector const& trackCandidateType() const { return out_tc_trackCandidateType_; } + + private: + void prepareInput(std::vector const& see_px, + std::vector const& see_py, + std::vector const& see_pz, + std::vector const& see_dxy, + std::vector const& see_dz, + std::vector const& see_ptErr, + std::vector const& see_etaErr, + std::vector const& see_stateTrajGlbX, + 
std::vector const& see_stateTrajGlbY, + std::vector const& see_stateTrajGlbZ, + std::vector const& see_stateTrajGlbPx, + std::vector const& see_stateTrajGlbPy, + std::vector const& see_stateTrajGlbPz, + std::vector const& see_q, + std::vector> const& see_hitIdx, + std::vector const& ph2_detId, + std::vector const& ph2_x, + std::vector const& ph2_y, + std::vector const& ph2_z); + + void getOutput(Event& event); + std::vector getHitIdxs(short trackCandidateType, + unsigned int TCIdx, + unsigned int const* TCHitIndices, + unsigned int const* hitIndices); + + // Input and output vectors + std::vector in_trkX_; + std::vector in_trkY_; + std::vector in_trkZ_; + std::vector in_hitId_; + std::vector in_hitIdxs_; + std::vector in_hitIndices_vec0_; + std::vector in_hitIndices_vec1_; + std::vector in_hitIndices_vec2_; + std::vector in_hitIndices_vec3_; + std::vector in_deltaPhi_vec_; + std::vector in_ptIn_vec_; + std::vector in_ptErr_vec_; + std::vector in_px_vec_; + std::vector in_py_vec_; + std::vector in_pz_vec_; + std::vector in_eta_vec_; + std::vector in_etaErr_vec_; + std::vector in_phi_vec_; + std::vector in_charge_vec_; + std::vector in_seedIdx_vec_; + std::vector in_superbin_vec_; + std::vector in_pixelType_vec_; + std::vector in_isQuad_vec_; + std::vector> out_tc_hitIdxs_; + std::vector out_tc_len_; + std::vector out_tc_seedIdx_; + std::vector out_tc_trackCandidateType_; + }; + + } // namespace lst +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#endif diff --git a/RecoTracker/LSTCore/src/LSTESData.cc b/RecoTracker/LSTCore/src/LSTESData.cc index 482d97d34249c..1acf085a0f491 100644 --- a/RecoTracker/LSTCore/src/LSTESData.cc +++ b/RecoTracker/LSTCore/src/LSTESData.cc @@ -39,9 +39,9 @@ namespace { } void loadMapsHost(lst::MapPLStoLayer& pLStoLayer, - std::shared_ptr endcapGeometry, - std::shared_ptr tiltedGeometry, - std::shared_ptr moduleConnectionMap) { + lst::EndcapGeometry& endcapGeometry, + lst::TiltedGeometry& tiltedGeometry, + lst::ModuleConnectionMap& 
moduleConnectionMap) { // Module orientation information (DrDz or phi angles) auto endcap_geom = get_absolute_path_after_check_file_exists(trackLooperDir() + "/data/OT800_IT615_pt0.8/endcap_orientation.bin"); @@ -51,9 +51,9 @@ namespace { auto mappath = get_absolute_path_after_check_file_exists( trackLooperDir() + "/data/OT800_IT615_pt0.8/module_connection_tracing_merged.bin"); - endcapGeometry->load(endcap_geom); - tiltedGeometry->load(tilted_geom); - moduleConnectionMap->load(mappath); + endcapGeometry.load(endcap_geom); + tiltedGeometry.load(tilted_geom); + moduleConnectionMap.load(mappath); auto pLSMapDir = trackLooperDir() + "/data/OT800_IT615_pt0.8/pixelmap/pLS_map"; const std::array connects{ @@ -80,34 +80,39 @@ std::unique_ptr> lst::loadAndFillESHost() uint16_t nModules; uint16_t nLowerModules; unsigned int nPixels; - std::shared_ptr> modulesBuffers = nullptr; - auto pLStoLayer = std::make_shared(); - auto endcapGeometry = std::make_shared(); - auto tiltedGeometry = std::make_shared(); - auto pixelMapping = std::make_shared(); - auto moduleConnectionMap = std::make_shared(); - ::loadMapsHost(*pLStoLayer, endcapGeometry, tiltedGeometry, moduleConnectionMap); + MapPLStoLayer pLStoLayer; + EndcapGeometry endcapGeometry; + TiltedGeometry tiltedGeometry; + PixelMap pixelMapping; + ModuleConnectionMap moduleConnectionMap; + ::loadMapsHost(pLStoLayer, endcapGeometry, tiltedGeometry, moduleConnectionMap); - auto endcapGeometryBuffers = std::make_shared>( - cms::alpakatools::host(), endcapGeometry->nEndCapMap); - alpaka::QueueCpuBlocking queue(cms::alpakatools::host()); - alpaka::memcpy( - queue, endcapGeometryBuffers->geoMapDetId_buf, endcapGeometry->geoMapDetId_buf, endcapGeometry->nEndCapMap); - alpaka::memcpy( - queue, endcapGeometryBuffers->geoMapPhi_buf, endcapGeometry->geoMapPhi_buf, endcapGeometry->nEndCapMap); + auto endcapGeometryBuffers = + EndcapGeometryBuffer(cms::alpakatools::host(), endcapGeometry.nEndCapMap); + 
std::memcpy(endcapGeometryBuffers.geoMapDetId_buf.data(), + endcapGeometry.geoMapDetId_buf.data(), + endcapGeometry.nEndCapMap * sizeof(unsigned int)); + std::memcpy(endcapGeometryBuffers.geoMapPhi_buf.data(), + endcapGeometry.geoMapPhi_buf.data(), + endcapGeometry.nEndCapMap * sizeof(float)); auto path = get_absolute_path_after_check_file_exists(trackLooperDir() + "/data/OT800_IT615_pt0.8/sensor_centroids.bin"); - lst::loadModulesFromFile(pLStoLayer.get(), - path.c_str(), - nModules, - nLowerModules, - nPixels, - modulesBuffers, - pixelMapping.get(), - endcapGeometry.get(), - tiltedGeometry.get(), - moduleConnectionMap.get()); - return std::make_unique>( - nModules, nLowerModules, nPixels, endcapGeometry->nEndCapMap, modulesBuffers, endcapGeometryBuffers, pixelMapping); + auto modulesBuffers = lst::loadModulesFromFile(pLStoLayer, + path.c_str(), + nModules, + nLowerModules, + nPixels, + pixelMapping, + endcapGeometry, + tiltedGeometry, + moduleConnectionMap); + auto pixelMappingPtr = std::make_shared(std::move(pixelMapping)); + return std::make_unique>(nModules, + nLowerModules, + nPixels, + endcapGeometry.nEndCapMap, + std::move(modulesBuffers), + std::move(endcapGeometryBuffers), + pixelMappingPtr); } diff --git a/RecoTracker/LSTCore/src/ModuleConnectionMap.cc b/RecoTracker/LSTCore/src/ModuleConnectionMap.cc index 732b8e155fb4e..d1b68b7f485bb 100644 --- a/RecoTracker/LSTCore/src/ModuleConnectionMap.cc +++ b/RecoTracker/LSTCore/src/ModuleConnectionMap.cc @@ -4,8 +4,6 @@ lst::ModuleConnectionMap::ModuleConnectionMap() {} lst::ModuleConnectionMap::ModuleConnectionMap(std::string const& filename) { load(filename); } -lst::ModuleConnectionMap::~ModuleConnectionMap() {} - void lst::ModuleConnectionMap::load(std::string const& filename) { moduleConnections_.clear(); @@ -69,15 +67,17 @@ void lst::ModuleConnectionMap::add(std::string const& filename) { connected_detids.push_back(connected_detid); } + auto& thisModuleConnections = moduleConnections_.at(detid); + // 
Concatenate - moduleConnections_[detid].insert(moduleConnections_[detid].end(), connected_detids.begin(), connected_detids.end()); + thisModuleConnections.insert(thisModuleConnections.end(), connected_detids.begin(), connected_detids.end()); // Sort - std::sort(moduleConnections_[detid].begin(), moduleConnections_[detid].end()); + std::sort(thisModuleConnections.begin(), thisModuleConnections.end()); // Unique - moduleConnections_[detid].erase(std::unique(moduleConnections_[detid].begin(), moduleConnections_[detid].end()), - moduleConnections_[detid].end()); + thisModuleConnections.erase(std::unique(thisModuleConnections.begin(), thisModuleConnections.end()), + thisModuleConnections.end()); } } diff --git a/RecoTracker/LSTCore/src/ModuleMethods.h b/RecoTracker/LSTCore/src/ModuleMethods.h index 9693a464fcf1a..bf51e262f69e5 100644 --- a/RecoTracker/LSTCore/src/ModuleMethods.h +++ b/RecoTracker/LSTCore/src/ModuleMethods.h @@ -12,6 +12,7 @@ #include "RecoTracker/LSTCore/interface/PixelMap.h" #include "HeterogeneousCore/AlpakaInterface/interface/host.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" namespace lst { struct ModuleMetaData { @@ -23,15 +24,13 @@ namespace lst { // https://github.com/cms-sw/cmssw/blob/5e809e8e0a625578aa265dc4b128a93830cb5429/Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h#L29 }; - template - inline void fillPixelMap(std::shared_ptr>& modulesBuf, + inline void fillPixelMap(ModulesBuffer& modulesBuf, uint16_t nModules, unsigned int& nPixels, PixelMap& pixelMapping, - TQueue queue, - const MapPLStoLayer& pLStoLayer, - ModuleMetaData& mmd) { - pixelMapping.pixelModuleIndex = mmd.detIdToIndex[1]; + MapPLStoLayer const& pLStoLayer, + ModuleMetaData const& mmd) { + pixelMapping.pixelModuleIndex = mmd.detIdToIndex.at(1); std::vector connectedModuleDetIds; std::vector connectedModuleDetIds_pos; @@ -81,68 +80,43 @@ namespace lst { unsigned int connectedPix_size = totalSizes + totalSizes_pos + totalSizes_neg; nPixels = 
connectedPix_size; - // Now we can initialize modulesBuf - alpaka_common::DevHost const& devHost = cms::alpakatools::host(); - if (modulesBuf == nullptr) { - modulesBuf = std::make_shared>(devHost, nModules, nPixels); - } + // Now we re-initialize connectedPixels_buf since nPixels is now known + modulesBuf.connectedPixels_buf = cms::alpakatools::make_host_buffer(nPixels); + modulesBuf.data_.setData(modulesBuf); - auto connectedPixels_buf = allocBufWrapper(devHost, connectedPix_size); - unsigned int* connectedPixels = alpaka::getPtrNative(connectedPixels_buf); + unsigned int* connectedPixels = modulesBuf.connectedPixels_buf.data(); for (unsigned int icondet = 0; icondet < totalSizes; icondet++) { - connectedPixels[icondet] = mmd.detIdToIndex[connectedModuleDetIds[icondet]]; + connectedPixels[icondet] = mmd.detIdToIndex.at(connectedModuleDetIds[icondet]); } for (unsigned int icondet = 0; icondet < totalSizes_pos; icondet++) { - connectedPixels[icondet + totalSizes] = mmd.detIdToIndex[connectedModuleDetIds_pos[icondet]]; + connectedPixels[icondet + totalSizes] = mmd.detIdToIndex.at(connectedModuleDetIds_pos[icondet]); } for (unsigned int icondet = 0; icondet < totalSizes_neg; icondet++) { - connectedPixels[icondet + totalSizes + totalSizes_pos] = mmd.detIdToIndex[connectedModuleDetIds_neg[icondet]]; + connectedPixels[icondet + totalSizes + totalSizes_pos] = mmd.detIdToIndex.at(connectedModuleDetIds_neg[icondet]); } + } - alpaka::memcpy(queue, modulesBuf->connectedPixels_buf, connectedPixels_buf); - alpaka::wait(queue); - }; - - template - inline void fillConnectedModuleArrayExplicit(ModulesBuffer* modulesBuf, - unsigned int nMod, - TQueue queue, - ModuleMetaData& mmd, - const ModuleConnectionMap* moduleConnectionMap) { - alpaka_common::DevHost const& devHost = cms::alpakatools::host(); - auto moduleMap_buf = allocBufWrapper(devHost, nMod * max_connected_modules); - uint16_t* moduleMap = alpaka::getPtrNative(moduleMap_buf); - - auto nConnectedModules_buf = 
allocBufWrapper(devHost, nMod); - uint16_t* nConnectedModules = alpaka::getPtrNative(nConnectedModules_buf); + inline void fillConnectedModuleArrayExplicit(ModulesBuffer& modulesBuf, + ModuleMetaData const& mmd, + ModuleConnectionMap const& moduleConnectionMap) { + uint16_t* moduleMap = modulesBuf.moduleMap_buf.data(); + uint16_t* nConnectedModules = modulesBuf.nConnectedModules_buf.data(); for (auto it = mmd.detIdToIndex.begin(); it != mmd.detIdToIndex.end(); ++it) { unsigned int detId = it->first; uint16_t index = it->second; - auto& connectedModules = moduleConnectionMap->getConnectedModuleDetIds(detId); + auto& connectedModules = moduleConnectionMap.getConnectedModuleDetIds(detId); nConnectedModules[index] = connectedModules.size(); for (uint16_t i = 0; i < nConnectedModules[index]; i++) { - moduleMap[index * max_connected_modules + i] = mmd.detIdToIndex[connectedModules[i]]; + moduleMap[index * max_connected_modules + i] = mmd.detIdToIndex.at(connectedModules[i]); } } + } - alpaka::memcpy(queue, modulesBuf->moduleMap_buf, moduleMap_buf); - alpaka::memcpy(queue, modulesBuf->nConnectedModules_buf, nConnectedModules_buf); - alpaka::wait(queue); - }; - - template - inline void fillMapArraysExplicit(ModulesBuffer* modulesBuf, - unsigned int nMod, - TQueue queue, - ModuleMetaData& mmd) { - alpaka_common::DevHost const& devHost = cms::alpakatools::host(); - auto mapIdx_buf = allocBufWrapper(devHost, nMod); - uint16_t* mapIdx = alpaka::getPtrNative(mapIdx_buf); - - auto mapdetId_buf = allocBufWrapper(devHost, nMod); - unsigned int* mapdetId = alpaka::getPtrNative(mapdetId_buf); + inline void fillMapArraysExplicit(ModulesBuffer& modulesBuf, ModuleMetaData const& mmd) { + uint16_t* mapIdx = modulesBuf.mapIdx_buf.data(); + unsigned int* mapdetId = modulesBuf.mapdetId_buf.data(); unsigned int counter = 0; for (auto it = mmd.detIdToIndex.begin(); it != mmd.detIdToIndex.end(); ++it) { @@ -152,11 +126,7 @@ namespace lst { mapdetId[counter] = detId; counter++; } - - 
alpaka::memcpy(queue, modulesBuf->mapIdx_buf, mapIdx_buf); - alpaka::memcpy(queue, modulesBuf->mapdetId_buf, mapdetId_buf); - alpaka::wait(queue); - }; + } inline void setDerivedQuantities(unsigned int detId, unsigned short& layer, @@ -179,7 +149,7 @@ namespace lst { r = std::sqrt(m_x * m_x + m_y * m_y + m_z * m_z); eta = ((m_z > 0) - (m_z < 0)) * std::acosh(r / std::sqrt(m_x * m_x + m_y * m_y)); - }; + } inline void loadCentroidsFromFile(const char* filePath, ModuleMetaData& mmd, uint16_t& nModules) { std::ifstream ifile(filePath, std::ios::binary); @@ -216,61 +186,46 @@ namespace lst { mmd.detIdToIndex[1] = counter; //pixel module is the last module in the module list counter++; nModules = counter; - }; - - inline void loadModulesFromFile(const MapPLStoLayer* pLStoLayer, - const char* moduleMetaDataFilePath, - uint16_t& nModules, - uint16_t& nLowerModules, - unsigned int& nPixels, - std::shared_ptr>& modulesBuf, - PixelMap* pixelMapping, - const EndcapGeometry* endcapGeometry, - const TiltedGeometry* tiltedGeometry, - const ModuleConnectionMap* moduleConnectionMap) { + } + + inline ModulesBuffer loadModulesFromFile(MapPLStoLayer const& pLStoLayer, + const char* moduleMetaDataFilePath, + uint16_t& nModules, + uint16_t& nLowerModules, + unsigned int& nPixels, + PixelMap& pixelMapping, + const EndcapGeometry& endcapGeometry, + const TiltedGeometry& tiltedGeometry, + const ModuleConnectionMap& moduleConnectionMap) { ModuleMetaData mmd; loadCentroidsFromFile(moduleMetaDataFilePath, mmd, nModules); - alpaka_common::DevHost const& devHost = cms::alpakatools::host(); - auto detIds_buf = allocBufWrapper(devHost, nModules); - auto layers_buf = allocBufWrapper(devHost, nModules); - auto rings_buf = allocBufWrapper(devHost, nModules); - auto rods_buf = allocBufWrapper(devHost, nModules); - auto modules_buf = allocBufWrapper(devHost, nModules); - auto subdets_buf = allocBufWrapper(devHost, nModules); - auto sides_buf = allocBufWrapper(devHost, nModules); - auto eta_buf = 
allocBufWrapper(devHost, nModules); - auto r_buf = allocBufWrapper(devHost, nModules); - auto isInverted_buf = allocBufWrapper(devHost, nModules); - auto isLower_buf = allocBufWrapper(devHost, nModules); - auto isAnchor_buf = allocBufWrapper(devHost, nModules); - auto moduleType_buf = allocBufWrapper(devHost, nModules); - auto moduleLayerType_buf = allocBufWrapper(devHost, nModules); - auto dxdys_buf = allocBufWrapper(devHost, nModules); - auto drdzs_buf = allocBufWrapper(devHost, nModules); - auto partnerModuleIndices_buf = allocBufWrapper(devHost, nModules); - auto lstLayers_buf = allocBufWrapper(devHost, nModules); + // Initialize modulesBuf, but with nPixels = 0 + // The fields that require nPixels are re-initialized in fillPixelMap + ModulesBuffer modulesBuf(cms::alpakatools::host(), nModules, 0); // Getting the underlying data pointers - unsigned int* host_detIds = alpaka::getPtrNative(detIds_buf); - short* host_layers = alpaka::getPtrNative(layers_buf); - short* host_rings = alpaka::getPtrNative(rings_buf); - short* host_rods = alpaka::getPtrNative(rods_buf); - short* host_modules = alpaka::getPtrNative(modules_buf); - short* host_subdets = alpaka::getPtrNative(subdets_buf); - short* host_sides = alpaka::getPtrNative(sides_buf); - float* host_eta = alpaka::getPtrNative(eta_buf); - float* host_r = alpaka::getPtrNative(r_buf); - bool* host_isInverted = alpaka::getPtrNative(isInverted_buf); - bool* host_isLower = alpaka::getPtrNative(isLower_buf); - bool* host_isAnchor = alpaka::getPtrNative(isAnchor_buf); - ModuleType* host_moduleType = alpaka::getPtrNative(moduleType_buf); - ModuleLayerType* host_moduleLayerType = alpaka::getPtrNative(moduleLayerType_buf); - float* host_dxdys = alpaka::getPtrNative(dxdys_buf); - float* host_drdzs = alpaka::getPtrNative(drdzs_buf); - uint16_t* host_partnerModuleIndices = alpaka::getPtrNative(partnerModuleIndices_buf); - int* host_lstLayers = alpaka::getPtrNative(lstLayers_buf); + unsigned int* host_detIds = 
modulesBuf.detIds_buf.data(); + short* host_layers = modulesBuf.layers_buf.data(); + short* host_rings = modulesBuf.rings_buf.data(); + short* host_rods = modulesBuf.rods_buf.data(); + short* host_modules = modulesBuf.modules_buf.data(); + short* host_subdets = modulesBuf.subdets_buf.data(); + short* host_sides = modulesBuf.sides_buf.data(); + float* host_eta = modulesBuf.eta_buf.data(); + float* host_r = modulesBuf.r_buf.data(); + bool* host_isInverted = modulesBuf.isInverted_buf.data(); + bool* host_isLower = modulesBuf.isLower_buf.data(); + bool* host_isAnchor = modulesBuf.isAnchor_buf.data(); + ModuleType* host_moduleType = modulesBuf.moduleType_buf.data(); + ModuleLayerType* host_moduleLayerType = modulesBuf.moduleLayerType_buf.data(); + float* host_dxdys = modulesBuf.dxdys_buf.data(); + float* host_drdzs = modulesBuf.drdzs_buf.data(); + uint16_t* host_nModules = modulesBuf.nModules_buf.data(); + uint16_t* host_nLowerModules = modulesBuf.nLowerModules_buf.data(); + uint16_t* host_partnerModuleIndices = modulesBuf.partnerModuleIndices_buf.data(); + int* host_lstLayers = modulesBuf.lstLayers_buf.data(); //reassign detIdToIndex indices here nLowerModules = (nModules - 1) / 2; @@ -347,8 +302,8 @@ namespace lst { host_isAnchor[index] = false; } - host_dxdys[index] = (subdet == Endcap) ? endcapGeometry->getdxdy_slope(detId) : tiltedGeometry->getDxDy(detId); - host_drdzs[index] = (subdet == Barrel) ? tiltedGeometry->getDrDz(detId) : 0; + host_dxdys[index] = (subdet == Endcap) ? endcapGeometry.getdxdy_slope(detId) : tiltedGeometry.getDxDy(detId); + host_drdzs[index] = (subdet == Barrel) ? 
tiltedGeometry.getDrDz(detId) : 0; } host_lstLayers[index] = @@ -372,41 +327,15 @@ namespace lst { } } - // TODO: We don't need a queue, but this code needs to be refactored - alpaka::QueueCpuBlocking queue(cms::alpakatools::host()); - - // modulesBuf is initialized in fillPixelMap since both nModules and nPix will be known - fillPixelMap(modulesBuf, nModules, nPixels, *pixelMapping, queue, *pLStoLayer, mmd); - - auto src_view_nModules = alpaka::createView(devHost, &nModules, (alpaka_common::Idx)1u); - alpaka::memcpy(queue, modulesBuf->nModules_buf, src_view_nModules); - - auto src_view_nLowerModules = alpaka::createView(devHost, &nLowerModules, (alpaka_common::Idx)1u); - alpaka::memcpy(queue, modulesBuf->nLowerModules_buf, src_view_nLowerModules); - - alpaka::memcpy(queue, modulesBuf->moduleType_buf, moduleType_buf); - alpaka::memcpy(queue, modulesBuf->moduleLayerType_buf, moduleLayerType_buf); - - alpaka::memcpy(queue, modulesBuf->detIds_buf, detIds_buf); - alpaka::memcpy(queue, modulesBuf->layers_buf, layers_buf); - alpaka::memcpy(queue, modulesBuf->rings_buf, rings_buf); - alpaka::memcpy(queue, modulesBuf->rods_buf, rods_buf); - alpaka::memcpy(queue, modulesBuf->modules_buf, modules_buf); - alpaka::memcpy(queue, modulesBuf->subdets_buf, subdets_buf); - alpaka::memcpy(queue, modulesBuf->sides_buf, sides_buf); - alpaka::memcpy(queue, modulesBuf->eta_buf, eta_buf); - alpaka::memcpy(queue, modulesBuf->r_buf, r_buf); - alpaka::memcpy(queue, modulesBuf->isInverted_buf, isInverted_buf); - alpaka::memcpy(queue, modulesBuf->isLower_buf, isLower_buf); - alpaka::memcpy(queue, modulesBuf->isAnchor_buf, isAnchor_buf); - alpaka::memcpy(queue, modulesBuf->dxdys_buf, dxdys_buf); - alpaka::memcpy(queue, modulesBuf->drdzs_buf, drdzs_buf); - alpaka::memcpy(queue, modulesBuf->partnerModuleIndices_buf, partnerModuleIndices_buf); - alpaka::memcpy(queue, modulesBuf->lstLayers_buf, lstLayers_buf); - alpaka::wait(queue); - - fillConnectedModuleArrayExplicit(modulesBuf.get(), nModules, 
queue, mmd, moduleConnectionMap); - fillMapArraysExplicit(modulesBuf.get(), nModules, queue, mmd); - }; + fillPixelMap(modulesBuf, nModules, nPixels, pixelMapping, pLStoLayer, mmd); + + *host_nModules = nModules; + *host_nLowerModules = nLowerModules; + + fillConnectedModuleArrayExplicit(modulesBuf, mmd, moduleConnectionMap); + fillMapArraysExplicit(modulesBuf, mmd); + + return modulesBuf; + } } // namespace lst #endif diff --git a/RecoTracker/LSTCore/src/alpaka/Event.dev.cc b/RecoTracker/LSTCore/src/alpaka/Event.dev.cc index 43f5bb7d9c3fe..62629bb08fc52 100644 --- a/RecoTracker/LSTCore/src/alpaka/Event.dev.cc +++ b/RecoTracker/LSTCore/src/alpaka/Event.dev.cc @@ -1,8 +1,16 @@ +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" + #include "Event.h" -using namespace ALPAKA_ACCELERATOR_NAMESPACE; +using Device = ALPAKA_ACCELERATOR_NAMESPACE::Device; +using Queue = ALPAKA_ACCELERATOR_NAMESPACE::Queue; +using Acc1D = ALPAKA_ACCELERATOR_NAMESPACE::Acc1D; +using Acc3D = ALPAKA_ACCELERATOR_NAMESPACE::Acc3D; + +using namespace ALPAKA_ACCELERATOR_NAMESPACE::lst; -void lst::Event::init(bool verbose) { +void Event::initSync(bool verbose) { + alpaka::wait(queue); // other calls can be asynchronous addObjects = verbose; hitsInGPU = nullptr; mdsInGPU = nullptr; @@ -44,7 +52,8 @@ void lst::Event::init(bool verbose) { } } -void lst::Event::resetEvent() { +void Event::resetEventSync() { + alpaka::wait(queue); // synchronize to reset consistently //reset the arrays for (int i = 0; i < 6; i++) { n_hits_by_layer_barrel_[i] = 0; @@ -150,24 +159,24 @@ void lst::Event::resetEvent() { } } -void lst::Event::addHitToEvent(std::vector const& x, - std::vector const& y, - std::vector const& z, - std::vector const& detId, - std::vector const& idxInNtuple) { +void Event::addHitToEvent(std::vector const& x, + std::vector const& y, + std::vector const& z, + std::vector const& detId, + std::vector const& idxInNtuple) { // Use the actual number of hits instead of a max. 
unsigned int nHits = x.size(); // Initialize space on device/host for next event. if (hitsInGPU == nullptr) { - hitsInGPU = new lst::Hits(); - hitsBuffers = new lst::HitsBuffer(nModules_, nHits, devAcc, queue); + hitsInGPU = new Hits(); + hitsBuffers = new HitsBuffer(nModules_, nHits, devAcc, queue); hitsInGPU->setData(*hitsBuffers); } if (rangesInGPU == nullptr) { - rangesInGPU = new lst::ObjectRanges(); - rangesBuffers = new lst::ObjectRangesBuffer(nModules_, nLowerModules_, devAcc, queue); + rangesInGPU = new ObjectRanges(); + rangesBuffers = new ObjectRangesBuffer(nModules_, nLowerModules_, devAcc, queue); rangesInGPU->setData(*rangesBuffers); } @@ -181,60 +190,51 @@ void lst::Event::addHitToEvent(std::vector const& x, alpaka::memcpy(queue, hitsBuffers->detid_buf, detId, nHits); alpaka::memcpy(queue, hitsBuffers->idxs_buf, idxInNtuple, nHits); alpaka::memcpy(queue, hitsBuffers->nHits_buf, nHits_view); - alpaka::wait(queue); + alpaka::wait(queue); // FIXME: remove synch after inputs refactored to be in pinned memory Vec3D const threadsPerBlock1{1, 1, 256}; Vec3D const blocksPerGrid1{1, 1, max_blocks}; WorkDiv3D const hit_loop_workdiv = createWorkDiv(blocksPerGrid1, threadsPerBlock1, elementsPerThread); - hitLoopKernel hit_loop_kernel; - auto const hit_loop_task( - alpaka::createTaskKernel(hit_loop_workdiv, - hit_loop_kernel, - Endcap, - TwoS, - nModules_, - nEndCapMap_, - alpaka::getPtrNative(endcapGeometryBuffers_->geoMapDetId_buf), - alpaka::getPtrNative(endcapGeometryBuffers_->geoMapPhi_buf), - *modulesBuffers_->data(), - *hitsInGPU, - nHits)); - - alpaka::enqueue(queue, hit_loop_task); + alpaka::exec(queue, + hit_loop_workdiv, + HitLoopKernel{}, + ::lst::Endcap, + ::lst::TwoS, + nModules_, + nEndCapMap_, + endcapGeometryBuffers_.geoMapDetId_buf.data(), + endcapGeometryBuffers_.geoMapPhi_buf.data(), + *modulesBuffers_.data(), + *hitsInGPU, + nHits); Vec3D const threadsPerBlock2{1, 1, 256}; Vec3D const blocksPerGrid2{1, 1, max_blocks}; WorkDiv3D const 
module_ranges_workdiv = createWorkDiv(blocksPerGrid2, threadsPerBlock2, elementsPerThread); - moduleRangesKernel module_ranges_kernel; - auto const module_ranges_task(alpaka::createTaskKernel( - module_ranges_workdiv, module_ranges_kernel, *modulesBuffers_->data(), *hitsInGPU, nLowerModules_)); - - // Waiting isn't needed after second kernel call. Saves ~100 us. - // This is because addPixelSegmentToEvent (which is run next) doesn't rely on hitsBuffers->hitrange variables. - // Also, modulesInGPU->partnerModuleIndices is not alterned in addPixelSegmentToEvent. - alpaka::enqueue(queue, module_ranges_task); + alpaka::exec( + queue, module_ranges_workdiv, ModuleRangesKernel{}, *modulesBuffers_.data(), *hitsInGPU, nLowerModules_); } -void lst::Event::addPixelSegmentToEvent(std::vector const& hitIndices0, - std::vector const& hitIndices1, - std::vector const& hitIndices2, - std::vector const& hitIndices3, - std::vector const& dPhiChange, - std::vector const& ptIn, - std::vector const& ptErr, - std::vector const& px, - std::vector const& py, - std::vector const& pz, - std::vector const& eta, - std::vector const& etaErr, - std::vector const& phi, - std::vector const& charge, - std::vector const& seedIdx, - std::vector const& superbin, - std::vector const& pixelType, - std::vector const& isQuad) { +void Event::addPixelSegmentToEvent(std::vector const& hitIndices0, + std::vector const& hitIndices1, + std::vector const& hitIndices2, + std::vector const& hitIndices3, + std::vector const& dPhiChange, + std::vector const& ptIn, + std::vector const& ptErr, + std::vector const& px, + std::vector const& py, + std::vector const& pz, + std::vector const& eta, + std::vector const& etaErr, + std::vector const& phi, + std::vector const& charge, + std::vector const& seedIdx, + std::vector const& superbin, + std::vector const& pixelType, + std::vector const& isQuad) { unsigned int size = ptIn.size(); if (size > n_max_pixel_segments_per_module) { @@ -247,80 +247,63 @@ void 
lst::Event::addPixelSegmentToEvent(std::vector const& } unsigned int mdSize = 2 * size; - uint16_t pixelModuleIndex = pixelMapping_->pixelModuleIndex; + uint16_t pixelModuleIndex = pixelMapping_.pixelModuleIndex; if (mdsInGPU == nullptr) { // Create a view for the element nLowerModules_ inside rangesBuffers->miniDoubletModuleOccupancy auto dst_view_miniDoubletModuleOccupancy = alpaka::createSubView(rangesBuffers->miniDoubletModuleOccupancy_buf, (Idx)1u, (Idx)nLowerModules_); - // Create a source view for the value to be set - int value = n_max_pixel_md_per_modules; - auto src_view_value = alpaka::createView(devHost, &value, (Idx)1u); - - alpaka::memcpy(queue, dst_view_miniDoubletModuleOccupancy, src_view_value); - alpaka::wait(queue); + // Create a host buffer for a value to be passed to the device + auto pixelMaxMDs_buf_h = cms::alpakatools::make_host_buffer(queue, (Idx)1u); + *pixelMaxMDs_buf_h.data() = n_max_pixel_md_per_modules; - Vec3D const threadsPerBlockCreateMD{1, 1, 1024}; - Vec3D const blocksPerGridCreateMD{1, 1, 1}; - WorkDiv3D const createMDArrayRangesGPU_workDiv = - createWorkDiv(blocksPerGridCreateMD, threadsPerBlockCreateMD, elementsPerThread); + alpaka::memcpy(queue, dst_view_miniDoubletModuleOccupancy, pixelMaxMDs_buf_h); - lst::createMDArrayRangesGPU createMDArrayRangesGPU_kernel; - auto const createMDArrayRangesGPUTask(alpaka::createTaskKernel( - createMDArrayRangesGPU_workDiv, createMDArrayRangesGPU_kernel, *modulesBuffers_->data(), *rangesInGPU)); + WorkDiv1D const createMDArrayRangesGPU_workDiv = createWorkDiv({1}, {1024}, {1}); - alpaka::enqueue(queue, createMDArrayRangesGPUTask); - alpaka::wait(queue); + alpaka::exec( + queue, createMDArrayRangesGPU_workDiv, CreateMDArrayRangesGPU{}, *modulesBuffers_.data(), *rangesInGPU); - unsigned int nTotalMDs; - auto nTotalMDs_view = alpaka::createView(devHost, &nTotalMDs, (Idx)1u); + auto nTotalMDs_buf_h = cms::alpakatools::make_host_buffer(queue, (Idx)1u); + alpaka::memcpy(queue, nTotalMDs_buf_h, 
rangesBuffers->device_nTotalMDs_buf); + alpaka::wait(queue); // wait to get the data before manipulation - alpaka::memcpy(queue, nTotalMDs_view, rangesBuffers->device_nTotalMDs_buf); - alpaka::wait(queue); + *nTotalMDs_buf_h.data() += n_max_pixel_md_per_modules; + unsigned int nTotalMDs = *nTotalMDs_buf_h.data(); - nTotalMDs += n_max_pixel_md_per_modules; - - mdsInGPU = new lst::MiniDoublets(); - miniDoubletsBuffers = new lst::MiniDoubletsBuffer(nTotalMDs, nLowerModules_, devAcc, queue); + mdsInGPU = new MiniDoublets(); + miniDoubletsBuffers = new MiniDoubletsBuffer(nTotalMDs, nLowerModules_, devAcc, queue); mdsInGPU->setData(*miniDoubletsBuffers); - alpaka::memcpy(queue, miniDoubletsBuffers->nMemoryLocations_buf, nTotalMDs_view); - alpaka::wait(queue); + alpaka::memcpy(queue, miniDoubletsBuffers->nMemoryLocations_buf, nTotalMDs_buf_h); } if (segmentsInGPU == nullptr) { // can be optimized here: because we didn't distinguish pixel segments and outer-tracker segments and call them both "segments", so they use the index continuously. 
// If we want to further study the memory footprint in detail, we can separate the two and allocate different memories to them - Vec3D const threadsPerBlockCreateSeg{1, 1, 1024}; - Vec3D const blocksPerGridCreateSeg{1, 1, 1}; - WorkDiv3D const createSegmentArrayRanges_workDiv = - createWorkDiv(blocksPerGridCreateSeg, threadsPerBlockCreateSeg, elementsPerThread); + WorkDiv1D const createSegmentArrayRanges_workDiv = createWorkDiv({1}, {1024}, {1}); - lst::createSegmentArrayRanges createSegmentArrayRanges_kernel; - auto const createSegmentArrayRangesTask(alpaka::createTaskKernel(createSegmentArrayRanges_workDiv, - createSegmentArrayRanges_kernel, - *modulesBuffers_->data(), - *rangesInGPU, - *mdsInGPU)); + alpaka::exec(queue, + createSegmentArrayRanges_workDiv, + CreateSegmentArrayRanges{}, + *modulesBuffers_.data(), + *rangesInGPU, + *mdsInGPU); - alpaka::enqueue(queue, createSegmentArrayRangesTask); - alpaka::wait(queue); - - auto nTotalSegments_view = alpaka::createView(devHost, &nTotalSegments, (Idx)1u); + auto nTotalSegments_view = alpaka::createView(devHost, &nTotalSegments_, (Idx)1u); alpaka::memcpy(queue, nTotalSegments_view, rangesBuffers->device_nTotalSegs_buf); - alpaka::wait(queue); + alpaka::wait(queue); // wait to get the value before manipulation - nTotalSegments += n_max_pixel_segments_per_module; + nTotalSegments_ += n_max_pixel_segments_per_module; - segmentsInGPU = new lst::Segments(); + segmentsInGPU = new Segments(); segmentsBuffers = - new lst::SegmentsBuffer(nTotalSegments, nLowerModules_, n_max_pixel_segments_per_module, devAcc, queue); + new SegmentsBuffer(nTotalSegments_, nLowerModules_, n_max_pixel_segments_per_module, devAcc, queue); segmentsInGPU->setData(*segmentsBuffers); alpaka::memcpy(queue, segmentsBuffers->nMemoryLocations_buf, nTotalSegments_view); - alpaka::wait(queue); } auto hitIndices0_dev = allocBufWrapper(devAcc, size, queue); @@ -367,68 +350,55 @@ void lst::Event::addPixelSegmentToEvent(std::vector const& 
alpaka::createSubView(miniDoubletsBuffers->totOccupancyMDs_buf, (Idx)1u, (Idx)pixelModuleIndex); alpaka::memcpy(queue, dst_view_totOccupancyMDs, src_view_mdSize); - alpaka::wait(queue); + alpaka::wait(queue); // FIXME: remove synch after inputs refactored to be in pinned memory Vec3D const threadsPerBlock{1, 1, 256}; Vec3D const blocksPerGrid{1, 1, max_blocks}; WorkDiv3D const addPixelSegmentToEvent_workdiv = createWorkDiv(blocksPerGrid, threadsPerBlock, elementsPerThread); - addPixelSegmentToEventKernel addPixelSegmentToEvent_kernel; - auto const addPixelSegmentToEvent_task(alpaka::createTaskKernel(addPixelSegmentToEvent_workdiv, - addPixelSegmentToEvent_kernel, - *modulesBuffers_->data(), - *rangesInGPU, - *hitsInGPU, - *mdsInGPU, - *segmentsInGPU, - alpaka::getPtrNative(hitIndices0_dev), - alpaka::getPtrNative(hitIndices1_dev), - alpaka::getPtrNative(hitIndices2_dev), - alpaka::getPtrNative(hitIndices3_dev), - alpaka::getPtrNative(dPhiChange_dev), - pixelModuleIndex, - size)); - - alpaka::enqueue(queue, addPixelSegmentToEvent_task); - alpaka::wait(queue); + alpaka::exec(queue, + addPixelSegmentToEvent_workdiv, + AddPixelSegmentToEventKernel{}, + *modulesBuffers_.data(), + *rangesInGPU, + *hitsInGPU, + *mdsInGPU, + *segmentsInGPU, + hitIndices0_dev.data(), + hitIndices1_dev.data(), + hitIndices2_dev.data(), + hitIndices3_dev.data(), + dPhiChange_dev.data(), + pixelModuleIndex, + size); } -void lst::Event::createMiniDoublets() { +void Event::createMiniDoublets() { // Create a view for the element nLowerModules_ inside rangesBuffers->miniDoubletModuleOccupancy auto dst_view_miniDoubletModuleOccupancy = alpaka::createSubView(rangesBuffers->miniDoubletModuleOccupancy_buf, (Idx)1u, (Idx)nLowerModules_); - // Create a source view for the value to be set - int value = n_max_pixel_md_per_modules; - auto src_view_value = alpaka::createView(devHost, &value, (Idx)1u); + // Create a host buffer for a value to be passed to the device + auto pixelMaxMDs_buf_h = 
cms::alpakatools::make_host_buffer(queue, (Idx)1u); + *pixelMaxMDs_buf_h.data() = n_max_pixel_md_per_modules; - alpaka::memcpy(queue, dst_view_miniDoubletModuleOccupancy, src_view_value); - alpaka::wait(queue); + alpaka::memcpy(queue, dst_view_miniDoubletModuleOccupancy, pixelMaxMDs_buf_h); - Vec3D const threadsPerBlockCreateMD{1, 1, 1024}; - Vec3D const blocksPerGridCreateMD{1, 1, 1}; - WorkDiv3D const createMDArrayRangesGPU_workDiv = - createWorkDiv(blocksPerGridCreateMD, threadsPerBlockCreateMD, elementsPerThread); + WorkDiv1D const createMDArrayRangesGPU_workDiv = createWorkDiv({1}, {1024}, {1}); - lst::createMDArrayRangesGPU createMDArrayRangesGPU_kernel; - auto const createMDArrayRangesGPUTask(alpaka::createTaskKernel( - createMDArrayRangesGPU_workDiv, createMDArrayRangesGPU_kernel, *modulesBuffers_->data(), *rangesInGPU)); + alpaka::exec( + queue, createMDArrayRangesGPU_workDiv, CreateMDArrayRangesGPU{}, *modulesBuffers_.data(), *rangesInGPU); - alpaka::enqueue(queue, createMDArrayRangesGPUTask); - alpaka::wait(queue); + auto nTotalMDs_buf_h = cms::alpakatools::make_host_buffer(queue, (Idx)1u); + alpaka::memcpy(queue, nTotalMDs_buf_h, rangesBuffers->device_nTotalMDs_buf); + alpaka::wait(queue); // wait to get the data before manipulation - auto nTotalMDs_buf = allocBufWrapper(devHost, 1, queue); - - alpaka::memcpy(queue, nTotalMDs_buf, rangesBuffers->device_nTotalMDs_buf); - alpaka::wait(queue); - - unsigned int nTotalMDs = *alpaka::getPtrNative(nTotalMDs_buf); - - nTotalMDs += n_max_pixel_md_per_modules; + *nTotalMDs_buf_h.data() += n_max_pixel_md_per_modules; + unsigned int nTotalMDs = *nTotalMDs_buf_h.data(); if (mdsInGPU == nullptr) { - mdsInGPU = new lst::MiniDoublets(); - miniDoubletsBuffers = new lst::MiniDoubletsBuffer(nTotalMDs, nLowerModules_, devAcc, queue); + mdsInGPU = new MiniDoublets(); + miniDoubletsBuffers = new MiniDoubletsBuffer(nTotalMDs, nLowerModules_, devAcc, queue); mdsInGPU->setData(*miniDoubletsBuffers); } @@ -437,43 +407,34 @@ void 
lst::Event::createMiniDoublets() { WorkDiv3D const createMiniDoubletsInGPUv2_workDiv = createWorkDiv(blocksPerGridCreateMDInGPU, threadsPerBlockCreateMDInGPU, elementsPerThread); - lst::createMiniDoubletsInGPUv2 createMiniDoubletsInGPUv2_kernel; - auto const createMiniDoubletsInGPUv2Task(alpaka::createTaskKernel(createMiniDoubletsInGPUv2_workDiv, - createMiniDoubletsInGPUv2_kernel, - *modulesBuffers_->data(), - *hitsInGPU, - *mdsInGPU, - *rangesInGPU)); - - alpaka::enqueue(queue, createMiniDoubletsInGPUv2Task); - - Vec3D const threadsPerBlockAddMD{1, 1, 1024}; - Vec3D const blocksPerGridAddMD{1, 1, 1}; - WorkDiv3D const addMiniDoubletRangesToEventExplicit_workDiv = - createWorkDiv(blocksPerGridAddMD, threadsPerBlockAddMD, elementsPerThread); - - lst::addMiniDoubletRangesToEventExplicit addMiniDoubletRangesToEventExplicit_kernel; - auto const addMiniDoubletRangesToEventExplicitTask( - alpaka::createTaskKernel(addMiniDoubletRangesToEventExplicit_workDiv, - addMiniDoubletRangesToEventExplicit_kernel, - *modulesBuffers_->data(), - *mdsInGPU, - *rangesInGPU, - *hitsInGPU)); - - alpaka::enqueue(queue, addMiniDoubletRangesToEventExplicitTask); - alpaka::wait(queue); + alpaka::exec(queue, + createMiniDoubletsInGPUv2_workDiv, + CreateMiniDoubletsInGPUv2{}, + *modulesBuffers_.data(), + *hitsInGPU, + *mdsInGPU, + *rangesInGPU); + + WorkDiv1D const addMiniDoubletRangesToEventExplicit_workDiv = createWorkDiv({1}, {1024}, {1}); + + alpaka::exec(queue, + addMiniDoubletRangesToEventExplicit_workDiv, + AddMiniDoubletRangesToEventExplicit{}, + *modulesBuffers_.data(), + *mdsInGPU, + *rangesInGPU, + *hitsInGPU); if (addObjects) { addMiniDoubletsToEventExplicit(); } } -void lst::Event::createSegmentsWithModuleMap() { +void Event::createSegmentsWithModuleMap() { if (segmentsInGPU == nullptr) { - segmentsInGPU = new lst::Segments(); + segmentsInGPU = new Segments(); segmentsBuffers = - new lst::SegmentsBuffer(nTotalSegments, nLowerModules_, n_max_pixel_segments_per_module, devAcc, 
queue); + new SegmentsBuffer(nTotalSegments_, nLowerModules_, n_max_pixel_segments_per_module, devAcc, queue); segmentsInGPU->setData(*segmentsBuffers); } @@ -482,92 +443,72 @@ void lst::Event::createSegmentsWithModuleMap() { WorkDiv3D const createSegmentsInGPUv2_workDiv = createWorkDiv(blocksPerGridCreateSeg, threadsPerBlockCreateSeg, elementsPerThread); - lst::createSegmentsInGPUv2 createSegmentsInGPUv2_kernel; - auto const createSegmentsInGPUv2Task(alpaka::createTaskKernel(createSegmentsInGPUv2_workDiv, - createSegmentsInGPUv2_kernel, - *modulesBuffers_->data(), - *mdsInGPU, - *segmentsInGPU, - *rangesInGPU)); - - alpaka::enqueue(queue, createSegmentsInGPUv2Task); - - Vec3D const threadsPerBlockAddSeg{1, 1, 1024}; - Vec3D const blocksPerGridAddSeg{1, 1, 1}; - WorkDiv3D const addSegmentRangesToEventExplicit_workDiv = - createWorkDiv(blocksPerGridAddSeg, threadsPerBlockAddSeg, elementsPerThread); - - lst::addSegmentRangesToEventExplicit addSegmentRangesToEventExplicit_kernel; - auto const addSegmentRangesToEventExplicitTask( - alpaka::createTaskKernel(addSegmentRangesToEventExplicit_workDiv, - addSegmentRangesToEventExplicit_kernel, - *modulesBuffers_->data(), - *segmentsInGPU, - *rangesInGPU)); - - alpaka::enqueue(queue, addSegmentRangesToEventExplicitTask); - alpaka::wait(queue); + alpaka::exec(queue, + createSegmentsInGPUv2_workDiv, + CreateSegmentsInGPUv2{}, + *modulesBuffers_.data(), + *mdsInGPU, + *segmentsInGPU, + *rangesInGPU); + + WorkDiv1D const addSegmentRangesToEventExplicit_workDiv = createWorkDiv({1}, {1024}, {1}); + + alpaka::exec(queue, + addSegmentRangesToEventExplicit_workDiv, + AddSegmentRangesToEventExplicit{}, + *modulesBuffers_.data(), + *segmentsInGPU, + *rangesInGPU); if (addObjects) { addSegmentsToEventExplicit(); } } -void lst::Event::createTriplets() { +void Event::createTriplets() { if (tripletsInGPU == nullptr) { - Vec3D const threadsPerBlockCreateTrip{1, 1, 1024}; - Vec3D const blocksPerGridCreateTrip{1, 1, 1}; - WorkDiv3D const 
createTripletArrayRanges_workDiv = - createWorkDiv(blocksPerGridCreateTrip, threadsPerBlockCreateTrip, elementsPerThread); - - lst::createTripletArrayRanges createTripletArrayRanges_kernel; - auto const createTripletArrayRangesTask(alpaka::createTaskKernel(createTripletArrayRanges_workDiv, - createTripletArrayRanges_kernel, - *modulesBuffers_->data(), - *rangesInGPU, - *segmentsInGPU)); - - alpaka::enqueue(queue, createTripletArrayRangesTask); - alpaka::wait(queue); + WorkDiv1D const createTripletArrayRanges_workDiv = createWorkDiv({1}, {1024}, {1}); + + alpaka::exec(queue, + createTripletArrayRanges_workDiv, + CreateTripletArrayRanges{}, + *modulesBuffers_.data(), + *rangesInGPU, + *segmentsInGPU); // TODO: Why are we pulling this back down only to put it back on the device in a new struct? - auto maxTriplets_buf = allocBufWrapper(devHost, 1, queue); + auto maxTriplets_buf_h = cms::alpakatools::make_host_buffer(queue, (Idx)1u); - alpaka::memcpy(queue, maxTriplets_buf, rangesBuffers->device_nTotalTrips_buf); - alpaka::wait(queue); + alpaka::memcpy(queue, maxTriplets_buf_h, rangesBuffers->device_nTotalTrips_buf); + alpaka::wait(queue); // wait to get the value before using it - tripletsInGPU = new lst::Triplets(); - tripletsBuffers = - new lst::TripletsBuffer(*alpaka::getPtrNative(maxTriplets_buf), nLowerModules_, devAcc, queue); + tripletsInGPU = new Triplets(); + tripletsBuffers = new TripletsBuffer(*maxTriplets_buf_h.data(), nLowerModules_, devAcc, queue); tripletsInGPU->setData(*tripletsBuffers); - alpaka::memcpy(queue, tripletsBuffers->nMemoryLocations_buf, maxTriplets_buf); - alpaka::wait(queue); + alpaka::memcpy(queue, tripletsBuffers->nMemoryLocations_buf, maxTriplets_buf_h); } uint16_t nonZeroModules = 0; unsigned int max_InnerSeg = 0; - // Allocate host index - auto index_buf = allocBufWrapper(devHost, nLowerModules_, queue); - uint16_t* index = alpaka::getPtrNative(index_buf); + // Allocate and copy nSegments from device to host (only nLowerModules in OT, 
not the +1 with pLSs) + auto nSegments_buf_h = cms::alpakatools::make_host_buffer(queue, nLowerModules_); + alpaka::memcpy(queue, nSegments_buf_h, segmentsBuffers->nSegments_buf, nLowerModules_); - // Allocate device index - auto index_gpu_buf = allocBufWrapper(devAcc, nLowerModules_, queue); + // ... same for module_nConnectedModules + // FIXME: replace by ES host data + auto module_nConnectedModules_buf_h = cms::alpakatools::make_host_buffer(queue, nLowerModules_); + alpaka::memcpy(queue, module_nConnectedModules_buf_h, modulesBuffers_.nConnectedModules_buf, nLowerModules_); - // Allocate and copy nSegments from device to host - auto nSegments_buf = allocBufWrapper(devHost, nLowerModules_, queue); - alpaka::memcpy(queue, nSegments_buf, segmentsBuffers->nSegments_buf, nLowerModules_); - alpaka::wait(queue); + alpaka::wait(queue); // wait for nSegments and module_nConnectedModules before using - unsigned int* nSegments = alpaka::getPtrNative(nSegments_buf); + auto const* nSegments = nSegments_buf_h.data(); + auto const* module_nConnectedModules = module_nConnectedModules_buf_h.data(); - // Allocate and copy module_nConnectedModules from device to host - auto module_nConnectedModules_buf = allocBufWrapper(devHost, nLowerModules_, queue); - alpaka::memcpy(queue, module_nConnectedModules_buf, modulesBuffers_->nConnectedModules_buf, nLowerModules_); - alpaka::wait(queue); - - uint16_t* module_nConnectedModules = alpaka::getPtrNative(module_nConnectedModules_buf); + // Allocate host index and fill it directly + auto index_buf_h = cms::alpakatools::make_host_buffer(queue, nLowerModules_); + auto* index = index_buf_h.data(); for (uint16_t innerLowerModuleIndex = 0; innerLowerModuleIndex < nLowerModules_; innerLowerModuleIndex++) { uint16_t nConnectedModules = module_nConnectedModules[innerLowerModuleIndex]; @@ -579,139 +520,116 @@ void lst::Event::createTriplets() { max_InnerSeg = std::max(max_InnerSeg, nInnerSegments); } - // Copy index from host to device - 
alpaka::memcpy(queue, index_gpu_buf, index_buf, nonZeroModules); - alpaka::wait(queue); + // Allocate and copy to device index + auto index_gpu_buf = allocBufWrapper(devAcc, nLowerModules_, queue); + alpaka::memcpy(queue, index_gpu_buf, index_buf_h, nonZeroModules); Vec3D const threadsPerBlockCreateTrip{1, 16, 16}; Vec3D const blocksPerGridCreateTrip{max_blocks, 1, 1}; WorkDiv3D const createTripletsInGPUv2_workDiv = createWorkDiv(blocksPerGridCreateTrip, threadsPerBlockCreateTrip, elementsPerThread); - lst::createTripletsInGPUv2 createTripletsInGPUv2_kernel; - auto const createTripletsInGPUv2Task(alpaka::createTaskKernel(createTripletsInGPUv2_workDiv, - createTripletsInGPUv2_kernel, - *modulesBuffers_->data(), - *mdsInGPU, - *segmentsInGPU, - *tripletsInGPU, - *rangesInGPU, - alpaka::getPtrNative(index_gpu_buf), - nonZeroModules)); - - alpaka::enqueue(queue, createTripletsInGPUv2Task); - - Vec3D const threadsPerBlockAddTrip{1, 1, 1024}; - Vec3D const blocksPerGridAddTrip{1, 1, 1}; - WorkDiv3D const addTripletRangesToEventExplicit_workDiv = - createWorkDiv(blocksPerGridAddTrip, threadsPerBlockAddTrip, elementsPerThread); - - lst::addTripletRangesToEventExplicit addTripletRangesToEventExplicit_kernel; - auto const addTripletRangesToEventExplicitTask( - alpaka::createTaskKernel(addTripletRangesToEventExplicit_workDiv, - addTripletRangesToEventExplicit_kernel, - *modulesBuffers_->data(), - *tripletsInGPU, - *rangesInGPU)); - - alpaka::enqueue(queue, addTripletRangesToEventExplicitTask); - alpaka::wait(queue); + alpaka::exec(queue, + createTripletsInGPUv2_workDiv, + CreateTripletsInGPUv2{}, + *modulesBuffers_.data(), + *mdsInGPU, + *segmentsInGPU, + *tripletsInGPU, + *rangesInGPU, + index_gpu_buf.data(), + nonZeroModules); + + WorkDiv1D const addTripletRangesToEventExplicit_workDiv = createWorkDiv({1}, {1024}, {1}); + + alpaka::exec(queue, + addTripletRangesToEventExplicit_workDiv, + AddTripletRangesToEventExplicit{}, + *modulesBuffers_.data(), + *tripletsInGPU, + 
*rangesInGPU); if (addObjects) { addTripletsToEventExplicit(); } } -void lst::Event::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets) { +void Event::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets) { if (trackCandidatesInGPU == nullptr) { - trackCandidatesInGPU = new lst::TrackCandidates(); - trackCandidatesBuffers = new lst::TrackCandidatesBuffer( + trackCandidatesInGPU = new TrackCandidates(); + trackCandidatesBuffers = new TrackCandidatesBuffer( n_max_nonpixel_track_candidates + n_max_pixel_track_candidates, devAcc, queue); trackCandidatesInGPU->setData(*trackCandidatesBuffers); } - // Pull nEligibleT5Modules from the device. - auto nEligibleModules_buf = allocBufWrapper(devHost, 1, queue); - alpaka::memcpy(queue, nEligibleModules_buf, rangesBuffers->nEligibleT5Modules_buf); - alpaka::wait(queue); - uint16_t nEligibleModules = *alpaka::getPtrNative(nEligibleModules_buf); - Vec3D const threadsPerBlock_crossCleanpT3{1, 16, 64}; Vec3D const blocksPerGrid_crossCleanpT3{1, 4, 20}; WorkDiv3D const crossCleanpT3_workDiv = createWorkDiv(blocksPerGrid_crossCleanpT3, threadsPerBlock_crossCleanpT3, elementsPerThread); - lst::crossCleanpT3 crossCleanpT3_kernel; - auto const crossCleanpT3Task(alpaka::createTaskKernel(crossCleanpT3_workDiv, - crossCleanpT3_kernel, - *modulesBuffers_->data(), - *rangesInGPU, - *pixelTripletsInGPU, - *segmentsInGPU, - *pixelQuintupletsInGPU)); - - alpaka::enqueue(queue, crossCleanpT3Task); - - Vec3D const threadsPerBlock_addpT3asTrackCandidatesInGPU{1, 1, 512}; - Vec3D const blocksPerGrid_addpT3asTrackCandidatesInGPU{1, 1, 1}; - WorkDiv3D const addpT3asTrackCandidatesInGPU_workDiv = createWorkDiv( - blocksPerGrid_addpT3asTrackCandidatesInGPU, threadsPerBlock_addpT3asTrackCandidatesInGPU, elementsPerThread); - - lst::addpT3asTrackCandidatesInGPU addpT3asTrackCandidatesInGPU_kernel; - auto const addpT3asTrackCandidatesInGPUTask(alpaka::createTaskKernel(addpT3asTrackCandidatesInGPU_workDiv, - 
addpT3asTrackCandidatesInGPU_kernel, - nLowerModules_, - *pixelTripletsInGPU, - *trackCandidatesInGPU, - *segmentsInGPU, - *rangesInGPU)); - - alpaka::enqueue(queue, addpT3asTrackCandidatesInGPUTask); + alpaka::exec(queue, + crossCleanpT3_workDiv, + CrossCleanpT3{}, + *modulesBuffers_.data(), + *rangesInGPU, + *pixelTripletsInGPU, + *segmentsInGPU, + *pixelQuintupletsInGPU); + + WorkDiv1D const addpT3asTrackCandidatesInGPU_workDiv = createWorkDiv({1}, {512}, {1}); + + alpaka::exec(queue, + addpT3asTrackCandidatesInGPU_workDiv, + AddpT3asTrackCandidatesInGPU{}, + nLowerModules_, + *pixelTripletsInGPU, + *trackCandidatesInGPU, + *segmentsInGPU, + *rangesInGPU); + + // Pull nEligibleT5Modules from the device. + auto nEligibleModules_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); + alpaka::memcpy(queue, nEligibleModules_buf_h, rangesBuffers->nEligibleT5Modules_buf); + alpaka::wait(queue); // wait to get the value before using + auto const nEligibleModules = *nEligibleModules_buf_h.data(); Vec3D const threadsPerBlockRemoveDupQuints{1, 16, 32}; Vec3D const blocksPerGridRemoveDupQuints{1, std::max(nEligibleModules / 16, 1), std::max(nEligibleModules / 32, 1)}; WorkDiv3D const removeDupQuintupletsInGPUBeforeTC_workDiv = createWorkDiv(blocksPerGridRemoveDupQuints, threadsPerBlockRemoveDupQuints, elementsPerThread); - lst::removeDupQuintupletsInGPUBeforeTC removeDupQuintupletsInGPUBeforeTC_kernel; - auto const removeDupQuintupletsInGPUBeforeTCTask( - alpaka::createTaskKernel(removeDupQuintupletsInGPUBeforeTC_workDiv, - removeDupQuintupletsInGPUBeforeTC_kernel, - *quintupletsInGPU, - *rangesInGPU)); - - alpaka::enqueue(queue, removeDupQuintupletsInGPUBeforeTCTask); + alpaka::exec(queue, + removeDupQuintupletsInGPUBeforeTC_workDiv, + RemoveDupQuintupletsInGPUBeforeTC{}, + *quintupletsInGPU, + *rangesInGPU); Vec3D const threadsPerBlock_crossCleanT5{32, 1, 32}; Vec3D const blocksPerGrid_crossCleanT5{(13296 / 32) + 1, 1, max_blocks}; WorkDiv3D const crossCleanT5_workDiv = 
createWorkDiv(blocksPerGrid_crossCleanT5, threadsPerBlock_crossCleanT5, elementsPerThread); - lst::crossCleanT5 crossCleanT5_kernel; - auto const crossCleanT5Task(alpaka::createTaskKernel(crossCleanT5_workDiv, - crossCleanT5_kernel, - *modulesBuffers_->data(), - *quintupletsInGPU, - *pixelQuintupletsInGPU, - *pixelTripletsInGPU, - *rangesInGPU)); - - alpaka::enqueue(queue, crossCleanT5Task); + alpaka::exec(queue, + crossCleanT5_workDiv, + CrossCleanT5{}, + *modulesBuffers_.data(), + *quintupletsInGPU, + *pixelQuintupletsInGPU, + *pixelTripletsInGPU, + *rangesInGPU); Vec3D const threadsPerBlock_addT5asTrackCandidateInGPU{1, 8, 128}; Vec3D const blocksPerGrid_addT5asTrackCandidateInGPU{1, 8, 10}; WorkDiv3D const addT5asTrackCandidateInGPU_workDiv = createWorkDiv( blocksPerGrid_addT5asTrackCandidateInGPU, threadsPerBlock_addT5asTrackCandidateInGPU, elementsPerThread); - lst::addT5asTrackCandidateInGPU addT5asTrackCandidateInGPU_kernel; - auto const addT5asTrackCandidateInGPUTask(alpaka::createTaskKernel(addT5asTrackCandidateInGPU_workDiv, - addT5asTrackCandidateInGPU_kernel, - nLowerModules_, - *quintupletsInGPU, - *trackCandidatesInGPU, - *rangesInGPU)); - - alpaka::enqueue(queue, addT5asTrackCandidateInGPUTask); + alpaka::exec(queue, + addT5asTrackCandidateInGPU_workDiv, + AddT5asTrackCandidateInGPU{}, + nLowerModules_, + *quintupletsInGPU, + *trackCandidatesInGPU, + *rangesInGPU); if (!no_pls_dupclean) { Vec3D const threadsPerBlockCheckHitspLS{1, 16, 16}; @@ -719,11 +637,7 @@ void lst::Event::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_ WorkDiv3D const checkHitspLS_workDiv = createWorkDiv(blocksPerGridCheckHitspLS, threadsPerBlockCheckHitspLS, elementsPerThread); - lst::checkHitspLS checkHitspLS_kernel; - auto const checkHitspLSTask(alpaka::createTaskKernel( - checkHitspLS_workDiv, checkHitspLS_kernel, *modulesBuffers_->data(), *segmentsInGPU, true)); - - alpaka::enqueue(queue, checkHitspLSTask); + alpaka::exec(queue, checkHitspLS_workDiv, 
CheckHitspLS{}, *modulesBuffers_.data(), *segmentsInGPU, true); } Vec3D const threadsPerBlock_crossCleanpLS{1, 16, 32}; @@ -731,34 +645,30 @@ void lst::Event::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_ WorkDiv3D const crossCleanpLS_workDiv = createWorkDiv(blocksPerGrid_crossCleanpLS, threadsPerBlock_crossCleanpLS, elementsPerThread); - lst::crossCleanpLS crossCleanpLS_kernel; - auto const crossCleanpLSTask(alpaka::createTaskKernel(crossCleanpLS_workDiv, - crossCleanpLS_kernel, - *modulesBuffers_->data(), - *rangesInGPU, - *pixelTripletsInGPU, - *trackCandidatesInGPU, - *segmentsInGPU, - *mdsInGPU, - *hitsInGPU, - *quintupletsInGPU)); - - alpaka::enqueue(queue, crossCleanpLSTask); + alpaka::exec(queue, + crossCleanpLS_workDiv, + CrossCleanpLS{}, + *modulesBuffers_.data(), + *rangesInGPU, + *pixelTripletsInGPU, + *trackCandidatesInGPU, + *segmentsInGPU, + *mdsInGPU, + *hitsInGPU, + *quintupletsInGPU); Vec3D const threadsPerBlock_addpLSasTrackCandidateInGPU{1, 1, 384}; Vec3D const blocksPerGrid_addpLSasTrackCandidateInGPU{1, 1, max_blocks}; WorkDiv3D const addpLSasTrackCandidateInGPU_workDiv = createWorkDiv( blocksPerGrid_addpLSasTrackCandidateInGPU, threadsPerBlock_addpLSasTrackCandidateInGPU, elementsPerThread); - lst::addpLSasTrackCandidateInGPU addpLSasTrackCandidateInGPU_kernel; - auto const addpLSasTrackCandidateInGPUTask(alpaka::createTaskKernel(addpLSasTrackCandidateInGPU_workDiv, - addpLSasTrackCandidateInGPU_kernel, - nLowerModules_, - *trackCandidatesInGPU, - *segmentsInGPU, - tc_pls_triplets)); - - alpaka::enqueue(queue, addpLSasTrackCandidateInGPUTask); + alpaka::exec(queue, + addpLSasTrackCandidateInGPU_workDiv, + AddpLSasTrackCandidateInGPU{}, + nLowerModules_, + *trackCandidatesInGPU, + *segmentsInGPU, + tc_pls_triplets); // Check if either n_max_pixel_track_candidates or n_max_nonpixel_track_candidates was reached auto nTrackCanpT5Host_buf = allocBufWrapper(devHost, 1, queue); @@ -769,12 +679,12 @@ void 
lst::Event::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_ alpaka::memcpy(queue, nTrackCanpT3Host_buf, trackCandidatesBuffers->nTrackCandidatespT3_buf); alpaka::memcpy(queue, nTrackCanpLSHost_buf, trackCandidatesBuffers->nTrackCandidatespLS_buf); alpaka::memcpy(queue, nTrackCanT5Host_buf, trackCandidatesBuffers->nTrackCandidatesT5_buf); - alpaka::wait(queue); + alpaka::wait(queue); // wait to get the values before using them - int nTrackCandidatespT5 = *alpaka::getPtrNative(nTrackCanpT5Host_buf); - int nTrackCandidatespT3 = *alpaka::getPtrNative(nTrackCanpT3Host_buf); - int nTrackCandidatespLS = *alpaka::getPtrNative(nTrackCanpLSHost_buf); - int nTrackCandidatesT5 = *alpaka::getPtrNative(nTrackCanT5Host_buf); + auto nTrackCandidatespT5 = *nTrackCanpT5Host_buf.data(); + auto nTrackCandidatespT3 = *nTrackCanpT3Host_buf.data(); + auto nTrackCandidatespLS = *nTrackCanpLSHost_buf.data(); + auto nTrackCandidatesT5 = *nTrackCanT5Host_buf.data(); if ((nTrackCandidatespT5 + nTrackCandidatespT3 + nTrackCandidatespLS == n_max_pixel_track_candidates) || (nTrackCandidatesT5 == n_max_nonpixel_track_candidates)) { printf( @@ -786,43 +696,42 @@ void lst::Event::createTrackCandidates(bool no_pls_dupclean, bool tc_pls_ } } -void lst::Event::createPixelTriplets() { +void Event::createPixelTriplets() { if (pixelTripletsInGPU == nullptr) { - pixelTripletsInGPU = new lst::PixelTriplets(); - pixelTripletsBuffers = new lst::PixelTripletsBuffer(n_max_pixel_triplets, devAcc, queue); + pixelTripletsInGPU = new PixelTriplets(); + pixelTripletsBuffers = new PixelTripletsBuffer(n_max_pixel_triplets, devAcc, queue); pixelTripletsInGPU->setData(*pixelTripletsBuffers); } + auto superbins_buf = allocBufWrapper(devHost, n_max_pixel_segments_per_module, queue); + auto pixelTypes_buf = allocBufWrapper(devHost, n_max_pixel_segments_per_module, queue); + + alpaka::memcpy(queue, superbins_buf, segmentsBuffers->superbin_buf); + alpaka::memcpy(queue, pixelTypes_buf, 
segmentsBuffers->pixelType_buf); + auto const* superbins = superbins_buf.data(); + auto const* pixelTypes = pixelTypes_buf.data(); + unsigned int nInnerSegments; auto nInnerSegments_src_view = alpaka::createView(devHost, &nInnerSegments, (size_t)1u); + // Create a sub-view for the device buffer auto dev_view_nSegments = alpaka::createSubView(segmentsBuffers->nSegments_buf, (Idx)1u, (Idx)nLowerModules_); alpaka::memcpy(queue, nInnerSegments_src_view, dev_view_nSegments); - alpaka::wait(queue); - - auto superbins_buf = allocBufWrapper(devHost, n_max_pixel_segments_per_module, queue); - auto pixelTypes_buf = allocBufWrapper(devHost, n_max_pixel_segments_per_module, queue); - - alpaka::memcpy(queue, superbins_buf, segmentsBuffers->superbin_buf); - alpaka::memcpy(queue, pixelTypes_buf, segmentsBuffers->pixelType_buf); - alpaka::wait(queue); + alpaka::wait(queue); // wait to get nInnerSegments (also superbins and pixelTypes) before using auto connectedPixelSize_host_buf = allocBufWrapper(devHost, nInnerSegments, queue); auto connectedPixelIndex_host_buf = allocBufWrapper(devHost, nInnerSegments, queue); auto connectedPixelSize_dev_buf = allocBufWrapper(devAcc, nInnerSegments, queue); auto connectedPixelIndex_dev_buf = allocBufWrapper(devAcc, nInnerSegments, queue); - int* superbins = alpaka::getPtrNative(superbins_buf); - int8_t* pixelTypes = alpaka::getPtrNative(pixelTypes_buf); - unsigned int* connectedPixelSize_host = alpaka::getPtrNative(connectedPixelSize_host_buf); - unsigned int* connectedPixelIndex_host = alpaka::getPtrNative(connectedPixelIndex_host_buf); - alpaka::wait(queue); + unsigned int* connectedPixelSize_host = connectedPixelSize_host_buf.data(); + unsigned int* connectedPixelIndex_host = connectedPixelIndex_host_buf.data(); int pixelIndexOffsetPos = - pixelMapping_->connectedPixelsIndex[size_superbins - 1] + pixelMapping_->connectedPixelsSizes[size_superbins - 1]; - int pixelIndexOffsetNeg = pixelMapping_->connectedPixelsIndexPos[size_superbins - 1] + - 
pixelMapping_->connectedPixelsSizesPos[size_superbins - 1] + pixelIndexOffsetPos; + pixelMapping_.connectedPixelsIndex[size_superbins - 1] + pixelMapping_.connectedPixelsSizes[size_superbins - 1]; + int pixelIndexOffsetNeg = pixelMapping_.connectedPixelsIndexPos[size_superbins - 1] + + pixelMapping_.connectedPixelsSizesPos[size_superbins - 1] + pixelIndexOffsetPos; // TODO: check if a map/reduction to just eligible pLSs would speed up the kernel // the current selection still leaves a significant fraction of unmatchable pLSs @@ -838,56 +747,51 @@ void lst::Event::createPixelTriplets() { // Used pixel type to select correct size-index arrays if (pixelType == 0) { connectedPixelSize_host[i] = - pixelMapping_->connectedPixelsSizes[superbin]; // number of connected modules to this pixel - auto connectedIdxBase = pixelMapping_->connectedPixelsIndex[superbin]; + pixelMapping_.connectedPixelsSizes[superbin]; // number of connected modules to this pixel + auto connectedIdxBase = pixelMapping_.connectedPixelsIndex[superbin]; connectedPixelIndex_host[i] = connectedIdxBase; // index to get start of connected modules for this superbin in map } else if (pixelType == 1) { connectedPixelSize_host[i] = - pixelMapping_->connectedPixelsSizesPos[superbin]; // number of pixel connected modules - auto connectedIdxBase = pixelMapping_->connectedPixelsIndexPos[superbin] + pixelIndexOffsetPos; + pixelMapping_.connectedPixelsSizesPos[superbin]; // number of pixel connected modules + auto connectedIdxBase = pixelMapping_.connectedPixelsIndexPos[superbin] + pixelIndexOffsetPos; connectedPixelIndex_host[i] = connectedIdxBase; // index to get start of connected pixel modules } else if (pixelType == 2) { connectedPixelSize_host[i] = - pixelMapping_->connectedPixelsSizesNeg[superbin]; // number of pixel connected modules - auto connectedIdxBase = pixelMapping_->connectedPixelsIndexNeg[superbin] + pixelIndexOffsetNeg; + pixelMapping_.connectedPixelsSizesNeg[superbin]; // number of pixel connected 
modules + auto connectedIdxBase = pixelMapping_.connectedPixelsIndexNeg[superbin] + pixelIndexOffsetNeg; connectedPixelIndex_host[i] = connectedIdxBase; // index to get start of connected pixel modules } } alpaka::memcpy(queue, connectedPixelSize_dev_buf, connectedPixelSize_host_buf, nInnerSegments); alpaka::memcpy(queue, connectedPixelIndex_dev_buf, connectedPixelIndex_host_buf, nInnerSegments); - alpaka::wait(queue); Vec3D const threadsPerBlock{1, 4, 32}; Vec3D const blocksPerGrid{16 /* above median of connected modules*/, 4096, 1}; WorkDiv3D const createPixelTripletsInGPUFromMapv2_workDiv = createWorkDiv(blocksPerGrid, threadsPerBlock, elementsPerThread); - lst::createPixelTripletsInGPUFromMapv2 createPixelTripletsInGPUFromMapv2_kernel; - auto const createPixelTripletsInGPUFromMapv2Task( - alpaka::createTaskKernel(createPixelTripletsInGPUFromMapv2_workDiv, - createPixelTripletsInGPUFromMapv2_kernel, - *modulesBuffers_->data(), - *rangesInGPU, - *mdsInGPU, - *segmentsInGPU, - *tripletsInGPU, - *pixelTripletsInGPU, - alpaka::getPtrNative(connectedPixelSize_dev_buf), - alpaka::getPtrNative(connectedPixelIndex_dev_buf), - nInnerSegments)); - - alpaka::enqueue(queue, createPixelTripletsInGPUFromMapv2Task); - alpaka::wait(queue); + alpaka::exec(queue, + createPixelTripletsInGPUFromMapv2_workDiv, + CreatePixelTripletsInGPUFromMapv2{}, + *modulesBuffers_.data(), + *rangesInGPU, + *mdsInGPU, + *segmentsInGPU, + *tripletsInGPU, + *pixelTripletsInGPU, + connectedPixelSize_dev_buf.data(), + connectedPixelIndex_dev_buf.data(), + nInnerSegments); #ifdef WARNINGS auto nPixelTriplets_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf); - alpaka::wait(queue); + alpaka::wait(queue); // wait to get the value before using it - std::cout << "number of pixel triplets = " << *alpaka::getPtrNative(nPixelTriplets_buf) << std::endl; + std::cout << "number of pixel triplets = " << *nPixelTriplets_buf.data() << 
std::endl; #endif //pT3s can be cleaned here because they're not used in making pT5s! @@ -897,48 +801,36 @@ void lst::Event::createPixelTriplets() { WorkDiv3D const removeDupPixelTripletsInGPUFromMap_workDiv = createWorkDiv(blocksPerGridDupPixTrip, threadsPerBlockDupPixTrip, elementsPerThread); - lst::removeDupPixelTripletsInGPUFromMap removeDupPixelTripletsInGPUFromMap_kernel; - auto const removeDupPixelTripletsInGPUFromMapTask(alpaka::createTaskKernel( - removeDupPixelTripletsInGPUFromMap_workDiv, removeDupPixelTripletsInGPUFromMap_kernel, *pixelTripletsInGPU)); - - alpaka::enqueue(queue, removeDupPixelTripletsInGPUFromMapTask); - alpaka::wait(queue); + alpaka::exec( + queue, removeDupPixelTripletsInGPUFromMap_workDiv, RemoveDupPixelTripletsInGPUFromMap{}, *pixelTripletsInGPU); } -void lst::Event::createQuintuplets() { - Vec3D const threadsPerBlockCreateQuints{1, 1, 1024}; - Vec3D const blocksPerGridCreateQuints{1, 1, 1}; - WorkDiv3D const createEligibleModulesListForQuintupletsGPU_workDiv = - createWorkDiv(blocksPerGridCreateQuints, threadsPerBlockCreateQuints, elementsPerThread); - - lst::createEligibleModulesListForQuintupletsGPU createEligibleModulesListForQuintupletsGPU_kernel; - auto const createEligibleModulesListForQuintupletsGPUTask( - alpaka::createTaskKernel(createEligibleModulesListForQuintupletsGPU_workDiv, - createEligibleModulesListForQuintupletsGPU_kernel, - *modulesBuffers_->data(), - *tripletsInGPU, - *rangesInGPU)); - - alpaka::enqueue(queue, createEligibleModulesListForQuintupletsGPUTask); - alpaka::wait(queue); +void Event::createQuintuplets() { + WorkDiv1D const createEligibleModulesListForQuintupletsGPU_workDiv = createWorkDiv({1}, {1024}, {1}); + + alpaka::exec(queue, + createEligibleModulesListForQuintupletsGPU_workDiv, + CreateEligibleModulesListForQuintupletsGPU{}, + *modulesBuffers_.data(), + *tripletsInGPU, + *rangesInGPU); auto nEligibleT5Modules_buf = allocBufWrapper(devHost, 1, queue); auto nTotalQuintuplets_buf = 
allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nEligibleT5Modules_buf, rangesBuffers->nEligibleT5Modules_buf); alpaka::memcpy(queue, nTotalQuintuplets_buf, rangesBuffers->device_nTotalQuints_buf); - alpaka::wait(queue); + alpaka::wait(queue); // wait for the values before using them - uint16_t nEligibleT5Modules = *alpaka::getPtrNative(nEligibleT5Modules_buf); - unsigned int nTotalQuintuplets = *alpaka::getPtrNative(nTotalQuintuplets_buf); + auto nEligibleT5Modules = *nEligibleT5Modules_buf.data(); + auto nTotalQuintuplets = *nTotalQuintuplets_buf.data(); if (quintupletsInGPU == nullptr) { - quintupletsInGPU = new lst::Quintuplets(); - quintupletsBuffers = new lst::QuintupletsBuffer(nTotalQuintuplets, nLowerModules_, devAcc, queue); + quintupletsInGPU = new Quintuplets(); + quintupletsBuffers = new QuintupletsBuffer(nTotalQuintuplets, nLowerModules_, devAcc, queue); quintupletsInGPU->setData(*quintupletsBuffers); alpaka::memcpy(queue, quintupletsBuffers->nMemoryLocations_buf, nTotalQuintuplets_buf); - alpaka::wait(queue); } Vec3D const threadsPerBlockQuints{1, 8, 32}; @@ -946,84 +838,75 @@ void lst::Event::createQuintuplets() { WorkDiv3D const createQuintupletsInGPUv2_workDiv = createWorkDiv(blocksPerGridQuints, threadsPerBlockQuints, elementsPerThread); - lst::createQuintupletsInGPUv2 createQuintupletsInGPUv2_kernel; - auto const createQuintupletsInGPUv2Task(alpaka::createTaskKernel(createQuintupletsInGPUv2_workDiv, - createQuintupletsInGPUv2_kernel, - *modulesBuffers_->data(), - *mdsInGPU, - *segmentsInGPU, - *tripletsInGPU, - *quintupletsInGPU, - *rangesInGPU, - nEligibleT5Modules)); - - alpaka::enqueue(queue, createQuintupletsInGPUv2Task); + alpaka::exec(queue, + createQuintupletsInGPUv2_workDiv, + CreateQuintupletsInGPUv2{}, + *modulesBuffers_.data(), + *mdsInGPU, + *segmentsInGPU, + *tripletsInGPU, + *quintupletsInGPU, + *rangesInGPU, + nEligibleT5Modules); Vec3D const threadsPerBlockDupQuint{1, 16, 16}; Vec3D const 
blocksPerGridDupQuint{max_blocks, 1, 1}; WorkDiv3D const removeDupQuintupletsInGPUAfterBuild_workDiv = createWorkDiv(blocksPerGridDupQuint, threadsPerBlockDupQuint, elementsPerThread); - lst::removeDupQuintupletsInGPUAfterBuild removeDupQuintupletsInGPUAfterBuild_kernel; - auto const removeDupQuintupletsInGPUAfterBuildTask( - alpaka::createTaskKernel(removeDupQuintupletsInGPUAfterBuild_workDiv, - removeDupQuintupletsInGPUAfterBuild_kernel, - *modulesBuffers_->data(), - *quintupletsInGPU, - *rangesInGPU)); - - alpaka::enqueue(queue, removeDupQuintupletsInGPUAfterBuildTask); - - Vec3D const threadsPerBlockAddQuint{1, 1, 1024}; - Vec3D const blocksPerGridAddQuint{1, 1, 1}; - WorkDiv3D const addQuintupletRangesToEventExplicit_workDiv = - createWorkDiv(blocksPerGridAddQuint, threadsPerBlockAddQuint, elementsPerThread); - - lst::addQuintupletRangesToEventExplicit addQuintupletRangesToEventExplicit_kernel; - auto const addQuintupletRangesToEventExplicitTask( - alpaka::createTaskKernel(addQuintupletRangesToEventExplicit_workDiv, - addQuintupletRangesToEventExplicit_kernel, - *modulesBuffers_->data(), - *quintupletsInGPU, - *rangesInGPU)); - - alpaka::enqueue(queue, addQuintupletRangesToEventExplicitTask); - alpaka::wait(queue); + alpaka::exec(queue, + removeDupQuintupletsInGPUAfterBuild_workDiv, + RemoveDupQuintupletsInGPUAfterBuild{}, + *modulesBuffers_.data(), + *quintupletsInGPU, + *rangesInGPU); + + WorkDiv1D const addQuintupletRangesToEventExplicit_workDiv = createWorkDiv({1}, {1024}, {1}); + + alpaka::exec(queue, + addQuintupletRangesToEventExplicit_workDiv, + AddQuintupletRangesToEventExplicit{}, + *modulesBuffers_.data(), + *quintupletsInGPU, + *rangesInGPU); if (addObjects) { addQuintupletsToEventExplicit(); } } -void lst::Event::pixelLineSegmentCleaning(bool no_pls_dupclean) { +void Event::pixelLineSegmentCleaning(bool no_pls_dupclean) { if (!no_pls_dupclean) { Vec3D const threadsPerBlockCheckHitspLS{1, 16, 16}; Vec3D const blocksPerGridCheckHitspLS{1, max_blocks 
* 4, max_blocks / 4}; WorkDiv3D const checkHitspLS_workDiv = createWorkDiv(blocksPerGridCheckHitspLS, threadsPerBlockCheckHitspLS, elementsPerThread); - lst::checkHitspLS checkHitspLS_kernel; - auto const checkHitspLSTask(alpaka::createTaskKernel( - checkHitspLS_workDiv, checkHitspLS_kernel, *modulesBuffers_->data(), *segmentsInGPU, false)); - - alpaka::enqueue(queue, checkHitspLSTask); - alpaka::wait(queue); + alpaka::exec(queue, checkHitspLS_workDiv, CheckHitspLS{}, *modulesBuffers_.data(), *segmentsInGPU, false); } } -void lst::Event::createPixelQuintuplets() { +void Event::createPixelQuintuplets() { if (pixelQuintupletsInGPU == nullptr) { - pixelQuintupletsInGPU = new lst::PixelQuintuplets(); - pixelQuintupletsBuffers = new lst::PixelQuintupletsBuffer(n_max_pixel_quintuplets, devAcc, queue); + pixelQuintupletsInGPU = new PixelQuintuplets(); + pixelQuintupletsBuffers = new PixelQuintupletsBuffer(n_max_pixel_quintuplets, devAcc, queue); pixelQuintupletsInGPU->setData(*pixelQuintupletsBuffers); } if (trackCandidatesInGPU == nullptr) { - trackCandidatesInGPU = new lst::TrackCandidates(); - trackCandidatesBuffers = new lst::TrackCandidatesBuffer( + trackCandidatesInGPU = new TrackCandidates(); + trackCandidatesBuffers = new TrackCandidatesBuffer( n_max_nonpixel_track_candidates + n_max_pixel_track_candidates, devAcc, queue); trackCandidatesInGPU->setData(*trackCandidatesBuffers); } + auto superbins_buf = allocBufWrapper(devHost, n_max_pixel_segments_per_module, queue); + auto pixelTypes_buf = allocBufWrapper(devHost, n_max_pixel_segments_per_module, queue); + + alpaka::memcpy(queue, superbins_buf, segmentsBuffers->superbin_buf); + alpaka::memcpy(queue, pixelTypes_buf, segmentsBuffers->pixelType_buf); + auto const* superbins = superbins_buf.data(); + auto const* pixelTypes = pixelTypes_buf.data(); + unsigned int nInnerSegments; auto nInnerSegments_src_view = alpaka::createView(devHost, &nInnerSegments, (size_t)1u); @@ -1031,36 +914,26 @@ void 
lst::Event::createPixelQuintuplets() { auto dev_view_nSegments = alpaka::createSubView(segmentsBuffers->nSegments_buf, (Idx)1u, (Idx)nLowerModules_); alpaka::memcpy(queue, nInnerSegments_src_view, dev_view_nSegments); - alpaka::wait(queue); - - auto superbins_buf = allocBufWrapper(devHost, n_max_pixel_segments_per_module, queue); - auto pixelTypes_buf = allocBufWrapper(devHost, n_max_pixel_segments_per_module, queue); - - alpaka::memcpy(queue, superbins_buf, segmentsBuffers->superbin_buf); - alpaka::memcpy(queue, pixelTypes_buf, segmentsBuffers->pixelType_buf); - alpaka::wait(queue); + alpaka::wait(queue); // wait to get nInnerSegments (also superbins and pixelTypes) before using auto connectedPixelSize_host_buf = allocBufWrapper(devHost, nInnerSegments, queue); auto connectedPixelIndex_host_buf = allocBufWrapper(devHost, nInnerSegments, queue); auto connectedPixelSize_dev_buf = allocBufWrapper(devAcc, nInnerSegments, queue); auto connectedPixelIndex_dev_buf = allocBufWrapper(devAcc, nInnerSegments, queue); - int* superbins = alpaka::getPtrNative(superbins_buf); - int8_t* pixelTypes = alpaka::getPtrNative(pixelTypes_buf); - unsigned int* connectedPixelSize_host = alpaka::getPtrNative(connectedPixelSize_host_buf); - unsigned int* connectedPixelIndex_host = alpaka::getPtrNative(connectedPixelIndex_host_buf); - alpaka::wait(queue); + auto* connectedPixelSize_host = connectedPixelSize_host_buf.data(); + auto* connectedPixelIndex_host = connectedPixelIndex_host_buf.data(); - int pixelIndexOffsetPos = - pixelMapping_->connectedPixelsIndex[size_superbins - 1] + pixelMapping_->connectedPixelsSizes[size_superbins - 1]; - int pixelIndexOffsetNeg = pixelMapping_->connectedPixelsIndexPos[size_superbins - 1] + - pixelMapping_->connectedPixelsSizesPos[size_superbins - 1] + pixelIndexOffsetPos; + int pixelIndexOffsetPos = pixelMapping_.connectedPixelsIndex[::size_superbins - 1] + + pixelMapping_.connectedPixelsSizes[::size_superbins - 1]; + int pixelIndexOffsetNeg = 
pixelMapping_.connectedPixelsIndexPos[::size_superbins - 1] + + pixelMapping_.connectedPixelsSizesPos[::size_superbins - 1] + pixelIndexOffsetPos; // Loop over # pLS for (unsigned int i = 0; i < nInnerSegments; i++) { int8_t pixelType = pixelTypes[i]; // Get pixel type for this pLS int superbin = superbins[i]; // Get superbin for this pixel - if ((superbin < 0) or (superbin >= (int)size_superbins) or (pixelType > 2) or (pixelType < 0)) { + if ((superbin < 0) or (superbin >= (int)::size_superbins) or (pixelType > 2) or (pixelType < 0)) { connectedPixelIndex_host[i] = 0; connectedPixelSize_host[i] = 0; continue; @@ -1068,111 +941,97 @@ void lst::Event::createPixelQuintuplets() { // Used pixel type to select correct size-index arrays if (pixelType == 0) { connectedPixelSize_host[i] = - pixelMapping_->connectedPixelsSizes[superbin]; //number of connected modules to this pixel - unsigned int connectedIdxBase = pixelMapping_->connectedPixelsIndex[superbin]; + pixelMapping_.connectedPixelsSizes[superbin]; //number of connected modules to this pixel + unsigned int connectedIdxBase = pixelMapping_.connectedPixelsIndex[superbin]; connectedPixelIndex_host[i] = connectedIdxBase; } else if (pixelType == 1) { - connectedPixelSize_host[i] = - pixelMapping_->connectedPixelsSizesPos[superbin]; //number of pixel connected modules - unsigned int connectedIdxBase = pixelMapping_->connectedPixelsIndexPos[superbin] + pixelIndexOffsetPos; + connectedPixelSize_host[i] = pixelMapping_.connectedPixelsSizesPos[superbin]; //number of pixel connected modules + unsigned int connectedIdxBase = pixelMapping_.connectedPixelsIndexPos[superbin] + pixelIndexOffsetPos; connectedPixelIndex_host[i] = connectedIdxBase; } else if (pixelType == 2) { - connectedPixelSize_host[i] = - pixelMapping_->connectedPixelsSizesNeg[superbin]; //number of pixel connected modules - unsigned int connectedIdxBase = pixelMapping_->connectedPixelsIndexNeg[superbin] + pixelIndexOffsetNeg; + connectedPixelSize_host[i] = 
pixelMapping_.connectedPixelsSizesNeg[superbin]; //number of pixel connected modules + unsigned int connectedIdxBase = pixelMapping_.connectedPixelsIndexNeg[superbin] + pixelIndexOffsetNeg; connectedPixelIndex_host[i] = connectedIdxBase; } } alpaka::memcpy(queue, connectedPixelSize_dev_buf, connectedPixelSize_host_buf, nInnerSegments); alpaka::memcpy(queue, connectedPixelIndex_dev_buf, connectedPixelIndex_host_buf, nInnerSegments); - alpaka::wait(queue); Vec3D const threadsPerBlockCreatePixQuints{1, 16, 16}; Vec3D const blocksPerGridCreatePixQuints{16, max_blocks, 1}; WorkDiv3D const createPixelQuintupletsInGPUFromMapv2_workDiv = createWorkDiv(blocksPerGridCreatePixQuints, threadsPerBlockCreatePixQuints, elementsPerThread); - lst::createPixelQuintupletsInGPUFromMapv2 createPixelQuintupletsInGPUFromMapv2_kernel; - auto const createPixelQuintupletsInGPUFromMapv2Task( - alpaka::createTaskKernel(createPixelQuintupletsInGPUFromMapv2_workDiv, - createPixelQuintupletsInGPUFromMapv2_kernel, - *modulesBuffers_->data(), - *mdsInGPU, - *segmentsInGPU, - *tripletsInGPU, - *quintupletsInGPU, - *pixelQuintupletsInGPU, - alpaka::getPtrNative(connectedPixelSize_dev_buf), - alpaka::getPtrNative(connectedPixelIndex_dev_buf), - nInnerSegments, - *rangesInGPU)); - - alpaka::enqueue(queue, createPixelQuintupletsInGPUFromMapv2Task); + alpaka::exec(queue, + createPixelQuintupletsInGPUFromMapv2_workDiv, + CreatePixelQuintupletsInGPUFromMapv2{}, + *modulesBuffers_.data(), + *mdsInGPU, + *segmentsInGPU, + *tripletsInGPU, + *quintupletsInGPU, + *pixelQuintupletsInGPU, + connectedPixelSize_dev_buf.data(), + connectedPixelIndex_dev_buf.data(), + nInnerSegments, + *rangesInGPU); Vec3D const threadsPerBlockDupPix{1, 16, 16}; Vec3D const blocksPerGridDupPix{1, max_blocks, 1}; WorkDiv3D const removeDupPixelQuintupletsInGPUFromMap_workDiv = createWorkDiv(blocksPerGridDupPix, threadsPerBlockDupPix, elementsPerThread); - lst::removeDupPixelQuintupletsInGPUFromMap 
removeDupPixelQuintupletsInGPUFromMap_kernel; - auto const removeDupPixelQuintupletsInGPUFromMapTask( - alpaka::createTaskKernel(removeDupPixelQuintupletsInGPUFromMap_workDiv, - removeDupPixelQuintupletsInGPUFromMap_kernel, - *pixelQuintupletsInGPU)); - - alpaka::enqueue(queue, removeDupPixelQuintupletsInGPUFromMapTask); - - Vec3D const threadsPerBlockAddpT5asTrackCan{1, 1, 256}; - Vec3D const blocksPerGridAddpT5asTrackCan{1, 1, 1}; - WorkDiv3D const addpT5asTrackCandidateInGPU_workDiv = - createWorkDiv(blocksPerGridAddpT5asTrackCan, threadsPerBlockAddpT5asTrackCan, elementsPerThread); - - lst::addpT5asTrackCandidateInGPU addpT5asTrackCandidateInGPU_kernel; - auto const addpT5asTrackCandidateInGPUTask(alpaka::createTaskKernel(addpT5asTrackCandidateInGPU_workDiv, - addpT5asTrackCandidateInGPU_kernel, - nLowerModules_, - *pixelQuintupletsInGPU, - *trackCandidatesInGPU, - *segmentsInGPU, - *rangesInGPU)); - - alpaka::enqueue(queue, addpT5asTrackCandidateInGPUTask); - alpaka::wait(queue); + alpaka::exec(queue, + removeDupPixelQuintupletsInGPUFromMap_workDiv, + RemoveDupPixelQuintupletsInGPUFromMap{}, + *pixelQuintupletsInGPU); + + WorkDiv1D const addpT5asTrackCandidateInGPU_workDiv = createWorkDiv({1}, {256}, {1}); + + alpaka::exec(queue, + addpT5asTrackCandidateInGPU_workDiv, + AddpT5asTrackCandidateInGPU{}, + nLowerModules_, + *pixelQuintupletsInGPU, + *trackCandidatesInGPU, + *segmentsInGPU, + *rangesInGPU); #ifdef WARNINGS auto nPixelQuintuplets_buf = allocBufWrapper(devHost, 1, queue); alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf); - alpaka::wait(queue); + alpaka::wait(queue); // wait to get the value before using it - std::cout << "number of pixel quintuplets = " << *alpaka::getPtrNative(nPixelQuintuplets_buf) << std::endl; + std::cout << "number of pixel quintuplets = " << *nPixelQuintuplets_buf.data() << std::endl; #endif } -void lst::Event::addMiniDoubletsToEventExplicit() { +void 
Event::addMiniDoubletsToEventExplicit() { auto nMDsCPU_buf = allocBufWrapper(devHost, nLowerModules_, queue); alpaka::memcpy(queue, nMDsCPU_buf, miniDoubletsBuffers->nMDs_buf, nLowerModules_); + // FIXME: replace by ES host data auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules_, queue); - alpaka::memcpy(queue, module_subdets_buf, modulesBuffers_->subdets_buf, nLowerModules_); + alpaka::memcpy(queue, module_subdets_buf, modulesBuffers_.subdets_buf, nLowerModules_); auto module_layers_buf = allocBufWrapper(devHost, nLowerModules_, queue); - alpaka::memcpy(queue, module_layers_buf, modulesBuffers_->layers_buf, nLowerModules_); + alpaka::memcpy(queue, module_layers_buf, modulesBuffers_.layers_buf, nLowerModules_); auto module_hitRanges_buf = allocBufWrapper(devHost, nLowerModules_ * 2, queue); alpaka::memcpy(queue, module_hitRanges_buf, hitsBuffers->hitRanges_buf, nLowerModules_ * 2u); - alpaka::wait(queue); + alpaka::wait(queue); // wait for inputs before using them - unsigned int* nMDsCPU = alpaka::getPtrNative(nMDsCPU_buf); - short* module_subdets = alpaka::getPtrNative(module_subdets_buf); - short* module_layers = alpaka::getPtrNative(module_layers_buf); - int* module_hitRanges = alpaka::getPtrNative(module_hitRanges_buf); + auto const* nMDsCPU = nMDsCPU_buf.data(); + auto const* module_subdets = module_subdets_buf.data(); + auto const* module_layers = module_layers_buf.data(); + auto const* module_hitRanges = module_hitRanges_buf.data(); for (unsigned int i = 0; i < nLowerModules_; i++) { if (!(nMDsCPU[i] == 0 or module_hitRanges[i * 2] == -1)) { - if (module_subdets[i] == Barrel) { + if (module_subdets[i] == ::lst::Barrel) { n_minidoublets_by_layer_barrel_[module_layers[i] - 1] += nMDsCPU[i]; } else { n_minidoublets_by_layer_endcap_[module_layers[i] - 1] += nMDsCPU[i]; @@ -1181,25 +1040,26 @@ void lst::Event::addMiniDoubletsToEventExplicit() { } } -void lst::Event::addSegmentsToEventExplicit() { +void Event::addSegmentsToEventExplicit() { auto 
nSegmentsCPU_buf = allocBufWrapper(devHost, nLowerModules_, queue); alpaka::memcpy(queue, nSegmentsCPU_buf, segmentsBuffers->nSegments_buf, nLowerModules_); + // FIXME: replace by ES host data auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules_, queue); - alpaka::memcpy(queue, module_subdets_buf, modulesBuffers_->subdets_buf, nLowerModules_); + alpaka::memcpy(queue, module_subdets_buf, modulesBuffers_.subdets_buf, nLowerModules_); auto module_layers_buf = allocBufWrapper(devHost, nLowerModules_, queue); - alpaka::memcpy(queue, module_layers_buf, modulesBuffers_->layers_buf, nLowerModules_); + alpaka::memcpy(queue, module_layers_buf, modulesBuffers_.layers_buf, nLowerModules_); - alpaka::wait(queue); + alpaka::wait(queue); // wait for inputs before using them - unsigned int* nSegmentsCPU = alpaka::getPtrNative(nSegmentsCPU_buf); - short* module_subdets = alpaka::getPtrNative(module_subdets_buf); - short* module_layers = alpaka::getPtrNative(module_layers_buf); + auto const* nSegmentsCPU = nSegmentsCPU_buf.data(); + auto const* module_subdets = module_subdets_buf.data(); + auto const* module_layers = module_layers_buf.data(); for (unsigned int i = 0; i < nLowerModules_; i++) { if (!(nSegmentsCPU[i] == 0)) { - if (module_subdets[i] == Barrel) { + if (module_subdets[i] == ::lst::Barrel) { n_segments_by_layer_barrel_[module_layers[i] - 1] += nSegmentsCPU[i]; } else { n_segments_by_layer_endcap_[module_layers[i] - 1] += nSegmentsCPU[i]; @@ -1208,29 +1068,30 @@ void lst::Event::addSegmentsToEventExplicit() { } } -void lst::Event::addQuintupletsToEventExplicit() { +void Event::addQuintupletsToEventExplicit() { auto nQuintupletsCPU_buf = allocBufWrapper(devHost, nLowerModules_, queue); alpaka::memcpy(queue, nQuintupletsCPU_buf, quintupletsBuffers->nQuintuplets_buf); + // FIXME: replace by ES host data auto module_subdets_buf = allocBufWrapper(devHost, nModules_, queue); - alpaka::memcpy(queue, module_subdets_buf, modulesBuffers_->subdets_buf, nModules_); + 
alpaka::memcpy(queue, module_subdets_buf, modulesBuffers_.subdets_buf, nModules_); auto module_layers_buf = allocBufWrapper(devHost, nLowerModules_, queue); - alpaka::memcpy(queue, module_layers_buf, modulesBuffers_->layers_buf, nLowerModules_); + alpaka::memcpy(queue, module_layers_buf, modulesBuffers_.layers_buf, nLowerModules_); auto module_quintupletModuleIndices_buf = allocBufWrapper(devHost, nLowerModules_, queue); alpaka::memcpy(queue, module_quintupletModuleIndices_buf, rangesBuffers->quintupletModuleIndices_buf); - alpaka::wait(queue); + alpaka::wait(queue); // wait for inputs before using them - unsigned int* nQuintupletsCPU = alpaka::getPtrNative(nQuintupletsCPU_buf); - short* module_subdets = alpaka::getPtrNative(module_subdets_buf); - short* module_layers = alpaka::getPtrNative(module_layers_buf); - int* module_quintupletModuleIndices = alpaka::getPtrNative(module_quintupletModuleIndices_buf); + auto const* nQuintupletsCPU = nQuintupletsCPU_buf.data(); + auto const* module_subdets = module_subdets_buf.data(); + auto const* module_layers = module_layers_buf.data(); + auto const* module_quintupletModuleIndices = module_quintupletModuleIndices_buf.data(); for (uint16_t i = 0; i < nLowerModules_; i++) { if (!(nQuintupletsCPU[i] == 0 or module_quintupletModuleIndices[i] == -1)) { - if (module_subdets[i] == Barrel) { + if (module_subdets[i] == ::lst::Barrel) { n_quintuplets_by_layer_barrel_[module_layers[i] - 1] += nQuintupletsCPU[i]; } else { n_quintuplets_by_layer_endcap_[module_layers[i] - 1] += nQuintupletsCPU[i]; @@ -1239,24 +1100,26 @@ void lst::Event::addQuintupletsToEventExplicit() { } } -void lst::Event::addTripletsToEventExplicit() { +void Event::addTripletsToEventExplicit() { auto nTripletsCPU_buf = allocBufWrapper(devHost, nLowerModules_, queue); alpaka::memcpy(queue, nTripletsCPU_buf, tripletsBuffers->nTriplets_buf); + // FIXME: replace by ES host data auto module_subdets_buf = allocBufWrapper(devHost, nLowerModules_, queue); - 
alpaka::memcpy(queue, module_subdets_buf, modulesBuffers_->subdets_buf, nLowerModules_); + alpaka::memcpy(queue, module_subdets_buf, modulesBuffers_.subdets_buf, nLowerModules_); auto module_layers_buf = allocBufWrapper(devHost, nLowerModules_, queue); - alpaka::memcpy(queue, module_layers_buf, modulesBuffers_->layers_buf, nLowerModules_); + alpaka::memcpy(queue, module_layers_buf, modulesBuffers_.layers_buf, nLowerModules_); - alpaka::wait(queue); - unsigned int* nTripletsCPU = alpaka::getPtrNative(nTripletsCPU_buf); - short* module_subdets = alpaka::getPtrNative(module_subdets_buf); - short* module_layers = alpaka::getPtrNative(module_layers_buf); + alpaka::wait(queue); // wait for inputs before using them + + auto const* nTripletsCPU = nTripletsCPU_buf.data(); + auto const* module_subdets = module_subdets_buf.data(); + auto const* module_layers = module_layers_buf.data(); for (uint16_t i = 0; i < nLowerModules_; i++) { if (nTripletsCPU[i] != 0) { - if (module_subdets[i] == Barrel) { + if (module_subdets[i] == ::lst::Barrel) { n_triplets_by_layer_barrel_[module_layers[i] - 1] += nTripletsCPU[i]; } else { n_triplets_by_layer_endcap_[module_layers[i] - 1] += nTripletsCPU[i]; @@ -1265,7 +1128,7 @@ void lst::Event::addTripletsToEventExplicit() { } } -unsigned int lst::Event::getNumberOfHits() { +unsigned int Event::getNumberOfHits() { unsigned int hits = 0; for (auto& it : n_hits_by_layer_barrel_) { hits += it; @@ -1277,22 +1140,18 @@ unsigned int lst::Event::getNumberOfHits() { return hits; } -unsigned int lst::Event::getNumberOfHitsByLayer(unsigned int layer) { +unsigned int Event::getNumberOfHitsByLayer(unsigned int layer) { if (layer == 6) return n_hits_by_layer_barrel_[layer]; else return n_hits_by_layer_barrel_[layer] + n_hits_by_layer_endcap_[layer]; } -unsigned int lst::Event::getNumberOfHitsByLayerBarrel(unsigned int layer) { - return n_hits_by_layer_barrel_[layer]; -} +unsigned int Event::getNumberOfHitsByLayerBarrel(unsigned int layer) { return 
n_hits_by_layer_barrel_[layer]; } -unsigned int lst::Event::getNumberOfHitsByLayerEndcap(unsigned int layer) { - return n_hits_by_layer_endcap_[layer]; -} +unsigned int Event::getNumberOfHitsByLayerEndcap(unsigned int layer) { return n_hits_by_layer_endcap_[layer]; } -unsigned int lst::Event::getNumberOfMiniDoublets() { +unsigned int Event::getNumberOfMiniDoublets() { unsigned int miniDoublets = 0; for (auto& it : n_minidoublets_by_layer_barrel_) { miniDoublets += it; @@ -1304,22 +1163,22 @@ unsigned int lst::Event::getNumberOfMiniDoublets() { return miniDoublets; } -unsigned int lst::Event::getNumberOfMiniDoubletsByLayer(unsigned int layer) { +unsigned int Event::getNumberOfMiniDoubletsByLayer(unsigned int layer) { if (layer == 6) return n_minidoublets_by_layer_barrel_[layer]; else return n_minidoublets_by_layer_barrel_[layer] + n_minidoublets_by_layer_endcap_[layer]; } -unsigned int lst::Event::getNumberOfMiniDoubletsByLayerBarrel(unsigned int layer) { +unsigned int Event::getNumberOfMiniDoubletsByLayerBarrel(unsigned int layer) { return n_minidoublets_by_layer_barrel_[layer]; } -unsigned int lst::Event::getNumberOfMiniDoubletsByLayerEndcap(unsigned int layer) { +unsigned int Event::getNumberOfMiniDoubletsByLayerEndcap(unsigned int layer) { return n_minidoublets_by_layer_endcap_[layer]; } -unsigned int lst::Event::getNumberOfSegments() { +unsigned int Event::getNumberOfSegments() { unsigned int segments = 0; for (auto& it : n_segments_by_layer_barrel_) { segments += it; @@ -1331,22 +1190,18 @@ unsigned int lst::Event::getNumberOfSegments() { return segments; } -unsigned int lst::Event::getNumberOfSegmentsByLayer(unsigned int layer) { +unsigned int Event::getNumberOfSegmentsByLayer(unsigned int layer) { if (layer == 6) return n_segments_by_layer_barrel_[layer]; else return n_segments_by_layer_barrel_[layer] + n_segments_by_layer_endcap_[layer]; } -unsigned int lst::Event::getNumberOfSegmentsByLayerBarrel(unsigned int layer) { - return 
n_segments_by_layer_barrel_[layer]; -} +unsigned int Event::getNumberOfSegmentsByLayerBarrel(unsigned int layer) { return n_segments_by_layer_barrel_[layer]; } -unsigned int lst::Event::getNumberOfSegmentsByLayerEndcap(unsigned int layer) { - return n_segments_by_layer_endcap_[layer]; -} +unsigned int Event::getNumberOfSegmentsByLayerEndcap(unsigned int layer) { return n_segments_by_layer_endcap_[layer]; } -unsigned int lst::Event::getNumberOfTriplets() { +unsigned int Event::getNumberOfTriplets() { unsigned int triplets = 0; for (auto& it : n_triplets_by_layer_barrel_) { triplets += it; @@ -1358,44 +1213,34 @@ unsigned int lst::Event::getNumberOfTriplets() { return triplets; } -unsigned int lst::Event::getNumberOfTripletsByLayer(unsigned int layer) { +unsigned int Event::getNumberOfTripletsByLayer(unsigned int layer) { if (layer == 6) return n_triplets_by_layer_barrel_[layer]; else return n_triplets_by_layer_barrel_[layer] + n_triplets_by_layer_endcap_[layer]; } -unsigned int lst::Event::getNumberOfTripletsByLayerBarrel(unsigned int layer) { - return n_triplets_by_layer_barrel_[layer]; -} - -unsigned int lst::Event::getNumberOfTripletsByLayerEndcap(unsigned int layer) { - return n_triplets_by_layer_endcap_[layer]; -} +unsigned int Event::getNumberOfTripletsByLayerBarrel(unsigned int layer) { return n_triplets_by_layer_barrel_[layer]; } -int lst::Event::getNumberOfPixelTriplets() { - auto nPixelTriplets_buf = allocBufWrapper(devHost, 1, queue); +unsigned int Event::getNumberOfTripletsByLayerEndcap(unsigned int layer) { return n_triplets_by_layer_endcap_[layer]; } - alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf); - alpaka::wait(queue); +int Event::getNumberOfPixelTriplets() { + auto nPixelTriplets_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); - int nPixelTriplets = *alpaka::getPtrNative(nPixelTriplets_buf); + alpaka::memcpy(queue, nPixelTriplets_buf_h, pixelTripletsBuffers->nPixelTriplets_buf); - return nPixelTriplets; + 
return *nPixelTriplets_buf_h.data(); } -int lst::Event::getNumberOfPixelQuintuplets() { - auto nPixelQuintuplets_buf = allocBufWrapper(devHost, 1, queue); - - alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf); - alpaka::wait(queue); +int Event::getNumberOfPixelQuintuplets() { + auto nPixelQuintuplets_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); - int nPixelQuintuplets = *alpaka::getPtrNative(nPixelQuintuplets_buf); + alpaka::memcpy(queue, nPixelQuintuplets_buf_h, pixelQuintupletsBuffers->nPixelQuintuplets_buf); - return nPixelQuintuplets; + return *nPixelQuintuplets_buf_h.data(); } -unsigned int lst::Event::getNumberOfQuintuplets() { +unsigned int Event::getNumberOfQuintuplets() { unsigned int quintuplets = 0; for (auto& it : n_quintuplets_by_layer_barrel_) { quintuplets += it; @@ -1407,133 +1252,117 @@ unsigned int lst::Event::getNumberOfQuintuplets() { return quintuplets; } -unsigned int lst::Event::getNumberOfQuintupletsByLayer(unsigned int layer) { +unsigned int Event::getNumberOfQuintupletsByLayer(unsigned int layer) { if (layer == 6) return n_quintuplets_by_layer_barrel_[layer]; else return n_quintuplets_by_layer_barrel_[layer] + n_quintuplets_by_layer_endcap_[layer]; } -unsigned int lst::Event::getNumberOfQuintupletsByLayerBarrel(unsigned int layer) { +unsigned int Event::getNumberOfQuintupletsByLayerBarrel(unsigned int layer) { return n_quintuplets_by_layer_barrel_[layer]; } -unsigned int lst::Event::getNumberOfQuintupletsByLayerEndcap(unsigned int layer) { +unsigned int Event::getNumberOfQuintupletsByLayerEndcap(unsigned int layer) { return n_quintuplets_by_layer_endcap_[layer]; } -int lst::Event::getNumberOfTrackCandidates() { - auto nTrackCandidates_buf = allocBufWrapper(devHost, 1, queue); - - alpaka::memcpy(queue, nTrackCandidates_buf, trackCandidatesBuffers->nTrackCandidates_buf); - alpaka::wait(queue); +int Event::getNumberOfTrackCandidates() { + auto nTrackCandidates_buf_h = 
cms::alpakatools::make_host_buffer(queue, 1u); - int nTrackCandidates = *alpaka::getPtrNative(nTrackCandidates_buf); + alpaka::memcpy(queue, nTrackCandidates_buf_h, trackCandidatesBuffers->nTrackCandidates_buf); - return nTrackCandidates; + return *nTrackCandidates_buf_h.data(); } -int lst::Event::getNumberOfPT5TrackCandidates() { - auto nTrackCandidatesPT5_buf = allocBufWrapper(devHost, 1, queue); +int Event::getNumberOfPT5TrackCandidates() { + auto nTrackCandidatesPT5_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); - alpaka::memcpy(queue, nTrackCandidatesPT5_buf, trackCandidatesBuffers->nTrackCandidatespT5_buf); + alpaka::memcpy(queue, nTrackCandidatesPT5_buf_h, trackCandidatesBuffers->nTrackCandidatespT5_buf); alpaka::wait(queue); - int nTrackCandidatesPT5 = *alpaka::getPtrNative(nTrackCandidatesPT5_buf); - - return nTrackCandidatesPT5; + return *nTrackCandidatesPT5_buf_h.data(); } -int lst::Event::getNumberOfPT3TrackCandidates() { - auto nTrackCandidatesPT3_buf = allocBufWrapper(devHost, 1, queue); - - alpaka::memcpy(queue, nTrackCandidatesPT3_buf, trackCandidatesBuffers->nTrackCandidatespT3_buf); - alpaka::wait(queue); +int Event::getNumberOfPT3TrackCandidates() { + auto nTrackCandidatesPT3_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); - int nTrackCandidatesPT3 = *alpaka::getPtrNative(nTrackCandidatesPT3_buf); + alpaka::memcpy(queue, nTrackCandidatesPT3_buf_h, trackCandidatesBuffers->nTrackCandidatespT3_buf); - return nTrackCandidatesPT3; + return *nTrackCandidatesPT3_buf_h.data(); } -int lst::Event::getNumberOfPLSTrackCandidates() { - auto nTrackCandidatesPLS_buf = allocBufWrapper(devHost, 1, queue); - - alpaka::memcpy(queue, nTrackCandidatesPLS_buf, trackCandidatesBuffers->nTrackCandidatespLS_buf); - alpaka::wait(queue); +int Event::getNumberOfPLSTrackCandidates() { + auto nTrackCandidatesPLS_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); - unsigned int nTrackCandidatesPLS = *alpaka::getPtrNative(nTrackCandidatesPLS_buf); + 
alpaka::memcpy(queue, nTrackCandidatesPLS_buf_h, trackCandidatesBuffers->nTrackCandidatespLS_buf); - return nTrackCandidatesPLS; + return *nTrackCandidatesPLS_buf_h.data(); } -int lst::Event::getNumberOfPixelTrackCandidates() { - auto nTrackCandidates_buf = allocBufWrapper(devHost, 1, queue); - auto nTrackCandidatesT5_buf = allocBufWrapper(devHost, 1, queue); +int Event::getNumberOfPixelTrackCandidates() { + auto nTrackCandidates_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); + auto nTrackCandidatesT5_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); - alpaka::memcpy(queue, nTrackCandidates_buf, trackCandidatesBuffers->nTrackCandidates_buf); - alpaka::memcpy(queue, nTrackCandidatesT5_buf, trackCandidatesBuffers->nTrackCandidatesT5_buf); - alpaka::wait(queue); - - int nTrackCandidates = *alpaka::getPtrNative(nTrackCandidates_buf); - int nTrackCandidatesT5 = *alpaka::getPtrNative(nTrackCandidatesT5_buf); + alpaka::memcpy(queue, nTrackCandidates_buf_h, trackCandidatesBuffers->nTrackCandidates_buf); + alpaka::memcpy(queue, nTrackCandidatesT5_buf_h, trackCandidatesBuffers->nTrackCandidatesT5_buf); - return nTrackCandidates - nTrackCandidatesT5; + return (*nTrackCandidates_buf_h.data()) - (*nTrackCandidatesT5_buf_h.data()); } -int lst::Event::getNumberOfT5TrackCandidates() { - auto nTrackCandidatesT5_buf = allocBufWrapper(devHost, 1, queue); +int Event::getNumberOfT5TrackCandidates() { + auto nTrackCandidatesT5_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); - alpaka::memcpy(queue, nTrackCandidatesT5_buf, trackCandidatesBuffers->nTrackCandidatesT5_buf); - alpaka::wait(queue); + alpaka::memcpy(queue, nTrackCandidatesT5_buf_h, trackCandidatesBuffers->nTrackCandidatesT5_buf); - int nTrackCandidatesT5 = *alpaka::getPtrNative(nTrackCandidatesT5_buf); - - return nTrackCandidatesT5; + return *nTrackCandidatesT5_buf_h.data(); } -lst::HitsBuffer* lst::Event::getHits() //std::shared_ptr should take care of garbage collection +HitsBuffer* Event::getHits(bool 
sync) //std::shared_ptr should take care of garbage collection { if (hitsInCPU == nullptr) { - auto nHits_buf = allocBufWrapper(devHost, 1, queue); - alpaka::memcpy(queue, nHits_buf, hitsBuffers->nHits_buf); - alpaka::wait(queue); + auto nHits_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); + alpaka::memcpy(queue, nHits_buf_h, hitsBuffers->nHits_buf); + alpaka::wait(queue); // wait for the value before using - unsigned int nHits = *alpaka::getPtrNative(nHits_buf); - hitsInCPU = new lst::HitsBuffer(nModules_, nHits, devHost, queue); + auto const nHits = *nHits_buf_h.data(); + hitsInCPU = new HitsBuffer(nModules_, nHits, devHost, queue); hitsInCPU->setData(*hitsInCPU); - *alpaka::getPtrNative(hitsInCPU->nHits_buf) = nHits; + *hitsInCPU->nHits_buf.data() = nHits; alpaka::memcpy(queue, hitsInCPU->idxs_buf, hitsBuffers->idxs_buf, nHits); alpaka::memcpy(queue, hitsInCPU->detid_buf, hitsBuffers->detid_buf, nHits); alpaka::memcpy(queue, hitsInCPU->xs_buf, hitsBuffers->xs_buf, nHits); alpaka::memcpy(queue, hitsInCPU->ys_buf, hitsBuffers->ys_buf, nHits); alpaka::memcpy(queue, hitsInCPU->zs_buf, hitsBuffers->zs_buf, nHits); alpaka::memcpy(queue, hitsInCPU->moduleIndices_buf, hitsBuffers->moduleIndices_buf, nHits); - alpaka::wait(queue); + if (sync) + alpaka::wait(queue); // host consumers expect filled data } return hitsInCPU; } -lst::HitsBuffer* lst::Event::getHitsInCMSSW() { +HitsBuffer* Event::getHitsInCMSSW(bool sync) { if (hitsInCPU == nullptr) { - auto nHits_buf = allocBufWrapper(devHost, 1, queue); - alpaka::memcpy(queue, nHits_buf, hitsBuffers->nHits_buf); - alpaka::wait(queue); + auto nHits_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); + alpaka::memcpy(queue, nHits_buf_h, hitsBuffers->nHits_buf); + alpaka::wait(queue); // wait for the value before using - unsigned int nHits = *alpaka::getPtrNative(nHits_buf); - hitsInCPU = new lst::HitsBuffer(nModules_, nHits, devHost, queue); + auto const nHits = *nHits_buf_h.data(); + hitsInCPU = new 
HitsBuffer(nModules_, nHits, devHost, queue); hitsInCPU->setData(*hitsInCPU); - *alpaka::getPtrNative(hitsInCPU->nHits_buf) = nHits; + *hitsInCPU->nHits_buf.data() = nHits; alpaka::memcpy(queue, hitsInCPU->idxs_buf, hitsBuffers->idxs_buf, nHits); - alpaka::wait(queue); + if (sync) + alpaka::wait(queue); // host consumers expect filled data } return hitsInCPU; } -lst::ObjectRangesBuffer* lst::Event::getRanges() { +ObjectRangesBuffer* Event::getRanges(bool sync) { if (rangesInCPU == nullptr) { - rangesInCPU = new lst::ObjectRangesBuffer(nModules_, nLowerModules_, devHost, queue); + rangesInCPU = new ObjectRangesBuffer(nModules_, nLowerModules_, devHost, queue); rangesInCPU->setData(*rangesInCPU); alpaka::memcpy(queue, rangesInCPU->hitRanges_buf, rangesBuffers->hitRanges_buf); @@ -1541,46 +1370,48 @@ lst::ObjectRangesBuffer* lst::Event::getRanges() { alpaka::memcpy(queue, rangesInCPU->miniDoubletModuleIndices_buf, rangesBuffers->miniDoubletModuleIndices_buf); alpaka::memcpy(queue, rangesInCPU->segmentModuleIndices_buf, rangesBuffers->segmentModuleIndices_buf); alpaka::memcpy(queue, rangesInCPU->tripletModuleIndices_buf, rangesBuffers->tripletModuleIndices_buf); - alpaka::wait(queue); + if (sync) + alpaka::wait(queue); // wait to get completed host data } return rangesInCPU; } -lst::MiniDoubletsBuffer* lst::Event::getMiniDoublets() { +MiniDoubletsBuffer* Event::getMiniDoublets(bool sync) { if (mdsInCPU == nullptr) { // Get nMemoryLocations parameter to initialize host based mdsInCPU - auto nMemHost_buf = allocBufWrapper(devHost, 1, queue); - alpaka::memcpy(queue, nMemHost_buf, miniDoubletsBuffers->nMemoryLocations_buf); - alpaka::wait(queue); + auto nMemHost_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); + alpaka::memcpy(queue, nMemHost_buf_h, miniDoubletsBuffers->nMemoryLocations_buf); + alpaka::wait(queue); // wait for the value before using - unsigned int nMemHost = *alpaka::getPtrNative(nMemHost_buf); - mdsInCPU = new lst::MiniDoubletsBuffer(nMemHost, 
nLowerModules_, devHost, queue); + auto const nMemHost = *nMemHost_buf_h.data(); + mdsInCPU = new MiniDoubletsBuffer(nMemHost, nLowerModules_, devHost, queue); mdsInCPU->setData(*mdsInCPU); - *alpaka::getPtrNative(mdsInCPU->nMemoryLocations_buf) = nMemHost; + *mdsInCPU->nMemoryLocations_buf.data() = nMemHost; alpaka::memcpy(queue, mdsInCPU->anchorHitIndices_buf, miniDoubletsBuffers->anchorHitIndices_buf, nMemHost); alpaka::memcpy(queue, mdsInCPU->outerHitIndices_buf, miniDoubletsBuffers->outerHitIndices_buf, nMemHost); alpaka::memcpy(queue, mdsInCPU->dphichanges_buf, miniDoubletsBuffers->dphichanges_buf, nMemHost); alpaka::memcpy(queue, mdsInCPU->nMDs_buf, miniDoubletsBuffers->nMDs_buf); alpaka::memcpy(queue, mdsInCPU->totOccupancyMDs_buf, miniDoubletsBuffers->totOccupancyMDs_buf); - alpaka::wait(queue); + if (sync) + alpaka::wait(queue); // host consumers expect filled data } return mdsInCPU; } -lst::SegmentsBuffer* lst::Event::getSegments() { +SegmentsBuffer* Event::getSegments(bool sync) { if (segmentsInCPU == nullptr) { // Get nMemoryLocations parameter to initialize host based segmentsInCPU - auto nMemHost_buf = allocBufWrapper(devHost, 1, queue); - alpaka::memcpy(queue, nMemHost_buf, segmentsBuffers->nMemoryLocations_buf); - alpaka::wait(queue); + auto nMemHost_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); + alpaka::memcpy(queue, nMemHost_buf_h, segmentsBuffers->nMemoryLocations_buf); + alpaka::wait(queue); // wait for the value before using - unsigned int nMemHost = *alpaka::getPtrNative(nMemHost_buf); + auto const nMemHost = *nMemHost_buf_h.data(); segmentsInCPU = - new lst::SegmentsBuffer(nMemHost, nLowerModules_, n_max_pixel_segments_per_module, devHost, queue); + new SegmentsBuffer(nMemHost, nLowerModules_, n_max_pixel_segments_per_module, devHost, queue); segmentsInCPU->setData(*segmentsInCPU); - *alpaka::getPtrNative(segmentsInCPU->nMemoryLocations_buf) = nMemHost; + *segmentsInCPU->nMemoryLocations_buf.data() = nMemHost; alpaka::memcpy(queue, 
segmentsInCPU->nSegments_buf, segmentsBuffers->nSegments_buf); alpaka::memcpy(queue, segmentsInCPU->mdIndices_buf, segmentsBuffers->mdIndices_buf, 2u * nMemHost); alpaka::memcpy(queue, @@ -1599,23 +1430,24 @@ lst::SegmentsBuffer* lst::Event::getSegments() { alpaka::memcpy(queue, segmentsInCPU->isDup_buf, segmentsBuffers->isDup_buf); alpaka::memcpy(queue, segmentsInCPU->isQuad_buf, segmentsBuffers->isQuad_buf); alpaka::memcpy(queue, segmentsInCPU->score_buf, segmentsBuffers->score_buf); - alpaka::wait(queue); + if (sync) + alpaka::wait(queue); // host consumers expect filled data } return segmentsInCPU; } -lst::TripletsBuffer* lst::Event::getTriplets() { +TripletsBuffer* Event::getTriplets(bool sync) { if (tripletsInCPU == nullptr) { // Get nMemoryLocations parameter to initialize host based tripletsInCPU - auto nMemHost_buf = allocBufWrapper(devHost, 1, queue); - alpaka::memcpy(queue, nMemHost_buf, tripletsBuffers->nMemoryLocations_buf); - alpaka::wait(queue); + auto nMemHost_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); + alpaka::memcpy(queue, nMemHost_buf_h, tripletsBuffers->nMemoryLocations_buf); + alpaka::wait(queue); // wait for the value before using - unsigned int nMemHost = *alpaka::getPtrNative(nMemHost_buf); - tripletsInCPU = new lst::TripletsBuffer(nMemHost, nLowerModules_, devHost, queue); + auto const nMemHost = *nMemHost_buf_h.data(); + tripletsInCPU = new TripletsBuffer(nMemHost, nLowerModules_, devHost, queue); tripletsInCPU->setData(*tripletsInCPU); - *alpaka::getPtrNative(tripletsInCPU->nMemoryLocations_buf) = nMemHost; + *tripletsInCPU->nMemoryLocations_buf.data() = nMemHost; #ifdef CUT_VALUE_DEBUG alpaka::memcpy(queue, tripletsInCPU->zOut_buf, tripletsBuffers->zOut_buf, nMemHost); alpaka::memcpy(queue, tripletsInCPU->zLo_buf, tripletsBuffers->zLo_buf, nMemHost); @@ -1635,23 +1467,24 @@ lst::TripletsBuffer* lst::Event::getTriplets() { alpaka::memcpy(queue, tripletsInCPU->circleRadius_buf, tripletsBuffers->circleRadius_buf, nMemHost); 
alpaka::memcpy(queue, tripletsInCPU->nTriplets_buf, tripletsBuffers->nTriplets_buf); alpaka::memcpy(queue, tripletsInCPU->totOccupancyTriplets_buf, tripletsBuffers->totOccupancyTriplets_buf); - alpaka::wait(queue); + if (sync) + alpaka::wait(queue); // host consumers expect filled data } return tripletsInCPU; } -lst::QuintupletsBuffer* lst::Event::getQuintuplets() { +QuintupletsBuffer* Event::getQuintuplets(bool sync) { if (quintupletsInCPU == nullptr) { // Get nMemoryLocations parameter to initialize host based quintupletsInCPU - auto nMemHost_buf = allocBufWrapper(devHost, 1, queue); - alpaka::memcpy(queue, nMemHost_buf, quintupletsBuffers->nMemoryLocations_buf); - alpaka::wait(queue); + auto nMemHost_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); + alpaka::memcpy(queue, nMemHost_buf_h, quintupletsBuffers->nMemoryLocations_buf); + alpaka::wait(queue); // wait for the value before using - unsigned int nMemHost = *alpaka::getPtrNative(nMemHost_buf); - quintupletsInCPU = new lst::QuintupletsBuffer(nMemHost, nLowerModules_, devHost, queue); + auto const nMemHost = *nMemHost_buf_h.data(); + quintupletsInCPU = new QuintupletsBuffer(nMemHost, nLowerModules_, devHost, queue); quintupletsInCPU->setData(*quintupletsInCPU); - *alpaka::getPtrNative(quintupletsInCPU->nMemoryLocations_buf) = nMemHost; + *quintupletsInCPU->nMemoryLocations_buf.data() = nMemHost; alpaka::memcpy(queue, quintupletsInCPU->nQuintuplets_buf, quintupletsBuffers->nQuintuplets_buf); alpaka::memcpy( queue, quintupletsInCPU->totOccupancyQuintuplets_buf, quintupletsBuffers->totOccupancyQuintuplets_buf); @@ -1671,23 +1504,24 @@ lst::QuintupletsBuffer* lst::Event::getQuintuplets() { alpaka::memcpy(queue, quintupletsInCPU->rzChiSquared_buf, quintupletsBuffers->rzChiSquared_buf, nMemHost); alpaka::memcpy( queue, quintupletsInCPU->nonAnchorChiSquared_buf, quintupletsBuffers->nonAnchorChiSquared_buf, nMemHost); - alpaka::wait(queue); + if (sync) + alpaka::wait(queue); // host consumers expect filled data 
} return quintupletsInCPU; } -lst::PixelTripletsBuffer* lst::Event::getPixelTriplets() { +PixelTripletsBuffer* Event::getPixelTriplets(bool sync) { if (pixelTripletsInCPU == nullptr) { // Get nPixelTriplets parameter to initialize host based quintupletsInCPU - auto nPixelTriplets_buf = allocBufWrapper(devHost, 1, queue); - alpaka::memcpy(queue, nPixelTriplets_buf, pixelTripletsBuffers->nPixelTriplets_buf); - alpaka::wait(queue); + auto nPixelTriplets_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); + alpaka::memcpy(queue, nPixelTriplets_buf_h, pixelTripletsBuffers->nPixelTriplets_buf); + alpaka::wait(queue); // wait for the value before using - unsigned int nPixelTriplets = *alpaka::getPtrNative(nPixelTriplets_buf); - pixelTripletsInCPU = new lst::PixelTripletsBuffer(nPixelTriplets, devHost, queue); + auto const nPixelTriplets = *nPixelTriplets_buf_h.data(); + pixelTripletsInCPU = new PixelTripletsBuffer(nPixelTriplets, devHost, queue); pixelTripletsInCPU->setData(*pixelTripletsInCPU); - *alpaka::getPtrNative(pixelTripletsInCPU->nPixelTriplets_buf) = nPixelTriplets; + *pixelTripletsInCPU->nPixelTriplets_buf.data() = nPixelTriplets; alpaka::memcpy( queue, pixelTripletsInCPU->totOccupancyPixelTriplets_buf, pixelTripletsBuffers->totOccupancyPixelTriplets_buf); alpaka::memcpy(queue, pixelTripletsInCPU->rzChiSquared_buf, pixelTripletsBuffers->rzChiSquared_buf, nPixelTriplets); @@ -1710,23 +1544,24 @@ lst::PixelTripletsBuffer* lst::Event::getPixelTriplets() { alpaka::memcpy(queue, pixelTripletsInCPU->eta_buf, pixelTripletsBuffers->eta_buf, nPixelTriplets); alpaka::memcpy(queue, pixelTripletsInCPU->phi_buf, pixelTripletsBuffers->phi_buf, nPixelTriplets); alpaka::memcpy(queue, pixelTripletsInCPU->score_buf, pixelTripletsBuffers->score_buf, nPixelTriplets); - alpaka::wait(queue); + if (sync) + alpaka::wait(queue); // host consumers expect filled data } return pixelTripletsInCPU; } -lst::PixelQuintupletsBuffer* lst::Event::getPixelQuintuplets() { 
+PixelQuintupletsBuffer* Event::getPixelQuintuplets(bool sync) { if (pixelQuintupletsInCPU == nullptr) { // Get nPixelQuintuplets parameter to initialize host based quintupletsInCPU - auto nPixelQuintuplets_buf = allocBufWrapper(devHost, 1, queue); - alpaka::memcpy(queue, nPixelQuintuplets_buf, pixelQuintupletsBuffers->nPixelQuintuplets_buf); - alpaka::wait(queue); + auto nPixelQuintuplets_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); + alpaka::memcpy(queue, nPixelQuintuplets_buf_h, pixelQuintupletsBuffers->nPixelQuintuplets_buf); + alpaka::wait(queue); // wait for the value before using - unsigned int nPixelQuintuplets = *alpaka::getPtrNative(nPixelQuintuplets_buf); - pixelQuintupletsInCPU = new lst::PixelQuintupletsBuffer(nPixelQuintuplets, devHost, queue); + auto const nPixelQuintuplets = *nPixelQuintuplets_buf_h.data(); + pixelQuintupletsInCPU = new PixelQuintupletsBuffer(nPixelQuintuplets, devHost, queue); pixelQuintupletsInCPU->setData(*pixelQuintupletsInCPU); - *alpaka::getPtrNative(pixelQuintupletsInCPU->nPixelQuintuplets_buf) = nPixelQuintuplets; + *pixelQuintupletsInCPU->nPixelQuintuplets_buf.data() = nPixelQuintuplets; alpaka::memcpy(queue, pixelQuintupletsInCPU->totOccupancyPixelQuintuplets_buf, pixelQuintupletsBuffers->totOccupancyPixelQuintuplets_buf); @@ -1746,24 +1581,25 @@ lst::PixelQuintupletsBuffer* lst::Event::getPixelQuintuplets() { queue, pixelQuintupletsInCPU->T5Indices_buf, pixelQuintupletsBuffers->T5Indices_buf, nPixelQuintuplets); alpaka::memcpy(queue, pixelQuintupletsInCPU->isDup_buf, pixelQuintupletsBuffers->isDup_buf, nPixelQuintuplets); alpaka::memcpy(queue, pixelQuintupletsInCPU->score_buf, pixelQuintupletsBuffers->score_buf, nPixelQuintuplets); - alpaka::wait(queue); + if (sync) + alpaka::wait(queue); // host consumers expect filled data } return pixelQuintupletsInCPU; } -lst::TrackCandidatesBuffer* lst::Event::getTrackCandidates() { +TrackCandidatesBuffer* Event::getTrackCandidates(bool sync) { if (trackCandidatesInCPU == 
nullptr) { // Get nTrackCanHost parameter to initialize host based trackCandidatesInCPU - auto nTrackCanHost_buf = allocBufWrapper(devHost, 1, queue); - alpaka::memcpy(queue, nTrackCanHost_buf, trackCandidatesBuffers->nTrackCandidates_buf); + auto nTrackCanHost_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); + alpaka::memcpy(queue, nTrackCanHost_buf_h, trackCandidatesBuffers->nTrackCandidates_buf); alpaka::wait(queue); - unsigned int nTrackCanHost = *alpaka::getPtrNative(nTrackCanHost_buf); - trackCandidatesInCPU = new lst::TrackCandidatesBuffer( + auto const nTrackCanHost = *nTrackCanHost_buf_h.data(); + trackCandidatesInCPU = new TrackCandidatesBuffer( n_max_nonpixel_track_candidates + n_max_pixel_track_candidates, devHost, queue); trackCandidatesInCPU->setData(*trackCandidatesInCPU); - *alpaka::getPtrNative(trackCandidatesInCPU->nTrackCandidates_buf) = nTrackCanHost; + *trackCandidatesInCPU->nTrackCandidates_buf.data() = nTrackCanHost; alpaka::memcpy(queue, trackCandidatesInCPU->hitIndices_buf, trackCandidatesBuffers->hitIndices_buf, @@ -1784,24 +1620,25 @@ lst::TrackCandidatesBuffer* lst::Event::getTrackCandidates() { trackCandidatesInCPU->trackCandidateType_buf, trackCandidatesBuffers->trackCandidateType_buf, nTrackCanHost); - alpaka::wait(queue); + if (sync) + alpaka::wait(queue); // host consumers expect filled data } return trackCandidatesInCPU; } -lst::TrackCandidatesBuffer* lst::Event::getTrackCandidatesInCMSSW() { +TrackCandidatesBuffer* Event::getTrackCandidatesInCMSSW(bool sync) { if (trackCandidatesInCPU == nullptr) { // Get nTrackCanHost parameter to initialize host based trackCandidatesInCPU - auto nTrackCanHost_buf = allocBufWrapper(devHost, 1, queue); - alpaka::memcpy(queue, nTrackCanHost_buf, trackCandidatesBuffers->nTrackCandidates_buf); - alpaka::wait(queue); + auto nTrackCanHost_buf_h = cms::alpakatools::make_host_buffer(queue, 1u); + alpaka::memcpy(queue, nTrackCanHost_buf_h, trackCandidatesBuffers->nTrackCandidates_buf); + 
alpaka::wait(queue); // wait for the value before using - unsigned int nTrackCanHost = *alpaka::getPtrNative(nTrackCanHost_buf); - trackCandidatesInCPU = new lst::TrackCandidatesBuffer( + auto const nTrackCanHost = *nTrackCanHost_buf_h.data(); + trackCandidatesInCPU = new TrackCandidatesBuffer( n_max_nonpixel_track_candidates + n_max_pixel_track_candidates, devHost, queue); trackCandidatesInCPU->setData(*trackCandidatesInCPU); - *alpaka::getPtrNative(trackCandidatesInCPU->nTrackCandidates_buf) = nTrackCanHost; + *trackCandidatesInCPU->nTrackCandidates_buf.data() = nTrackCanHost; alpaka::memcpy(queue, trackCandidatesInCPU->hitIndices_buf, trackCandidatesBuffers->hitIndices_buf, @@ -1812,17 +1649,20 @@ lst::TrackCandidatesBuffer* lst::Event::getTrackCandidatesInCMSS trackCandidatesInCPU->trackCandidateType_buf, trackCandidatesBuffers->trackCandidateType_buf, nTrackCanHost); - alpaka::wait(queue); + if (sync) + alpaka::wait(queue); // host consumers expect filled data } return trackCandidatesInCPU; } -lst::ModulesBuffer* lst::Event::getModules(bool isFull) { +ModulesBuffer* Event::getModules(bool isFull, bool sync) { if (modulesInCPU == nullptr) { // The last input here is just a small placeholder for the allocation. 
- modulesInCPU = new lst::ModulesBuffer(devHost, nModules_, nPixels_); + modulesInCPU = new ModulesBuffer(devHost, nModules_, nPixels_); - modulesInCPU->copyFromSrc(queue, *modulesBuffers_, isFull); + modulesInCPU->copyFromSrc(queue, modulesBuffers_, isFull); + if (sync) + alpaka::wait(queue); // host consumers expect filled data } return modulesInCPU; } diff --git a/RecoTracker/LSTCore/src/alpaka/Event.h b/RecoTracker/LSTCore/src/alpaka/Event.h index 01abacba7dc74..2ad8e150ece88 100644 --- a/RecoTracker/LSTCore/src/alpaka/Event.h +++ b/RecoTracker/LSTCore/src/alpaka/Event.h @@ -2,8 +2,8 @@ #define RecoTracker_LSTCore_src_alpaka_Event_h #include "RecoTracker/LSTCore/interface/alpaka/Constants.h" +#include "RecoTracker/LSTCore/interface/alpaka/LST.h" #include "RecoTracker/LSTCore/interface/Module.h" -#include "RecoTracker/LSTCore/interface/LST.h" #include "Hit.h" #include "Segment.h" @@ -17,190 +17,193 @@ #include "HeterogeneousCore/AlpakaInterface/interface/host.h" -namespace lst { - - using namespace ALPAKA_ACCELERATOR_NAMESPACE; - - template - class Event; - - template <> - class Event { - private: - Queue queue; - Device devAcc; - DevHost devHost; - bool addObjects; - - std::array n_hits_by_layer_barrel_; - std::array n_hits_by_layer_endcap_; - std::array n_minidoublets_by_layer_barrel_; - std::array n_minidoublets_by_layer_endcap_; - std::array n_segments_by_layer_barrel_; - std::array n_segments_by_layer_endcap_; - std::array n_triplets_by_layer_barrel_; - std::array n_triplets_by_layer_endcap_; - std::array n_trackCandidates_by_layer_barrel_; - std::array n_trackCandidates_by_layer_endcap_; - std::array n_quintuplets_by_layer_barrel_; - std::array n_quintuplets_by_layer_endcap_; - - //Device stuff - unsigned int nTotalSegments; - ObjectRanges* rangesInGPU; - ObjectRangesBuffer* rangesBuffers; - Hits* hitsInGPU; - HitsBuffer* hitsBuffers; - MiniDoublets* mdsInGPU; - MiniDoubletsBuffer* miniDoubletsBuffers; - Segments* segmentsInGPU; - SegmentsBuffer* 
segmentsBuffers; - Triplets* tripletsInGPU; - TripletsBuffer* tripletsBuffers; - Quintuplets* quintupletsInGPU; - QuintupletsBuffer* quintupletsBuffers; - TrackCandidates* trackCandidatesInGPU; - TrackCandidatesBuffer* trackCandidatesBuffers; - PixelTriplets* pixelTripletsInGPU; - PixelTripletsBuffer* pixelTripletsBuffers; - PixelQuintuplets* pixelQuintupletsInGPU; - PixelQuintupletsBuffer* pixelQuintupletsBuffers; - - //CPU interface stuff - ObjectRangesBuffer* rangesInCPU; - HitsBuffer* hitsInCPU; - MiniDoubletsBuffer* mdsInCPU; - SegmentsBuffer* segmentsInCPU; - TripletsBuffer* tripletsInCPU; - TrackCandidatesBuffer* trackCandidatesInCPU; - ModulesBuffer* modulesInCPU; - QuintupletsBuffer* quintupletsInCPU; - PixelTripletsBuffer* pixelTripletsInCPU; - PixelQuintupletsBuffer* pixelQuintupletsInCPU; - - void init(bool verbose); - - int* superbinCPU; - int8_t* pixelTypeCPU; - - const uint16_t nModules_; - const uint16_t nLowerModules_; - const unsigned int nPixels_; - const unsigned int nEndCapMap_; - const std::shared_ptr> modulesBuffers_; - const std::shared_ptr pixelMapping_; - const std::shared_ptr> endcapGeometryBuffers_; - - public: - // Constructor used for CMSSW integration. Uses an external queue. 
- template - Event(bool verbose, TQueue const& q, const LSTESData* deviceESData) - : queue(q), - devAcc(alpaka::getDev(q)), - devHost(cms::alpakatools::host()), - nModules_(deviceESData->nModules), - nLowerModules_(deviceESData->nLowerModules), - nPixels_(deviceESData->nPixels), - nEndCapMap_(deviceESData->nEndCapMap), - modulesBuffers_(deviceESData->modulesBuffers), - pixelMapping_(deviceESData->pixelMapping), - endcapGeometryBuffers_(deviceESData->endcapGeometryBuffers) { - init(verbose); - } - void resetEvent(); - - // Calls the appropriate hit function, then increments the counter - void addHitToEvent(std::vector const& x, - std::vector const& y, - std::vector const& z, - std::vector const& detId, - std::vector const& idxInNtuple); - void addPixelSegmentToEvent(std::vector const& hitIndices0, - std::vector const& hitIndices1, - std::vector const& hitIndices2, - std::vector const& hitIndices3, - std::vector const& dPhiChange, - std::vector const& ptIn, - std::vector const& ptErr, - std::vector const& px, - std::vector const& py, - std::vector const& pz, - std::vector const& eta, - std::vector const& etaErr, - std::vector const& phi, - std::vector const& charge, - std::vector const& seedIdx, - std::vector const& superbin, - std::vector const& pixelType, - std::vector const& isQuad); - - // functions that map the objects to the appropriate modules - void addMiniDoubletsToEventExplicit(); - void addSegmentsToEventExplicit(); - void addTripletsToEventExplicit(); - void addQuintupletsToEventExplicit(); - void resetObjectsInModule(); - - void createMiniDoublets(); - void createSegmentsWithModuleMap(); - void createTriplets(); - void createPixelTracklets(); - void createPixelTrackletsWithMap(); - void createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets); - void createExtendedTracks(); - void createQuintuplets(); - void createPixelTriplets(); - void createPixelQuintuplets(); - void pixelLineSegmentCleaning(bool no_pls_dupclean); - - unsigned int 
getNumberOfHits(); - unsigned int getNumberOfHitsByLayer(unsigned int layer); - unsigned int getNumberOfHitsByLayerBarrel(unsigned int layer); - unsigned int getNumberOfHitsByLayerEndcap(unsigned int layer); - - unsigned int getNumberOfMiniDoublets(); - unsigned int getNumberOfMiniDoubletsByLayer(unsigned int layer); - unsigned int getNumberOfMiniDoubletsByLayerBarrel(unsigned int layer); - unsigned int getNumberOfMiniDoubletsByLayerEndcap(unsigned int layer); - - unsigned int getNumberOfSegments(); - unsigned int getNumberOfSegmentsByLayer(unsigned int layer); - unsigned int getNumberOfSegmentsByLayerBarrel(unsigned int layer); - unsigned int getNumberOfSegmentsByLayerEndcap(unsigned int layer); - - unsigned int getNumberOfTriplets(); - unsigned int getNumberOfTripletsByLayer(unsigned int layer); - unsigned int getNumberOfTripletsByLayerBarrel(unsigned int layer); - unsigned int getNumberOfTripletsByLayerEndcap(unsigned int layer); - - int getNumberOfTrackCandidates(); - int getNumberOfPixelTrackCandidates(); - int getNumberOfPT5TrackCandidates(); - int getNumberOfPT3TrackCandidates(); - int getNumberOfT5TrackCandidates(); - int getNumberOfPLSTrackCandidates(); - - unsigned int getNumberOfQuintuplets(); - unsigned int getNumberOfQuintupletsByLayer(unsigned int layer); - unsigned int getNumberOfQuintupletsByLayerBarrel(unsigned int layer); - unsigned int getNumberOfQuintupletsByLayerEndcap(unsigned int layer); - - int getNumberOfPixelTriplets(); - int getNumberOfPixelQuintuplets(); - - ObjectRangesBuffer* getRanges(); - HitsBuffer* getHits(); - HitsBuffer* getHitsInCMSSW(); - MiniDoubletsBuffer* getMiniDoublets(); - SegmentsBuffer* getSegments(); - TripletsBuffer* getTriplets(); - QuintupletsBuffer* getQuintuplets(); - TrackCandidatesBuffer* getTrackCandidates(); - TrackCandidatesBuffer* getTrackCandidatesInCMSSW(); - PixelTripletsBuffer* getPixelTriplets(); - PixelQuintupletsBuffer* getPixelQuintuplets(); - ModulesBuffer* getModules(bool isFull = false); - }; - -} 
// namespace lst +using ::lst::EndcapGeometryBuffer; +using ::lst::LSTESData; +using ::lst::ModulesBuffer; +using ::lst::PixelMap; + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + namespace lst { + + class Event { + private: + Queue queue; + Device devAcc; + DevHost devHost; + bool addObjects; + + std::array n_hits_by_layer_barrel_; + std::array n_hits_by_layer_endcap_; + std::array n_minidoublets_by_layer_barrel_; + std::array n_minidoublets_by_layer_endcap_; + std::array n_segments_by_layer_barrel_; + std::array n_segments_by_layer_endcap_; + std::array n_triplets_by_layer_barrel_; + std::array n_triplets_by_layer_endcap_; + std::array n_trackCandidates_by_layer_barrel_; + std::array n_trackCandidates_by_layer_endcap_; + std::array n_quintuplets_by_layer_barrel_; + std::array n_quintuplets_by_layer_endcap_; + unsigned int nTotalSegments_; + + //Device stuff + ObjectRanges* rangesInGPU; + ObjectRangesBuffer* rangesBuffers; + Hits* hitsInGPU; + HitsBuffer* hitsBuffers; + MiniDoublets* mdsInGPU; + MiniDoubletsBuffer* miniDoubletsBuffers; + Segments* segmentsInGPU; + SegmentsBuffer* segmentsBuffers; + Triplets* tripletsInGPU; + TripletsBuffer* tripletsBuffers; + Quintuplets* quintupletsInGPU; + QuintupletsBuffer* quintupletsBuffers; + TrackCandidates* trackCandidatesInGPU; + TrackCandidatesBuffer* trackCandidatesBuffers; + PixelTriplets* pixelTripletsInGPU; + PixelTripletsBuffer* pixelTripletsBuffers; + PixelQuintuplets* pixelQuintupletsInGPU; + PixelQuintupletsBuffer* pixelQuintupletsBuffers; + + //CPU interface stuff + ObjectRangesBuffer* rangesInCPU; + HitsBuffer* hitsInCPU; + MiniDoubletsBuffer* mdsInCPU; + SegmentsBuffer* segmentsInCPU; + TripletsBuffer* tripletsInCPU; + TrackCandidatesBuffer* trackCandidatesInCPU; + ModulesBuffer* modulesInCPU; + QuintupletsBuffer* quintupletsInCPU; + PixelTripletsBuffer* pixelTripletsInCPU; + PixelQuintupletsBuffer* pixelQuintupletsInCPU; + + void initSync(bool verbose); + + int* superbinCPU; + int8_t* pixelTypeCPU; + + const 
uint16_t nModules_; + const uint16_t nLowerModules_; + const unsigned int nPixels_; + const unsigned int nEndCapMap_; + ModulesBuffer const& modulesBuffers_; + PixelMap const& pixelMapping_; + EndcapGeometryBuffer const& endcapGeometryBuffers_; + + public: + // Constructor used for CMSSW integration. Uses an external queue. + Event(bool verbose, Queue const& q, const LSTESData* deviceESData) + : queue(q), + devAcc(alpaka::getDev(q)), + devHost(cms::alpakatools::host()), + nModules_(deviceESData->nModules), + nLowerModules_(deviceESData->nLowerModules), + nPixels_(deviceESData->nPixels), + nEndCapMap_(deviceESData->nEndCapMap), + modulesBuffers_(deviceESData->modulesBuffers), + pixelMapping_(*deviceESData->pixelMapping), + endcapGeometryBuffers_(deviceESData->endcapGeometryBuffers) { + initSync(verbose); + } + void resetEventSync(); // synchronizes + void wait() const { alpaka::wait(queue); } + + // Calls the appropriate hit function, then increments the counter + void addHitToEvent(std::vector const& x, + std::vector const& y, + std::vector const& z, + std::vector const& detId, + std::vector const& idxInNtuple); + void addPixelSegmentToEvent(std::vector const& hitIndices0, + std::vector const& hitIndices1, + std::vector const& hitIndices2, + std::vector const& hitIndices3, + std::vector const& dPhiChange, + std::vector const& ptIn, + std::vector const& ptErr, + std::vector const& px, + std::vector const& py, + std::vector const& pz, + std::vector const& eta, + std::vector const& etaErr, + std::vector const& phi, + std::vector const& charge, + std::vector const& seedIdx, + std::vector const& superbin, + std::vector const& pixelType, + std::vector const& isQuad); + + void createMiniDoublets(); + void createSegmentsWithModuleMap(); + void createTriplets(); + void createTrackCandidates(bool no_pls_dupclean, bool tc_pls_triplets); + void createPixelTriplets(); + void createQuintuplets(); + void pixelLineSegmentCleaning(bool no_pls_dupclean); + void 
createPixelQuintuplets(); + + // functions that map the objects to the appropriate modules + void addMiniDoubletsToEventExplicit(); + void addSegmentsToEventExplicit(); + void addQuintupletsToEventExplicit(); + void addTripletsToEventExplicit(); + void resetObjectsInModule(); + + unsigned int getNumberOfHits(); + unsigned int getNumberOfHitsByLayer(unsigned int layer); + unsigned int getNumberOfHitsByLayerBarrel(unsigned int layer); + unsigned int getNumberOfHitsByLayerEndcap(unsigned int layer); + + unsigned int getNumberOfMiniDoublets(); + unsigned int getNumberOfMiniDoubletsByLayer(unsigned int layer); + unsigned int getNumberOfMiniDoubletsByLayerBarrel(unsigned int layer); + unsigned int getNumberOfMiniDoubletsByLayerEndcap(unsigned int layer); + + unsigned int getNumberOfSegments(); + unsigned int getNumberOfSegmentsByLayer(unsigned int layer); + unsigned int getNumberOfSegmentsByLayerBarrel(unsigned int layer); + unsigned int getNumberOfSegmentsByLayerEndcap(unsigned int layer); + + unsigned int getNumberOfTriplets(); + unsigned int getNumberOfTripletsByLayer(unsigned int layer); + unsigned int getNumberOfTripletsByLayerBarrel(unsigned int layer); + unsigned int getNumberOfTripletsByLayerEndcap(unsigned int layer); + + int getNumberOfPixelTriplets(); + int getNumberOfPixelQuintuplets(); + + unsigned int getNumberOfQuintuplets(); + unsigned int getNumberOfQuintupletsByLayer(unsigned int layer); + unsigned int getNumberOfQuintupletsByLayerBarrel(unsigned int layer); + unsigned int getNumberOfQuintupletsByLayerEndcap(unsigned int layer); + + int getNumberOfTrackCandidates(); + int getNumberOfPT5TrackCandidates(); + int getNumberOfPT3TrackCandidates(); + int getNumberOfPLSTrackCandidates(); + int getNumberOfPixelTrackCandidates(); + int getNumberOfT5TrackCandidates(); + + // sync adds alpaka::wait at the end of filling a buffer during lazy fill + // (has no effect on repeated calls) + // set to false may allow faster operation with concurrent calls of get* + // 
HANDLE WITH CARE + HitsBuffer* getHits(bool sync = true); + HitsBuffer* getHitsInCMSSW(bool sync = true); + ObjectRangesBuffer* getRanges(bool sync = true); + MiniDoubletsBuffer* getMiniDoublets(bool sync = true); + SegmentsBuffer* getSegments(bool sync = true); + TripletsBuffer* getTriplets(bool sync = true); + QuintupletsBuffer* getQuintuplets(bool sync = true); + PixelTripletsBuffer* getPixelTriplets(bool sync = true); + PixelQuintupletsBuffer* getPixelQuintuplets(bool sync = true); + TrackCandidatesBuffer* getTrackCandidates(bool sync = true); + TrackCandidatesBuffer* getTrackCandidatesInCMSSW(bool sync = true); + ModulesBuffer* getModules(bool isFull = false, bool sync = true); + }; + + } // namespace lst + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE #endif diff --git a/RecoTracker/LSTCore/src/alpaka/Hit.h b/RecoTracker/LSTCore/src/alpaka/Hit.h index c14ac26124e6d..1a54008d4331c 100644 --- a/RecoTracker/LSTCore/src/alpaka/Hit.h +++ b/RecoTracker/LSTCore/src/alpaka/Hit.h @@ -4,7 +4,9 @@ #include "RecoTracker/LSTCore/interface/alpaka/Constants.h" #include "RecoTracker/LSTCore/interface/Module.h" -namespace lst { +using ::lst::Modules; + +namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { struct Hits { unsigned int* nHits; float* xs; @@ -28,25 +30,25 @@ namespace lst { template void setData(TBuff& buf) { - nHits = alpaka::getPtrNative(buf.nHits_buf); - xs = alpaka::getPtrNative(buf.xs_buf); - ys = alpaka::getPtrNative(buf.ys_buf); - zs = alpaka::getPtrNative(buf.zs_buf); - moduleIndices = alpaka::getPtrNative(buf.moduleIndices_buf); - idxs = alpaka::getPtrNative(buf.idxs_buf); - detid = alpaka::getPtrNative(buf.detid_buf); - rts = alpaka::getPtrNative(buf.rts_buf); - phis = alpaka::getPtrNative(buf.phis_buf); - etas = alpaka::getPtrNative(buf.etas_buf); - highEdgeXs = alpaka::getPtrNative(buf.highEdgeXs_buf); - highEdgeYs = alpaka::getPtrNative(buf.highEdgeYs_buf); - lowEdgeXs = alpaka::getPtrNative(buf.lowEdgeXs_buf); - lowEdgeYs = 
alpaka::getPtrNative(buf.lowEdgeYs_buf); - hitRanges = alpaka::getPtrNative(buf.hitRanges_buf); - hitRangesLower = alpaka::getPtrNative(buf.hitRangesLower_buf); - hitRangesUpper = alpaka::getPtrNative(buf.hitRangesUpper_buf); - hitRangesnLower = alpaka::getPtrNative(buf.hitRangesnLower_buf); - hitRangesnUpper = alpaka::getPtrNative(buf.hitRangesnUpper_buf); + nHits = buf.nHits_buf.data(); + xs = buf.xs_buf.data(); + ys = buf.ys_buf.data(); + zs = buf.zs_buf.data(); + moduleIndices = buf.moduleIndices_buf.data(); + idxs = buf.idxs_buf.data(); + detid = buf.detid_buf.data(); + rts = buf.rts_buf.data(); + phis = buf.phis_buf.data(); + etas = buf.etas_buf.data(); + highEdgeXs = buf.highEdgeXs_buf.data(); + highEdgeYs = buf.highEdgeYs_buf.data(); + lowEdgeXs = buf.lowEdgeXs_buf.data(); + lowEdgeYs = buf.lowEdgeYs_buf.data(); + hitRanges = buf.hitRanges_buf.data(); + hitRangesLower = buf.hitRangesLower_buf.data(); + hitRangesUpper = buf.hitRangesUpper_buf.data(); + hitRangesnLower = buf.hitRangesnLower_buf.data(); + hitRangesnUpper = buf.hitRangesnUpper_buf.data(); } }; @@ -100,7 +102,6 @@ namespace lst { alpaka::memset(queue, hitRangesUpper_buf, 0xff); alpaka::memset(queue, hitRangesnLower_buf, 0xff); alpaka::memset(queue, hitRangesnUpper_buf, 0xff); - alpaka::wait(queue); } inline Hits const* data() const { return &data_; } @@ -113,7 +114,7 @@ namespace lst { float rt = alpaka::math::sqrt(acc, x * x + y * y); float eta = ((z > 0) - (z < 0)) * alpaka::math::acosh(acc, r3 / rt); return eta; - }; + } template ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float phi_mpi_pi(TAcc const& acc, float x) { @@ -123,24 +124,24 @@ namespace lst { constexpr float o2pi = 1.f / (2.f * float(M_PI)); float n = alpaka::math::round(acc, x * o2pi); return x - n * float(2.f * float(M_PI)); - }; + } template ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float phi(TAcc const& acc, float x, float y) { return phi_mpi_pi(acc, float(M_PI) + alpaka::math::atan2(acc, -y, -x)); - }; + } template ALPAKA_FN_HOST_ACC 
ALPAKA_FN_INLINE float deltaPhi(TAcc const& acc, float x1, float y1, float x2, float y2) { float phi1 = phi(acc, x1, y1); float phi2 = phi(acc, x2, y2); return phi_mpi_pi(acc, (phi2 - phi1)); - }; + } template ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE float deltaPhiChange(TAcc const& acc, float x1, float y1, float x2, float y2) { return deltaPhi(acc, x1, y1, x2 - x1, y2 - y1); - }; + } ALPAKA_FN_ACC ALPAKA_FN_INLINE float calculate_dPhi(float phi1, float phi2) { // Calculate dPhi @@ -154,7 +155,7 @@ namespace lst { } return dPhi; - }; + } ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE int binary_search(const unsigned int* data, // Array that we are searching over unsigned int search_val, // Value we want to find in data array @@ -175,14 +176,11 @@ namespace lst { } // Couldn't find search value in array. return -1; - }; + } - struct moduleRangesKernel { + struct ModuleRangesKernel { template - ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - lst::Hits hitsInGPU, - int nLowerModules) const { + ALPAKA_FN_ACC void operator()(TAcc const& acc, Modules modulesInGPU, Hits hitsInGPU, int nLowerModules) const { auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -200,7 +198,7 @@ namespace lst { } }; - struct hitLoopKernel { + struct HitLoopKernel { template ALPAKA_FN_ACC void operator()(TAcc const& acc, uint16_t Endcap, // Integer corresponding to endcap in module subdets @@ -209,8 +207,8 @@ namespace lst { unsigned int nEndCapMap, // Number of elements in endcap map const unsigned int* geoMapDetId, // DetId's from endcap map const float* geoMapPhi, // Phi values from endcap map - lst::Modules modulesInGPU, - lst::Hits hitsInGPU, + Modules modulesInGPU, + Hits hitsInGPU, unsigned int nHits) const // Total number of hits in event { auto const globalThreadIdx = alpaka::getIdx(acc); @@ -222,7 +220,7 @@ namespace lst { int iDetId = hitsInGPU.detid[ihit]; hitsInGPU.rts[ihit] = alpaka::math::sqrt(acc, ihit_x * 
ihit_x + ihit_y * ihit_y); - hitsInGPU.phis[ihit] = lst::phi(acc, ihit_x, ihit_y); + hitsInGPU.phis[ihit] = phi(acc, ihit_x, ihit_y); hitsInGPU.etas[ihit] = ((ihit_z > 0) - (ihit_z < 0)) * alpaka::math::acosh( @@ -244,15 +242,17 @@ namespace lst { hitsInGPU.lowEdgeYs[ihit] = ihit_y - 2.5f * sin_phi; } // Need to set initial value if index hasn't been seen before. - int old = alpaka::atomicOp( - acc, &(hitsInGPU.hitRanges[lastModuleIndex * 2]), -1, static_cast(ihit)); + int old = alpaka::atomicCas( + acc, &(hitsInGPU.hitRanges[lastModuleIndex * 2]), -1, static_cast(ihit), alpaka::hierarchy::Threads{}); // For subsequent visits, stores the min value. if (old != -1) - alpaka::atomicOp(acc, &hitsInGPU.hitRanges[lastModuleIndex * 2], static_cast(ihit)); + alpaka::atomicMin( + acc, &hitsInGPU.hitRanges[lastModuleIndex * 2], static_cast(ihit), alpaka::hierarchy::Threads{}); - alpaka::atomicOp(acc, &hitsInGPU.hitRanges[lastModuleIndex * 2 + 1], static_cast(ihit)); + alpaka::atomicMax( + acc, &hitsInGPU.hitRanges[lastModuleIndex * 2 + 1], static_cast(ihit), alpaka::hierarchy::Threads{}); } } }; -} // namespace lst +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::lst #endif diff --git a/RecoTracker/LSTCore/src/alpaka/Kernels.h b/RecoTracker/LSTCore/src/alpaka/Kernels.h index 8e3fa46c3ab6f..bc284d052cc05 100644 --- a/RecoTracker/LSTCore/src/alpaka/Kernels.h +++ b/RecoTracker/LSTCore/src/alpaka/Kernels.h @@ -13,32 +13,32 @@ #include "PixelQuintuplet.h" #include "PixelTriplet.h" -namespace lst { - ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmQuintupletFromMemory(lst::Quintuplets& quintupletsInGPU, +namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { + ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmQuintupletFromMemory(Quintuplets& quintupletsInGPU, unsigned int quintupletIndex, bool secondpass = false) { quintupletsInGPU.isDup[quintupletIndex] |= 1 + secondpass; - }; + } - ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelTripletFromMemory(lst::PixelTriplets& pixelTripletsInGPU, + ALPAKA_FN_ACC 
ALPAKA_FN_INLINE void rmPixelTripletFromMemory(PixelTriplets& pixelTripletsInGPU, unsigned int pixelTripletIndex) { pixelTripletsInGPU.isDup[pixelTripletIndex] = true; - }; + } - ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelQuintupletFromMemory(lst::PixelQuintuplets& pixelQuintupletsInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelQuintupletFromMemory(PixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelQuintupletIndex) { pixelQuintupletsInGPU.isDup[pixelQuintupletIndex] = true; - }; + } - ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelSegmentFromMemory(lst::Segments& segmentsInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE void rmPixelSegmentFromMemory(Segments& segmentsInGPU, unsigned int pixelSegmentArrayIndex, bool secondpass = false) { segmentsInGPU.isDup[pixelSegmentArrayIndex] |= 1 + secondpass; - }; + } ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkHitsT5(unsigned int ix, unsigned int jx, - lst::Quintuplets const& quintupletsInGPU) { + Quintuplets const& quintupletsInGPU) { unsigned int hits1[Params_T5::kHits]; unsigned int hits2[Params_T5::kHits]; @@ -61,11 +61,11 @@ namespace lst { } } return nMatched; - }; + } ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkHitspT5(unsigned int ix, unsigned int jx, - lst::PixelQuintuplets const& pixelQuintupletsInGPU) { + PixelQuintuplets const& pixelQuintupletsInGPU) { unsigned int hits1[Params_pT5::kHits]; unsigned int hits2[Params_pT5::kHits]; @@ -88,11 +88,11 @@ namespace lst { } } return nMatched; - }; + } ALPAKA_FN_ACC ALPAKA_FN_INLINE void checkHitspT3(unsigned int ix, unsigned int jx, - lst::PixelTriplets const& pixelTripletsInGPU, + PixelTriplets const& pixelTripletsInGPU, int* matched) { int phits1[Params_pLS::kHits]; int phits2[Params_pLS::kHits]; @@ -140,14 +140,14 @@ namespace lst { matched[0] = npMatched; matched[1] = nMatched; - }; + } - struct removeDupQuintupletsInGPUAfterBuild { + struct RemoveDupQuintupletsInGPUAfterBuild { template ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - 
lst::Quintuplets quintupletsInGPU, - lst::ObjectRanges rangesInGPU) const { + Modules modulesInGPU, + Quintuplets quintupletsInGPU, + ObjectRanges rangesInGPU) const { auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -168,7 +168,7 @@ namespace lst { float eta2 = __H2F(quintupletsInGPU.eta[jx]); float phi2 = __H2F(quintupletsInGPU.phi[jx]); float dEta = alpaka::math::abs(acc, eta1 - eta2); - float dPhi = lst::calculate_dPhi(phi1, phi2); + float dPhi = calculate_dPhi(phi1, phi2); float score_rphisum2 = __H2F(quintupletsInGPU.score_rphisum[jx]); if (dEta > 0.1f) @@ -192,11 +192,9 @@ namespace lst { } }; - struct removeDupQuintupletsInGPUBeforeTC { + struct RemoveDupQuintupletsInGPUBeforeTC { template - ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Quintuplets quintupletsInGPU, - lst::ObjectRanges rangesInGPU) const { + ALPAKA_FN_ACC void operator()(TAcc const& acc, Quintuplets quintupletsInGPU, ObjectRanges rangesInGPU) const { auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -240,7 +238,7 @@ namespace lst { float score_rphisum2 = __H2F(quintupletsInGPU.score_rphisum[jx]); float dEta = alpaka::math::abs(acc, eta1 - eta2); - float dPhi = lst::calculate_dPhi(phi1, phi2); + float dPhi = calculate_dPhi(phi1, phi2); if (dEta > 0.1f) continue; @@ -267,9 +265,9 @@ namespace lst { } }; - struct removeDupPixelTripletsInGPUFromMap { + struct RemoveDupPixelTripletsInGPUFromMap { template - ALPAKA_FN_ACC void operator()(TAcc const& acc, lst::PixelTriplets pixelTripletsInGPU) const { + ALPAKA_FN_ACC void operator()(TAcc const& acc, PixelTriplets pixelTripletsInGPU) const { auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -304,9 +302,9 @@ namespace lst { } }; - struct removeDupPixelQuintupletsInGPUFromMap { + struct RemoveDupPixelQuintupletsInGPUFromMap { template - ALPAKA_FN_ACC void operator()(TAcc 
const& acc, lst::PixelQuintuplets pixelQuintupletsInGPU) const { + ALPAKA_FN_ACC void operator()(TAcc const& acc, PixelQuintuplets pixelQuintupletsInGPU) const { auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -331,12 +329,9 @@ namespace lst { } }; - struct checkHitspLS { + struct CheckHitspLS { template - ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - lst::Segments segmentsInGPU, - bool secondpass) const { + ALPAKA_FN_ACC void operator()(TAcc const& acc, Modules modulesInGPU, Segments segmentsInGPU, bool secondpass) const { auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -411,7 +406,7 @@ namespace lst { } if (secondpass) { float dEta = alpaka::math::abs(acc, eta_pix1 - eta_pix2); - float dPhi = lst::calculate_dPhi(phi_pix1, phi_pix2); + float dPhi = calculate_dPhi(phi_pix1, phi_pix2); float dR2 = dEta * dEta + dPhi * dPhi; if ((npMatched >= 1) || (dR2 < 1e-5f)) { @@ -422,5 +417,5 @@ namespace lst { } } }; -} // namespace lst +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::lst #endif diff --git a/RecoTracker/LSTCore/src/alpaka/LST.dev.cc b/RecoTracker/LSTCore/src/alpaka/LST.dev.cc index 940469e8682a2..e847eb892af8c 100644 --- a/RecoTracker/LSTCore/src/alpaka/LST.dev.cc +++ b/RecoTracker/LSTCore/src/alpaka/LST.dev.cc @@ -1,4 +1,4 @@ -#include "RecoTracker/LSTCore/interface/LST.h" +#include "RecoTracker/LSTCore/interface/alpaka/LST.h" #include "Event.h" @@ -19,26 +19,25 @@ namespace { } } // namespace -template <> -void lst::LST::prepareInput(std::vector const& see_px, - std::vector const& see_py, - std::vector const& see_pz, - std::vector const& see_dxy, - std::vector const& see_dz, - std::vector const& see_ptErr, - std::vector const& see_etaErr, - std::vector const& see_stateTrajGlbX, - std::vector const& see_stateTrajGlbY, - std::vector const& see_stateTrajGlbZ, - std::vector const& see_stateTrajGlbPx, - std::vector 
const& see_stateTrajGlbPy, - std::vector const& see_stateTrajGlbPz, - std::vector const& see_q, - std::vector> const& see_hitIdx, - std::vector const& ph2_detId, - std::vector const& ph2_x, - std::vector const& ph2_y, - std::vector const& ph2_z) { +void ALPAKA_ACCELERATOR_NAMESPACE::lst::LST::prepareInput(std::vector const& see_px, + std::vector const& see_py, + std::vector const& see_pz, + std::vector const& see_dxy, + std::vector const& see_dz, + std::vector const& see_ptErr, + std::vector const& see_etaErr, + std::vector const& see_stateTrajGlbX, + std::vector const& see_stateTrajGlbY, + std::vector const& see_stateTrajGlbZ, + std::vector const& see_stateTrajGlbPx, + std::vector const& see_stateTrajGlbPy, + std::vector const& see_stateTrajGlbPz, + std::vector const& see_q, + std::vector> const& see_hitIdx, + std::vector const& ph2_detId, + std::vector const& ph2_x, + std::vector const& ph2_y, + std::vector const& ph2_z) { unsigned int count = 0; auto n_see = see_stateTrajGlbPx.size(); std::vector px_vec; @@ -212,11 +211,10 @@ void lst::LST::prepareInput(std::vector const& see_px, in_isQuad_vec_ = isQuad_vec; } -template <> -std::vector lst::LST::getHitIdxs(short trackCandidateType, - unsigned int TCIdx, - unsigned int const* TCHitIndices, - unsigned int const* hitIndices) { +std::vector ALPAKA_ACCELERATOR_NAMESPACE::lst::LST::getHitIdxs(short trackCandidateType, + unsigned int TCIdx, + unsigned int const* TCHitIndices, + unsigned int const* hitIndices) { std::vector hits; unsigned int maxNHits = 0; @@ -248,17 +246,17 @@ std::vector lst::LST::getHitIdxs(short trackCandidateType, return hits; } -template <> -void lst::LST::getOutput(lst::Event& event) { +void ALPAKA_ACCELERATOR_NAMESPACE::lst::LST::getOutput(ALPAKA_ACCELERATOR_NAMESPACE::lst::Event& event) { std::vector> tc_hitIdxs; std::vector tc_len; std::vector tc_seedIdx; std::vector tc_trackCandidateType; - lst::HitsBuffer& hitsInGPU = (*event.getHitsInCMSSW()); - lst::TrackCandidates const* trackCandidates = 
event.getTrackCandidatesInCMSSW()->data(); + HitsBuffer& hitsInGPU = (*event.getHitsInCMSSW(false)); // sync on next line + TrackCandidates const* trackCandidates = event.getTrackCandidatesInCMSSW()->data(); unsigned int nTrackCandidates = *trackCandidates->nTrackCandidates; + for (unsigned int idx = 0; idx < nTrackCandidates; idx++) { short trackCandidateType = trackCandidates->trackCandidateType[idx]; std::vector hit_idx = @@ -276,33 +274,31 @@ void lst::LST::getOutput(lst::Event& event) { out_tc_trackCandidateType_ = tc_trackCandidateType; } -template <> -template <> -void lst::LST::run(Queue& queue, - bool verbose, - LSTESData const* deviceESData, - std::vector const& see_px, - std::vector const& see_py, - std::vector const& see_pz, - std::vector const& see_dxy, - std::vector const& see_dz, - std::vector const& see_ptErr, - std::vector const& see_etaErr, - std::vector const& see_stateTrajGlbX, - std::vector const& see_stateTrajGlbY, - std::vector const& see_stateTrajGlbZ, - std::vector const& see_stateTrajGlbPx, - std::vector const& see_stateTrajGlbPy, - std::vector const& see_stateTrajGlbPz, - std::vector const& see_q, - std::vector> const& see_hitIdx, - std::vector const& ph2_detId, - std::vector const& ph2_x, - std::vector const& ph2_y, - std::vector const& ph2_z, - bool no_pls_dupclean, - bool tc_pls_triplets) { - auto event = lst::Event(verbose, queue, deviceESData); +void ALPAKA_ACCELERATOR_NAMESPACE::lst::LST::run(Queue& queue, + bool verbose, + LSTESData const* deviceESData, + std::vector const& see_px, + std::vector const& see_py, + std::vector const& see_pz, + std::vector const& see_dxy, + std::vector const& see_dz, + std::vector const& see_ptErr, + std::vector const& see_etaErr, + std::vector const& see_stateTrajGlbX, + std::vector const& see_stateTrajGlbY, + std::vector const& see_stateTrajGlbZ, + std::vector const& see_stateTrajGlbPx, + std::vector const& see_stateTrajGlbPy, + std::vector const& see_stateTrajGlbPz, + std::vector const& see_q, + 
std::vector> const& see_hitIdx, + std::vector const& ph2_detId, + std::vector const& ph2_x, + std::vector const& ph2_y, + std::vector const& ph2_z, + bool no_pls_dupclean, + bool tc_pls_triplets) { + auto event = ALPAKA_ACCELERATOR_NAMESPACE::lst::Event(verbose, queue, deviceESData); prepareInput(see_px, see_py, see_pz, @@ -344,6 +340,7 @@ void lst::LST::run(Queue& queue, in_isQuad_vec_); event.createMiniDoublets(); if (verbose) { + alpaka::wait(queue); // event calls are asynchronous: wait before printing printf("# of Mini-doublets produced: %d\n", event.getNumberOfMiniDoublets()); printf("# of Mini-doublets produced barrel layer 1: %d\n", event.getNumberOfMiniDoubletsByLayerBarrel(0)); printf("# of Mini-doublets produced barrel layer 2: %d\n", event.getNumberOfMiniDoubletsByLayerBarrel(1)); @@ -360,6 +357,7 @@ void lst::LST::run(Queue& queue, event.createSegmentsWithModuleMap(); if (verbose) { + alpaka::wait(queue); // event calls are asynchronous: wait before printing printf("# of Segments produced: %d\n", event.getNumberOfSegments()); printf("# of Segments produced layer 1-2: %d\n", event.getNumberOfSegmentsByLayerBarrel(0)); printf("# of Segments produced layer 2-3: %d\n", event.getNumberOfSegmentsByLayerBarrel(1)); @@ -375,6 +373,7 @@ void lst::LST::run(Queue& queue, event.createTriplets(); if (verbose) { + alpaka::wait(queue); // event calls are asynchronous: wait before printing printf("# of T3s produced: %d\n", event.getNumberOfTriplets()); printf("# of T3s produced layer 1-2-3: %d\n", event.getNumberOfTripletsByLayerBarrel(0)); printf("# of T3s produced layer 2-3-4: %d\n", event.getNumberOfTripletsByLayerBarrel(1)); @@ -392,6 +391,7 @@ void lst::LST::run(Queue& queue, event.createQuintuplets(); if (verbose) { + alpaka::wait(queue); // event calls are asynchronous: wait before printing printf("# of Quintuplets produced: %d\n", event.getNumberOfQuintuplets()); printf("# of Quintuplets produced layer 1-2-3-4-5-6: %d\n", 
event.getNumberOfQuintupletsByLayerBarrel(0)); printf("# of Quintuplets produced layer 2: %d\n", event.getNumberOfQuintupletsByLayerBarrel(1)); @@ -409,15 +409,20 @@ void lst::LST::run(Queue& queue, event.pixelLineSegmentCleaning(no_pls_dupclean); event.createPixelQuintuplets(); - if (verbose) + if (verbose) { + alpaka::wait(queue); // event calls are asynchronous: wait before printing printf("# of Pixel Quintuplets produced: %d\n", event.getNumberOfPixelQuintuplets()); + } event.createPixelTriplets(); - if (verbose) + if (verbose) { + alpaka::wait(queue); // event calls are asynchronous: wait before printing printf("# of Pixel T3s produced: %d\n", event.getNumberOfPixelTriplets()); + } event.createTrackCandidates(no_pls_dupclean, tc_pls_triplets); if (verbose) { + alpaka::wait(queue); // event calls are asynchronous: wait before printing printf("# of TrackCandidates produced: %d\n", event.getNumberOfTrackCandidates()); printf(" # of Pixel TrackCandidates produced: %d\n", event.getNumberOfPixelTrackCandidates()); printf(" # of pT5 TrackCandidates produced: %d\n", event.getNumberOfPT5TrackCandidates()); @@ -428,5 +433,5 @@ void lst::LST::run(Queue& queue, getOutput(event); - event.resetEvent(); + event.resetEventSync(); } diff --git a/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h b/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h index 86a22d943c33f..335ceeea2ab79 100644 --- a/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h +++ b/RecoTracker/LSTCore/src/alpaka/MiniDoublet.h @@ -10,7 +10,7 @@ #include "Hit.h" #include "ObjectRanges.h" -namespace lst { +namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { struct MiniDoublets { unsigned int* nMemoryLocations; @@ -56,42 +56,42 @@ namespace lst { template void setData(TBuf& buf) { - nMemoryLocations = alpaka::getPtrNative(buf.nMemoryLocations_buf); - anchorHitIndices = alpaka::getPtrNative(buf.anchorHitIndices_buf); - outerHitIndices = alpaka::getPtrNative(buf.outerHitIndices_buf); - moduleIndices = 
alpaka::getPtrNative(buf.moduleIndices_buf); - nMDs = alpaka::getPtrNative(buf.nMDs_buf); - totOccupancyMDs = alpaka::getPtrNative(buf.totOccupancyMDs_buf); - dphichanges = alpaka::getPtrNative(buf.dphichanges_buf); - dzs = alpaka::getPtrNative(buf.dzs_buf); - dphis = alpaka::getPtrNative(buf.dphis_buf); - shiftedXs = alpaka::getPtrNative(buf.shiftedXs_buf); - shiftedYs = alpaka::getPtrNative(buf.shiftedYs_buf); - shiftedZs = alpaka::getPtrNative(buf.shiftedZs_buf); - noShiftedDphis = alpaka::getPtrNative(buf.noShiftedDphis_buf); - noShiftedDphiChanges = alpaka::getPtrNative(buf.noShiftedDphiChanges_buf); - anchorX = alpaka::getPtrNative(buf.anchorX_buf); - anchorY = alpaka::getPtrNative(buf.anchorY_buf); - anchorZ = alpaka::getPtrNative(buf.anchorZ_buf); - anchorRt = alpaka::getPtrNative(buf.anchorRt_buf); - anchorPhi = alpaka::getPtrNative(buf.anchorPhi_buf); - anchorEta = alpaka::getPtrNative(buf.anchorEta_buf); - anchorHighEdgeX = alpaka::getPtrNative(buf.anchorHighEdgeX_buf); - anchorHighEdgeY = alpaka::getPtrNative(buf.anchorHighEdgeY_buf); - anchorLowEdgeX = alpaka::getPtrNative(buf.anchorLowEdgeX_buf); - anchorLowEdgeY = alpaka::getPtrNative(buf.anchorLowEdgeY_buf); - outerX = alpaka::getPtrNative(buf.outerX_buf); - outerY = alpaka::getPtrNative(buf.outerY_buf); - outerZ = alpaka::getPtrNative(buf.outerZ_buf); - outerRt = alpaka::getPtrNative(buf.outerRt_buf); - outerPhi = alpaka::getPtrNative(buf.outerPhi_buf); - outerEta = alpaka::getPtrNative(buf.outerEta_buf); - outerHighEdgeX = alpaka::getPtrNative(buf.outerHighEdgeX_buf); - outerHighEdgeY = alpaka::getPtrNative(buf.outerHighEdgeY_buf); - outerLowEdgeX = alpaka::getPtrNative(buf.outerLowEdgeX_buf); - outerLowEdgeY = alpaka::getPtrNative(buf.outerLowEdgeY_buf); - anchorLowEdgePhi = alpaka::getPtrNative(buf.anchorLowEdgePhi_buf); - anchorHighEdgePhi = alpaka::getPtrNative(buf.anchorHighEdgePhi_buf); + nMemoryLocations = buf.nMemoryLocations_buf.data(); + anchorHitIndices = 
buf.anchorHitIndices_buf.data(); + outerHitIndices = buf.outerHitIndices_buf.data(); + moduleIndices = buf.moduleIndices_buf.data(); + nMDs = buf.nMDs_buf.data(); + totOccupancyMDs = buf.totOccupancyMDs_buf.data(); + dphichanges = buf.dphichanges_buf.data(); + dzs = buf.dzs_buf.data(); + dphis = buf.dphis_buf.data(); + shiftedXs = buf.shiftedXs_buf.data(); + shiftedYs = buf.shiftedYs_buf.data(); + shiftedZs = buf.shiftedZs_buf.data(); + noShiftedDphis = buf.noShiftedDphis_buf.data(); + noShiftedDphiChanges = buf.noShiftedDphiChanges_buf.data(); + anchorX = buf.anchorX_buf.data(); + anchorY = buf.anchorY_buf.data(); + anchorZ = buf.anchorZ_buf.data(); + anchorRt = buf.anchorRt_buf.data(); + anchorPhi = buf.anchorPhi_buf.data(); + anchorEta = buf.anchorEta_buf.data(); + anchorHighEdgeX = buf.anchorHighEdgeX_buf.data(); + anchorHighEdgeY = buf.anchorHighEdgeY_buf.data(); + anchorLowEdgeX = buf.anchorLowEdgeX_buf.data(); + anchorLowEdgeY = buf.anchorLowEdgeY_buf.data(); + outerX = buf.outerX_buf.data(); + outerY = buf.outerY_buf.data(); + outerZ = buf.outerZ_buf.data(); + outerRt = buf.outerRt_buf.data(); + outerPhi = buf.outerPhi_buf.data(); + outerEta = buf.outerEta_buf.data(); + outerHighEdgeX = buf.outerHighEdgeX_buf.data(); + outerHighEdgeY = buf.outerHighEdgeY_buf.data(); + outerLowEdgeX = buf.outerLowEdgeX_buf.data(); + outerLowEdgeY = buf.outerLowEdgeY_buf.data(); + anchorLowEdgePhi = buf.anchorLowEdgePhi_buf.data(); + anchorHighEdgePhi = buf.anchorHighEdgePhi_buf.data(); } }; @@ -181,7 +181,6 @@ namespace lst { outerLowEdgeY_buf(allocBufWrapper(devAccIn, nMemoryLoc, queue)) { alpaka::memset(queue, nMDs_buf, 0u); alpaka::memset(queue, totOccupancyMDs_buf, 0u); - alpaka::wait(queue); } inline MiniDoublets const* data() const { return &data_; } @@ -190,9 +189,9 @@ namespace lst { template ALPAKA_FN_ACC ALPAKA_FN_INLINE void addMDToMemory(TAcc const& acc, - lst::MiniDoublets& mdsInGPU, - lst::Hits const& hitsInGPU, - lst::Modules const& modulesInGPU, + 
MiniDoublets& mdsInGPU, + Hits const& hitsInGPU, + Modules const& modulesInGPU, unsigned int lowerHitIdx, unsigned int upperHitIdx, uint16_t lowerModuleIdx, @@ -210,7 +209,8 @@ namespace lst { mdsInGPU.moduleIndices[idx] = lowerModuleIdx; unsigned int anchorHitIndex, outerHitIndex; - if (modulesInGPU.moduleType[lowerModuleIdx] == PS and modulesInGPU.moduleLayerType[lowerModuleIdx] == Strip) { + if (modulesInGPU.moduleType[lowerModuleIdx] == ::lst::PS and + modulesInGPU.moduleLayerType[lowerModuleIdx] == ::lst::Strip) { mdsInGPU.anchorHitIndices[idx] = upperHitIdx; mdsInGPU.outerHitIndices[idx] = lowerHitIdx; @@ -260,9 +260,9 @@ namespace lst { mdsInGPU.outerHighEdgeY[idx] = hitsInGPU.highEdgeYs[outerHitIndex]; mdsInGPU.outerLowEdgeX[idx] = hitsInGPU.lowEdgeXs[outerHitIndex]; mdsInGPU.outerLowEdgeY[idx] = hitsInGPU.lowEdgeYs[outerHitIndex]; - }; + } - ALPAKA_FN_ACC ALPAKA_FN_INLINE float isTighterTiltedModules(lst::Modules const& modulesInGPU, uint16_t moduleIndex) { + ALPAKA_FN_ACC ALPAKA_FN_INLINE float isTighterTiltedModules(Modules const& modulesInGPU, uint16_t moduleIndex) { // The "tighter" tilted modules are the subset of tilted modules that have smaller spacing // This is the same as what was previously considered as"isNormalTiltedModules" // See Figure 9.1 of https://cds.cern.ch/record/2272264/files/CMS-TDR-014.pdf @@ -271,18 +271,18 @@ namespace lst { short side = modulesInGPU.sides[moduleIndex]; short rod = modulesInGPU.rods[moduleIndex]; - if (subdet == Barrel) { - if ((side != Center and layer == 3) or (side == NegZ and layer == 2 and rod > 5) or - (side == PosZ and layer == 2 and rod < 8) or (side == NegZ and layer == 1 and rod > 9) or - (side == PosZ and layer == 1 and rod < 4)) + if (subdet == ::lst::Barrel) { + if ((side != ::lst::Center and layer == 3) or (side == ::lst::NegZ and layer == 2 and rod > 5) or + (side == ::lst::PosZ and layer == 2 and rod < 8) or (side == ::lst::NegZ and layer == 1 and rod > 9) or + (side == ::lst::PosZ and layer == 1 
and rod < 4)) return true; else return false; } else return false; - }; + } - ALPAKA_FN_ACC ALPAKA_FN_INLINE float moduleGapSize(struct lst::Modules const& modulesInGPU, uint16_t moduleIndex) { + ALPAKA_FN_ACC ALPAKA_FN_INLINE float moduleGapSize(Modules const& modulesInGPU, uint16_t moduleIndex) { float miniDeltaTilted[3] = {0.26f, 0.26f, 0.26f}; float miniDeltaFlat[6] = {0.26f, 0.16f, 0.16f, 0.18f, 0.18f, 0.18f}; float miniDeltaLooseTilted[3] = {0.4f, 0.4f, 0.4f}; @@ -319,11 +319,11 @@ namespace lst { float moduleSeparation = 0; - if (subdet == Barrel and side == Center) { + if (subdet == ::lst::Barrel and side == ::lst::Center) { moduleSeparation = miniDeltaFlat[iL]; } else if (isTighterTiltedModules(modulesInGPU, moduleIndex)) { moduleSeparation = miniDeltaTilted[iL]; - } else if (subdet == Endcap) { + } else if (subdet == ::lst::Endcap) { moduleSeparation = miniDeltaEndcap[iL][iR]; } else //Loose tilted modules { @@ -331,11 +331,11 @@ namespace lst { } return moduleSeparation; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE float dPhiThreshold( - TAcc const& acc, float rt, lst::Modules const& modulesInGPU, uint16_t moduleIndex, float dPhi = 0, float dz = 0) { + TAcc const& acc, float rt, Modules const& modulesInGPU, uint16_t moduleIndex, float dPhi = 0, float dz = 0) { // ================================================================= // Various constants // ================================================================= @@ -348,16 +348,19 @@ namespace lst { unsigned int iL = modulesInGPU.layers[moduleIndex] - 1; const float miniSlope = alpaka::math::asin(acc, alpaka::math::min(acc, rt * k2Rinv1GeVf / ptCut, kSinAlphaMax)); const float rLayNominal = - ((modulesInGPU.subdets[moduleIndex] == Barrel) ? kMiniRminMeanBarrel[iL] : kMiniRminMeanEndcap[iL]); + ((modulesInGPU.subdets[moduleIndex] == ::lst::Barrel) ? 
kMiniRminMeanBarrel[iL] : kMiniRminMeanEndcap[iL]); const float miniPVoff = 0.1f / rLayNominal; - const float miniMuls = ((modulesInGPU.subdets[moduleIndex] == Barrel) ? kMiniMulsPtScaleBarrel[iL] * 3.f / ptCut - : kMiniMulsPtScaleEndcap[iL] * 3.f / ptCut); - const bool isTilted = modulesInGPU.subdets[moduleIndex] == Barrel and modulesInGPU.sides[moduleIndex] != Center; + const float miniMuls = + ((modulesInGPU.subdets[moduleIndex] == ::lst::Barrel) ? kMiniMulsPtScaleBarrel[iL] * 3.f / ptCut + : kMiniMulsPtScaleEndcap[iL] * 3.f / ptCut); + const bool isTilted = + modulesInGPU.subdets[moduleIndex] == ::lst::Barrel and modulesInGPU.sides[moduleIndex] != ::lst::Center; //the lower module is sent in irrespective of its layer type. We need to fetch the drdz properly float drdz; if (isTilted) { - if (modulesInGPU.moduleType[moduleIndex] == PS and modulesInGPU.moduleLayerType[moduleIndex] == Strip) { + if (modulesInGPU.moduleType[moduleIndex] == ::lst::PS and + modulesInGPU.moduleLayerType[moduleIndex] == ::lst::Strip) { drdz = modulesInGPU.drdzs[moduleIndex]; } else { drdz = modulesInGPU.drdzs[modulesInGPU.partnerModuleIndices[moduleIndex]]; @@ -376,12 +379,12 @@ namespace lst { // Return the threshold value // ================================================================= // Following condition is met if the module is central and flatly lying - if (modulesInGPU.subdets[moduleIndex] == Barrel and modulesInGPU.sides[moduleIndex] == Center) { + if (modulesInGPU.subdets[moduleIndex] == ::lst::Barrel and modulesInGPU.sides[moduleIndex] == ::lst::Center) { return miniSlope + alpaka::math::sqrt(acc, miniMuls * miniMuls + miniPVoff * miniPVoff); } // Following condition is met if the module is central and tilted - else if (modulesInGPU.subdets[moduleIndex] == Barrel and - modulesInGPU.sides[moduleIndex] != Center) //all types of tilted modules + else if (modulesInGPU.subdets[moduleIndex] == ::lst::Barrel and + modulesInGPU.sides[moduleIndex] != ::lst::Center) //all types of 
tilted modules { return miniSlope + alpaka::math::sqrt(acc, miniMuls * miniMuls + miniPVoff * miniPVoff + miniTilt2 * miniSlope * miniSlope); @@ -390,11 +393,11 @@ namespace lst { else { return miniSlope + alpaka::math::sqrt(acc, miniMuls * miniMuls + miniPVoff * miniPVoff + miniLum * miniLum); } - }; + } template ALPAKA_FN_INLINE ALPAKA_FN_ACC void shiftStripHits(TAcc const& acc, - lst::Modules const& modulesInGPU, + Modules const& modulesInGPU, uint16_t lowerModuleIndex, uint16_t upperModuleIndex, unsigned int lowerHitIndex, @@ -420,8 +423,8 @@ namespace lst { // lowerModule // lowerHit // upperHit - // lst::endcapGeometry - // lst::tiltedGeometry + // endcapGeometry + // tiltedGeometry // Some variables relevant to the function float xp; // pixel x (pixel hit x) @@ -450,10 +453,11 @@ namespace lst { float absdzprime; // The distance between the two points after shifting const float& drdz_ = modulesInGPU.drdzs[lowerModuleIndex]; // Assign hit pointers based on their hit type - if (modulesInGPU.moduleType[lowerModuleIndex] == PS) { + if (modulesInGPU.moduleType[lowerModuleIndex] == ::lst::PS) { // TODO: This is somewhat of an mystery.... somewhat confused why this is the case - if (modulesInGPU.subdets[lowerModuleIndex] == Barrel ? modulesInGPU.moduleLayerType[lowerModuleIndex] != Pixel - : modulesInGPU.moduleLayerType[lowerModuleIndex] == Pixel) { + if (modulesInGPU.subdets[lowerModuleIndex] == ::lst::Barrel + ? 
modulesInGPU.moduleLayerType[lowerModuleIndex] != ::lst::Pixel + : modulesInGPU.moduleLayerType[lowerModuleIndex] == ::lst::Pixel) { xo = xUpper; yo = yUpper; xp = xLower; @@ -478,7 +482,7 @@ namespace lst { } // If it is endcap some of the math gets simplified (and also computers don't like infinities) - isEndcap = modulesInGPU.subdets[lowerModuleIndex] == Endcap; + isEndcap = modulesInGPU.subdets[lowerModuleIndex] == ::lst::Endcap; // NOTE: TODO: Keep in mind that the sin(atan) function can be simplified to something like x / sqrt(1 + x^2) and similar for cos // I am not sure how slow sin, atan, cos, functions are in c++. If x / sqrt(1 + x^2) are faster change this later to reduce arithmetic computation time @@ -493,14 +497,15 @@ namespace lst { moduleSeparation = moduleGapSize(modulesInGPU, lowerModuleIndex); // Sign flips if the pixel is later layer - if (modulesInGPU.moduleType[lowerModuleIndex] == PS and modulesInGPU.moduleLayerType[lowerModuleIndex] != Pixel) { + if (modulesInGPU.moduleType[lowerModuleIndex] == ::lst::PS and + modulesInGPU.moduleLayerType[lowerModuleIndex] != ::lst::Pixel) { moduleSeparation *= -1; } drprime = (moduleSeparation / alpaka::math::sin(acc, angleA + angleB)) * alpaka::math::sin(acc, angleA); // Compute arctan of the slope and take care of the slope = infinity case - absArctanSlope = ((slope != lst::lst_INF) ? fabs(alpaka::math::atan(acc, slope)) : float(M_PI) / 2.f); + absArctanSlope = ((slope != lst_INF) ? 
fabs(alpaka::math::atan(acc, slope)) : float(M_PI) / 2.f); // Depending on which quadrant the pixel hit lies, we define the angleM by shifting them slightly differently if (xp > 0 and yp > 0) { @@ -524,7 +529,7 @@ namespace lst { // Compute the new strip hit position (if the slope value is in special condition take care of the exceptions) if (slope == - lst::lst_INF) // Designated for tilted module when the slope is exactly infinity (module lying along y-axis) + lst_INF) // Designated for tilted module when the slope is exactly infinity (module lying along y-axis) { xn = xa; // New x point is simply where the anchor is yn = yo; // No shift in y @@ -545,7 +550,7 @@ namespace lst { angleA)); // module separation sign is for shifting in radial direction for z-axis direction take care of the sign later // Depending on which one as closer to the interactin point compute the new z wrt to the pixel properly - if (modulesInGPU.moduleLayerType[lowerModuleIndex] == Pixel) { + if (modulesInGPU.moduleLayerType[lowerModuleIndex] == ::lst::Pixel) { abszn = alpaka::math::abs(acc, zp) + absdzprime; } else { abszn = alpaka::math::abs(acc, zp) - absdzprime; @@ -556,83 +561,11 @@ namespace lst { shiftedCoords[0] = xn; shiftedCoords[1] = yn; shiftedCoords[2] = zn; - }; - - template - ALPAKA_FN_ACC bool runMiniDoubletDefaultAlgo(TAcc const& acc, - lst::Modules const& modulesInGPU, - uint16_t lowerModuleIndex, - uint16_t upperModuleIndex, - unsigned int lowerHitIndex, - unsigned int upperHitIndex, - float& dz, - float& dPhi, - float& dPhiChange, - float& shiftedX, - float& shiftedY, - float& shiftedZ, - float& noShiftedDphi, - float& noShiftedDphiChange, - float xLower, - float yLower, - float zLower, - float rtLower, - float xUpper, - float yUpper, - float zUpper, - float rtUpper) { - if (modulesInGPU.subdets[lowerModuleIndex] == lst::Barrel) { - return runMiniDoubletDefaultAlgoBarrel(acc, - modulesInGPU, - lowerModuleIndex, - upperModuleIndex, - lowerHitIndex, - upperHitIndex, - dz, - 
dPhi, - dPhiChange, - shiftedX, - shiftedY, - shiftedZ, - noShiftedDphi, - noShiftedDphiChange, - xLower, - yLower, - zLower, - rtLower, - xUpper, - yUpper, - zUpper, - rtUpper); - } else { - return runMiniDoubletDefaultAlgoEndcap(acc, - modulesInGPU, - lowerModuleIndex, - upperModuleIndex, - lowerHitIndex, - upperHitIndex, - dz, - dPhi, - dPhiChange, - shiftedX, - shiftedY, - shiftedZ, - noShiftedDphi, - noShiftedDphiChange, - xLower, - yLower, - zLower, - rtLower, - xUpper, - yUpper, - zUpper, - rtUpper); - } - }; + } template ALPAKA_FN_ACC bool runMiniDoubletDefaultAlgoBarrel(TAcc const& acc, - lst::Modules const& modulesInGPU, + Modules const& modulesInGPU, uint16_t lowerModuleIndex, uint16_t upperModuleIndex, unsigned int lowerHitIndex, @@ -654,7 +587,7 @@ namespace lst { float zUpper, float rtUpper) { dz = zLower - zUpper; - const float dzCut = modulesInGPU.moduleType[lowerModuleIndex] == lst::PS ? 2.f : 10.f; + const float dzCut = modulesInGPU.moduleType[lowerModuleIndex] == ::lst::PS ? 2.f : 10.f; const float sign = ((dz > 0) - (dz < 0)) * ((zLower > 0) - (zLower < 0)); const float invertedcrossercut = (alpaka::math::abs(acc, dz) > 2) * sign; @@ -663,7 +596,7 @@ namespace lst { float miniCut = 0; - miniCut = modulesInGPU.moduleLayerType[lowerModuleIndex] == lst::Pixel + miniCut = modulesInGPU.moduleLayerType[lowerModuleIndex] == ::lst::Pixel ? 
dPhiThreshold(acc, rtLower, modulesInGPU, lowerModuleIndex) : dPhiThreshold(acc, rtUpper, modulesInGPU, lowerModuleIndex); @@ -671,7 +604,7 @@ namespace lst { // Ref to original code: https://github.com/slava77/cms-tkph2-ntuple/blob/184d2325147e6930030d3d1f780136bc2dd29ce6/doubletAnalysis.C#L3085 float xn = 0.f, yn = 0.f; // , zn = 0; float shiftedRt2; - if (modulesInGPU.sides[lowerModuleIndex] != Center) // If barrel and not center it is tilted + if (modulesInGPU.sides[lowerModuleIndex] != ::lst::Center) // If barrel and not center it is tilted { // Shift the hits and calculate new xn, yn position float shiftedCoords[3]; @@ -694,27 +627,27 @@ namespace lst { yn = shiftedCoords[1]; // Lower or the upper hit needs to be modified depending on which one was actually shifted - if (modulesInGPU.moduleLayerType[lowerModuleIndex] == lst::Pixel) { + if (modulesInGPU.moduleLayerType[lowerModuleIndex] == ::lst::Pixel) { shiftedX = xn; shiftedY = yn; shiftedZ = zUpper; shiftedRt2 = xn * xn + yn * yn; - dPhi = lst::deltaPhi(acc, xLower, yLower, shiftedX, shiftedY); //function from Hit.cc - noShiftedDphi = lst::deltaPhi(acc, xLower, yLower, xUpper, yUpper); + dPhi = deltaPhi(acc, xLower, yLower, shiftedX, shiftedY); //function from Hit.cc + noShiftedDphi = deltaPhi(acc, xLower, yLower, xUpper, yUpper); } else { shiftedX = xn; shiftedY = yn; shiftedZ = zLower; shiftedRt2 = xn * xn + yn * yn; - dPhi = lst::deltaPhi(acc, shiftedX, shiftedY, xUpper, yUpper); - noShiftedDphi = lst::deltaPhi(acc, xLower, yLower, xUpper, yUpper); + dPhi = deltaPhi(acc, shiftedX, shiftedY, xUpper, yUpper); + noShiftedDphi = deltaPhi(acc, xLower, yLower, xUpper, yUpper); } } else { shiftedX = 0; shiftedY = 0; shiftedZ = 0; - dPhi = lst::deltaPhi(acc, xLower, yLower, xUpper, yUpper); + dPhi = deltaPhi(acc, xLower, yLower, xUpper, yUpper); noShiftedDphi = dPhi; } @@ -723,43 +656,43 @@ namespace lst { // Cut #3: The dphi change going from lower Hit to upper Hit // Ref to original code: 
https://github.com/slava77/cms-tkph2-ntuple/blob/184d2325147e6930030d3d1f780136bc2dd29ce6/doubletAnalysis.C#L3076 - if (modulesInGPU.sides[lowerModuleIndex] != Center) { + if (modulesInGPU.sides[lowerModuleIndex] != ::lst::Center) { // When it is tilted, use the new shifted positions // TODO: This is somewhat of an mystery.... somewhat confused why this is the case - if (modulesInGPU.moduleLayerType[lowerModuleIndex] != lst::Pixel) { + if (modulesInGPU.moduleLayerType[lowerModuleIndex] != ::lst::Pixel) { // dPhi Change should be calculated so that the upper hit has higher rt. // In principle, this kind of check rt_lower < rt_upper should not be necessary because the hit shifting should have taken care of this. // (i.e. the strip hit is shifted to be aligned in the line of sight from interaction point to pixel hit of PS module guaranteeing rt ordering) // But I still placed this check for safety. (TODO: After checking explicitly if not needed remove later?) // setdeltaPhiChange(lowerHit.rt() < upperHitMod.rt() ? lowerHit.deltaPhiChange(upperHitMod) : upperHitMod.deltaPhiChange(lowerHit)); - dPhiChange = (rtLower * rtLower < shiftedRt2) ? lst::deltaPhiChange(acc, xLower, yLower, shiftedX, shiftedY) - : lst::deltaPhiChange(acc, shiftedX, shiftedY, xLower, yLower); - noShiftedDphiChange = rtLower < rtUpper ? lst::deltaPhiChange(acc, xLower, yLower, xUpper, yUpper) - : lst::deltaPhiChange(acc, xUpper, yUpper, xLower, yLower); + dPhiChange = (rtLower * rtLower < shiftedRt2) ? deltaPhiChange(acc, xLower, yLower, shiftedX, shiftedY) + : deltaPhiChange(acc, shiftedX, shiftedY, xLower, yLower); + noShiftedDphiChange = rtLower < rtUpper ? deltaPhiChange(acc, xLower, yLower, xUpper, yUpper) + : deltaPhiChange(acc, xUpper, yUpper, xLower, yLower); } else { // dPhi Change should be calculated so that the upper hit has higher rt. // In principle, this kind of check rt_lower < rt_upper should not be necessary because the hit shifting should have taken care of this. // (i.e. 
the strip hit is shifted to be aligned in the line of sight from interaction point to pixel hit of PS module guaranteeing rt ordering) // But I still placed this check for safety. (TODO: After checking explicitly if not needed remove later?) - dPhiChange = (shiftedRt2 < rtUpper * rtUpper) ? lst::deltaPhiChange(acc, shiftedX, shiftedY, xUpper, yUpper) - : lst::deltaPhiChange(acc, xUpper, yUpper, shiftedX, shiftedY); - noShiftedDphiChange = rtLower < rtUpper ? lst::deltaPhiChange(acc, xLower, yLower, xUpper, yUpper) - : lst::deltaPhiChange(acc, xUpper, yUpper, xLower, yLower); + dPhiChange = (shiftedRt2 < rtUpper * rtUpper) ? deltaPhiChange(acc, shiftedX, shiftedY, xUpper, yUpper) + : deltaPhiChange(acc, xUpper, yUpper, shiftedX, shiftedY); + noShiftedDphiChange = rtLower < rtUpper ? deltaPhiChange(acc, xLower, yLower, xUpper, yUpper) + : deltaPhiChange(acc, xUpper, yUpper, xLower, yLower); } } else { // When it is flat lying module, whichever is the lowerSide will always have rt lower - dPhiChange = lst::deltaPhiChange(acc, xLower, yLower, xUpper, yUpper); + dPhiChange = deltaPhiChange(acc, xLower, yLower, xUpper, yUpper); noShiftedDphiChange = dPhiChange; } return alpaka::math::abs(acc, dPhiChange) < miniCut; - }; + } template ALPAKA_FN_ACC bool runMiniDoubletDefaultAlgoEndcap(TAcc const& acc, - lst::Modules const& modulesInGPU, + Modules const& modulesInGPU, uint16_t lowerModuleIndex, uint16_t upperModuleIndex, unsigned int lowerHitIndex, @@ -793,7 +726,7 @@ namespace lst { return false; // Cut #2 : drt cut. The dz difference can't be larger than 1cm. (max separation is 4mm for modules in the endcap) // Ref to original code: https://github.com/slava77/cms-tkph2-ntuple/blob/184d2325147e6930030d3d1f780136bc2dd29ce6/doubletAnalysis.C#L3100 - const float drtCut = modulesInGPU.moduleType[lowerModuleIndex] == lst::PS ? 2.f : 10.f; + const float drtCut = modulesInGPU.moduleType[lowerModuleIndex] == ::lst::PS ? 
2.f : 10.f; drt = rtLower - rtUpper; if (alpaka::math::abs(acc, drt) >= drtCut) return false; @@ -821,37 +754,37 @@ namespace lst { yn = shiftedCoords[1]; zn = shiftedCoords[2]; - if (modulesInGPU.moduleType[lowerModuleIndex] == lst::PS) { + if (modulesInGPU.moduleType[lowerModuleIndex] == ::lst::PS) { // Appropriate lower or upper hit is modified after checking which one was actually shifted - if (modulesInGPU.moduleLayerType[lowerModuleIndex] == lst::Pixel) { + if (modulesInGPU.moduleLayerType[lowerModuleIndex] == ::lst::Pixel) { shiftedX = xn; shiftedY = yn; shiftedZ = zUpper; - dPhi = lst::deltaPhi(acc, xLower, yLower, shiftedX, shiftedY); - noShiftedDphi = lst::deltaPhi(acc, xLower, yLower, xUpper, yUpper); + dPhi = deltaPhi(acc, xLower, yLower, shiftedX, shiftedY); + noShiftedDphi = deltaPhi(acc, xLower, yLower, xUpper, yUpper); } else { shiftedX = xn; shiftedY = yn; shiftedZ = zLower; - dPhi = lst::deltaPhi(acc, shiftedX, shiftedY, xUpper, yUpper); - noShiftedDphi = lst::deltaPhi(acc, xLower, yLower, xUpper, yUpper); + dPhi = deltaPhi(acc, shiftedX, shiftedY, xUpper, yUpper); + noShiftedDphi = deltaPhi(acc, xLower, yLower, xUpper, yUpper); } } else { shiftedX = xn; shiftedY = yn; shiftedZ = zUpper; - dPhi = lst::deltaPhi(acc, xLower, yLower, xn, yn); - noShiftedDphi = lst::deltaPhi(acc, xLower, yLower, xUpper, yUpper); + dPhi = deltaPhi(acc, xLower, yLower, xn, yn); + noShiftedDphi = deltaPhi(acc, xLower, yLower, xUpper, yUpper); } // dz needs to change if it is a PS module where the strip hits are shifted in order to properly account for the case when a tilted module falls under "endcap logic" // if it was an endcap it will have zero effect - if (modulesInGPU.moduleType[lowerModuleIndex] == lst::PS) { - dz = modulesInGPU.moduleLayerType[lowerModuleIndex] == lst::Pixel ? zLower - zn : zUpper - zn; + if (modulesInGPU.moduleType[lowerModuleIndex] == ::lst::PS) { + dz = modulesInGPU.moduleLayerType[lowerModuleIndex] == ::lst::Pixel ? 
zLower - zn : zUpper - zn; } float miniCut = 0; - miniCut = modulesInGPU.moduleLayerType[lowerModuleIndex] == lst::Pixel + miniCut = modulesInGPU.moduleLayerType[lowerModuleIndex] == ::lst::Pixel ? dPhiThreshold(acc, rtLower, modulesInGPU, lowerModuleIndex, dPhi, dz) : dPhiThreshold(acc, rtUpper, modulesInGPU, lowerModuleIndex, dPhi, dz); @@ -866,15 +799,84 @@ namespace lst { noShiftedDphichange = noShiftedDphi / dzFrac * (1.f + dzFrac); return alpaka::math::abs(acc, dPhiChange) < miniCut; - }; + } + + template + ALPAKA_FN_ACC bool runMiniDoubletDefaultAlgo(TAcc const& acc, + Modules const& modulesInGPU, + uint16_t lowerModuleIndex, + uint16_t upperModuleIndex, + unsigned int lowerHitIndex, + unsigned int upperHitIndex, + float& dz, + float& dPhi, + float& dPhiChange, + float& shiftedX, + float& shiftedY, + float& shiftedZ, + float& noShiftedDphi, + float& noShiftedDphiChange, + float xLower, + float yLower, + float zLower, + float rtLower, + float xUpper, + float yUpper, + float zUpper, + float rtUpper) { + if (modulesInGPU.subdets[lowerModuleIndex] == ::lst::Barrel) { + return runMiniDoubletDefaultAlgoBarrel(acc, + modulesInGPU, + lowerModuleIndex, + upperModuleIndex, + lowerHitIndex, + upperHitIndex, + dz, + dPhi, + dPhiChange, + shiftedX, + shiftedY, + shiftedZ, + noShiftedDphi, + noShiftedDphiChange, + xLower, + yLower, + zLower, + rtLower, + xUpper, + yUpper, + zUpper, + rtUpper); + } else { + return runMiniDoubletDefaultAlgoEndcap(acc, + modulesInGPU, + lowerModuleIndex, + upperModuleIndex, + lowerHitIndex, + upperHitIndex, + dz, + dPhi, + dPhiChange, + shiftedX, + shiftedY, + shiftedZ, + noShiftedDphi, + noShiftedDphiChange, + xLower, + yLower, + zLower, + rtLower, + xUpper, + yUpper, + zUpper, + rtUpper); + } + } - struct createMiniDoubletsInGPUv2 { + struct CreateMiniDoubletsInGPUv2 { template - ALPAKA_FN_ACC void operator()(TAcc const& acc, - struct lst::Modules modulesInGPU, - struct lst::Hits hitsInGPU, - struct lst::MiniDoublets mdsInGPU, - struct 
lst::ObjectRanges rangesInGPU) const { + ALPAKA_FN_ACC void operator()( + TAcc const& acc, Modules modulesInGPU, Hits hitsInGPU, MiniDoublets mdsInGPU, ObjectRanges rangesInGPU) const { auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -932,13 +934,14 @@ namespace lst { rtUpper); if (success) { int totOccupancyMDs = - alpaka::atomicOp(acc, &mdsInGPU.totOccupancyMDs[lowerModuleIndex], 1u); + alpaka::atomicAdd(acc, &mdsInGPU.totOccupancyMDs[lowerModuleIndex], 1u, alpaka::hierarchy::Threads{}); if (totOccupancyMDs >= (rangesInGPU.miniDoubletModuleOccupancy[lowerModuleIndex])) { #ifdef WARNINGS printf("Mini-doublet excess alert! Module index = %d\n", lowerModuleIndex); #endif } else { - int mdModuleIndex = alpaka::atomicOp(acc, &mdsInGPU.nMDs[lowerModuleIndex], 1u); + int mdModuleIndex = + alpaka::atomicAdd(acc, &mdsInGPU.nMDs[lowerModuleIndex], 1u, alpaka::hierarchy::Threads{}); unsigned int mdIndex = rangesInGPU.miniDoubletModuleIndices[lowerModuleIndex] + mdModuleIndex; addMDToMemory(acc, @@ -964,11 +967,13 @@ namespace lst { } }; - struct createMDArrayRangesGPU { + struct CreateMDArrayRangesGPU { template - ALPAKA_FN_ACC void operator()(TAcc const& acc, - struct lst::Modules modulesInGPU, - struct lst::ObjectRanges rangesInGPU) const { + ALPAKA_FN_ACC void operator()(TAcc const& acc, Modules modulesInGPU, ObjectRanges rangesInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -979,10 +984,10 @@ namespace lst { } alpaka::syncBlockThreads(acc); - // Initialize variables outside of the for loop. + // Create variables outside of the for loop. 
int occupancy, category_number, eta_number; - for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) { short module_rings = modulesInGPU.rings[i]; short module_layers = modulesInGPU.layers[i]; short module_subdets = modulesInGPU.subdets[i]; @@ -1041,7 +1046,7 @@ namespace lst { #endif } - unsigned int nTotMDs = alpaka::atomicOp(acc, &nTotalMDs, occupancy); + unsigned int nTotMDs = alpaka::atomicAdd(acc, &nTotalMDs, occupancy, alpaka::hierarchy::Threads{}); rangesInGPU.miniDoubletModuleIndices[i] = nTotMDs; rangesInGPU.miniDoubletModuleOccupancy[i] = occupancy; @@ -1056,17 +1061,18 @@ namespace lst { } }; - struct addMiniDoubletRangesToEventExplicit { + struct AddMiniDoubletRangesToEventExplicit { template - ALPAKA_FN_ACC void operator()(TAcc const& acc, - struct lst::Modules modulesInGPU, - struct lst::MiniDoublets mdsInGPU, - struct lst::ObjectRanges rangesInGPU, - struct lst::Hits hitsInGPU) const { + ALPAKA_FN_ACC void operator()( + TAcc const& acc, Modules modulesInGPU, MiniDoublets mdsInGPU, ObjectRanges rangesInGPU, Hits hitsInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); - for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) { if (mdsInGPU.nMDs[i] == 0 or hitsInGPU.hitRanges[i * 2] == -1) { rangesInGPU.mdRanges[i * 2] = -1; rangesInGPU.mdRanges[i * 2 + 1] = -1; @@ -1077,5 +1083,5 @@ namespace lst { } } }; -} // namespace lst +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::lst #endif diff --git a/RecoTracker/LSTCore/src/alpaka/NeuralNetwork.h 
b/RecoTracker/LSTCore/src/alpaka/NeuralNetwork.h index b337b5f83f8ba..85b7b08dc075b 100644 --- a/RecoTracker/LSTCore/src/alpaka/NeuralNetwork.h +++ b/RecoTracker/LSTCore/src/alpaka/NeuralNetwork.h @@ -10,153 +10,156 @@ #include "Hit.h" #include "Triplet.h" -namespace lst::t5dnn { +namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE float runInference(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, - lst::Triplets const& tripletsInGPU, - const float* xVec, - const float* yVec, - const unsigned int* mdIndices, - const uint16_t* lowerModuleIndices, - unsigned int innerTripletIndex, - unsigned int outerTripletIndex, - float innerRadius, - float outerRadius, - float bridgeRadius) { - // Unpack x-coordinates of hits - float x1 = xVec[0]; - float x2 = xVec[1]; - float x3 = xVec[2]; - float x4 = xVec[3]; - float x5 = xVec[4]; - // Unpack y-coordinates of hits - float y1 = yVec[0]; - float y2 = yVec[1]; - float y3 = yVec[2]; - float y4 = yVec[3]; - float y5 = yVec[4]; - // Unpack module indices - unsigned int mdIndex1 = mdIndices[0]; - unsigned int mdIndex2 = mdIndices[1]; - unsigned int mdIndex3 = mdIndices[2]; - unsigned int mdIndex4 = mdIndices[3]; - unsigned int mdIndex5 = mdIndices[4]; - // Unpack module indices - uint16_t lowerModuleIndex1 = lowerModuleIndices[0]; - uint16_t lowerModuleIndex2 = lowerModuleIndices[1]; - uint16_t lowerModuleIndex3 = lowerModuleIndices[2]; - uint16_t lowerModuleIndex4 = lowerModuleIndices[3]; - uint16_t lowerModuleIndex5 = lowerModuleIndices[4]; - // Compute some convenience variables - short layer2_adjustment = 0; - if (modulesInGPU.layers[lowerModuleIndex1] == 1) { - layer2_adjustment = 1; // get upper segment to be in second layer - } - unsigned int md_idx_for_t5_eta_phi = - segmentsInGPU.mdIndices[2 * tripletsInGPU.segmentIndices[2 * innerTripletIndex + layer2_adjustment]]; - bool is_endcap1 = 
(modulesInGPU.subdets[lowerModuleIndex1] == 4); // true if anchor hit 1 is in the endcap - bool is_endcap2 = (modulesInGPU.subdets[lowerModuleIndex2] == 4); // true if anchor hit 2 is in the endcap - bool is_endcap3 = (modulesInGPU.subdets[lowerModuleIndex3] == 4); // true if anchor hit 3 is in the endcap - bool is_endcap4 = (modulesInGPU.subdets[lowerModuleIndex4] == 4); // true if anchor hit 4 is in the endcap - bool is_endcap5 = (modulesInGPU.subdets[lowerModuleIndex5] == 4); // true if anchor hit 5 is in the endcap + namespace t5dnn { + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE float runInference(TAcc const& acc, + Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, + Triplets const& tripletsInGPU, + const float* xVec, + const float* yVec, + const unsigned int* mdIndices, + const uint16_t* lowerModuleIndices, + unsigned int innerTripletIndex, + unsigned int outerTripletIndex, + float innerRadius, + float outerRadius, + float bridgeRadius) { + // Unpack x-coordinates of hits + float x1 = xVec[0]; + float x2 = xVec[1]; + float x3 = xVec[2]; + float x4 = xVec[3]; + float x5 = xVec[4]; + // Unpack y-coordinates of hits + float y1 = yVec[0]; + float y2 = yVec[1]; + float y3 = yVec[2]; + float y4 = yVec[3]; + float y5 = yVec[4]; + // Unpack module indices + unsigned int mdIndex1 = mdIndices[0]; + unsigned int mdIndex2 = mdIndices[1]; + unsigned int mdIndex3 = mdIndices[2]; + unsigned int mdIndex4 = mdIndices[3]; + unsigned int mdIndex5 = mdIndices[4]; + // Unpack module indices + uint16_t lowerModuleIndex1 = lowerModuleIndices[0]; + uint16_t lowerModuleIndex2 = lowerModuleIndices[1]; + uint16_t lowerModuleIndex3 = lowerModuleIndices[2]; + uint16_t lowerModuleIndex4 = lowerModuleIndices[3]; + uint16_t lowerModuleIndex5 = lowerModuleIndices[4]; + // Compute some convenience variables + short layer2_adjustment = 0; + if (modulesInGPU.layers[lowerModuleIndex1] == 1) { + layer2_adjustment = 1; // get upper segment to be in 
second layer + } + unsigned int md_idx_for_t5_eta_phi = + segmentsInGPU.mdIndices[2 * tripletsInGPU.segmentIndices[2 * innerTripletIndex + layer2_adjustment]]; + bool is_endcap1 = (modulesInGPU.subdets[lowerModuleIndex1] == 4); // true if anchor hit 1 is in the endcap + bool is_endcap2 = (modulesInGPU.subdets[lowerModuleIndex2] == 4); // true if anchor hit 2 is in the endcap + bool is_endcap3 = (modulesInGPU.subdets[lowerModuleIndex3] == 4); // true if anchor hit 3 is in the endcap + bool is_endcap4 = (modulesInGPU.subdets[lowerModuleIndex4] == 4); // true if anchor hit 4 is in the endcap + bool is_endcap5 = (modulesInGPU.subdets[lowerModuleIndex5] == 4); // true if anchor hit 5 is in the endcap - // Build DNN input vector (corresponding output N-tuple branch noted in parenthetical in comment) - float x[38] = { - alpaka::math::log10(acc, 2 * lst::k2Rinv1GeVf * innerRadius), // inner T3 pT (t3_pt) - mdsInGPU.anchorEta[mdIndex1], // inner T3 anchor hit 1 eta (t3_0_eta) - mdsInGPU.anchorPhi[mdIndex1], // inner T3 anchor hit 1 phi (t3_0_phi) - mdsInGPU.anchorZ[mdIndex1], // inner T3 anchor hit 1 z (t3_0_z) - alpaka::math::sqrt(acc, x1 * x1 + y1 * y1), // inner T3 anchor hit 1 r (t3_0_r) - float(modulesInGPU.layers[lowerModuleIndex1] + 6 * is_endcap1), // inner T3 anchor hit 1 layer (t3_0_layer) - mdsInGPU.anchorEta[mdIndex2], // inner T3 anchor hit 2 eta (t3_2_eta) - mdsInGPU.anchorPhi[mdIndex2], // inner T3 anchor hit 2 phi (t3_2_phi) - mdsInGPU.anchorZ[mdIndex2], // inner T3 anchor hit 2 z (t3_2_z) - alpaka::math::sqrt(acc, x2 * x2 + y2 * y2), // inner T3 anchor hit 2 r (t3_2_r) - float(modulesInGPU.layers[lowerModuleIndex2] + 6 * is_endcap2), // inner T3 anchor hit 2 layer (t3_2_layer) - mdsInGPU.anchorEta[mdIndex3], // inner T3 anchor hit 3 eta (t3_4_eta) - mdsInGPU.anchorPhi[mdIndex3], // inner T3 anchor hit 3 phi (t3_4_phi) - mdsInGPU.anchorZ[mdIndex3], // inner T3 anchor hit 3 z (t3_4_z) - alpaka::math::sqrt(acc, x3 * x3 + y3 * y3), // inner T3 anchor hit 3 r 
(t3_4_r) - float(modulesInGPU.layers[lowerModuleIndex3] + 6 * is_endcap3), // inner T3 anchor hit 3 layer (t3_4_layer) - alpaka::math::log10(acc, 2 * lst::k2Rinv1GeVf * outerRadius), // outer T3 pT (t3_pt) - mdsInGPU.anchorEta[mdIndex3], // outer T3 anchor hit 4 eta (t3_0_eta) - mdsInGPU.anchorPhi[mdIndex3], // outer T3 anchor hit 4 phi (t3_0_phi) - mdsInGPU.anchorZ[mdIndex3], // outer T3 anchor hit 3 eta (t3_0_z) - alpaka::math::sqrt(acc, x3 * x3 + y3 * y3), // outer T3 anchor hit 3 r (t3_0_r) - float(modulesInGPU.layers[lowerModuleIndex3] + 6 * is_endcap3), // outer T3 anchor hit 3 layer (t3_0_layer) - mdsInGPU.anchorEta[mdIndex4], // outer T3 anchor hit 4 eta (t3_2_eta) - mdsInGPU.anchorPhi[mdIndex4], // outer T3 anchor hit 4 phi (t3_2_phi) - mdsInGPU.anchorZ[mdIndex4], // outer T3 anchor hit 4 z (t3_2_z) - alpaka::math::sqrt(acc, x4 * x4 + y4 * y4), // outer T3 anchor hit 4 r (t3_2_r) - float(modulesInGPU.layers[lowerModuleIndex4] + 6 * is_endcap4), // outer T3 anchor hit 4 layer (t3_2_layer) - mdsInGPU.anchorEta[mdIndex5], // outer T3 anchor hit 5 eta (t3_4_eta) - mdsInGPU.anchorPhi[mdIndex5], // outer T3 anchor hit 5 phi (t3_4_phi) - mdsInGPU.anchorZ[mdIndex5], // outer T3 anchor hit 5 z (t3_4_z) - alpaka::math::sqrt(acc, x5 * x5 + y5 * y5), // outer T3 anchor hit 5 r (t3_4_r) - float(modulesInGPU.layers[lowerModuleIndex5] + 6 * is_endcap5), // outer T3 anchor hit 5 layer (t3_4_layer) - alpaka::math::log10(acc, (innerRadius + outerRadius) * lst::k2Rinv1GeVf), // T5 pT (t5_pt) - mdsInGPU.anchorEta[md_idx_for_t5_eta_phi], // T5 eta (t5_eta) - mdsInGPU.anchorPhi[md_idx_for_t5_eta_phi], // T5 phi (t5_phi) - alpaka::math::log10(acc, innerRadius), // T5 inner radius (t5_innerRadius) - alpaka::math::log10(acc, bridgeRadius), // T5 bridge radius (t5_bridgeRadius) - alpaka::math::log10(acc, outerRadius) // T5 outer radius (t5_outerRadius) - }; + // Build DNN input vector (corresponding output N-tuple branch noted in parenthetical in comment) + float x[38] = { + 
alpaka::math::log10(acc, 2 * k2Rinv1GeVf * innerRadius), // inner T3 pT (t3_pt) + mdsInGPU.anchorEta[mdIndex1], // inner T3 anchor hit 1 eta (t3_0_eta) + mdsInGPU.anchorPhi[mdIndex1], // inner T3 anchor hit 1 phi (t3_0_phi) + mdsInGPU.anchorZ[mdIndex1], // inner T3 anchor hit 1 z (t3_0_z) + alpaka::math::sqrt(acc, x1 * x1 + y1 * y1), // inner T3 anchor hit 1 r (t3_0_r) + float(modulesInGPU.layers[lowerModuleIndex1] + 6 * is_endcap1), // inner T3 anchor hit 1 layer (t3_0_layer) + mdsInGPU.anchorEta[mdIndex2], // inner T3 anchor hit 2 eta (t3_2_eta) + mdsInGPU.anchorPhi[mdIndex2], // inner T3 anchor hit 2 phi (t3_2_phi) + mdsInGPU.anchorZ[mdIndex2], // inner T3 anchor hit 2 z (t3_2_z) + alpaka::math::sqrt(acc, x2 * x2 + y2 * y2), // inner T3 anchor hit 2 r (t3_2_r) + float(modulesInGPU.layers[lowerModuleIndex2] + 6 * is_endcap2), // inner T3 anchor hit 2 layer (t3_2_layer) + mdsInGPU.anchorEta[mdIndex3], // inner T3 anchor hit 3 eta (t3_4_eta) + mdsInGPU.anchorPhi[mdIndex3], // inner T3 anchor hit 3 phi (t3_4_phi) + mdsInGPU.anchorZ[mdIndex3], // inner T3 anchor hit 3 z (t3_4_z) + alpaka::math::sqrt(acc, x3 * x3 + y3 * y3), // inner T3 anchor hit 3 r (t3_4_r) + float(modulesInGPU.layers[lowerModuleIndex3] + 6 * is_endcap3), // inner T3 anchor hit 3 layer (t3_4_layer) + alpaka::math::log10(acc, 2 * k2Rinv1GeVf * outerRadius), // outer T3 pT (t3_pt) + mdsInGPU.anchorEta[mdIndex3], // outer T3 anchor hit 4 eta (t3_0_eta) + mdsInGPU.anchorPhi[mdIndex3], // outer T3 anchor hit 4 phi (t3_0_phi) + mdsInGPU.anchorZ[mdIndex3], // outer T3 anchor hit 3 eta (t3_0_z) + alpaka::math::sqrt(acc, x3 * x3 + y3 * y3), // outer T3 anchor hit 3 r (t3_0_r) + float(modulesInGPU.layers[lowerModuleIndex3] + 6 * is_endcap3), // outer T3 anchor hit 3 layer (t3_0_layer) + mdsInGPU.anchorEta[mdIndex4], // outer T3 anchor hit 4 eta (t3_2_eta) + mdsInGPU.anchorPhi[mdIndex4], // outer T3 anchor hit 4 phi (t3_2_phi) + mdsInGPU.anchorZ[mdIndex4], // outer T3 anchor hit 4 z (t3_2_z) + 
alpaka::math::sqrt(acc, x4 * x4 + y4 * y4), // outer T3 anchor hit 4 r (t3_2_r) + float(modulesInGPU.layers[lowerModuleIndex4] + 6 * is_endcap4), // outer T3 anchor hit 4 layer (t3_2_layer) + mdsInGPU.anchorEta[mdIndex5], // outer T3 anchor hit 5 eta (t3_4_eta) + mdsInGPU.anchorPhi[mdIndex5], // outer T3 anchor hit 5 phi (t3_4_phi) + mdsInGPU.anchorZ[mdIndex5], // outer T3 anchor hit 5 z (t3_4_z) + alpaka::math::sqrt(acc, x5 * x5 + y5 * y5), // outer T3 anchor hit 5 r (t3_4_r) + float(modulesInGPU.layers[lowerModuleIndex5] + 6 * is_endcap5), // outer T3 anchor hit 5 layer (t3_4_layer) + alpaka::math::log10(acc, (innerRadius + outerRadius) * k2Rinv1GeVf), // T5 pT (t5_pt) + mdsInGPU.anchorEta[md_idx_for_t5_eta_phi], // T5 eta (t5_eta) + mdsInGPU.anchorPhi[md_idx_for_t5_eta_phi], // T5 phi (t5_phi) + alpaka::math::log10(acc, innerRadius), // T5 inner radius (t5_innerRadius) + alpaka::math::log10(acc, bridgeRadius), // T5 bridge radius (t5_bridgeRadius) + alpaka::math::log10(acc, outerRadius) // T5 outer radius (t5_outerRadius) + }; - // (0): Linear(in_features=38, out_features=32, bias=True) => x = x*W_T + b - float x_0[32]; - for (unsigned int col = 0; col < 32; ++col) { - x_0[col] = 0; - for (unsigned int inner = 0; inner < 38; ++inner) { - x_0[col] += x[inner] * wgtT_0[inner][col]; + // (0): Linear(in_features=38, out_features=32, bias=True) => x = x*W_T + b + float x_0[32]; + for (unsigned int col = 0; col < 32; ++col) { + x_0[col] = 0; + for (unsigned int inner = 0; inner < 38; ++inner) { + x_0[col] += x[inner] * wgtT_0[inner][col]; + } + x_0[col] += bias_0[col]; } - x_0[col] += bias_0[col]; - } - // (1): ReLU() - float x_1[32]; - for (unsigned int col = 0; col < 32; ++col) { - x_1[col] = (x_0[col] > 0.f) ? x_0[col] : 0.f; - } + // (1): ReLU() + float x_1[32]; + for (unsigned int col = 0; col < 32; ++col) { + x_1[col] = (x_0[col] > 0.f) ? 
x_0[col] : 0.f; + } - // (2): Linear(in_features=32, out_features=32, bias=True) => x = x*W_T + b - float x_2[32]; - for (unsigned int col = 0; col < 32; ++col) { - x_2[col] = 0; - for (unsigned int inner = 0; inner < 32; ++inner) { - x_2[col] += x_1[inner] * wgtT_2[inner][col]; + // (2): Linear(in_features=32, out_features=32, bias=True) => x = x*W_T + b + float x_2[32]; + for (unsigned int col = 0; col < 32; ++col) { + x_2[col] = 0; + for (unsigned int inner = 0; inner < 32; ++inner) { + x_2[col] += x_1[inner] * wgtT_2[inner][col]; + } + x_2[col] += bias_2[col]; } - x_2[col] += bias_2[col]; - } - // (3): ReLU() - float x_3[32]; - for (unsigned int col = 0; col < 32; ++col) { - x_3[col] = (x_2[col] > 0.f) ? x_2[col] : 0.f; - } + // (3): ReLU() + float x_3[32]; + for (unsigned int col = 0; col < 32; ++col) { + x_3[col] = (x_2[col] > 0.f) ? x_2[col] : 0.f; + } - // (4): Linear(in_features=32, out_features=1, bias=True) => x = x*W_T + b - float x_4[1]; - for (unsigned int col = 0; col < 1; ++col) { - x_4[col] = 0; - for (unsigned int inner = 0; inner < 32; ++inner) { - x_4[col] += x_3[inner] * wgtT_4[inner][col]; + // (4): Linear(in_features=32, out_features=1, bias=True) => x = x*W_T + b + float x_4[1]; + for (unsigned int col = 0; col < 1; ++col) { + x_4[col] = 0; + for (unsigned int inner = 0; inner < 32; ++inner) { + x_4[col] += x_3[inner] * wgtT_4[inner][col]; + } + x_4[col] += bias_4[col]; } - x_4[col] += bias_4[col]; - } - // (5): Sigmoid() - float x_5[1]; - for (unsigned int col = 0; col < 1; ++col) { - x_5[col] = alpaka::math::exp(acc, x_4[col]) / (alpaka::math::exp(acc, x_4[col]) + 1); - } + // (5): Sigmoid() + float x_5[1]; + for (unsigned int col = 0; col < 1; ++col) { + x_5[col] = alpaka::math::exp(acc, x_4[col]) / (alpaka::math::exp(acc, x_4[col]) + 1); + } - return x_5[0]; - } + return x_5[0]; + } -} //namespace lst::t5dnn + } // namespace t5dnn +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::lst #endif diff --git 
a/RecoTracker/LSTCore/src/alpaka/NeuralNetworkWeights.h b/RecoTracker/LSTCore/src/alpaka/NeuralNetworkWeights.h index d7b2f03937bdb..d5321fea07a6e 100644 --- a/RecoTracker/LSTCore/src/alpaka/NeuralNetworkWeights.h +++ b/RecoTracker/LSTCore/src/alpaka/NeuralNetworkWeights.h @@ -3,311 +3,313 @@ #include -namespace lst::t5dnn { +namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { + namespace t5dnn { - ALPAKA_STATIC_ACC_MEM_GLOBAL const float bias_0[32] = { - -4.5069356f, -5.8842053f, 1.0793180f, -0.1540973f, -0.4705772f, 6.4027028f, -0.6620818f, -7.0734525f, - 0.6211641f, 4.9630723f, 3.4310920f, -0.8856288f, 4.5843782f, -6.0180559f, 0.0126438f, -1.5725276f, - -0.8549317f, -6.8545237f, -1.2129461f, 3.0617838f, -0.3911322f, 0.0799793f, -2.5398655f, -0.5780622f, - 2.8533990f, -0.1777968f, -2.6457164f, -0.7976936f, 4.5644889f, -2.1747942f, 3.4286616f, -10.1073380f}; - ALPAKA_STATIC_ACC_MEM_GLOBAL const float wgtT_0[38][32] = { - {6.1269712f, -10.6625051f, 17.4907818f, -0.0019928f, -3.4468415f, 1.6674044f, -7.8957767f, 2.2077549f, - 9.5517254f, -5.1345053f, -30.1643391f, 4.0148559f, -19.8330841f, -18.3806915f, 0.1334764f, 1.6213616f, - -4.1423774f, -15.3062429f, -1.0209556f, 1.5580219f, 0.7426265f, 0.0033929f, 1.3924170f, 0.9196110f, - -0.8995734f, 1.0594707f, 39.4390869f, 8.7642002f, 28.4583893f, -5.9235659f, 3.7221889f, 14.4167147f}, - {1.7863803f, -0.6068707f, 0.3166098f, -0.0608759f, 0.5939785f, 0.4870262f, -3.1375074f, -17.7147388f, - -0.7231818f, -9.3808413f, 2.2070611f, 15.7461920f, 0.9355862f, 2.3942475f, -0.0671409f, 3.5954301f, - -3.0463996f, -2.0748904f, -0.5450584f, -4.4800100f, 0.6074556f, -0.0161482f, 3.0624702f, -4.5688419f, - 2.9881518f, -0.3714012f, -0.0387531f, -0.7699140f, 4.4028845f, 5.0333014f, -4.7350726f, -8.6568584f}, - {5.6548429f, -0.0207700f, 0.1785973f, 0.0881671f, 0.2530097f, -0.1893259f, -0.1105739f, -0.5183877f, - 1.0728362f, 0.1833011f, 1.7765219f, 0.3127359f, 0.0455277f, -0.1442616f, -0.1048361f, -0.1235604f, - -0.1217661f, -0.5487315f, 
0.7575656f, -0.1177454f, -17.0993137f, 0.1628031f, 0.2789381f, 0.5304270f, - 0.0837841f, -3.1120780f, 0.0074821f, -0.1648044f, -0.3395336f, 0.3958135f, 0.8718957f, -1.1980486f}, - {0.2401041f, -0.0585765f, -0.0144584f, 0.0411095f, 0.0752229f, 0.0292672f, -0.2437613f, -1.4396472f, - -0.0971315f, -1.7181139f, 0.2417643f, 2.2030578f, 0.0566049f, 0.1081589f, -0.1060181f, 0.3473758f, - -0.7095683f, -0.0345675f, 0.2794849f, -1.1702278f, 0.2622930f, -0.0072611f, 0.5026371f, -1.2882922f, - -0.4712771f, 0.0597130f, -0.0039970f, -0.6050836f, 0.1554724f, 1.0991164f, -0.4975886f, 0.2597970f}, - {0.0766028f, 0.0218421f, -0.1739017f, -0.0076569f, 0.0384461f, -0.1841756f, 0.9677940f, -3.1114254f, - 2.3830564f, 2.0706992f, -0.9643140f, 0.7361387f, -0.0060253f, -0.1554846f, -0.0831100f, 2.8754771f, - -1.4403527f, -0.5281797f, 0.5157787f, 4.2405987f, 0.4807618f, 0.0217647f, -1.2626950f, 0.9145837f, - -0.3931780f, 0.3426280f, -0.0065206f, -0.7510439f, -0.4555758f, 2.7724340f, -1.2173026f, 0.1039017f}, - {0.5685715f, 0.3927337f, 0.4942532f, -0.0671033f, -0.2808350f, -0.0336000f, -1.3983957f, 0.9876546f, - -2.3840380f, 0.7315395f, -2.2009561f, -1.4631602f, -0.4672308f, -0.4994236f, 0.1169335f, -1.1894208f, - -1.2692982f, 0.3303853f, -2.0147655f, -0.9912014f, 1.0042895f, 0.1121151f, -1.0789106f, -2.2821584f, - -6.6459913f, -0.0959398f, -0.0068429f, -2.8177626f, 0.3213172f, -2.6832986f, -4.7613306f, -0.9985733f}, - {1.4419515f, -0.3864825f, -0.6756768f, -0.1273375f, 0.4321181f, 0.3354745f, -0.8236564f, -2.8190827f, - 0.7090831f, 1.9072700f, -3.1834064f, -2.6938572f, 0.5051147f, 1.4382831f, 0.1241910f, -0.7352629f, - 0.7703634f, -1.7556250f, -2.1104112f, 3.0603442f, 1.9873468f, -0.0358815f, -1.0087154f, 3.8253262f, - -0.5466214f, 0.0875162f, 0.2691758f, 0.7121435f, 1.9314718f, -0.1580560f, 3.6484149f, -5.3173709f}, - {6.9104381f, -0.0033664f, -1.4405546f, -0.1768288f, 0.2028089f, -0.1012344f, -4.4735684f, 0.6354278f, - 4.3039737f, 0.2056303f, 1.8338999f, -1.1351355f, 0.1015760f, 
-0.0733253f, -0.0561627f, 2.5292397f, - 1.6314448f, -0.9333628f, -0.7773662f, 0.8313186f, -0.7829623f, 0.1265118f, 0.5922315f, -0.3463379f, - -1.3269740f, -3.3302619f, -0.0061799f, 2.3374722f, 0.0880938f, 0.7470241f, -0.4205743f, -4.7557602f}, - {0.0380794f, 0.0947470f, 0.0419397f, 0.0582226f, -0.0603404f, 0.0234028f, -0.2575402f, 0.4125248f, - 0.3035339f, 0.2663808f, -0.6092452f, -1.4727812f, 0.0247187f, -0.0539688f, -0.0150413f, 0.2094955f, - 0.5379737f, -0.3255228f, -0.5639279f, 0.0786276f, 0.6703192f, 0.1557026f, -0.2753083f, 1.1463971f, - -0.9372965f, 0.5657740f, 0.0041413f, 0.0870248f, 0.0101520f, -0.8214461f, 0.1212932f, 1.5648646f}, - {-0.0969819f, 0.0137566f, 1.3515147f, -0.0155047f, -0.1416170f, -0.1636726f, 0.5184190f, 0.4732984f, - 0.6815788f, -1.0522166f, -0.4486531f, -0.0516016f, 0.0201894f, -0.0849667f, -0.0861271f, -1.2027841f, - 1.2458711f, -0.7061657f, 1.0381308f, -0.3450044f, -0.1300479f, -0.0828402f, 0.6859242f, -1.0575374f, - 0.6947553f, -0.0922188f, 0.0199132f, 0.8038982f, -0.1734094f, -0.1057449f, 1.6305015f, -0.0688597f}, - {-1.8151448f, 0.1024327f, 1.7063105f, 0.1130912f, -0.1081472f, -0.2904744f, -1.3465070f, -1.0455177f, - -0.4581082f, -3.2220871f, 0.5221398f, -5.1637673f, 0.0811146f, -0.1326323f, -0.0379338f, -3.0439703f, - -2.4246936f, -0.3670847f, -3.1256330f, -1.6595014f, -3.4715190f, -0.1526113f, -1.0420206f, 0.9536474f, - -3.2932863f, 1.6048199f, 0.0025162f, -3.6049840f, 0.0604250f, -2.2404826f, 1.8406851f, -3.1381185f}, - {1.2985691f, -1.1044264f, 0.9062797f, -0.0788333f, 0.2694912f, 0.0032800f, -0.0574267f, 0.9734111f, - 1.1532565f, 2.6786125f, -3.8574269f, -2.2871449f, -0.1261243f, 1.0545347f, -0.1454154f, -0.5609738f, - 1.8385800f, -0.8035598f, -1.7668265f, 5.1665063f, 0.7966110f, 0.0940206f, -2.3943975f, 2.3344002f, - 1.0342182f, 0.4806454f, -0.3880928f, 0.6998246f, 1.4011886f, -1.7313483f, 4.9702630f, -6.0058608f}, - {1.0300356f, 0.0616315f, -0.1113776f, -0.1694220f, 0.7159944f, 0.0626456f, 2.0994680f, 0.3452290f, - 
-3.0487001f, 0.0654031f, -1.1510723f, 0.5370992f, -0.0290704f, -0.0300795f, 0.0751569f, -0.2345951f, - -0.3472281f, 0.4424143f, 1.2444530f, -0.2114656f, 0.7865694f, -0.0709381f, -0.1839961f, -0.0529834f, - 0.5867608f, -3.8793530f, -0.0814745f, -0.6368676f, 0.0361213f, -0.5549288f, 0.5661780f, 1.8374584f}, - {0.3345098f, 0.0068199f, -0.4205509f, -0.1088801f, -0.1043202f, -0.0040804f, 0.3400922f, 0.2673528f, - -0.6050695f, 0.4443954f, -0.4319905f, -0.6044132f, -0.0260679f, 0.0137036f, 0.0765494f, -0.0095099f, - 0.5880439f, -0.0083854f, -0.2407522f, 0.1942379f, 0.6554548f, -0.1322891f, -0.8298992f, 0.7909554f, - 1.0528831f, 0.1970959f, 0.0754069f, -0.0947960f, -0.0279494f, -0.5888316f, 0.8919419f, 0.4828835f}, - {0.3995822f, -0.2139665f, 0.3982936f, -0.1285759f, -0.3445527f, -0.1167238f, -0.1263519f, 0.8393803f, - -0.7758383f, 0.0719291f, -0.0134762f, 0.1715237f, 0.0796666f, 0.1023507f, -0.1172728f, -1.2364722f, - 1.2592632f, -0.3168479f, 0.7487004f, -1.5170647f, -0.2235429f, -0.1620898f, 1.4064828f, -1.0821995f, - 0.0740103f, -1.0412805f, -0.0621277f, 0.2439800f, 0.2684972f, -1.1661061f, 0.7859434f, -0.6170313f}, - {2.1615884f, 0.1431713f, 0.0642652f, -0.0522325f, -0.2658786f, -0.0245810f, -1.6857448f, -0.6685011f, - -0.6978170f, -0.8716729f, 0.3129902f, -2.5870812f, -0.2855283f, -0.3205920f, -0.0084069f, 1.3182145f, - -0.6923816f, -0.3730274f, -2.3638811f, -1.1128502f, -2.4709859f, 0.1349022f, -0.3574466f, -0.6597407f, - -4.1122031f, 0.2240651f, 0.1806145f, -1.6836300f, -0.0766231f, -3.2611966f, 0.0091456f, -0.0997367f}, - {5.2476101f, -0.1966512f, 4.8935304f, -0.1551689f, 1.6919724f, -0.8324367f, 14.3318472f, -0.3503132f, - 10.3614969f, -9.1522884f, -0.2543063f, -1.8476851f, 16.7961140f, 9.9541416f, -0.0434563f, -9.6973553f, - -5.0469398f, 6.1688442f, 7.6429725f, -7.3149266f, 1.2345183f, 0.1412155f, 0.7114770f, -1.6378664f, - 5.1548996f, 0.3686100f, -45.3027611f, 3.0492647f, -37.3445892f, 2.7421410f, -2.7958770f, -25.2034016f}, - {1.4597454f, -1.0561740f, 
0.9751291f, 0.0446527f, 0.3691662f, 0.1006782f, 0.1418435f, 0.8871480f, - 1.1603093f, 2.8034730f, -4.0856910f, -1.9786842f, -0.2206208f, 0.9539357f, 0.0868183f, -0.6811873f, - 1.9642411f, -0.8065316f, -2.0244894f, 5.2936082f, 0.6120632f, -0.1194160f, -2.3925939f, 2.5555069f, - 1.0149733f, 0.4607603f, -0.2197217f, 0.5703423f, 1.4049014f, -1.5900208f, 5.1645074f, -6.0569463f}, - {0.9000676f, -0.0028781f, -0.1967366f, 0.1039593f, 0.7993248f, 0.0655172f, 2.2296758f, 0.4391927f, - -3.0292840f, 0.0334536f, -1.1728534f, 0.3479103f, -0.1190938f, 0.0410203f, 0.1146637f, -0.2958017f, - -0.3240463f, 0.4361866f, 1.0564958f, -0.1989332f, 0.5194008f, -0.0628912f, -0.1733121f, -0.1255383f, - 0.5990249f, -3.7692382f, 0.0995128f, -0.7101220f, -0.0785123f, -0.3514554f, 0.6662078f, 2.0991604f}, - {0.1781942f, -0.1873588f, -0.4653996f, -0.0153059f, -0.1399561f, -0.0498718f, 0.4552556f, 0.2300792f, - -0.7682312f, 0.4342302f, -0.3787803f, -0.6089386f, -0.1049337f, 0.0395331f, 0.0220332f, 0.0114750f, - 0.4672548f, 0.1284784f, -0.2472819f, 0.2892784f, 0.4788667f, 0.0472555f, -0.6593549f, 0.6508777f, - 0.9286987f, 0.3043948f, -0.0635985f, 0.0814399f, -0.1168853f, -0.6688027f, 0.8876534f, 0.4865684f}, - {0.4024099f, 0.0480259f, 0.4588822f, -0.1793082f, -0.2151573f, -0.1871128f, -0.1502780f, 1.1011307f, - -0.9467706f, 0.2632496f, -0.1257263f, -0.0241331f, 0.2280627f, 0.0878608f, -0.1334262f, -1.1642927f, - 1.0943586f, -0.4799654f, 0.5981907f, -1.5051398f, -0.4235946f, 0.0012827f, 1.2342577f, -0.8281875f, - 0.2776567f, -1.0362227f, 0.0408372f, 0.1540821f, 0.1777556f, -1.2684357f, 0.8836584f, -0.4001710f}, - {2.1558056f, 0.2082023f, 0.0863442f, 0.0364868f, -0.3985825f, 0.0307202f, -1.8889453f, -0.5614714f, - -0.7311882f, -0.8075573f, 0.4895108f, -2.7770483f, -0.3121874f, -0.1671291f, -0.1281284f, 1.3212786f, - -0.5310181f, -0.1974759f, -2.6240873f, -0.8320529f, -2.3875966f, -0.0286360f, -0.6263188f, -0.6553424f, - -4.1658955f, -0.0601300f, 0.0946256f, -1.6795633f, -0.1251303f, -3.0974686f, 
0.2412274f, -0.0687501f}, - {2.0523887f, -0.6387668f, 2.0633900f, -0.0550964f, 0.5181718f, -0.4202190f, 1.8569367f, 0.8295385f, - 0.8555872f, 2.4727983f, -0.2072828f, -1.9006120f, 0.5379534f, 0.4463673f, 0.1468820f, 0.4918649f, - -3.4016700f, 0.2884440f, -1.9418719f, 4.5157170f, -0.5160927f, -0.0199372f, 3.1353824f, -0.9863126f, - -1.5135859f, 0.7576568f, 0.6715558f, 2.7409093f, 0.9291748f, -0.3247162f, 1.8204515f, -8.9181070f}, - {-0.1428107f, -0.0829889f, 0.4213613f, 0.0225415f, 1.2238166f, 0.0477106f, 0.3031853f, -0.7466553f, - 2.0663500f, 0.7588379f, 0.3689216f, -0.2003786f, 0.1242338f, 0.1693589f, -0.0351716f, -0.0186597f, - -0.0189417f, 0.5468715f, -0.2862698f, -0.1311738f, 3.0747476f, -0.0310747f, 0.0943165f, 0.3139819f, - 0.6274695f, -1.8314874f, 0.0147495f, 0.3554756f, 0.3829916f, 0.4891713f, 0.1328600f, 1.0535098f}, - {0.0534900f, 0.1787969f, -0.0571320f, -0.0685673f, 0.1968977f, 0.0374476f, 0.7876674f, 0.0828491f, - 0.6444036f, -0.2203166f, -0.2383427f, 0.5397566f, 0.0106769f, -0.1230072f, -0.0135021f, -0.5691944f, - -1.5040319f, 0.0406933f, -0.0025478f, 0.9251419f, -1.7180276f, -0.1112956f, 1.4840862f, 0.0407115f, - -0.0100329f, 0.0583593f, -0.0110524f, 0.7431355f, -0.0971857f, -0.5501527f, -0.6371027f, -0.1935233f}, - {-0.6455778f, 0.2317368f, 0.9285696f, -0.1415854f, 0.0822560f, 0.2488030f, -2.6992166f, 0.0884904f, - 0.6735302f, -0.1467820f, 0.5641044f, 0.6436581f, 0.0818401f, -0.0336634f, -0.0729000f, -0.1206900f, - -2.5739892f, 0.5776953f, 0.9531668f, -1.2362405f, -0.0615577f, -0.0143544f, -2.7525210f, 1.3738545f, - 0.2751348f, -1.7463943f, -0.0020144f, 2.4814103f, 0.1716725f, -0.7055540f, -0.3474010f, 0.4482578f}, - {-0.2526205f, -0.7463821f, -3.6076138f, -0.1511098f, 0.1216256f, 0.0888247f, -1.0190924f, -1.3260181f, - -0.0443211f, -4.8911066f, -3.4385188f, -6.0057454f, 0.3340450f, 0.2997236f, -0.0907855f, 0.7500492f, - -0.4007562f, 1.9382039f, 0.5687234f, 2.6511824f, 4.7703862f, 0.0006749f, -0.0201394f, -3.5885489f, - -4.1518898f, 0.0807014f, 
-0.0584071f, -0.8100027f, 0.7697087f, -0.8038046f, -1.2945876f, -4.0110312f}, - {0.4337017f, -1.1532011f, 2.0740633f, 0.0271806f, 0.6654227f, 0.1012998f, -4.0791736f, 1.2631345f, - 1.9511020f, 2.3272331f, 1.2707534f, 1.6306664f, 0.4936035f, 0.8285242f, 0.0807625f, 3.8652387f, - 0.0281145f, 1.6877037f, 1.2557380f, -0.3036775f, 0.5604967f, 0.1551418f, -0.9599600f, -6.3067718f, - -0.6352320f, 0.8058553f, 0.3657880f, -2.0491202f, -0.3926269f, 2.5650854f, 1.3697821f, -8.3070078f}, - {5.1334143f, -0.0351738f, -0.4774780f, -0.0679726f, 1.4569254f, 0.0580191f, -0.3649136f, -0.2298838f, - -3.3826666f, -0.7392708f, -0.6036060f, -0.2612940f, -0.1877640f, -0.1145124f, -0.0042578f, -0.0311193f, - -0.0320479f, 0.5270581f, -0.4324475f, 0.2681437f, 4.7813129f, -0.0222701f, -0.0525629f, -0.2861001f, - -0.1251072f, 3.9112861f, 0.0045046f, -0.0426071f, -0.3299106f, -0.0686970f, -0.1602017f, -0.0070103f}, - {-0.6633690f, 0.0103367f, 0.5998458f, 0.1256577f, -0.0359184f, -0.0176820f, -0.6458368f, -0.0370536f, - 0.3542259f, 0.1394724f, 0.8255956f, 0.2501569f, 0.0320156f, -0.0256806f, 0.0277949f, 0.0036392f, - 0.2825173f, 0.1400358f, 1.0011463f, -0.6792242f, 0.0672508f, 0.0728705f, -0.1089695f, -1.0414587f, - -0.4135485f, 0.4293025f, -0.0041241f, -0.9564193f, 0.0314900f, 0.8658463f, -0.7734696f, -0.7610567f}, - {-0.0200122f, -0.0749178f, -1.5026549f, -0.0387432f, -0.0713735f, 0.1214790f, 1.8730290f, -0.0552839f, - -1.6867150f, 0.2282097f, 0.7161849f, -0.1018546f, -0.1092003f, 0.0365504f, -0.1326883f, 1.2310545f, - 0.1800210f, 0.7024739f, -2.9606545f, 1.2275347f, -0.2050014f, 0.0940569f, 0.4761694f, 0.8812068f, - -0.0083424f, -1.5406264f, 0.0061815f, -2.7606382f, 0.0248556f, 1.1086880f, -1.3608936f, 1.0795454f}, - {0.9734020f, 0.3905411f, -3.7008634f, 0.0013557f, 0.1649124f, 0.9935362f, 1.3489184f, 0.9505764f, - 0.7966231f, -0.1627246f, -2.5754328f, 1.4892205f, 0.8586300f, 0.6974363f, 0.1320204f, -0.7840260f, - 0.3121157f, 0.0966901f, 2.7447381f, 1.8256680f, 0.7229405f, -0.1723188f, 
0.9145948f, -2.1376033f, - 0.5259342f, 0.0731194f, -0.2908303f, -0.2603913f, -0.2326528f, 3.6684167f, -0.2883157f, -2.8546307f}, - {-4.8917460f, 6.7944999f, -0.2255474f, 0.1051999f, 3.9000113f, 2.0624907f, 5.3019547f, 10.0209141f, - 1.1268179f, 2.2669628f, -6.5002980f, 1.8408583f, 5.3039579f, 2.2055962f, 0.1055369f, 1.7230233f, - 6.9605255f, 7.7025104f, 2.9880707f, -0.9274251f, -0.2287160f, -0.0206735f, 0.6885675f, 2.8179996f, - -7.1129837f, -1.3772345f, 3.8655453f, -5.9388318f, -0.0469947f, 7.2763596f, -6.3536129f, -17.0069847f}, - {1.8787041f, -0.9953383f, -1.4839923f, 0.1308209f, 0.3657510f, 0.3106483f, -1.4158971f, -6.7449651f, - 0.6553892f, -4.5046172f, -3.5489719f, 3.5363002f, 0.5454772f, 2.3521471f, 0.1612140f, -0.9744226f, - 0.6546553f, -2.7179255f, -1.7758157f, 0.3089439f, 1.7462813f, 0.1654593f, -0.2440207f, 3.9501827f, - 1.3750844f, 0.0596805f, -0.1977254f, 0.0264880f, 2.6396444f, 1.0816911f, 3.6413448f, -6.0299959f}, - {-4.1295738f, 0.1044480f, 0.2131937f, 0.0420826f, 0.5292229f, 0.0090477f, -0.0973486f, 0.9596778f, - 2.9579651f, -0.6364226f, -1.7556342f, 0.1539868f, -0.1273174f, -0.1348504f, 0.1257833f, -1.4168571f, - -1.0960362f, 0.0482449f, -1.4395387f, -0.2524115f, -2.9162085f, -0.0451428f, -0.4021681f, -0.5756381f, - 0.0515293f, -3.1996479f, -0.0007676f, -1.3878343f, -0.2864279f, -0.9579773f, -1.0999249f, 1.6500067f}, - {-2.4806111f, -6.8115449f, 3.2805641f, 0.1187415f, -0.9950783f, 6.2553434f, -1.6450261f, -6.1463733f, - 2.7507148f, 4.2995782f, 0.0461297f, -0.5417359f, 2.4306326f, -7.3530145f, 0.0698273f, -0.9394333f, - -1.3595498f, -7.5141478f, -1.4911395f, 3.2300410f, 0.1203540f, 0.0314884f, -2.0116949f, -0.8167119f, - 2.4133310f, 0.1920709f, 1.0619365f, 0.2459123f, 6.9166069f, -2.6384118f, 3.6829739f, -7.2385545f}, - {0.9408096f, 14.9067144f, 1.7709646f, 0.1105646f, -0.5600107f, -15.3188124f, -12.3718462f, -1.8893757f, - 13.6364670f, -5.7327847f, -14.1805468f, 1.0581509f, -14.2186184f, 14.8948650f, 0.0190344f, 5.4395180f, - 6.7243400f, 
9.8468456f, 4.5144215f, -1.4551491f, 1.1032411f, -0.0317988f, 2.3398454f, -3.1671596f, - -7.7541409f, 1.1255593f, 6.7340465f, -4.4448423f, -9.1472626f, -3.1959128f, 4.4181323f, -2.7904994f}, - {-2.1621978f, -4.7202382f, 1.7378219f, 0.1417439f, -0.5000908f, 5.4468708f, 1.4260571f, -6.6136570f, - 1.5713804f, 3.4479704f, 2.7354901f, -0.7388076f, 5.4666147f, -3.8697338f, -0.1368596f, -2.7903373f, - -1.2043713f, -4.9554005f, 0.3324645f, 1.6767365f, 0.1156244f, -0.0326964f, -2.0945346f, -0.4590589f, - 3.0942657f, 0.0015020f, -6.2626700f, -0.3969755f, 0.7717427f, -1.9667094f, 2.9664171f, -11.9477053f}, - }; - ALPAKA_STATIC_ACC_MEM_GLOBAL const float bias_2[32] = { - 9.8383608f, 3.6922295f, 3.5774977f, -4.4619012f, 6.5087032f, -0.9540017f, -0.5059246f, 0.0706402f, - 14.3396597f, -0.2771132f, -4.8409863f, -8.3581600f, -3.5078344f, 4.3287506f, -5.7808843f, 3.9264839f, - -2.1697845f, -0.0040514f, -0.2095029f, -6.8678174f, 1.7911285f, -0.4510343f, 1.2410443f, -4.5678806f, - -0.5693849f, 2.3320096f, 4.4606552f, -6.3771009f, -4.3149071f, -0.1905672f, -3.5726390f, -1.0744030f}; - ALPAKA_STATIC_ACC_MEM_GLOBAL const float wgtT_2[32][32] = { - {-0.0155548f, 0.0243339f, 0.0037967f, -0.2771824f, 0.0111955f, -0.0115980f, 0.0079653f, -2.9803498f, - -0.0061037f, -0.0956634f, 0.0332446f, 0.0179244f, -0.0080377f, -9.0180779f, 0.1720033f, 0.0350694f, - -0.0146588f, -0.2135506f, -0.3158041f, 1.3697664f, 0.0119146f, 0.0119120f, -0.0986927f, 0.0297492f, - 0.0355827f, -0.1196868f, -0.0745119f, 0.0281862f, -0.0422190f, -0.3069138f, -0.0477367f, -0.0550450f}, - {-1.7374619f, 1.4822800f, -2.1885235f, 1.8354234f, -0.5380136f, 1.6621803f, 0.6251035f, 0.1008954f, - -0.8387129f, -0.2063313f, 1.0661691f, -0.9799694f, -5.1710258f, -3.2260630f, -1.5073707f, -1.0792168f, - 1.8569958f, -0.2289213f, 0.0563821f, -1.6398847f, -4.1649504f, -2.7527378f, -0.0134577f, 3.0424533f, - 0.0364320f, 0.6762254f, -3.1551330f, 2.4888904f, 1.4757305f, -0.3141717f, -2.0126467f, -0.1675602f}, - {-0.9571826f, 0.0914152f, 
0.0404339f, 0.2927902f, 0.2933607f, 0.0619171f, 0.0772318f, -1.3796169f, - -0.8194544f, -0.2179988f, -1.1241078f, -0.1443964f, 0.0559355f, -1.2914546f, -0.3445117f, 0.2031156f, - 0.0273864f, -0.0193422f, -0.2136522f, 0.0429592f, 0.0212854f, 0.0414394f, -1.1734651f, 0.0582848f, - 0.0136039f, -0.1892604f, 0.0764908f, -0.0130132f, -0.1272559f, -0.0818855f, -0.0408583f, -0.1563294f}, - {-0.0213695f, 0.0596942f, -0.0641309f, -0.0146449f, 0.0416586f, -0.0378931f, 0.1234860f, 0.1622967f, - 0.0794091f, -0.0639933f, -0.1030663f, 0.0579078f, 0.1050275f, -0.0136866f, 0.0149978f, 0.0876813f, - 0.0693554f, 0.1612417f, -0.0595916f, -0.1008234f, -0.0579058f, 0.0915138f, 0.1321436f, -0.1484535f, - -0.0920316f, -0.0024532f, -0.1045300f, 0.0924260f, 0.0277524f, -0.0287276f, -0.1271127f, 0.1164243f}, - {0.0713067f, 0.0198056f, -0.3023696f, -0.0025908f, -0.0085885f, -1.1157553f, 0.0236462f, -0.0704844f, - -0.0189257f, -0.0997382f, 0.3379845f, -0.1229390f, -0.0616165f, -0.8968034f, 0.0401445f, -0.1144476f, - -0.0532077f, 0.0604580f, 0.0609454f, -0.1613472f, 0.0103525f, -0.1653874f, 0.0205189f, 0.0758978f, - -0.1514593f, 0.0151441f, 0.2043469f, 0.0349607f, -0.1361278f, -0.1255922f, 0.0631648f, 0.3570991f}, - {0.3371337f, -3.7541580f, 2.2215877f, -0.3390516f, 0.1912718f, -4.1861577f, -1.2264019f, 2.8179801f, - 0.0667294f, -0.0093539f, 2.3029909f, 3.1814916f, 3.9780347f, 0.2310601f, 0.3986159f, -0.8544636f, - 0.4139664f, -0.1876569f, -0.2448732f, -2.8053334f, 4.0488625f, 2.1094146f, -6.7310257f, -4.9950023f, - -0.8315823f, 0.0555959f, 2.4573720f, -3.7234364f, -4.2910552f, -0.2995245f, -3.2605181f, 2.3620574f}, - {-1.5522735f, -0.1866350f, -0.0067679f, 0.3196557f, 1.4052233f, 2.8143549f, -0.9992948f, -0.5309914f, - -25.8852596f, -0.1218249f, 0.6625420f, 0.3007106f, -0.2767264f, -0.1847300f, -0.5313534f, -0.0383462f, - -0.1987552f, 0.0581405f, -0.3376078f, 1.2621028f, 0.0818709f, -0.1401216f, -0.4550788f, -0.1592657f, - 0.0597123f, 0.1344101f, -0.1005317f, -0.1538406f, 2.9142656f, 
-0.0806051f, -0.4267367f, -31.9512234f}, - {0.6859627f, 0.1212986f, 0.1291616f, 0.0459838f, -0.0899920f, 0.0287645f, 0.1987007f, -2.7079368f, - -0.2628384f, -0.1402464f, -0.6302179f, -0.2923960f, -0.1106663f, 0.8256195f, -2.8054097f, -0.0296494f, - -0.5632019f, -0.1335654f, -0.1558440f, -6.8611612f, 0.0203786f, 0.0046566f, -0.4401442f, -0.0471430f, - 0.4535986f, -0.8657981f, 0.0684740f, 0.0518814f, -0.0123748f, -0.2270164f, 0.0922878f, -0.3863277f}, - {0.0127175f, 2.3346109f, -0.4390767f, -0.4657893f, 0.1659466f, -0.1132782f, -0.4928388f, 0.7652873f, - 1.1510741f, -0.0879600f, 0.2721785f, -0.1878961f, -0.3477249f, -0.8473209f, -0.8931856f, -0.4328294f, - -11.9181929f, -0.0282545f, -0.0217915f, 1.6676594f, -0.2122232f, -0.6190930f, 1.9053432f, -0.7592348f, - -1.0739189f, -0.7170524f, 0.3864411f, -0.8849231f, 0.1393488f, 0.0738489f, 0.4460345f, 1.9020857f}, - {0.4453296f, -0.0767821f, 0.1638939f, 1.6997167f, -0.1098599f, -0.0551604f, 0.0040561f, -13.5290670f, - -0.1285677f, -0.0590394f, 0.6499141f, -0.7617344f, 0.0453151f, 0.3104213f, -1.0711143f, 0.1361838f, - -0.4365610f, -0.1300649f, 0.2013344f, -0.5308123f, 0.1451896f, 0.1030715f, -0.6487910f, -0.3136590f, - -0.0280079f, 0.5394178f, 0.1318262f, -0.0159292f, 0.0636870f, -0.3224248f, -0.1868187f, -0.2468304f}, - {-0.0333494f, -0.0834255f, -0.1221875f, 0.6861304f, 0.0521738f, -0.0416543f, -0.4437352f, -19.3246250f, - -0.1520821f, 0.0528602f, -0.6375434f, -0.5803806f, -0.0958465f, -2.0058544f, -0.8282642f, 0.0259000f, - 0.4846996f, 0.1211179f, 0.0356884f, 1.0009497f, 0.0635682f, -0.0314105f, -0.0011147f, 0.0131714f, - -0.3410152f, 0.2798154f, 0.0961889f, 0.1266228f, -0.0934717f, -0.0904307f, 0.1355542f, 0.5722573f}, - {0.2146454f, 0.2143834f, 0.1290650f, -0.9063646f, 0.2100945f, 0.1331054f, -0.2620614f, -0.1264993f, - 0.1313979f, 0.0455465f, -0.8395286f, -0.4967833f, -0.0538581f, 0.9155380f, 0.6627046f, 0.1691243f, - 0.9887002f, -0.1597013f, -0.1236713f, -1.9041336f, 0.0427585f, 0.0849747f, -5.2559652f, -0.3133100f, 
- 0.0141170f, -0.1635530f, 0.4938746f, 0.0162943f, 0.2107756f, -0.3413893f, -0.0657575f, 1.0542560f}, - {-2.8868380f, -2.0837426f, -1.0611480f, -0.6143807f, -0.6398501f, -2.8018746f, 0.5166737f, -1.0814301f, - -1.9272422f, -0.1017482f, -0.4651161f, -1.4021232f, 1.8854499f, 0.1815407f, 0.5965426f, -2.3344259f, - -0.0690846f, -0.1678239f, -0.4219488f, 0.6215640f, 1.0270095f, -0.3473049f, -0.3926674f, -0.7942593f, - 1.1305071f, -1.4621233f, -0.8051161f, -0.7698632f, -2.6038630f, -0.3090037f, -1.6365144f, -1.0179478f}, - {0.0046026f, 1.1319581f, -2.6405678f, -2.0353596f, -2.1687336f, 0.3364883f, 2.1122196f, 0.2584647f, - -2.4344857f, -0.0378498f, 0.6158544f, -0.6060749f, -4.9598379f, 0.1570698f, 2.2436838f, -2.6198347f, - -2.0935996f, -0.1845744f, -0.0716080f, -1.9338604f, -4.1995640f, -3.6706774f, -1.6762524f, 3.9646862f, - -0.9677961f, 1.8319578f, -3.1916575f, 3.7312632f, 0.0820446f, -0.0497568f, -0.0898171f, -0.2499462f}, - {-0.0780375f, -0.0286571f, 0.1007227f, 0.0012229f, -0.0531285f, 0.0840718f, 0.1013894f, 0.1312424f, - -0.0673772f, 0.1603183f, 0.0074385f, -0.0718321f, -0.1549873f, 0.1616689f, 0.0405887f, -0.1558588f, - 0.0740745f, 0.1696893f, -0.0064026f, -0.1656420f, -0.1186674f, -0.1262667f, -0.0784757f, -0.1280154f, - 0.0909976f, 0.0853046f, -0.1075811f, 0.1310615f, 0.0610194f, 0.0647223f, 0.1360559f, 0.0440074f}, - {-0.2106480f, 0.0087131f, 0.1119385f, -1.0611318f, 0.5250220f, 0.0525479f, -0.2733742f, -1.0799565f, - -0.5601607f, -0.0651806f, -1.9793440f, -0.3373334f, -0.1550518f, 0.8932216f, 0.7264332f, -0.0450735f, - 1.2373760f, -0.1236272f, 0.0680048f, -3.0446634f, -0.1533586f, -0.0127355f, -0.3326311f, -0.0225603f, - -0.2265739f, -2.3752897f, -0.3771705f, -0.0728938f, 0.1741305f, 0.1111639f, 0.4131119f, 0.2239323f}, - {-2.5691276f, -1.4011253f, -2.0640867f, -3.7236946f, 1.5542637f, -0.9456654f, -1.7575809f, 3.6794879f, - -0.4439790f, -0.1009826f, 3.6702275f, -0.1935008f, -0.4423219f, -0.3825364f, -0.4784791f, 0.5927492f, - -2.3482494f, 0.0801714f, 
-0.1567418f, -1.7934613f, -0.1706410f, -0.6326947f, 0.6260155f, 0.3631033f, - -0.9325932f, 1.9647995f, -1.3409088f, 1.3501998f, 0.0367797f, -0.1744210f, 1.8690013f, -1.0737898f}, - {-0.5934777f, 0.6232591f, -0.3391055f, 0.2640936f, -0.2824444f, 0.4815128f, 0.6625078f, -0.1103976f, - 0.9555223f, -0.0624896f, -0.6778919f, 0.1181502f, -0.5425385f, 0.7297349f, -1.7261271f, -0.2917557f, - 1.1873137f, -0.2725933f, 0.0975242f, 1.7756181f, -0.5735835f, -0.4453230f, 0.9800369f, 0.9344145f, - -1.8692539f, 0.0120440f, -0.7315661f, 0.6250805f, 0.3839143f, -0.0376306f, 0.3816243f, 0.6059195f}, - {0.5522162f, -1.8043815f, -10.9379101f, 0.5719097f, -0.2246755f, -1.4856353f, 0.4877502f, 0.7163438f, - -11.8135147f, -0.0180790f, -0.9928634f, 0.1107815f, -0.0005064f, -0.3824990f, -0.7453306f, -1.9909632f, - -7.4362645f, -0.0245507f, -0.1815712f, -3.5507584f, -0.0075889f, -11.0296011f, -1.1292133f, -0.0710276f, - 0.5675677f, 0.2017778f, -0.0684891f, -0.0367653f, -1.6674192f, 0.0281711f, -0.8356591f, -0.0447807f}, - {0.2537312f, -3.0178010f, -0.3493635f, 1.8573236f, 0.4017631f, 0.9912633f, -0.8625028f, -0.7783228f, - -1.7815375f, -0.1204695f, 1.8551122f, 0.3344182f, -0.2828701f, -1.3226960f, -1.4470471f, 0.2895959f, - 0.6780876f, -0.2010069f, 0.0425280f, -2.1786852f, -0.1274053f, -0.2549899f, -0.2233993f, -0.1561645f, - -0.4640818f, 0.6375850f, 0.7733670f, -0.2388286f, 1.0447853f, -0.1503223f, 0.3823584f, -13.8176088f}, - {0.2575197f, -2.2127593f, -0.0389457f, -0.0215759f, 0.1659477f, -0.0097748f, -0.1935415f, -0.9091369f, - -0.1453371f, 0.0442428f, -0.1206519f, 0.1435609f, -0.0186047f, -5.0154042f, 0.0538177f, 0.0403250f, - 0.0240955f, 0.0331080f, 0.0517951f, 0.7422639f, 0.0069818f, 0.0248351f, -0.2205741f, -0.0082387f, - 0.2043269f, 0.0459435f, 0.0876343f, 0.0140607f, 0.1056308f, 0.0062555f, 0.0184278f, -0.5539715f}, - {-0.0398742f, 0.1075264f, 0.1725024f, -0.0755192f, -0.0360048f, 0.1325573f, 0.0903103f, -0.0882263f, - 0.1207692f, 0.0032722f, 0.0048489f, -0.1257241f, 0.1450990f, 
-0.0713558f, 0.1116815f, 0.1107689f, - -0.1447252f, 0.1581838f, -0.0160124f, -0.0425587f, 0.1411217f, 0.0865060f, -0.0643460f, -0.0431262f, - -0.1452804f, -0.0195101f, 0.1234572f, 0.0520887f, 0.1117576f, -0.0751791f, 0.1511539f, 0.1224861f}, - {0.7728126f, 2.3075340f, -0.0385258f, -3.1270287f, 0.9414487f, 3.5251477f, -0.8043440f, 0.7212446f, - -7.6850162f, -0.1609414f, -3.7687578f, -1.0751100f, -0.2052089f, 5.0728245f, 2.2835267f, 0.5930225f, - 0.1303335f, -0.1428799f, -0.3715075f, 0.5136011f, -0.4755619f, -0.2192461f, -3.8696294f, -0.0062392f, - -1.3774812f, -0.0034140f, -1.5944362f, 0.9773729f, 3.2859125f, -0.1616932f, -1.2785367f, -13.5732412f}, - {0.5535743f, 0.1461481f, -0.2218016f, -0.2971808f, -0.2169309f, 0.1564545f, -0.0390397f, 1.1558976f, - -0.0119933f, -0.0774637f, 1.1907971f, -0.5127968f, -0.0066028f, -1.6794037f, -0.3650940f, 0.2555613f, - -0.9488379f, 0.0449603f, -0.1620417f, 0.1583214f, 0.0000908f, 0.0152763f, -1.0660053f, -0.0139402f, - -1.7440189f, 0.2515209f, 0.3333162f, 0.1904725f, 0.1116094f, -0.2287960f, -0.0007165f, -1.7047704f}, - {-5.9897852f, -0.1316296f, -0.0218074f, -0.4602887f, 0.3288545f, -0.0882939f, -0.5929499f, 0.4294790f, - -0.0383545f, 0.0556869f, 0.1975944f, 0.1341491f, 0.0629570f, -2.2742157f, 0.0175826f, -0.1439869f, - -24.8701649f, -0.1582915f, -0.2460304f, -3.9643264f, 0.0863483f, 0.0180861f, -0.2210452f, -0.0868723f, - -0.4175525f, -0.8231756f, 0.0247534f, -0.1473545f, -0.0021330f, -0.0410253f, -1.1944869f, -1.1523768f}, - {0.1031547f, -3.3402514f, -4.3636522f, -0.1534714f, -0.0622189f, 0.0374694f, -0.0870097f, -4.1865788f, - -0.0555377f, 0.0252329f, 0.1339467f, 0.0461691f, -0.0503090f, 0.0289890f, -0.0095674f, -0.3289992f, - -0.0279080f, 0.0274977f, -0.0903500f, 0.5610157f, -0.0478177f, 0.4346960f, 0.4822784f, -0.1058945f, - -0.2026870f, -0.0560638f, 0.0910069f, -0.0818529f, 0.0819198f, -0.0292193f, 0.3040628f, -0.1275230f}, - {-5.8789845f, -17.1114635f, -4.6755161f, 0.1016624f, -0.8685016f, -0.3898779f, -2.3363957f, 
0.1413794f, - -2.4254086f, -0.2171030f, -0.0901150f, 0.7058705f, 0.4166250f, -0.0231085f, -0.1789686f, -9.4244318f, - -0.6418229f, -0.0857969f, 0.1683681f, -0.0310597f, -0.0247807f, -5.3748040f, -7.4730940f, 0.1019564f, - -1.2126822f, -0.3726285f, -1.0287101f, 0.1803891f, -0.2227769f, -0.0791530f, -0.0159770f, -1.4883354f}, - {-17.9394970f, -0.5228514f, -11.3547935f, -0.0672671f, -2.0371394f, -0.9076943f, 2.4331825f, -6.9409127f, - 0.8286008f, 0.0208618f, -0.8009814f, 1.2268484f, 0.1943726f, -1.7297083f, -0.7668949f, -6.5505466f, - -0.6495168f, -0.0404727f, -0.1260914f, -3.5029383f, -0.0852898f, -2.9679556f, 1.6404767f, -0.0251449f, - 1.1460075f, -0.7877688f, -0.0586593f, -0.4741839f, -1.7420560f, 0.0295600f, -2.3574052f, 0.0974777f}, - {0.4443443f, 0.6384261f, 1.3317494f, -1.0085982f, 0.9508762f, 1.3168396f, -0.1862490f, -0.1801148f, - 1.1106120f, -0.0654911f, 0.1186706f, -0.7198273f, 0.5449172f, -0.5886080f, 0.7504217f, 1.8046317f, - -0.1294390f, -0.1939137f, -0.2383934f, 0.4131435f, 0.6910310f, 1.2821866f, -0.1088722f, -0.5660405f, - -0.1188610f, 0.0364403f, 0.3597929f, -0.6409024f, 1.2114668f, -0.0212278f, 0.8423592f, 0.4848156f}, - {-0.8772649f, -13.5265112f, -4.5540547f, -0.2856667f, 0.7604876f, -0.6829260f, -0.8320626f, 0.6541347f, - 0.4020181f, 0.0009324f, -10.9660740f, -0.3540186f, -0.2316812f, 0.3576394f, 0.0998953f, -1.5738430f, - 1.2089975f, 0.0706465f, -0.2538019f, 0.7016497f, -0.0282650f, -3.1291001f, -0.4375663f, -0.3979468f, - -0.1588882f, 0.3978875f, 0.2038192f, -0.4281644f, -0.5787544f, -0.0922198f, 0.9595569f, 0.0212818f}, - {0.3392667f, 0.1170919f, -0.0705636f, -0.1025443f, -0.1192213f, -0.0495686f, 0.0284667f, -0.1226804f, - 0.0050191f, -0.0516545f, -1.0892097f, 0.0033689f, 0.0471462f, 1.4266804f, 0.0288870f, -0.0110408f, - -1.1283765f, -0.1299917f, -0.4318301f, -0.9854419f, -0.0190479f, -0.0269406f, 0.3697925f, -0.0757695f, - -0.3632923f, -0.1714077f, 0.0669245f, 0.0557428f, -0.1713906f, -0.4307863f, -0.1749060f, -2.1246362f}, - {0.8383662f, 
-3.8122442f, 0.1568939f, -2.2105119f, -0.7086993f, -0.4664145f, -0.3578597f, 0.5554636f, - 0.6965880f, -0.1506968f, 0.2646832f, 0.2874083f, 0.1901203f, -2.4997077f, -0.3519035f, -0.0518054f, - 1.0862818f, -0.2502540f, -0.3133347f, -0.7411230f, 0.1268138f, 0.1069811f, -0.8109779f, 0.0264679f, - 0.1604289f, -0.7534032f, -0.1419461f, 0.0688303f, -0.1570919f, -0.3055144f, -0.7415189f, 2.5547018f}, - }; - ALPAKA_STATIC_ACC_MEM_GLOBAL const float bias_4[1] = {1.4616280f}; - ALPAKA_STATIC_ACC_MEM_GLOBAL const float wgtT_4[32][1] = { - {0.0609813f}, {0.0685224f}, {0.1655236f}, {-0.0599842f}, {0.0669006f}, {-0.1817371f}, {-0.0539167f}, - {-0.0737955f}, {0.0654664f}, {0.0302955f}, {-0.0586768f}, {0.0717433f}, {0.1472274f}, {-0.0610073f}, - {-0.0601061f}, {0.2086218f}, {-0.0545418f}, {-0.0388369f}, {-0.0613536f}, {-0.1141072f}, {-0.2289097f}, - {-0.3354485f}, {0.0831025f}, {0.1333673f}, {0.0490410f}, {0.0484894f}, {0.0436755f}, {-0.1479877f}, - {0.1540713f}, {0.0021261f}, {-0.0845848f}, {-0.0564973f}, - }; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float bias_0[32] = { + -4.5069356f, -5.8842053f, 1.0793180f, -0.1540973f, -0.4705772f, 6.4027028f, -0.6620818f, -7.0734525f, + 0.6211641f, 4.9630723f, 3.4310920f, -0.8856288f, 4.5843782f, -6.0180559f, 0.0126438f, -1.5725276f, + -0.8549317f, -6.8545237f, -1.2129461f, 3.0617838f, -0.3911322f, 0.0799793f, -2.5398655f, -0.5780622f, + 2.8533990f, -0.1777968f, -2.6457164f, -0.7976936f, 4.5644889f, -2.1747942f, 3.4286616f, -10.1073380f}; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float wgtT_0[38][32] = { + {6.1269712f, -10.6625051f, 17.4907818f, -0.0019928f, -3.4468415f, 1.6674044f, -7.8957767f, 2.2077549f, + 9.5517254f, -5.1345053f, -30.1643391f, 4.0148559f, -19.8330841f, -18.3806915f, 0.1334764f, 1.6213616f, + -4.1423774f, -15.3062429f, -1.0209556f, 1.5580219f, 0.7426265f, 0.0033929f, 1.3924170f, 0.9196110f, + -0.8995734f, 1.0594707f, 39.4390869f, 8.7642002f, 28.4583893f, -5.9235659f, 3.7221889f, 14.4167147f}, + {1.7863803f, -0.6068707f, 
0.3166098f, -0.0608759f, 0.5939785f, 0.4870262f, -3.1375074f, -17.7147388f, + -0.7231818f, -9.3808413f, 2.2070611f, 15.7461920f, 0.9355862f, 2.3942475f, -0.0671409f, 3.5954301f, + -3.0463996f, -2.0748904f, -0.5450584f, -4.4800100f, 0.6074556f, -0.0161482f, 3.0624702f, -4.5688419f, + 2.9881518f, -0.3714012f, -0.0387531f, -0.7699140f, 4.4028845f, 5.0333014f, -4.7350726f, -8.6568584f}, + {5.6548429f, -0.0207700f, 0.1785973f, 0.0881671f, 0.2530097f, -0.1893259f, -0.1105739f, -0.5183877f, + 1.0728362f, 0.1833011f, 1.7765219f, 0.3127359f, 0.0455277f, -0.1442616f, -0.1048361f, -0.1235604f, + -0.1217661f, -0.5487315f, 0.7575656f, -0.1177454f, -17.0993137f, 0.1628031f, 0.2789381f, 0.5304270f, + 0.0837841f, -3.1120780f, 0.0074821f, -0.1648044f, -0.3395336f, 0.3958135f, 0.8718957f, -1.1980486f}, + {0.2401041f, -0.0585765f, -0.0144584f, 0.0411095f, 0.0752229f, 0.0292672f, -0.2437613f, -1.4396472f, + -0.0971315f, -1.7181139f, 0.2417643f, 2.2030578f, 0.0566049f, 0.1081589f, -0.1060181f, 0.3473758f, + -0.7095683f, -0.0345675f, 0.2794849f, -1.1702278f, 0.2622930f, -0.0072611f, 0.5026371f, -1.2882922f, + -0.4712771f, 0.0597130f, -0.0039970f, -0.6050836f, 0.1554724f, 1.0991164f, -0.4975886f, 0.2597970f}, + {0.0766028f, 0.0218421f, -0.1739017f, -0.0076569f, 0.0384461f, -0.1841756f, 0.9677940f, -3.1114254f, + 2.3830564f, 2.0706992f, -0.9643140f, 0.7361387f, -0.0060253f, -0.1554846f, -0.0831100f, 2.8754771f, + -1.4403527f, -0.5281797f, 0.5157787f, 4.2405987f, 0.4807618f, 0.0217647f, -1.2626950f, 0.9145837f, + -0.3931780f, 0.3426280f, -0.0065206f, -0.7510439f, -0.4555758f, 2.7724340f, -1.2173026f, 0.1039017f}, + {0.5685715f, 0.3927337f, 0.4942532f, -0.0671033f, -0.2808350f, -0.0336000f, -1.3983957f, 0.9876546f, + -2.3840380f, 0.7315395f, -2.2009561f, -1.4631602f, -0.4672308f, -0.4994236f, 0.1169335f, -1.1894208f, + -1.2692982f, 0.3303853f, -2.0147655f, -0.9912014f, 1.0042895f, 0.1121151f, -1.0789106f, -2.2821584f, + -6.6459913f, -0.0959398f, -0.0068429f, -2.8177626f, 0.3213172f, 
-2.6832986f, -4.7613306f, -0.9985733f}, + {1.4419515f, -0.3864825f, -0.6756768f, -0.1273375f, 0.4321181f, 0.3354745f, -0.8236564f, -2.8190827f, + 0.7090831f, 1.9072700f, -3.1834064f, -2.6938572f, 0.5051147f, 1.4382831f, 0.1241910f, -0.7352629f, + 0.7703634f, -1.7556250f, -2.1104112f, 3.0603442f, 1.9873468f, -0.0358815f, -1.0087154f, 3.8253262f, + -0.5466214f, 0.0875162f, 0.2691758f, 0.7121435f, 1.9314718f, -0.1580560f, 3.6484149f, -5.3173709f}, + {6.9104381f, -0.0033664f, -1.4405546f, -0.1768288f, 0.2028089f, -0.1012344f, -4.4735684f, 0.6354278f, + 4.3039737f, 0.2056303f, 1.8338999f, -1.1351355f, 0.1015760f, -0.0733253f, -0.0561627f, 2.5292397f, + 1.6314448f, -0.9333628f, -0.7773662f, 0.8313186f, -0.7829623f, 0.1265118f, 0.5922315f, -0.3463379f, + -1.3269740f, -3.3302619f, -0.0061799f, 2.3374722f, 0.0880938f, 0.7470241f, -0.4205743f, -4.7557602f}, + {0.0380794f, 0.0947470f, 0.0419397f, 0.0582226f, -0.0603404f, 0.0234028f, -0.2575402f, 0.4125248f, + 0.3035339f, 0.2663808f, -0.6092452f, -1.4727812f, 0.0247187f, -0.0539688f, -0.0150413f, 0.2094955f, + 0.5379737f, -0.3255228f, -0.5639279f, 0.0786276f, 0.6703192f, 0.1557026f, -0.2753083f, 1.1463971f, + -0.9372965f, 0.5657740f, 0.0041413f, 0.0870248f, 0.0101520f, -0.8214461f, 0.1212932f, 1.5648646f}, + {-0.0969819f, 0.0137566f, 1.3515147f, -0.0155047f, -0.1416170f, -0.1636726f, 0.5184190f, 0.4732984f, + 0.6815788f, -1.0522166f, -0.4486531f, -0.0516016f, 0.0201894f, -0.0849667f, -0.0861271f, -1.2027841f, + 1.2458711f, -0.7061657f, 1.0381308f, -0.3450044f, -0.1300479f, -0.0828402f, 0.6859242f, -1.0575374f, + 0.6947553f, -0.0922188f, 0.0199132f, 0.8038982f, -0.1734094f, -0.1057449f, 1.6305015f, -0.0688597f}, + {-1.8151448f, 0.1024327f, 1.7063105f, 0.1130912f, -0.1081472f, -0.2904744f, -1.3465070f, -1.0455177f, + -0.4581082f, -3.2220871f, 0.5221398f, -5.1637673f, 0.0811146f, -0.1326323f, -0.0379338f, -3.0439703f, + -2.4246936f, -0.3670847f, -3.1256330f, -1.6595014f, -3.4715190f, -0.1526113f, -1.0420206f, 0.9536474f, + 
-3.2932863f, 1.6048199f, 0.0025162f, -3.6049840f, 0.0604250f, -2.2404826f, 1.8406851f, -3.1381185f}, + {1.2985691f, -1.1044264f, 0.9062797f, -0.0788333f, 0.2694912f, 0.0032800f, -0.0574267f, 0.9734111f, + 1.1532565f, 2.6786125f, -3.8574269f, -2.2871449f, -0.1261243f, 1.0545347f, -0.1454154f, -0.5609738f, + 1.8385800f, -0.8035598f, -1.7668265f, 5.1665063f, 0.7966110f, 0.0940206f, -2.3943975f, 2.3344002f, + 1.0342182f, 0.4806454f, -0.3880928f, 0.6998246f, 1.4011886f, -1.7313483f, 4.9702630f, -6.0058608f}, + {1.0300356f, 0.0616315f, -0.1113776f, -0.1694220f, 0.7159944f, 0.0626456f, 2.0994680f, 0.3452290f, + -3.0487001f, 0.0654031f, -1.1510723f, 0.5370992f, -0.0290704f, -0.0300795f, 0.0751569f, -0.2345951f, + -0.3472281f, 0.4424143f, 1.2444530f, -0.2114656f, 0.7865694f, -0.0709381f, -0.1839961f, -0.0529834f, + 0.5867608f, -3.8793530f, -0.0814745f, -0.6368676f, 0.0361213f, -0.5549288f, 0.5661780f, 1.8374584f}, + {0.3345098f, 0.0068199f, -0.4205509f, -0.1088801f, -0.1043202f, -0.0040804f, 0.3400922f, 0.2673528f, + -0.6050695f, 0.4443954f, -0.4319905f, -0.6044132f, -0.0260679f, 0.0137036f, 0.0765494f, -0.0095099f, + 0.5880439f, -0.0083854f, -0.2407522f, 0.1942379f, 0.6554548f, -0.1322891f, -0.8298992f, 0.7909554f, + 1.0528831f, 0.1970959f, 0.0754069f, -0.0947960f, -0.0279494f, -0.5888316f, 0.8919419f, 0.4828835f}, + {0.3995822f, -0.2139665f, 0.3982936f, -0.1285759f, -0.3445527f, -0.1167238f, -0.1263519f, 0.8393803f, + -0.7758383f, 0.0719291f, -0.0134762f, 0.1715237f, 0.0796666f, 0.1023507f, -0.1172728f, -1.2364722f, + 1.2592632f, -0.3168479f, 0.7487004f, -1.5170647f, -0.2235429f, -0.1620898f, 1.4064828f, -1.0821995f, + 0.0740103f, -1.0412805f, -0.0621277f, 0.2439800f, 0.2684972f, -1.1661061f, 0.7859434f, -0.6170313f}, + {2.1615884f, 0.1431713f, 0.0642652f, -0.0522325f, -0.2658786f, -0.0245810f, -1.6857448f, -0.6685011f, + -0.6978170f, -0.8716729f, 0.3129902f, -2.5870812f, -0.2855283f, -0.3205920f, -0.0084069f, 1.3182145f, + -0.6923816f, -0.3730274f, -2.3638811f, 
-1.1128502f, -2.4709859f, 0.1349022f, -0.3574466f, -0.6597407f, + -4.1122031f, 0.2240651f, 0.1806145f, -1.6836300f, -0.0766231f, -3.2611966f, 0.0091456f, -0.0997367f}, + {5.2476101f, -0.1966512f, 4.8935304f, -0.1551689f, 1.6919724f, -0.8324367f, 14.3318472f, -0.3503132f, + 10.3614969f, -9.1522884f, -0.2543063f, -1.8476851f, 16.7961140f, 9.9541416f, -0.0434563f, -9.6973553f, + -5.0469398f, 6.1688442f, 7.6429725f, -7.3149266f, 1.2345183f, 0.1412155f, 0.7114770f, -1.6378664f, + 5.1548996f, 0.3686100f, -45.3027611f, 3.0492647f, -37.3445892f, 2.7421410f, -2.7958770f, -25.2034016f}, + {1.4597454f, -1.0561740f, 0.9751291f, 0.0446527f, 0.3691662f, 0.1006782f, 0.1418435f, 0.8871480f, + 1.1603093f, 2.8034730f, -4.0856910f, -1.9786842f, -0.2206208f, 0.9539357f, 0.0868183f, -0.6811873f, + 1.9642411f, -0.8065316f, -2.0244894f, 5.2936082f, 0.6120632f, -0.1194160f, -2.3925939f, 2.5555069f, + 1.0149733f, 0.4607603f, -0.2197217f, 0.5703423f, 1.4049014f, -1.5900208f, 5.1645074f, -6.0569463f}, + {0.9000676f, -0.0028781f, -0.1967366f, 0.1039593f, 0.7993248f, 0.0655172f, 2.2296758f, 0.4391927f, + -3.0292840f, 0.0334536f, -1.1728534f, 0.3479103f, -0.1190938f, 0.0410203f, 0.1146637f, -0.2958017f, + -0.3240463f, 0.4361866f, 1.0564958f, -0.1989332f, 0.5194008f, -0.0628912f, -0.1733121f, -0.1255383f, + 0.5990249f, -3.7692382f, 0.0995128f, -0.7101220f, -0.0785123f, -0.3514554f, 0.6662078f, 2.0991604f}, + {0.1781942f, -0.1873588f, -0.4653996f, -0.0153059f, -0.1399561f, -0.0498718f, 0.4552556f, 0.2300792f, + -0.7682312f, 0.4342302f, -0.3787803f, -0.6089386f, -0.1049337f, 0.0395331f, 0.0220332f, 0.0114750f, + 0.4672548f, 0.1284784f, -0.2472819f, 0.2892784f, 0.4788667f, 0.0472555f, -0.6593549f, 0.6508777f, + 0.9286987f, 0.3043948f, -0.0635985f, 0.0814399f, -0.1168853f, -0.6688027f, 0.8876534f, 0.4865684f}, + {0.4024099f, 0.0480259f, 0.4588822f, -0.1793082f, -0.2151573f, -0.1871128f, -0.1502780f, 1.1011307f, + -0.9467706f, 0.2632496f, -0.1257263f, -0.0241331f, 0.2280627f, 0.0878608f, -0.1334262f, 
-1.1642927f, + 1.0943586f, -0.4799654f, 0.5981907f, -1.5051398f, -0.4235946f, 0.0012827f, 1.2342577f, -0.8281875f, + 0.2776567f, -1.0362227f, 0.0408372f, 0.1540821f, 0.1777556f, -1.2684357f, 0.8836584f, -0.4001710f}, + {2.1558056f, 0.2082023f, 0.0863442f, 0.0364868f, -0.3985825f, 0.0307202f, -1.8889453f, -0.5614714f, + -0.7311882f, -0.8075573f, 0.4895108f, -2.7770483f, -0.3121874f, -0.1671291f, -0.1281284f, 1.3212786f, + -0.5310181f, -0.1974759f, -2.6240873f, -0.8320529f, -2.3875966f, -0.0286360f, -0.6263188f, -0.6553424f, + -4.1658955f, -0.0601300f, 0.0946256f, -1.6795633f, -0.1251303f, -3.0974686f, 0.2412274f, -0.0687501f}, + {2.0523887f, -0.6387668f, 2.0633900f, -0.0550964f, 0.5181718f, -0.4202190f, 1.8569367f, 0.8295385f, + 0.8555872f, 2.4727983f, -0.2072828f, -1.9006120f, 0.5379534f, 0.4463673f, 0.1468820f, 0.4918649f, + -3.4016700f, 0.2884440f, -1.9418719f, 4.5157170f, -0.5160927f, -0.0199372f, 3.1353824f, -0.9863126f, + -1.5135859f, 0.7576568f, 0.6715558f, 2.7409093f, 0.9291748f, -0.3247162f, 1.8204515f, -8.9181070f}, + {-0.1428107f, -0.0829889f, 0.4213613f, 0.0225415f, 1.2238166f, 0.0477106f, 0.3031853f, -0.7466553f, + 2.0663500f, 0.7588379f, 0.3689216f, -0.2003786f, 0.1242338f, 0.1693589f, -0.0351716f, -0.0186597f, + -0.0189417f, 0.5468715f, -0.2862698f, -0.1311738f, 3.0747476f, -0.0310747f, 0.0943165f, 0.3139819f, + 0.6274695f, -1.8314874f, 0.0147495f, 0.3554756f, 0.3829916f, 0.4891713f, 0.1328600f, 1.0535098f}, + {0.0534900f, 0.1787969f, -0.0571320f, -0.0685673f, 0.1968977f, 0.0374476f, 0.7876674f, 0.0828491f, + 0.6444036f, -0.2203166f, -0.2383427f, 0.5397566f, 0.0106769f, -0.1230072f, -0.0135021f, -0.5691944f, + -1.5040319f, 0.0406933f, -0.0025478f, 0.9251419f, -1.7180276f, -0.1112956f, 1.4840862f, 0.0407115f, + -0.0100329f, 0.0583593f, -0.0110524f, 0.7431355f, -0.0971857f, -0.5501527f, -0.6371027f, -0.1935233f}, + {-0.6455778f, 0.2317368f, 0.9285696f, -0.1415854f, 0.0822560f, 0.2488030f, -2.6992166f, 0.0884904f, + 0.6735302f, -0.1467820f, 0.5641044f, 
0.6436581f, 0.0818401f, -0.0336634f, -0.0729000f, -0.1206900f, + -2.5739892f, 0.5776953f, 0.9531668f, -1.2362405f, -0.0615577f, -0.0143544f, -2.7525210f, 1.3738545f, + 0.2751348f, -1.7463943f, -0.0020144f, 2.4814103f, 0.1716725f, -0.7055540f, -0.3474010f, 0.4482578f}, + {-0.2526205f, -0.7463821f, -3.6076138f, -0.1511098f, 0.1216256f, 0.0888247f, -1.0190924f, -1.3260181f, + -0.0443211f, -4.8911066f, -3.4385188f, -6.0057454f, 0.3340450f, 0.2997236f, -0.0907855f, 0.7500492f, + -0.4007562f, 1.9382039f, 0.5687234f, 2.6511824f, 4.7703862f, 0.0006749f, -0.0201394f, -3.5885489f, + -4.1518898f, 0.0807014f, -0.0584071f, -0.8100027f, 0.7697087f, -0.8038046f, -1.2945876f, -4.0110312f}, + {0.4337017f, -1.1532011f, 2.0740633f, 0.0271806f, 0.6654227f, 0.1012998f, -4.0791736f, 1.2631345f, + 1.9511020f, 2.3272331f, 1.2707534f, 1.6306664f, 0.4936035f, 0.8285242f, 0.0807625f, 3.8652387f, + 0.0281145f, 1.6877037f, 1.2557380f, -0.3036775f, 0.5604967f, 0.1551418f, -0.9599600f, -6.3067718f, + -0.6352320f, 0.8058553f, 0.3657880f, -2.0491202f, -0.3926269f, 2.5650854f, 1.3697821f, -8.3070078f}, + {5.1334143f, -0.0351738f, -0.4774780f, -0.0679726f, 1.4569254f, 0.0580191f, -0.3649136f, -0.2298838f, + -3.3826666f, -0.7392708f, -0.6036060f, -0.2612940f, -0.1877640f, -0.1145124f, -0.0042578f, -0.0311193f, + -0.0320479f, 0.5270581f, -0.4324475f, 0.2681437f, 4.7813129f, -0.0222701f, -0.0525629f, -0.2861001f, + -0.1251072f, 3.9112861f, 0.0045046f, -0.0426071f, -0.3299106f, -0.0686970f, -0.1602017f, -0.0070103f}, + {-0.6633690f, 0.0103367f, 0.5998458f, 0.1256577f, -0.0359184f, -0.0176820f, -0.6458368f, -0.0370536f, + 0.3542259f, 0.1394724f, 0.8255956f, 0.2501569f, 0.0320156f, -0.0256806f, 0.0277949f, 0.0036392f, + 0.2825173f, 0.1400358f, 1.0011463f, -0.6792242f, 0.0672508f, 0.0728705f, -0.1089695f, -1.0414587f, + -0.4135485f, 0.4293025f, -0.0041241f, -0.9564193f, 0.0314900f, 0.8658463f, -0.7734696f, -0.7610567f}, + {-0.0200122f, -0.0749178f, -1.5026549f, -0.0387432f, -0.0713735f, 0.1214790f, 
1.8730290f, -0.0552839f, + -1.6867150f, 0.2282097f, 0.7161849f, -0.1018546f, -0.1092003f, 0.0365504f, -0.1326883f, 1.2310545f, + 0.1800210f, 0.7024739f, -2.9606545f, 1.2275347f, -0.2050014f, 0.0940569f, 0.4761694f, 0.8812068f, + -0.0083424f, -1.5406264f, 0.0061815f, -2.7606382f, 0.0248556f, 1.1086880f, -1.3608936f, 1.0795454f}, + {0.9734020f, 0.3905411f, -3.7008634f, 0.0013557f, 0.1649124f, 0.9935362f, 1.3489184f, 0.9505764f, + 0.7966231f, -0.1627246f, -2.5754328f, 1.4892205f, 0.8586300f, 0.6974363f, 0.1320204f, -0.7840260f, + 0.3121157f, 0.0966901f, 2.7447381f, 1.8256680f, 0.7229405f, -0.1723188f, 0.9145948f, -2.1376033f, + 0.5259342f, 0.0731194f, -0.2908303f, -0.2603913f, -0.2326528f, 3.6684167f, -0.2883157f, -2.8546307f}, + {-4.8917460f, 6.7944999f, -0.2255474f, 0.1051999f, 3.9000113f, 2.0624907f, 5.3019547f, 10.0209141f, + 1.1268179f, 2.2669628f, -6.5002980f, 1.8408583f, 5.3039579f, 2.2055962f, 0.1055369f, 1.7230233f, + 6.9605255f, 7.7025104f, 2.9880707f, -0.9274251f, -0.2287160f, -0.0206735f, 0.6885675f, 2.8179996f, + -7.1129837f, -1.3772345f, 3.8655453f, -5.9388318f, -0.0469947f, 7.2763596f, -6.3536129f, -17.0069847f}, + {1.8787041f, -0.9953383f, -1.4839923f, 0.1308209f, 0.3657510f, 0.3106483f, -1.4158971f, -6.7449651f, + 0.6553892f, -4.5046172f, -3.5489719f, 3.5363002f, 0.5454772f, 2.3521471f, 0.1612140f, -0.9744226f, + 0.6546553f, -2.7179255f, -1.7758157f, 0.3089439f, 1.7462813f, 0.1654593f, -0.2440207f, 3.9501827f, + 1.3750844f, 0.0596805f, -0.1977254f, 0.0264880f, 2.6396444f, 1.0816911f, 3.6413448f, -6.0299959f}, + {-4.1295738f, 0.1044480f, 0.2131937f, 0.0420826f, 0.5292229f, 0.0090477f, -0.0973486f, 0.9596778f, + 2.9579651f, -0.6364226f, -1.7556342f, 0.1539868f, -0.1273174f, -0.1348504f, 0.1257833f, -1.4168571f, + -1.0960362f, 0.0482449f, -1.4395387f, -0.2524115f, -2.9162085f, -0.0451428f, -0.4021681f, -0.5756381f, + 0.0515293f, -3.1996479f, -0.0007676f, -1.3878343f, -0.2864279f, -0.9579773f, -1.0999249f, 1.6500067f}, + {-2.4806111f, -6.8115449f, 
3.2805641f, 0.1187415f, -0.9950783f, 6.2553434f, -1.6450261f, -6.1463733f, + 2.7507148f, 4.2995782f, 0.0461297f, -0.5417359f, 2.4306326f, -7.3530145f, 0.0698273f, -0.9394333f, + -1.3595498f, -7.5141478f, -1.4911395f, 3.2300410f, 0.1203540f, 0.0314884f, -2.0116949f, -0.8167119f, + 2.4133310f, 0.1920709f, 1.0619365f, 0.2459123f, 6.9166069f, -2.6384118f, 3.6829739f, -7.2385545f}, + {0.9408096f, 14.9067144f, 1.7709646f, 0.1105646f, -0.5600107f, -15.3188124f, -12.3718462f, -1.8893757f, + 13.6364670f, -5.7327847f, -14.1805468f, 1.0581509f, -14.2186184f, 14.8948650f, 0.0190344f, 5.4395180f, + 6.7243400f, 9.8468456f, 4.5144215f, -1.4551491f, 1.1032411f, -0.0317988f, 2.3398454f, -3.1671596f, + -7.7541409f, 1.1255593f, 6.7340465f, -4.4448423f, -9.1472626f, -3.1959128f, 4.4181323f, -2.7904994f}, + {-2.1621978f, -4.7202382f, 1.7378219f, 0.1417439f, -0.5000908f, 5.4468708f, 1.4260571f, -6.6136570f, + 1.5713804f, 3.4479704f, 2.7354901f, -0.7388076f, 5.4666147f, -3.8697338f, -0.1368596f, -2.7903373f, + -1.2043713f, -4.9554005f, 0.3324645f, 1.6767365f, 0.1156244f, -0.0326964f, -2.0945346f, -0.4590589f, + 3.0942657f, 0.0015020f, -6.2626700f, -0.3969755f, 0.7717427f, -1.9667094f, 2.9664171f, -11.9477053f}, + }; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float bias_2[32] = { + 9.8383608f, 3.6922295f, 3.5774977f, -4.4619012f, 6.5087032f, -0.9540017f, -0.5059246f, 0.0706402f, + 14.3396597f, -0.2771132f, -4.8409863f, -8.3581600f, -3.5078344f, 4.3287506f, -5.7808843f, 3.9264839f, + -2.1697845f, -0.0040514f, -0.2095029f, -6.8678174f, 1.7911285f, -0.4510343f, 1.2410443f, -4.5678806f, + -0.5693849f, 2.3320096f, 4.4606552f, -6.3771009f, -4.3149071f, -0.1905672f, -3.5726390f, -1.0744030f}; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float wgtT_2[32][32] = { + {-0.0155548f, 0.0243339f, 0.0037967f, -0.2771824f, 0.0111955f, -0.0115980f, 0.0079653f, -2.9803498f, + -0.0061037f, -0.0956634f, 0.0332446f, 0.0179244f, -0.0080377f, -9.0180779f, 0.1720033f, 0.0350694f, + -0.0146588f, -0.2135506f, -0.3158041f, 
1.3697664f, 0.0119146f, 0.0119120f, -0.0986927f, 0.0297492f, + 0.0355827f, -0.1196868f, -0.0745119f, 0.0281862f, -0.0422190f, -0.3069138f, -0.0477367f, -0.0550450f}, + {-1.7374619f, 1.4822800f, -2.1885235f, 1.8354234f, -0.5380136f, 1.6621803f, 0.6251035f, 0.1008954f, + -0.8387129f, -0.2063313f, 1.0661691f, -0.9799694f, -5.1710258f, -3.2260630f, -1.5073707f, -1.0792168f, + 1.8569958f, -0.2289213f, 0.0563821f, -1.6398847f, -4.1649504f, -2.7527378f, -0.0134577f, 3.0424533f, + 0.0364320f, 0.6762254f, -3.1551330f, 2.4888904f, 1.4757305f, -0.3141717f, -2.0126467f, -0.1675602f}, + {-0.9571826f, 0.0914152f, 0.0404339f, 0.2927902f, 0.2933607f, 0.0619171f, 0.0772318f, -1.3796169f, + -0.8194544f, -0.2179988f, -1.1241078f, -0.1443964f, 0.0559355f, -1.2914546f, -0.3445117f, 0.2031156f, + 0.0273864f, -0.0193422f, -0.2136522f, 0.0429592f, 0.0212854f, 0.0414394f, -1.1734651f, 0.0582848f, + 0.0136039f, -0.1892604f, 0.0764908f, -0.0130132f, -0.1272559f, -0.0818855f, -0.0408583f, -0.1563294f}, + {-0.0213695f, 0.0596942f, -0.0641309f, -0.0146449f, 0.0416586f, -0.0378931f, 0.1234860f, 0.1622967f, + 0.0794091f, -0.0639933f, -0.1030663f, 0.0579078f, 0.1050275f, -0.0136866f, 0.0149978f, 0.0876813f, + 0.0693554f, 0.1612417f, -0.0595916f, -0.1008234f, -0.0579058f, 0.0915138f, 0.1321436f, -0.1484535f, + -0.0920316f, -0.0024532f, -0.1045300f, 0.0924260f, 0.0277524f, -0.0287276f, -0.1271127f, 0.1164243f}, + {0.0713067f, 0.0198056f, -0.3023696f, -0.0025908f, -0.0085885f, -1.1157553f, 0.0236462f, -0.0704844f, + -0.0189257f, -0.0997382f, 0.3379845f, -0.1229390f, -0.0616165f, -0.8968034f, 0.0401445f, -0.1144476f, + -0.0532077f, 0.0604580f, 0.0609454f, -0.1613472f, 0.0103525f, -0.1653874f, 0.0205189f, 0.0758978f, + -0.1514593f, 0.0151441f, 0.2043469f, 0.0349607f, -0.1361278f, -0.1255922f, 0.0631648f, 0.3570991f}, + {0.3371337f, -3.7541580f, 2.2215877f, -0.3390516f, 0.1912718f, -4.1861577f, -1.2264019f, 2.8179801f, + 0.0667294f, -0.0093539f, 2.3029909f, 3.1814916f, 3.9780347f, 0.2310601f, 
0.3986159f, -0.8544636f, + 0.4139664f, -0.1876569f, -0.2448732f, -2.8053334f, 4.0488625f, 2.1094146f, -6.7310257f, -4.9950023f, + -0.8315823f, 0.0555959f, 2.4573720f, -3.7234364f, -4.2910552f, -0.2995245f, -3.2605181f, 2.3620574f}, + {-1.5522735f, -0.1866350f, -0.0067679f, 0.3196557f, 1.4052233f, 2.8143549f, -0.9992948f, -0.5309914f, + -25.8852596f, -0.1218249f, 0.6625420f, 0.3007106f, -0.2767264f, -0.1847300f, -0.5313534f, -0.0383462f, + -0.1987552f, 0.0581405f, -0.3376078f, 1.2621028f, 0.0818709f, -0.1401216f, -0.4550788f, -0.1592657f, + 0.0597123f, 0.1344101f, -0.1005317f, -0.1538406f, 2.9142656f, -0.0806051f, -0.4267367f, -31.9512234f}, + {0.6859627f, 0.1212986f, 0.1291616f, 0.0459838f, -0.0899920f, 0.0287645f, 0.1987007f, -2.7079368f, + -0.2628384f, -0.1402464f, -0.6302179f, -0.2923960f, -0.1106663f, 0.8256195f, -2.8054097f, -0.0296494f, + -0.5632019f, -0.1335654f, -0.1558440f, -6.8611612f, 0.0203786f, 0.0046566f, -0.4401442f, -0.0471430f, + 0.4535986f, -0.8657981f, 0.0684740f, 0.0518814f, -0.0123748f, -0.2270164f, 0.0922878f, -0.3863277f}, + {0.0127175f, 2.3346109f, -0.4390767f, -0.4657893f, 0.1659466f, -0.1132782f, -0.4928388f, 0.7652873f, + 1.1510741f, -0.0879600f, 0.2721785f, -0.1878961f, -0.3477249f, -0.8473209f, -0.8931856f, -0.4328294f, + -11.9181929f, -0.0282545f, -0.0217915f, 1.6676594f, -0.2122232f, -0.6190930f, 1.9053432f, -0.7592348f, + -1.0739189f, -0.7170524f, 0.3864411f, -0.8849231f, 0.1393488f, 0.0738489f, 0.4460345f, 1.9020857f}, + {0.4453296f, -0.0767821f, 0.1638939f, 1.6997167f, -0.1098599f, -0.0551604f, 0.0040561f, -13.5290670f, + -0.1285677f, -0.0590394f, 0.6499141f, -0.7617344f, 0.0453151f, 0.3104213f, -1.0711143f, 0.1361838f, + -0.4365610f, -0.1300649f, 0.2013344f, -0.5308123f, 0.1451896f, 0.1030715f, -0.6487910f, -0.3136590f, + -0.0280079f, 0.5394178f, 0.1318262f, -0.0159292f, 0.0636870f, -0.3224248f, -0.1868187f, -0.2468304f}, + {-0.0333494f, -0.0834255f, -0.1221875f, 0.6861304f, 0.0521738f, -0.0416543f, -0.4437352f, -19.3246250f, + 
-0.1520821f, 0.0528602f, -0.6375434f, -0.5803806f, -0.0958465f, -2.0058544f, -0.8282642f, 0.0259000f, + 0.4846996f, 0.1211179f, 0.0356884f, 1.0009497f, 0.0635682f, -0.0314105f, -0.0011147f, 0.0131714f, + -0.3410152f, 0.2798154f, 0.0961889f, 0.1266228f, -0.0934717f, -0.0904307f, 0.1355542f, 0.5722573f}, + {0.2146454f, 0.2143834f, 0.1290650f, -0.9063646f, 0.2100945f, 0.1331054f, -0.2620614f, -0.1264993f, + 0.1313979f, 0.0455465f, -0.8395286f, -0.4967833f, -0.0538581f, 0.9155380f, 0.6627046f, 0.1691243f, + 0.9887002f, -0.1597013f, -0.1236713f, -1.9041336f, 0.0427585f, 0.0849747f, -5.2559652f, -0.3133100f, + 0.0141170f, -0.1635530f, 0.4938746f, 0.0162943f, 0.2107756f, -0.3413893f, -0.0657575f, 1.0542560f}, + {-2.8868380f, -2.0837426f, -1.0611480f, -0.6143807f, -0.6398501f, -2.8018746f, 0.5166737f, -1.0814301f, + -1.9272422f, -0.1017482f, -0.4651161f, -1.4021232f, 1.8854499f, 0.1815407f, 0.5965426f, -2.3344259f, + -0.0690846f, -0.1678239f, -0.4219488f, 0.6215640f, 1.0270095f, -0.3473049f, -0.3926674f, -0.7942593f, + 1.1305071f, -1.4621233f, -0.8051161f, -0.7698632f, -2.6038630f, -0.3090037f, -1.6365144f, -1.0179478f}, + {0.0046026f, 1.1319581f, -2.6405678f, -2.0353596f, -2.1687336f, 0.3364883f, 2.1122196f, 0.2584647f, + -2.4344857f, -0.0378498f, 0.6158544f, -0.6060749f, -4.9598379f, 0.1570698f, 2.2436838f, -2.6198347f, + -2.0935996f, -0.1845744f, -0.0716080f, -1.9338604f, -4.1995640f, -3.6706774f, -1.6762524f, 3.9646862f, + -0.9677961f, 1.8319578f, -3.1916575f, 3.7312632f, 0.0820446f, -0.0497568f, -0.0898171f, -0.2499462f}, + {-0.0780375f, -0.0286571f, 0.1007227f, 0.0012229f, -0.0531285f, 0.0840718f, 0.1013894f, 0.1312424f, + -0.0673772f, 0.1603183f, 0.0074385f, -0.0718321f, -0.1549873f, 0.1616689f, 0.0405887f, -0.1558588f, + 0.0740745f, 0.1696893f, -0.0064026f, -0.1656420f, -0.1186674f, -0.1262667f, -0.0784757f, -0.1280154f, + 0.0909976f, 0.0853046f, -0.1075811f, 0.1310615f, 0.0610194f, 0.0647223f, 0.1360559f, 0.0440074f}, + {-0.2106480f, 0.0087131f, 0.1119385f, 
-1.0611318f, 0.5250220f, 0.0525479f, -0.2733742f, -1.0799565f, + -0.5601607f, -0.0651806f, -1.9793440f, -0.3373334f, -0.1550518f, 0.8932216f, 0.7264332f, -0.0450735f, + 1.2373760f, -0.1236272f, 0.0680048f, -3.0446634f, -0.1533586f, -0.0127355f, -0.3326311f, -0.0225603f, + -0.2265739f, -2.3752897f, -0.3771705f, -0.0728938f, 0.1741305f, 0.1111639f, 0.4131119f, 0.2239323f}, + {-2.5691276f, -1.4011253f, -2.0640867f, -3.7236946f, 1.5542637f, -0.9456654f, -1.7575809f, 3.6794879f, + -0.4439790f, -0.1009826f, 3.6702275f, -0.1935008f, -0.4423219f, -0.3825364f, -0.4784791f, 0.5927492f, + -2.3482494f, 0.0801714f, -0.1567418f, -1.7934613f, -0.1706410f, -0.6326947f, 0.6260155f, 0.3631033f, + -0.9325932f, 1.9647995f, -1.3409088f, 1.3501998f, 0.0367797f, -0.1744210f, 1.8690013f, -1.0737898f}, + {-0.5934777f, 0.6232591f, -0.3391055f, 0.2640936f, -0.2824444f, 0.4815128f, 0.6625078f, -0.1103976f, + 0.9555223f, -0.0624896f, -0.6778919f, 0.1181502f, -0.5425385f, 0.7297349f, -1.7261271f, -0.2917557f, + 1.1873137f, -0.2725933f, 0.0975242f, 1.7756181f, -0.5735835f, -0.4453230f, 0.9800369f, 0.9344145f, + -1.8692539f, 0.0120440f, -0.7315661f, 0.6250805f, 0.3839143f, -0.0376306f, 0.3816243f, 0.6059195f}, + {0.5522162f, -1.8043815f, -10.9379101f, 0.5719097f, -0.2246755f, -1.4856353f, 0.4877502f, 0.7163438f, + -11.8135147f, -0.0180790f, -0.9928634f, 0.1107815f, -0.0005064f, -0.3824990f, -0.7453306f, -1.9909632f, + -7.4362645f, -0.0245507f, -0.1815712f, -3.5507584f, -0.0075889f, -11.0296011f, -1.1292133f, -0.0710276f, + 0.5675677f, 0.2017778f, -0.0684891f, -0.0367653f, -1.6674192f, 0.0281711f, -0.8356591f, -0.0447807f}, + {0.2537312f, -3.0178010f, -0.3493635f, 1.8573236f, 0.4017631f, 0.9912633f, -0.8625028f, -0.7783228f, + -1.7815375f, -0.1204695f, 1.8551122f, 0.3344182f, -0.2828701f, -1.3226960f, -1.4470471f, 0.2895959f, + 0.6780876f, -0.2010069f, 0.0425280f, -2.1786852f, -0.1274053f, -0.2549899f, -0.2233993f, -0.1561645f, + -0.4640818f, 0.6375850f, 0.7733670f, -0.2388286f, 1.0447853f, 
-0.1503223f, 0.3823584f, -13.8176088f}, + {0.2575197f, -2.2127593f, -0.0389457f, -0.0215759f, 0.1659477f, -0.0097748f, -0.1935415f, -0.9091369f, + -0.1453371f, 0.0442428f, -0.1206519f, 0.1435609f, -0.0186047f, -5.0154042f, 0.0538177f, 0.0403250f, + 0.0240955f, 0.0331080f, 0.0517951f, 0.7422639f, 0.0069818f, 0.0248351f, -0.2205741f, -0.0082387f, + 0.2043269f, 0.0459435f, 0.0876343f, 0.0140607f, 0.1056308f, 0.0062555f, 0.0184278f, -0.5539715f}, + {-0.0398742f, 0.1075264f, 0.1725024f, -0.0755192f, -0.0360048f, 0.1325573f, 0.0903103f, -0.0882263f, + 0.1207692f, 0.0032722f, 0.0048489f, -0.1257241f, 0.1450990f, -0.0713558f, 0.1116815f, 0.1107689f, + -0.1447252f, 0.1581838f, -0.0160124f, -0.0425587f, 0.1411217f, 0.0865060f, -0.0643460f, -0.0431262f, + -0.1452804f, -0.0195101f, 0.1234572f, 0.0520887f, 0.1117576f, -0.0751791f, 0.1511539f, 0.1224861f}, + {0.7728126f, 2.3075340f, -0.0385258f, -3.1270287f, 0.9414487f, 3.5251477f, -0.8043440f, 0.7212446f, + -7.6850162f, -0.1609414f, -3.7687578f, -1.0751100f, -0.2052089f, 5.0728245f, 2.2835267f, 0.5930225f, + 0.1303335f, -0.1428799f, -0.3715075f, 0.5136011f, -0.4755619f, -0.2192461f, -3.8696294f, -0.0062392f, + -1.3774812f, -0.0034140f, -1.5944362f, 0.9773729f, 3.2859125f, -0.1616932f, -1.2785367f, -13.5732412f}, + {0.5535743f, 0.1461481f, -0.2218016f, -0.2971808f, -0.2169309f, 0.1564545f, -0.0390397f, 1.1558976f, + -0.0119933f, -0.0774637f, 1.1907971f, -0.5127968f, -0.0066028f, -1.6794037f, -0.3650940f, 0.2555613f, + -0.9488379f, 0.0449603f, -0.1620417f, 0.1583214f, 0.0000908f, 0.0152763f, -1.0660053f, -0.0139402f, + -1.7440189f, 0.2515209f, 0.3333162f, 0.1904725f, 0.1116094f, -0.2287960f, -0.0007165f, -1.7047704f}, + {-5.9897852f, -0.1316296f, -0.0218074f, -0.4602887f, 0.3288545f, -0.0882939f, -0.5929499f, 0.4294790f, + -0.0383545f, 0.0556869f, 0.1975944f, 0.1341491f, 0.0629570f, -2.2742157f, 0.0175826f, -0.1439869f, + -24.8701649f, -0.1582915f, -0.2460304f, -3.9643264f, 0.0863483f, 0.0180861f, -0.2210452f, -0.0868723f, + 
-0.4175525f, -0.8231756f, 0.0247534f, -0.1473545f, -0.0021330f, -0.0410253f, -1.1944869f, -1.1523768f}, + {0.1031547f, -3.3402514f, -4.3636522f, -0.1534714f, -0.0622189f, 0.0374694f, -0.0870097f, -4.1865788f, + -0.0555377f, 0.0252329f, 0.1339467f, 0.0461691f, -0.0503090f, 0.0289890f, -0.0095674f, -0.3289992f, + -0.0279080f, 0.0274977f, -0.0903500f, 0.5610157f, -0.0478177f, 0.4346960f, 0.4822784f, -0.1058945f, + -0.2026870f, -0.0560638f, 0.0910069f, -0.0818529f, 0.0819198f, -0.0292193f, 0.3040628f, -0.1275230f}, + {-5.8789845f, -17.1114635f, -4.6755161f, 0.1016624f, -0.8685016f, -0.3898779f, -2.3363957f, 0.1413794f, + -2.4254086f, -0.2171030f, -0.0901150f, 0.7058705f, 0.4166250f, -0.0231085f, -0.1789686f, -9.4244318f, + -0.6418229f, -0.0857969f, 0.1683681f, -0.0310597f, -0.0247807f, -5.3748040f, -7.4730940f, 0.1019564f, + -1.2126822f, -0.3726285f, -1.0287101f, 0.1803891f, -0.2227769f, -0.0791530f, -0.0159770f, -1.4883354f}, + {-17.9394970f, -0.5228514f, -11.3547935f, -0.0672671f, -2.0371394f, -0.9076943f, 2.4331825f, -6.9409127f, + 0.8286008f, 0.0208618f, -0.8009814f, 1.2268484f, 0.1943726f, -1.7297083f, -0.7668949f, -6.5505466f, + -0.6495168f, -0.0404727f, -0.1260914f, -3.5029383f, -0.0852898f, -2.9679556f, 1.6404767f, -0.0251449f, + 1.1460075f, -0.7877688f, -0.0586593f, -0.4741839f, -1.7420560f, 0.0295600f, -2.3574052f, 0.0974777f}, + {0.4443443f, 0.6384261f, 1.3317494f, -1.0085982f, 0.9508762f, 1.3168396f, -0.1862490f, -0.1801148f, + 1.1106120f, -0.0654911f, 0.1186706f, -0.7198273f, 0.5449172f, -0.5886080f, 0.7504217f, 1.8046317f, + -0.1294390f, -0.1939137f, -0.2383934f, 0.4131435f, 0.6910310f, 1.2821866f, -0.1088722f, -0.5660405f, + -0.1188610f, 0.0364403f, 0.3597929f, -0.6409024f, 1.2114668f, -0.0212278f, 0.8423592f, 0.4848156f}, + {-0.8772649f, -13.5265112f, -4.5540547f, -0.2856667f, 0.7604876f, -0.6829260f, -0.8320626f, 0.6541347f, + 0.4020181f, 0.0009324f, -10.9660740f, -0.3540186f, -0.2316812f, 0.3576394f, 0.0998953f, -1.5738430f, + 1.2089975f, 0.0706465f, 
-0.2538019f, 0.7016497f, -0.0282650f, -3.1291001f, -0.4375663f, -0.3979468f, + -0.1588882f, 0.3978875f, 0.2038192f, -0.4281644f, -0.5787544f, -0.0922198f, 0.9595569f, 0.0212818f}, + {0.3392667f, 0.1170919f, -0.0705636f, -0.1025443f, -0.1192213f, -0.0495686f, 0.0284667f, -0.1226804f, + 0.0050191f, -0.0516545f, -1.0892097f, 0.0033689f, 0.0471462f, 1.4266804f, 0.0288870f, -0.0110408f, + -1.1283765f, -0.1299917f, -0.4318301f, -0.9854419f, -0.0190479f, -0.0269406f, 0.3697925f, -0.0757695f, + -0.3632923f, -0.1714077f, 0.0669245f, 0.0557428f, -0.1713906f, -0.4307863f, -0.1749060f, -2.1246362f}, + {0.8383662f, -3.8122442f, 0.1568939f, -2.2105119f, -0.7086993f, -0.4664145f, -0.3578597f, 0.5554636f, + 0.6965880f, -0.1506968f, 0.2646832f, 0.2874083f, 0.1901203f, -2.4997077f, -0.3519035f, -0.0518054f, + 1.0862818f, -0.2502540f, -0.3133347f, -0.7411230f, 0.1268138f, 0.1069811f, -0.8109779f, 0.0264679f, + 0.1604289f, -0.7534032f, -0.1419461f, 0.0688303f, -0.1570919f, -0.3055144f, -0.7415189f, 2.5547018f}, + }; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float bias_4[1] = {1.4616280f}; + ALPAKA_STATIC_ACC_MEM_GLOBAL const float wgtT_4[32][1] = { + {0.0609813f}, {0.0685224f}, {0.1655236f}, {-0.0599842f}, {0.0669006f}, {-0.1817371f}, {-0.0539167f}, + {-0.0737955f}, {0.0654664f}, {0.0302955f}, {-0.0586768f}, {0.0717433f}, {0.1472274f}, {-0.0610073f}, + {-0.0601061f}, {0.2086218f}, {-0.0545418f}, {-0.0388369f}, {-0.0613536f}, {-0.1141072f}, {-0.2289097f}, + {-0.3354485f}, {0.0831025f}, {0.1333673f}, {0.0490410f}, {0.0484894f}, {0.0436755f}, {-0.1479877f}, + {0.1540713f}, {0.0021261f}, {-0.0845848f}, {-0.0564973f}, + }; -} //namespace lst::t5dnn + } // namespace t5dnn +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::lst #endif diff --git a/RecoTracker/LSTCore/src/alpaka/ObjectRanges.h b/RecoTracker/LSTCore/src/alpaka/ObjectRanges.h index 1e1ccf8df12bc..81e4358ab30d6 100644 --- a/RecoTracker/LSTCore/src/alpaka/ObjectRanges.h +++ b/RecoTracker/LSTCore/src/alpaka/ObjectRanges.h @@ -3,7 +3,7 @@ 
#include "RecoTracker/LSTCore/interface/Constants.h" -namespace lst { +namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { struct ObjectRanges { int* hitRanges; @@ -40,34 +40,34 @@ namespace lst { template void setData(TBuff& buf) { - hitRanges = alpaka::getPtrNative(buf.hitRanges_buf); - hitRangesLower = alpaka::getPtrNative(buf.hitRangesLower_buf); - hitRangesUpper = alpaka::getPtrNative(buf.hitRangesUpper_buf); - hitRangesnLower = alpaka::getPtrNative(buf.hitRangesnLower_buf); - hitRangesnUpper = alpaka::getPtrNative(buf.hitRangesnUpper_buf); - mdRanges = alpaka::getPtrNative(buf.mdRanges_buf); - segmentRanges = alpaka::getPtrNative(buf.segmentRanges_buf); - trackletRanges = alpaka::getPtrNative(buf.trackletRanges_buf); - tripletRanges = alpaka::getPtrNative(buf.tripletRanges_buf); - trackCandidateRanges = alpaka::getPtrNative(buf.trackCandidateRanges_buf); - quintupletRanges = alpaka::getPtrNative(buf.quintupletRanges_buf); - - nEligibleT5Modules = alpaka::getPtrNative(buf.nEligibleT5Modules_buf); - indicesOfEligibleT5Modules = alpaka::getPtrNative(buf.indicesOfEligibleT5Modules_buf); - - quintupletModuleIndices = alpaka::getPtrNative(buf.quintupletModuleIndices_buf); - quintupletModuleOccupancy = alpaka::getPtrNative(buf.quintupletModuleOccupancy_buf); - miniDoubletModuleIndices = alpaka::getPtrNative(buf.miniDoubletModuleIndices_buf); - miniDoubletModuleOccupancy = alpaka::getPtrNative(buf.miniDoubletModuleOccupancy_buf); - segmentModuleIndices = alpaka::getPtrNative(buf.segmentModuleIndices_buf); - segmentModuleOccupancy = alpaka::getPtrNative(buf.segmentModuleOccupancy_buf); - tripletModuleIndices = alpaka::getPtrNative(buf.tripletModuleIndices_buf); - tripletModuleOccupancy = alpaka::getPtrNative(buf.tripletModuleOccupancy_buf); - - device_nTotalMDs = alpaka::getPtrNative(buf.device_nTotalMDs_buf); - device_nTotalSegs = alpaka::getPtrNative(buf.device_nTotalSegs_buf); - device_nTotalTrips = alpaka::getPtrNative(buf.device_nTotalTrips_buf); - device_nTotalQuints 
= alpaka::getPtrNative(buf.device_nTotalQuints_buf); + hitRanges = buf.hitRanges_buf.data(); + hitRangesLower = buf.hitRangesLower_buf.data(); + hitRangesUpper = buf.hitRangesUpper_buf.data(); + hitRangesnLower = buf.hitRangesnLower_buf.data(); + hitRangesnUpper = buf.hitRangesnUpper_buf.data(); + mdRanges = buf.mdRanges_buf.data(); + segmentRanges = buf.segmentRanges_buf.data(); + trackletRanges = buf.trackletRanges_buf.data(); + tripletRanges = buf.tripletRanges_buf.data(); + trackCandidateRanges = buf.trackCandidateRanges_buf.data(); + quintupletRanges = buf.quintupletRanges_buf.data(); + + nEligibleT5Modules = buf.nEligibleT5Modules_buf.data(); + indicesOfEligibleT5Modules = buf.indicesOfEligibleT5Modules_buf.data(); + + quintupletModuleIndices = buf.quintupletModuleIndices_buf.data(); + quintupletModuleOccupancy = buf.quintupletModuleOccupancy_buf.data(); + miniDoubletModuleIndices = buf.miniDoubletModuleIndices_buf.data(); + miniDoubletModuleOccupancy = buf.miniDoubletModuleOccupancy_buf.data(); + segmentModuleIndices = buf.segmentModuleIndices_buf.data(); + segmentModuleOccupancy = buf.segmentModuleOccupancy_buf.data(); + tripletModuleIndices = buf.tripletModuleIndices_buf.data(); + tripletModuleOccupancy = buf.tripletModuleOccupancy_buf.data(); + + device_nTotalMDs = buf.device_nTotalMDs_buf.data(); + device_nTotalSegs = buf.device_nTotalSegs_buf.data(); + device_nTotalTrips = buf.device_nTotalTrips_buf.data(); + device_nTotalQuints = buf.device_nTotalQuints_buf.data(); } }; @@ -143,7 +143,6 @@ namespace lst { alpaka::memset(queue, trackCandidateRanges_buf, 0xff); alpaka::memset(queue, quintupletRanges_buf, 0xff); alpaka::memset(queue, quintupletModuleIndices_buf, 0xff); - alpaka::wait(queue); data_.setData(*this); } @@ -151,5 +150,5 @@ namespace lst { void setData(ObjectRangesBuffer& buf) { data_.setData(buf); } }; -} // namespace lst +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::lst #endif diff --git a/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h 
b/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h index ee172f9e05f6e..1ecc256887c77 100644 --- a/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h +++ b/RecoTracker/LSTCore/src/alpaka/PixelQuintuplet.h @@ -11,7 +11,7 @@ #include "Quintuplet.h" #include "PixelTriplet.h" -namespace lst { +namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { struct PixelQuintuplets { unsigned int* pixelIndices; unsigned int* T5Indices; @@ -34,24 +34,24 @@ namespace lst { template void setData(TBuff& buf) { - pixelIndices = alpaka::getPtrNative(buf.pixelIndices_buf); - T5Indices = alpaka::getPtrNative(buf.T5Indices_buf); - nPixelQuintuplets = alpaka::getPtrNative(buf.nPixelQuintuplets_buf); - totOccupancyPixelQuintuplets = alpaka::getPtrNative(buf.totOccupancyPixelQuintuplets_buf); - isDup = alpaka::getPtrNative(buf.isDup_buf); - score = alpaka::getPtrNative(buf.score_buf); - eta = alpaka::getPtrNative(buf.eta_buf); - phi = alpaka::getPtrNative(buf.phi_buf); - logicalLayers = alpaka::getPtrNative(buf.logicalLayers_buf); - hitIndices = alpaka::getPtrNative(buf.hitIndices_buf); - lowerModuleIndices = alpaka::getPtrNative(buf.lowerModuleIndices_buf); - pixelRadius = alpaka::getPtrNative(buf.pixelRadius_buf); - quintupletRadius = alpaka::getPtrNative(buf.quintupletRadius_buf); - centerX = alpaka::getPtrNative(buf.centerX_buf); - centerY = alpaka::getPtrNative(buf.centerY_buf); - rzChiSquared = alpaka::getPtrNative(buf.rzChiSquared_buf); - rPhiChiSquared = alpaka::getPtrNative(buf.rPhiChiSquared_buf); - rPhiChiSquaredInwards = alpaka::getPtrNative(buf.rPhiChiSquaredInwards_buf); + pixelIndices = buf.pixelIndices_buf.data(); + T5Indices = buf.T5Indices_buf.data(); + nPixelQuintuplets = buf.nPixelQuintuplets_buf.data(); + totOccupancyPixelQuintuplets = buf.totOccupancyPixelQuintuplets_buf.data(); + isDup = buf.isDup_buf.data(); + score = buf.score_buf.data(); + eta = buf.eta_buf.data(); + phi = buf.phi_buf.data(); + logicalLayers = buf.logicalLayers_buf.data(); + hitIndices = 
buf.hitIndices_buf.data(); + lowerModuleIndices = buf.lowerModuleIndices_buf.data(); + pixelRadius = buf.pixelRadius_buf.data(); + quintupletRadius = buf.quintupletRadius_buf.data(); + centerX = buf.centerX_buf.data(); + centerY = buf.centerY_buf.data(); + rzChiSquared = buf.rzChiSquared_buf.data(); + rPhiChiSquared = buf.rPhiChiSquared_buf.data(); + rPhiChiSquaredInwards = buf.rPhiChiSquaredInwards_buf.data(); } }; @@ -100,18 +100,17 @@ namespace lst { rPhiChiSquaredInwards_buf(allocBufWrapper(devAccIn, maxPixelQuintuplets, queue)) { alpaka::memset(queue, nPixelQuintuplets_buf, 0u); alpaka::memset(queue, totOccupancyPixelQuintuplets_buf, 0u); - alpaka::wait(queue); } inline PixelQuintuplets const* data() const { return &data_; } inline void setData(PixelQuintupletsBuffer& buf) { data_.setData(buf); } }; - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, - lst::Quintuplets const& quintupletsInGPU, - lst::PixelQuintuplets& pixelQuintupletsInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelQuintupletToMemory(Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, + Quintuplets const& quintupletsInGPU, + PixelQuintuplets& pixelQuintupletsInGPU, unsigned int pixelIndex, unsigned int T5Index, unsigned int pixelQuintupletIndex, @@ -201,9 +200,9 @@ namespace lst { pixelQuintupletsInGPU.rzChiSquared[pixelQuintupletIndex] = rzChiSquared; pixelQuintupletsInGPU.rPhiChiSquared[pixelQuintupletIndex] = rPhiChiSquared; pixelQuintupletsInGPU.rPhiChiSquaredInwards[pixelQuintupletIndex] = rPhiChiSquaredInwards; - }; + } - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT5RZChiSquaredCuts(lst::Modules const& modulesInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT5RZChiSquaredCuts(Modules const& modulesInGPU, uint16_t lowerModuleIndex1, uint16_t lowerModuleIndex2, uint16_t lowerModuleIndex3, @@ -211,25 +210,25 @@ namespace lst { 
uint16_t lowerModuleIndex5, float rzChiSquared) { const int layer1 = modulesInGPU.layers[lowerModuleIndex1] + - 6 * (modulesInGPU.subdets[lowerModuleIndex1] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex1] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex1] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex1] == ::lst::TwoS); const int layer2 = modulesInGPU.layers[lowerModuleIndex2] + - 6 * (modulesInGPU.subdets[lowerModuleIndex2] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex2] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex2] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex2] == ::lst::TwoS); const int layer3 = modulesInGPU.layers[lowerModuleIndex3] + - 6 * (modulesInGPU.subdets[lowerModuleIndex3] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex3] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex3] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex3] == ::lst::TwoS); const int layer4 = modulesInGPU.layers[lowerModuleIndex4] + - 6 * (modulesInGPU.subdets[lowerModuleIndex4] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex4] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex4] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex4] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex4] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex4] == ::lst::TwoS); const int layer5 = modulesInGPU.layers[lowerModuleIndex5] + - 6 * (modulesInGPU.subdets[lowerModuleIndex5] == lst::Endcap) + - 5 * 
(modulesInGPU.subdets[lowerModuleIndex5] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex5] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex5] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex5] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex5] == ::lst::TwoS); if (layer1 == 1 and layer2 == 2 and layer3 == 3) { if (layer4 == 12 and layer5 == 13) { @@ -291,9 +290,9 @@ namespace lst { } } return true; - }; + } - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT5RPhiChiSquaredCuts(lst::Modules const& modulesInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT5RPhiChiSquaredCuts(Modules const& modulesInGPU, uint16_t lowerModuleIndex1, uint16_t lowerModuleIndex2, uint16_t lowerModuleIndex3, @@ -301,25 +300,25 @@ namespace lst { uint16_t lowerModuleIndex5, float rPhiChiSquared) { const int layer1 = modulesInGPU.layers[lowerModuleIndex1] + - 6 * (modulesInGPU.subdets[lowerModuleIndex1] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex1] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex1] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex1] == ::lst::TwoS); const int layer2 = modulesInGPU.layers[lowerModuleIndex2] + - 6 * (modulesInGPU.subdets[lowerModuleIndex2] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex2] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex2] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex2] == ::lst::TwoS); const int layer3 = modulesInGPU.layers[lowerModuleIndex3] + - 6 * (modulesInGPU.subdets[lowerModuleIndex3] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex3] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex3] == lst::TwoS); + 6 * 
(modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex3] == ::lst::TwoS); const int layer4 = modulesInGPU.layers[lowerModuleIndex4] + - 6 * (modulesInGPU.subdets[lowerModuleIndex4] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex4] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex4] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex4] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex4] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex4] == ::lst::TwoS); const int layer5 = modulesInGPU.layers[lowerModuleIndex5] + - 6 * (modulesInGPU.subdets[lowerModuleIndex5] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex5] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex5] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex5] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex5] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex5] == ::lst::TwoS); if (layer1 == 1 and layer2 == 2 and layer3 == 3) { if (layer4 == 12 and layer5 == 13) { @@ -381,7 +380,7 @@ namespace lst { } } return true; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE float computeChiSquaredpT5(TAcc const& acc, @@ -402,8 +401,8 @@ namespace lst { float chiSquared = 0.f; float absArctanSlope, angleM, xPrime, yPrime, sigma2; for (size_t i = 0; i < nPoints; i++) { - absArctanSlope = ((slopes[i] != lst::lst_INF) ? alpaka::math::abs(acc, alpaka::math::atan(acc, slopes[i])) - : 0.5f * float(M_PI)); + absArctanSlope = + ((slopes[i] != lst_INF) ? 
alpaka::math::abs(acc, alpaka::math::atan(acc, slopes[i])) : 0.5f * float(M_PI)); if (xs[i] > 0 and ys[i] > 0) { angleM = 0.5f * float(M_PI) - absArctanSlope; } else if (xs[i] < 0 and ys[i] > 0) { @@ -427,11 +426,11 @@ namespace lst { (xs[i] * xs[i] + ys[i] * ys[i] - 2 * g * xs[i] - 2 * f * ys[i] + c) / (sigma2); } return chiSquared; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE void computeSigmasForRegression_pT5(TAcc const& acc, - lst::Modules const& modulesInGPU, + Modules const& modulesInGPU, const uint16_t* lowerModuleIndices, float* delta1, float* delta2, @@ -447,7 +446,7 @@ namespace lst { need not always be a PS strip module, but all non-anchor hits sit on strip modules. */ - ModuleType moduleType; + ::lst::ModuleType moduleType; short moduleSubdet, moduleSide; float inv1 = kWidthPS / kWidth2S; float inv2 = kPixelPSZpitch / kWidth2S; @@ -459,21 +458,21 @@ namespace lst { const float& drdz = modulesInGPU.drdzs[lowerModuleIndices[i]]; slopes[i] = modulesInGPU.dxdys[lowerModuleIndices[i]]; //category 1 - barrel PS flat - if (moduleSubdet == Barrel and moduleType == PS and moduleSide == Center) { + if (moduleSubdet == ::lst::Barrel and moduleType == ::lst::PS and moduleSide == ::lst::Center) { delta1[i] = inv1; delta2[i] = inv1; slopes[i] = -999.f; isFlat[i] = true; } //category 2 - barrel 2S - else if (moduleSubdet == Barrel and moduleType == TwoS) { + else if (moduleSubdet == ::lst::Barrel and moduleType == ::lst::TwoS) { delta1[i] = 1.f; delta2[i] = 1.f; slopes[i] = -999.f; isFlat[i] = true; } //category 3 - barrel PS tilted - else if (moduleSubdet == Barrel and moduleType == PS and moduleSide != Center) { + else if (moduleSubdet == ::lst::Barrel and moduleType == ::lst::PS and moduleSide != ::lst::Center) { delta1[i] = inv1; isFlat[i] = false; @@ -484,7 +483,7 @@ namespace lst { } } //category 4 - endcap PS - else if (moduleSubdet == Endcap and moduleType == PS) { + else if (moduleSubdet == ::lst::Endcap and moduleType == ::lst::PS) { delta1[i] = inv1; 
isFlat[i] = false; /* @@ -499,7 +498,7 @@ namespace lst { } } //category 5 - endcap 2S - else if (moduleSubdet == Endcap and moduleType == TwoS) { + else if (moduleSubdet == ::lst::Endcap and moduleType == ::lst::TwoS) { delta1[i] = 1.f; delta2[i] = 500.f * inv1; isFlat[i] = false; @@ -513,11 +512,11 @@ namespace lst { } #endif } - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE float computePT5RPhiChiSquared(TAcc const& acc, - lst::Modules const& modulesInGPU, + Modules const& modulesInGPU, uint16_t* lowerModuleIndices, float g, float f, @@ -536,7 +535,7 @@ namespace lst { chiSquared = computeChiSquaredpT5(acc, 5, xs, ys, delta1, delta2, slopes, isFlat, g, f, radius); return chiSquared; - }; + } ALPAKA_FN_ACC ALPAKA_FN_INLINE float computePT5RPhiChiSquaredInwards( float g, float f, float r, float* xPix, float* yPix) { @@ -551,9 +550,9 @@ namespace lst { } chiSquared *= 0.5f; return chiSquared; - }; + } - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT5RPhiChiSquaredInwardsCuts(lst::Modules const& modulesInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT5RPhiChiSquaredInwardsCuts(Modules const& modulesInGPU, uint16_t lowerModuleIndex1, uint16_t lowerModuleIndex2, uint16_t lowerModuleIndex3, @@ -561,25 +560,25 @@ namespace lst { uint16_t lowerModuleIndex5, float rPhiChiSquared) { const int layer1 = modulesInGPU.layers[lowerModuleIndex1] + - 6 * (modulesInGPU.subdets[lowerModuleIndex1] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex1] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex1] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex1] == ::lst::TwoS); const int layer2 = modulesInGPU.layers[lowerModuleIndex2] + - 6 * (modulesInGPU.subdets[lowerModuleIndex2] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex2] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex2] == lst::TwoS); + 6 * 
(modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex2] == ::lst::TwoS); const int layer3 = modulesInGPU.layers[lowerModuleIndex3] + - 6 * (modulesInGPU.subdets[lowerModuleIndex3] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex3] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex3] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex3] == ::lst::TwoS); const int layer4 = modulesInGPU.layers[lowerModuleIndex4] + - 6 * (modulesInGPU.subdets[lowerModuleIndex4] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex4] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex4] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex4] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex4] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex4] == ::lst::TwoS); const int layer5 = modulesInGPU.layers[lowerModuleIndex5] + - 6 * (modulesInGPU.subdets[lowerModuleIndex5] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex5] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex5] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex5] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex5] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex5] == ::lst::TwoS); if (layer1 == 1 and layer2 == 2 and layer3 == 3) { if (layer4 == 12 and layer5 == 13) { @@ -641,16 +640,60 @@ namespace lst { } } return true; - }; + } + + template + ALPAKA_FN_ACC ALPAKA_FN_INLINE float computePT5RZChiSquared(TAcc const& acc, + Modules const& modulesInGPU, + uint16_t* lowerModuleIndices, + float* rtPix, + float* zPix, + float* rts, + float* zs) { + //use the two anchor hits of the pixel segment to compute the slope + //then compute the pseudo 
chi squared of the five outer hits + + float slope = (zPix[1] - zPix[0]) / (rtPix[1] - rtPix[0]); + float residual = 0; + float error2 = 0; + //hardcoded array indices!!! + float RMSE = 0; + for (size_t i = 0; i < Params_T5::kLayers; i++) { + uint16_t& lowerModuleIndex = lowerModuleIndices[i]; + const int moduleType = modulesInGPU.moduleType[lowerModuleIndex]; + const int moduleSide = modulesInGPU.sides[lowerModuleIndex]; + const int moduleSubdet = modulesInGPU.subdets[lowerModuleIndex]; + + residual = (moduleSubdet == ::lst::Barrel) ? (zs[i] - zPix[0]) - slope * (rts[i] - rtPix[0]) + : (rts[i] - rtPix[0]) - (zs[i] - zPix[0]) / slope; + const float& drdz = modulesInGPU.drdzs[lowerModuleIndex]; + //PS Modules + if (moduleType == 0) { + error2 = kPixelPSZpitch * kPixelPSZpitch; + } else //2S modules + { + error2 = kStrip2SZpitch * kStrip2SZpitch; + } + + //special dispensation to tilted PS modules! + if (moduleType == 0 and moduleSubdet == ::lst::Barrel and moduleSide != ::lst::Center) { + error2 /= (1.f + drdz * drdz); + } + RMSE += (residual * residual) / error2; + } + + RMSE = alpaka::math::sqrt(acc, 0.2f * RMSE); // Divided by the degree of freedom 5. 
+ return RMSE; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelQuintupletDefaultAlgo(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::ObjectRanges const& rangesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, - lst::Triplets const& tripletsInGPU, - lst::Quintuplets const& quintupletsInGPU, + Modules const& modulesInGPU, + ObjectRanges const& rangesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, + Triplets const& tripletsInGPU, + Quintuplets const& quintupletsInGPU, unsigned int pixelSegmentIndex, unsigned int quintupletIndex, float& rzChiSquared, @@ -787,65 +830,21 @@ namespace lst { centerY = (centerY + T5CenterY) / 2; return true; - }; - - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE float computePT5RZChiSquared(TAcc const& acc, - lst::Modules const& modulesInGPU, - uint16_t* lowerModuleIndices, - float* rtPix, - float* zPix, - float* rts, - float* zs) { - //use the two anchor hits of the pixel segment to compute the slope - //then compute the pseudo chi squared of the five outer hits - - float slope = (zPix[1] - zPix[0]) / (rtPix[1] - rtPix[0]); - float residual = 0; - float error2 = 0; - //hardcoded array indices!!! - float RMSE = 0; - for (size_t i = 0; i < Params_T5::kLayers; i++) { - uint16_t& lowerModuleIndex = lowerModuleIndices[i]; - const int moduleType = modulesInGPU.moduleType[lowerModuleIndex]; - const int moduleSide = modulesInGPU.sides[lowerModuleIndex]; - const int moduleSubdet = modulesInGPU.subdets[lowerModuleIndex]; - - residual = (moduleSubdet == lst::Barrel) ? (zs[i] - zPix[0]) - slope * (rts[i] - rtPix[0]) - : (rts[i] - rtPix[0]) - (zs[i] - zPix[0]) / slope; - const float& drdz = modulesInGPU.drdzs[lowerModuleIndex]; - //PS Modules - if (moduleType == 0) { - error2 = kPixelPSZpitch * kPixelPSZpitch; - } else //2S modules - { - error2 = kStrip2SZpitch * kStrip2SZpitch; - } - - //special dispensation to tilted PS modules! 
- if (moduleType == 0 and moduleSubdet == lst::Barrel and moduleSide != Center) { - error2 /= (1.f + drdz * drdz); - } - RMSE += (residual * residual) / error2; - } - - RMSE = alpaka::math::sqrt(acc, 0.2f * RMSE); // Divided by the degree of freedom 5. - return RMSE; - }; + } - struct createPixelQuintupletsInGPUFromMapv2 { + struct CreatePixelQuintupletsInGPUFromMapv2 { template ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - lst::MiniDoublets mdsInGPU, - lst::Segments segmentsInGPU, - lst::Triplets tripletsInGPU, - lst::Quintuplets quintupletsInGPU, - lst::PixelQuintuplets pixelQuintupletsInGPU, + Modules modulesInGPU, + MiniDoublets mdsInGPU, + Segments segmentsInGPU, + Triplets tripletsInGPU, + Quintuplets quintupletsInGPU, + PixelQuintuplets pixelQuintupletsInGPU, unsigned int* connectedPixelSize, unsigned int* connectedPixelIndex, unsigned int nPixelSegments, - lst::ObjectRanges rangesInGPU) const { + ObjectRanges rangesInGPU) const { auto const globalBlockIdx = alpaka::getIdx(acc); auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridBlockExtent = alpaka::getWorkDiv(acc); @@ -859,7 +858,7 @@ namespace lst { uint16_t quintupletLowerModuleIndex = modulesInGPU.connectedPixels[iLSModule]; if (quintupletLowerModuleIndex >= *modulesInGPU.nLowerModules) continue; - if (modulesInGPU.moduleType[quintupletLowerModuleIndex] == lst::TwoS) + if (modulesInGPU.moduleType[quintupletLowerModuleIndex] == ::lst::TwoS) continue; uint16_t pixelModuleIndex = *modulesInGPU.nLowerModules; if (segmentsInGPU.isDup[i_pLS]) @@ -901,15 +900,15 @@ namespace lst { centerY, static_cast(i_pLS)); if (success) { - unsigned int totOccupancyPixelQuintuplets = - alpaka::atomicOp(acc, pixelQuintupletsInGPU.totOccupancyPixelQuintuplets, 1u); + unsigned int totOccupancyPixelQuintuplets = alpaka::atomicAdd( + acc, pixelQuintupletsInGPU.totOccupancyPixelQuintuplets, 1u, alpaka::hierarchy::Threads{}); if (totOccupancyPixelQuintuplets >= n_max_pixel_quintuplets) 
{ #ifdef WARNINGS printf("Pixel Quintuplet excess alert!\n"); #endif } else { unsigned int pixelQuintupletIndex = - alpaka::atomicOp(acc, pixelQuintupletsInGPU.nPixelQuintuplets, 1u); + alpaka::atomicAdd(acc, pixelQuintupletsInGPU.nPixelQuintuplets, 1u, alpaka::hierarchy::Threads{}); float eta = __H2F(quintupletsInGPU.eta[quintupletIndex]); float phi = __H2F(quintupletsInGPU.phi[quintupletIndex]); @@ -943,5 +942,5 @@ namespace lst { } // end i_pLS } }; -} // namespace lst +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::lst #endif diff --git a/RecoTracker/LSTCore/src/alpaka/PixelTriplet.h b/RecoTracker/LSTCore/src/alpaka/PixelTriplet.h index 3b6faffbce426..710c760fb809f 100644 --- a/RecoTracker/LSTCore/src/alpaka/PixelTriplet.h +++ b/RecoTracker/LSTCore/src/alpaka/PixelTriplet.h @@ -11,7 +11,7 @@ #include "ObjectRanges.h" #include "Quintuplet.h" -namespace lst { +namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { // One pixel segment, one outer tracker triplet! struct PixelTriplets { unsigned int* pixelSegmentIndices; @@ -42,28 +42,28 @@ namespace lst { template void setData(TBuff& buf) { - pixelSegmentIndices = alpaka::getPtrNative(buf.pixelSegmentIndices_buf); - tripletIndices = alpaka::getPtrNative(buf.tripletIndices_buf); - nPixelTriplets = alpaka::getPtrNative(buf.nPixelTriplets_buf); - totOccupancyPixelTriplets = alpaka::getPtrNative(buf.totOccupancyPixelTriplets_buf); - pixelRadius = alpaka::getPtrNative(buf.pixelRadius_buf); - tripletRadius = alpaka::getPtrNative(buf.tripletRadius_buf); - pt = alpaka::getPtrNative(buf.pt_buf); - eta = alpaka::getPtrNative(buf.eta_buf); - phi = alpaka::getPtrNative(buf.phi_buf); - eta_pix = alpaka::getPtrNative(buf.eta_pix_buf); - phi_pix = alpaka::getPtrNative(buf.phi_pix_buf); - score = alpaka::getPtrNative(buf.score_buf); - isDup = alpaka::getPtrNative(buf.isDup_buf); - partOfPT5 = alpaka::getPtrNative(buf.partOfPT5_buf); - logicalLayers = alpaka::getPtrNative(buf.logicalLayers_buf); - hitIndices = 
alpaka::getPtrNative(buf.hitIndices_buf); - lowerModuleIndices = alpaka::getPtrNative(buf.lowerModuleIndices_buf); - centerX = alpaka::getPtrNative(buf.centerX_buf); - centerY = alpaka::getPtrNative(buf.centerY_buf); - rPhiChiSquared = alpaka::getPtrNative(buf.rPhiChiSquared_buf); - rPhiChiSquaredInwards = alpaka::getPtrNative(buf.rPhiChiSquaredInwards_buf); - rzChiSquared = alpaka::getPtrNative(buf.rzChiSquared_buf); + pixelSegmentIndices = buf.pixelSegmentIndices_buf.data(); + tripletIndices = buf.tripletIndices_buf.data(); + nPixelTriplets = buf.nPixelTriplets_buf.data(); + totOccupancyPixelTriplets = buf.totOccupancyPixelTriplets_buf.data(); + pixelRadius = buf.pixelRadius_buf.data(); + tripletRadius = buf.tripletRadius_buf.data(); + pt = buf.pt_buf.data(); + eta = buf.eta_buf.data(); + phi = buf.phi_buf.data(); + eta_pix = buf.eta_pix_buf.data(); + phi_pix = buf.phi_pix_buf.data(); + score = buf.score_buf.data(); + isDup = buf.isDup_buf.data(); + partOfPT5 = buf.partOfPT5_buf.data(); + logicalLayers = buf.logicalLayers_buf.data(); + hitIndices = buf.hitIndices_buf.data(); + lowerModuleIndices = buf.lowerModuleIndices_buf.data(); + centerX = buf.centerX_buf.data(); + centerY = buf.centerY_buf.data(); + rPhiChiSquared = buf.rPhiChiSquared_buf.data(); + rPhiChiSquaredInwards = buf.rPhiChiSquaredInwards_buf.data(); + rzChiSquared = buf.rzChiSquared_buf.data(); } }; @@ -123,17 +123,16 @@ namespace lst { alpaka::memset(queue, nPixelTriplets_buf, 0u); alpaka::memset(queue, totOccupancyPixelTriplets_buf, 0u); alpaka::memset(queue, partOfPT5_buf, false); - alpaka::wait(queue); } inline PixelTriplets const* data() const { return &data_; } inline void setData(PixelTripletsBuffer& buf) { data_.setData(buf); } }; - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelTripletToMemory(lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, - lst::Triplets const& tripletsInGPU, - lst::PixelTriplets& pixelTripletsInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE void 
addPixelTripletToMemory(MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, + Triplets const& tripletsInGPU, + PixelTriplets& pixelTripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float pixelRadius, @@ -211,10 +210,10 @@ namespace lst { template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTrackletDefaultAlgopT3(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::ObjectRanges const& rangesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, + Modules const& modulesInGPU, + ObjectRanges const& rangesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, uint16_t pixelLowerModuleIndex, uint16_t outerInnerLowerModuleIndex, uint16_t outerOuterLowerModuleIndex, @@ -229,8 +228,8 @@ namespace lst { unsigned int thirdMDIndex = segmentsInGPU.mdIndices[Params_LS::kLayers * outerSegmentIndex]; unsigned int fourthMDIndex = segmentsInGPU.mdIndices[Params_LS::kLayers * outerSegmentIndex + 1]; - if (outerInnerLowerModuleSubdet == lst::Barrel and - (outerOuterLowerModuleSubdet == lst::Barrel or outerOuterLowerModuleSubdet == lst::Endcap)) { + if (outerInnerLowerModuleSubdet == ::lst::Barrel and + (outerOuterLowerModuleSubdet == ::lst::Barrel or outerOuterLowerModuleSubdet == ::lst::Endcap)) { return runTripletDefaultAlgoPPBB(acc, modulesInGPU, rangesInGPU, @@ -245,7 +244,7 @@ namespace lst { secondMDIndex, thirdMDIndex, fourthMDIndex); - } else if (outerInnerLowerModuleSubdet == lst::Endcap and outerOuterLowerModuleSubdet == lst::Endcap) { + } else if (outerInnerLowerModuleSubdet == ::lst::Endcap and outerOuterLowerModuleSubdet == ::lst::Endcap) { return runTripletDefaultAlgoPPEE(acc, modulesInGPU, rangesInGPU, @@ -264,23 +263,23 @@ namespace lst { return false; }; - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT3RZChiSquaredCuts(lst::Modules const& modulesInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT3RZChiSquaredCuts(Modules const& modulesInGPU, uint16_t lowerModuleIndex1, uint16_t 
lowerModuleIndex2, uint16_t lowerModuleIndex3, float rzChiSquared) { const int layer1 = modulesInGPU.layers[lowerModuleIndex1] + - 6 * (modulesInGPU.subdets[lowerModuleIndex1] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex1] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex1] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex1] == ::lst::TwoS); const int layer2 = modulesInGPU.layers[lowerModuleIndex2] + - 6 * (modulesInGPU.subdets[lowerModuleIndex2] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex2] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex2] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex2] == ::lst::TwoS); const int layer3 = modulesInGPU.layers[lowerModuleIndex3] + - 6 * (modulesInGPU.subdets[lowerModuleIndex3] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex3] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex3] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex3] == ::lst::TwoS); if (layer1 == 8 and layer2 == 9 and layer3 == 10) { return rzChiSquared < 13.6067f; @@ -336,8 +335,8 @@ namespace lst { float chiSquared = 0.f; float absArctanSlope, angleM, xPrime, yPrime, sigma2; for (size_t i = 0; i < nPoints; i++) { - absArctanSlope = ((slopes[i] != lst::lst_INF) ? alpaka::math::abs(acc, alpaka::math::atan(acc, slopes[i])) - : 0.5f * float(M_PI)); + absArctanSlope = + ((slopes[i] != lst_INF) ? 
alpaka::math::abs(acc, alpaka::math::atan(acc, slopes[i])) : 0.5f * float(M_PI)); if (xs[i] > 0 and ys[i] > 0) { angleM = 0.5f * float(M_PI) - absArctanSlope; } else if (xs[i] < 0 and ys[i] > 0) { @@ -367,7 +366,7 @@ namespace lst { //TODO: merge this one and the pT5 function later into a single function template ALPAKA_FN_ACC ALPAKA_FN_INLINE float computePT3RPhiChiSquared(TAcc const& acc, - lst::Modules const& modulesInGPU, + Modules const& modulesInGPU, uint16_t* lowerModuleIndices, float g, float f, @@ -380,33 +379,33 @@ namespace lst { float inv1 = kWidthPS / kWidth2S; float inv2 = kPixelPSZpitch / kWidth2S; for (size_t i = 0; i < 3; i++) { - ModuleType moduleType = modulesInGPU.moduleType[lowerModuleIndices[i]]; + ::lst::ModuleType moduleType = modulesInGPU.moduleType[lowerModuleIndices[i]]; short moduleSubdet = modulesInGPU.subdets[lowerModuleIndices[i]]; short moduleSide = modulesInGPU.sides[lowerModuleIndices[i]]; float drdz = modulesInGPU.drdzs[lowerModuleIndices[i]]; slopes[i] = modulesInGPU.dxdys[lowerModuleIndices[i]]; //category 1 - barrel PS flat - if (moduleSubdet == Barrel and moduleType == PS and moduleSide == Center) { + if (moduleSubdet == ::lst::Barrel and moduleType == ::lst::PS and moduleSide == ::lst::Center) { delta1[i] = inv1; delta2[i] = inv1; slopes[i] = -999; isFlat[i] = true; } //category 2 - barrel 2S - else if (moduleSubdet == Barrel and moduleType == TwoS) { + else if (moduleSubdet == ::lst::Barrel and moduleType == ::lst::TwoS) { delta1[i] = 1; delta2[i] = 1; slopes[i] = -999; isFlat[i] = true; } //category 3 - barrel PS tilted - else if (moduleSubdet == Barrel and moduleType == PS and moduleSide != Center) { + else if (moduleSubdet == ::lst::Barrel and moduleType == ::lst::PS and moduleSide != ::lst::Center) { delta1[i] = inv1; isFlat[i] = false; delta2[i] = (inv2 * drdz / alpaka::math::sqrt(acc, 1 + drdz * drdz)); } //category 4 - endcap PS - else if (moduleSubdet == Endcap and moduleType == PS) { + else if (moduleSubdet == 
::lst::Endcap and moduleType == ::lst::PS) { delta1[i] = inv1; isFlat[i] = false; @@ -417,7 +416,7 @@ namespace lst { delta2[i] = inv2; } //category 5 - endcap 2S - else if (moduleSubdet == Endcap and moduleType == TwoS) { + else if (moduleSubdet == ::lst::Endcap and moduleType == ::lst::TwoS) { delta1[i] = 1; delta2[i] = 500 * inv1; isFlat[i] = false; @@ -448,23 +447,23 @@ namespace lst { }; //90pc threshold - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT3RPhiChiSquaredCuts(lst::Modules const& modulesInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT3RPhiChiSquaredCuts(Modules const& modulesInGPU, uint16_t lowerModuleIndex1, uint16_t lowerModuleIndex2, uint16_t lowerModuleIndex3, float chiSquared) { const int layer1 = modulesInGPU.layers[lowerModuleIndex1] + - 6 * (modulesInGPU.subdets[lowerModuleIndex1] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex1] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex1] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex1] == ::lst::TwoS); const int layer2 = modulesInGPU.layers[lowerModuleIndex2] + - 6 * (modulesInGPU.subdets[lowerModuleIndex2] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex2] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex2] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex2] == ::lst::TwoS); const int layer3 = modulesInGPU.layers[lowerModuleIndex3] + - 6 * (modulesInGPU.subdets[lowerModuleIndex3] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex3] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex3] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Endcap and + 
modulesInGPU.moduleType[lowerModuleIndex3] == ::lst::TwoS); if (layer1 == 8 and layer2 == 9 and layer3 == 10) { return chiSquared < 7.003f; @@ -495,23 +494,23 @@ namespace lst { return true; }; - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT3RPhiChiSquaredInwardsCuts(lst::Modules const& modulesInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPT3RPhiChiSquaredInwardsCuts(Modules const& modulesInGPU, uint16_t lowerModuleIndex1, uint16_t lowerModuleIndex2, uint16_t lowerModuleIndex3, float chiSquared) { const int layer1 = modulesInGPU.layers[lowerModuleIndex1] + - 6 * (modulesInGPU.subdets[lowerModuleIndex1] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex1] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex1] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex1] == ::lst::TwoS); const int layer2 = modulesInGPU.layers[lowerModuleIndex2] + - 6 * (modulesInGPU.subdets[lowerModuleIndex2] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex2] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex2] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex2] == ::lst::TwoS); const int layer3 = modulesInGPU.layers[lowerModuleIndex3] + - 6 * (modulesInGPU.subdets[lowerModuleIndex3] == lst::Endcap) + - 5 * (modulesInGPU.subdets[lowerModuleIndex3] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex3] == lst::TwoS); + 6 * (modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Endcap) + + 5 * (modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex3] == ::lst::TwoS); if (layer1 == 7 and layer2 == 8 and layer3 == 9) // endcap layer 1,2,3, ps { @@ -664,18 +663,18 @@ namespace lst { template ALPAKA_FN_ACC 
ALPAKA_FN_INLINE bool passRadiusCriterion(TAcc const& acc, - lst::Modules const& modulesInGPU, + Modules const& modulesInGPU, float pixelRadius, float pixelRadiusError, float tripletRadius, int16_t lowerModuleIndex, uint16_t middleModuleIndex, uint16_t upperModuleIndex) { - if (modulesInGPU.subdets[lowerModuleIndex] == lst::Endcap) { + if (modulesInGPU.subdets[lowerModuleIndex] == ::lst::Endcap) { return passRadiusCriterionEEE(acc, pixelRadius, pixelRadiusError, tripletRadius); - } else if (modulesInGPU.subdets[middleModuleIndex] == lst::Endcap) { + } else if (modulesInGPU.subdets[middleModuleIndex] == ::lst::Endcap) { return passRadiusCriterionBEE(acc, pixelRadius, pixelRadiusError, tripletRadius); - } else if (modulesInGPU.subdets[upperModuleIndex] == lst::Endcap) { + } else if (modulesInGPU.subdets[upperModuleIndex] == ::lst::Endcap) { return passRadiusCriterionBBE(acc, pixelRadius, pixelRadiusError, tripletRadius); } else { return passRadiusCriterionBBB(acc, pixelRadius, pixelRadiusError, tripletRadius); @@ -684,7 +683,7 @@ namespace lst { template ALPAKA_FN_ACC ALPAKA_FN_INLINE float computePT3RZChiSquared(TAcc const& acc, - lst::Modules const& modulesInGPU, + Modules const& modulesInGPU, const uint16_t* lowerModuleIndices, const float* rtPix, const float* xPix, @@ -725,14 +724,14 @@ namespace lst { float p = alpaka::math::sqrt(acc, Px * Px + Py * Py + Pz * Pz); float rou = a / p; - if (moduleSubdet == lst::Endcap) { + if (moduleSubdet == ::lst::Endcap) { float s = (zsi - z1) * p / Pz; float x = x1 + Px / a * alpaka::math::sin(acc, rou * s) - Py / a * (1 - alpaka::math::cos(acc, rou * s)); float y = y1 + Py / a * alpaka::math::sin(acc, rou * s) + Px / a * (1 - alpaka::math::cos(acc, rou * s)); diffr = alpaka::math::abs(acc, rtsi - alpaka::math::sqrt(acc, x * x + y * y)) * 100; } - if (moduleSubdet == lst::Barrel) { + if (moduleSubdet == ::lst::Barrel) { float paraA = r1 * r1 + 2 * (Px * Px + Py * Py) / (a * a) + 2 * (y1 * Px - x1 * Py) / a - rtsi * rtsi; float 
paraB = 2 * (x1 * Px + y1 * Py) / a; float paraC = 2 * (y1 * Px - x1 * Py) / a + 2 * (Px * Px + Py * Py) / (a * a); @@ -748,7 +747,7 @@ namespace lst { diffz = alpaka::math::min(acc, diffz1, diffz2); } - residual = moduleSubdet == lst::Barrel ? diffz : diffr; + residual = moduleSubdet == ::lst::Barrel ? diffz : diffr; //PS Modules if (moduleType == 0) { @@ -759,7 +758,7 @@ namespace lst { } //special dispensation to tilted PS modules! - if (moduleType == 0 and moduleSubdet == lst::Barrel and moduleSide != Center) { + if (moduleType == 0 and moduleSubdet == ::lst::Barrel and moduleSide != ::lst::Center) { float drdz = modulesInGPU.drdzs[lowerModuleIndex]; error2 /= (1 + drdz * drdz); } @@ -773,11 +772,11 @@ namespace lst { template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runPixelTripletDefaultAlgo(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::ObjectRanges const& rangesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, - lst::Triplets const& tripletsInGPU, + Modules const& modulesInGPU, + ObjectRanges const& rangesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, + Triplets const& tripletsInGPU, unsigned int pixelSegmentIndex, unsigned int tripletIndex, float& pixelRadius, @@ -926,15 +925,15 @@ namespace lst { return true; }; - struct createPixelTripletsInGPUFromMapv2 { + struct CreatePixelTripletsInGPUFromMapv2 { template ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - lst::ObjectRanges rangesInGPU, - lst::MiniDoublets mdsInGPU, - lst::Segments segmentsInGPU, - lst::Triplets tripletsInGPU, - lst::PixelTriplets pixelTripletsInGPU, + Modules modulesInGPU, + ObjectRanges rangesInGPU, + MiniDoublets mdsInGPU, + Segments segmentsInGPU, + Triplets tripletsInGPU, + PixelTriplets pixelTripletsInGPU, unsigned int* connectedPixelSize, unsigned int* connectedPixelIndex, unsigned int nPixelSegments) const { @@ -960,7 +959,7 @@ namespace lst { } #endif //Removes 2S-2S :FIXME: filter 
these out in the pixel map - if (modulesInGPU.moduleType[tripletLowerModuleIndex] == lst::TwoS) + if (modulesInGPU.moduleType[tripletLowerModuleIndex] == ::lst::TwoS) continue; uint16_t pixelModuleIndex = *modulesInGPU.nLowerModules; @@ -991,7 +990,7 @@ namespace lst { outerTripletArrayIndex += gridThreadExtent[2]) { unsigned int outerTripletIndex = rangesInGPU.tripletModuleIndices[tripletLowerModuleIndex] + outerTripletArrayIndex; - if (modulesInGPU.moduleType[tripletsInGPU.lowerModuleIndices[3 * outerTripletIndex + 1]] == lst::TwoS) + if (modulesInGPU.moduleType[tripletsInGPU.lowerModuleIndices[3 * outerTripletIndex + 1]] == ::lst::TwoS) continue; //REMOVES PS-2S if (tripletsInGPU.partOfPT5[outerTripletIndex]) @@ -1025,15 +1024,15 @@ namespace lst { float phi_pix = segmentsInGPU.phi[i_pLS]; float pt = segmentsInGPU.ptIn[i_pLS]; float score = rPhiChiSquared + rPhiChiSquaredInwards; - unsigned int totOccupancyPixelTriplets = - alpaka::atomicOp(acc, pixelTripletsInGPU.totOccupancyPixelTriplets, 1u); + unsigned int totOccupancyPixelTriplets = alpaka::atomicAdd( + acc, pixelTripletsInGPU.totOccupancyPixelTriplets, 1u, alpaka::hierarchy::Threads{}); if (totOccupancyPixelTriplets >= n_max_pixel_triplets) { #ifdef WARNINGS printf("Pixel Triplet excess alert!\n"); #endif } else { unsigned int pixelTripletIndex = - alpaka::atomicOp(acc, pixelTripletsInGPU.nPixelTriplets, 1u); + alpaka::atomicAdd(acc, pixelTripletsInGPU.nPixelTriplets, 1u, alpaka::hierarchy::Threads{}); addPixelTripletToMemory(mdsInGPU, segmentsInGPU, tripletsInGPU, @@ -1077,33 +1076,30 @@ namespace lst { betaOut += alpaka::math::copysign( acc, alpaka::math::asin( - acc, - alpaka::math::min(acc, sdOut_dr * lst::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), lst::kSinAlphaMax)), + acc, alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax)), betaOut); return; } if (betaIn * betaOut > 0.f and - (alpaka::math::abs(acc, pt_beta) < 4.f * lst::kPt_betaMax or + 
(alpaka::math::abs(acc, pt_beta) < 4.f * kPt_betaMax or (lIn >= 11 and alpaka::math::abs(acc, pt_beta) < - 8.f * lst::kPt_betaMax))) //and the pt_beta is well-defined; less strict for endcap-endcap + 8.f * kPt_betaMax))) //and the pt_beta is well-defined; less strict for endcap-endcap { const float betaInUpd = - betaIn + alpaka::math::copysign( - acc, - alpaka::math::asin( - acc, - alpaka::math::min( - acc, sdIn_dr * lst::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), lst::kSinAlphaMax)), - betaIn); //FIXME: need a faster version + betaIn + + alpaka::math::copysign( + acc, + alpaka::math::asin( + acc, alpaka::math::min(acc, sdIn_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax)), + betaIn); //FIXME: need a faster version const float betaOutUpd = - betaOut + alpaka::math::copysign( - acc, - alpaka::math::asin( - acc, - alpaka::math::min( - acc, sdOut_dr * lst::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), lst::kSinAlphaMax)), - betaOut); //FIXME: need a faster version + betaOut + + alpaka::math::copysign( + acc, + alpaka::math::asin( + acc, alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax)), + betaOut); //FIXME: need a faster version betaAv = 0.5f * (betaInUpd + betaOutUpd); //1st update @@ -1112,69 +1108,65 @@ namespace lst { betaIn += alpaka::math::copysign( acc, - alpaka::math::asin(acc, alpaka::math::min(acc, sdIn_dr * lst::k2Rinv1GeVf * pt_beta_inv, lst::kSinAlphaMax)), + alpaka::math::asin(acc, alpaka::math::min(acc, sdIn_dr * k2Rinv1GeVf * pt_beta_inv, kSinAlphaMax)), betaIn); //FIXME: need a faster version betaOut += alpaka::math::copysign( acc, - alpaka::math::asin(acc, alpaka::math::min(acc, sdOut_dr * lst::k2Rinv1GeVf * pt_beta_inv, lst::kSinAlphaMax)), + alpaka::math::asin(acc, alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf * pt_beta_inv, kSinAlphaMax)), betaOut); //FIXME: need a faster version //update the av and pt betaAv = 0.5f * (betaIn + betaOut); //2nd update - pt_beta = dr * 
lst::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate + pt_beta = dr * k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate } else if (lIn < 11 && alpaka::math::abs(acc, betaOut) < 0.2f * alpaka::math::abs(acc, betaIn) && - alpaka::math::abs(acc, pt_beta) < 12.f * lst::kPt_betaMax) //use betaIn sign as ref + alpaka::math::abs(acc, pt_beta) < 12.f * kPt_betaMax) //use betaIn sign as ref { const float pt_betaIn = dr * k2Rinv1GeVf / alpaka::math::sin(acc, betaIn); const float betaInUpd = - betaIn + alpaka::math::copysign( - acc, - alpaka::math::asin( - acc, - alpaka::math::min( - acc, sdIn_dr * lst::k2Rinv1GeVf / alpaka::math::abs(acc, pt_betaIn), lst::kSinAlphaMax)), - betaIn); //FIXME: need a faster version + betaIn + + alpaka::math::copysign( + acc, + alpaka::math::asin( + acc, alpaka::math::min(acc, sdIn_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_betaIn), kSinAlphaMax)), + betaIn); //FIXME: need a faster version const float betaOutUpd = betaOut + alpaka::math::copysign( acc, alpaka::math::asin( acc, - alpaka::math::min( - acc, sdOut_dr * lst::k2Rinv1GeVf / alpaka::math::abs(acc, pt_betaIn), lst::kSinAlphaMax)), + alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_betaIn), kSinAlphaMax)), betaIn); //FIXME: need a faster version betaAv = (alpaka::math::abs(acc, betaOut) > 0.2f * alpaka::math::abs(acc, betaIn)) ? 
(0.5f * (betaInUpd + betaOutUpd)) : betaInUpd; //1st update - pt_beta = dr * lst::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate + pt_beta = dr * k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate betaIn += alpaka::math::copysign( acc, alpaka::math::asin( - acc, - alpaka::math::min(acc, sdIn_dr * lst::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), lst::kSinAlphaMax)), + acc, alpaka::math::min(acc, sdIn_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax)), betaIn); //FIXME: need a faster version betaOut += alpaka::math::copysign( acc, alpaka::math::asin( - acc, - alpaka::math::min(acc, sdOut_dr * lst::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), lst::kSinAlphaMax)), + acc, alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax)), betaIn); //FIXME: need a faster version //update the av and pt betaAv = 0.5f * (betaIn + betaOut); //2nd update - pt_beta = dr * lst::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate + pt_beta = dr * k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate } - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPBB(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::ObjectRanges const& rangesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, + Modules const& modulesInGPU, + ObjectRanges const& rangesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, uint16_t pixelModuleIndex, uint16_t outerInnerLowerModuleIndex, uint16_t outerOuterLowerModuleIndex, @@ -1186,7 +1178,7 @@ namespace lst { unsigned int fourthMDIndex) { float dPhi, betaIn, betaOut, pt_beta, zLo, zHi, zLoPointed, zHiPointed, dPhiCut, betaOutCut; - bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == lst::PS); + bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == ::lst::PS); float rt_InLo = 
mdsInGPU.anchorRt[firstMDIndex]; float rt_InUp = mdsInGPU.anchorRt[secondMDIndex]; @@ -1208,7 +1200,7 @@ namespace lst { float rt_InOut = rt_InUp; - if (alpaka::math::abs(acc, lst::deltaPhi(acc, x_InUp, y_InUp, x_OutLo, y_OutLo)) > 0.5f * float(M_PI)) + if (alpaka::math::abs(acc, deltaPhi(acc, x_InUp, y_InUp, x_OutLo, y_OutLo)) > 0.5f * float(M_PI)) return false; unsigned int pixelSegmentArrayIndex = innerSegmentIndex - rangesInGPU.segmentModuleIndices[pixelModuleIndex]; @@ -1282,7 +1274,7 @@ namespace lst { float diffX = x_OutLo - x_InLo; float diffY = y_OutLo - y_InLo; - dPhi = lst::deltaPhi(acc, midPointX, midPointY, diffX, diffY); + dPhi = deltaPhi(acc, midPointX, midPointY, diffX, diffY); if (alpaka::math::abs(acc, dPhi) > dPhiCut) return false; @@ -1292,11 +1284,11 @@ namespace lst { float alpha_InLo = __H2F(segmentsInGPU.dPhiChanges[innerSegmentIndex]); float alpha_OutLo = __H2F(segmentsInGPU.dPhiChanges[outerSegmentIndex]); - bool isEC_lastLayer = modulesInGPU.subdets[outerOuterLowerModuleIndex] == lst::Endcap and - modulesInGPU.moduleType[outerOuterLowerModuleIndex] == lst::TwoS; + bool isEC_lastLayer = modulesInGPU.subdets[outerOuterLowerModuleIndex] == ::lst::Endcap and + modulesInGPU.moduleType[outerOuterLowerModuleIndex] == ::lst::TwoS; float alpha_OutUp, alpha_OutUp_highEdge, alpha_OutUp_lowEdge; - alpha_OutUp = lst::deltaPhi(acc, x_OutUp, y_OutUp, x_OutUp - x_OutLo, y_OutUp - y_OutLo); + alpha_OutUp = deltaPhi(acc, x_OutUp, y_OutUp, x_OutUp - x_OutLo, y_OutUp - y_OutLo); alpha_OutUp_highEdge = alpha_OutUp; alpha_OutUp_lowEdge = alpha_OutUp; @@ -1310,42 +1302,42 @@ namespace lst { float tl_axis_lowEdge_x = tl_axis_x; float tl_axis_lowEdge_y = tl_axis_y; - betaIn = -lst::deltaPhi(acc, px, py, tl_axis_x, tl_axis_y); + betaIn = -deltaPhi(acc, px, py, tl_axis_x, tl_axis_y); float betaInRHmin = betaIn; float betaInRHmax = betaIn; - betaOut = -alpha_OutUp + lst::deltaPhi(acc, x_OutUp, y_OutUp, tl_axis_x, tl_axis_y); + betaOut = -alpha_OutUp + deltaPhi(acc, 
x_OutUp, y_OutUp, tl_axis_x, tl_axis_y); float betaOutRHmin = betaOut; float betaOutRHmax = betaOut; if (isEC_lastLayer) { - alpha_OutUp_highEdge = lst::deltaPhi(acc, - mdsInGPU.anchorHighEdgeX[fourthMDIndex], - mdsInGPU.anchorHighEdgeY[fourthMDIndex], - mdsInGPU.anchorHighEdgeX[fourthMDIndex] - x_OutLo, - mdsInGPU.anchorHighEdgeY[fourthMDIndex] - y_OutLo); - alpha_OutUp_lowEdge = lst::deltaPhi(acc, - mdsInGPU.anchorLowEdgeX[fourthMDIndex], - mdsInGPU.anchorLowEdgeY[fourthMDIndex], - mdsInGPU.anchorLowEdgeX[fourthMDIndex] - x_OutLo, - mdsInGPU.anchorLowEdgeY[fourthMDIndex] - y_OutLo); + alpha_OutUp_highEdge = deltaPhi(acc, + mdsInGPU.anchorHighEdgeX[fourthMDIndex], + mdsInGPU.anchorHighEdgeY[fourthMDIndex], + mdsInGPU.anchorHighEdgeX[fourthMDIndex] - x_OutLo, + mdsInGPU.anchorHighEdgeY[fourthMDIndex] - y_OutLo); + alpha_OutUp_lowEdge = deltaPhi(acc, + mdsInGPU.anchorLowEdgeX[fourthMDIndex], + mdsInGPU.anchorLowEdgeY[fourthMDIndex], + mdsInGPU.anchorLowEdgeX[fourthMDIndex] - x_OutLo, + mdsInGPU.anchorLowEdgeY[fourthMDIndex] - y_OutLo); tl_axis_highEdge_x = mdsInGPU.anchorHighEdgeX[fourthMDIndex] - x_InUp; tl_axis_highEdge_y = mdsInGPU.anchorHighEdgeY[fourthMDIndex] - y_InUp; tl_axis_lowEdge_x = mdsInGPU.anchorLowEdgeX[fourthMDIndex] - x_InUp; tl_axis_lowEdge_y = mdsInGPU.anchorLowEdgeY[fourthMDIndex] - y_InUp; - betaOutRHmin = -alpha_OutUp_highEdge + lst::deltaPhi(acc, - mdsInGPU.anchorHighEdgeX[fourthMDIndex], - mdsInGPU.anchorHighEdgeY[fourthMDIndex], - tl_axis_highEdge_x, - tl_axis_highEdge_y); - betaOutRHmax = -alpha_OutUp_lowEdge + lst::deltaPhi(acc, - mdsInGPU.anchorLowEdgeX[fourthMDIndex], - mdsInGPU.anchorLowEdgeY[fourthMDIndex], - tl_axis_lowEdge_x, - tl_axis_lowEdge_y); + betaOutRHmin = -alpha_OutUp_highEdge + deltaPhi(acc, + mdsInGPU.anchorHighEdgeX[fourthMDIndex], + mdsInGPU.anchorHighEdgeY[fourthMDIndex], + tl_axis_highEdge_x, + tl_axis_highEdge_y); + betaOutRHmax = -alpha_OutUp_lowEdge + deltaPhi(acc, + mdsInGPU.anchorLowEdgeX[fourthMDIndex], + 
mdsInGPU.anchorLowEdgeY[fourthMDIndex], + tl_axis_lowEdge_x, + tl_axis_lowEdge_y); } //beta computation @@ -1379,7 +1371,7 @@ namespace lst { betaOutRHmax *= betaOutMMSF; float min_ptBeta_ptBetaMax = alpaka::math::min( - acc, alpaka::math::abs(acc, pt_beta), lst::kPt_betaMax); //need to confirm the range-out value of 7 GeV + acc, alpaka::math::abs(acc, pt_beta), kPt_betaMax); //need to confirm the range-out value of 7 GeV const float dBetaMuls2 = thetaMuls2 * 16.f / (min_ptBeta_ptBetaMax * min_ptBeta_ptBetaMax); const float alphaInAbsReg = alpaka::math::max(acc, @@ -1425,14 +1417,14 @@ namespace lst { (alpaka::math::abs(acc, betaInRHmin - betaInRHmax) + alpaka::math::abs(acc, betaOutRHmin - betaOutRHmax))); float dBeta = betaIn - betaOut; return dBeta * dBeta <= dBetaCut2; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletDefaultAlgoPPEE(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::ObjectRanges const& rangesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, + Modules const& modulesInGPU, + ObjectRanges const& rangesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, uint16_t pixelModuleIndex, uint16_t outerInnerLowerModuleIndex, uint16_t outerOuterLowerModuleIndex, @@ -1444,7 +1436,7 @@ namespace lst { unsigned int fourthMDIndex) { float dPhi, betaIn, betaOut, pt_beta, rtLo, rtHi, dPhiCut, betaOutCut; - bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == lst::PS); + bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == ::lst::PS); float z_InUp = mdsInGPU.anchorZ[secondMDIndex]; float z_OutLo = mdsInGPU.anchorZ[thirdMDIndex]; @@ -1488,7 +1480,7 @@ namespace lst { const float dzDrtScale = alpaka::math::tan(acc, slope) / slope; //FIXME: need approximate value const float dLum = alpaka::math::copysign(acc, kDeltaZLum, z_InUp); - bool isOutSgInnerMDPS = modulesInGPU.moduleType[outerInnerLowerModuleIndex] == lst::PS; + bool isOutSgInnerMDPS = 
modulesInGPU.moduleType[outerInnerLowerModuleIndex] == ::lst::PS; const float rtGeom1 = isOutSgInnerMDPS ? kPixelPSZpitch @@ -1545,7 +1537,7 @@ namespace lst { float diffX = x_OutLo - x_InLo; float diffY = y_OutLo - y_InLo; - dPhi = lst::deltaPhi(acc, midPointX, midPointY, diffX, diffY); + dPhi = deltaPhi(acc, midPointX, midPointY, diffX, diffY); // Cut #5: deltaPhiChange if (alpaka::math::abs(acc, dPhi) > dPhiCut) @@ -1554,12 +1546,12 @@ namespace lst { float alpha_InLo = __H2F(segmentsInGPU.dPhiChanges[innerSegmentIndex]); float alpha_OutLo = __H2F(segmentsInGPU.dPhiChanges[outerSegmentIndex]); - bool isEC_lastLayer = modulesInGPU.subdets[outerOuterLowerModuleIndex] == lst::Endcap and - modulesInGPU.moduleType[outerOuterLowerModuleIndex] == lst::TwoS; + bool isEC_lastLayer = modulesInGPU.subdets[outerOuterLowerModuleIndex] == ::lst::Endcap and + modulesInGPU.moduleType[outerOuterLowerModuleIndex] == ::lst::TwoS; float alpha_OutUp, alpha_OutUp_highEdge, alpha_OutUp_lowEdge; - alpha_OutUp = lst::deltaPhi(acc, x_OutUp, y_OutUp, x_OutUp - x_OutLo, y_OutUp - y_OutLo); + alpha_OutUp = deltaPhi(acc, x_OutUp, y_OutUp, x_OutUp - x_OutLo, y_OutUp - y_OutLo); alpha_OutUp_highEdge = alpha_OutUp; alpha_OutUp_lowEdge = alpha_OutUp; @@ -1572,41 +1564,41 @@ namespace lst { float tl_axis_lowEdge_x = tl_axis_x; float tl_axis_lowEdge_y = tl_axis_y; - betaIn = -lst::deltaPhi(acc, px, py, tl_axis_x, tl_axis_y); + betaIn = -deltaPhi(acc, px, py, tl_axis_x, tl_axis_y); float betaInRHmin = betaIn; float betaInRHmax = betaIn; - betaOut = -alpha_OutUp + lst::deltaPhi(acc, x_OutUp, y_OutUp, tl_axis_x, tl_axis_y); + betaOut = -alpha_OutUp + deltaPhi(acc, x_OutUp, y_OutUp, tl_axis_x, tl_axis_y); float betaOutRHmin = betaOut; float betaOutRHmax = betaOut; if (isEC_lastLayer) { - alpha_OutUp_highEdge = lst::deltaPhi(acc, - mdsInGPU.anchorHighEdgeX[fourthMDIndex], - mdsInGPU.anchorHighEdgeY[fourthMDIndex], - mdsInGPU.anchorHighEdgeX[fourthMDIndex] - x_OutLo, - 
mdsInGPU.anchorHighEdgeY[fourthMDIndex] - y_OutLo); - alpha_OutUp_lowEdge = lst::deltaPhi(acc, - mdsInGPU.anchorLowEdgeX[fourthMDIndex], - mdsInGPU.anchorLowEdgeY[fourthMDIndex], - mdsInGPU.anchorLowEdgeX[fourthMDIndex] - x_OutLo, - mdsInGPU.anchorLowEdgeY[fourthMDIndex] - y_OutLo); + alpha_OutUp_highEdge = deltaPhi(acc, + mdsInGPU.anchorHighEdgeX[fourthMDIndex], + mdsInGPU.anchorHighEdgeY[fourthMDIndex], + mdsInGPU.anchorHighEdgeX[fourthMDIndex] - x_OutLo, + mdsInGPU.anchorHighEdgeY[fourthMDIndex] - y_OutLo); + alpha_OutUp_lowEdge = deltaPhi(acc, + mdsInGPU.anchorLowEdgeX[fourthMDIndex], + mdsInGPU.anchorLowEdgeY[fourthMDIndex], + mdsInGPU.anchorLowEdgeX[fourthMDIndex] - x_OutLo, + mdsInGPU.anchorLowEdgeY[fourthMDIndex] - y_OutLo); tl_axis_highEdge_x = mdsInGPU.anchorHighEdgeX[fourthMDIndex] - x_InUp; tl_axis_highEdge_y = mdsInGPU.anchorHighEdgeY[fourthMDIndex] - y_InUp; tl_axis_lowEdge_x = mdsInGPU.anchorLowEdgeX[fourthMDIndex] - x_InUp; tl_axis_lowEdge_y = mdsInGPU.anchorLowEdgeY[fourthMDIndex] - y_InUp; - betaOutRHmin = -alpha_OutUp_highEdge + lst::deltaPhi(acc, - mdsInGPU.anchorHighEdgeX[fourthMDIndex], - mdsInGPU.anchorHighEdgeY[fourthMDIndex], - tl_axis_highEdge_x, - tl_axis_highEdge_y); - betaOutRHmax = -alpha_OutUp_lowEdge + lst::deltaPhi(acc, - mdsInGPU.anchorLowEdgeX[fourthMDIndex], - mdsInGPU.anchorLowEdgeY[fourthMDIndex], - tl_axis_lowEdge_x, - tl_axis_lowEdge_y); + betaOutRHmin = -alpha_OutUp_highEdge + deltaPhi(acc, + mdsInGPU.anchorHighEdgeX[fourthMDIndex], + mdsInGPU.anchorHighEdgeY[fourthMDIndex], + tl_axis_highEdge_x, + tl_axis_highEdge_y); + betaOutRHmax = -alpha_OutUp_lowEdge + deltaPhi(acc, + mdsInGPU.anchorLowEdgeX[fourthMDIndex], + mdsInGPU.anchorLowEdgeY[fourthMDIndex], + tl_axis_lowEdge_x, + tl_axis_lowEdge_y); } //beta computation @@ -1638,7 +1630,7 @@ namespace lst { betaOutRHmax *= betaOutMMSF; float min_ptBeta_ptBetaMax = alpaka::math::min( - acc, alpaka::math::abs(acc, pt_beta), lst::kPt_betaMax); //need to confirm the range-out value 
of 7 GeV + acc, alpaka::math::abs(acc, pt_beta), kPt_betaMax); //need to confirm the range-out value of 7 GeV const float dBetaMuls2 = thetaMuls2 * 16.f / (min_ptBeta_ptBetaMax * min_ptBeta_ptBetaMax); const float alphaInAbsReg = @@ -1689,7 +1681,7 @@ namespace lst { (alpaka::math::abs(acc, betaInRHmin - betaInRHmax) + alpaka::math::abs(acc, betaOutRHmin - betaOutRHmax))); float dBeta = betaIn - betaOut; return dBeta * dBeta <= dBetaCut2; - }; + } -} // namespace lst +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::lst #endif diff --git a/RecoTracker/LSTCore/src/alpaka/Quintuplet.h b/RecoTracker/LSTCore/src/alpaka/Quintuplet.h index 1165d33f6da5e..4ff67d66d2844 100644 --- a/RecoTracker/LSTCore/src/alpaka/Quintuplet.h +++ b/RecoTracker/LSTCore/src/alpaka/Quintuplet.h @@ -14,7 +14,7 @@ #include "ObjectRanges.h" #include "Triplet.h" -namespace lst { +namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { struct Quintuplets { unsigned int* tripletIndices; uint16_t* lowerModuleIndices; @@ -46,30 +46,30 @@ namespace lst { template void setData(TBuff& buf) { - tripletIndices = alpaka::getPtrNative(buf.tripletIndices_buf); - lowerModuleIndices = alpaka::getPtrNative(buf.lowerModuleIndices_buf); - nQuintuplets = alpaka::getPtrNative(buf.nQuintuplets_buf); - totOccupancyQuintuplets = alpaka::getPtrNative(buf.totOccupancyQuintuplets_buf); - nMemoryLocations = alpaka::getPtrNative(buf.nMemoryLocations_buf); - innerRadius = alpaka::getPtrNative(buf.innerRadius_buf); - bridgeRadius = alpaka::getPtrNative(buf.bridgeRadius_buf); - outerRadius = alpaka::getPtrNative(buf.outerRadius_buf); - pt = alpaka::getPtrNative(buf.pt_buf); - eta = alpaka::getPtrNative(buf.eta_buf); - phi = alpaka::getPtrNative(buf.phi_buf); - score_rphisum = alpaka::getPtrNative(buf.score_rphisum_buf); - layer = alpaka::getPtrNative(buf.layer_buf); - isDup = alpaka::getPtrNative(buf.isDup_buf); - TightCutFlag = alpaka::getPtrNative(buf.TightCutFlag_buf); - partOfPT5 = alpaka::getPtrNative(buf.partOfPT5_buf); - 
regressionRadius = alpaka::getPtrNative(buf.regressionRadius_buf); - regressionG = alpaka::getPtrNative(buf.regressionG_buf); - regressionF = alpaka::getPtrNative(buf.regressionF_buf); - logicalLayers = alpaka::getPtrNative(buf.logicalLayers_buf); - hitIndices = alpaka::getPtrNative(buf.hitIndices_buf); - rzChiSquared = alpaka::getPtrNative(buf.rzChiSquared_buf); - chiSquared = alpaka::getPtrNative(buf.chiSquared_buf); - nonAnchorChiSquared = alpaka::getPtrNative(buf.nonAnchorChiSquared_buf); + tripletIndices = buf.tripletIndices_buf.data(); + lowerModuleIndices = buf.lowerModuleIndices_buf.data(); + nQuintuplets = buf.nQuintuplets_buf.data(); + totOccupancyQuintuplets = buf.totOccupancyQuintuplets_buf.data(); + nMemoryLocations = buf.nMemoryLocations_buf.data(); + innerRadius = buf.innerRadius_buf.data(); + bridgeRadius = buf.bridgeRadius_buf.data(); + outerRadius = buf.outerRadius_buf.data(); + pt = buf.pt_buf.data(); + eta = buf.eta_buf.data(); + phi = buf.phi_buf.data(); + score_rphisum = buf.score_rphisum_buf.data(); + layer = buf.layer_buf.data(); + isDup = buf.isDup_buf.data(); + TightCutFlag = buf.TightCutFlag_buf.data(); + partOfPT5 = buf.partOfPT5_buf.data(); + regressionRadius = buf.regressionRadius_buf.data(); + regressionG = buf.regressionG_buf.data(); + regressionF = buf.regressionF_buf.data(); + logicalLayers = buf.logicalLayers_buf.data(); + hitIndices = buf.hitIndices_buf.data(); + rzChiSquared = buf.rzChiSquared_buf.data(); + chiSquared = buf.chiSquared_buf.data(); + nonAnchorChiSquared = buf.nonAnchorChiSquared_buf.data(); } }; @@ -136,7 +136,6 @@ namespace lst { alpaka::memset(queue, isDup_buf, 0u); alpaka::memset(queue, TightCutFlag_buf, false); alpaka::memset(queue, partOfPT5_buf, false); - alpaka::wait(queue); } inline Quintuplets const* data() const { return &data_; } @@ -148,10 +147,10 @@ namespace lst { float secondMin, float secondMax) { return ((firstMin <= secondMin) && (secondMin < firstMax)) || ((secondMin < firstMin) && (firstMin < 
secondMax)); - }; + } - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addQuintupletToMemory(lst::Triplets const& tripletsInGPU, - lst::Quintuplets& quintupletsInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addQuintupletToMemory(Triplets const& tripletsInGPU, + Quintuplets& quintupletsInGPU, unsigned int innerTripletIndex, unsigned int outerTripletIndex, uint16_t lowerModule1, @@ -230,10 +229,10 @@ namespace lst { quintupletsInGPU.rzChiSquared[quintupletIndex] = rzChiSquared; quintupletsInGPU.chiSquared[quintupletIndex] = rPhiChiSquared; quintupletsInGPU.nonAnchorChiSquared[quintupletIndex] = nonAnchorChiSquared; - }; + } //90% constraint - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passChiSquaredConstraint(lst::Modules const& modulesInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passChiSquaredConstraint(Modules const& modulesInGPU, uint16_t lowerModuleIndex1, uint16_t lowerModuleIndex2, uint16_t lowerModuleIndex3, @@ -313,13 +312,13 @@ namespace lst { } return true; - }; + } //bounds can be found at http://uaf-10.t2.ucsd.edu/~bsathian/SDL/T5_RZFix/t5_rz_thresholds.txt template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passT5RZConstraint(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, + Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, unsigned int firstMDIndex, unsigned int secondMDIndex, unsigned int thirdMDIndex, @@ -529,7 +528,7 @@ namespace lst { continue; } - // calculation is copied from PixelTriplet.cc lst::computePT3RZChiSquared + // calculation is copied from PixelTriplet.cc computePT3RZChiSquared float diffr = 0, diffz = 0; float rou = a / p; @@ -587,14 +586,14 @@ namespace lst { subdets = modulesInGPU.subdets[lowerModuleIndex3]; } if (i == 2 || i == 3) { - residual = (layeri <= 6 && ((side == lst::Center) or (drdz < 1))) ? diffz : diffr; + residual = (layeri <= 6 && ((side == ::lst::Center) or (drdz < 1))) ? 
diffz : diffr; float projection_missing2 = 1.f; if (drdz < 1) - projection_missing2 = ((subdets == lst::Endcap) or (side == lst::Center)) + projection_missing2 = ((subdets == ::lst::Endcap) or (side == ::lst::Center)) ? 1.f : 1.f / (1 + drdz * drdz); // cos(atan(drdz)), if dr/dz<1 if (drdz > 1) - projection_missing2 = ((subdets == lst::Endcap) or (side == lst::Center)) + projection_missing2 = ((subdets == ::lst::Endcap) or (side == ::lst::Center)) ? 1.f : (drdz * drdz) / (1 + drdz * drdz); //sin(atan(drdz)), if dr/dz>1 error2 = error2 * projection_missing2; @@ -749,11 +748,11 @@ namespace lst { } } return true; - }; + } template - ALPAKA_FN_ACC ALPAKA_FN_INLINE bool T5HasCommonMiniDoublet(lst::Triplets const& tripletsInGPU, - lst::Segments const& segmentsInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE bool T5HasCommonMiniDoublet(Triplets const& tripletsInGPU, + Segments const& segmentsInGPU, unsigned int innerTripletIndex, unsigned int outerTripletIndex) { unsigned int innerOuterSegmentIndex = tripletsInGPU.segmentIndices[2 * innerTripletIndex + 1]; @@ -764,7 +763,7 @@ namespace lst { segmentsInGPU.mdIndices[2 * outerInnerSegmentIndex]; //outer triplet inner segment inner MD index return (innerOuterOuterMiniDoubletIndex == outerInnerInnerMiniDoubletIndex); - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE void computeErrorInRadius(TAcc const& acc, @@ -779,7 +778,7 @@ namespace lst { //brute force float candidateRadius; float g, f; - minimumRadius = lst::lst_INF; + minimumRadius = lst_INF; maximumRadius = 0.f; for (size_t i = 0; i < 3; i++) { float x1 = x1Vec[i]; @@ -796,7 +795,7 @@ namespace lst { } } } - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool matchRadiiBBBEE12378(TAcc const& acc, @@ -820,7 +819,7 @@ namespace lst { innerInvRadiusMax, alpaka::math::min(acc, bridgeInvRadiusMin, 1.0f / bridgeRadiusMax2S), alpaka::math::max(acc, bridgeInvRadiusMax, 1.0f / bridgeRadiusMin2S)); - }; + } /*bounds for high Pt taken from : 
http://uaf-10.t2.ucsd.edu/~bsathian/SDL/T5_efficiency/efficiencies/new_efficiencies/efficiencies_20210513_T5_recovering_high_Pt_efficiencies/highE_radius_matching/highE_bounds.txt */ template @@ -845,7 +844,7 @@ namespace lst { bridgeInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - bridgeInvRadiusErrorBound) / bridgeRadius); return checkIntervalOverlap(innerInvRadiusMin, innerInvRadiusMax, bridgeInvRadiusMin, bridgeInvRadiusMax); - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool matchRadiiBBBBE(TAcc const& acc, @@ -869,7 +868,7 @@ namespace lst { bridgeInvRadiusMin = alpaka::math::max(acc, 0.f, (1.f - bridgeInvRadiusErrorBound) / bridgeRadius); return checkIntervalOverlap(innerInvRadiusMin, innerInvRadiusMax, bridgeInvRadiusMin, bridgeInvRadiusMax); - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool matchRadiiBBBEE23478(TAcc const& acc, @@ -893,7 +892,7 @@ namespace lst { innerInvRadiusMax, alpaka::math::min(acc, bridgeInvRadiusMin, 1.0f / bridgeRadiusMax2S), alpaka::math::max(acc, bridgeInvRadiusMax, 1.0f / bridgeRadiusMin2S)); - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool matchRadiiBBBEE34578(TAcc const& acc, @@ -917,7 +916,7 @@ namespace lst { innerInvRadiusMax, alpaka::math::min(acc, bridgeInvRadiusMin, 1.0f / bridgeRadiusMax2S), alpaka::math::max(acc, bridgeInvRadiusMax, 1.0f / bridgeRadiusMin2S)); - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool matchRadiiBBEEE(TAcc const& acc, @@ -947,7 +946,7 @@ namespace lst { innerInvRadiusMax, alpaka::math::min(acc, bridgeInvRadiusMin, 1.0f / bridgeRadiusMax2S), alpaka::math::max(acc, bridgeInvRadiusMax, 1.0f / bridgeRadiusMin2S)); - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool matchRadiiBEEEE(TAcc const& acc, @@ -978,7 +977,7 @@ namespace lst { alpaka::math::max(acc, innerInvRadiusMax, 1.0 / innerRadiusMin2S), alpaka::math::min(acc, bridgeInvRadiusMin, 1.0 / bridgeRadiusMax2S), alpaka::math::max(acc, bridgeInvRadiusMax, 1.0 / bridgeRadiusMin2S)); - }; + } template ALPAKA_FN_ACC 
ALPAKA_FN_INLINE bool matchRadiiEEEEE(TAcc const& acc, @@ -1009,11 +1008,11 @@ namespace lst { alpaka::math::max(acc, innerInvRadiusMax, 1.0 / innerRadiusMin2S), alpaka::math::min(acc, bridgeInvRadiusMin, 1.0 / bridgeRadiusMax2S), alpaka::math::max(acc, bridgeInvRadiusMax, 1.0 / bridgeRadiusMin2S)); - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE void computeSigmasForRegression(TAcc const& acc, - lst::Modules const& modulesInGPU, + Modules const& modulesInGPU, const uint16_t* lowerModuleIndices, float* delta1, float* delta2, @@ -1030,7 +1029,7 @@ namespace lst { modules. */ - ModuleType moduleType; + ::lst::ModuleType moduleType; short moduleSubdet, moduleSide; float inv1 = kWidthPS / kWidth2S; float inv2 = kPixelPSZpitch / kWidth2S; @@ -1042,21 +1041,21 @@ namespace lst { const float& drdz = modulesInGPU.drdzs[lowerModuleIndices[i]]; slopes[i] = modulesInGPU.dxdys[lowerModuleIndices[i]]; //category 1 - barrel PS flat - if (moduleSubdet == Barrel and moduleType == PS and moduleSide == Center) { + if (moduleSubdet == ::lst::Barrel and moduleType == ::lst::PS and moduleSide == ::lst::Center) { delta1[i] = inv1; delta2[i] = inv1; slopes[i] = -999.f; isFlat[i] = true; } //category 2 - barrel 2S - else if (moduleSubdet == Barrel and moduleType == TwoS) { + else if (moduleSubdet == ::lst::Barrel and moduleType == ::lst::TwoS) { delta1[i] = 1.f; delta2[i] = 1.f; slopes[i] = -999.f; isFlat[i] = true; } //category 3 - barrel PS tilted - else if (moduleSubdet == Barrel and moduleType == PS and moduleSide != Center) { + else if (moduleSubdet == ::lst::Barrel and moduleType == ::lst::PS and moduleSide != ::lst::Center) { delta1[i] = inv1; isFlat[i] = false; @@ -1067,7 +1066,7 @@ namespace lst { } } //category 4 - endcap PS - else if (moduleSubdet == Endcap and moduleType == PS) { + else if (moduleSubdet == ::lst::Endcap and moduleType == ::lst::PS) { delta1[i] = inv1; isFlat[i] = false; @@ -1083,7 +1082,7 @@ namespace lst { } } //category 5 - endcap 2S - else if (moduleSubdet 
== Endcap and moduleType == TwoS) { + else if (moduleSubdet == ::lst::Endcap and moduleType == ::lst::TwoS) { delta1[i] = 1.f; delta2[i] = 500.f * inv1; isFlat[i] = false; @@ -1096,7 +1095,7 @@ namespace lst { #endif } } - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE float computeRadiusUsingRegression(TAcc const& acc, @@ -1131,8 +1130,8 @@ namespace lst { // Computing sigmas is a very tricky affair // if the module is tilted or endcap, we need to use the slopes properly! - absArctanSlope = ((slopes[i] != lst::lst_INF) ? alpaka::math::abs(acc, alpaka::math::atan(acc, slopes[i])) - : 0.5f * float(M_PI)); + absArctanSlope = + ((slopes[i] != lst_INF) ? alpaka::math::abs(acc, alpaka::math::atan(acc, slopes[i])) : 0.5f * float(M_PI)); if (xs[i] > 0 and ys[i] > 0) { angleM = 0.5f * float(M_PI) - absArctanSlope; @@ -1194,7 +1193,7 @@ namespace lst { (xs[i] * xs[i] + ys[i] * ys[i] - twoG * xs[i] - twoF * ys[i] + c) / sigmas2[i]; } return radius; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE float computeChiSquared(TAcc const& acc, @@ -1214,8 +1213,8 @@ namespace lst { float chiSquared = 0.f; float absArctanSlope, angleM, xPrime, yPrime, sigma2; for (size_t i = 0; i < nPoints; i++) { - absArctanSlope = ((slopes[i] != lst::lst_INF) ? alpaka::math::abs(acc, alpaka::math::atan(acc, slopes[i])) - : 0.5f * float(M_PI)); + absArctanSlope = + ((slopes[i] != lst_INF) ? 
alpaka::math::abs(acc, alpaka::math::atan(acc, slopes[i])) : 0.5f * float(M_PI)); if (xs[i] > 0 and ys[i] > 0) { angleM = 0.5f * float(M_PI) - absArctanSlope; } else if (xs[i] < 0 and ys[i] > 0) { @@ -1240,7 +1239,7 @@ namespace lst { (xs[i] * xs[i] + ys[i] * ys[i] - 2 * g * xs[i] - 2 * f * ys[i] + c) / sigma2; } return chiSquared; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE void runDeltaBetaIterationsT5(TAcc const& acc, @@ -1256,33 +1255,30 @@ namespace lst { betaOut += alpaka::math::copysign( acc, alpaka::math::asin( - acc, - alpaka::math::min(acc, sdOut_dr * lst::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), lst::kSinAlphaMax)), + acc, alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax)), betaOut); return; } if (betaIn * betaOut > 0.f and - (alpaka::math::abs(acc, pt_beta) < 4.f * lst::kPt_betaMax or + (alpaka::math::abs(acc, pt_beta) < 4.f * kPt_betaMax or (lIn >= 11 and alpaka::math::abs(acc, pt_beta) < - 8.f * lst::kPt_betaMax))) //and the pt_beta is well-defined; less strict for endcap-endcap + 8.f * kPt_betaMax))) //and the pt_beta is well-defined; less strict for endcap-endcap { const float betaInUpd = - betaIn + alpaka::math::copysign( - acc, - alpaka::math::asin( - acc, - alpaka::math::min( - acc, sdIn_dr * lst::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), lst::kSinAlphaMax)), - betaIn); //FIXME: need a faster version + betaIn + + alpaka::math::copysign( + acc, + alpaka::math::asin( + acc, alpaka::math::min(acc, sdIn_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax)), + betaIn); //FIXME: need a faster version const float betaOutUpd = - betaOut + alpaka::math::copysign( - acc, - alpaka::math::asin( - acc, - alpaka::math::min( - acc, sdOut_dr * lst::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), lst::kSinAlphaMax)), - betaOut); //FIXME: need a faster version + betaOut + + alpaka::math::copysign( + acc, + alpaka::math::asin( + acc, alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf / 
alpaka::math::abs(acc, pt_beta), kSinAlphaMax)), + betaOut); //FIXME: need a faster version betaAv = 0.5f * (betaInUpd + betaOutUpd); //1st update @@ -1291,68 +1287,64 @@ namespace lst { betaIn += alpaka::math::copysign( acc, - alpaka::math::asin(acc, alpaka::math::min(acc, sdIn_dr * lst::k2Rinv1GeVf * pt_beta_inv, lst::kSinAlphaMax)), + alpaka::math::asin(acc, alpaka::math::min(acc, sdIn_dr * k2Rinv1GeVf * pt_beta_inv, kSinAlphaMax)), betaIn); //FIXME: need a faster version betaOut += alpaka::math::copysign( acc, - alpaka::math::asin(acc, alpaka::math::min(acc, sdOut_dr * lst::k2Rinv1GeVf * pt_beta_inv, lst::kSinAlphaMax)), + alpaka::math::asin(acc, alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf * pt_beta_inv, kSinAlphaMax)), betaOut); //FIXME: need a faster version //update the av and pt betaAv = 0.5f * (betaIn + betaOut); //2nd update - pt_beta = dr * lst::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate + pt_beta = dr * k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate } else if (lIn < 11 && alpaka::math::abs(acc, betaOut) < 0.2f * alpaka::math::abs(acc, betaIn) && - alpaka::math::abs(acc, pt_beta) < 12.f * lst::kPt_betaMax) //use betaIn sign as ref + alpaka::math::abs(acc, pt_beta) < 12.f * kPt_betaMax) //use betaIn sign as ref { const float pt_betaIn = dr * k2Rinv1GeVf / alpaka::math::sin(acc, betaIn); const float betaInUpd = - betaIn + alpaka::math::copysign( - acc, - alpaka::math::asin( - acc, - alpaka::math::min( - acc, sdIn_dr * lst::k2Rinv1GeVf / alpaka::math::abs(acc, pt_betaIn), lst::kSinAlphaMax)), - betaIn); //FIXME: need a faster version + betaIn + + alpaka::math::copysign( + acc, + alpaka::math::asin( + acc, alpaka::math::min(acc, sdIn_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_betaIn), kSinAlphaMax)), + betaIn); //FIXME: need a faster version const float betaOutUpd = betaOut + alpaka::math::copysign( acc, alpaka::math::asin( acc, - alpaka::math::min( - acc, sdOut_dr * lst::k2Rinv1GeVf / 
alpaka::math::abs(acc, pt_betaIn), lst::kSinAlphaMax)), + alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_betaIn), kSinAlphaMax)), betaIn); //FIXME: need a faster version betaAv = (alpaka::math::abs(acc, betaOut) > 0.2f * alpaka::math::abs(acc, betaIn)) ? (0.5f * (betaInUpd + betaOutUpd)) : betaInUpd; //1st update - pt_beta = dr * lst::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate + pt_beta = dr * k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate betaIn += alpaka::math::copysign( acc, alpaka::math::asin( - acc, - alpaka::math::min(acc, sdIn_dr * lst::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), lst::kSinAlphaMax)), + acc, alpaka::math::min(acc, sdIn_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax)), betaIn); //FIXME: need a faster version betaOut += alpaka::math::copysign( acc, alpaka::math::asin( - acc, - alpaka::math::min(acc, sdOut_dr * lst::k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), lst::kSinAlphaMax)), + acc, alpaka::math::min(acc, sdOut_dr * k2Rinv1GeVf / alpaka::math::abs(acc, pt_beta), kSinAlphaMax)), betaIn); //FIXME: need a faster version //update the av and pt betaAv = 0.5f * (betaIn + betaOut); //2nd update - pt_beta = dr * lst::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate + pt_beta = dr * k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); //get a better pt estimate } - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBBB(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, + Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, uint16_t innerInnerLowerModuleIndex, uint16_t innerOuterLowerModuleIndex, uint16_t outerInnerLowerModuleIndex, @@ -1363,8 +1355,8 @@ namespace lst { unsigned int secondMDIndex, unsigned int thirdMDIndex, unsigned int fourthMDIndex) { - bool isPS_InLo = 
(modulesInGPU.moduleType[innerInnerLowerModuleIndex] == lst::PS); - bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == lst::PS); + bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == ::lst::PS); + bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == ::lst::PS); float rt_InLo = mdsInGPU.anchorRt[firstMDIndex]; float rt_InOut = mdsInGPU.anchorRt[secondMDIndex]; @@ -1375,17 +1367,17 @@ namespace lst { float z_OutLo = mdsInGPU.anchorZ[thirdMDIndex]; float alpha1GeV_OutLo = - alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * lst::k2Rinv1GeVf / lst::ptCut, lst::kSinAlphaMax)); + alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / ptCut, kSinAlphaMax)); float rtRatio_OutLoInLo = rt_OutLo / rt_InLo; // Outer segment beginning rt divided by inner segment beginning rt; float dzDrtScale = alpaka::math::tan(acc, alpha1GeV_OutLo) / alpha1GeV_OutLo; // The track can bend in r-z plane slightly - float zpitch_InLo = (isPS_InLo ? lst::kPixelPSZpitch : lst::kStrip2SZpitch); - float zpitch_OutLo = (isPS_OutLo ? lst::kPixelPSZpitch : lst::kStrip2SZpitch); + float zpitch_InLo = (isPS_InLo ? kPixelPSZpitch : kStrip2SZpitch); + float zpitch_OutLo = (isPS_OutLo ? kPixelPSZpitch : kStrip2SZpitch); - float zHi = z_InLo + (z_InLo + lst::kDeltaZLum) * (rtRatio_OutLoInLo - 1.f) * (z_InLo < 0.f ? 1.f : dzDrtScale) + + float zHi = z_InLo + (z_InLo + kDeltaZLum) * (rtRatio_OutLoInLo - 1.f) * (z_InLo < 0.f ? 1.f : dzDrtScale) + (zpitch_InLo + zpitch_OutLo); - float zLo = z_InLo + (z_InLo - lst::kDeltaZLum) * (rtRatio_OutLoInLo - 1.f) * (z_InLo > 0.f ? 1.f : dzDrtScale) - + float zLo = z_InLo + (z_InLo - kDeltaZLum) * (rtRatio_OutLoInLo - 1.f) * (z_InLo > 0.f ? 
1.f : dzDrtScale) - (zpitch_InLo + zpitch_OutLo); //Cut 1 - z compatibility @@ -1403,7 +1395,7 @@ namespace lst { float dzErr = (zpitch_InLo + zpitch_OutLo) * (zpitch_InLo + zpitch_OutLo) * 2.f; float thetaMuls2 = (kMulsInGeV * kMulsInGeV) * (0.1f + 0.2f * (rt_OutLo - rt_InLo) / 50.f) * (r3_InLo / rt_InLo); - float muls2 = thetaMuls2 * 9.f / (lst::ptCut * lst::ptCut) * 16.f; + float muls2 = thetaMuls2 * 9.f / (ptCut * ptCut) * 16.f; dzErr += muls2 * drt_OutLo_InLo * drt_OutLo_InLo / 3.f * coshEta * coshEta; dzErr = alpaka::math::sqrt(acc, dzErr); @@ -1411,7 +1403,7 @@ namespace lst { const float dzMean = dz_InSeg / drt_InSeg * drt_OutLo_InLo; const float zWindow = dzErr / drt_InSeg * drt_OutLo_InLo + - (zpitch_InLo + zpitch_OutLo); //FIXME for lst::ptCut lower than ~0.8 need to add curv path correction + (zpitch_InLo + zpitch_OutLo); //FIXME for ptCut lower than ~0.8 need to add curv path correction float zLoPointed = z_InLo + dzMean * (z_InLo > 0.f ? 1.f : dzDrtScale) - zWindow; float zHiPointed = z_InLo + dzMean * (z_InLo < 0.f ? 
1.f : dzDrtScale) + zWindow; @@ -1422,7 +1414,7 @@ namespace lst { float pvOffset = 0.1f / rt_OutLo; float dPhiCut = alpha1GeV_OutLo + alpaka::math::sqrt(acc, muls2 + pvOffset * pvOffset); - float deltaPhiPos = lst::phi_mpi_pi(acc, mdsInGPU.anchorPhi[fourthMDIndex] - mdsInGPU.anchorPhi[secondMDIndex]); + float deltaPhiPos = phi_mpi_pi(acc, mdsInGPU.anchorPhi[fourthMDIndex] - mdsInGPU.anchorPhi[secondMDIndex]); // Cut #3: FIXME:deltaPhiPos can be tighter if (alpaka::math::abs(acc, deltaPhiPos) > dPhiCut) return false; @@ -1432,7 +1424,7 @@ namespace lst { float diffX = mdsInGPU.anchorX[thirdMDIndex] - mdsInGPU.anchorX[firstMDIndex]; float diffY = mdsInGPU.anchorY[thirdMDIndex] - mdsInGPU.anchorY[firstMDIndex]; - float dPhi = lst::deltaPhi(acc, midPointX, midPointY, diffX, diffY); + float dPhi = deltaPhi(acc, midPointX, midPointY, diffX, diffY); // Cut #4: deltaPhiChange if (alpaka::math::abs(acc, dPhi) > dPhiCut) @@ -1443,16 +1435,16 @@ namespace lst { float alpha_InLo = __H2F(segmentsInGPU.dPhiChanges[innerSegmentIndex]); float alpha_OutLo = __H2F(segmentsInGPU.dPhiChanges[outerSegmentIndex]); - bool isEC_lastLayer = modulesInGPU.subdets[outerOuterLowerModuleIndex] == lst::Endcap and - modulesInGPU.moduleType[outerOuterLowerModuleIndex] == lst::TwoS; + bool isEC_lastLayer = modulesInGPU.subdets[outerOuterLowerModuleIndex] == ::lst::Endcap and + modulesInGPU.moduleType[outerOuterLowerModuleIndex] == ::lst::TwoS; float alpha_OutUp, alpha_OutUp_highEdge, alpha_OutUp_lowEdge; - alpha_OutUp = lst::phi_mpi_pi(acc, - lst::phi(acc, - mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex], - mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]) - - mdsInGPU.anchorPhi[fourthMDIndex]); + alpha_OutUp = phi_mpi_pi(acc, + phi(acc, + mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex], + mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]) - + mdsInGPU.anchorPhi[fourthMDIndex]); alpha_OutUp_highEdge = alpha_OutUp; 
alpha_OutUp_lowEdge = alpha_OutUp; @@ -1464,42 +1456,38 @@ namespace lst { float tl_axis_lowEdge_x = tl_axis_x; float tl_axis_lowEdge_y = tl_axis_y; - float betaIn = - alpha_InLo - lst::phi_mpi_pi(acc, lst::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]); + float betaIn = alpha_InLo - phi_mpi_pi(acc, phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]); float betaInRHmin = betaIn; float betaInRHmax = betaIn; - float betaOut = - -alpha_OutUp + lst::phi_mpi_pi(acc, lst::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[fourthMDIndex]); + float betaOut = -alpha_OutUp + phi_mpi_pi(acc, phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[fourthMDIndex]); float betaOutRHmin = betaOut; float betaOutRHmax = betaOut; if (isEC_lastLayer) { - alpha_OutUp_highEdge = - lst::phi_mpi_pi(acc, - lst::phi(acc, - mdsInGPU.anchorHighEdgeX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex], - mdsInGPU.anchorHighEdgeY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]) - - mdsInGPU.anchorHighEdgePhi[fourthMDIndex]); - alpha_OutUp_lowEdge = - lst::phi_mpi_pi(acc, - lst::phi(acc, - mdsInGPU.anchorLowEdgeX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex], - mdsInGPU.anchorLowEdgeY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]) - - mdsInGPU.anchorLowEdgePhi[fourthMDIndex]); + alpha_OutUp_highEdge = phi_mpi_pi(acc, + phi(acc, + mdsInGPU.anchorHighEdgeX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex], + mdsInGPU.anchorHighEdgeY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]) - + mdsInGPU.anchorHighEdgePhi[fourthMDIndex]); + alpha_OutUp_lowEdge = phi_mpi_pi(acc, + phi(acc, + mdsInGPU.anchorLowEdgeX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex], + mdsInGPU.anchorLowEdgeY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]) - + mdsInGPU.anchorLowEdgePhi[fourthMDIndex]); tl_axis_highEdge_x = mdsInGPU.anchorHighEdgeX[fourthMDIndex] - mdsInGPU.anchorX[firstMDIndex]; tl_axis_highEdge_y = mdsInGPU.anchorHighEdgeY[fourthMDIndex] - mdsInGPU.anchorY[firstMDIndex]; 
tl_axis_lowEdge_x = mdsInGPU.anchorLowEdgeX[fourthMDIndex] - mdsInGPU.anchorX[firstMDIndex]; tl_axis_lowEdge_y = mdsInGPU.anchorLowEdgeY[fourthMDIndex] - mdsInGPU.anchorY[firstMDIndex]; - betaOutRHmin = -alpha_OutUp_highEdge + lst::phi_mpi_pi(acc, - lst::phi(acc, tl_axis_highEdge_x, tl_axis_highEdge_y) - - mdsInGPU.anchorHighEdgePhi[fourthMDIndex]); - betaOutRHmax = -alpha_OutUp_lowEdge + lst::phi_mpi_pi(acc, - lst::phi(acc, tl_axis_lowEdge_x, tl_axis_lowEdge_y) - - mdsInGPU.anchorLowEdgePhi[fourthMDIndex]); + betaOutRHmin = + -alpha_OutUp_highEdge + + phi_mpi_pi(acc, phi(acc, tl_axis_highEdge_x, tl_axis_highEdge_y) - mdsInGPU.anchorHighEdgePhi[fourthMDIndex]); + betaOutRHmax = + -alpha_OutUp_lowEdge + + phi_mpi_pi(acc, phi(acc, tl_axis_lowEdge_x, tl_axis_lowEdge_y) - mdsInGPU.anchorLowEdgePhi[fourthMDIndex]); } //beta computation @@ -1515,9 +1503,7 @@ namespace lst { (mdsInGPU.anchorY[secondMDIndex] - mdsInGPU.anchorY[firstMDIndex])); float betaInCut = alpaka::math::asin( - acc, - alpaka::math::min( - acc, (-rt_InSeg * corrF + drt_tl_axis) * lst::k2Rinv1GeVf / lst::ptCut, lst::kSinAlphaMax)) + + acc, alpaka::math::min(acc, (-rt_InSeg * corrF + drt_tl_axis) * k2Rinv1GeVf / ptCut, kSinAlphaMax)) + (0.02f / drt_InSeg); //Cut #5: first beta cut @@ -1525,7 +1511,7 @@ namespace lst { return false; float betaAv = 0.5f * (betaIn + betaOut); - float pt_beta = drt_tl_axis * lst::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); + float pt_beta = drt_tl_axis * k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); int lIn = 5; int lOut = isEC_lastLayer ? 
11 : 5; float sdOut_dr = alpaka::math::sqrt(acc, @@ -1535,7 +1521,7 @@ namespace lst { (mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex])); float sdOut_d = mdsInGPU.anchorRt[fourthMDIndex] - mdsInGPU.anchorRt[thirdMDIndex]; - lst::runDeltaBetaIterationsT5(acc, betaIn, betaOut, betaAv, pt_beta, rt_InSeg, sdOut_dr, drt_tl_axis, lIn); + runDeltaBetaIterationsT5(acc, betaIn, betaOut, betaAv, pt_beta, rt_InSeg, sdOut_dr, drt_tl_axis, lIn); const float betaInMMSF = (alpaka::math::abs(acc, betaInRHmin + betaInRHmax) > 0) ? (2.f * betaIn / alpaka::math::abs(acc, betaInRHmin + betaInRHmax)) @@ -1549,19 +1535,19 @@ namespace lst { betaOutRHmax *= betaOutMMSF; float min_ptBeta_maxPtBeta = alpaka::math::min( - acc, alpaka::math::abs(acc, pt_beta), lst::kPt_betaMax); //need to confimm the range-out value of 7 GeV + acc, alpaka::math::abs(acc, pt_beta), kPt_betaMax); //need to confimm the range-out value of 7 GeV const float dBetaMuls2 = thetaMuls2 * 16.f / (min_ptBeta_maxPtBeta * min_ptBeta_maxPtBeta); - const float alphaInAbsReg = alpaka::math::max( - acc, - alpaka::math::abs(acc, alpha_InLo), - alpaka::math::asin(acc, alpaka::math::min(acc, rt_InLo * lst::k2Rinv1GeVf / 3.0f, lst::kSinAlphaMax))); - const float alphaOutAbsReg = alpaka::math::max( - acc, - alpaka::math::abs(acc, alpha_OutLo), - alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * lst::k2Rinv1GeVf / 3.0f, lst::kSinAlphaMax))); - const float dBetaInLum = lIn < 11 ? 0.0f : alpaka::math::abs(acc, alphaInAbsReg * lst::kDeltaZLum / z_InLo); - const float dBetaOutLum = lOut < 11 ? 
0.0f : alpaka::math::abs(acc, alphaOutAbsReg * lst::kDeltaZLum / z_OutLo); + const float alphaInAbsReg = + alpaka::math::max(acc, + alpaka::math::abs(acc, alpha_InLo), + alpaka::math::asin(acc, alpaka::math::min(acc, rt_InLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax))); + const float alphaOutAbsReg = + alpaka::math::max(acc, + alpaka::math::abs(acc, alpha_OutLo), + alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax))); + const float dBetaInLum = lIn < 11 ? 0.0f : alpaka::math::abs(acc, alphaInAbsReg * kDeltaZLum / z_InLo); + const float dBetaOutLum = lOut < 11 ? 0.0f : alpaka::math::abs(acc, alphaOutAbsReg * kDeltaZLum / z_OutLo); const float dBetaLum2 = (dBetaInLum + dBetaOutLum) * (dBetaInLum + dBetaOutLum); const float sinDPhi = alpaka::math::sin(acc, dPhi); @@ -1581,8 +1567,7 @@ namespace lst { const float dBetaROut2 = dBetaROut * dBetaROut; float betaOutCut = - alpaka::math::asin(acc, - alpaka::math::min(acc, drt_tl_axis * lst::k2Rinv1GeVf / lst::ptCut, lst::kSinAlphaMax)) + + alpaka::math::asin(acc, alpaka::math::min(acc, drt_tl_axis * k2Rinv1GeVf / ptCut, kSinAlphaMax)) + (0.02f / sdOut_d) + alpaka::math::sqrt(acc, dBetaLum2 + dBetaMuls2); //Cut #6: The real beta cut @@ -1598,13 +1583,13 @@ namespace lst { float dBeta = betaIn - betaOut; return dBeta * dBeta <= dBetaCut2; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoBBEE(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, + Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, uint16_t innerInnerLowerModuleIndex, uint16_t innerOuterLowerModuleIndex, uint16_t outerInnerLowerModuleIndex, @@ -1615,8 +1600,8 @@ namespace lst { unsigned int secondMDIndex, unsigned int thirdMDIndex, unsigned int fourthMDIndex) { - bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == lst::PS); - bool isPS_OutLo = 
(modulesInGPU.moduleType[outerInnerLowerModuleIndex] == lst::PS); + bool isPS_InLo = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == ::lst::PS); + bool isPS_OutLo = (modulesInGPU.moduleType[outerInnerLowerModuleIndex] == ::lst::PS); float rt_InLo = mdsInGPU.anchorRt[firstMDIndex]; float rt_InOut = mdsInGPU.anchorRt[secondMDIndex]; @@ -1627,21 +1612,21 @@ namespace lst { float z_OutLo = mdsInGPU.anchorZ[thirdMDIndex]; float alpha1GeV_OutLo = - alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * lst::k2Rinv1GeVf / lst::ptCut, lst::kSinAlphaMax)); + alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / ptCut, kSinAlphaMax)); float dzDrtScale = alpaka::math::tan(acc, alpha1GeV_OutLo) / alpha1GeV_OutLo; // The track can bend in r-z plane slightly - float zpitch_InLo = (isPS_InLo ? lst::kPixelPSZpitch : lst::kStrip2SZpitch); - float zpitch_OutLo = (isPS_OutLo ? lst::kPixelPSZpitch : lst::kStrip2SZpitch); + float zpitch_InLo = (isPS_InLo ? kPixelPSZpitch : kStrip2SZpitch); + float zpitch_OutLo = (isPS_OutLo ? kPixelPSZpitch : kStrip2SZpitch); float zGeom = zpitch_InLo + zpitch_OutLo; // Cut #0: Preliminary (Only here in endcap case) if (z_InLo * z_OutLo <= 0) return false; - float dLum = alpaka::math::copysign(acc, lst::kDeltaZLum, z_InLo); - bool isOutSgInnerMDPS = modulesInGPU.moduleType[outerInnerLowerModuleIndex] == lst::PS; - float rtGeom1 = isOutSgInnerMDPS ? lst::kPixelPSZpitch : lst::kStrip2SZpitch; + float dLum = alpaka::math::copysign(acc, kDeltaZLum, z_InLo); + bool isOutSgInnerMDPS = modulesInGPU.moduleType[outerInnerLowerModuleIndex] == ::lst::PS; + float rtGeom1 = isOutSgInnerMDPS ? 
kPixelPSZpitch : kStrip2SZpitch; float zGeom1 = alpaka::math::copysign(acc, zGeom, z_InLo); float rtLo = rt_InLo * (1.f + (z_OutLo - z_InLo - zGeom1) / (z_InLo + zGeom1 + dLum) / dzDrtScale) - rtGeom1; //slope correction only on the lower end @@ -1670,12 +1655,12 @@ namespace lst { const float coshEta = dr3SDIn / drtSDIn; //direction estimate const float dzOutInAbs = alpaka::math::abs(acc, z_OutLo - z_InLo); const float multDzDr = dzOutInAbs * coshEta / (coshEta * coshEta - 1.f); - const float zGeom1_another = lst::kPixelPSZpitch; + const float zGeom1_another = kPixelPSZpitch; float kZ = (z_OutLo - z_InLo) / dzSDIn; float drtErr = zGeom1_another * zGeom1_another * drtSDIn * drtSDIn / dzSDIn / dzSDIn * (1.f - 2.f * kZ + 2.f * kZ * kZ); const float thetaMuls2 = (kMulsInGeV * kMulsInGeV) * (0.1f + 0.2f * (rt_OutLo - rt_InLo) / 50.f) * (rIn / rt_InLo); - const float muls2 = thetaMuls2 * 9.f / (lst::ptCut * lst::ptCut) * 16.f; + const float muls2 = thetaMuls2 * 9.f / (ptCut * ptCut) * 16.f; drtErr += muls2 * multDzDr * multDzDr / 3.f * coshEta * coshEta; drtErr = alpaka::math::sqrt(acc, drtErr); @@ -1686,7 +1671,7 @@ namespace lst { const float pvOffset = 0.1f / rt_OutLo; float dPhiCut = alpha1GeV_OutLo + alpaka::math::sqrt(acc, muls2 + pvOffset * pvOffset); - float deltaPhiPos = lst::phi_mpi_pi(acc, mdsInGPU.anchorPhi[fourthMDIndex] - mdsInGPU.anchorPhi[secondMDIndex]); + float deltaPhiPos = phi_mpi_pi(acc, mdsInGPU.anchorPhi[fourthMDIndex] - mdsInGPU.anchorPhi[secondMDIndex]); //Cut #4: deltaPhiPos can be tighter if (alpaka::math::abs(acc, deltaPhiPos) > dPhiCut) @@ -1697,7 +1682,7 @@ namespace lst { float diffX = mdsInGPU.anchorX[thirdMDIndex] - mdsInGPU.anchorX[firstMDIndex]; float diffY = mdsInGPU.anchorY[thirdMDIndex] - mdsInGPU.anchorY[firstMDIndex]; - float dPhi = lst::deltaPhi(acc, midPointX, midPointY, diffX, diffY); + float dPhi = deltaPhi(acc, midPointX, midPointY, diffX, diffY); // Cut #5: deltaPhiChange if (alpaka::math::abs(acc, dPhi) > dPhiCut) return 
false; @@ -1707,33 +1692,32 @@ namespace lst { float sdIn_alpha_max = __H2F(segmentsInGPU.dPhiChangeMaxs[innerSegmentIndex]); float sdOut_alpha = sdIn_alpha; - float sdOut_alphaOut = lst::phi_mpi_pi(acc, - lst::phi(acc, - mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex], - mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]) - - mdsInGPU.anchorPhi[fourthMDIndex]); + float sdOut_alphaOut = phi_mpi_pi(acc, + phi(acc, + mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[thirdMDIndex], + mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex]) - + mdsInGPU.anchorPhi[fourthMDIndex]); - float sdOut_alphaOut_min = lst::phi_mpi_pi( + float sdOut_alphaOut_min = phi_mpi_pi( acc, __H2F(segmentsInGPU.dPhiChangeMins[outerSegmentIndex]) - __H2F(segmentsInGPU.dPhiMins[outerSegmentIndex])); - float sdOut_alphaOut_max = lst::phi_mpi_pi( + float sdOut_alphaOut_max = phi_mpi_pi( acc, __H2F(segmentsInGPU.dPhiChangeMaxs[outerSegmentIndex]) - __H2F(segmentsInGPU.dPhiMaxs[outerSegmentIndex])); float tl_axis_x = mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[firstMDIndex]; float tl_axis_y = mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[firstMDIndex]; - float betaIn = - sdIn_alpha - lst::phi_mpi_pi(acc, lst::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]); + float betaIn = sdIn_alpha - phi_mpi_pi(acc, phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]); float betaInRHmin = betaIn; float betaInRHmax = betaIn; float betaOut = - -sdOut_alphaOut + lst::phi_mpi_pi(acc, lst::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[fourthMDIndex]); + -sdOut_alphaOut + phi_mpi_pi(acc, phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[fourthMDIndex]); float betaOutRHmin = betaOut; float betaOutRHmax = betaOut; - bool isEC_secondLayer = (modulesInGPU.subdets[innerOuterLowerModuleIndex] == lst::Endcap) and - (modulesInGPU.moduleType[innerOuterLowerModuleIndex] == lst::TwoS); + bool isEC_secondLayer = 
(modulesInGPU.subdets[innerOuterLowerModuleIndex] == ::lst::Endcap) and + (modulesInGPU.moduleType[innerOuterLowerModuleIndex] == ::lst::TwoS); if (isEC_secondLayer) { betaInRHmin = betaIn - sdIn_alpha_min + sdIn_alpha; @@ -1766,8 +1750,7 @@ namespace lst { float dr = alpaka::math::sqrt(acc, tl_axis_x * tl_axis_x + tl_axis_y * tl_axis_y); const float corrF = 1.f; float betaInCut = - alpaka::math::asin( - acc, alpaka::math::min(acc, (-sdIn_dr * corrF + dr) * lst::k2Rinv1GeVf / lst::ptCut, lst::kSinAlphaMax)) + + alpaka::math::asin(acc, alpaka::math::min(acc, (-sdIn_dr * corrF + dr) * k2Rinv1GeVf / ptCut, kSinAlphaMax)) + (0.02f / sdIn_d); //Cut #6: first beta cut @@ -1775,7 +1758,7 @@ namespace lst { return false; float betaAv = 0.5f * (betaIn + betaOut); - float pt_beta = dr * lst::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); + float pt_beta = dr * k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); float lIn = 5; float lOut = 11; @@ -1787,7 +1770,7 @@ namespace lst { (mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex])); float sdOut_d = mdsInGPU.anchorRt[fourthMDIndex] - mdsInGPU.anchorRt[thirdMDIndex]; - lst::runDeltaBetaIterationsT5(acc, betaIn, betaOut, betaAv, pt_beta, sdIn_dr, sdOut_dr, dr, lIn); + runDeltaBetaIterationsT5(acc, betaIn, betaOut, betaAv, pt_beta, sdIn_dr, sdOut_dr, dr, lIn); const float betaInMMSF = (alpaka::math::abs(acc, betaInRHmin + betaInRHmax) > 0) ? 
(2.f * betaIn / alpaka::math::abs(acc, betaInRHmin + betaInRHmax)) @@ -1801,25 +1784,25 @@ namespace lst { betaOutRHmax *= betaOutMMSF; float min_ptBeta_maxPtBeta = alpaka::math::min( - acc, alpaka::math::abs(acc, pt_beta), lst::kPt_betaMax); //need to confirm the range-out value of 7 GeV + acc, alpaka::math::abs(acc, pt_beta), kPt_betaMax); //need to confirm the range-out value of 7 GeV const float dBetaMuls2 = thetaMuls2 * 16.f / (min_ptBeta_maxPtBeta * min_ptBeta_maxPtBeta); - const float alphaInAbsReg = alpaka::math::max( - acc, - alpaka::math::abs(acc, sdIn_alpha), - alpaka::math::asin(acc, alpaka::math::min(acc, rt_InLo * lst::k2Rinv1GeVf / 3.0f, lst::kSinAlphaMax))); - const float alphaOutAbsReg = alpaka::math::max( - acc, - alpaka::math::abs(acc, sdOut_alpha), - alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * lst::k2Rinv1GeVf / 3.0f, lst::kSinAlphaMax))); - const float dBetaInLum = lIn < 11 ? 0.0f : alpaka::math::abs(acc, alphaInAbsReg * lst::kDeltaZLum / z_InLo); - const float dBetaOutLum = lOut < 11 ? 0.0f : alpaka::math::abs(acc, alphaOutAbsReg * lst::kDeltaZLum / z_OutLo); + const float alphaInAbsReg = + alpaka::math::max(acc, + alpaka::math::abs(acc, sdIn_alpha), + alpaka::math::asin(acc, alpaka::math::min(acc, rt_InLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax))); + const float alphaOutAbsReg = + alpaka::math::max(acc, + alpaka::math::abs(acc, sdOut_alpha), + alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax))); + const float dBetaInLum = lIn < 11 ? 0.0f : alpaka::math::abs(acc, alphaInAbsReg * kDeltaZLum / z_InLo); + const float dBetaOutLum = lOut < 11 ? 
0.0f : alpaka::math::abs(acc, alphaOutAbsReg * kDeltaZLum / z_OutLo); const float dBetaLum2 = (dBetaInLum + dBetaOutLum) * (dBetaInLum + dBetaOutLum); const float sinDPhi = alpaka::math::sin(acc, dPhi); const float dBetaRIn2 = 0; // TODO-RH float dBetaROut = 0; - if (modulesInGPU.moduleType[outerOuterLowerModuleIndex] == lst::TwoS) { + if (modulesInGPU.moduleType[outerOuterLowerModuleIndex] == ::lst::TwoS) { dBetaROut = (alpaka::math::sqrt(acc, mdsInGPU.anchorHighEdgeX[fourthMDIndex] * mdsInGPU.anchorHighEdgeX[fourthMDIndex] + @@ -1831,9 +1814,8 @@ namespace lst { } const float dBetaROut2 = dBetaROut * dBetaROut; - float betaOutCut = - alpaka::math::asin(acc, alpaka::math::min(acc, dr * lst::k2Rinv1GeVf / lst::ptCut, lst::kSinAlphaMax)) + - (0.02f / sdOut_d) + alpaka::math::sqrt(acc, dBetaLum2 + dBetaMuls2); + float betaOutCut = alpaka::math::asin(acc, alpaka::math::min(acc, dr * k2Rinv1GeVf / ptCut, kSinAlphaMax)) + + (0.02f / sdOut_d) + alpaka::math::sqrt(acc, dBetaLum2 + dBetaMuls2); //Cut #6: The real beta cut if (alpaka::math::abs(acc, betaOut) >= betaOutCut) @@ -1848,13 +1830,13 @@ namespace lst { float dBeta = betaIn - betaOut; //Cut #7: Cut on dBet return dBeta * dBeta <= dBetaCut2; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgoEEEE(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, + Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, uint16_t innerInnerLowerModuleIndex, uint16_t innerOuterLowerModuleIndex, uint16_t outerInnerLowerModuleIndex, @@ -1874,7 +1856,7 @@ namespace lst { float z_OutLo = mdsInGPU.anchorZ[thirdMDIndex]; float alpha1GeV_OutLo = - alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * lst::k2Rinv1GeVf / lst::ptCut, lst::kSinAlphaMax)); + alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / ptCut, kSinAlphaMax)); float dzDrtScale = alpaka::math::tan(acc, 
alpha1GeV_OutLo) / alpha1GeV_OutLo; // The track can bend in r-z plane slightly @@ -1883,13 +1865,13 @@ namespace lst { if ((z_InLo * z_OutLo) <= 0) return false; - float dLum = alpaka::math::copysign(acc, lst::kDeltaZLum, z_InLo); - bool isOutSgInnerMDPS = modulesInGPU.moduleType[outerInnerLowerModuleIndex] == lst::PS; - bool isInSgInnerMDPS = modulesInGPU.moduleType[innerInnerLowerModuleIndex] == lst::PS; + float dLum = alpaka::math::copysign(acc, kDeltaZLum, z_InLo); + bool isOutSgInnerMDPS = modulesInGPU.moduleType[outerInnerLowerModuleIndex] == ::lst::PS; + bool isInSgInnerMDPS = modulesInGPU.moduleType[innerInnerLowerModuleIndex] == ::lst::PS; - float rtGeom = (isInSgInnerMDPS and isOutSgInnerMDPS) ? 2.f * lst::kPixelPSZpitch - : (isInSgInnerMDPS or isOutSgInnerMDPS) ? lst::kPixelPSZpitch + lst::kStrip2SZpitch - : 2.f * lst::kStrip2SZpitch; + float rtGeom = (isInSgInnerMDPS and isOutSgInnerMDPS) ? 2.f * kPixelPSZpitch + : (isInSgInnerMDPS or isOutSgInnerMDPS) ? kPixelPSZpitch + kStrip2SZpitch + : 2.f * kStrip2SZpitch; float dz = z_OutLo - z_InLo; float rtLo = rt_InLo * (1.f + dz / (z_InLo + dLum) / dzDrtScale) - rtGeom; //slope correction only on the lower end @@ -1903,7 +1885,7 @@ namespace lst { if ((rtOut < rtLo) || (rtOut > rtHi)) return false; - bool isInSgOuterMDPS = modulesInGPU.moduleType[innerOuterLowerModuleIndex] == lst::PS; + bool isInSgOuterMDPS = modulesInGPU.moduleType[innerOuterLowerModuleIndex] == ::lst::PS; const float drtSDIn = rt_InOut - rt_InLo; const float dzSDIn = z_InOut - z_InLo; @@ -1916,12 +1898,12 @@ namespace lst { float kZ = (z_OutLo - z_InLo) / dzSDIn; float thetaMuls2 = (kMulsInGeV * kMulsInGeV) * (0.1f + 0.2f * (rt_OutLo - rt_InLo) / 50.f); - float muls2 = thetaMuls2 * 9.f / (lst::ptCut * lst::ptCut) * 16.f; + float muls2 = thetaMuls2 * 9.f / (ptCut * ptCut) * 16.f; - float drtErr = alpaka::math::sqrt( - acc, - lst::kPixelPSZpitch * lst::kPixelPSZpitch * 2.f / (dzSDIn * dzSDIn) * (dzOutInAbs * dzOutInAbs) + - muls2 * multDzDr 
* multDzDr / 3.f * coshEta * coshEta); + float drtErr = + alpaka::math::sqrt(acc, + kPixelPSZpitch * kPixelPSZpitch * 2.f / (dzSDIn * dzSDIn) * (dzOutInAbs * dzOutInAbs) + + muls2 * multDzDr * multDzDr / 3.f * coshEta * coshEta); float drtMean = drtSDIn * dzOutInAbs / alpaka::math::abs(acc, dzSDIn); float rtWindow = drtErr + rtGeom; @@ -1940,7 +1922,7 @@ namespace lst { float pvOffset = 0.1f / rtOut; float dPhiCut = alpha1GeV_OutLo + alpaka::math::sqrt(acc, muls2 + pvOffset * pvOffset); - float deltaPhiPos = lst::phi_mpi_pi(acc, mdsInGPU.anchorPhi[fourthMDIndex] - mdsInGPU.anchorPhi[secondMDIndex]); + float deltaPhiPos = phi_mpi_pi(acc, mdsInGPU.anchorPhi[fourthMDIndex] - mdsInGPU.anchorPhi[secondMDIndex]); if (alpaka::math::abs(acc, deltaPhiPos) > dPhiCut) return false; @@ -1950,7 +1932,7 @@ namespace lst { float diffX = mdsInGPU.anchorX[thirdMDIndex] - mdsInGPU.anchorX[firstMDIndex]; float diffY = mdsInGPU.anchorY[thirdMDIndex] - mdsInGPU.anchorY[firstMDIndex]; - float dPhi = lst::deltaPhi(acc, midPointX, midPointY, diffX, diffY); + float dPhi = deltaPhi(acc, midPointX, midPointY, diffX, diffY); // Cut #5: deltaPhiChange if (alpaka::math::abs(acc, dPhi) > dPhiCut) @@ -1958,21 +1940,20 @@ namespace lst { float sdIn_alpha = __H2F(segmentsInGPU.dPhiChanges[innerSegmentIndex]); float sdOut_alpha = sdIn_alpha; //weird - float sdOut_dPhiPos = lst::phi_mpi_pi(acc, mdsInGPU.anchorPhi[fourthMDIndex] - mdsInGPU.anchorPhi[thirdMDIndex]); + float sdOut_dPhiPos = phi_mpi_pi(acc, mdsInGPU.anchorPhi[fourthMDIndex] - mdsInGPU.anchorPhi[thirdMDIndex]); float sdOut_dPhiChange = __H2F(segmentsInGPU.dPhiChanges[outerSegmentIndex]); float sdOut_dPhiChange_min = __H2F(segmentsInGPU.dPhiChangeMins[outerSegmentIndex]); float sdOut_dPhiChange_max = __H2F(segmentsInGPU.dPhiChangeMaxs[outerSegmentIndex]); - float sdOut_alphaOutRHmin = lst::phi_mpi_pi(acc, sdOut_dPhiChange_min - sdOut_dPhiPos); - float sdOut_alphaOutRHmax = lst::phi_mpi_pi(acc, sdOut_dPhiChange_max - sdOut_dPhiPos); - float 
sdOut_alphaOut = lst::phi_mpi_pi(acc, sdOut_dPhiChange - sdOut_dPhiPos); + float sdOut_alphaOutRHmin = phi_mpi_pi(acc, sdOut_dPhiChange_min - sdOut_dPhiPos); + float sdOut_alphaOutRHmax = phi_mpi_pi(acc, sdOut_dPhiChange_max - sdOut_dPhiPos); + float sdOut_alphaOut = phi_mpi_pi(acc, sdOut_dPhiChange - sdOut_dPhiPos); float tl_axis_x = mdsInGPU.anchorX[fourthMDIndex] - mdsInGPU.anchorX[firstMDIndex]; float tl_axis_y = mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[firstMDIndex]; - float betaIn = - sdIn_alpha - lst::phi_mpi_pi(acc, lst::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]); + float betaIn = sdIn_alpha - phi_mpi_pi(acc, phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]); float sdIn_alphaRHmin = __H2F(segmentsInGPU.dPhiChangeMins[innerSegmentIndex]); float sdIn_alphaRHmax = __H2F(segmentsInGPU.dPhiChangeMaxs[innerSegmentIndex]); @@ -1980,7 +1961,7 @@ namespace lst { float betaInRHmax = betaIn + sdIn_alphaRHmax - sdIn_alpha; float betaOut = - -sdOut_alphaOut + lst::phi_mpi_pi(acc, lst::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[fourthMDIndex]); + -sdOut_alphaOut + phi_mpi_pi(acc, phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[fourthMDIndex]); float betaOutRHmin = betaOut - sdOut_alphaOutRHmin + sdOut_alphaOut; float betaOutRHmax = betaOut - sdOut_alphaOutRHmax + sdOut_alphaOut; @@ -2007,8 +1988,7 @@ namespace lst { float dr = alpaka::math::sqrt(acc, tl_axis_x * tl_axis_x + tl_axis_y * tl_axis_y); const float corrF = 1.f; float betaInCut = - alpaka::math::asin( - acc, alpaka::math::min(acc, (-sdIn_dr * corrF + dr) * lst::k2Rinv1GeVf / lst::ptCut, lst::kSinAlphaMax)) + + alpaka::math::asin(acc, alpaka::math::min(acc, (-sdIn_dr * corrF + dr) * k2Rinv1GeVf / ptCut, kSinAlphaMax)) + (0.02f / sdIn_d); //Cut #6: first beta cut @@ -2016,7 +1996,7 @@ namespace lst { return false; float betaAv = 0.5f * (betaIn + betaOut); - float pt_beta = dr * lst::k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); + float pt_beta = dr * 
k2Rinv1GeVf / alpaka::math::sin(acc, betaAv); int lIn = 11; //endcap int lOut = 13; //endcap @@ -2028,7 +2008,7 @@ namespace lst { (mdsInGPU.anchorY[fourthMDIndex] - mdsInGPU.anchorY[thirdMDIndex])); float sdOut_d = mdsInGPU.anchorRt[fourthMDIndex] - mdsInGPU.anchorRt[thirdMDIndex]; - lst::runDeltaBetaIterationsT5(acc, betaIn, betaOut, betaAv, pt_beta, sdIn_dr, sdOut_dr, dr, lIn); + runDeltaBetaIterationsT5(acc, betaIn, betaOut, betaAv, pt_beta, sdIn_dr, sdOut_dr, dr, lIn); const float betaInMMSF = (alpaka::math::abs(acc, betaInRHmin + betaInRHmax) > 0) ? (2.f * betaIn / alpaka::math::abs(acc, betaInRHmin + betaInRHmax)) @@ -2042,27 +2022,26 @@ namespace lst { betaOutRHmax *= betaOutMMSF; float min_ptBeta_maxPtBeta = alpaka::math::min( - acc, alpaka::math::abs(acc, pt_beta), lst::kPt_betaMax); //need to confirm the range-out value of 7 GeV + acc, alpaka::math::abs(acc, pt_beta), kPt_betaMax); //need to confirm the range-out value of 7 GeV const float dBetaMuls2 = thetaMuls2 * 16.f / (min_ptBeta_maxPtBeta * min_ptBeta_maxPtBeta); - const float alphaInAbsReg = alpaka::math::max( - acc, - alpaka::math::abs(acc, sdIn_alpha), - alpaka::math::asin(acc, alpaka::math::min(acc, rt_InLo * lst::k2Rinv1GeVf / 3.0f, lst::kSinAlphaMax))); - const float alphaOutAbsReg = alpaka::math::max( - acc, - alpaka::math::abs(acc, sdOut_alpha), - alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * lst::k2Rinv1GeVf / 3.0f, lst::kSinAlphaMax))); - const float dBetaInLum = lIn < 11 ? 0.0f : alpaka::math::abs(acc, alphaInAbsReg * lst::kDeltaZLum / z_InLo); - const float dBetaOutLum = lOut < 11 ? 
0.0f : alpaka::math::abs(acc, alphaOutAbsReg * lst::kDeltaZLum / z_OutLo); + const float alphaInAbsReg = + alpaka::math::max(acc, + alpaka::math::abs(acc, sdIn_alpha), + alpaka::math::asin(acc, alpaka::math::min(acc, rt_InLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax))); + const float alphaOutAbsReg = + alpaka::math::max(acc, + alpaka::math::abs(acc, sdOut_alpha), + alpaka::math::asin(acc, alpaka::math::min(acc, rt_OutLo * k2Rinv1GeVf / 3.0f, kSinAlphaMax))); + const float dBetaInLum = lIn < 11 ? 0.0f : alpaka::math::abs(acc, alphaInAbsReg * kDeltaZLum / z_InLo); + const float dBetaOutLum = lOut < 11 ? 0.0f : alpaka::math::abs(acc, alphaOutAbsReg * kDeltaZLum / z_OutLo); const float dBetaLum2 = (dBetaInLum + dBetaOutLum) * (dBetaInLum + dBetaOutLum); const float dBetaRIn2 = 0; // TODO-RH float dBetaROut2 = 0; //TODO-RH - float betaOutCut = - alpaka::math::asin(acc, alpaka::math::min(acc, dr * lst::k2Rinv1GeVf / lst::ptCut, lst::kSinAlphaMax)) + - (0.02f / sdOut_d) + alpaka::math::sqrt(acc, dBetaLum2 + dBetaMuls2); + float betaOutCut = alpaka::math::asin(acc, alpaka::math::min(acc, dr * k2Rinv1GeVf / ptCut, kSinAlphaMax)) + + (0.02f / sdOut_d) + alpaka::math::sqrt(acc, dBetaLum2 + dBetaMuls2); //Cut #6: The real beta cut if (alpaka::math::abs(acc, betaOut) >= betaOutCut) @@ -2077,13 +2056,13 @@ namespace lst { float dBeta = betaIn - betaOut; //Cut #7: Cut on dBeta return dBeta * dBeta <= dBetaCut2; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletAlgoSelector(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, + Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, uint16_t innerInnerLowerModuleIndex, uint16_t innerOuterLowerModuleIndex, uint16_t outerInnerLowerModuleIndex, @@ -2099,8 +2078,8 @@ namespace lst { short outerInnerLowerModuleSubdet = modulesInGPU.subdets[outerInnerLowerModuleIndex]; short outerOuterLowerModuleSubdet = 
modulesInGPU.subdets[outerOuterLowerModuleIndex]; - if (innerInnerLowerModuleSubdet == lst::Barrel and innerOuterLowerModuleSubdet == lst::Barrel and - outerInnerLowerModuleSubdet == lst::Barrel and outerOuterLowerModuleSubdet == lst::Barrel) { + if (innerInnerLowerModuleSubdet == ::lst::Barrel and innerOuterLowerModuleSubdet == ::lst::Barrel and + outerInnerLowerModuleSubdet == ::lst::Barrel and outerOuterLowerModuleSubdet == ::lst::Barrel) { return runQuintupletDefaultAlgoBBBB(acc, modulesInGPU, mdsInGPU, @@ -2115,8 +2094,8 @@ namespace lst { secondMDIndex, thirdMDIndex, fourthMDIndex); - } else if (innerInnerLowerModuleSubdet == lst::Barrel and innerOuterLowerModuleSubdet == lst::Barrel and - outerInnerLowerModuleSubdet == lst::Endcap and outerOuterLowerModuleSubdet == lst::Endcap) { + } else if (innerInnerLowerModuleSubdet == ::lst::Barrel and innerOuterLowerModuleSubdet == ::lst::Barrel and + outerInnerLowerModuleSubdet == ::lst::Endcap and outerOuterLowerModuleSubdet == ::lst::Endcap) { return runQuintupletDefaultAlgoBBEE(acc, modulesInGPU, mdsInGPU, @@ -2131,8 +2110,8 @@ namespace lst { secondMDIndex, thirdMDIndex, fourthMDIndex); - } else if (innerInnerLowerModuleSubdet == lst::Barrel and innerOuterLowerModuleSubdet == lst::Barrel and - outerInnerLowerModuleSubdet == lst::Barrel and outerOuterLowerModuleSubdet == lst::Endcap) { + } else if (innerInnerLowerModuleSubdet == ::lst::Barrel and innerOuterLowerModuleSubdet == ::lst::Barrel and + outerInnerLowerModuleSubdet == ::lst::Barrel and outerOuterLowerModuleSubdet == ::lst::Endcap) { return runQuintupletDefaultAlgoBBBB(acc, modulesInGPU, mdsInGPU, @@ -2147,8 +2126,8 @@ namespace lst { secondMDIndex, thirdMDIndex, fourthMDIndex); - } else if (innerInnerLowerModuleSubdet == lst::Barrel and innerOuterLowerModuleSubdet == lst::Endcap and - outerInnerLowerModuleSubdet == lst::Endcap and outerOuterLowerModuleSubdet == lst::Endcap) { + } else if (innerInnerLowerModuleSubdet == ::lst::Barrel and 
innerOuterLowerModuleSubdet == ::lst::Endcap and + outerInnerLowerModuleSubdet == ::lst::Endcap and outerOuterLowerModuleSubdet == ::lst::Endcap) { return runQuintupletDefaultAlgoBBEE(acc, modulesInGPU, mdsInGPU, @@ -2163,8 +2142,8 @@ namespace lst { secondMDIndex, thirdMDIndex, fourthMDIndex); - } else if (innerInnerLowerModuleSubdet == lst::Endcap and innerOuterLowerModuleSubdet == lst::Endcap and - outerInnerLowerModuleSubdet == lst::Endcap and outerOuterLowerModuleSubdet == lst::Endcap) { + } else if (innerInnerLowerModuleSubdet == ::lst::Endcap and innerOuterLowerModuleSubdet == ::lst::Endcap and + outerInnerLowerModuleSubdet == ::lst::Endcap and outerOuterLowerModuleSubdet == ::lst::Endcap) { return runQuintupletDefaultAlgoEEEE(acc, modulesInGPU, mdsInGPU, @@ -2182,14 +2161,14 @@ namespace lst { } return false; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runQuintupletDefaultAlgo(TAcc const& acc, - struct lst::Modules& modulesInGPU, - struct lst::MiniDoublets& mdsInGPU, - struct lst::Segments& segmentsInGPU, - struct lst::Triplets& tripletsInGPU, + Modules& modulesInGPU, + MiniDoublets& mdsInGPU, + Segments& segmentsInGPU, + Triplets& tripletsInGPU, uint16_t lowerModuleIndex1, uint16_t lowerModuleIndex2, uint16_t lowerModuleIndex3, @@ -2279,24 +2258,24 @@ namespace lst { float x3Vec[] = {x3, x3, x3}; float y3Vec[] = {y3, y3, y3}; - if (modulesInGPU.subdets[lowerModuleIndex1] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex1] == lst::TwoS) { + if (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex1] == ::lst::TwoS) { x1Vec[1] = mdsInGPU.anchorLowEdgeX[firstMDIndex]; x1Vec[2] = mdsInGPU.anchorHighEdgeX[firstMDIndex]; y1Vec[1] = mdsInGPU.anchorLowEdgeY[firstMDIndex]; y1Vec[2] = mdsInGPU.anchorHighEdgeY[firstMDIndex]; } - if (modulesInGPU.subdets[lowerModuleIndex2] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex2] == lst::TwoS) { + if (modulesInGPU.subdets[lowerModuleIndex2] 
== ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex2] == ::lst::TwoS) { x2Vec[1] = mdsInGPU.anchorLowEdgeX[secondMDIndex]; x2Vec[2] = mdsInGPU.anchorHighEdgeX[secondMDIndex]; y2Vec[1] = mdsInGPU.anchorLowEdgeY[secondMDIndex]; y2Vec[2] = mdsInGPU.anchorHighEdgeY[secondMDIndex]; } - if (modulesInGPU.subdets[lowerModuleIndex3] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex3] == lst::TwoS) { + if (modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex3] == ::lst::TwoS) { x3Vec[1] = mdsInGPU.anchorLowEdgeX[thirdMDIndex]; x3Vec[2] = mdsInGPU.anchorHighEdgeX[thirdMDIndex]; @@ -2311,8 +2290,8 @@ namespace lst { x1Vec[i] = x4; y1Vec[i] = y4; } - if (modulesInGPU.subdets[lowerModuleIndex4] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex4] == lst::TwoS) { + if (modulesInGPU.subdets[lowerModuleIndex4] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex4] == ::lst::TwoS) { x1Vec[1] = mdsInGPU.anchorLowEdgeX[fourthMDIndex]; x1Vec[2] = mdsInGPU.anchorHighEdgeX[fourthMDIndex]; @@ -2327,8 +2306,8 @@ namespace lst { x2Vec[i] = x5; y2Vec[i] = y5; } - if (modulesInGPU.subdets[lowerModuleIndex5] == lst::Endcap and - modulesInGPU.moduleType[lowerModuleIndex5] == lst::TwoS) { + if (modulesInGPU.subdets[lowerModuleIndex5] == ::lst::Endcap and + modulesInGPU.moduleType[lowerModuleIndex5] == ::lst::TwoS) { x2Vec[1] = mdsInGPU.anchorLowEdgeX[fifthMDIndex]; x2Vec[2] = mdsInGPU.anchorHighEdgeX[fifthMDIndex]; @@ -2377,23 +2356,23 @@ namespace lst { //split by category bool matchedRadii; - if (modulesInGPU.subdets[lowerModuleIndex1] == lst::Barrel and - modulesInGPU.subdets[lowerModuleIndex2] == lst::Barrel and - modulesInGPU.subdets[lowerModuleIndex3] == lst::Barrel and - modulesInGPU.subdets[lowerModuleIndex4] == lst::Barrel and - modulesInGPU.subdets[lowerModuleIndex5] == lst::Barrel) { + if (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Barrel and + 
modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Barrel and + modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Barrel and + modulesInGPU.subdets[lowerModuleIndex4] == ::lst::Barrel and + modulesInGPU.subdets[lowerModuleIndex5] == ::lst::Barrel) { matchedRadii = matchRadiiBBBBB(acc, innerRadius, bridgeRadius, outerRadius); - } else if (modulesInGPU.subdets[lowerModuleIndex1] == lst::Barrel and - modulesInGPU.subdets[lowerModuleIndex2] == lst::Barrel and - modulesInGPU.subdets[lowerModuleIndex3] == lst::Barrel and - modulesInGPU.subdets[lowerModuleIndex4] == lst::Barrel and - modulesInGPU.subdets[lowerModuleIndex5] == lst::Endcap) { + } else if (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Barrel and + modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Barrel and + modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Barrel and + modulesInGPU.subdets[lowerModuleIndex4] == ::lst::Barrel and + modulesInGPU.subdets[lowerModuleIndex5] == ::lst::Endcap) { matchedRadii = matchRadiiBBBBE(acc, innerRadius, bridgeRadius, outerRadius); - } else if (modulesInGPU.subdets[lowerModuleIndex1] == lst::Barrel and - modulesInGPU.subdets[lowerModuleIndex2] == lst::Barrel and - modulesInGPU.subdets[lowerModuleIndex3] == lst::Barrel and - modulesInGPU.subdets[lowerModuleIndex4] == lst::Endcap and - modulesInGPU.subdets[lowerModuleIndex5] == lst::Endcap) { + } else if (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Barrel and + modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Barrel and + modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Barrel and + modulesInGPU.subdets[lowerModuleIndex4] == ::lst::Endcap and + modulesInGPU.subdets[lowerModuleIndex5] == ::lst::Endcap) { if (modulesInGPU.layers[lowerModuleIndex1] == 1) { matchedRadii = matchRadiiBBBEE12378(acc, innerRadius, bridgeRadius, outerRadius, bridgeRadiusMin2S, bridgeRadiusMax2S); @@ -2406,17 +2385,17 @@ namespace lst { } } - else if (modulesInGPU.subdets[lowerModuleIndex1] == lst::Barrel and - 
modulesInGPU.subdets[lowerModuleIndex2] == lst::Barrel and - modulesInGPU.subdets[lowerModuleIndex3] == lst::Endcap and - modulesInGPU.subdets[lowerModuleIndex4] == lst::Endcap and - modulesInGPU.subdets[lowerModuleIndex5] == lst::Endcap) { + else if (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Barrel and + modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Barrel and + modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Endcap and + modulesInGPU.subdets[lowerModuleIndex4] == ::lst::Endcap and + modulesInGPU.subdets[lowerModuleIndex5] == ::lst::Endcap) { matchedRadii = matchRadiiBBEEE(acc, innerRadius, bridgeRadius, outerRadius, bridgeRadiusMin2S, bridgeRadiusMax2S); - } else if (modulesInGPU.subdets[lowerModuleIndex1] == lst::Barrel and - modulesInGPU.subdets[lowerModuleIndex2] == lst::Endcap and - modulesInGPU.subdets[lowerModuleIndex3] == lst::Endcap and - modulesInGPU.subdets[lowerModuleIndex4] == lst::Endcap and - modulesInGPU.subdets[lowerModuleIndex5] == lst::Endcap) { + } else if (modulesInGPU.subdets[lowerModuleIndex1] == ::lst::Barrel and + modulesInGPU.subdets[lowerModuleIndex2] == ::lst::Endcap and + modulesInGPU.subdets[lowerModuleIndex3] == ::lst::Endcap and + modulesInGPU.subdets[lowerModuleIndex4] == ::lst::Endcap and + modulesInGPU.subdets[lowerModuleIndex5] == ::lst::Endcap) { matchedRadii = matchRadiiBEEEE(acc, innerRadius, bridgeRadius, @@ -2465,22 +2444,22 @@ namespace lst { #ifdef USE_T5_DNN unsigned int mdIndices[] = {firstMDIndex, secondMDIndex, thirdMDIndex, fourthMDIndex, fifthMDIndex}; - float inference = lst::t5dnn::runInference(acc, - modulesInGPU, - mdsInGPU, - segmentsInGPU, - tripletsInGPU, - xVec, - yVec, - mdIndices, - lowerModuleIndices, - innerTripletIndex, - outerTripletIndex, - innerRadius, - outerRadius, - bridgeRadius); - TightCutFlag = TightCutFlag and (inference > lst::t5dnn::kLSTWp2); // T5-in-TC cut - if (inference <= lst::t5dnn::kLSTWp2) // T5-building cut + float inference = t5dnn::runInference(acc, + modulesInGPU, + 
mdsInGPU, + segmentsInGPU, + tripletsInGPU, + xVec, + yVec, + mdIndices, + lowerModuleIndices, + innerTripletIndex, + outerTripletIndex, + innerRadius, + outerRadius, + bridgeRadius); + TightCutFlag = TightCutFlag and (inference > t5dnn::kLSTWp2); // T5-in-TC cut + if (inference <= t5dnn::kLSTWp2) // T5-building cut return false; #endif @@ -2533,17 +2512,17 @@ namespace lst { regressionF, regressionRadius); return true; - }; + } - struct createQuintupletsInGPUv2 { + struct CreateQuintupletsInGPUv2 { template ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - lst::MiniDoublets mdsInGPU, - lst::Segments segmentsInGPU, - lst::Triplets tripletsInGPU, - lst::Quintuplets quintupletsInGPU, - lst::ObjectRanges rangesInGPU, + Modules modulesInGPU, + MiniDoublets mdsInGPU, + Segments segmentsInGPU, + Triplets tripletsInGPU, + Quintuplets quintupletsInGPU, + ObjectRanges rangesInGPU, uint16_t nEligibleT5Modules) const { auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -2602,15 +2581,15 @@ namespace lst { TightCutFlag); if (success) { - int totOccupancyQuintuplets = - alpaka::atomicOp(acc, &quintupletsInGPU.totOccupancyQuintuplets[lowerModule1], 1u); + int totOccupancyQuintuplets = alpaka::atomicAdd( + acc, &quintupletsInGPU.totOccupancyQuintuplets[lowerModule1], 1u, alpaka::hierarchy::Threads{}); if (totOccupancyQuintuplets >= rangesInGPU.quintupletModuleOccupancy[lowerModule1]) { #ifdef WARNINGS printf("Quintuplet excess alert! Module index = %d\n", lowerModule1); #endif } else { - int quintupletModuleIndex = - alpaka::atomicOp(acc, &quintupletsInGPU.nQuintuplets[lowerModule1], 1u); + int quintupletModuleIndex = alpaka::atomicAdd( + acc, &quintupletsInGPU.nQuintuplets[lowerModule1], 1u, alpaka::hierarchy::Threads{}); //this if statement should never get executed! 
if (rangesInGPU.quintupletModuleIndices[lowerModule1] == -1) { #ifdef WARNINGS @@ -2625,7 +2604,7 @@ namespace lst { float eta = mdsInGPU.anchorEta[segmentsInGPU.mdIndices[2 * tripletsInGPU.segmentIndices[2 * innerTripletIndex + layer2_adjustment]]]; - float pt = (innerRadius + outerRadius) * lst::k2Rinv1GeVf; + float pt = (innerRadius + outerRadius) * k2Rinv1GeVf; float scores = chiSquared + nonAnchorChiSquared; addQuintupletToMemory(tripletsInGPU, quintupletsInGPU, @@ -2664,12 +2643,16 @@ namespace lst { } }; - struct createEligibleModulesListForQuintupletsGPU { + struct CreateEligibleModulesListForQuintupletsGPU { template ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - lst::Triplets tripletsInGPU, - lst::ObjectRanges rangesInGPU) const { + Modules modulesInGPU, + Triplets tripletsInGPU, + ObjectRanges rangesInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -2682,10 +2665,10 @@ namespace lst { } alpaka::syncBlockThreads(acc); - // Initialize variables outside of the for loop. + // Create variables outside of the for loop. 
int occupancy, category_number, eta_number; - for (int i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + for (int i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) { // Condition for a quintuple to exist for a module // TCs don't exist for layers 5 and 6 barrel, and layers 2,3,4,5 endcap short module_rings = modulesInGPU.rings[i]; @@ -2695,12 +2678,12 @@ namespace lst { if (tripletsInGPU.nTriplets[i] == 0) continue; - if (module_subdets == lst::Barrel and module_layers >= 3) + if (module_subdets == ::lst::Barrel and module_layers >= 3) continue; - if (module_subdets == lst::Endcap and module_layers > 1) + if (module_subdets == ::lst::Endcap and module_layers > 1) continue; - int nEligibleT5Modules = alpaka::atomicOp(acc, &nEligibleT5Modulesx, 1); + int nEligibleT5Modules = alpaka::atomicAdd(acc, &nEligibleT5Modulesx, 1, alpaka::hierarchy::Threads{}); if (module_layers <= 3 && module_subdets == 5) category_number = 0; @@ -2749,7 +2732,7 @@ namespace lst { #endif } - int nTotQ = alpaka::atomicOp(acc, &nTotalQuintupletsx, occupancy); + int nTotQ = alpaka::atomicAdd(acc, &nTotalQuintupletsx, occupancy, alpaka::hierarchy::Threads{}); rangesInGPU.quintupletModuleIndices[i] = nTotQ; rangesInGPU.indicesOfEligibleT5Modules[nEligibleT5Modules] = i; rangesInGPU.quintupletModuleOccupancy[i] = occupancy; @@ -2757,23 +2740,27 @@ namespace lst { // Wait for all threads to finish before reporting final values alpaka::syncBlockThreads(acc); - if (globalThreadIdx[2] == 0) { + if (cms::alpakatools::once_per_block(acc)) { *rangesInGPU.nEligibleT5Modules = static_cast(nEligibleT5Modulesx); *rangesInGPU.device_nTotalQuints = static_cast(nTotalQuintupletsx); } } }; - struct addQuintupletRangesToEventExplicit { + struct AddQuintupletRangesToEventExplicit { template ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - lst::Quintuplets quintupletsInGPU, - lst::ObjectRanges rangesInGPU) const { + 
Modules modulesInGPU, + Quintuplets quintupletsInGPU, + ObjectRanges rangesInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); - for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) { if (quintupletsInGPU.nQuintuplets[i] == 0 or rangesInGPU.quintupletModuleIndices[i] == -1) { rangesInGPU.quintupletRanges[i * 2] = -1; rangesInGPU.quintupletRanges[i * 2 + 1] = -1; @@ -2785,5 +2772,5 @@ namespace lst { } } }; -} // namespace lst +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::lst #endif diff --git a/RecoTracker/LSTCore/src/alpaka/Segment.h b/RecoTracker/LSTCore/src/alpaka/Segment.h index 6e79bacfa4902..b74de58f3c233 100644 --- a/RecoTracker/LSTCore/src/alpaka/Segment.h +++ b/RecoTracker/LSTCore/src/alpaka/Segment.h @@ -11,7 +11,7 @@ #include "Hit.h" #include "ObjectRanges.h" -namespace lst { +namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { struct Segments { FPX* dPhis; FPX* dPhiMins; @@ -50,40 +50,40 @@ namespace lst { template void setData(TBuff& buf) { - dPhis = alpaka::getPtrNative(buf.dPhis_buf); - dPhiMins = alpaka::getPtrNative(buf.dPhiMins_buf); - dPhiMaxs = alpaka::getPtrNative(buf.dPhiMaxs_buf); - dPhiChanges = alpaka::getPtrNative(buf.dPhiChanges_buf); - dPhiChangeMins = alpaka::getPtrNative(buf.dPhiChangeMins_buf); - dPhiChangeMaxs = alpaka::getPtrNative(buf.dPhiChangeMaxs_buf); - innerLowerModuleIndices = alpaka::getPtrNative(buf.innerLowerModuleIndices_buf); - outerLowerModuleIndices = alpaka::getPtrNative(buf.outerLowerModuleIndices_buf); - seedIdx = alpaka::getPtrNative(buf.seedIdx_buf); - mdIndices = alpaka::getPtrNative(buf.mdIndices_buf); - nMemoryLocations = 
alpaka::getPtrNative(buf.nMemoryLocations_buf); - innerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(buf.innerMiniDoubletAnchorHitIndices_buf); - outerMiniDoubletAnchorHitIndices = alpaka::getPtrNative(buf.outerMiniDoubletAnchorHitIndices_buf); - charge = alpaka::getPtrNative(buf.charge_buf); - superbin = alpaka::getPtrNative(buf.superbin_buf); - nSegments = alpaka::getPtrNative(buf.nSegments_buf); - totOccupancySegments = alpaka::getPtrNative(buf.totOccupancySegments_buf); - pLSHitsIdxs = alpaka::getPtrNative(buf.pLSHitsIdxs_buf); - pixelType = alpaka::getPtrNative(buf.pixelType_buf); - isQuad = alpaka::getPtrNative(buf.isQuad_buf); - isDup = alpaka::getPtrNative(buf.isDup_buf); - partOfPT5 = alpaka::getPtrNative(buf.partOfPT5_buf); - ptIn = alpaka::getPtrNative(buf.ptIn_buf); - ptErr = alpaka::getPtrNative(buf.ptErr_buf); - px = alpaka::getPtrNative(buf.px_buf); - py = alpaka::getPtrNative(buf.py_buf); - pz = alpaka::getPtrNative(buf.pz_buf); - etaErr = alpaka::getPtrNative(buf.etaErr_buf); - eta = alpaka::getPtrNative(buf.eta_buf); - phi = alpaka::getPtrNative(buf.phi_buf); - score = alpaka::getPtrNative(buf.score_buf); - circleCenterX = alpaka::getPtrNative(buf.circleCenterX_buf); - circleCenterY = alpaka::getPtrNative(buf.circleCenterY_buf); - circleRadius = alpaka::getPtrNative(buf.circleRadius_buf); + dPhis = buf.dPhis_buf.data(); + dPhiMins = buf.dPhiMins_buf.data(); + dPhiMaxs = buf.dPhiMaxs_buf.data(); + dPhiChanges = buf.dPhiChanges_buf.data(); + dPhiChangeMins = buf.dPhiChangeMins_buf.data(); + dPhiChangeMaxs = buf.dPhiChangeMaxs_buf.data(); + innerLowerModuleIndices = buf.innerLowerModuleIndices_buf.data(); + outerLowerModuleIndices = buf.outerLowerModuleIndices_buf.data(); + seedIdx = buf.seedIdx_buf.data(); + mdIndices = buf.mdIndices_buf.data(); + nMemoryLocations = buf.nMemoryLocations_buf.data(); + innerMiniDoubletAnchorHitIndices = buf.innerMiniDoubletAnchorHitIndices_buf.data(); + outerMiniDoubletAnchorHitIndices = 
buf.outerMiniDoubletAnchorHitIndices_buf.data(); + charge = buf.charge_buf.data(); + superbin = buf.superbin_buf.data(); + nSegments = buf.nSegments_buf.data(); + totOccupancySegments = buf.totOccupancySegments_buf.data(); + pLSHitsIdxs = buf.pLSHitsIdxs_buf.data(); + pixelType = buf.pixelType_buf.data(); + isQuad = buf.isQuad_buf.data(); + isDup = buf.isDup_buf.data(); + partOfPT5 = buf.partOfPT5_buf.data(); + ptIn = buf.ptIn_buf.data(); + ptErr = buf.ptErr_buf.data(); + px = buf.px_buf.data(); + py = buf.py_buf.data(); + pz = buf.pz_buf.data(); + etaErr = buf.etaErr_buf.data(); + eta = buf.eta_buf.data(); + phi = buf.phi_buf.data(); + score = buf.score_buf.data(); + circleCenterX = buf.circleCenterX_buf.data(); + circleCenterY = buf.circleCenterY_buf.data(); + circleRadius = buf.circleRadius_buf.data(); } }; @@ -170,14 +170,13 @@ namespace lst { alpaka::memset(queue, totOccupancySegments_buf, 0u); alpaka::memset(queue, partOfPT5_buf, false); alpaka::memset(queue, pLSHitsIdxs_buf, 0u); - alpaka::wait(queue); } inline Segments const* data() const { return &data_; } inline void setData(SegmentsBuffer& buf) { data_.setData(buf); } }; - ALPAKA_FN_ACC ALPAKA_FN_INLINE float isTighterTiltedModules_seg(lst::Modules const& modulesInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE float isTighterTiltedModules_seg(Modules const& modulesInGPU, unsigned int moduleIndex) { // The "tighter" tilted modules are the subset of tilted modules that have smaller spacing // This is the same as what was previously considered as"isNormalTiltedModules" @@ -187,19 +186,21 @@ namespace lst { short side = modulesInGPU.sides[moduleIndex]; short rod = modulesInGPU.rods[moduleIndex]; - return (subdet == Barrel) && (((side != Center) && (layer == 3)) || - ((side == NegZ) && (((layer == 2) && (rod > 5)) || ((layer == 1) && (rod > 9)))) || - ((side == PosZ) && (((layer == 2) && (rod < 8)) || ((layer == 1) && (rod < 4))))); - }; + return (subdet == ::lst::Barrel) && + (((side != ::lst::Center) && (layer == 
3)) || + ((side == ::lst::NegZ) && (((layer == 2) && (rod > 5)) || ((layer == 1) && (rod > 9)))) || + ((side == ::lst::PosZ) && (((layer == 2) && (rod < 8)) || ((layer == 1) && (rod < 4))))); + } ALPAKA_FN_ACC ALPAKA_FN_INLINE float isTighterTiltedModules_seg(short subdet, short layer, short side, short rod) { // The "tighter" tilted modules are the subset of tilted modules that have smaller spacing // This is the same as what was previously considered as"isNormalTiltedModules" // See Figure 9.1 of https://cds.cern.ch/record/2272264/files/CMS-TDR-014.pdf - return (subdet == Barrel) && (((side != Center) && (layer == 3)) || - ((side == NegZ) && (((layer == 2) && (rod > 5)) || ((layer == 1) && (rod > 9)))) || - ((side == PosZ) && (((layer == 2) && (rod < 8)) || ((layer == 1) && (rod < 4))))); - }; + return (subdet == ::lst::Barrel) && + (((side != ::lst::Center) && (layer == 3)) || + ((side == ::lst::NegZ) && (((layer == 2) && (rod > 5)) || ((layer == 1) && (rod > 9)))) || + ((side == ::lst::PosZ) && (((layer == 2) && (rod < 8)) || ((layer == 1) && (rod < 4))))); + } ALPAKA_FN_ACC ALPAKA_FN_INLINE float moduleGapSize_seg(short layer, short ring, short subdet, short side, short rod) { static constexpr float miniDeltaTilted[3] = {0.26f, 0.26f, 0.26f}; @@ -217,11 +218,11 @@ namespace lst { float moduleSeparation = 0; - if (subdet == Barrel and side == Center) { + if (subdet == ::lst::Barrel and side == ::lst::Center) { moduleSeparation = miniDeltaFlat[iL]; } else if (isTighterTiltedModules_seg(subdet, layer, side, rod)) { moduleSeparation = miniDeltaTilted[iL]; - } else if (subdet == Endcap) { + } else if (subdet == ::lst::Endcap) { moduleSeparation = miniDeltaEndcap[iL][iR]; } else //Loose tilted modules { @@ -229,9 +230,9 @@ namespace lst { } return moduleSeparation; - }; + } - ALPAKA_FN_ACC ALPAKA_FN_INLINE float moduleGapSize_seg(lst::Modules const& modulesInGPU, unsigned int moduleIndex) { + ALPAKA_FN_ACC ALPAKA_FN_INLINE float moduleGapSize_seg(Modules const& 
modulesInGPU, unsigned int moduleIndex) { static constexpr float miniDeltaTilted[3] = {0.26f, 0.26f, 0.26f}; static constexpr float miniDeltaFlat[6] = {0.26f, 0.16f, 0.16f, 0.18f, 0.18f, 0.18f}; static constexpr float miniDeltaLooseTilted[3] = {0.4f, 0.4f, 0.4f}; @@ -249,11 +250,11 @@ namespace lst { float moduleSeparation = 0; - if (subdet == Barrel and side == Center) { + if (subdet == ::lst::Barrel and side == ::lst::Center) { moduleSeparation = miniDeltaFlat[iL]; } else if (isTighterTiltedModules_seg(modulesInGPU, moduleIndex)) { moduleSeparation = miniDeltaTilted[iL]; - } else if (subdet == Endcap) { + } else if (subdet == ::lst::Endcap) { moduleSeparation = miniDeltaEndcap[iL][iR]; } else //Loose tilted modules { @@ -261,13 +262,13 @@ namespace lst { } return moduleSeparation; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE void dAlphaThreshold(TAcc const& acc, float* dAlphaThresholdValues, - lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, + Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, float xIn, float yIn, float zIn, @@ -280,7 +281,7 @@ namespace lst { uint16_t outerLowerModuleIndex, unsigned int innerMDIndex, unsigned int outerMDIndex) { - float sdMuls = (modulesInGPU.subdets[innerLowerModuleIndex] == lst::Barrel) + float sdMuls = (modulesInGPU.subdets[innerLowerModuleIndex] == ::lst::Barrel) ? 
kMiniMulsPtScaleBarrel[modulesInGPU.layers[innerLowerModuleIndex] - 1] * 3.f / ptCut : kMiniMulsPtScaleEndcap[modulesInGPU.layers[innerLowerModuleIndex] - 1] * 3.f / ptCut; @@ -290,15 +291,15 @@ namespace lst { const float dAlpha_Bfield = alpaka::math::asin(acc, alpaka::math::min(acc, segmentDr * k2Rinv1GeVf / ptCut, kSinAlphaMax)); - bool isInnerTilted = modulesInGPU.subdets[innerLowerModuleIndex] == lst::Barrel and - modulesInGPU.sides[innerLowerModuleIndex] != lst::Center; - bool isOuterTilted = modulesInGPU.subdets[outerLowerModuleIndex] == lst::Barrel and - modulesInGPU.sides[outerLowerModuleIndex] != lst::Center; + bool isInnerTilted = modulesInGPU.subdets[innerLowerModuleIndex] == ::lst::Barrel and + modulesInGPU.sides[innerLowerModuleIndex] != ::lst::Center; + bool isOuterTilted = modulesInGPU.subdets[outerLowerModuleIndex] == ::lst::Barrel and + modulesInGPU.sides[outerLowerModuleIndex] != ::lst::Center; float drdzInner = modulesInGPU.drdzs[innerLowerModuleIndex]; float drdzOuter = modulesInGPU.drdzs[outerLowerModuleIndex]; - float innerModuleGapSize = lst::moduleGapSize_seg(modulesInGPU, innerLowerModuleIndex); - float outerModuleGapSize = lst::moduleGapSize_seg(modulesInGPU, outerLowerModuleIndex); + float innerModuleGapSize = moduleGapSize_seg(modulesInGPU, innerLowerModuleIndex); + float outerModuleGapSize = moduleGapSize_seg(modulesInGPU, outerLowerModuleIndex); const float innerminiTilt2 = isInnerTilted ? 
((0.5f * 0.5f) * (kPixelPSZpitch * kPixelPSZpitch) * (drdzInner * drdzInner) / (1.f + drdzInner * drdzInner) / (innerModuleGapSize * innerModuleGapSize)) @@ -314,14 +315,14 @@ namespace lst { float sdLumForInnerMini2; float sdLumForOuterMini2; - if (modulesInGPU.subdets[innerLowerModuleIndex] == lst::Barrel) { + if (modulesInGPU.subdets[innerLowerModuleIndex] == ::lst::Barrel) { sdLumForInnerMini2 = innerminiTilt2 * (dAlpha_Bfield * dAlpha_Bfield); } else { sdLumForInnerMini2 = (mdsInGPU.dphis[innerMDIndex] * mdsInGPU.dphis[innerMDIndex]) * (kDeltaZLum * kDeltaZLum) / (mdsInGPU.dzs[innerMDIndex] * mdsInGPU.dzs[innerMDIndex]); } - if (modulesInGPU.subdets[outerLowerModuleIndex] == lst::Barrel) { + if (modulesInGPU.subdets[outerLowerModuleIndex] == ::lst::Barrel) { sdLumForOuterMini2 = outerminiTilt2 * (dAlpha_Bfield * dAlpha_Bfield); } else { sdLumForOuterMini2 = (mdsInGPU.dphis[outerMDIndex] * mdsInGPU.dphis[outerMDIndex]) * (kDeltaZLum * kDeltaZLum) / @@ -331,23 +332,23 @@ namespace lst { // Unique stuff for the segment dudes alone float dAlpha_res_inner = 0.02f / miniDelta * - (modulesInGPU.subdets[innerLowerModuleIndex] == lst::Barrel ? 1.0f : alpaka::math::abs(acc, zIn) / rtIn); + (modulesInGPU.subdets[innerLowerModuleIndex] == ::lst::Barrel ? 1.0f : alpaka::math::abs(acc, zIn) / rtIn); float dAlpha_res_outer = 0.02f / miniDelta * - (modulesInGPU.subdets[outerLowerModuleIndex] == lst::Barrel ? 1.0f : alpaka::math::abs(acc, zOut) / rtOut); + (modulesInGPU.subdets[outerLowerModuleIndex] == ::lst::Barrel ? 
1.0f : alpaka::math::abs(acc, zOut) / rtOut); float dAlpha_res = dAlpha_res_inner + dAlpha_res_outer; - if (modulesInGPU.subdets[innerLowerModuleIndex] == lst::Barrel and - modulesInGPU.sides[innerLowerModuleIndex] == lst::Center) { + if (modulesInGPU.subdets[innerLowerModuleIndex] == ::lst::Barrel and + modulesInGPU.sides[innerLowerModuleIndex] == ::lst::Center) { dAlphaThresholdValues[0] = dAlpha_Bfield + alpaka::math::sqrt(acc, dAlpha_res * dAlpha_res + sdMuls * sdMuls); } else { dAlphaThresholdValues[0] = dAlpha_Bfield + alpaka::math::sqrt(acc, dAlpha_res * dAlpha_res + sdMuls * sdMuls + sdLumForInnerMini2); } - if (modulesInGPU.subdets[outerLowerModuleIndex] == lst::Barrel and - modulesInGPU.sides[outerLowerModuleIndex] == lst::Center) { + if (modulesInGPU.subdets[outerLowerModuleIndex] == ::lst::Barrel and + modulesInGPU.sides[outerLowerModuleIndex] == ::lst::Center) { dAlphaThresholdValues[1] = dAlpha_Bfield + alpaka::math::sqrt(acc, dAlpha_res * dAlpha_res + sdMuls * sdMuls); } else { dAlphaThresholdValues[1] = @@ -356,9 +357,9 @@ namespace lst { //Inner to outer dAlphaThresholdValues[2] = dAlpha_Bfield + alpaka::math::sqrt(acc, dAlpha_res * dAlpha_res + sdMuls * sdMuls); - }; + } - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addSegmentToMemory(lst::Segments& segmentsInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addSegmentToMemory(Segments& segmentsInGPU, unsigned int lowerMDIndex, unsigned int upperMDIndex, uint16_t innerLowerModuleIndex, @@ -389,8 +390,8 @@ namespace lst { template ALPAKA_FN_ACC ALPAKA_FN_INLINE void addPixelSegmentToMemory(TAcc const& acc, - lst::Segments& segmentsInGPU, - lst::MiniDoublets const& mdsInGPU, + Segments& segmentsInGPU, + MiniDoublets const& mdsInGPU, unsigned int innerMDIndex, unsigned int outerMDIndex, uint16_t pixelModuleIndex, @@ -428,7 +429,7 @@ namespace lst { mdsInGPU.anchorY[innerMDIndex] + circleRadius * alpaka::math::cos(acc, circlePhi)}; //check which of the circles can accommodate r3LH better (we won't get perfect 
agreement) - float bestChiSquared = lst::lst_INF; + float bestChiSquared = lst_INF; float chiSquared; size_t bestIndex; for (size_t i = 0; i < 2; i++) { @@ -448,12 +449,12 @@ namespace lst { segmentsInGPU.circleCenterX[pixelSegmentArrayIndex] = candidateCenterXs[bestIndex]; segmentsInGPU.circleCenterY[pixelSegmentArrayIndex] = candidateCenterYs[bestIndex]; segmentsInGPU.circleRadius[pixelSegmentArrayIndex] = circleRadius; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runSegmentDefaultAlgoBarrel(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, + Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, uint16_t innerLowerModuleIndex, uint16_t outerLowerModuleIndex, unsigned int innerMDIndex, @@ -464,7 +465,7 @@ namespace lst { float& dPhiChange, float& dPhiChangeMin, float& dPhiChangeMax) { - float sdMuls = (modulesInGPU.subdets[innerLowerModuleIndex] == lst::Barrel) + float sdMuls = (modulesInGPU.subdets[innerLowerModuleIndex] == ::lst::Barrel) ? 
kMiniMulsPtScaleBarrel[modulesInGPU.layers[innerLowerModuleIndex] - 1] * 3.f / ptCut : kMiniMulsPtScaleEndcap[modulesInGPU.layers[innerLowerModuleIndex] - 1] * 3.f / ptCut; @@ -495,12 +496,12 @@ namespace lst { float sdCut = sdSlope + alpaka::math::sqrt(acc, sdMuls * sdMuls + sdPVoff * sdPVoff); - dPhi = lst::phi_mpi_pi(acc, mdsInGPU.anchorPhi[outerMDIndex] - mdsInGPU.anchorPhi[innerMDIndex]); + dPhi = phi_mpi_pi(acc, mdsInGPU.anchorPhi[outerMDIndex] - mdsInGPU.anchorPhi[innerMDIndex]); if (alpaka::math::abs(acc, dPhi) > sdCut) return false; - dPhiChange = lst::phi_mpi_pi(acc, lst::phi(acc, xOut - xIn, yOut - yIn) - mdsInGPU.anchorPhi[innerMDIndex]); + dPhiChange = phi_mpi_pi(acc, phi(acc, xOut - xIn, yOut - yIn) - mdsInGPU.anchorPhi[innerMDIndex]); if (alpaka::math::abs(acc, dPhiChange) > sdCut) return false; @@ -538,12 +539,12 @@ namespace lst { if (alpaka::math::abs(acc, dAlphaOuterMDSegment) >= dAlphaOuterMDSegmentThreshold) return false; return alpaka::math::abs(acc, dAlphaInnerMDOuterMD) < dAlphaInnerMDOuterMDThreshold; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runSegmentDefaultAlgoEndcap(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, + Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, uint16_t innerLowerModuleIndex, uint16_t outerLowerModuleIndex, unsigned int innerMDIndex, @@ -566,8 +567,8 @@ namespace lst { zOut = mdsInGPU.anchorZ[outerMDIndex]; rtOut = mdsInGPU.anchorRt[outerMDIndex]; - bool outerLayerEndcapTwoS = (modulesInGPU.subdets[outerLowerModuleIndex] == lst::Endcap) && - (modulesInGPU.moduleType[outerLowerModuleIndex] == lst::TwoS); + bool outerLayerEndcapTwoS = (modulesInGPU.subdets[outerLowerModuleIndex] == ::lst::Endcap) && + (modulesInGPU.moduleType[outerLowerModuleIndex] == ::lst::TwoS); float sdSlope = alpaka::math::asin(acc, alpaka::math::min(acc, rtOut * k2Rinv1GeVf / ptCut, kSinAlphaMax)); float disks2SMinRadius = 60.f; @@ -595,14 +596,12 @@ namespace lst { if ((rtOut < 
rtLo) || (rtOut > rtHi)) return false; - dPhi = lst::phi_mpi_pi(acc, mdsInGPU.anchorPhi[outerMDIndex] - mdsInGPU.anchorPhi[innerMDIndex]); + dPhi = phi_mpi_pi(acc, mdsInGPU.anchorPhi[outerMDIndex] - mdsInGPU.anchorPhi[innerMDIndex]); float sdCut = sdSlope; if (outerLayerEndcapTwoS) { - float dPhiPos_high = - lst::phi_mpi_pi(acc, mdsInGPU.anchorHighEdgePhi[outerMDIndex] - mdsInGPU.anchorPhi[innerMDIndex]); - float dPhiPos_low = - lst::phi_mpi_pi(acc, mdsInGPU.anchorLowEdgePhi[outerMDIndex] - mdsInGPU.anchorPhi[innerMDIndex]); + float dPhiPos_high = phi_mpi_pi(acc, mdsInGPU.anchorHighEdgePhi[outerMDIndex] - mdsInGPU.anchorPhi[innerMDIndex]); + float dPhiPos_low = phi_mpi_pi(acc, mdsInGPU.anchorLowEdgePhi[outerMDIndex] - mdsInGPU.anchorPhi[innerMDIndex]); dPhiMax = alpaka::math::abs(acc, dPhiPos_high) > alpaka::math::abs(acc, dPhiPos_low) ? dPhiPos_high : dPhiPos_low; dPhiMin = alpaka::math::abs(acc, dPhiPos_high) > alpaka::math::abs(acc, dPhiPos_low) ? dPhiPos_low : dPhiPos_high; @@ -654,12 +653,12 @@ namespace lst { if (alpaka::math::abs(acc, dAlphaOuterMDSegment) >= dAlphaOuterMDSegmentThreshold) return false; return alpaka::math::abs(acc, dAlphaInnerMDOuterMD) < dAlphaInnerMDOuterMDThreshold; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runSegmentDefaultAlgo(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, + Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, uint16_t innerLowerModuleIndex, uint16_t outerLowerModuleIndex, unsigned int innerMDIndex, @@ -670,8 +669,8 @@ namespace lst { float& dPhiChange, float& dPhiChangeMin, float& dPhiChangeMax) { - if (modulesInGPU.subdets[innerLowerModuleIndex] == lst::Barrel and - modulesInGPU.subdets[outerLowerModuleIndex] == lst::Barrel) { + if (modulesInGPU.subdets[innerLowerModuleIndex] == ::lst::Barrel and + modulesInGPU.subdets[outerLowerModuleIndex] == ::lst::Barrel) { return runSegmentDefaultAlgoBarrel(acc, modulesInGPU, mdsInGPU, @@ -700,15 +699,15 @@ namespace 
lst { dPhiChangeMin, dPhiChangeMax); } - }; + } - struct createSegmentsInGPUv2 { + struct CreateSegmentsInGPUv2 { template ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - lst::MiniDoublets mdsInGPU, - lst::Segments segmentsInGPU, - lst::ObjectRanges rangesInGPU) const { + Modules modulesInGPU, + MiniDoublets mdsInGPU, + Segments segmentsInGPU, + ObjectRanges rangesInGPU) const { auto const globalBlockIdx = alpaka::getIdx(acc); auto const blockThreadIdx = alpaka::getIdx(acc); auto const gridBlockExtent = alpaka::getWorkDiv(acc); @@ -763,15 +762,15 @@ namespace lst { dPhiChange, dPhiChangeMin, dPhiChangeMax)) { - unsigned int totOccupancySegments = alpaka::atomicOp( - acc, &segmentsInGPU.totOccupancySegments[innerLowerModuleIndex], 1u); + unsigned int totOccupancySegments = alpaka::atomicAdd( + acc, &segmentsInGPU.totOccupancySegments[innerLowerModuleIndex], 1u, alpaka::hierarchy::Threads{}); if (static_cast(totOccupancySegments) >= rangesInGPU.segmentModuleOccupancy[innerLowerModuleIndex]) { #ifdef WARNINGS printf("Segment excess alert! 
Module index = %d\n", innerLowerModuleIndex); #endif } else { - unsigned int segmentModuleIdx = - alpaka::atomicOp(acc, &segmentsInGPU.nSegments[innerLowerModuleIndex], 1u); + unsigned int segmentModuleIdx = alpaka::atomicAdd( + acc, &segmentsInGPU.nSegments[innerLowerModuleIndex], 1u, alpaka::hierarchy::Threads{}); unsigned int segmentIdx = rangesInGPU.segmentModuleIndices[innerLowerModuleIndex] + segmentModuleIdx; addSegmentToMemory(segmentsInGPU, @@ -796,12 +795,16 @@ namespace lst { } }; - struct createSegmentArrayRanges { + struct CreateSegmentArrayRanges { template ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - lst::ObjectRanges rangesInGPU, - lst::MiniDoublets mdsInGPU) const { + Modules modulesInGPU, + ObjectRanges rangesInGPU, + MiniDoublets mdsInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -812,10 +815,10 @@ namespace lst { } alpaka::syncBlockThreads(acc); - // Initialize variables outside of the for loop. + // Create variables outside of the for loop. 
int occupancy, category_number, eta_number; - for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) { if (modulesInGPU.nConnectedModules[i] == 0) { rangesInGPU.segmentModuleIndices[i] = nTotalSegments; rangesInGPU.segmentModuleOccupancy[i] = 0; @@ -882,30 +885,34 @@ namespace lst { #endif } - int nTotSegs = alpaka::atomicOp(acc, &nTotalSegments, occupancy); + int nTotSegs = alpaka::atomicAdd(acc, &nTotalSegments, occupancy, alpaka::hierarchy::Threads{}); rangesInGPU.segmentModuleIndices[i] = nTotSegs; rangesInGPU.segmentModuleOccupancy[i] = occupancy; } // Wait for all threads to finish before reporting final values alpaka::syncBlockThreads(acc); - if (globalThreadIdx[2] == 0) { + if (cms::alpakatools::once_per_block(acc)) { rangesInGPU.segmentModuleIndices[*modulesInGPU.nLowerModules] = nTotalSegments; *rangesInGPU.device_nTotalSegs = nTotalSegments; } } }; - struct addSegmentRangesToEventExplicit { + struct AddSegmentRangesToEventExplicit { template ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - lst::Segments segmentsInGPU, - lst::ObjectRanges rangesInGPU) const { + Modules modulesInGPU, + Segments segmentsInGPU, + ObjectRanges rangesInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); - for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) { if (segmentsInGPU.nSegments[i] == 0) { rangesInGPU.segmentRanges[i * 2] = -1; rangesInGPU.segmentRanges[i * 2 + 1] = -1; @@ -917,14 +924,14 @@ namespace lst { } }; - struct 
addPixelSegmentToEventKernel { + struct AddPixelSegmentToEventKernel { template ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - lst::ObjectRanges rangesInGPU, - lst::Hits hitsInGPU, - lst::MiniDoublets mdsInGPU, - lst::Segments segmentsInGPU, + Modules modulesInGPU, + ObjectRanges rangesInGPU, + Hits hitsInGPU, + MiniDoublets mdsInGPU, + Segments segmentsInGPU, unsigned int* hitIndices0, unsigned int* hitIndices1, unsigned int* hitIndices2, @@ -1002,6 +1009,6 @@ namespace lst { } } }; -} // namespace lst +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::lst #endif diff --git a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h index ede4dd9471e8e..16f36df3257cd 100644 --- a/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h +++ b/RecoTracker/LSTCore/src/alpaka/TrackCandidate.h @@ -12,7 +12,7 @@ #include "Hit.h" #include "ObjectRanges.h" -namespace lst { +namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { struct TrackCandidates { short* trackCandidateType; // 4-T5 5-pT3 7-pT5 8-pLS unsigned int* directObjectIndices; // Will hold direct indices to each type containers @@ -34,23 +34,23 @@ namespace lst { template void setData(TBuff& buf) { - trackCandidateType = alpaka::getPtrNative(buf.trackCandidateType_buf); - directObjectIndices = alpaka::getPtrNative(buf.directObjectIndices_buf); - objectIndices = alpaka::getPtrNative(buf.objectIndices_buf); - nTrackCandidates = alpaka::getPtrNative(buf.nTrackCandidates_buf); - nTrackCandidatespT3 = alpaka::getPtrNative(buf.nTrackCandidatespT3_buf); - nTrackCandidatespT5 = alpaka::getPtrNative(buf.nTrackCandidatespT5_buf); - nTrackCandidatespLS = alpaka::getPtrNative(buf.nTrackCandidatespLS_buf); - nTrackCandidatesT5 = alpaka::getPtrNative(buf.nTrackCandidatesT5_buf); - - logicalLayers = alpaka::getPtrNative(buf.logicalLayers_buf); - hitIndices = alpaka::getPtrNative(buf.hitIndices_buf); - pixelSeedIndex = alpaka::getPtrNative(buf.pixelSeedIndex_buf); - 
lowerModuleIndices = alpaka::getPtrNative(buf.lowerModuleIndices_buf); - - centerX = alpaka::getPtrNative(buf.centerX_buf); - centerY = alpaka::getPtrNative(buf.centerY_buf); - radius = alpaka::getPtrNative(buf.radius_buf); + trackCandidateType = buf.trackCandidateType_buf.data(); + directObjectIndices = buf.directObjectIndices_buf.data(); + objectIndices = buf.objectIndices_buf.data(); + nTrackCandidates = buf.nTrackCandidates_buf.data(); + nTrackCandidatespT3 = buf.nTrackCandidatespT3_buf.data(); + nTrackCandidatespT5 = buf.nTrackCandidatespT5_buf.data(); + nTrackCandidatespLS = buf.nTrackCandidatespLS_buf.data(); + nTrackCandidatesT5 = buf.nTrackCandidatesT5_buf.data(); + + logicalLayers = buf.logicalLayers_buf.data(); + hitIndices = buf.hitIndices_buf.data(); + pixelSeedIndex = buf.pixelSeedIndex_buf.data(); + lowerModuleIndices = buf.lowerModuleIndices_buf.data(); + + centerX = buf.centerX_buf.data(); + centerY = buf.centerY_buf.data(); + radius = buf.radius_buf.data(); } }; @@ -102,14 +102,13 @@ namespace lst { alpaka::memset(queue, lowerModuleIndices_buf, 0u); alpaka::memset(queue, hitIndices_buf, 0u); alpaka::memset(queue, pixelSeedIndex_buf, 0); - alpaka::wait(queue); } inline TrackCandidates const* data() const { return &data_; } inline void setData(TrackCandidatesBuffer& buf) { data_.setData(buf); } }; - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addpLSTrackCandidateToMemory(lst::TrackCandidates& trackCandidatesInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addpLSTrackCandidateToMemory(TrackCandidates& trackCandidatesInGPU, unsigned int trackletIndex, unsigned int trackCandidateIndex, uint4 hitIndices, @@ -126,9 +125,9 @@ namespace lst { trackCandidatesInGPU.hitIndices[Params_pT5::kHits * trackCandidateIndex + 1] = hitIndices.z; trackCandidatesInGPU.hitIndices[Params_pT5::kHits * trackCandidateIndex + 2] = hitIndices.y; trackCandidatesInGPU.hitIndices[Params_pT5::kHits * trackCandidateIndex + 3] = hitIndices.w; - }; + } - ALPAKA_FN_ACC ALPAKA_FN_INLINE void 
addTrackCandidateToMemory(lst::TrackCandidates& trackCandidatesInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTrackCandidateToMemory(TrackCandidates& trackCandidatesInGPU, short trackCandidateType, unsigned int innerTrackletIndex, unsigned int outerTrackletIndex, @@ -163,13 +162,13 @@ namespace lst { trackCandidatesInGPU.centerX[trackCandidateIndex] = __F2H(centerX); trackCandidatesInGPU.centerY[trackCandidateIndex] = __F2H(centerY); trackCandidatesInGPU.radius[trackCandidateIndex] = __F2H(radius); - }; + } ALPAKA_FN_ACC ALPAKA_FN_INLINE int checkPixelHits(unsigned int ix, unsigned int jx, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, - lst::Hits const& hitsInGPU) { + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, + Hits const& hitsInGPU) { int phits1[Params_pLS::kHits]; int phits2[Params_pLS::kHits]; @@ -203,16 +202,16 @@ namespace lst { npMatched++; } return npMatched; - }; + } - struct crossCleanpT3 { + struct CrossCleanpT3 { template ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - lst::ObjectRanges rangesInGPU, - lst::PixelTriplets pixelTripletsInGPU, - lst::Segments segmentsInGPU, - lst::PixelQuintuplets pixelQuintupletsInGPU) const { + Modules modulesInGPU, + ObjectRanges rangesInGPU, + PixelTriplets pixelTripletsInGPU, + Segments segmentsInGPU, + PixelQuintuplets pixelQuintupletsInGPU) const { auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -236,7 +235,7 @@ namespace lst { float eta2 = segmentsInGPU.eta[pLS_jx - prefix]; float phi2 = segmentsInGPU.phi[pLS_jx - prefix]; float dEta = alpaka::math::abs(acc, (eta1 - eta2)); - float dPhi = lst::calculate_dPhi(phi1, phi2); + float dPhi = calculate_dPhi(phi1, phi2); float dR2 = dEta * dEta + dPhi * dPhi; if (dR2 < 1e-5f) @@ -246,14 +245,14 @@ namespace lst { } }; - struct crossCleanT5 { + struct CrossCleanT5 { template ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules 
modulesInGPU, - lst::Quintuplets quintupletsInGPU, - lst::PixelQuintuplets pixelQuintupletsInGPU, - lst::PixelTriplets pixelTripletsInGPU, - lst::ObjectRanges rangesInGPU) const { + Modules modulesInGPU, + Quintuplets quintupletsInGPU, + PixelQuintuplets pixelQuintupletsInGPU, + PixelTriplets pixelTripletsInGPU, + ObjectRanges rangesInGPU) const { auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -289,7 +288,7 @@ namespace lst { } float dEta = alpaka::math::abs(acc, eta1 - eta2); - float dPhi = lst::calculate_dPhi(phi1, phi2); + float dPhi = calculate_dPhi(phi1, phi2); float dR2 = dEta * dEta + dPhi * dPhi; if (dR2 < 1e-3f) @@ -301,17 +300,17 @@ namespace lst { } }; - struct crossCleanpLS { + struct CrossCleanpLS { template ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - lst::ObjectRanges rangesInGPU, - lst::PixelTriplets pixelTripletsInGPU, - lst::TrackCandidates trackCandidatesInGPU, - lst::Segments segmentsInGPU, - lst::MiniDoublets mdsInGPU, - lst::Hits hitsInGPU, - lst::Quintuplets quintupletsInGPU) const { + Modules modulesInGPU, + ObjectRanges rangesInGPU, + PixelTriplets pixelTripletsInGPU, + TrackCandidates trackCandidatesInGPU, + Segments segmentsInGPU, + MiniDoublets mdsInGPU, + Hits hitsInGPU, + Quintuplets quintupletsInGPU) const { auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -337,7 +336,7 @@ namespace lst { float eta2 = __H2F(quintupletsInGPU.eta[quintupletIndex]); float phi2 = __H2F(quintupletsInGPU.phi[quintupletIndex]); float dEta = alpaka::math::abs(acc, eta1 - eta2); - float dPhi = lst::calculate_dPhi(phi1, phi2); + float dPhi = calculate_dPhi(phi1, phi2); float dR2 = dEta * dEta + dPhi * dPhi; if (dR2 < 1e-3f) @@ -354,7 +353,7 @@ namespace lst { float eta2 = __H2F(pixelTripletsInGPU.eta_pix[pT3Index]); float phi2 = __H2F(pixelTripletsInGPU.phi_pix[pT3Index]); float dEta = alpaka::math::abs(acc, 
eta1 - eta2); - float dPhi = lst::calculate_dPhi(phi1, phi2); + float dPhi = calculate_dPhi(phi1, phi2); float dR2 = dEta * dEta + dPhi * dPhi; if (dR2 < 0.000001f) @@ -371,7 +370,7 @@ namespace lst { float eta2 = segmentsInGPU.eta[pLSIndex - prefix]; float phi2 = segmentsInGPU.phi[pLSIndex - prefix]; float dEta = alpaka::math::abs(acc, eta1 - eta2); - float dPhi = lst::calculate_dPhi(phi1, phi2); + float dPhi = calculate_dPhi(phi1, phi2); float dR2 = dEta * dEta + dPhi * dPhi; if (dR2 < 0.000001f) @@ -382,36 +381,40 @@ namespace lst { } }; - struct addpT3asTrackCandidatesInGPU { + struct AddpT3asTrackCandidatesInGPU { template ALPAKA_FN_ACC void operator()(TAcc const& acc, uint16_t nLowerModules, - lst::PixelTriplets pixelTripletsInGPU, - lst::TrackCandidates trackCandidatesInGPU, - lst::Segments segmentsInGPU, - lst::ObjectRanges rangesInGPU) const { + PixelTriplets pixelTripletsInGPU, + TrackCandidates trackCandidatesInGPU, + Segments segmentsInGPU, + ObjectRanges rangesInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); unsigned int nPixelTriplets = *pixelTripletsInGPU.nPixelTriplets; unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[nLowerModules]; - for (unsigned int pixelTripletIndex = globalThreadIdx[2]; pixelTripletIndex < nPixelTriplets; - pixelTripletIndex += gridThreadExtent[2]) { + for (unsigned int pixelTripletIndex = globalThreadIdx[0]; pixelTripletIndex < nPixelTriplets; + pixelTripletIndex += gridThreadExtent[0]) { if ((pixelTripletsInGPU.isDup[pixelTripletIndex])) continue; unsigned int trackCandidateIdx = - alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidates, 1u); + alpaka::atomicAdd(acc, trackCandidatesInGPU.nTrackCandidates, 1u, alpaka::hierarchy::Threads{}); if (trackCandidateIdx >= 
n_max_pixel_track_candidates) // This is done before any non-pixel TCs are added { #ifdef WARNINGS printf("Track Candidate excess alert! Type = pT3"); #endif - alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidates, 1u); + alpaka::atomicSub(acc, trackCandidatesInGPU.nTrackCandidates, 1u, alpaka::hierarchy::Threads{}); break; } else { - alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidatespT3, 1u); + alpaka::atomicAdd(acc, trackCandidatesInGPU.nTrackCandidatespT3, 1u, alpaka::hierarchy::Threads{}); float radius = 0.5f * (__H2F(pixelTripletsInGPU.pixelRadius[pixelTripletIndex]) + __H2F(pixelTripletsInGPU.tripletRadius[pixelTripletIndex])); @@ -434,13 +437,13 @@ namespace lst { } }; - struct addT5asTrackCandidateInGPU { + struct AddT5asTrackCandidateInGPU { template ALPAKA_FN_ACC void operator()(TAcc const& acc, uint16_t nLowerModules, - lst::Quintuplets quintupletsInGPU, - lst::TrackCandidates trackCandidatesInGPU, - lst::ObjectRanges rangesInGPU) const { + Quintuplets quintupletsInGPU, + TrackCandidates trackCandidatesInGPU, + ObjectRanges rangesInGPU) const { auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -457,7 +460,7 @@ namespace lst { continue; unsigned int trackCandidateIdx = - alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidates, 1u); + alpaka::atomicAdd(acc, trackCandidatesInGPU.nTrackCandidates, 1u, alpaka::hierarchy::Threads{}); if (trackCandidateIdx - *trackCandidatesInGPU.nTrackCandidatespT5 - *trackCandidatesInGPU.nTrackCandidatespT3 >= n_max_nonpixel_track_candidates) // pT5 and pT3 TCs have been added, but not pLS TCs @@ -465,10 +468,10 @@ namespace lst { #ifdef WARNINGS printf("Track Candidate excess alert! 
Type = T5"); #endif - alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidates, 1u); + alpaka::atomicSub(acc, trackCandidatesInGPU.nTrackCandidates, 1u, alpaka::hierarchy::Threads{}); break; } else { - alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidatesT5, 1u); + alpaka::atomicAdd(acc, trackCandidatesInGPU.nTrackCandidatesT5, 1u, alpaka::hierarchy::Threads{}); addTrackCandidateToMemory(trackCandidatesInGPU, 4 /*track candidate type T5=4*/, quintupletIndex, @@ -488,12 +491,12 @@ namespace lst { } }; - struct addpLSasTrackCandidateInGPU { + struct AddpLSasTrackCandidateInGPU { template ALPAKA_FN_ACC void operator()(TAcc const& acc, uint16_t nLowerModules, - lst::TrackCandidates trackCandidatesInGPU, - lst::Segments segmentsInGPU, + TrackCandidates trackCandidatesInGPU, + Segments segmentsInGPU, bool tc_pls_triplets) const { auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -505,18 +508,18 @@ namespace lst { continue; unsigned int trackCandidateIdx = - alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidates, 1u); + alpaka::atomicAdd(acc, trackCandidatesInGPU.nTrackCandidates, 1u, alpaka::hierarchy::Threads{}); if (trackCandidateIdx - *trackCandidatesInGPU.nTrackCandidatesT5 >= n_max_pixel_track_candidates) // T5 TCs have already been added { #ifdef WARNINGS printf("Track Candidate excess alert! 
Type = pLS"); #endif - alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidates, 1u); + alpaka::atomicSub(acc, trackCandidatesInGPU.nTrackCandidates, 1u, alpaka::hierarchy::Threads{}); break; } else { - alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidatespLS, 1u); + alpaka::atomicAdd(acc, trackCandidatesInGPU.nTrackCandidatespLS, 1u, alpaka::hierarchy::Threads{}); addpLSTrackCandidateToMemory(trackCandidatesInGPU, pixelArrayIndex, trackCandidateIdx, @@ -527,36 +530,40 @@ namespace lst { } }; - struct addpT5asTrackCandidateInGPU { + struct AddpT5asTrackCandidateInGPU { template ALPAKA_FN_ACC void operator()(TAcc const& acc, uint16_t nLowerModules, - lst::PixelQuintuplets pixelQuintupletsInGPU, - lst::TrackCandidates trackCandidatesInGPU, - lst::Segments segmentsInGPU, - lst::ObjectRanges rangesInGPU) const { + PixelQuintuplets pixelQuintupletsInGPU, + TrackCandidates trackCandidatesInGPU, + Segments segmentsInGPU, + ObjectRanges rangesInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); int nPixelQuintuplets = *pixelQuintupletsInGPU.nPixelQuintuplets; unsigned int pLS_offset = rangesInGPU.segmentModuleIndices[nLowerModules]; - for (int pixelQuintupletIndex = globalThreadIdx[2]; pixelQuintupletIndex < nPixelQuintuplets; - pixelQuintupletIndex += gridThreadExtent[2]) { + for (int pixelQuintupletIndex = globalThreadIdx[0]; pixelQuintupletIndex < nPixelQuintuplets; + pixelQuintupletIndex += gridThreadExtent[0]) { if (pixelQuintupletsInGPU.isDup[pixelQuintupletIndex]) continue; unsigned int trackCandidateIdx = - alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidates, 1u); + alpaka::atomicAdd(acc, trackCandidatesInGPU.nTrackCandidates, 1u, alpaka::hierarchy::Threads{}); if (trackCandidateIdx >= n_max_pixel_track_candidates) // No other 
TCs have been added yet { #ifdef WARNINGS printf("Track Candidate excess alert! Type = pT5"); #endif - alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidates, 1u); + alpaka::atomicSub(acc, trackCandidatesInGPU.nTrackCandidates, 1u, alpaka::hierarchy::Threads{}); break; } else { - alpaka::atomicOp(acc, trackCandidatesInGPU.nTrackCandidatespT5, 1u); + alpaka::atomicAdd(acc, trackCandidatesInGPU.nTrackCandidatespT5, 1u, alpaka::hierarchy::Threads{}); float radius = 0.5f * (__H2F(pixelQuintupletsInGPU.pixelRadius[pixelQuintupletIndex]) + __H2F(pixelQuintupletsInGPU.quintupletRadius[pixelQuintupletIndex])); @@ -579,5 +586,5 @@ namespace lst { } } }; -} // namespace lst +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::lst #endif diff --git a/RecoTracker/LSTCore/src/alpaka/Triplet.h b/RecoTracker/LSTCore/src/alpaka/Triplet.h index f5a216724c1da..c5ac8bda543d8 100644 --- a/RecoTracker/LSTCore/src/alpaka/Triplet.h +++ b/RecoTracker/LSTCore/src/alpaka/Triplet.h @@ -11,7 +11,7 @@ #include "Hit.h" #include "ObjectRanges.h" -namespace lst { +namespace ALPAKA_ACCELERATOR_NAMESPACE::lst { struct Triplets { unsigned int* segmentIndices; uint16_t* lowerModuleIndices; //3 of them @@ -36,24 +36,24 @@ namespace lst { #endif template void setData(TBuff& buf) { - segmentIndices = alpaka::getPtrNative(buf.segmentIndices_buf); - lowerModuleIndices = alpaka::getPtrNative(buf.lowerModuleIndices_buf); - nTriplets = alpaka::getPtrNative(buf.nTriplets_buf); - totOccupancyTriplets = alpaka::getPtrNative(buf.totOccupancyTriplets_buf); - nMemoryLocations = alpaka::getPtrNative(buf.nMemoryLocations_buf); - logicalLayers = alpaka::getPtrNative(buf.logicalLayers_buf); - hitIndices = alpaka::getPtrNative(buf.hitIndices_buf); - betaIn = alpaka::getPtrNative(buf.betaIn_buf); - circleRadius = alpaka::getPtrNative(buf.circleRadius_buf); - circleCenterX = alpaka::getPtrNative(buf.circleCenterX_buf); - circleCenterY = alpaka::getPtrNative(buf.circleCenterY_buf); - partOfPT5 = 
alpaka::getPtrNative(buf.partOfPT5_buf); - partOfT5 = alpaka::getPtrNative(buf.partOfT5_buf); - partOfPT3 = alpaka::getPtrNative(buf.partOfPT3_buf); + segmentIndices = buf.segmentIndices_buf.data(); + lowerModuleIndices = buf.lowerModuleIndices_buf.data(); + nTriplets = buf.nTriplets_buf.data(); + totOccupancyTriplets = buf.totOccupancyTriplets_buf.data(); + nMemoryLocations = buf.nMemoryLocations_buf.data(); + logicalLayers = buf.logicalLayers_buf.data(); + hitIndices = buf.hitIndices_buf.data(); + betaIn = buf.betaIn_buf.data(); + circleRadius = buf.circleRadius_buf.data(); + circleCenterX = buf.circleCenterX_buf.data(); + circleCenterY = buf.circleCenterY_buf.data(); + partOfPT5 = buf.partOfPT5_buf.data(); + partOfT5 = buf.partOfT5_buf.data(); + partOfPT3 = buf.partOfPT3_buf.data(); #ifdef CUT_VALUE_DEBUG - zOut = alpaka::getPtrNative(buf.zOut_buf); - rtOut = alpaka::getPtrNative(buf.rtOut_buf); - betaInCut = alpaka::getPtrNative(buf.betaInCut_buf); + zOut = buf.zOut_buf.data(); + rtOut = buf.rtOut_buf.data(); + betaInCut = buf.betaInCut_buf.data(); #endif } }; @@ -136,10 +136,10 @@ namespace lst { }; #ifdef CUT_VALUE_DEBUG - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, - lst::Triplets& tripletsInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, + Triplets& tripletsInGPU, unsigned int innerSegmentIndex, unsigned int outerSegmentIndex, uint16_t innerInnerLowerModuleIndex, @@ -154,10 +154,10 @@ namespace lst { float circleCenterY, unsigned int tripletIndex) #else - ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, - lst::Triplets& tripletsInGPU, + ALPAKA_FN_ACC ALPAKA_FN_INLINE void addTripletToMemory(Modules const& modulesInGPU, + 
MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, + Triplets& tripletsInGPU, unsigned int innerSegmentIndex, unsigned int outerSegmentIndex, uint16_t innerInnerLowerModuleIndex, @@ -202,13 +202,13 @@ namespace lst { tripletsInGPU.rtOut[tripletIndex] = rtOut; tripletsInGPU.betaInCut[tripletIndex] = betaInCut; #endif - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passRZConstraint(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, + Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, uint16_t innerInnerLowerModuleIndex, uint16_t middleLowerModuleIndex, uint16_t outerOuterLowerModuleIndex, @@ -262,13 +262,13 @@ namespace lst { } else { return alpaka::math::abs(acc, residual) < 5; } - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBB(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, + Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, uint16_t innerInnerLowerModuleIndex, uint16_t middleLowerModuleIndex, uint16_t outerOuterLowerModuleIndex, @@ -280,8 +280,8 @@ namespace lst { unsigned int innerSegmentIndex, float& betaIn, float& betaInCut) { - bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == lst::PS); - bool isPSOut = (modulesInGPU.moduleType[outerOuterLowerModuleIndex] == lst::PS); + bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == ::lst::PS); + bool isPSOut = (modulesInGPU.moduleType[outerOuterLowerModuleIndex] == ::lst::PS); float rtIn = mdsInGPU.anchorRt[firstMDIndex]; float rtMid = mdsInGPU.anchorRt[secondMDIndex]; @@ -291,17 +291,16 @@ namespace lst { float zMid = mdsInGPU.anchorZ[secondMDIndex]; zOut = mdsInGPU.anchorZ[thirdMDIndex]; - float alpha1GeVOut = - alpaka::math::asin(acc, alpaka::math::min(acc, rtOut * lst::k2Rinv1GeVf / 
lst::ptCut, lst::kSinAlphaMax)); + float alpha1GeVOut = alpaka::math::asin(acc, alpaka::math::min(acc, rtOut * k2Rinv1GeVf / ptCut, kSinAlphaMax)); float rtRatio_OutIn = rtOut / rtIn; // Outer segment beginning rt divided by inner segment beginning rt; float dzDrtScale = alpaka::math::tan(acc, alpha1GeVOut) / alpha1GeVOut; // The track can bend in r-z plane slightly - float zpitchIn = (isPSIn ? lst::kPixelPSZpitch : lst::kStrip2SZpitch); - float zpitchOut = (isPSOut ? lst::kPixelPSZpitch : lst::kStrip2SZpitch); + float zpitchIn = (isPSIn ? kPixelPSZpitch : kStrip2SZpitch); + float zpitchOut = (isPSOut ? kPixelPSZpitch : kStrip2SZpitch); const float zHi = - zIn + (zIn + lst::kDeltaZLum) * (rtRatio_OutIn - 1.f) * (zIn < 0.f ? 1.f : dzDrtScale) + (zpitchIn + zpitchOut); - const float zLo = zIn + (zIn - lst::kDeltaZLum) * (rtRatio_OutIn - 1.f) * (zIn > 0.f ? 1.f : dzDrtScale) - + zIn + (zIn + kDeltaZLum) * (rtRatio_OutIn - 1.f) * (zIn < 0.f ? 1.f : dzDrtScale) + (zpitchIn + zpitchOut); + const float zLo = zIn + (zIn - kDeltaZLum) * (rtRatio_OutIn - 1.f) * (zIn > 0.f ? 
1.f : dzDrtScale) - (zpitchIn + zpitchOut); //slope-correction only on outer end //Cut 1 - z compatibility @@ -320,15 +319,14 @@ namespace lst { float dzErr = (zpitchIn + zpitchOut) * (zpitchIn + zpitchOut) * 2.f; float thetaMuls2 = (kMulsInGeV * kMulsInGeV) * (0.1f + 0.2f * (rtOut - rtIn) / 50.f) * (r3In / rtIn); - float muls2 = thetaMuls2 * 9.f / (lst::ptCut * lst::ptCut) * 16.f; + float muls2 = thetaMuls2 * 9.f / (ptCut * ptCut) * 16.f; dzErr += muls2 * drt_OutIn * drt_OutIn / 3.f * coshEta * coshEta; dzErr = alpaka::math::sqrt(acc, dzErr); // Constructing upper and lower bound const float dzMean = dz_InSeg / drt_InSeg * drt_OutIn; - const float zWindow = - dzErr / drt_InSeg * drt_OutIn + - (zpitchIn + zpitchOut); //FIXME for lst::ptCut lower than ~0.8 need to add curv path correction + const float zWindow = dzErr / drt_InSeg * drt_OutIn + + (zpitchIn + zpitchOut); //FIXME for ptCut lower than ~0.8 need to add curv path correction const float zLoPointed = zIn + dzMean * (zIn > 0.f ? 1.f : dzDrtScale) - zWindow; const float zHiPointed = zIn + dzMean * (zIn < 0.f ? 
1.f : dzDrtScale) + zWindow; @@ -342,7 +340,7 @@ namespace lst { float alpha_InLo = __H2F(segmentsInGPU.dPhiChanges[innerSegmentIndex]); float tl_axis_x = mdsInGPU.anchorX[thirdMDIndex] - mdsInGPU.anchorX[firstMDIndex]; float tl_axis_y = mdsInGPU.anchorY[thirdMDIndex] - mdsInGPU.anchorY[firstMDIndex]; - betaIn = alpha_InLo - lst::phi_mpi_pi(acc, lst::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]); + betaIn = alpha_InLo - phi_mpi_pi(acc, phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]); //beta computation float drt_tl_axis = alpaka::math::sqrt(acc, tl_axis_x * tl_axis_x + tl_axis_y * tl_axis_y); @@ -355,19 +353,18 @@ namespace lst { (mdsInGPU.anchorY[secondMDIndex] - mdsInGPU.anchorY[firstMDIndex]) * (mdsInGPU.anchorY[secondMDIndex] - mdsInGPU.anchorY[firstMDIndex])); betaInCut = - alpaka::math::asin( - acc, alpaka::math::min(acc, (-rt_InSeg + drt_tl_axis) * lst::k2Rinv1GeVf / lst::ptCut, lst::kSinAlphaMax)) + + alpaka::math::asin(acc, alpaka::math::min(acc, (-rt_InSeg + drt_tl_axis) * k2Rinv1GeVf / ptCut, kSinAlphaMax)) + (0.02f / drt_InSeg); //Cut #3: first beta cut return alpaka::math::abs(acc, betaIn) < betaInCut; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintBBE(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, + Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, uint16_t innerInnerLowerModuleIndex, uint16_t middleLowerModuleIndex, uint16_t outerOuterLowerModuleIndex, @@ -381,8 +378,8 @@ namespace lst { unsigned int outerSegmentIndex, float& betaIn, float& betaInCut) { - bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == lst::PS); - bool isPSOut = (modulesInGPU.moduleType[outerOuterLowerModuleIndex] == lst::PS); + bool isPSIn = (modulesInGPU.moduleType[innerInnerLowerModuleIndex] == ::lst::PS); + bool isPSOut = 
(modulesInGPU.moduleType[outerOuterLowerModuleIndex] == ::lst::PS); float rtIn = mdsInGPU.anchorRt[firstMDIndex]; float rtMid = mdsInGPU.anchorRt[secondMDIndex]; @@ -392,22 +389,21 @@ namespace lst { float zMid = mdsInGPU.anchorZ[secondMDIndex]; zOut = mdsInGPU.anchorZ[thirdMDIndex]; - float alpha1GeV_OutLo = - alpaka::math::asin(acc, alpaka::math::min(acc, rtOut * lst::k2Rinv1GeVf / lst::ptCut, lst::kSinAlphaMax)); + float alpha1GeV_OutLo = alpaka::math::asin(acc, alpaka::math::min(acc, rtOut * k2Rinv1GeVf / ptCut, kSinAlphaMax)); float dzDrtScale = alpaka::math::tan(acc, alpha1GeV_OutLo) / alpha1GeV_OutLo; // The track can bend in r-z plane slightly - float zpitchIn = (isPSIn ? lst::kPixelPSZpitch : lst::kStrip2SZpitch); - float zpitchOut = (isPSOut ? lst::kPixelPSZpitch : lst::kStrip2SZpitch); + float zpitchIn = (isPSIn ? kPixelPSZpitch : kStrip2SZpitch); + float zpitchOut = (isPSOut ? kPixelPSZpitch : kStrip2SZpitch); float zGeom = zpitchIn + zpitchOut; // Cut #0: Preliminary (Only here in endcap case) if (zIn * zOut <= 0) return false; - float dLum = alpaka::math::copysign(acc, lst::kDeltaZLum, zIn); - bool isOutSgInnerMDPS = modulesInGPU.moduleType[outerOuterLowerModuleIndex] == lst::PS; - float rtGeom1 = isOutSgInnerMDPS ? lst::kPixelPSZpitch : lst::kStrip2SZpitch; + float dLum = alpaka::math::copysign(acc, kDeltaZLum, zIn); + bool isOutSgInnerMDPS = modulesInGPU.moduleType[outerOuterLowerModuleIndex] == ::lst::PS; + float rtGeom1 = isOutSgInnerMDPS ? 
kPixelPSZpitch : kStrip2SZpitch; float zGeom1 = alpaka::math::copysign(acc, zGeom, zIn); float rtLo = rtIn * (1.f + (zOut - zIn - zGeom1) / (zIn + zGeom1 + dLum) / dzDrtScale) - rtGeom1; //slope correction only on the lower end @@ -433,12 +429,12 @@ namespace lst { const float coshEta = dr3SDIn / drtSDIn; //direction estimate const float dzOutInAbs = alpaka::math::abs(acc, zOut - zIn); const float multDzDr = dzOutInAbs * coshEta / (coshEta * coshEta - 1.f); - const float zGeom1_another = lst::kPixelPSZpitch; + const float zGeom1_another = kPixelPSZpitch; const float kZ = (zOut - zIn) / dzSDIn; float drtErr = zGeom1_another * zGeom1_another * drtSDIn * drtSDIn / dzSDIn / dzSDIn * (1.f - 2.f * kZ + 2.f * kZ * kZ); const float thetaMuls2 = (kMulsInGeV * kMulsInGeV) * (0.1f + 0.2 * (rtOut - rtIn) / 50.f) * (rIn / rtIn); - const float muls2 = thetaMuls2 * 9.f / (lst::ptCut * lst::ptCut) * 16.f; + const float muls2 = thetaMuls2 * 9.f / (ptCut * ptCut) * 16.f; drtErr += muls2 * multDzDr * multDzDr / 3.f * coshEta * coshEta; drtErr = alpaka::math::sqrt(acc, drtErr); @@ -455,7 +451,7 @@ namespace lst { float tl_axis_x = mdsInGPU.anchorX[thirdMDIndex] - mdsInGPU.anchorX[firstMDIndex]; float tl_axis_y = mdsInGPU.anchorY[thirdMDIndex] - mdsInGPU.anchorY[firstMDIndex]; - betaIn = sdIn_alpha - lst::phi_mpi_pi(acc, lst::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]); + betaIn = sdIn_alpha - phi_mpi_pi(acc, phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]); float betaInRHmin = betaIn; float betaInRHmax = betaIn; @@ -476,19 +472,18 @@ namespace lst { float sdIn_d = rt_InOut - rt_InLo; float dr = alpaka::math::sqrt(acc, tl_axis_x * tl_axis_x + tl_axis_y * tl_axis_y); - betaInCut = alpaka::math::asin( - acc, alpaka::math::min(acc, (-sdIn_dr + dr) * lst::k2Rinv1GeVf / lst::ptCut, lst::kSinAlphaMax)) + + betaInCut = alpaka::math::asin(acc, alpaka::math::min(acc, (-sdIn_dr + dr) * k2Rinv1GeVf / ptCut, kSinAlphaMax)) + (0.02f / sdIn_d); //Cut #4: 
first beta cut return alpaka::math::abs(acc, betaInRHmin) < betaInCut; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraintEEE(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, + Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, uint16_t innerInnerLowerModuleIndex, uint16_t middleLowerModuleIndex, uint16_t outerOuterLowerModuleIndex, @@ -509,8 +504,7 @@ namespace lst { float zMid = mdsInGPU.anchorZ[secondMDIndex]; zOut = mdsInGPU.anchorZ[thirdMDIndex]; - float alpha1GeV_Out = - alpaka::math::asin(acc, alpaka::math::min(acc, rtOut * lst::k2Rinv1GeVf / lst::ptCut, lst::kSinAlphaMax)); + float alpha1GeV_Out = alpaka::math::asin(acc, alpaka::math::min(acc, rtOut * k2Rinv1GeVf / ptCut, kSinAlphaMax)); float dzDrtScale = alpaka::math::tan(acc, alpha1GeV_Out) / alpha1GeV_Out; // The track can bend in r-z plane slightly @@ -519,13 +513,13 @@ namespace lst { if (zIn * zOut <= 0) return false; - float dLum = alpaka::math::copysign(acc, lst::kDeltaZLum, zIn); - bool isOutSgOuterMDPS = modulesInGPU.moduleType[outerOuterLowerModuleIndex] == lst::PS; - bool isInSgInnerMDPS = modulesInGPU.moduleType[innerInnerLowerModuleIndex] == lst::PS; + float dLum = alpaka::math::copysign(acc, kDeltaZLum, zIn); + bool isOutSgOuterMDPS = modulesInGPU.moduleType[outerOuterLowerModuleIndex] == ::lst::PS; + bool isInSgInnerMDPS = modulesInGPU.moduleType[innerInnerLowerModuleIndex] == ::lst::PS; - float rtGeom = (isInSgInnerMDPS and isOutSgOuterMDPS) ? 2.f * lst::kPixelPSZpitch - : (isInSgInnerMDPS or isOutSgOuterMDPS) ? lst::kPixelPSZpitch + lst::kStrip2SZpitch - : 2.f * lst::kStrip2SZpitch; + float rtGeom = (isInSgInnerMDPS and isOutSgOuterMDPS) ? 2.f * kPixelPSZpitch + : (isInSgInnerMDPS or isOutSgOuterMDPS) ? 
kPixelPSZpitch + kStrip2SZpitch + : 2.f * kStrip2SZpitch; float dz = zOut - zIn; const float rtLo = rtIn * (1.f + dz / (zIn + dLum) / dzDrtScale) - rtGeom; //slope correction only on the lower end @@ -535,7 +529,7 @@ namespace lst { if ((rtOut < rtLo) || (rtOut > rtHi)) return false; - bool isInSgOuterMDPS = modulesInGPU.moduleType[outerOuterLowerModuleIndex] == lst::PS; + bool isInSgOuterMDPS = modulesInGPU.moduleType[outerOuterLowerModuleIndex] == ::lst::PS; float drtSDIn = rtMid - rtIn; float dzSDIn = zMid - zIn; @@ -549,12 +543,12 @@ namespace lst { float kZ = (zOut - zIn) / dzSDIn; float thetaMuls2 = (kMulsInGeV * kMulsInGeV) * (0.1f + 0.2f * (rtOut - rtIn) / 50.f); - float muls2 = thetaMuls2 * 9.f / (lst::ptCut * lst::ptCut) * 16.f; + float muls2 = thetaMuls2 * 9.f / (ptCut * ptCut) * 16.f; - float drtErr = alpaka::math::sqrt( - acc, - lst::kPixelPSZpitch * lst::kPixelPSZpitch * 2.f / (dzSDIn * dzSDIn) * (dzOutInAbs * dzOutInAbs) + - muls2 * multDzDr * multDzDr / 3.f * coshEta * coshEta); + float drtErr = + alpaka::math::sqrt(acc, + kPixelPSZpitch * kPixelPSZpitch * 2.f / (dzSDIn * dzSDIn) * (dzOutInAbs * dzOutInAbs) + + muls2 * multDzDr * multDzDr / 3.f * coshEta * coshEta); float drtMean = drtSDIn * dzOutInAbs / alpaka::math::abs(acc, dzSDIn); float rtWindow = drtErr + rtGeom; @@ -577,7 +571,7 @@ namespace lst { float tl_axis_x = mdsInGPU.anchorX[thirdMDIndex] - mdsInGPU.anchorX[firstMDIndex]; float tl_axis_y = mdsInGPU.anchorY[thirdMDIndex] - mdsInGPU.anchorY[firstMDIndex]; - betaIn = sdIn_alpha - lst::phi_mpi_pi(acc, lst::phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]); + betaIn = sdIn_alpha - phi_mpi_pi(acc, phi(acc, tl_axis_x, tl_axis_y) - mdsInGPU.anchorPhi[firstMDIndex]); float sdIn_alphaRHmin = __H2F(segmentsInGPU.dPhiChangeMins[innerSegmentIndex]); float sdIn_alphaRHmax = __H2F(segmentsInGPU.dPhiChangeMaxs[innerSegmentIndex]); @@ -599,19 +593,18 @@ namespace lst { float sdIn_d = rt_InOut - rt_InLo; float dr = 
alpaka::math::sqrt(acc, tl_axis_x * tl_axis_x + tl_axis_y * tl_axis_y); - betaInCut = alpaka::math::asin( - acc, alpaka::math::min(acc, (-sdIn_dr + dr) * lst::k2Rinv1GeVf / lst::ptCut, lst::kSinAlphaMax)) + + betaInCut = alpaka::math::asin(acc, alpaka::math::min(acc, (-sdIn_dr + dr) * k2Rinv1GeVf / ptCut, kSinAlphaMax)) + (0.02f / sdIn_d); //Cut #4: first beta cut return alpaka::math::abs(acc, betaInRHmin) < betaInCut; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool passPointingConstraint(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, + Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, uint16_t innerInnerLowerModuleIndex, uint16_t middleLowerModuleIndex, uint16_t outerOuterLowerModuleIndex, @@ -629,8 +622,8 @@ namespace lst { short middleLowerModuleSubdet = modulesInGPU.subdets[middleLowerModuleIndex]; short outerOuterLowerModuleSubdet = modulesInGPU.subdets[outerOuterLowerModuleIndex]; - if (innerInnerLowerModuleSubdet == lst::Barrel and middleLowerModuleSubdet == lst::Barrel and - outerOuterLowerModuleSubdet == lst::Barrel) { + if (innerInnerLowerModuleSubdet == ::lst::Barrel and middleLowerModuleSubdet == ::lst::Barrel and + outerOuterLowerModuleSubdet == ::lst::Barrel) { return passPointingConstraintBBB(acc, modulesInGPU, mdsInGPU, @@ -646,8 +639,8 @@ namespace lst { innerSegmentIndex, betaIn, betaInCut); - } else if (innerInnerLowerModuleSubdet == lst::Barrel and middleLowerModuleSubdet == lst::Barrel and - outerOuterLowerModuleSubdet == lst::Endcap) { + } else if (innerInnerLowerModuleSubdet == ::lst::Barrel and middleLowerModuleSubdet == ::lst::Barrel and + outerOuterLowerModuleSubdet == ::lst::Endcap) { return passPointingConstraintBBE(acc, modulesInGPU, mdsInGPU, @@ -665,8 +658,8 @@ namespace lst { outerSegmentIndex, betaIn, betaInCut); - } else if (innerInnerLowerModuleSubdet == lst::Barrel and middleLowerModuleSubdet == 
lst::Endcap and - outerOuterLowerModuleSubdet == lst::Endcap) { + } else if (innerInnerLowerModuleSubdet == ::lst::Barrel and middleLowerModuleSubdet == ::lst::Endcap and + outerOuterLowerModuleSubdet == ::lst::Endcap) { return passPointingConstraintBBE(acc, modulesInGPU, mdsInGPU, @@ -687,8 +680,8 @@ namespace lst { } - else if (innerInnerLowerModuleSubdet == lst::Endcap and middleLowerModuleSubdet == lst::Endcap and - outerOuterLowerModuleSubdet == lst::Endcap) { + else if (innerInnerLowerModuleSubdet == ::lst::Endcap and middleLowerModuleSubdet == ::lst::Endcap and + outerOuterLowerModuleSubdet == ::lst::Endcap) { return passPointingConstraintEEE(acc, modulesInGPU, mdsInGPU, @@ -707,7 +700,7 @@ namespace lst { betaInCut); } return false; // failsafe - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE float computeRadiusFromThreeAnchorHits( @@ -740,13 +733,13 @@ namespace lst { radius = alpaka::math::sqrt(acc, g * g + f * f - c); return radius; - }; + } template ALPAKA_FN_ACC ALPAKA_FN_INLINE bool runTripletConstraintsAndAlgo(TAcc const& acc, - lst::Modules const& modulesInGPU, - lst::MiniDoublets const& mdsInGPU, - lst::Segments const& segmentsInGPU, + Modules const& modulesInGPU, + MiniDoublets const& mdsInGPU, + Segments const& segmentsInGPU, uint16_t innerInnerLowerModuleIndex, uint16_t middleLowerModuleIndex, uint16_t outerOuterLowerModuleIndex, @@ -806,16 +799,16 @@ namespace lst { circleRadius = computeRadiusFromThreeAnchorHits(acc, x1, y1, x2, y2, x3, y3, circleCenterX, circleCenterY); return true; - }; + } - struct createTripletsInGPUv2 { + struct CreateTripletsInGPUv2 { template ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - lst::MiniDoublets mdsInGPU, - lst::Segments segmentsInGPU, - lst::Triplets tripletsInGPU, - lst::ObjectRanges rangesInGPU, + Modules modulesInGPU, + MiniDoublets mdsInGPU, + Segments segmentsInGPU, + Triplets tripletsInGPU, + ObjectRanges rangesInGPU, uint16_t* index_gpu, uint16_t nonZeroModules) const { 
auto const globalThreadIdx = alpaka::getIdx(acc); @@ -868,16 +861,19 @@ namespace lst { circleCenterY); if (success) { - unsigned int totOccupancyTriplets = alpaka::atomicOp( - acc, &tripletsInGPU.totOccupancyTriplets[innerInnerLowerModuleIndex], 1u); + unsigned int totOccupancyTriplets = + alpaka::atomicAdd(acc, + &tripletsInGPU.totOccupancyTriplets[innerInnerLowerModuleIndex], + 1u, + alpaka::hierarchy::Threads{}); if (static_cast(totOccupancyTriplets) >= rangesInGPU.tripletModuleOccupancy[innerInnerLowerModuleIndex]) { #ifdef WARNINGS printf("Triplet excess alert! Module index = %d\n", innerInnerLowerModuleIndex); #endif } else { - unsigned int tripletModuleIndex = - alpaka::atomicOp(acc, &tripletsInGPU.nTriplets[innerInnerLowerModuleIndex], 1u); + unsigned int tripletModuleIndex = alpaka::atomicAdd( + acc, &tripletsInGPU.nTriplets[innerInnerLowerModuleIndex], 1u, alpaka::hierarchy::Threads{}); unsigned int tripletIndex = rangesInGPU.tripletModuleIndices[innerInnerLowerModuleIndex] + tripletModuleIndex; #ifdef CUT_VALUE_DEBUG @@ -922,12 +918,16 @@ namespace lst { } }; - struct createTripletArrayRanges { + struct CreateTripletArrayRanges { template ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - lst::ObjectRanges rangesInGPU, - lst::Segments segmentsInGPU) const { + Modules modulesInGPU, + ObjectRanges rangesInGPU, + Segments segmentsInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); @@ -938,10 +938,10 @@ namespace lst { } alpaka::syncBlockThreads(acc); - // Initialize variables outside of the for loop. + // Create variables outside of the for loop. 
int occupancy, category_number, eta_number; - for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) { if (segmentsInGPU.nSegments[i] == 0) { rangesInGPU.tripletModuleIndices[i] = nTotalTriplets; rangesInGPU.tripletModuleOccupancy[i] = 0; @@ -1009,28 +1009,32 @@ namespace lst { } rangesInGPU.tripletModuleOccupancy[i] = occupancy; - unsigned int nTotT = alpaka::atomicOp(acc, &nTotalTriplets, occupancy); + unsigned int nTotT = alpaka::atomicAdd(acc, &nTotalTriplets, occupancy, alpaka::hierarchy::Threads{}); rangesInGPU.tripletModuleIndices[i] = nTotT; } // Wait for all threads to finish before reporting final values alpaka::syncBlockThreads(acc); - if (globalThreadIdx[2] == 0) { + if (cms::alpakatools::once_per_block(acc)) { *rangesInGPU.device_nTotalTrips = nTotalTriplets; } } }; - struct addTripletRangesToEventExplicit { + struct AddTripletRangesToEventExplicit { template ALPAKA_FN_ACC void operator()(TAcc const& acc, - lst::Modules modulesInGPU, - lst::Triplets tripletsInGPU, - lst::ObjectRanges rangesInGPU) const { + Modules modulesInGPU, + Triplets tripletsInGPU, + ObjectRanges rangesInGPU) const { + // implementation is 1D with a single block + static_assert(std::is_same_v, "Should be Acc1D"); + ALPAKA_ASSERT_ACC((alpaka::getWorkDiv(acc)[0] == 1)); + auto const globalThreadIdx = alpaka::getIdx(acc); auto const gridThreadExtent = alpaka::getWorkDiv(acc); - for (uint16_t i = globalThreadIdx[2]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[2]) { + for (uint16_t i = globalThreadIdx[0]; i < *modulesInGPU.nLowerModules; i += gridThreadExtent[0]) { if (tripletsInGPU.nTriplets[i] == 0) { rangesInGPU.tripletRanges[i * 2] = -1; rangesInGPU.tripletRanges[i * 2 + 1] = -1; @@ -1041,5 +1045,5 @@ namespace lst { } } }; -} // namespace lst +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::lst #endif diff --git 
a/RecoTracker/LSTCore/standalone/bin/lst.cc b/RecoTracker/LSTCore/standalone/bin/lst.cc index e67fe5b62d269..060308e4dabab 100644 --- a/RecoTracker/LSTCore/standalone/bin/lst.cc +++ b/RecoTracker/LSTCore/standalone/bin/lst.cc @@ -2,8 +2,6 @@ #include -using namespace ALPAKA_ACCELERATOR_NAMESPACE; - //___________________________________________________________________________________________________________________________________________________________________________________________ int main(int argc, char **argv) { //******************************************************************************** @@ -255,7 +253,7 @@ int main(int argc, char **argv) { // Printing out the option settings overview std::cout << "=========================================================" << std::endl; - std::cout << " Running for Acc = " << alpaka::getAccName() << std::endl; + std::cout << " Running for Acc = " << alpaka::getAccName() << std::endl; std::cout << " Setting of the analysis job based on provided arguments " << std::endl; std::cout << "---------------------------------------------------------" << std::endl; std::cout << " ana.input_file_list_tstring: " << ana.input_file_list_tstring << std::endl; @@ -298,17 +296,18 @@ int main(int argc, char **argv) { //___________________________________________________________________________________________________________________________________________________________________________________________ void run_lst() { - Device devAcc = alpaka::getDevByIdx(ALPAKA_ACCELERATOR_NAMESPACE::Platform{}, 0u); - std::vector queues; + ALPAKA_ACCELERATOR_NAMESPACE::Device devAcc = alpaka::getDevByIdx(ALPAKA_ACCELERATOR_NAMESPACE::Platform{}, 0u); + std::vector queues; for (int s = 0; s < ana.streams; s++) { - queues.push_back(Queue(devAcc)); + queues.push_back(ALPAKA_ACCELERATOR_NAMESPACE::Queue(devAcc)); } // Load various maps used in the lst reconstruction TStopwatch full_timer; full_timer.Start(); auto hostESData = lst::loadAndFillESHost(); - 
auto deviceESData = cms::alpakatools::CopyToDevice>::copyAsync(queues[0], *hostESData.get()); + auto deviceESData = + cms::alpakatools::CopyToDevice>::copyAsync(queues[0], *hostESData.get()); float timeForMapLoading = full_timer.RealTime() * 1000; if (ana.do_write_ntuple) { @@ -384,9 +383,10 @@ void run_lst() { full_timer.Reset(); full_timer.Start(); - std::vector *> events; + std::vector events; for (int s = 0; s < ana.streams; s++) { - lst::Event *event = new lst::Event(ana.verbose >= 2, queues[s], &deviceESData); + ALPAKA_ACCELERATOR_NAMESPACE::lst::Event *event = + new ALPAKA_ACCELERATOR_NAMESPACE::lst::Event(ana.verbose >= 2, queues[s], &deviceESData); events.push_back(event); } float timeForEventCreation = full_timer.RealTime() * 1000; @@ -478,7 +478,7 @@ void run_lst() { // Clear this event TStopwatch my_timer; my_timer.Start(); - events.at(omp_get_thread_num())->resetEvent(); + events.at(omp_get_thread_num())->resetEventSync(); float timing_resetEvent = my_timer.RealTime(); timing_information.push_back({timing_input_loading, diff --git a/RecoTracker/LSTCore/standalone/code/core/AccessHelper.cc b/RecoTracker/LSTCore/standalone/code/core/AccessHelper.cc index 76cfa9760b71a..eb48917952a38 100644 --- a/RecoTracker/LSTCore/standalone/code/core/AccessHelper.cc +++ b/RecoTracker/LSTCore/standalone/code/core/AccessHelper.cc @@ -1,6 +1,6 @@ #include "AccessHelper.h" -using namespace ALPAKA_ACCELERATOR_NAMESPACE; +using namespace ALPAKA_ACCELERATOR_NAMESPACE::lst; // =============== // ----* Hit *---- @@ -8,8 +8,8 @@ using namespace ALPAKA_ACCELERATOR_NAMESPACE; //____________________________________________________________________________________________ std::tuple, std::vector> convertHitsToHitIdxsAndHitTypes( - lst::Event* event, std::vector hits) { - lst::Hits const* hitsEvt = event->getHits()->data(); + Event* event, std::vector hits) { + Hits const* hitsEvt = event->getHits()->data(); std::vector hitidxs; std::vector hittypes; for (auto& hit : hits) { @@ -27,11 
+27,11 @@ std::tuple, std::vector> convertHitsToHi // =============== //____________________________________________________________________________________________ -std::vector getPixelHitsFrompLS(lst::Event* event, unsigned int pLS) { - lst::Segments const* segments = event->getSegments()->data(); - lst::MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); - lst::ObjectRanges const* rangesEvt = event->getRanges()->data(); - lst::Modules const* modulesEvt = event->getModules()->data(); +std::vector getPixelHitsFrompLS(Event* event, unsigned int pLS) { + Segments const* segments = event->getSegments()->data(); + MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); + ObjectRanges const* rangesEvt = event->getRanges()->data(); + ::lst::Modules const* modulesEvt = event->getModules()->data(); const unsigned int pLS_offset = rangesEvt->segmentModuleIndices[*(modulesEvt->nLowerModules)]; unsigned int MD_1 = segments->mdIndices[2 * (pLS + pLS_offset)]; unsigned int MD_2 = segments->mdIndices[2 * (pLS + pLS_offset) + 1]; @@ -46,8 +46,8 @@ std::vector getPixelHitsFrompLS(lst::Event* event, unsigned } //____________________________________________________________________________________________ -std::vector getPixelHitIdxsFrompLS(lst::Event* event, unsigned int pLS) { - lst::Hits const* hitsEvt = event->getHits()->data(); +std::vector getPixelHitIdxsFrompLS(Event* event, unsigned int pLS) { + Hits const* hitsEvt = event->getHits()->data(); std::vector hits = getPixelHitsFrompLS(event, pLS); std::vector hitidxs; for (auto& hit : hits) @@ -56,14 +56,14 @@ std::vector getPixelHitIdxsFrompLS(lst::Event* event, unsig } //____________________________________________________________________________________________ -std::vector getPixelHitTypesFrompLS(lst::Event* event, unsigned int pLS) { +std::vector getPixelHitTypesFrompLS(Event* event, unsigned int pLS) { std::vector hits = getPixelHitsFrompLS(event, pLS); std::vector hittypes(hits.size(), 0); 
return hittypes; } //____________________________________________________________________________________________ -std::tuple, std::vector> getHitIdxsAndHitTypesFrompLS(lst::Event* event, +std::tuple, std::vector> getHitIdxsAndHitTypesFrompLS(Event* event, unsigned pLS) { return convertHitsToHitIdxsAndHitTypes(event, getPixelHitsFrompLS(event, pLS)); } @@ -73,15 +73,15 @@ std::tuple, std::vector> getHitIdxsAndHi // ============== //____________________________________________________________________________________________ -std::vector getHitsFromMD(lst::Event* event, unsigned int MD) { - lst::MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); +std::vector getHitsFromMD(Event* event, unsigned int MD) { + MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); unsigned int hit_1 = miniDoublets->anchorHitIndices[MD]; unsigned int hit_2 = miniDoublets->outerHitIndices[MD]; return {hit_1, hit_2}; } //____________________________________________________________________________________________ -std::tuple, std::vector> getHitIdxsAndHitTypesFromMD(lst::Event* event, +std::tuple, std::vector> getHitIdxsAndHitTypesFromMD(Event* event, unsigned MD) { return convertHitsToHitIdxsAndHitTypes(event, getHitsFromMD(event, MD)); } @@ -91,15 +91,15 @@ std::tuple, std::vector> getHitIdxsAndHi // ============== //____________________________________________________________________________________________ -std::vector getMDsFromLS(lst::Event* event, unsigned int LS) { - lst::Segments const* segments = event->getSegments()->data(); +std::vector getMDsFromLS(Event* event, unsigned int LS) { + Segments const* segments = event->getSegments()->data(); unsigned int MD_1 = segments->mdIndices[2 * LS]; unsigned int MD_2 = segments->mdIndices[2 * LS + 1]; return {MD_1, MD_2}; } //____________________________________________________________________________________________ -std::vector getHitsFromLS(lst::Event* event, unsigned int LS) { +std::vector 
getHitsFromLS(Event* event, unsigned int LS) { std::vector MDs = getMDsFromLS(event, LS); std::vector hits_0 = getHitsFromMD(event, MDs[0]); std::vector hits_1 = getHitsFromMD(event, MDs[1]); @@ -107,7 +107,7 @@ std::vector getHitsFromLS(lst::Event* event, unsigned int L } //____________________________________________________________________________________________ -std::tuple, std::vector> getHitIdxsAndHitTypesFromLS(lst::Event* event, +std::tuple, std::vector> getHitIdxsAndHitTypesFromLS(Event* event, unsigned LS) { return convertHitsToHitIdxsAndHitTypes(event, getHitsFromLS(event, LS)); } @@ -117,15 +117,15 @@ std::tuple, std::vector> getHitIdxsAndHi // ============== //____________________________________________________________________________________________ -std::vector getLSsFromT3(lst::Event* event, unsigned int T3) { - lst::Triplets const* triplets = event->getTriplets()->data(); +std::vector getLSsFromT3(Event* event, unsigned int T3) { + Triplets const* triplets = event->getTriplets()->data(); unsigned int LS_1 = triplets->segmentIndices[2 * T3]; unsigned int LS_2 = triplets->segmentIndices[2 * T3 + 1]; return {LS_1, LS_2}; } //____________________________________________________________________________________________ -std::vector getMDsFromT3(lst::Event* event, unsigned int T3) { +std::vector getMDsFromT3(Event* event, unsigned int T3) { std::vector LSs = getLSsFromT3(event, T3); std::vector MDs_0 = getMDsFromLS(event, LSs[0]); std::vector MDs_1 = getMDsFromLS(event, LSs[1]); @@ -133,7 +133,7 @@ std::vector getMDsFromT3(lst::Event* event, unsigned int T3 } //____________________________________________________________________________________________ -std::vector getHitsFromT3(lst::Event* event, unsigned int T3) { +std::vector getHitsFromT3(Event* event, unsigned int T3) { std::vector MDs = getMDsFromT3(event, T3); std::vector hits_0 = getHitsFromMD(event, MDs[0]); std::vector hits_1 = getHitsFromMD(event, MDs[1]); @@ -142,7 +142,7 @@ std::vector 
getHitsFromT3(lst::Event* event, unsigned int T } //____________________________________________________________________________________________ -std::tuple, std::vector> getHitIdxsAndHitTypesFromT3(lst::Event* event, +std::tuple, std::vector> getHitIdxsAndHitTypesFromT3(Event* event, unsigned T3) { return convertHitsToHitIdxsAndHitTypes(event, getHitsFromT3(event, T3)); } @@ -152,15 +152,15 @@ std::tuple, std::vector> getHitIdxsAndHi // ============== //____________________________________________________________________________________________ -std::vector getT3sFromT5(lst::Event* event, unsigned int T5) { - lst::Quintuplets const* quintuplets = event->getQuintuplets()->data(); +std::vector getT3sFromT5(Event* event, unsigned int T5) { + Quintuplets const* quintuplets = event->getQuintuplets()->data(); unsigned int T3_1 = quintuplets->tripletIndices[2 * T5]; unsigned int T3_2 = quintuplets->tripletIndices[2 * T5 + 1]; return {T3_1, T3_2}; } //____________________________________________________________________________________________ -std::vector getLSsFromT5(lst::Event* event, unsigned int T5) { +std::vector getLSsFromT5(Event* event, unsigned int T5) { std::vector T3s = getT3sFromT5(event, T5); std::vector LSs_0 = getLSsFromT3(event, T3s[0]); std::vector LSs_1 = getLSsFromT3(event, T3s[1]); @@ -168,7 +168,7 @@ std::vector getLSsFromT5(lst::Event* event, unsigned int T5 } //____________________________________________________________________________________________ -std::vector getMDsFromT5(lst::Event* event, unsigned int T5) { +std::vector getMDsFromT5(Event* event, unsigned int T5) { std::vector LSs = getLSsFromT5(event, T5); std::vector MDs_0 = getMDsFromLS(event, LSs[0]); std::vector MDs_1 = getMDsFromLS(event, LSs[1]); @@ -178,7 +178,7 @@ std::vector getMDsFromT5(lst::Event* event, unsigned int T5 } //____________________________________________________________________________________________ -std::vector getHitsFromT5(lst::Event* event, unsigned int T5) { 
+std::vector getHitsFromT5(Event* event, unsigned int T5) { std::vector MDs = getMDsFromT5(event, T5); std::vector hits_0 = getHitsFromMD(event, MDs[0]); std::vector hits_1 = getHitsFromMD(event, MDs[1]); @@ -189,8 +189,8 @@ std::vector getHitsFromT5(lst::Event* event, unsigned int T } //____________________________________________________________________________________________ -std::vector getHitIdxsFromT5(lst::Event* event, unsigned int T5) { - lst::Hits const* hitsEvt = event->getHits()->data(); +std::vector getHitIdxsFromT5(Event* event, unsigned int T5) { + Hits const* hitsEvt = event->getHits()->data(); std::vector hits = getHitsFromT5(event, T5); std::vector hitidxs; for (auto& hit : hits) @@ -198,23 +198,23 @@ std::vector getHitIdxsFromT5(lst::Event* event, unsigned in return hitidxs; } //____________________________________________________________________________________________ -std::vector getModuleIdxsFromT5(lst::Event* event, unsigned int T5) { +std::vector getModuleIdxsFromT5(Event* event, unsigned int T5) { std::vector hits = getHitsFromT5(event, T5); std::vector module_idxs; - lst::Hits const* hitsEvt = event->getHits()->data(); + Hits const* hitsEvt = event->getHits()->data(); for (auto& hitIdx : hits) { module_idxs.push_back(hitsEvt->moduleIndices[hitIdx]); } return module_idxs; } //____________________________________________________________________________________________ -std::vector getHitTypesFromT5(lst::Event* event, unsigned int T5) { +std::vector getHitTypesFromT5(Event* event, unsigned int T5) { return {4, 4, 4, 4, 4, 4, 4, 4, 4, 4}; ; } //____________________________________________________________________________________________ -std::tuple, std::vector> getHitIdxsAndHitTypesFromT5(lst::Event* event, +std::tuple, std::vector> getHitIdxsAndHitTypesFromT5(Event* event, unsigned T5) { return convertHitsToHitIdxsAndHitTypes(event, getHitsFromT5(event, T5)); } @@ -224,46 +224,46 @@ std::tuple, std::vector> getHitIdxsAndHi // =============== 
//____________________________________________________________________________________________ -unsigned int getPixelLSFrompT3(lst::Event* event, unsigned int pT3) { - lst::PixelTriplets const* pixelTriplets = event->getPixelTriplets()->data(); - lst::ObjectRanges const* rangesEvt = event->getRanges()->data(); - lst::Modules const* modulesEvt = event->getModules()->data(); +unsigned int getPixelLSFrompT3(Event* event, unsigned int pT3) { + PixelTriplets const* pixelTriplets = event->getPixelTriplets()->data(); + ObjectRanges const* rangesEvt = event->getRanges()->data(); + ::lst::Modules const* modulesEvt = event->getModules()->data(); const unsigned int pLS_offset = rangesEvt->segmentModuleIndices[*(modulesEvt->nLowerModules)]; return pixelTriplets->pixelSegmentIndices[pT3] - pLS_offset; } //____________________________________________________________________________________________ -unsigned int getT3FrompT3(lst::Event* event, unsigned int pT3) { - lst::PixelTriplets const* pixelTriplets = event->getPixelTriplets()->data(); +unsigned int getT3FrompT3(Event* event, unsigned int pT3) { + PixelTriplets const* pixelTriplets = event->getPixelTriplets()->data(); return pixelTriplets->tripletIndices[pT3]; } //____________________________________________________________________________________________ -std::vector getLSsFrompT3(lst::Event* event, unsigned int pT3) { +std::vector getLSsFrompT3(Event* event, unsigned int pT3) { unsigned int T3 = getT3FrompT3(event, pT3); return getLSsFromT3(event, T3); } //____________________________________________________________________________________________ -std::vector getMDsFrompT3(lst::Event* event, unsigned int pT3) { +std::vector getMDsFrompT3(Event* event, unsigned int pT3) { unsigned int T3 = getT3FrompT3(event, pT3); return getMDsFromT3(event, T3); } //____________________________________________________________________________________________ -std::vector getOuterTrackerHitsFrompT3(lst::Event* event, unsigned int pT3) { 
+std::vector getOuterTrackerHitsFrompT3(Event* event, unsigned int pT3) { unsigned int T3 = getT3FrompT3(event, pT3); return getHitsFromT3(event, T3); } //____________________________________________________________________________________________ -std::vector getPixelHitsFrompT3(lst::Event* event, unsigned int pT3) { +std::vector getPixelHitsFrompT3(Event* event, unsigned int pT3) { unsigned int pLS = getPixelLSFrompT3(event, pT3); return getPixelHitsFrompLS(event, pLS); } //____________________________________________________________________________________________ -std::vector getHitsFrompT3(lst::Event* event, unsigned int pT3) { +std::vector getHitsFrompT3(Event* event, unsigned int pT3) { unsigned int pLS = getPixelLSFrompT3(event, pT3); unsigned int T3 = getT3FrompT3(event, pT3); std::vector pixelHits = getPixelHitsFrompLS(event, pLS); @@ -273,8 +273,8 @@ std::vector getHitsFrompT3(lst::Event* event, unsigned int } //____________________________________________________________________________________________ -std::vector getHitIdxsFrompT3(lst::Event* event, unsigned int pT3) { - lst::Hits const* hitsEvt = event->getHits()->data(); +std::vector getHitIdxsFrompT3(Event* event, unsigned int pT3) { + Hits const* hitsEvt = event->getHits()->data(); std::vector hits = getHitsFrompT3(event, pT3); std::vector hitidxs; for (auto& hit : hits) @@ -282,17 +282,17 @@ std::vector getHitIdxsFrompT3(lst::Event* event, unsigned i return hitidxs; } //____________________________________________________________________________________________ -std::vector getModuleIdxsFrompT3(lst::Event* event, unsigned int pT3) { +std::vector getModuleIdxsFrompT3(Event* event, unsigned int pT3) { std::vector hits = getOuterTrackerHitsFrompT3(event, pT3); std::vector module_idxs; - lst::Hits const* hitsEvt = event->getHits()->data(); + Hits const* hitsEvt = event->getHits()->data(); for (auto& hitIdx : hits) { module_idxs.push_back(hitsEvt->moduleIndices[hitIdx]); } return module_idxs; } 
//____________________________________________________________________________________________ -std::vector getHitTypesFrompT3(lst::Event* event, unsigned int pT3) { +std::vector getHitTypesFrompT3(Event* event, unsigned int pT3) { unsigned int pLS = getPixelLSFrompT3(event, pT3); std::vector pixelHits = getPixelHitsFrompLS(event, pLS); // pixel Hits list will be either 3 or 4 and depending on it return accordingly @@ -303,7 +303,7 @@ std::vector getHitTypesFrompT3(lst::Event* event, unsigned } //____________________________________________________________________________________________ -std::tuple, std::vector> getHitIdxsAndHitTypesFrompT3(lst::Event* event, +std::tuple, std::vector> getHitIdxsAndHitTypesFrompT3(Event* event, unsigned pT3) { return convertHitsToHitIdxsAndHitTypes(event, getHitsFrompT3(event, pT3)); } @@ -313,52 +313,52 @@ std::tuple, std::vector> getHitIdxsAndHi // =============== //____________________________________________________________________________________________ -unsigned int getPixelLSFrompT5(lst::Event* event, unsigned int pT5) { - lst::PixelQuintuplets const* pixelQuintuplets = event->getPixelQuintuplets()->data(); - lst::ObjectRanges const* rangesEvt = event->getRanges()->data(); - lst::Modules const* modulesEvt = event->getModules()->data(); +unsigned int getPixelLSFrompT5(Event* event, unsigned int pT5) { + PixelQuintuplets const* pixelQuintuplets = event->getPixelQuintuplets()->data(); + ObjectRanges const* rangesEvt = event->getRanges()->data(); + ::lst::Modules const* modulesEvt = event->getModules()->data(); const unsigned int pLS_offset = rangesEvt->segmentModuleIndices[*(modulesEvt->nLowerModules)]; return pixelQuintuplets->pixelIndices[pT5] - pLS_offset; } //____________________________________________________________________________________________ -unsigned int getT5FrompT5(lst::Event* event, unsigned int pT5) { - lst::PixelQuintuplets const* pixelQuintuplets = event->getPixelQuintuplets()->data(); +unsigned int 
getT5FrompT5(Event* event, unsigned int pT5) { + PixelQuintuplets const* pixelQuintuplets = event->getPixelQuintuplets()->data(); return pixelQuintuplets->T5Indices[pT5]; } //____________________________________________________________________________________________ -std::vector getT3sFrompT5(lst::Event* event, unsigned int pT5) { +std::vector getT3sFrompT5(Event* event, unsigned int pT5) { unsigned int T5 = getT5FrompT5(event, pT5); return getT3sFromT5(event, T5); } //____________________________________________________________________________________________ -std::vector getLSsFrompT5(lst::Event* event, unsigned int pT5) { +std::vector getLSsFrompT5(Event* event, unsigned int pT5) { unsigned int T5 = getT5FrompT5(event, pT5); return getLSsFromT5(event, T5); } //____________________________________________________________________________________________ -std::vector getMDsFrompT5(lst::Event* event, unsigned int pT5) { +std::vector getMDsFrompT5(Event* event, unsigned int pT5) { unsigned int T5 = getT5FrompT5(event, pT5); return getMDsFromT5(event, T5); } //____________________________________________________________________________________________ -std::vector getOuterTrackerHitsFrompT5(lst::Event* event, unsigned int pT5) { +std::vector getOuterTrackerHitsFrompT5(Event* event, unsigned int pT5) { unsigned int T5 = getT5FrompT5(event, pT5); return getHitsFromT5(event, T5); } //____________________________________________________________________________________________ -std::vector getPixelHitsFrompT5(lst::Event* event, unsigned int pT5) { +std::vector getPixelHitsFrompT5(Event* event, unsigned int pT5) { unsigned int pLS = getPixelLSFrompT5(event, pT5); return getPixelHitsFrompLS(event, pLS); } //____________________________________________________________________________________________ -std::vector getHitsFrompT5(lst::Event* event, unsigned int pT5) { +std::vector getHitsFrompT5(Event* event, unsigned int pT5) { unsigned int pLS = getPixelLSFrompT5(event, pT5); 
unsigned int T5 = getT5FrompT5(event, pT5); std::vector pixelHits = getPixelHitsFrompLS(event, pLS); @@ -368,8 +368,8 @@ std::vector getHitsFrompT5(lst::Event* event, unsigned int } //____________________________________________________________________________________________ -std::vector getHitIdxsFrompT5(lst::Event* event, unsigned int pT5) { - lst::Hits const* hitsEvt = event->getHits()->data(); +std::vector getHitIdxsFrompT5(Event* event, unsigned int pT5) { + Hits const* hitsEvt = event->getHits()->data(); std::vector hits = getHitsFrompT5(event, pT5); std::vector hitidxs; for (auto& hit : hits) @@ -378,10 +378,10 @@ std::vector getHitIdxsFrompT5(lst::Event* event, unsigned i } //____________________________________________________________________________________________ -std::vector getModuleIdxsFrompT5(lst::Event* event, unsigned int pT5) { +std::vector getModuleIdxsFrompT5(Event* event, unsigned int pT5) { std::vector hits = getOuterTrackerHitsFrompT5(event, pT5); std::vector module_idxs; - lst::Hits const* hitsEvt = event->getHits()->data(); + Hits const* hitsEvt = event->getHits()->data(); for (auto& hitIdx : hits) { module_idxs.push_back(hitsEvt->moduleIndices[hitIdx]); } @@ -389,7 +389,7 @@ std::vector getModuleIdxsFrompT5(lst::Event* event, unsigne } //____________________________________________________________________________________________ -std::vector getHitTypesFrompT5(lst::Event* event, unsigned int pT5) { +std::vector getHitTypesFrompT5(Event* event, unsigned int pT5) { unsigned int pLS = getPixelLSFrompT5(event, pT5); std::vector pixelHits = getPixelHitsFrompLS(event, pLS); // pixel Hits list will be either 3 or 4 and depending on it return accordingly @@ -400,7 +400,7 @@ std::vector getHitTypesFrompT5(lst::Event* event, unsigned } //____________________________________________________________________________________________ -std::tuple, std::vector> getHitIdxsAndHitTypesFrompT5(lst::Event* event, +std::tuple, std::vector> 
getHitIdxsAndHitTypesFrompT5(Event* event, unsigned pT5) { return convertHitsToHitIdxsAndHitTypes(event, getHitsFrompT5(event, pT5)); } @@ -410,9 +410,9 @@ std::tuple, std::vector> getHitIdxsAndHi // ============== //____________________________________________________________________________________________ -std::vector getLSsFromTC(lst::Event* event, unsigned int TC) { +std::vector getLSsFromTC(Event* event, unsigned int TC) { // Get the type of the track candidate - lst::TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); + TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); short type = trackCandidates->trackCandidateType[TC]; unsigned int objidx = trackCandidates->directObjectIndices[TC]; switch (type) { @@ -432,10 +432,10 @@ std::vector getLSsFromTC(lst::Event* event, unsigned int TC } //____________________________________________________________________________________________ -std::tuple, std::vector> getHitIdxsAndHitTypesFromTC(lst::Event* event, +std::tuple, std::vector> getHitIdxsAndHitTypesFromTC(Event* event, unsigned TC) { // Get the type of the track candidate - lst::TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); + TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); short type = trackCandidates->trackCandidateType[TC]; unsigned int objidx = trackCandidates->directObjectIndices[TC]; switch (type) { diff --git a/RecoTracker/LSTCore/standalone/code/core/AccessHelper.h b/RecoTracker/LSTCore/standalone/code/core/AccessHelper.h index d0924518eeb4d..6c856b22915d4 100644 --- a/RecoTracker/LSTCore/standalone/code/core/AccessHelper.h +++ b/RecoTracker/LSTCore/standalone/code/core/AccessHelper.h @@ -5,7 +5,7 @@ #include #include "Event.h" -using LSTEvent = lst::Event; +using LSTEvent = ALPAKA_ACCELERATOR_NAMESPACE::lst::Event; enum { kpT5 = 7, kpT3 = 5, kT5 = 4, kpLS = 8 }; diff --git a/RecoTracker/LSTCore/standalone/code/core/AnalysisConfig.h 
b/RecoTracker/LSTCore/standalone/code/core/AnalysisConfig.h index 8608bc95ed2fa..ce7ce3824849e 100644 --- a/RecoTracker/LSTCore/standalone/code/core/AnalysisConfig.h +++ b/RecoTracker/LSTCore/standalone/code/core/AnalysisConfig.h @@ -100,7 +100,7 @@ class AnalysisConfig { std::map>> moduleSimHits; std::map modulePopulation; - lst::ModuleConnectionMap moduleConnectiongMapLoose; + ::lst::ModuleConnectionMap moduleConnectiongMapLoose; // Boolean to trigger whether to run cut_value_ntupling bool do_cut_value_ntuple; diff --git a/RecoTracker/LSTCore/standalone/code/core/trkCore.cc b/RecoTracker/LSTCore/standalone/code/core/trkCore.cc index d6657c5e512f6..3841affaaf059 100644 --- a/RecoTracker/LSTCore/standalone/code/core/trkCore.cc +++ b/RecoTracker/LSTCore/standalone/code/core/trkCore.cc @@ -1,7 +1,5 @@ #include "trkCore.h" -using namespace ALPAKA_ACCELERATOR_NAMESPACE; - //___________________________________________________________________________________________________________________________________________________________________________________________ bool goodEvent() { if (ana.specific_event_index >= 0) { @@ -22,12 +20,13 @@ bool goodEvent() { } //___________________________________________________________________________________________________________________________________________________________________________________________ -float runMiniDoublet(lst::Event *event, int evt) { +float runMiniDoublet(LSTEvent *event, int evt) { TStopwatch my_timer; if (ana.verbose >= 2) std::cout << "Reco Mini-Doublet start " << evt << std::endl; my_timer.Start(); event->createMiniDoublets(); + event->wait(); // device side event calls are asynchronous: wait to measure time or print float md_elapsed = my_timer.RealTime(); if (ana.verbose >= 2) @@ -74,12 +73,13 @@ float runMiniDoublet(lst::Event *event, int evt) { } 
//___________________________________________________________________________________________________________________________________________________________________________________________ -float runSegment(lst::Event *event) { +float runSegment(LSTEvent *event) { TStopwatch my_timer; if (ana.verbose >= 2) std::cout << "Reco Segment start" << std::endl; my_timer.Start(); event->createSegmentsWithModuleMap(); + event->wait(); // device side event calls are asynchronous: wait to measure time or print float sg_elapsed = my_timer.RealTime(); if (ana.verbose >= 2) std::cout << "Reco Segment processing time: " << sg_elapsed << " secs" << std::endl; @@ -111,12 +111,13 @@ float runSegment(lst::Event *event) { } //___________________________________________________________________________________________________________________________________________________________________________________________ -float runT3(lst::Event *event) { +float runT3(LSTEvent *event) { TStopwatch my_timer; if (ana.verbose >= 2) std::cout << "Reco T3 start" << std::endl; my_timer.Start(); event->createTriplets(); + event->wait(); // device side event calls are asynchronous: wait to measure time or print float t3_elapsed = my_timer.RealTime(); if (ana.verbose >= 2) std::cout << "Reco T3 processing time: " << t3_elapsed << " secs" << std::endl; @@ -152,12 +153,13 @@ float runT3(lst::Event *event) { } //___________________________________________________________________________________________________________________________________________________________________________________________ -float runpT3(lst::Event *event) { +float runpT3(LSTEvent *event) { TStopwatch my_timer; if (ana.verbose >= 2) std::cout << "Reco Pixel Triplet pT3 start" << std::endl; my_timer.Start(); event->createPixelTriplets(); + event->wait(); // device side event calls are asynchronous: wait to measure time or print float pt3_elapsed = my_timer.RealTime(); if (ana.verbose >= 2) std::cout << "Reco pT3 processing time: " << 
pt3_elapsed << " secs" << std::endl; @@ -168,12 +170,13 @@ float runpT3(lst::Event *event) { } //___________________________________________________________________________________________________________________________________________________________________________________________ -float runQuintuplet(lst::Event *event) { +float runQuintuplet(LSTEvent *event) { TStopwatch my_timer; if (ana.verbose >= 2) std::cout << "Reco Quintuplet start" << std::endl; my_timer.Start(); event->createQuintuplets(); + event->wait(); // device side event calls are asynchronous: wait to measure time or print float t5_elapsed = my_timer.RealTime(); if (ana.verbose >= 2) std::cout << "Reco Quintuplet processing time: " << t5_elapsed << " secs" << std::endl; @@ -213,12 +216,13 @@ float runQuintuplet(lst::Event *event) { } //___________________________________________________________________________________________________________________________________________________________________________________________ -float runPixelLineSegment(lst::Event *event, bool no_pls_dupclean) { +float runPixelLineSegment(LSTEvent *event, bool no_pls_dupclean) { TStopwatch my_timer; if (ana.verbose >= 2) std::cout << "Reco Pixel Line Segment start" << std::endl; my_timer.Start(); event->pixelLineSegmentCleaning(no_pls_dupclean); + event->wait(); // device side event calls are asynchronous: wait to measure time or print float pls_elapsed = my_timer.RealTime(); if (ana.verbose >= 2) std::cout << "Reco Pixel Line Segment processing time: " << pls_elapsed << " secs" << std::endl; @@ -227,12 +231,13 @@ float runPixelLineSegment(lst::Event *event, bool no_pls_dupclean) { } //___________________________________________________________________________________________________________________________________________________________________________________________ -float runPixelQuintuplet(lst::Event *event) { +float runPixelQuintuplet(LSTEvent *event) { TStopwatch my_timer; if (ana.verbose >= 2) std::cout << 
"Reco Pixel Quintuplet start" << std::endl; my_timer.Start(); event->createPixelQuintuplets(); + event->wait(); // device side event calls are asynchronous: wait to measure time or print float pt5_elapsed = my_timer.RealTime(); if (ana.verbose >= 2) std::cout << "Reco Pixel Quintuplet processing time: " << pt5_elapsed << " secs" << std::endl; @@ -243,12 +248,13 @@ float runPixelQuintuplet(lst::Event *event) { } //___________________________________________________________________________________________________________________________________________________________________________________________ -float runTrackCandidate(lst::Event *event, bool no_pls_dupclean, bool tc_pls_triplets) { +float runTrackCandidate(LSTEvent *event, bool no_pls_dupclean, bool tc_pls_triplets) { TStopwatch my_timer; if (ana.verbose >= 2) std::cout << "Reco TrackCandidate start" << std::endl; my_timer.Start(); event->createTrackCandidates(no_pls_dupclean, tc_pls_triplets); + event->wait(); // device side event calls are asynchronous: wait to measure time or print float tc_elapsed = my_timer.RealTime(); if (ana.verbose >= 2) std::cout << "Reco TrackCandidate processing time: " << tc_elapsed << " secs" << std::endl; @@ -839,7 +845,7 @@ void addInputsToLineSegmentTrackingPreLoad(std::vector> &out_ } //___________________________________________________________________________________________________________________________________________________________________________________________ -float addInputsToEventPreLoad(lst::Event *event, +float addInputsToEventPreLoad(LSTEvent *event, bool useOMP, std::vector trkX, std::vector trkY, @@ -892,6 +898,7 @@ float addInputsToEventPreLoad(lst::Event *event, superbin_vec, pixelType_vec, isQuad_vec); + event->wait(); // device side event calls are asynchronous: wait to measure time or print float hit_loading_elapsed = my_timer.RealTime(); if (ana.verbose >= 2) @@ -1143,7 +1150,7 @@ void writeMetaData() { // DEPRECATED FUNCTIONS 
//__________________________________________________________________________________________ -[[deprecated]] float addInputsToLineSegmentTracking(lst::Event &event, bool useOMP) { +[[deprecated]] float addInputsToLineSegmentTracking(LSTEvent &event, bool useOMP) { TStopwatch my_timer; if (ana.verbose >= 2) std::cout << "Loading Inputs (i.e. outer tracker hits, and pixel line segements) to the Line Segment Tracking.... " @@ -1331,6 +1338,7 @@ void writeMetaData() { pixelType_vec, isQuad_vec); + event.wait(); // device side event calls are asynchronous: wait to measure time or print float hit_loading_elapsed = my_timer.RealTime(); if (ana.verbose >= 2) std::cout << "Loading inputs processing time: " << hit_loading_elapsed << " secs" << std::endl; @@ -1338,6 +1346,6 @@ void writeMetaData() { } //__________________________________________________________________________________________ -[[deprecated]] float addInputsToLineSegmentTrackingUsingExplicitMemory(lst::Event &event) { +[[deprecated]] float addInputsToLineSegmentTrackingUsingExplicitMemory(LSTEvent &event) { return addInputsToLineSegmentTracking(event, true); } diff --git a/RecoTracker/LSTCore/standalone/code/core/trkCore.h b/RecoTracker/LSTCore/standalone/code/core/trkCore.h index 0a2fddaba9d5c..66d5c10baf431 100644 --- a/RecoTracker/LSTCore/standalone/code/core/trkCore.h +++ b/RecoTracker/LSTCore/standalone/code/core/trkCore.h @@ -11,7 +11,7 @@ #include #include -using LSTEvent = lst::Event; +using LSTEvent = ALPAKA_ACCELERATOR_NAMESPACE::lst::Event; // --------------------- ======================== --------------------- diff --git a/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc b/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc index 33eaeefc2d796..7c330a768a175 100644 --- a/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc +++ b/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.cc @@ -1,6 +1,6 @@ #include "write_lst_ntuple.h" -using namespace 
ALPAKA_ACCELERATOR_NAMESPACE; +using namespace ALPAKA_ACCELERATOR_NAMESPACE::lst; //________________________________________________________________________________________________________________________________ void createOutputBranches() { @@ -9,7 +9,7 @@ void createOutputBranches() { } //________________________________________________________________________________________________________________________________ -void fillOutputBranches(lst::Event* event) { +void fillOutputBranches(Event* event) { setOutputBranches(event); setOptionalOutputBranches(event); if (ana.gnn_ntuple) @@ -183,7 +183,7 @@ void createGnnNtupleBranches() { } //________________________________________________________________________________________________________________________________ -void setOutputBranches(lst::Event* event) { +void setOutputBranches(Event* event) { // ============ Sim tracks ============= int n_accepted_simtrk = 0; for (unsigned int isimtrk = 0; isimtrk < trk.sim_pt().size(); ++isimtrk) { @@ -226,7 +226,7 @@ void setOutputBranches(lst::Event* event) { std::vector> tc_matched_simIdx; // ============ Track candidates ============= - lst::TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); + TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); unsigned int nTrackCandidates = *trackCandidates->nTrackCandidates; for (unsigned int idx = 0; idx < nTrackCandidates; idx++) { // Compute reco quantities of track candidate based on final object @@ -278,7 +278,7 @@ void setOutputBranches(lst::Event* event) { } //________________________________________________________________________________________________________________________________ -void setOptionalOutputBranches(lst::Event* event) { +void setOptionalOutputBranches(Event* event) { #ifdef CUT_VALUE_DEBUG setPixelQuintupletOutputBranches(event); @@ -289,12 +289,12 @@ void setOptionalOutputBranches(lst::Event* event) { } 
//________________________________________________________________________________________________________________________________ -void setPixelQuintupletOutputBranches(lst::Event* event) { +void setPixelQuintupletOutputBranches(Event* event) { // ============ pT5 ============= - lst::PixelQuintuplets const* pixelQuintuplets = event->getPixelQuintuplets()->data(); - lst::Quintuplets const* quintuplets = event->getQuintuplets()->data(); - lst::Segments const* segments = event->getSegments()->data(); - lst::Modules const* modules = event->getModules()->data(); + PixelQuintuplets const* pixelQuintuplets = event->getPixelQuintuplets()->data(); + Quintuplets const* quintuplets = event->getQuintuplets()->data(); + Segments const* segments = event->getSegments()->data(); + ::lst::Modules const* modules = event->getModules()->data(); int n_accepted_simtrk = ana.tx->getBranch>("sim_TC_matched").size(); unsigned int nPixelQuintuplets = @@ -305,7 +305,7 @@ void setPixelQuintupletOutputBranches(lst::Event* event) { for (unsigned int pT5 = 0; pT5 < nPixelQuintuplets; pT5++) { unsigned int T5Index = getT5FrompT5(event, pT5); unsigned int pLSIndex = getPixelLSFrompT5(event, pT5); - float pt = (__H2F(quintuplets->innerRadius[T5Index]) * lst::k2Rinv1GeVf * 2 + segments->ptIn[pLSIndex]) / 2; + float pt = (__H2F(quintuplets->innerRadius[T5Index]) * k2Rinv1GeVf * 2 + segments->ptIn[pLSIndex]) / 2; float eta = segments->eta[pLSIndex]; float phi = segments->phi[pLSIndex]; @@ -365,10 +365,10 @@ void setPixelQuintupletOutputBranches(lst::Event* event) { } //________________________________________________________________________________________________________________________________ -void setQuintupletOutputBranches(lst::Event* event) { - lst::Quintuplets const* quintuplets = event->getQuintuplets()->data(); - lst::ObjectRanges const* ranges = event->getRanges()->data(); - lst::Modules const* modules = event->getModules()->data(); +void setQuintupletOutputBranches(Event* event) { + 
Quintuplets const* quintuplets = event->getQuintuplets()->data(); + ObjectRanges const* ranges = event->getRanges()->data(); + ::lst::Modules const* modules = event->getModules()->data(); int n_accepted_simtrk = ana.tx->getBranch>("sim_TC_matched").size(); std::vector sim_t5_matched(n_accepted_simtrk); @@ -378,7 +378,7 @@ void setQuintupletOutputBranches(lst::Event* event) { int nQuintuplets = quintuplets->nQuintuplets[lowerModuleIdx]; for (unsigned int idx = 0; idx < nQuintuplets; idx++) { unsigned int quintupletIndex = ranges->quintupletModuleIndices[lowerModuleIdx] + idx; - float pt = __H2F(quintuplets->innerRadius[quintupletIndex]) * lst::k2Rinv1GeVf * 2; + float pt = __H2F(quintuplets->innerRadius[quintupletIndex]) * k2Rinv1GeVf * 2; float eta = __H2F(quintuplets->eta[quintupletIndex]); float phi = __H2F(quintuplets->phi[quintupletIndex]); @@ -436,10 +436,10 @@ void setQuintupletOutputBranches(lst::Event* event) { } //________________________________________________________________________________________________________________________________ -void setPixelTripletOutputBranches(lst::Event* event) { - lst::PixelTriplets const* pixelTriplets = event->getPixelTriplets()->data(); - lst::Modules const* modules = event->getModules()->data(); - lst::Segments const* segments = event->getSegments()->data(); +void setPixelTripletOutputBranches(Event* event) { + PixelTriplets const* pixelTriplets = event->getPixelTriplets()->data(); + ::lst::Modules const* modules = event->getModules()->data(); + Segments const* segments = event->getSegments()->data(); int n_accepted_simtrk = ana.tx->getBranch>("sim_TC_matched").size(); unsigned int nPixelTriplets = *pixelTriplets->nPixelTriplets; @@ -499,14 +499,14 @@ void setPixelTripletOutputBranches(lst::Event* event) { } //________________________________________________________________________________________________________________________________ -void setGnnNtupleBranches(lst::Event* event) { +void setGnnNtupleBranches(Event* 
event) { // Get relevant information - lst::Segments const* segments = event->getSegments()->data(); - lst::MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); - lst::Hits const* hitsEvt = event->getHits()->data(); - lst::Modules const* modules = event->getModules()->data(); - lst::ObjectRanges const* ranges = event->getRanges()->data(); - lst::TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); + Segments const* segments = event->getSegments()->data(); + MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); + Hits const* hitsEvt = event->getHits()->data(); + ::lst::Modules const* modules = event->getModules()->data(); + ObjectRanges const* ranges = event->getRanges()->data(); + TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); std::set mds_used_in_sg; std::map md_index_map; @@ -640,10 +640,10 @@ void setGnnNtupleBranches(lst::Event* event) { } //________________________________________________________________________________________________________________________________ -void setGnnNtupleMiniDoublet(lst::Event* event, unsigned int MD) { +void setGnnNtupleMiniDoublet(Event* event, unsigned int MD) { // Get relevant information - lst::MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); - lst::Hits const* hitsEvt = event->getHits()->data(); + MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); + Hits const* hitsEvt = event->getHits()->data(); // Get the hit indices unsigned int hit0 = miniDoublets->anchorHitIndices[MD]; @@ -680,7 +680,7 @@ void setGnnNtupleMiniDoublet(lst::Event* event, unsigned int MD) { float dphichange = miniDoublets->dphichanges[MD]; // Computing pt - float pt = hit0_r * lst::k2Rinv1GeVf / sin(dphichange); + float pt = hit0_r * k2Rinv1GeVf / sin(dphichange); // T5 eta and phi are computed using outer and innermost hits lst_math::Hit hitA(trk.ph2_x()[anchitidx], trk.ph2_y()[anchitidx], trk.ph2_z()[anchitidx]); @@ -708,10 +708,9 
@@ void setGnnNtupleMiniDoublet(lst::Event* event, unsigned int MD) { } //________________________________________________________________________________________________________________________________ -std::tuple> parseTrackCandidate(lst::Event* event, - unsigned int idx) { +std::tuple> parseTrackCandidate(Event* event, unsigned int idx) { // Get the type of the track candidate - lst::TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); + TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); short type = trackCandidates->trackCandidateType[idx]; enum { pT5 = 7, pT3 = 5, T5 = 4, pLS = 8 }; @@ -742,12 +741,12 @@ std::tuple> parseTrackCandidate( } //________________________________________________________________________________________________________________________________ -std::tuple, std::vector> parsepT5(lst::Event* event, +std::tuple, std::vector> parsepT5(Event* event, unsigned int idx) { // Get relevant information - lst::TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); - lst::Quintuplets const* quintuplets = event->getQuintuplets()->data(); - lst::Segments const* segments = event->getSegments()->data(); + TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); + Quintuplets const* quintuplets = event->getQuintuplets()->data(); + Segments const* segments = event->getSegments()->data(); // // pictorial representation of a pT5 @@ -843,7 +842,7 @@ std::tuple, std::vectorptIn[pLS]; const float eta_pLS = segments->eta[pLS]; const float phi_pLS = segments->phi[pLS]; - float pt_T5 = __H2F(quintuplets->innerRadius[T5Index]) * 2 * lst::k2Rinv1GeVf; + float pt_T5 = __H2F(quintuplets->innerRadius[T5Index]) * 2 * k2Rinv1GeVf; const float pt = (pt_T5 + pt_pLS) / 2; // Form the hit idx/type std::vector @@ -854,12 +853,12 @@ std::tuple, std::vector, std::vector> parsepT3(lst::Event* event, +std::tuple, std::vector> parsepT3(Event* event, unsigned int idx) { // Get relevant 
information - lst::TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); - lst::Triplets const* triplets = event->getTriplets()->data(); - lst::Segments const* segments = event->getSegments()->data(); + TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); + Triplets const* triplets = event->getTriplets()->data(); + Segments const* segments = event->getSegments()->data(); // // pictorial representation of a pT3 @@ -876,7 +875,7 @@ std::tuple, std::vectorptIn[pLS]; const float eta_pLS = segments->eta[pLS]; const float phi_pLS = segments->phi[pLS]; - float pt_T3 = triplets->circleRadius[T3] * 2 * lst::k2Rinv1GeVf; + float pt_T3 = triplets->circleRadius[T3] * 2 * k2Rinv1GeVf; // average pt const float pt = (pt_pLS + pt_T3) / 2; @@ -889,10 +888,10 @@ std::tuple, std::vector, std::vector> parseT5(lst::Event* event, +std::tuple, std::vector> parseT5(Event* event, unsigned int idx) { - lst::TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); - lst::Quintuplets const* quintuplets = event->getQuintuplets()->data(); + TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); + Quintuplets const* quintuplets = event->getQuintuplets()->data(); unsigned int T5 = trackCandidates->directObjectIndices[idx]; std::vector hits = getHitsFromT5(event, T5); @@ -908,7 +907,7 @@ std::tuple, std::vectorinnerRadius[T5] * lst::k2Rinv1GeVf * 2; + const float pt = quintuplets->innerRadius[T5] * k2Rinv1GeVf * 2; // T5 eta and phi are computed using outer and innermost hits lst_math::Hit hitA(trk.ph2_x()[Hit_0], trk.ph2_y()[Hit_0], trk.ph2_z()[Hit_0]); @@ -923,10 +922,10 @@ std::tuple, std::vector, std::vector> parsepLS(lst::Event* event, +std::tuple, std::vector> parsepLS(Event* event, unsigned int idx) { - lst::TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); - lst::Segments const* segments = event->getSegments()->data(); + TrackCandidates const* trackCandidates = 
event->getTrackCandidates()->data(); + Segments const* segments = event->getSegments()->data(); // Getting pLS index unsigned int pLS = trackCandidates->directObjectIndices[idx]; @@ -944,9 +943,9 @@ std::tuple, std::vector* event) { - lst::Modules const* modules = event->getModules()->data(); - lst::ObjectRanges const* ranges = event->getRanges()->data(); +void printHitMultiplicities(Event* event) { + ::lst::Modules const* modules = event->getModules()->data(); + ObjectRanges const* ranges = event->getRanges()->data(); int nHits = 0; for (unsigned int idx = 0; idx <= *(modules->nLowerModules); @@ -959,9 +958,9 @@ void printHitMultiplicities(lst::Event* event) { } //________________________________________________________________________________________________________________________________ -void printMiniDoubletMultiplicities(lst::Event* event) { - lst::MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); - lst::Modules const* modules = event->getModules()->data(); +void printMiniDoubletMultiplicities(Event* event) { + MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); + ::lst::Modules const* modules = event->getModules()->data(); int nMiniDoublets = 0; int totOccupancyMiniDoublets = 0; @@ -978,7 +977,7 @@ void printMiniDoubletMultiplicities(lst::Event* event) { } //________________________________________________________________________________________________________________________________ -void printAllObjects(lst::Event* event) { +void printAllObjects(Event* event) { printMDs(event); printLSs(event); printpLSs(event); @@ -986,11 +985,11 @@ void printAllObjects(lst::Event* event) { } //________________________________________________________________________________________________________________________________ -void printMDs(lst::Event* event) { - lst::MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); - lst::Hits const* hitsEvt = event->getHits()->data(); - lst::Modules const* modules = 
event->getModules()->data(); - lst::ObjectRanges const* ranges = event->getRanges()->data(); +void printMDs(Event* event) { + MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); + Hits const* hitsEvt = event->getHits()->data(); + ::lst::Modules const* modules = event->getModules()->data(); + ObjectRanges const* ranges = event->getRanges()->data(); // Then obtain the lower module index for (unsigned int idx = 0; idx <= *(modules->nLowerModules); ++idx) { @@ -1008,12 +1007,12 @@ void printMDs(lst::Event* event) { } //________________________________________________________________________________________________________________________________ -void printLSs(lst::Event* event) { - lst::Segments const* segments = event->getSegments()->data(); - lst::MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); - lst::Hits const* hitsEvt = event->getHits()->data(); - lst::Modules const* modules = event->getModules()->data(); - lst::ObjectRanges const* ranges = event->getRanges()->data(); +void printLSs(Event* event) { + Segments const* segments = event->getSegments()->data(); + MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); + Hits const* hitsEvt = event->getHits()->data(); + ::lst::Modules const* modules = event->getModules()->data(); + ObjectRanges const* ranges = event->getRanges()->data(); int nSegments = 0; for (unsigned int i = 0; i < *(modules->nLowerModules); ++i) { @@ -1040,12 +1039,12 @@ void printLSs(lst::Event* event) { } //________________________________________________________________________________________________________________________________ -void printpLSs(lst::Event* event) { - lst::Segments const* segments = event->getSegments()->data(); - lst::MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); - lst::Hits const* hitsEvt = event->getHits()->data(); - lst::Modules const* modules = event->getModules()->data(); - lst::ObjectRanges const* ranges = event->getRanges()->data(); +void 
printpLSs(Event* event) { + Segments const* segments = event->getSegments()->data(); + MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); + Hits const* hitsEvt = event->getHits()->data(); + ::lst::Modules const* modules = event->getModules()->data(); + ObjectRanges const* ranges = event->getRanges()->data(); unsigned int i = *(modules->nLowerModules); unsigned int idx = i; //modules->lowerModuleIndices[i]; @@ -1070,12 +1069,12 @@ void printpLSs(lst::Event* event) { } //________________________________________________________________________________________________________________________________ -void printT3s(lst::Event* event) { - lst::Triplets const* triplets = event->getTriplets()->data(); - lst::Segments const* segments = event->getSegments()->data(); - lst::MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); - lst::Hits const* hitsEvt = event->getHits()->data(); - lst::Modules const* modules = event->getModules()->data(); +void printT3s(Event* event) { + Triplets const* triplets = event->getTriplets()->data(); + Segments const* segments = event->getSegments()->data(); + MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); + Hits const* hitsEvt = event->getHits()->data(); + ::lst::Modules const* modules = event->getModules()->data(); int nTriplets = 0; for (unsigned int i = 0; i < *(modules->nLowerModules); ++i) { // unsigned int idx = modules->lowerModuleIndices[i]; @@ -1112,13 +1111,13 @@ void printT3s(lst::Event* event) { } //________________________________________________________________________________________________________________________________ -void debugPrintOutlierMultiplicities(lst::Event* event) { - lst::TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); - lst::Triplets const* triplets = event->getTriplets()->data(); - lst::Segments const* segments = event->getSegments()->data(); - lst::MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); - 
lst::Modules const* modules = event->getModules()->data(); - lst::ObjectRanges const* ranges = event->getRanges()->data(); +void debugPrintOutlierMultiplicities(Event* event) { + TrackCandidates const* trackCandidates = event->getTrackCandidates()->data(); + Triplets const* triplets = event->getTriplets()->data(); + Segments const* segments = event->getSegments()->data(); + MiniDoublets const* miniDoublets = event->getMiniDoublets()->data(); + ::lst::Modules const* modules = event->getModules()->data(); + ObjectRanges const* ranges = event->getRanges()->data(); //int nTrackCandidates = 0; for (unsigned int idx = 0; idx <= *(modules->nLowerModules); ++idx) { if (trackCandidates->nTrackCandidates[idx] > 50000) { diff --git a/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.h b/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.h index cd20553772b9a..7a25c0d3cbcc6 100644 --- a/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.h +++ b/RecoTracker/LSTCore/standalone/code/core/write_lst_ntuple.h @@ -11,7 +11,7 @@ #include "trkCore.h" #include "AccessHelper.h" -using LSTEvent = lst::Event; +using LSTEvent = ALPAKA_ACCELERATOR_NAMESPACE::lst::Event; // Common void createOutputBranches();