diff --git a/src/cudadev/CUDACore/device_unique_ptr.h b/src/cudadev/CUDACore/device_unique_ptr.h index ab5d6bc25..fb736cfe2 100644 --- a/src/cudadev/CUDACore/device_unique_ptr.h +++ b/src/cudadev/CUDACore/device_unique_ptr.h @@ -49,8 +49,8 @@ namespace cms { template typename device::impl::make_device_unique_selector::non_array make_device_unique(cudaStream_t stream) { - static_assert(std::is_trivially_constructible::value, - "Allocating with non-trivial constructor on the device memory is not supported"); + static_assert(std::is_trivially_copyable::value, + "Allocating with non-trivial copy on the device memory is not supported"); void *mem = allocate_device(sizeof(T), stream); return typename device::impl::make_device_unique_selector::non_array{reinterpret_cast(mem), device::impl::DeviceDeleter{stream}}; @@ -60,8 +60,8 @@ namespace cms { typename device::impl::make_device_unique_selector::unbounded_array make_device_unique(size_t n, cudaStream_t stream) { using element_type = typename std::remove_extent::type; - static_assert(std::is_trivially_constructible::value, - "Allocating with non-trivial constructor on the device memory is not supported"); + static_assert(std::is_trivially_copyable::value, + "Allocating with non-trivial copy on the device memory is not supported"); void *mem = allocate_device(n * sizeof(element_type), stream); return typename device::impl::make_device_unique_selector::unbounded_array{ reinterpret_cast(mem), device::impl::DeviceDeleter{stream}}; @@ -69,28 +69,6 @@ namespace cms { template typename device::impl::make_device_unique_selector::bounded_array make_device_unique(Args &&...) = delete; - - // No check for the trivial constructor, make it clear in the interface - template - typename device::impl::make_device_unique_selector::non_array make_device_unique_uninitialized( - cudaStream_t stream) { - void *mem = allocate_device(sizeof(T), stream); - return typename device::impl::make_device_unique_selector::non_array{reinterpret_cast(mem), - device::impl::DeviceDeleter{stream}}; - } - - template - typename device::impl::make_device_unique_selector::unbounded_array make_device_unique_uninitialized( - size_t n, cudaStream_t stream) { - using element_type = typename std::remove_extent::type; - void *mem = allocate_device(n * sizeof(element_type), stream); - return typename device::impl::make_device_unique_selector::unbounded_array{ - reinterpret_cast(mem), device::impl::DeviceDeleter{stream}}; - } - - template - typename device::impl::make_device_unique_selector::bounded_array make_device_unique_uninitialized(Args &&...) 
= - delete; } // namespace cuda } // namespace cms diff --git a/src/cudadev/CUDACore/host_unique_ptr.h b/src/cudadev/CUDACore/host_unique_ptr.h index f34798da3..ba5369bbe 100644 --- a/src/cudadev/CUDACore/host_unique_ptr.h +++ b/src/cudadev/CUDACore/host_unique_ptr.h @@ -39,8 +39,8 @@ namespace cms { // Allocate pinned host memory template typename host::impl::make_host_unique_selector::non_array make_host_unique(cudaStream_t stream) { - static_assert(std::is_trivially_constructible::value, - "Allocating with non-trivial constructor on the pinned host memory is not supported"); + static_assert(std::is_trivially_copyable::value, + "Allocating with non-trivial copy on the pinned host memory is not supported"); void *mem = allocate_host(sizeof(T), stream); return typename host::impl::make_host_unique_selector::non_array{reinterpret_cast(mem)}; } @@ -48,8 +48,8 @@ namespace cms { template typename host::impl::make_host_unique_selector::unbounded_array make_host_unique(size_t n, cudaStream_t stream) { using element_type = typename std::remove_extent::type; - static_assert(std::is_trivially_constructible::value, - "Allocating with non-trivial constructor on the pinned host memory is not supported"); + static_assert(std::is_trivially_copyable::value, + "Allocating with non-trivial copy on the pinned host memory is not supported"); void *mem = allocate_host(n * sizeof(element_type), stream); return typename host::impl::make_host_unique_selector::unbounded_array{reinterpret_cast(mem)}; } diff --git a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc index a9feabb92..2c71cdabf 100644 --- a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc +++ b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc @@ -3,17 +3,9 @@ #include "CUDACore/host_unique_ptr.h" #include "CUDADataFormats/SiPixelClustersCUDA.h" -SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream) - : moduleStart_d(cms::cuda::make_device_unique(maxModules + 1, stream)), - clusInModule_d(cms::cuda::make_device_unique(maxModules, stream)), - moduleId_d(cms::cuda::make_device_unique(maxModules, stream)), - clusModuleStart_d(cms::cuda::make_device_unique(maxModules + 1, stream)) { - auto view = cms::cuda::make_host_unique(stream); - view->moduleStart_ = moduleStart_d.get(); - view->clusInModule_ = clusInModule_d.get(); - view->moduleId_ = moduleId_d.get(); - view->clusModuleStart_ = clusModuleStart_d.get(); +SiPixelClustersCUDA::SiPixelClustersCUDA() : data_d(), deviceLayout_(data_d.get(), 0), deviceView_(deviceLayout_) {} - view_d = cms::cuda::make_device_unique(stream); - cms::cuda::copyAsync(view_d, view, stream); -} +SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream) + : data_d(cms::cuda::make_device_unique(DeviceLayout::computeDataSize(maxModules), stream)), + deviceLayout_(data_d.get(), maxModules), + deviceView_(deviceLayout_) {} diff --git a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h index e93b742cf..9f7451239 100644 --- a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h +++ b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h @@ -4,12 +4,49 @@ #include "CUDACore/device_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" #include "CUDACore/cudaCompat.h" +#include "DataFormats/SoALayout.h" +#include "DataFormats/SoAView.h" #include class SiPixelClustersCUDA { public: - SiPixelClustersCUDA() = default; + GENERATE_SOA_LAYOUT(DeviceLayoutTemplate, + SOA_COLUMN(uint32_t, 
moduleStart), // index of the first pixel of each module + SOA_COLUMN(uint32_t, clusInModule), // number of clusters found in each module + SOA_COLUMN(uint32_t, moduleId), // module id of each module + + // originally from rechits + SOA_COLUMN(uint32_t, clusModuleStart)) // index of the first cluster of each module + + // We use all defaults for the template parameters. + using DeviceLayout = DeviceLayoutTemplate<>; + + GENERATE_SOA_VIEW( + DeviceViewTemplate, + SOA_VIEW_LAYOUT_LIST(SOA_VIEW_LAYOUT(DeviceLayout, deviceLayout)), + SOA_VIEW_VALUE_LIST(SOA_VIEW_VALUE(deviceLayout, moduleStart), // index of the first pixel of each module + SOA_VIEW_VALUE(deviceLayout, clusInModule), // number of clusters found in each module + SOA_VIEW_VALUE(deviceLayout, moduleId), // module id of each module + + // originally from rechits + SOA_VIEW_VALUE(deviceLayout, clusModuleStart))) // index of the first cluster of each module + + using DeviceView = DeviceViewTemplate<>; + + GENERATE_SOA_CONST_VIEW( + DeviceConstViewTemplate, + SOA_VIEW_LAYOUT_LIST(SOA_VIEW_LAYOUT(DeviceView, deviceView)), + SOA_VIEW_VALUE_LIST(SOA_VIEW_VALUE(deviceView, moduleStart), // index of the first pixel of each module + SOA_VIEW_VALUE(deviceView, clusInModule), // number of clusters found in each module + SOA_VIEW_VALUE(deviceView, moduleId), // module id of each module + + // originally from rechits + SOA_VIEW_VALUE(deviceView, clusModuleStart))) // index of the first cluster of each module + + using DeviceConstView = DeviceConstViewTemplate<>; + + explicit SiPixelClustersCUDA(); explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream); ~SiPixelClustersCUDA() = default; @@ -22,40 +59,22 @@ class SiPixelClustersCUDA { uint32_t nClusters() const { return nClusters_h; } - uint32_t *moduleStart() { return moduleStart_d.get(); } - uint32_t *clusInModule() { return clusInModule_d.get(); } - uint32_t *moduleId() { return moduleId_d.get(); } - uint32_t *clusModuleStart() { return clusModuleStart_d.get(); } - - uint32_t const *moduleStart() const { return moduleStart_d.get(); } - uint32_t const *clusInModule() const { return clusInModule_d.get(); } - uint32_t const *moduleId() const { return moduleId_d.get(); } - uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); } + uint32_t *moduleStart() { return deviceView_.moduleStart(); } + uint32_t *clusInModule() { return deviceView_.clusInModule(); } + uint32_t *moduleId() { return deviceView_.moduleId(); } + uint32_t *clusModuleStart() { return deviceView_.clusModuleStart(); } - class DeviceConstView { - public: - __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_ + i); } - __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_ + i); } - __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_ + i); } - __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_ + i); } + uint32_t const *moduleStart() const { return deviceView_.moduleStart(); } + uint32_t const *clusInModule() const { return deviceView_.clusInModule(); } + uint32_t const *moduleId() const { return deviceView_.moduleId(); } + uint32_t const *clusModuleStart() const { return deviceView_.clusModuleStart(); } - uint32_t const *moduleStart_; - uint32_t const *clusInModule_; - uint32_t const *moduleId_; - uint32_t const *clusModuleStart_; - }; - - DeviceConstView *view() const { return view_d.get(); } + DeviceConstView view() const { return 
DeviceConstView(deviceView_); } private: - cms::cuda::device::unique_ptr moduleStart_d; // index of the first pixel of each module - cms::cuda::device::unique_ptr clusInModule_d; // number of clusters found in each module - cms::cuda::device::unique_ptr moduleId_d; // module id of each module - - // originally from rechits - cms::cuda::device::unique_ptr clusModuleStart_d; // index of the first cluster of each module - - cms::cuda::device::unique_ptr view_d; // "me" pointer + cms::cuda::device::unique_ptr data_d; // Single SoA storage + DeviceLayout deviceLayout_; + DeviceView deviceView_; uint32_t nClusters_h = 0; }; diff --git a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc index fd87fee56..55837fa92 100644 --- a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc +++ b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc @@ -5,44 +5,59 @@ #include "CUDACore/host_unique_ptr.h" SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) - : xx_d(cms::cuda::make_device_unique(maxFedWords, stream)), - yy_d(cms::cuda::make_device_unique(maxFedWords, stream)), - adc_d(cms::cuda::make_device_unique(maxFedWords, stream)), - moduleInd_d(cms::cuda::make_device_unique(maxFedWords, stream)), - clus_d(cms::cuda::make_device_unique(maxFedWords, stream)), - view_d(cms::cuda::make_device_unique(stream)), - pdigi_d(cms::cuda::make_device_unique(maxFedWords, stream)), - rawIdArr_d(cms::cuda::make_device_unique(maxFedWords, stream)) { - auto view = cms::cuda::make_host_unique(stream); - view->xx_ = xx_d.get(); - view->yy_ = yy_d.get(); - view->adc_ = adc_d.get(); - view->moduleInd_ = moduleInd_d.get(); - view->clus_ = clus_d.get(); - - cms::cuda::copyAsync(view_d, view, stream); -} + : data_d(cms::cuda::make_device_unique( + DeviceOnlyLayout::computeDataSize(maxFedWords) + HostDeviceLayout::computeDataSize(maxFedWords), stream)), + deviceOnlyLayout_d(data_d.get(), maxFedWords), + hostDeviceLayout_d(deviceOnlyLayout_d.soaMetadata().nextByte(), maxFedWords), + deviceFullView_(deviceOnlyLayout_d, hostDeviceLayout_d), + devicePixelConstView_(deviceFullView_) {} -cms::cuda::host::unique_ptr SiPixelDigisCUDA::adcToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(nDigis(), stream); - cms::cuda::copyAsync(ret, adc_d, nDigis(), stream); - return ret; -} +SiPixelDigisCUDA::SiPixelDigisCUDA() + : data_d(), deviceOnlyLayout_d(), hostDeviceLayout_d(), deviceFullView_(), devicePixelConstView_() {} -cms::cuda::host::unique_ptr SiPixelDigisCUDA::clusToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(nDigis(), stream); - cms::cuda::copyAsync(ret, clus_d, nDigis(), stream); - return ret; +SiPixelDigisCUDA::HostStore::HostStore() : data_h(), hostLayout_(nullptr, 0), hostView_(hostLayout_) {} + +SiPixelDigisCUDA::HostStore::HostStore(size_t maxFedWords, cudaStream_t stream) + : data_h(cms::cuda::make_host_unique(SiPixelDigisCUDA::HostDeviceLayout::computeDataSize(maxFedWords), + stream)), + hostLayout_(data_h.get(), maxFedWords), + hostView_(hostLayout_) {} + +void SiPixelDigisCUDA::HostStore::reset() { + hostLayout_ = HostDeviceLayout(); + hostView_ = HostDeviceView(hostLayout_); + data_h.reset(); } -cms::cuda::host::unique_ptr SiPixelDigisCUDA::pdigiToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(nDigis(), stream); - cms::cuda::copyAsync(ret, pdigi_d, nDigis(), stream); +cms::cuda::host::unique_ptr SiPixelDigisCUDA::adcToHostAsync(cudaStream_t stream) const { + auto ret 
= cms::cuda::make_host_unique(nDigis(), stream); + // TODO: this is downgraded from cms::cuda::copyAsync as we copy data from within a block but not the full block. + cudaCheck(cudaMemcpyAsync( + ret.get(), deviceFullView_.adc(), nDigis() * sizeof(decltype(ret[0])), cudaMemcpyDeviceToHost, stream)); return ret; } -cms::cuda::host::unique_ptr SiPixelDigisCUDA::rawIdArrToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(nDigis(), stream); - cms::cuda::copyAsync(ret, rawIdArr_d, nDigis(), stream); +SiPixelDigisCUDA::HostStore SiPixelDigisCUDA::dataToHostAsync(cudaStream_t stream) const { + // Allocate only the needed space and build the compact data in place in host memory (from the larger device memory). + // Due to the compaction with the 2D copy, we need to know the precise geometry, and hence operate on the store (as opposed + // to the view, which is unaware of the column pitches). + HostStore ret(nDigis(), stream); + auto rhlsm = ret.hostLayout_.soaMetadata(); + auto hdlsm_d = hostDeviceLayout_d.soaMetadata(); + cudaCheck(cudaMemcpyAsync(rhlsm.addressOf_adc(), + hdlsm_d.addressOf_adc(), + nDigis_h * sizeof(*rhlsm.addressOf_adc()), + cudaMemcpyDeviceToHost, + stream)); + // Copy the other columns, realigning the data in shorter arrays. clus is the first, but all 3 columns (clus, pdigi, rawIdArr) have + // the same geometry. + cudaCheck(cudaMemcpy2DAsync(rhlsm.addressOf_clus(), + rhlsm.clusPitch(), + hdlsm_d.addressOf_clus(), + hdlsm_d.clusPitch(), + nDigis() * sizeof(decltype(*ret.hostView_.clus())) /* width in bytes */, + 3 /* rows */, + cudaMemcpyDeviceToHost, + stream)); return ret; -} +} \ No newline at end of file diff --git a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.h b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.h index 03ae6639a..734b3631b 100644 --- a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.h +++ b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.h @@ -6,10 +6,79 @@ #include "CUDACore/cudaCompat.h" #include "CUDACore/device_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" +#include "DataFormats/SoALayout.h" +#include "DataFormats/SoAView.h" class SiPixelDigisCUDA { public: - SiPixelDigisCUDA() = default; + GENERATE_SOA_LAYOUT( + DeviceOnlyLayoutTemplate, + /* These are consumed by downstream device code */ + SOA_COLUMN(uint16_t, xx), /* local coordinates of each pixel */ + SOA_COLUMN(uint16_t, yy), /* */ + SOA_COLUMN(uint16_t, moduleInd) /* module id of each pixel */ + ) + + using DeviceOnlyLayout = DeviceOnlyLayoutTemplate<>; + + GENERATE_SOA_LAYOUT( + HostDeviceLayoutTemplate, + /* These are also transferred to host (see HostDeviceView) */ + SOA_COLUMN(uint16_t, adc), /* ADC of each pixel */ + SOA_COLUMN(int32_t, clus), /* cluster id of each pixel */ + /* These are for CPU output; should we (eventually) place them to a */ + /* separate product? 
*/ + SOA_COLUMN(uint32_t, pdigi), /* packed digi (row, col, adc) of each pixel */ + SOA_COLUMN(uint32_t, rawIdArr) /* DetId of each pixel */ + ) + + using HostDeviceLayout = HostDeviceLayoutTemplate<>; + + GENERATE_SOA_VIEW(HostDeviceViewTemplate, + SOA_VIEW_LAYOUT_LIST(SOA_VIEW_LAYOUT(HostDeviceLayout, hostDevice)), + SOA_VIEW_VALUE_LIST( + SOA_VIEW_VALUE(hostDevice, adc), /* ADC of each pixel */ + SOA_VIEW_VALUE(hostDevice, clus), /* cluster id of each pixel */ + SOA_VIEW_VALUE(hostDevice, pdigi), /* packed digi (row, col, adc) of each pixel */ + SOA_VIEW_VALUE(hostDevice, + rawIdArr) /* DetId of each pixel */ + )) + + using HostDeviceView = HostDeviceViewTemplate<>; + + GENERATE_SOA_VIEW( + DeviceFullViewTemplate, + SOA_VIEW_LAYOUT_LIST(SOA_VIEW_LAYOUT(DeviceOnlyLayout, deviceOnly), + SOA_VIEW_LAYOUT(HostDeviceLayout, hostDevice)), + SOA_VIEW_VALUE_LIST( + SOA_VIEW_VALUE(deviceOnly, xx), /* local coordinates of each pixel */ + SOA_VIEW_VALUE(deviceOnly, yy), /* */ + SOA_VIEW_VALUE(deviceOnly, moduleInd), /* module id of each pixel */ + SOA_VIEW_VALUE(hostDevice, adc), /* ADC of each pixel */ + SOA_VIEW_VALUE(hostDevice, clus), /* cluster id of each pixel */ + SOA_VIEW_VALUE(hostDevice, pdigi), /* packed digi (row, col, adc) of each pixel */ + SOA_VIEW_VALUE(hostDevice, rawIdArr) /* DetId of each pixel */ + )) + + using DeviceFullView = DeviceFullViewTemplate<>; + + /* Device pixel view: this is a second generation view (view from view) */ + GENERATE_SOA_CONST_VIEW( + DevicePixelConstViewTemplate, + /* We get out data from the DeviceFullView */ + SOA_VIEW_LAYOUT_LIST(SOA_VIEW_LAYOUT(DeviceFullView, deviceFullView)), + /* These are consumed by downstream device code */ + SOA_VIEW_VALUE_LIST( + SOA_VIEW_VALUE(deviceFullView, xx), /* local coordinates of each pixel */ + SOA_VIEW_VALUE(deviceFullView, yy), /* */ + SOA_VIEW_VALUE(deviceFullView, moduleInd), /* module id of each pixel */ + SOA_VIEW_VALUE(deviceFullView, adc), /* ADC of each pixel */ + SOA_VIEW_VALUE(deviceFullView, clus) /* cluster id of each pixel */ + )) + + using DevicePixelConstView = DevicePixelConstViewTemplate<>; + + explicit SiPixelDigisCUDA(); explicit SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream); ~SiPixelDigisCUDA() = default; @@ -26,60 +95,52 @@ class SiPixelDigisCUDA { uint32_t nModules() const { return nModules_h; } uint32_t nDigis() const { return nDigis_h; } - uint16_t *xx() { return xx_d.get(); } - uint16_t *yy() { return yy_d.get(); } - uint16_t *adc() { return adc_d.get(); } - uint16_t *moduleInd() { return moduleInd_d.get(); } - int32_t *clus() { return clus_d.get(); } - uint32_t *pdigi() { return pdigi_d.get(); } - uint32_t *rawIdArr() { return rawIdArr_d.get(); } - - uint16_t const *xx() const { return xx_d.get(); } - uint16_t const *yy() const { return yy_d.get(); } - uint16_t const *adc() const { return adc_d.get(); } - uint16_t const *moduleInd() const { return moduleInd_d.get(); } - int32_t const *clus() const { return clus_d.get(); } - uint32_t const *pdigi() const { return pdigi_d.get(); } - uint32_t const *rawIdArr() const { return rawIdArr_d.get(); } + uint16_t *xx() { return deviceFullView_.xx(); } + uint16_t *yy() { return deviceFullView_.yy(); } + uint16_t *adc() { return deviceFullView_.adc(); } + uint16_t *moduleInd() { return deviceFullView_.moduleInd(); } + int32_t *clus() { return deviceFullView_.clus(); } + uint32_t *pdigi() { return deviceFullView_.pdigi(); } + uint32_t *rawIdArr() { return deviceFullView_.rawIdArr(); } + + uint16_t const *xx() const { return 
deviceFullView_.xx(); } + uint16_t const *yy() const { return deviceFullView_.yy(); } + uint16_t const *adc() const { return deviceFullView_.adc(); } + uint16_t const *moduleInd() const { return deviceFullView_.moduleInd(); } + int32_t const *clus() const { return deviceFullView_.clus(); } + uint32_t const *pdigi() const { return deviceFullView_.pdigi(); } + uint32_t const *rawIdArr() const { return deviceFullView_.rawIdArr(); } + + class HostStore { + friend SiPixelDigisCUDA; - cms::cuda::host::unique_ptr adcToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr clusToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr pdigiToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr rawIdArrToHostAsync(cudaStream_t stream) const; - - class DeviceConstView { public: - __device__ __forceinline__ uint16_t xx(int i) const { return __ldg(xx_ + i); } - __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_ + i); } - __device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_ + i); } - __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_ + i); } - __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_ + i); } - - uint16_t const *xx_; - uint16_t const *yy_; - uint16_t const *adc_; - uint16_t const *moduleInd_; - int32_t const *clus_; + HostStore(); + const SiPixelDigisCUDA::HostDeviceView view() { return hostView_; } + void reset(); + + private: + HostStore(size_t maxFedWords, cudaStream_t stream); + cms::cuda::host::unique_ptr data_h; + HostDeviceLayout hostLayout_; + HostDeviceView hostView_; }; + HostStore dataToHostAsync(cudaStream_t stream) const; - const DeviceConstView *view() const { return view_d.get(); } + // Special copy for validation + cms::cuda::host::unique_ptr adcToHostAsync(cudaStream_t stream) const; + + const DevicePixelConstView &pixelConstView() const { return devicePixelConstView_; } private: // These are consumed by downstream device code - cms::cuda::device::unique_ptr xx_d; // local coordinates of each pixel - cms::cuda::device::unique_ptr yy_d; // - cms::cuda::device::unique_ptr adc_d; // ADC of each pixel - cms::cuda::device::unique_ptr moduleInd_d; // module id of each pixel - cms::cuda::device::unique_ptr clus_d; // cluster id of each pixel - cms::cuda::device::unique_ptr view_d; // "me" pointer - - // These are for CPU output; should we (eventually) place them to a - // separate product? 
- cms::cuda::device::unique_ptr pdigi_d; // packed digi (row, col, adc) of each pixel - cms::cuda::device::unique_ptr rawIdArr_d; // DetId of each pixel - + cms::cuda::device::unique_ptr data_d; // Single SoA storage + DeviceOnlyLayout deviceOnlyLayout_d; + HostDeviceLayout hostDeviceLayout_d; + DeviceFullView deviceFullView_; + DevicePixelConstView devicePixelConstView_; uint32_t nModules_h = 0; uint32_t nDigis_h = 0; }; -#endif // CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h +#endif // CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h \ No newline at end of file diff --git a/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.cc b/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.cc index 5c1aacaf4..a805b3fa0 100644 --- a/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.cc +++ b/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.cc @@ -3,42 +3,12 @@ #include "CUDACore/device_unique_ptr.h" #include "CUDACore/host_unique_ptr.h" #include "CUDADataFormats/TrackingRecHit2DHeterogeneous.h" +#include "CondFormats/SiPixelROCsStatusAndMapping.h" template <> -cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::localCoordToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(4 * nHits(), stream); - cms::cuda::copyAsync(ret, m_store32, 4 * nHits(), stream); +TrackingRecHit2DHostSOAStore TrackingRecHit2DCUDA::hitsToHostAsync(cudaStream_t stream) const { + // copy xl, yl, xerr, yerr, xg, yg, zg,rg, charge, clusterSizeX, clusterSizeY. + TrackingRecHit2DHostSOAStore ret(nHits(), stream); + cms::cuda::copyAsync(ret.hits_h, m_hitsSupportLayerStartStore, ret.hitsLayout_.soaMetadata().byteSize(), stream); return ret; -} - -template <> -cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(gpuClustering::maxNumModules + 1, stream); - cudaCheck(cudaMemcpyAsync( - ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (gpuClustering::maxNumModules + 1), cudaMemcpyDefault, stream)); - return ret; -} - -template <> -cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::globalCoordToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(4 * nHits(), stream); - cudaCheck(cudaMemcpyAsync( - ret.get(), m_store32.get() + 4 * nHits(), 4 * nHits() * sizeof(float), cudaMemcpyDefault, stream)); - return ret; -} - -template <> -cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::chargeToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(nHits(), stream); - cudaCheck( - cudaMemcpyAsync(ret.get(), m_store32.get() + 8 * nHits(), nHits() * sizeof(int32_t), cudaMemcpyDefault, stream)); - return ret; -} - -template <> -cms::cuda::host::unique_ptr TrackingRecHit2DCUDA::sizeToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(2 * nHits(), stream); - cudaCheck(cudaMemcpyAsync( - ret.get(), m_store16.get() + 2 * nHits(), 2 * nHits() * sizeof(int16_t), cudaMemcpyDefault, stream)); - return ret; -} +} \ No newline at end of file diff --git a/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.h b/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.h index 7a19299a9..5294328a8 100644 --- a/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.h +++ b/src/cudadev/CUDADataFormats/TrackingRecHit2DHeterogeneous.h @@ -3,6 +3,7 @@ #include "CUDADataFormats/TrackingRecHit2DSOAView.h" #include "CUDADataFormats/HeterogeneousSoA.h" +#include "CUDADataFormats/TrackingRecHit2DHostSOAStore.h" 
template class TrackingRecHit2DHeterogeneous { @@ -10,7 +11,7 @@ class TrackingRecHit2DHeterogeneous { template using unique_ptr = typename Traits::template unique_ptr; - using PhiBinner = TrackingRecHit2DSOAView::PhiBinner; + using PhiBinner = TrackingRecHit2DSOAStore::PhiBinner; TrackingRecHit2DHeterogeneous() = default; @@ -26,8 +27,8 @@ class TrackingRecHit2DHeterogeneous { TrackingRecHit2DHeterogeneous(TrackingRecHit2DHeterogeneous&&) = default; TrackingRecHit2DHeterogeneous& operator=(TrackingRecHit2DHeterogeneous&&) = default; - TrackingRecHit2DSOAView* view() { return m_view.get(); } - TrackingRecHit2DSOAView const* view() const { return m_view.get(); } + TrackingRecHit2DSOAStore* store() { return m_store.get(); } + TrackingRecHit2DSOAStore const* store() const { return m_store.get(); } auto nHits() const { return m_nHits; } @@ -37,30 +38,24 @@ class TrackingRecHit2DHeterogeneous { auto phiBinnerStorage() { return m_phiBinnerStorage; } auto iphi() { return m_iphi; } - // only the local coord and detector index - cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; + // Transfer the local and global coordinates, charge and size + TrackingRecHit2DHostSOAStore hitsToHostAsync(cudaStream_t stream) const; - // for validation - cms::cuda::host::unique_ptr globalCoordToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr chargeToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr sizeToHostAsync(cudaStream_t stream) const; + // apparently unused + //cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; private: - static constexpr uint32_t n16 = 4; // number of elements in m_store16 - static constexpr uint32_t n32 = 10; // number of elements in m_store32 static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious - unique_ptr m_store16; //! - unique_ptr m_store32; //! + unique_ptr m_PhiBinnerStore; //! + unique_ptr m_AverageGeometryStore; //! - unique_ptr m_PhiBinnerStore; //! - unique_ptr m_AverageGeometryStore; //! - - unique_ptr m_view; //! + unique_ptr m_store; //! uint32_t m_nHits; + unique_ptr m_hitsSupportLayerStartStore; //! + uint32_t const* m_hitsModuleStart; // needed for legacy, this is on GPU! // needed as kernel params... 
@@ -79,21 +74,21 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nH uint32_t const* hitsModuleStart, cudaStream_t stream) : m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) { - auto view = Traits::template make_host_unique(stream); + auto store = Traits::template make_host_unique(stream); - view->m_nHits = nHits; - m_view = Traits::template make_device_unique(stream); - m_AverageGeometryStore = Traits::template make_device_unique(stream); - view->m_averageGeometry = m_AverageGeometryStore.get(); - view->m_cpeParams = cpeParams; - view->m_hitsModuleStart = hitsModuleStart; + store->m_nHits = nHits; + m_store = Traits::template make_device_unique(stream); + m_AverageGeometryStore = Traits::template make_device_unique(stream); + store->m_averageGeometry = m_AverageGeometryStore.get(); + store->m_cpeParams = cpeParams; + store->m_hitsModuleStart = hitsModuleStart; - // if empy do not bother + // if empty do not bother if (0 == nHits) { if constexpr (std::is_same::value) { - cms::cuda::copyAsync(m_view, view, stream); + cms::cuda::copyAsync(m_store, store, stream); } else { - m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version + m_store.reset(store.release()); // NOLINT: std::move() breaks CUDA version } return; } @@ -103,46 +98,49 @@ TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nH // if ordering is relevant they may have to be stored phi-ordered by layer or so // this will break 1to1 correspondence with cluster and module locality // so unless proven VERY inefficient we keep it ordered as generated - m_store16 = Traits::template make_device_unique(nHits * n16, stream); - m_store32 = - Traits::template make_device_unique(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, stream); - m_PhiBinnerStore = Traits::template make_device_unique(stream); - - static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); - static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type)); + //m_store16 = Traits::template make_device_unique(nHits * n16, stream); + //m_store32 = + // Traits::template make_device_unique(nHits * n32 + phase1PixelTopology::numberOfLayers + 1, stream); + // We need to store all SoA rows for TrackingRecHit2DSOAView::HitsView(nHits) + + // (phase1PixelTopology::numberOfLayers + 1) TrackingRecHit2DSOAView::PhiBinner::index_type. + // As mentioned above, alignment is not important, yet we want to have 32 bits + // (TrackingRecHit2DSOAView::PhiBinner::index_type exactly) alignement for the second part. + // In order to simplify code, we align all to the minimum necessary size (sizeof(TrackingRecHit2DSOAStore::PhiBinner::index_type)). 
+ { + // Simplify a bit following computations + const size_t phiBinnerByteSize = + (phase1PixelTopology::numberOfLayers + 1) * sizeof(TrackingRecHit2DSOAStore::PhiBinner::index_type); + // Allocate the buffer + m_hitsSupportLayerStartStore = Traits::template make_device_unique( + TrackingRecHit2DSOAStore::HitsLayout::computeDataSize(m_nHits) + + TrackingRecHit2DSOAStore::SupportObjectsLayout::computeDataSize(m_nHits) + phiBinnerByteSize, + stream); + // Split the buffer in stores and array + store->m_hitsLayout = TrackingRecHit2DSOAStore::HitsLayout(m_hitsSupportLayerStartStore.get(), nHits); + store->m_supportObjectsLayout = + TrackingRecHit2DSOAStore::SupportObjectsLayout(store->m_hitsLayout.soaMetadata().nextByte(), nHits); + m_hitsLayerStart = store->m_hitsLayerStart = + reinterpret_cast(store->m_supportObjectsLayout.soaMetadata().nextByte()); + // Record additional references + store->m_hitsAndSupportView = + TrackingRecHit2DSOAStore::HitsAndSupportView(store->m_hitsLayout, store->m_supportObjectsLayout); + m_phiBinnerStorage = store->m_phiBinnerStorage = store->m_hitsAndSupportView.phiBinnerStorage(); + m_iphi = store->m_hitsAndSupportView.iphi(); + } + m_PhiBinnerStore = Traits::template make_device_unique(stream); - auto get16 = [&](int i) { return m_store16.get() + i * nHits; }; - auto get32 = [&](int i) { return m_store32.get() + i * nHits; }; + static_assert(sizeof(TrackingRecHit2DSOAStore::hindex_type) == sizeof(float)); + static_assert(sizeof(TrackingRecHit2DSOAStore::hindex_type) == + sizeof(TrackingRecHit2DSOAStore::PhiBinner::index_type)); // copy all the pointers - m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); - m_phiBinnerStorage = view->m_phiBinnerStorage = - reinterpret_cast(get32(9)); - - view->m_xl = get32(0); - view->m_yl = get32(1); - view->m_xerr = get32(2); - view->m_yerr = get32(3); - - view->m_xg = get32(4); - view->m_yg = get32(5); - view->m_zg = get32(6); - view->m_rg = get32(7); - - m_iphi = view->m_iphi = reinterpret_cast(get16(0)); - - view->m_charge = reinterpret_cast(get32(8)); - view->m_xsize = reinterpret_cast(get16(2)); - view->m_ysize = reinterpret_cast(get16(3)); - view->m_detInd = get16(1); - - m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast(get32(n32)); + m_phiBinner = store->m_phiBinner = m_PhiBinnerStore.get(); // transfer view if constexpr (std::is_same::value) { - cms::cuda::copyAsync(m_view, view, stream); + cms::cuda::copyAsync(m_store, store, stream); } else { - m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version + m_store.reset(store.release()); // NOLINT: std::move() breaks CUDA version } } @@ -151,4 +149,4 @@ using TrackingRecHit2DCUDA = TrackingRecHit2DHeterogeneous; using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous; -#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h +#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h \ No newline at end of file diff --git a/src/cudadev/CUDADataFormats/TrackingRecHit2DHostSOAStore.cc b/src/cudadev/CUDADataFormats/TrackingRecHit2DHostSOAStore.cc new file mode 100644 index 000000000..6e2338f6c --- /dev/null +++ b/src/cudadev/CUDADataFormats/TrackingRecHit2DHostSOAStore.cc @@ -0,0 +1,15 @@ +#include "CUDADataFormats/TrackingRecHit2DHostSOAStore.h" + +TrackingRecHit2DHostSOAStore::TrackingRecHit2DHostSOAStore() + : hitsLayout_(hits_h.get(), 0 /* size */, 1 /* byte alignement */) {} + +void TrackingRecHit2DHostSOAStore::reset() { + hits_h.reset(); + hitsLayout_ = 
TrackingRecHit2DSOAStore::HitsLayout(); +} + +TrackingRecHit2DHostSOAStore::TrackingRecHit2DHostSOAStore(size_t size, cudaStream_t stream) + : hits_h(cms::cuda::make_host_unique(TrackingRecHit2DSOAStore::HitsLayout::computeDataSize(size), + stream)), + hitsLayout_(hits_h.get(), size), + hitsView_(hitsLayout_) {} diff --git a/src/cudadev/CUDADataFormats/TrackingRecHit2DHostSOAStore.h b/src/cudadev/CUDADataFormats/TrackingRecHit2DHostSOAStore.h new file mode 100644 index 000000000..e587932d4 --- /dev/null +++ b/src/cudadev/CUDADataFormats/TrackingRecHit2DHostSOAStore.h @@ -0,0 +1,30 @@ + +#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAHostStore_h +#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAHostStore_h + +#include "CUDADataFormats/TrackingRecHit2DSOAView.h" +#include "CUDACore/host_unique_ptr.h" + +template +class TrackingRecHit2DHeterogeneous; + +struct TrackingRecHit2DHostSOAStore { + template + friend class TrackingRecHit2DHeterogeneous; + +public: + TrackingRecHit2DHostSOAStore(); + void reset(); + __device__ __forceinline__ const auto operator[](size_t i) const { return hitsView_[i]; } + __device__ __forceinline__ size_t size() { + return /* TODO: move to view when view will embed size */ hitsLayout_.soaMetadata().size(); + } + +private: + TrackingRecHit2DHostSOAStore(size_t size, cudaStream_t stream); + cms::cuda::host::unique_ptr hits_h; + TrackingRecHit2DSOAStore::HitsLayout hitsLayout_; + TrackingRecHit2DSOAStore::HitsView hitsView_; +}; + +#endif // ndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAHostStore_h \ No newline at end of file diff --git a/src/cudadev/CUDADataFormats/TrackingRecHit2DSOAView.h b/src/cudadev/CUDADataFormats/TrackingRecHit2DSOAView.h index 5638fcd59..7ee03c1ea 100644 --- a/src/cudadev/CUDADataFormats/TrackingRecHit2DSOAView.h +++ b/src/cudadev/CUDADataFormats/TrackingRecHit2DSOAView.h @@ -7,12 +7,14 @@ #include "CUDACore/HistoContainer.h" #include "CUDACore/cudaCompat.h" #include "Geometry/phase1PixelTopology.h" +#include "DataFormats/SoALayout.h" +#include "DataFormats/SoAView.h" namespace pixelCPEforGPU { struct ParamsOnGPU; } -class TrackingRecHit2DSOAView { +class TrackingRecHit2DSOAStore { public: using hindex_type = uint32_t; // if above is <=2^32 @@ -25,36 +27,99 @@ class TrackingRecHit2DSOAView { __device__ __forceinline__ uint32_t nHits() const { return m_nHits; } - __device__ __forceinline__ float& xLocal(int i) { return m_xl[i]; } - __device__ __forceinline__ float xLocal(int i) const { return __ldg(m_xl + i); } - __device__ __forceinline__ float& yLocal(int i) { return m_yl[i]; } - __device__ __forceinline__ float yLocal(int i) const { return __ldg(m_yl + i); } - - __device__ __forceinline__ float& xerrLocal(int i) { return m_xerr[i]; } - __device__ __forceinline__ float xerrLocal(int i) const { return __ldg(m_xerr + i); } - __device__ __forceinline__ float& yerrLocal(int i) { return m_yerr[i]; } - __device__ __forceinline__ float yerrLocal(int i) const { return __ldg(m_yerr + i); } - - __device__ __forceinline__ float& xGlobal(int i) { return m_xg[i]; } - __device__ __forceinline__ float xGlobal(int i) const { return __ldg(m_xg + i); } - __device__ __forceinline__ float& yGlobal(int i) { return m_yg[i]; } - __device__ __forceinline__ float yGlobal(int i) const { return __ldg(m_yg + i); } - __device__ __forceinline__ float& zGlobal(int i) { return m_zg[i]; } - __device__ __forceinline__ float zGlobal(int i) const { return __ldg(m_zg + i); } - __device__ __forceinline__ float& 
rGlobal(int i) { return m_rg[i]; } - __device__ __forceinline__ float rGlobal(int i) const { return __ldg(m_rg + i); } - - __device__ __forceinline__ int16_t& iphi(int i) { return m_iphi[i]; } - __device__ __forceinline__ int16_t iphi(int i) const { return __ldg(m_iphi + i); } - - __device__ __forceinline__ int32_t& charge(int i) { return m_charge[i]; } - __device__ __forceinline__ int32_t charge(int i) const { return __ldg(m_charge + i); } - __device__ __forceinline__ int16_t& clusterSizeX(int i) { return m_xsize[i]; } - __device__ __forceinline__ int16_t clusterSizeX(int i) const { return __ldg(m_xsize + i); } - __device__ __forceinline__ int16_t& clusterSizeY(int i) { return m_ysize[i]; } - __device__ __forceinline__ int16_t clusterSizeY(int i) const { return __ldg(m_ysize + i); } - __device__ __forceinline__ uint16_t& detectorIndex(int i) { return m_detInd[i]; } - __device__ __forceinline__ uint16_t detectorIndex(int i) const { return __ldg(m_detInd + i); } + // Our arrays do not require specific alignment as access will not be coalesced in the current implementation. + // Still, we need the 32 bits integers to be aligned, so we simply declare the SoA with the 32 bits fields first + // and the 16 bits behind (as they have a looser alignment requirement). The SoA can then be created with a byte + // alignment of 1. + GENERATE_SOA_LAYOUT(HitsLayoutTemplate, + // 32 bits section + // local coord + SOA_COLUMN(float, xLocal), + SOA_COLUMN(float, yLocal), + SOA_COLUMN(float, xerrLocal), + SOA_COLUMN(float, yerrLocal), + + // global coord + SOA_COLUMN(float, xGlobal), + SOA_COLUMN(float, yGlobal), + SOA_COLUMN(float, zGlobal), + SOA_COLUMN(float, rGlobal), + // global coordinates continue in the 16 bits section + + // cluster properties + SOA_COLUMN(int32_t, charge), + + // 16 bits section (and cluster properties immediately continued) + SOA_COLUMN(int16_t, clusterSizeX), + SOA_COLUMN(int16_t, clusterSizeY)) + + // The hits layout does not use the default alignment but a more relaxed one. + using HitsLayout = HitsLayoutTemplate; + + GENERATE_SOA_VIEW(HitsViewTemplate, + SOA_VIEW_LAYOUT_LIST(SOA_VIEW_LAYOUT(HitsLayout, hitsLayout)), + SOA_VIEW_VALUE_LIST(SOA_VIEW_VALUE(hitsLayout, xLocal), + SOA_VIEW_VALUE(hitsLayout, yLocal), + SOA_VIEW_VALUE(hitsLayout, xerrLocal), + SOA_VIEW_VALUE(hitsLayout, yerrLocal), + + SOA_VIEW_VALUE(hitsLayout, xGlobal), + SOA_VIEW_VALUE(hitsLayout, yGlobal), + SOA_VIEW_VALUE(hitsLayout, zGlobal), + SOA_VIEW_VALUE(hitsLayout, rGlobal), + + SOA_VIEW_VALUE(hitsLayout, charge), + SOA_VIEW_VALUE(hitsLayout, clusterSizeX), + SOA_VIEW_VALUE(hitsLayout, clusterSizeY))) + + using HitsView = HitsViewTemplate<>; + + GENERATE_SOA_LAYOUT(SupportObjectsLayoutTemplate, + // This is the end of the data which is transferred to host. The following columns are supporting + // objects, not transmitted. + + // Supporting data (32 bits aligned) + SOA_COLUMN(TrackingRecHit2DSOAStore::PhiBinner::index_type, phiBinnerStorage), + + // global coordinates (not transmitted) + SOA_COLUMN(int16_t, iphi), + + // cluster properties (not transmitted) + SOA_COLUMN(uint16_t, detectorIndex)) + + // The support objects layout also does not use the default alignment but a more relaxed one. 
+ using SupportObjectsLayout = SupportObjectsLayoutTemplate; + + GENERATE_SOA_VIEW(HitsAndSupportViewTemplate, + SOA_VIEW_LAYOUT_LIST(SOA_VIEW_LAYOUT(HitsLayout, hitsLayout), + SOA_VIEW_LAYOUT(SupportObjectsLayout, supportObjectsLayout)), + SOA_VIEW_VALUE_LIST(SOA_VIEW_VALUE(hitsLayout, xLocal), + SOA_VIEW_VALUE(hitsLayout, yLocal), + SOA_VIEW_VALUE(hitsLayout, xerrLocal), + SOA_VIEW_VALUE(hitsLayout, yerrLocal), + + SOA_VIEW_VALUE(hitsLayout, xGlobal), + SOA_VIEW_VALUE(hitsLayout, yGlobal), + SOA_VIEW_VALUE(hitsLayout, zGlobal), + SOA_VIEW_VALUE(hitsLayout, rGlobal), + + SOA_VIEW_VALUE(hitsLayout, charge), + SOA_VIEW_VALUE(hitsLayout, clusterSizeX), + SOA_VIEW_VALUE(hitsLayout, clusterSizeY), + + SOA_VIEW_VALUE(supportObjectsLayout, phiBinnerStorage), + SOA_VIEW_VALUE(supportObjectsLayout, iphi), + SOA_VIEW_VALUE(supportObjectsLayout, detectorIndex))) + + using HitsAndSupportView = HitsAndSupportViewTemplate; + + // Shortcut operator saving the explicit calls to view in usage. + __device__ __forceinline__ HitsAndSupportView::element operator[](size_t index) { + return m_hitsAndSupportView[index]; + } + __device__ __forceinline__ HitsAndSupportView::const_element operator[](size_t index) const { + return m_hitsAndSupportView[index]; + } __device__ __forceinline__ pixelCPEforGPU::ParamsOnGPU const& cpeParams() const { return *m_cpeParams; } @@ -70,21 +135,14 @@ class TrackingRecHit2DSOAView { __device__ __forceinline__ AverageGeometry const& averageGeometry() const { return *m_averageGeometry; } private: - // local coord - float *m_xl, *m_yl; - float *m_xerr, *m_yerr; - - // global coord - float *m_xg, *m_yg, *m_zg, *m_rg; - int16_t* m_iphi; - - // cluster properties - int32_t* m_charge; - int16_t* m_xsize; - int16_t* m_ysize; - uint16_t* m_detInd; - - // supporting objects + // hits layout + HitsLayout m_hitsLayout; + // supporting objects layout + SupportObjectsLayout m_supportObjectsLayout; + // Global view simplifying usage + HitsAndSupportView m_hitsAndSupportView; + + // individually defined supporting objects // m_averageGeometry is corrected for beam spot, not sure where to host it otherwise AverageGeometry* m_averageGeometry; // owned by TrackingRecHit2DHeterogeneous pixelCPEforGPU::ParamsOnGPU const* m_cpeParams; // forwarded from setup, NOT owned @@ -98,4 +156,4 @@ class TrackingRecHit2DSOAView { uint32_t m_nHits; }; -#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h +#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h \ No newline at end of file diff --git a/src/cudadev/CondFormats/SiPixelROCsStatusAndMapping.h b/src/cudadev/CondFormats/SiPixelROCsStatusAndMapping.h index f7cd8dedc..af60caf9d 100644 --- a/src/cudadev/CondFormats/SiPixelROCsStatusAndMapping.h +++ b/src/cudadev/CondFormats/SiPixelROCsStatusAndMapping.h @@ -1,6 +1,9 @@ #ifndef CondFormats_SiPixelObjects_interface_SiPixelROCsStatusAndMapping_h #define CondFormats_SiPixelObjects_interface_SiPixelROCsStatusAndMapping_h +#include "DataFormats/SoALayout.h" +#include "DataFormats/SoAView.h" + namespace pixelgpudetails { // Maximum fed for phase1 is 150 but not all of them are filled // Update the number FED based on maximum fed found in the cabling map @@ -22,4 +25,32 @@ struct SiPixelROCsStatusAndMapping { alignas(128) unsigned int size = 0; }; +GENERATE_SOA_LAYOUT(SiPixelROCsStatusAndMappingLayoutTemplate, + SOA_COLUMN(unsigned int, fed), + SOA_COLUMN(unsigned int, link), + SOA_COLUMN(unsigned int, roc), + SOA_COLUMN(unsigned int, rawId), + SOA_COLUMN(unsigned int, 
rocInDet), + SOA_COLUMN(unsigned int, moduleId), + SOA_COLUMN(unsigned char, badRocs), + SOA_SCALAR(unsigned int, size)) + +using SiPixelROCsStatusAndMappingLayout = SiPixelROCsStatusAndMappingLayoutTemplate<>; + +GENERATE_SOA_CONST_VIEW(SiPixelROCsStatusAndMappingConstViewTemplate, + SOA_VIEW_LAYOUT_LIST(SOA_VIEW_LAYOUT(SiPixelROCsStatusAndMappingLayout, mappingLayout)), + SOA_VIEW_VALUE_LIST(SOA_VIEW_VALUE(mappingLayout, fed), + SOA_VIEW_VALUE(mappingLayout, link), + SOA_VIEW_VALUE(mappingLayout, roc), + SOA_VIEW_VALUE(mappingLayout, rawId), + SOA_VIEW_VALUE(mappingLayout, rocInDet), + SOA_VIEW_VALUE(mappingLayout, moduleId), + SOA_VIEW_VALUE(mappingLayout, badRocs), + SOA_VIEW_VALUE(mappingLayout, size))) + +// Slightly more complex than using, but allows forward declarations. +struct SiPixelROCsStatusAndMappingConstView : public SiPixelROCsStatusAndMappingConstViewTemplate<> { + using SiPixelROCsStatusAndMappingConstViewTemplate<>::SiPixelROCsStatusAndMappingConstViewTemplate; +}; + #endif // CondFormats_SiPixelObjects_interface_SiPixelROCsStatusAndMapping_h diff --git a/src/cudadev/CondFormats/SiPixelROCsStatusAndMappingWrapper.cc b/src/cudadev/CondFormats/SiPixelROCsStatusAndMappingWrapper.cc index 9201903db..c09ed1852 100644 --- a/src/cudadev/CondFormats/SiPixelROCsStatusAndMappingWrapper.cc +++ b/src/cudadev/CondFormats/SiPixelROCsStatusAndMappingWrapper.cc @@ -13,44 +13,38 @@ #include "CUDACore/host_unique_ptr.h" #include "CUDADataFormats/gpuClusteringConstants.h" #include "CondFormats/SiPixelROCsStatusAndMappingWrapper.h" +#include "CUDACore/copyAsync.h" SiPixelROCsStatusAndMappingWrapper::SiPixelROCsStatusAndMappingWrapper(SiPixelROCsStatusAndMapping const& cablingMap, std::vector modToUnp) : modToUnpDefault(modToUnp.size()), hasQuality_(true) { - cudaCheck(cudaMallocHost(&cablingMapHost, sizeof(SiPixelROCsStatusAndMapping))); - std::memcpy(cablingMapHost, &cablingMap, sizeof(SiPixelROCsStatusAndMapping)); - + // TODO: check if cudaStreamDefault is appropriate + cablingMapHost = cms::cuda::make_host_unique(cudaStreamDefault); + std::memcpy(cablingMapHost.get(), &cablingMap, sizeof(SiPixelROCsStatusAndMapping)); std::copy(modToUnp.begin(), modToUnp.end(), modToUnpDefault.begin()); } -SiPixelROCsStatusAndMappingWrapper::~SiPixelROCsStatusAndMappingWrapper() { cudaCheck(cudaFreeHost(cablingMapHost)); } - -const SiPixelROCsStatusAndMapping* SiPixelROCsStatusAndMappingWrapper::getGPUProductAsync( +SiPixelROCsStatusAndMappingConstView SiPixelROCsStatusAndMappingWrapper::getGPUProductAsync( cudaStream_t cudaStream) const { const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cudaStream_t stream) { // allocate - cudaCheck(cudaMalloc(&data.cablingMapDevice, sizeof(SiPixelROCsStatusAndMapping))); - + data.allocate(stream); // transfer - cudaCheck(cudaMemcpyAsync( - data.cablingMapDevice, this->cablingMapHost, sizeof(SiPixelROCsStatusAndMapping), cudaMemcpyDefault, stream)); + cms::cuda::copyAsync(data.cablingMapDevice, this->cablingMapHost, stream); }); - return data.cablingMapDevice; + return data.cablingMapDeviceView; } const unsigned char* SiPixelROCsStatusAndMappingWrapper::getModToUnpAllAsync(cudaStream_t cudaStream) const { const auto& data = modToUnp_.dataForCurrentDeviceAsync(cudaStream, [this](ModulesToUnpack& data, cudaStream_t stream) { - cudaCheck(cudaMalloc((void**)&data.modToUnpDefault, pixelgpudetails::MAX_SIZE_BYTE_BOOL)); - cudaCheck(cudaMemcpyAsync(data.modToUnpDefault, + data.modToUnpDefault = + 
cms::cuda::make_device_unique(pixelgpudetails::MAX_SIZE_BYTE_BOOL, stream); + cudaCheck(cudaMemcpyAsync(data.modToUnpDefault.get(), this->modToUnpDefault.data(), this->modToUnpDefault.size() * sizeof(unsigned char), cudaMemcpyDefault, stream)); }); - return data.modToUnpDefault; -} - -SiPixelROCsStatusAndMappingWrapper::GPUData::~GPUData() { cudaCheck(cudaFree(cablingMapDevice)); } - -SiPixelROCsStatusAndMappingWrapper::ModulesToUnpack::~ModulesToUnpack() { cudaCheck(cudaFree(modToUnpDefault)); } + return data.modToUnpDefault.get(); +} \ No newline at end of file diff --git a/src/cudadev/CondFormats/SiPixelROCsStatusAndMappingWrapper.h b/src/cudadev/CondFormats/SiPixelROCsStatusAndMappingWrapper.h index 8917a35cc..120ce4d29 100644 --- a/src/cudadev/CondFormats/SiPixelROCsStatusAndMappingWrapper.h +++ b/src/cudadev/CondFormats/SiPixelROCsStatusAndMappingWrapper.h @@ -3,6 +3,7 @@ #include "CUDACore/ESProduct.h" #include "CUDACore/HostAllocator.h" +#include "CUDACore/host_unique_ptr.h" #include "CUDACore/device_unique_ptr.h" #include "CondFormats/SiPixelROCsStatusAndMapping.h" @@ -12,14 +13,14 @@ class SiPixelROCsStatusAndMappingWrapper { public: + /* This is using a layout as the size is needed. TODO: use views when views start embedding size. */ explicit SiPixelROCsStatusAndMappingWrapper(SiPixelROCsStatusAndMapping const &cablingMap, std::vector modToUnp); - ~SiPixelROCsStatusAndMappingWrapper(); bool hasQuality() const { return hasQuality_; } // returns pointer to GPU memory - const SiPixelROCsStatusAndMapping *getGPUProductAsync(cudaStream_t cudaStream) const; + SiPixelROCsStatusAndMappingConstView getGPUProductAsync(cudaStream_t cudaStream) const; // returns pointer to GPU memory const unsigned char *getModToUnpAllAsync(cudaStream_t cudaStream) const; @@ -28,17 +29,32 @@ class SiPixelROCsStatusAndMappingWrapper { std::vector> modToUnpDefault; bool hasQuality_; - SiPixelROCsStatusAndMapping *cablingMapHost = nullptr; // pointer to struct in CPU + cms::cuda::host::unique_ptr cablingMapHost; // host pinned memory for the cabling map. struct GPUData { - ~GPUData(); - SiPixelROCsStatusAndMapping *cablingMapDevice = nullptr; // pointer to struct in GPU + void allocate(cudaStream_t stream) { + cablingMapDevice = cms::cuda::make_device_unique(stream); + // Populate the view with individual column pointers + auto &cmd = *cablingMapDevice; + cablingMapDeviceView = SiPixelROCsStatusAndMappingConstView( + pixelgpudetails::MAX_SIZE, + cmd.fed, // Those are array pointers (in device, but we won't dereference them here). + cmd.link, + cmd.roc, + cmd.rawId, + cmd.rocInDet, + cmd.moduleId, + cmd.badRocs, + &cmd.size // This is a scalar, we need the address-of operator + ); + } + cms::cuda::device::unique_ptr cablingMapDevice; + SiPixelROCsStatusAndMappingConstView cablingMapDeviceView; // map struct in GPU }; cms::cuda::ESProduct gpuData_; struct ModulesToUnpack { - ~ModulesToUnpack(); - unsigned char *modToUnpDefault = nullptr; // pointer to GPU + cms::cuda::device::unique_ptr modToUnpDefault; // pointer to GPU }; cms::cuda::ESProduct modToUnp_; }; diff --git a/src/cudadev/DataFormats/SoA.md b/src/cudadev/DataFormats/SoA.md new file mode 100644 index 000000000..d136fb9f1 --- /dev/null +++ b/src/cudadev/DataFormats/SoA.md @@ -0,0 +1,174 @@ +# Structure of arrays (SoA) generation + +The two header files [`SoALayout.h`](SoALayout.h) and [`SoAView.h`](SoAView.h) define preprocessor macros that allow generating SoA +classes. The SoA classes generate multiple, aligned columns from a memory buffer. 
The memory buffer is allocated separately by the +user, and can be located in a memory space different from the local one (for example, a SoA located in GPU device memory can be +fully pre-defined on the host and the resulting structure is passed to the GPU kernel). + +This columnar storage allows efficient memory access by GPU kernels (coalesced access on cache line aligned data) and possibly +vectorization. + +Additionally, templating of the layout and view classes allows compile-time variations of accesses and checks: verification of +alignment and corresponding compiler hinting, cache strategy (non-coherent, streaming with immediate invalidation), range checking. + +## Layout + +`SoALayout` is a macro-generated templated class that subdivides a provided buffer into a collection of columns, Eigen columns and +scalars. The buffer is expected to be aligned with a selectable alignment defaulting to the CUDA GPU cache line (128 bytes). All +columns and scalars within a `SoALayout` will be individually aligned, leaving padding at the end of each if necessary. Eigen columns +have each component of the vector or matrix properly aligned in an individual column (by defining the stride between components). Only +compile-time sized Eigen vectors and matrices are supported. Scalar members are members of the layout with a single element, irrespective of +the size of the layout. + +Static utility functions automatically compute the byte size of a layout, taking into account all its columns and alignment. + +## View + +`SoAView` is a macro-generated templated class allowing access to columns defined in one or multiple `SoALayout`s or `SoAView`s. The +view can be generated in constant and non-constant flavors. All view flavors provide the same interface, where scalar elements +are accessed with an `operator()`: `soa.scalar()`, while columns (Eigen or not) are accessed via an array-of-structures (AoS)-like +syntax: `soa[index].x()`. The "struct" object returned by `operator[]` can be used as a shortcut: +`auto si = soa[index]; si.z() = si.x() + si.y();` + +A view can be instantiated by being passed the layout(s) and view(s) it is defined against, or column by column. + +## SoAMetadata subclass + +In order not to clutter the namespace of the generated class, a subclass named `SoAMetadata` is generated. It is instantiated with the +`soaMetadata()` member function and contains various utility functions, like `size()` (number of elements in the SoA), `byteSize()`, +`byteAlignment()`, `data()` (a pointer to the buffer). A `nextByte()` function computes the first byte of a structure right after a +layout, allowing the use of a single buffer for multiple layouts. 
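For instance, two layouts can share a single allocation by starting the second one at the first layout's `nextByte()`. The sketch below is illustrative only: `LayoutA` and `LayoutB` are hypothetical layout names assumed to have been generated with `GENERATE_SOA_LAYOUT`, and any allocator returning memory aligned to `LayoutA::byteAlignment` could replace `std::aligned_alloc`.

```C++
#include <cassert>
#include <cstdlib>
#include <memory>

// LayoutA and LayoutB are hypothetical, assumed generated elsewhere with GENERATE_SOA_LAYOUT.
size_t nElements = 256;
size_t bytes = LayoutA::computeDataSize(nElements) + LayoutB::computeDataSize(nElements);
// One buffer for both layouts, aligned for the first one.
std::unique_ptr<std::byte, decltype(std::free) *> buffer(
    reinterpret_cast<std::byte *>(std::aligned_alloc(LayoutA::byteAlignment, bytes)), std::free);

LayoutA a(buffer.get(), nElements);                // first layout starts at the beginning of the buffer
LayoutB b(a.soaMetadata().nextByte(), nElements);  // second layout starts right after the first one

assert(a.soaMetadata().size() == nElements);       // number of elements in the SoA
```

This is the same pattern used by the data formats in this change (for example `SiPixelDigisCUDA`, where the device-only and host-device layouts are carved out of a single device allocation).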
+ +## Examples + +A layout can be defined as: + +```C++ +#include "DataFormats/SoALayout.h" + +GENERATE_SOA_LAYOUT(SoA1LayoutTemplate, + // predefined static scalars + // size_t size; + // size_t alignment; + + // columns: one value per element + SOA_COLUMN(double, x), + SOA_COLUMN(double, y), + SOA_COLUMN(double, z), + SOA_EIGEN_COLUMN(Eigen::Vector3d, a), + SOA_EIGEN_COLUMN(Eigen::Vector3d, b), + SOA_EIGEN_COLUMN(Eigen::Vector3d, r), + SOA_COLUMN(uint16_t, color), + SOA_COLUMN(int32_t, value), + SOA_COLUMN(double *, py), + SOA_COLUMN(uint32_t, count), + SOA_COLUMN(uint32_t, anotherCount), + + // scalars: one value for the whole structure + SOA_SCALAR(const char *, description), + SOA_SCALAR(uint32_t, someNumber) +); + +// Default template parameters are < +// size_t ALIGNMENT = cms::soa::CacheLineSize::defaultSize, +// cms::soa::AlignmentEnforcement ALIGNMENT_ENFORCEMENT = cms::soa::AlignmentEnforcement::Relaxed +// > +using SoA1Layout = SoA1LayoutTemplate<>; + +using SoA1LayoutAligned = SoA1LayoutTemplate; +``` + +The buffer of the proper size is allocated, and the layout is populated with: + +```C++ +// Allocation of an aligned buffer +size_t elements = 100; +using AlignedBuffer = std::unique_ptr; +AlignedBuffer h_buf (reinterpret_cast(aligned_alloc(SoA1LayoutAligned::byteAlignment, SoA1LayoutAligned::computeDataSize(elements))), std::free); +SoA1LayoutAligned soaLayout(h_buf.get(), elements); +``` + +A view will derive its column types from one or multiple layouts. The macro generating the view takes a list of layouts or views it +gets its data from as a first parameter, and the selection of the columns the view will give access to as a second parameter. + +```C++ +// A 1 to 1 view of the layout (except for unsupported types). +GENERATE_SOA_VIEW(SoA1ViewTemplate, + SOA_VIEW_LAYOUT_LIST( + SOA_VIEW_LAYOUT(SoA1Layout, soa1) + ), + SOA_VIEW_VALUE_LIST( + SOA_VIEW_VALUE(soa1, x), + SOA_VIEW_VALUE(soa1, y), + SOA_VIEW_VALUE(soa1, z), + SOA_VIEW_VALUE(soa1, color), + SOA_VIEW_VALUE(soa1, value), + SOA_VIEW_VALUE(soa1, py), + SOA_VIEW_VALUE(soa1, count), + SOA_VIEW_VALUE(soa1, anotherCount), + SOA_VIEW_VALUE(soa1, description), + SOA_VIEW_VALUE(soa1, someNumber) + ) +); + +using SoA1View = SoA1ViewTemplate<>; + +SoA1View soaView(soaLayout); + +for (size_t i=0; i < soaLayout.soaMetadata().size(); ++i) { + auto si = soaView[i]; + si.x() = si.y() = i; + soaView.someNumber() += i; +} +``` +Any mixture of mutable and const views can also be defined automatically together with the layout (for the trivially identical views) using one of those macros `GENERATE_SOA_LAYOUT_VIEW_AND_CONST_VIEW`, `GENERATE_SOA_LAYOUT_AND_VIEW` and `GENERATE_SOA_LAYOUT_AND_CONST_VIEW`: + +```C++ +GENERATE_SOA_LAYOUT_VIEW_AND_CONST_VIEW(SoA1LayoutTemplate, SoA1ViewTemplate, SoA1ConstViewTemplate, + // columns: one value per element + SOA_COLUMN(double, x), + SOA_COLUMN(double, y), + SOA_COLUMN(double, z), + SOA_COLUMN(double, sum), + SOA_COLUMN(double, prod), + SOA_COLUMN(uint16_t, color), + SOA_COLUMN(int32_t, value), + SOA_COLUMN(double *, py), + SOA_COLUMN(uint32_t, count), + SOA_COLUMN(uint32_t, anotherCount), + + // scalars: one value for the whole structure + SOA_SCALAR(const char *, description), + SOA_SCALAR(uint32_t, someNumber) +) +``` + +## Template parameters + +The template parameters are: +- Byte alignment (defaulting to the NVIDIA GPU cache line size, 128 bytes) +- Alignment enforcement (`Relaxed` or `Enforced`). 
When enforced, the alignment will be checked at construction time, and the accesses +are done with compiler hinting (using the widely supported `__builtin_assume_aligned` intrinsic). + +## Using SoA layouts and views with GPUs + +Instanciation of views and layouts is preferably done on the CPU side. The view object is lightweight, with only one pointer per +column (size to be added later). Extra view class can be generated to restrict this number of pointers to the strict minimum in +scenarios where only a subset of columns are used in a given GPU kernel. + +## Current status and further improvements + +### Available features + +- The layout and views support scalars and columns, alignment and alignment enforcement and hinting. +- Automatic `__restrict__` compiler hinting is supported. +- A shortcut alloCreate a mechanism to derive trivial views and const views from a single layout. +- Cache access style, which was explored, was abandoned as this not-yet-used feature interferes with `__restrict__` support (which is +already in used in existing code). It could be made available as a separate tool that can be used directly by the module developer, +orthogonally from SoA. +- Optional (compile time) range checking validates the index of every column access, throwing an exception on the CPU side and forcing +a segmentation fault to halt kernels. When not enabled, it has no impact on performance (code not compiled) +- Eigen columns are also suported, with both const and non-const flavors. + +### Planned additions +- Improve `dump()` function and turn it into a more classic `operator<<()`. diff --git a/src/cudadev/DataFormats/SoACommon.h b/src/cudadev/DataFormats/SoACommon.h new file mode 100644 index 000000000..88afb07d6 --- /dev/null +++ b/src/cudadev/DataFormats/SoACommon.h @@ -0,0 +1,578 @@ +/* + * Definitions of SoA common parameters for SoA class generators + */ + +#ifndef DataStructures_SoACommon_h +#define DataStructures_SoACommon_h + +#include "boost/preprocessor.hpp" +#include +#include +#include + +// CUDA attributes +#ifdef __CUDACC__ +#define SOA_HOST_ONLY __host__ +#define SOA_DEVICE_ONLY __device__ +#define SOA_HOST_DEVICE __host__ __device__ +#define SOA_HOST_DEVICE_INLINE __host__ __device__ __forceinline__ +#else +#define SOA_HOST_ONLY +#define SOA_DEVICE_ONLY +#define SOA_HOST_DEVICE +#define SOA_HOST_DEVICE_INLINE inline +#endif + +// Exception throwing (or willful crash in kernels) +#if defined(__CUDACC__) && defined(__CUDA_ARCH__) +#define SOA_THROW_OUT_OF_RANGE(A) \ + { \ + printf(A "\n"); \ + *((char*)nullptr) = 0; \ + } +#else +#define SOA_THROW_OUT_OF_RANGE(A) \ + { throw std::out_of_range(A); } +#endif + +/* declare "scalars" (one value shared across the whole SoA) and "columns" (one value per element) */ +#define _VALUE_TYPE_SCALAR 0 +#define _VALUE_TYPE_COLUMN 1 +#define _VALUE_TYPE_EIGEN_COLUMN 2 + +namespace cms::soa { + + enum class SoAColumnType { + scalar = _VALUE_TYPE_SCALAR, + column = _VALUE_TYPE_COLUMN, + eigen = _VALUE_TYPE_EIGEN_COLUMN + }; + enum class RestrictQualify : bool { Enabled, Disabled, Default = Disabled }; + + enum class RangeChecking : bool { Enabled, Disabled, Default = Disabled }; + + template + struct add_restrict {}; + + template + struct add_restrict { + typedef T Value; + typedef T* __restrict__ Pointer; + typedef T& __restrict__ Reference; + typedef const T ConstValue; + typedef const T* __restrict__ PointerToConst; + typedef const T& __restrict__ ReferenceToConst; + }; + + template + struct add_restrict { + typedef T Value; + typedef T* 
Pointer; + typedef T& Reference; + typedef const T ConstValue; + typedef const T* PointerToConst; + typedef const T& ReferenceToConst; + }; + template + struct SoAParametersImpl; + + // Templated parameter sets for scalar columns and Eigen columns + template + struct SoAConstParametersImpl { + static const SoAColumnType columnType = COLUMN_TYPE; + typedef T ValueType; + typedef const ValueType* TupleOrPointerType; + const ValueType* addr_ = nullptr; + SOA_HOST_DEVICE_INLINE SoAConstParametersImpl(const ValueType* addr) : addr_(addr) {} + SOA_HOST_DEVICE_INLINE SoAConstParametersImpl(const SoAConstParametersImpl& o) { addr_ = o.addr_; } + SOA_HOST_DEVICE_INLINE SoAConstParametersImpl(const SoAParametersImpl& o) { + addr_ = o.addr_; + } + SOA_HOST_DEVICE_INLINE SoAConstParametersImpl() {} + static bool checkAlignement(ValueType* addr, size_t byteAlignment) { + return reinterpret_cast(addr) % byteAlignment; + } + }; + + template + struct SoAConstParametersImpl { + static const SoAColumnType columnType = SoAColumnType::eigen; + typedef T ValueType; + typedef typename T::Scalar ScalarType; + typedef std::tuple TupleOrPointerType; + const ScalarType* addr_ = nullptr; + size_t stride_ = 0; + SOA_HOST_DEVICE_INLINE SoAConstParametersImpl(const ScalarType* addr, size_t stride) + : addr_(addr), stride_(stride) {} + SOA_HOST_DEVICE_INLINE SoAConstParametersImpl(const TupleOrPointerType tuple) + : addr_(std::get<0>(tuple)), stride_(std::get<1>(tuple)) {} + SOA_HOST_DEVICE_INLINE SoAConstParametersImpl(const ScalarType* addr) : addr_(addr) {} + // Trick setter + return self-reference allowing commat-free 2-stage construction in macro contexts (in combination with the + // addr-only constructor. + SoAConstParametersImpl& setStride(size_t stride) { + stride_ = stride; + return *this; + } + SOA_HOST_DEVICE_INLINE SoAConstParametersImpl(const SoAConstParametersImpl& o) { + addr_ = o.addr_; + stride_ = o.stride_; + } + SOA_HOST_DEVICE_INLINE SoAConstParametersImpl(const SoAParametersImpl& o) { + addr_ = o.addr_; + stride_ = o.stride_; + } + SOA_HOST_DEVICE_INLINE SoAConstParametersImpl() {} + static bool checkAlignement(const TupleOrPointerType tuple, size_t byteAlignment) { + const auto& [addr, stride] = tuple; + return reinterpret_cast(addr) % byteAlignment; + } + }; + + // Matryoshka template to avoiding commas in macros + template + struct SoAConstParameters_ColumnType { + template + struct DataType : public SoAConstParametersImpl { + using SoAConstParametersImpl::SoAConstParametersImpl; + }; + }; + + // Templated parameter sets for scalar columns and Eigen columns + template + struct SoAParametersImpl { + static const SoAColumnType columnType = COLUMN_TYPE; + typedef T ValueType; + typedef const ValueType* TupleOrPointerType; + typedef SoAConstParametersImpl ConstType; + friend ConstType; + ValueType* addr_ = nullptr; + SOA_HOST_DEVICE_INLINE SoAParametersImpl(ValueType* addr) : addr_(addr) {} + SOA_HOST_DEVICE_INLINE SoAParametersImpl() {} + static bool checkAlignement(ValueType* addr, size_t byteAlignment) { + return reinterpret_cast(addr) % byteAlignment; + } + }; + + template + struct SoAParametersImpl { + static const SoAColumnType columnType = SoAColumnType::eigen; + typedef T ValueType; + typedef SoAConstParametersImpl ConstType; + friend ConstType; + typedef typename T::Scalar ScalarType; + typedef std::tuple TupleOrPointerType; + ScalarType* addr_ = nullptr; + size_t stride_ = 0; + SOA_HOST_DEVICE_INLINE SoAParametersImpl(ScalarType* addr, size_t stride) + : addr_(addr), stride_(stride) {} + 
SOA_HOST_DEVICE_INLINE SoAParametersImpl(const TupleOrPointerType tuple) + : addr_(std::get<0>(tuple)), stride_(std::get<1>(tuple)) {} + SOA_HOST_DEVICE_INLINE SoAParametersImpl() {} + SOA_HOST_DEVICE_INLINE SoAParametersImpl(ScalarType* addr) : addr_(addr) {} + // Trick setter + return self-reference allowing commat-free 2-stage construction in macro contexts (in combination with the + // addr-only constructor. + SoAParametersImpl& setStride(size_t stride) { + stride_ = stride; + return *this; + } + static bool checkAlignement(const TupleOrPointerType tuple, size_t byteAlignment) { + const auto& [addr, stride] = tuple; + return reinterpret_cast(addr) % byteAlignment; + } + }; + + // Matryoshka template to avoiding commas in macros + template + struct SoAParameters_ColumnType { + template + struct DataType : public SoAParametersImpl { + using SoAParametersImpl::SoAParametersImpl; + }; + }; + + // Helper template managing the value within it column + // The optional compile time alignment parameter enables informing the + // compiler of alignment (enforced by caller). + template + class SoAValue { + // Eigen is implemented in a specialization + static_assert(COLUMN_TYPE != SoAColumnType::eigen); + + public: + typedef add_restrict Restr; + typedef typename Restr::Value Val; + typedef typename Restr::Pointer Ptr; + typedef typename Restr::Reference Ref; + typedef typename Restr::PointerToConst PtrToConst; + typedef typename Restr::ReferenceToConst RefToConst; + SOA_HOST_DEVICE_INLINE SoAValue(size_t i, T* col) : idx_(i), col_(col) {} + SOA_HOST_DEVICE_INLINE SoAValue(size_t i, SoAParametersImpl params) + : idx_(i), col_(params.addr_) {} + /* SOA_HOST_DEVICE_INLINE operator T&() { return col_[idx_]; } */ + SOA_HOST_DEVICE_INLINE Ref operator()() { + // Ptr type will add the restrict qualifyer if needed + Ptr col = alignedCol(); + return col[idx_]; + } + SOA_HOST_DEVICE_INLINE RefToConst operator()() const { + // PtrToConst type will add the restrict qualifyer if needed + PtrToConst col = alignedCol(); + return col[idx_]; + } + SOA_HOST_DEVICE_INLINE Ptr operator&() { return &alignedCol()[idx_]; } + SOA_HOST_DEVICE_INLINE PtrToConst operator&() const { return &alignedCol()[idx_]; } + template + SOA_HOST_DEVICE_INLINE Ref operator=(const T2& v) { + return alignedCol()[idx_] = v; + } + typedef Val valueType; + static constexpr auto valueSize = sizeof(T); + + private: + SOA_HOST_DEVICE_INLINE Ptr alignedCol() const { + if constexpr (ALIGNMENT) { + return reinterpret_cast(__builtin_assume_aligned(col_, ALIGNMENT)); + } + return reinterpret_cast(col_); + } + size_t idx_; + T* col_; + }; + + // Helper template managing the value within it column + // TODO Create a const variant to avoid leaking mutable access. 
+#ifdef EIGEN_WORLD_VERSION + template + class SoAValue { + public: + typedef C Type; + typedef Eigen::Map> MapType; + typedef Eigen::Map> CMapType; + typedef add_restrict Restr; + typedef typename Restr::Value Val; + typedef typename Restr::Pointer Ptr; + typedef typename Restr::Reference Ref; + typedef typename Restr::PointerToConst PtrToConst; + typedef typename Restr::ReferenceToConst RefToConst; + SOA_HOST_DEVICE_INLINE SoAValue(size_t i, typename C::Scalar* col, size_t stride) + : val_(col + i, C::RowsAtCompileTime, C::ColsAtCompileTime, Eigen::InnerStride(stride)), + crCol_(col), + cVal_(crCol_ + i, C::RowsAtCompileTime, C::ColsAtCompileTime, Eigen::InnerStride(stride)), + stride_(stride) {} + SOA_HOST_DEVICE_INLINE SoAValue(size_t i, SoAParametersImpl params) + : val_(params.addr_ + i, + C::RowsAtCompileTime, + C::ColsAtCompileTime, + Eigen::InnerStride(params.stride_)), + crCol_(params.addr_), + cVal_(crCol_ + i, + C::RowsAtCompileTime, + C::ColsAtCompileTime, + Eigen::InnerStride(params.stride_)), + stride_(params.stride_) {} + SOA_HOST_DEVICE_INLINE MapType& operator()() { return val_; } + SOA_HOST_DEVICE_INLINE const CMapType& operator()() const { return cVal_; } + SOA_HOST_DEVICE_INLINE operator C() { return val_; } + SOA_HOST_DEVICE_INLINE operator const C() const { return cVal_; } + SOA_HOST_DEVICE_INLINE C* operator&() { return &val_; } + SOA_HOST_DEVICE_INLINE const C* operator&() const { return &cVal_; } + template + SOA_HOST_DEVICE_INLINE MapType& operator=(const C2& v) { + return val_ = v; + } + typedef typename C::Scalar ValueType; + static constexpr auto valueSize = sizeof(C::Scalar); + SOA_HOST_DEVICE_INLINE size_t stride() const { return stride_; } + + private: + MapType val_; + const Ptr crCol_; + CMapType cVal_; + size_t stride_; + }; +#else + template + class SoAValue { + // Eigen/Core should be pre-included before the SoA headers to enable support for Eigen columns. 
+ static_assert(!sizeof(C), + "Eigen/Core should be pre-included before the SoA headers to enable support for Eigen columns."); + }; +#endif + // Helper template managing the value within it column + template + class SoAConstValue { + // Eigen is implemented in a specialization + static_assert(COLUMN_TYPE != SoAColumnType::eigen); + + public: + typedef add_restrict Restr; + typedef typename Restr::Value Val; + typedef typename Restr::Pointer Ptr; + typedef typename Restr::Reference Ref; + typedef typename Restr::PointerToConst PtrToConst; + typedef typename Restr::ReferenceToConst RefToConst; + typedef SoAParametersImpl Params; + typedef SoAConstParametersImpl ConstParams; + SOA_HOST_DEVICE_INLINE SoAConstValue(size_t i, const T* col) : idx_(i), col_(col) {} + SOA_HOST_DEVICE_INLINE SoAConstValue(size_t i, SoAParametersImpl params) + : idx_(i), col_(params.addr_) {} + SOA_HOST_DEVICE_INLINE SoAConstValue(size_t i, SoAConstParametersImpl params) + : idx_(i), col_(params.addr_) {} + /* SOA_HOST_DEVICE_INLINE operator T&() { return col_[idx_]; } */ + SOA_HOST_DEVICE_INLINE RefToConst operator()() const { + // Ptr type will add the restrict qualifyer if needed + PtrToConst col = alignedCol(); + return col[idx_]; + } + SOA_HOST_DEVICE_INLINE const T* operator&() const { return &alignedCol()[idx_]; } + typedef T valueType; + static constexpr auto valueSize = sizeof(T); + + private: + SOA_HOST_DEVICE_INLINE PtrToConst alignedCol() const { + if constexpr (ALIGNMENT) { + return reinterpret_cast(__builtin_assume_aligned(col_, ALIGNMENT)); + } + return reinterpret_cast(col_); + } + size_t idx_; + const T* col_; + }; + +#ifdef EIGEN_WORLD_VERSION + // Helper template managing the value within it column + // TODO Create a const variant to avoid leaking mutable access. + template + class SoAConstValue { + public: + typedef C Type; + typedef Eigen::Map> CMapType; + typedef CMapType& RefToConst; + typedef SoAConstParametersImpl ConstParams; + SOA_HOST_DEVICE_INLINE SoAConstValue(size_t i, typename C::Scalar* col, size_t stride) + : crCol_(col), + cVal_(crCol_ + i, C::RowsAtCompileTime, C::ColsAtCompileTime, Eigen::InnerStride(stride)), + stride_(stride) {} + SOA_HOST_DEVICE_INLINE SoAConstValue(size_t i, SoAConstParametersImpl params) + : crCol_(params.addr_), + cVal_(crCol_ + i, + C::RowsAtCompileTime, + C::ColsAtCompileTime, + Eigen::InnerStride(params.stride_)), + stride_(params.stride_) {} + SOA_HOST_DEVICE_INLINE const CMapType& operator()() const { return cVal_; } + SOA_HOST_DEVICE_INLINE operator const C() const { return cVal_; } + SOA_HOST_DEVICE_INLINE const C* operator&() const { return &cVal_; } + typedef typename C::Scalar ValueType; + static constexpr auto valueSize = sizeof(C::Scalar); + SOA_HOST_DEVICE_INLINE size_t stride() const { return stride_; } + + private: + const typename C::Scalar* __restrict__ crCol_; + CMapType cVal_; + size_t stride_; + }; +#else + template + class SoAConstValue { + // Eigen/Core should be pre-included before the SoA headers to enable support for Eigen columns. 
+ static_assert(!sizeof(C), + "Eigen/Core should be pre-included before the SoA headers to enable support for Eigen columns."); + }; +#endif + + // Helper template to avoid commas in macro +#ifdef EIGEN_WORLD_VERSION + template + struct EigenConstMapMaker { + typedef Eigen::Map> Type; + class DataHolder { + public: + DataHolder(const typename C::Scalar* data) : data_(data) {} + EigenConstMapMaker::Type withStride(size_t stride) { + return EigenConstMapMaker::Type( + data_, C::RowsAtCompileTime, C::ColsAtCompileTime, Eigen::InnerStride(stride)); + } + + private: + const typename C::Scalar* const data_; + }; + static DataHolder withData(const typename C::Scalar* data) { return DataHolder(data); } + }; +#else + template + struct EigenConstMapMaker { + // Eigen/Core should be pre-included before the SoA headers to enable support for Eigen columns. + static_assert(!sizeof(C), + "Eigen/Core should be pre-included before the SoA headers to enable support for Eigen columns."); + }; +#endif + // Helper function to compute aligned size + inline size_t alignSize(size_t size, size_t alignment = 128) { + if (size) + return ((size - 1) / alignment + 1) * alignment; + else + return 0; + } + +} // namespace cms::soa + +#define SOA_SCALAR(TYPE, NAME) (_VALUE_TYPE_SCALAR, TYPE, NAME) +#define SOA_COLUMN(TYPE, NAME) (_VALUE_TYPE_COLUMN, TYPE, NAME) +#define SOA_EIGEN_COLUMN(TYPE, NAME) (_VALUE_TYPE_EIGEN_COLUMN, TYPE, NAME) + +/* Iterate on the macro MACRO and return the result as a comma separated list */ +#define _ITERATE_ON_ALL_COMMA(MACRO, DATA, ...) \ + BOOST_PP_TUPLE_ENUM(BOOST_PP_SEQ_TO_TUPLE(_ITERATE_ON_ALL(MACRO, DATA, __VA_ARGS__))) +/* Iterate MACRO on all elements */ +#define _ITERATE_ON_ALL(MACRO, DATA, ...) BOOST_PP_SEQ_FOR_EACH(MACRO, DATA, BOOST_PP_VARIADIC_TO_SEQ(__VA_ARGS__)) + +/* Switch on macros depending on scalar / column type */ +#define _SWITCH_ON_TYPE(VALUE_TYPE, IF_SCALAR, IF_COLUMN, IF_EIGEN_COLUMN) \ + BOOST_PP_IF( \ + BOOST_PP_EQUAL(VALUE_TYPE, _VALUE_TYPE_SCALAR), \ + IF_SCALAR, \ + BOOST_PP_IF( \ + BOOST_PP_EQUAL(VALUE_TYPE, _VALUE_TYPE_COLUMN), \ + IF_COLUMN, \ + BOOST_PP_IF(BOOST_PP_EQUAL(VALUE_TYPE, _VALUE_TYPE_EIGEN_COLUMN), IF_EIGEN_COLUMN, BOOST_PP_EMPTY()))) + +namespace cms::soa { + + /* Column accessors: templates implementing the global accesors (soa::x() and soa::x(index) */ + enum class SoAAccessType : bool { mutableAccess, constAccess }; + + template + struct SoAColumnAccessorsImpl {}; + + // Todo: add alignment support. + // Sfinae based const/non const variants. 
+ // Column + template + struct SoAColumnAccessorsImpl { + //SOA_HOST_DEVICE_INLINE SoAColumnAccessorsImpl(T* baseAddress) : baseAddress_(baseAddress) {} + SOA_HOST_DEVICE_INLINE SoAColumnAccessorsImpl(const SoAParametersImpl& params) + : params_(params) {} + SOA_HOST_DEVICE_INLINE T* operator()() { return params_.addr_; } + typedef T* NoParamReturnType; + SOA_HOST_DEVICE_INLINE T& operator()(size_t index) { return params_.addr_[index]; } + + private: + SoAParametersImpl params_; + }; + + // Const column + template + struct SoAColumnAccessorsImpl { + SOA_HOST_DEVICE_INLINE + SoAColumnAccessorsImpl(const SoAConstParametersImpl& params) + : params_(params) {} + SOA_HOST_DEVICE_INLINE const T* operator()() const { return params_.addr_; } + typedef T* NoParamReturnType; + SOA_HOST_DEVICE_INLINE T operator()(size_t index) const { return params_.addr_[index]; } + + private: + SoAConstParametersImpl params_; + }; + + // Scalar + template + struct SoAColumnAccessorsImpl { + SOA_HOST_DEVICE_INLINE SoAColumnAccessorsImpl(const SoAParametersImpl& params) + : params_(params) {} + SOA_HOST_DEVICE_INLINE T& operator()() { return *params_.addr_; } + typedef T& NoParamReturnType; + SOA_HOST_DEVICE_INLINE void operator()(size_t index) const { + assert(false && "Indexed access impossible for SoA scalars."); + } + + private: + SoAParametersImpl params_; + }; + + // Const scalar + template + struct SoAColumnAccessorsImpl { + SOA_HOST_DEVICE_INLINE + SoAColumnAccessorsImpl(const SoAConstParametersImpl& params) + : params_(params) {} + SOA_HOST_DEVICE_INLINE T operator()() const { return *params_.addr_; } + typedef T NoParamReturnType; + SOA_HOST_DEVICE_INLINE void operator()(size_t index) const { + assert(false && "Indexed access impossible for SoA scalars."); + } + + private: + SoAConstParametersImpl params_; + }; + + template + struct SoAColumnAccessorsImpl { + //SOA_HOST_DEVICE_INLINE SoAColumnAccessorsImpl(T* baseAddress) : baseAddress_(baseAddress) {} + SOA_HOST_DEVICE_INLINE SoAColumnAccessorsImpl(const SoAParametersImpl& params) + : params_(params) {} + SOA_HOST_DEVICE_INLINE typename T::Scalar* operator()() { return params_.addr_; } + typedef typename T::Scalar* NoParamReturnType; + //SOA_HOST_DEVICE_INLINE T& operator()(size_t index) { return params_.addr_[index]; } + + private: + SoAParametersImpl params_; + }; + + // Const column + template + struct SoAColumnAccessorsImpl { + SOA_HOST_DEVICE_INLINE + SoAColumnAccessorsImpl(const SoAConstParametersImpl& params) + : params_(params) {} + SOA_HOST_DEVICE_INLINE const typename T::Scalar* operator()() const { return params_.addr_; } + typedef typename T::Scalar* NoParamReturnType; + //SOA_HOST_DEVICE_INLINE T operator()(size_t index) const { return params_.addr_[index]; } + + private: + SoAConstParametersImpl params_; + }; + + /* A helper template stager avoiding comma in macros */ + template + struct SoAAccessors { + template + struct ColumnType { + template + struct AccessType : public SoAColumnAccessorsImpl { + using SoAColumnAccessorsImpl::SoAColumnAccessorsImpl; + }; + }; + }; + + /* Enum parameters allowing templated control of layout/view behaviors */ + /* Alignement enforcement verifies every column is aligned, and + * hints the compiler that it can expect column pointers to be aligned */ + enum class AlignmentEnforcement : bool { Relaxed, Enforced }; + + struct CacheLineSize { + static constexpr size_t NvidiaGPU = 128; + static constexpr size_t IntelCPU = 64; + static constexpr size_t AMDCPU = 64; + static constexpr size_t ARMCPU = 64; + static 
constexpr size_t defaultSize = NvidiaGPU; + }; + + // An empty shell class to restrict the scope of tempalted operator<<(ostream, soa). + struct BaseLayout {}; +} // namespace cms::soa + +// Small wrapper for stream insertion of SoA printing +template ::value, SOA>::type> +SOA_HOST_ONLY std::ostream& operator<<(std::ostream& os, const SOA& soa) { + soa.toStream(os); + return os; +} +#endif // ndef DataStructures_SoACommon_h diff --git a/src/cudadev/DataFormats/SoALayout.h b/src/cudadev/DataFormats/SoALayout.h new file mode 100644 index 000000000..036f1cb4f --- /dev/null +++ b/src/cudadev/DataFormats/SoALayout.h @@ -0,0 +1,388 @@ +/* + * Structure-of-Arrays template with "columns" and "scalars", defined through preprocessor macros, + * with compile-time size and alignment, and accessors to the "rows" and "columns". + */ + +#ifndef DataStructures_SoALayout_h +#define DataStructures_SoALayout_h + +#include "SoACommon.h" + +#include +#include + +/* dump SoA fields information; these should expand to, for columns: + * Example: + * GENERATE_SOA_LAYOUT(SoA, + * // predefined static scalars + * // size_t size; + * // size_t alignment; + * + * // columns: one value per element + * SOA_COLUMN(double, x), + * SOA_COLUMN(double, y), + * SOA_COLUMN(double, z), + * SOA_EIGEN_COLUMN(Eigen::Vector3d, a), + * SOA_EIGEN_COLUMN(Eigen::Vector3d, b), + * SOA_EIGEN_COLUMN(Eigen::Vector3d, r), + * SOA_COLUMN(uint16_t, colour), + * SOA_COLUMN(int32_t, value), + * SOA_COLUMN(double *, py), + * SOA_COLUMN(uint32_t, count), + * SOA_COLUMN(uint32_t, anotherCount), + * + * // scalars: one value for the whole structure + * SOA_SCALAR(const char *, description), + * SOA_SCALAR(uint32_t, someNumber) + * ); + * + * dumps as: + * SoA(32, 64): + * sizeof(SoA): 152 + * Column x_ at offset 0 has size 256 and padding 0 + * Column y_ at offset 256 has size 256 and padding 0 + * Column z_ at offset 512 has size 256 and padding 0 + * Eigen value a_ at offset 768 has dimension (3 x 1) and per column size 256 and padding 0 + * Eigen value b_ at offset 1536 has dimension (3 x 1) and per column size 256 and padding 0 + * Eigen value r_ at offset 2304 has dimension (3 x 1) and per column size 256 and padding 0 + * Column colour_ at offset 3072 has size 64 and padding 0 + * Column value_ at offset 3136 has size 128 and padding 0 + * Column py_ at offset 3264 has size 256 and padding 0 + * Column count_ at offset 3520 has size 128 and padding 0 + * Column anotherCount_ at offset 3648 has size 128 and padding 0 + * Scalar description_ at offset 3776 has size 8 and padding 56 + * Scalar someNumber_ at offset 3840 has size 4 and padding 60 + * Final offset = 3904 computeDataSize(...): 3904 + * + */ + +// clang-format off +#define _DECLARE_SOA_STREAM_INFO_IMPL(VALUE_TYPE, CPP_TYPE, NAME) \ + _SWITCH_ON_TYPE( \ + VALUE_TYPE, \ + /* Dump scalar */ \ + os << " Scalar " BOOST_PP_STRINGIZE(NAME) " at offset " << offset << " has size " << sizeof(CPP_TYPE) \ + << " and padding " << ((sizeof(CPP_TYPE) - 1) / byteAlignment + 1) * byteAlignment - sizeof(CPP_TYPE) \ + << std::endl; \ + offset += ((sizeof(CPP_TYPE) - 1) / byteAlignment + 1) * byteAlignment; \ + , /* Dump column */ \ + os << " Column " BOOST_PP_STRINGIZE(NAME) " at offset " << offset << " has size " << sizeof(CPP_TYPE) * nElements_ \ + << " and padding " \ + << (((nElements_ * sizeof(CPP_TYPE) - 1) / byteAlignment) + 1) * byteAlignment - (sizeof(CPP_TYPE) * nElements_) \ + << std::endl; \ + offset += (((nElements_ * sizeof(CPP_TYPE) - 1) / byteAlignment) + 1) * byteAlignment; \ + , /* 
Dump Eigen column */ \ + os << " Eigen value " BOOST_PP_STRINGIZE(NAME) " at offset " << offset << " has dimension (" \ + << CPP_TYPE::RowsAtCompileTime << " x " << CPP_TYPE::ColsAtCompileTime \ + << ")" \ + << " and per column size " \ + << sizeof(CPP_TYPE::Scalar) * nElements_ \ + << " and padding " \ + << (((nElements_ * sizeof(CPP_TYPE::Scalar) - 1) / byteAlignment) + 1) * byteAlignment - \ + (sizeof(CPP_TYPE::Scalar) * nElements_) \ + << std::endl; \ + offset += (((nElements_ * sizeof(CPP_TYPE::Scalar) - 1) / byteAlignment) + 1) * byteAlignment * \ + CPP_TYPE::RowsAtCompileTime * CPP_TYPE::ColsAtCompileTime;) +// clang-format on + +#define _DECLARE_SOA_STREAM_INFO(R, DATA, TYPE_NAME) BOOST_PP_EXPAND(_DECLARE_SOA_STREAM_INFO_IMPL TYPE_NAME) + +/** + * SoAMetadata member computing column pitch + */ +// clang-format off +#define _DEFINE_METADATA_MEMBERS_IMPL(VALUE_TYPE, CPP_TYPE, NAME) \ + _SWITCH_ON_TYPE(VALUE_TYPE, \ + /* Scalar */ \ + size_t BOOST_PP_CAT(NAME, Pitch()) const { \ + return (((sizeof(CPP_TYPE) - 1) / ParentClass::byteAlignment) + 1) * ParentClass::byteAlignment; \ + } \ + typedef CPP_TYPE BOOST_PP_CAT(TypeOf_, NAME); \ + constexpr static cms::soa::SoAColumnType BOOST_PP_CAT(ColumnTypeOf_, NAME) = cms::soa::SoAColumnType::scalar; \ + SOA_HOST_DEVICE_INLINE \ + CPP_TYPE const* BOOST_PP_CAT(addressOf_, NAME)() const { \ + return parent_.soaMetadata().BOOST_PP_CAT(parametersOf_, NAME)().addr_; \ + } \ + typedef cms::soa::SoAParameters_ColumnType::DataType \ + BOOST_PP_CAT(ParametersTypeOf_, NAME); \ + SOA_HOST_DEVICE_INLINE \ + BOOST_PP_CAT(ParametersTypeOf_, NAME) BOOST_PP_CAT(parametersOf_, NAME)() const { \ + return BOOST_PP_CAT(ParametersTypeOf_, NAME) (parent_.BOOST_PP_CAT(NAME, _)); \ + } \ + SOA_HOST_DEVICE_INLINE \ + CPP_TYPE* BOOST_PP_CAT(addressOf_, NAME)() { \ + return parent_.soaMetadata().BOOST_PP_CAT(parametersOf_, NAME)().addr_; \ + }, \ + /* Column */ \ + typedef cms::soa::SoAParameters_ColumnType::DataType \ + BOOST_PP_CAT(ParametersTypeOf_, NAME); \ + SOA_HOST_DEVICE_INLINE \ + BOOST_PP_CAT(ParametersTypeOf_, NAME) BOOST_PP_CAT(parametersOf_, NAME)() const { \ + return BOOST_PP_CAT(ParametersTypeOf_, NAME) (parent_.BOOST_PP_CAT(NAME, _)); \ + } \ + SOA_HOST_DEVICE_INLINE \ + CPP_TYPE const* BOOST_PP_CAT(addressOf_, NAME)() const { \ + return parent_.soaMetadata().BOOST_PP_CAT(parametersOf_, NAME)().addr_; \ + } \ + SOA_HOST_DEVICE_INLINE \ + CPP_TYPE* BOOST_PP_CAT(addressOf_, NAME)() { \ + return parent_.soaMetadata().BOOST_PP_CAT(parametersOf_, NAME)().addr_; \ + } \ + SOA_HOST_DEVICE_INLINE \ + size_t BOOST_PP_CAT(NAME, Pitch()) const { \ + return (((parent_.nElements_ * sizeof(CPP_TYPE) - 1) / ParentClass::byteAlignment) + 1) * \ + ParentClass::byteAlignment; \ + } \ + typedef CPP_TYPE BOOST_PP_CAT(TypeOf_, NAME); \ + constexpr static cms::soa::SoAColumnType BOOST_PP_CAT(ColumnTypeOf_, NAME) = cms::soa::SoAColumnType::column;, \ + /* Eigen column */ \ + typedef cms::soa::SoAParameters_ColumnType::DataType \ + BOOST_PP_CAT(ParametersTypeOf_, NAME); \ + SOA_HOST_DEVICE_INLINE \ + BOOST_PP_CAT(ParametersTypeOf_, NAME) BOOST_PP_CAT(parametersOf_, NAME)() const { \ + return BOOST_PP_CAT(ParametersTypeOf_, NAME) ( \ + parent_.BOOST_PP_CAT(NAME, _), \ + parent_.BOOST_PP_CAT(NAME, Stride_)); \ + } \ + SOA_HOST_DEVICE_INLINE \ + size_t BOOST_PP_CAT(NAME, Pitch()) const { \ + return (((parent_.nElements_ * sizeof(CPP_TYPE::Scalar) - 1) / ParentClass::byteAlignment) + 1) * \ + ParentClass::byteAlignment * CPP_TYPE::RowsAtCompileTime * CPP_TYPE::ColsAtCompileTime; \ 
+ } typedef CPP_TYPE BOOST_PP_CAT(TypeOf_, NAME); \ + constexpr static cms::soa::SoAColumnType BOOST_PP_CAT(ColumnTypeOf_, NAME) = cms::soa::SoAColumnType::eigen; \ + SOA_HOST_DEVICE_INLINE \ + CPP_TYPE::Scalar const* BOOST_PP_CAT(addressOf_, NAME)() const { \ + return parent_.soaMetadata().BOOST_PP_CAT(parametersOf_, NAME)().addr_; \ + } \ + SOA_HOST_DEVICE_INLINE \ + CPP_TYPE::Scalar* BOOST_PP_CAT(addressOf_, NAME)() { \ + return parent_.soaMetadata().BOOST_PP_CAT(parametersOf_, NAME)().addr_; \ + } \ +) +// clang-format on +#define _DEFINE_METADATA_MEMBERS(R, DATA, TYPE_NAME) _DEFINE_METADATA_MEMBERS_IMPL TYPE_NAME + +/** + * Member assignment for trivial constructor + */ +#define _DECLARE_MEMBER_TRIVIAL_CONSTRUCTION_IMPL(VALUE_TYPE, CPP_TYPE, NAME) \ + _SWITCH_ON_TYPE(VALUE_TYPE, /* Scalar */ \ + (BOOST_PP_CAT(NAME, _)(nullptr)), /* Column */ \ + (BOOST_PP_CAT(NAME, _)(nullptr)), /* Eigen column */ \ + (BOOST_PP_CAT(NAME, _)(nullptr))(BOOST_PP_CAT(NAME, Stride_)(0))) + +#define _DECLARE_MEMBER_TRIVIAL_CONSTRUCTION(R, DATA, TYPE_NAME) \ + BOOST_PP_EXPAND(_DECLARE_MEMBER_TRIVIAL_CONSTRUCTION_IMPL TYPE_NAME) +/** + * Computation of the column or scalar pointer location in the memory layout (at SoA construction time) + */ +#define _ASSIGN_SOA_COLUMN_OR_SCALAR_IMPL(VALUE_TYPE, CPP_TYPE, NAME) \ + _SWITCH_ON_TYPE(VALUE_TYPE, /* Scalar */ \ + BOOST_PP_CAT(NAME, _) = reinterpret_cast(curMem); \ + curMem += (((sizeof(CPP_TYPE) - 1) / byteAlignment) + 1) * byteAlignment; \ + , /* Column */ \ + BOOST_PP_CAT(NAME, _) = reinterpret_cast(curMem); \ + curMem += (((nElements_ * sizeof(CPP_TYPE) - 1) / byteAlignment) + 1) * byteAlignment; \ + , /* Eigen column */ \ + BOOST_PP_CAT(NAME, _) = reinterpret_cast(curMem); \ + curMem += (((nElements_ * sizeof(CPP_TYPE::Scalar) - 1) / byteAlignment) + 1) * byteAlignment * \ + CPP_TYPE::RowsAtCompileTime * CPP_TYPE::ColsAtCompileTime; \ + BOOST_PP_CAT(NAME, Stride_) = (((nElements_ * sizeof(CPP_TYPE::Scalar) - 1) / byteAlignment) + 1) * \ + byteAlignment / sizeof(CPP_TYPE::Scalar);) \ + if constexpr (alignmentEnforcement == AlignmentEnforcement::Enforced) \ + if (reinterpret_cast(BOOST_PP_CAT(NAME, _)) % byteAlignment) \ + throw std::out_of_range("In layout constructor: misaligned column: " #NAME); + +#define _ASSIGN_SOA_COLUMN_OR_SCALAR(R, DATA, TYPE_NAME) _ASSIGN_SOA_COLUMN_OR_SCALAR_IMPL TYPE_NAME + +/** + * Computation of the column or scalar size for SoA size computation + */ +#define _ACCUMULATE_SOA_ELEMENT_IMPL(VALUE_TYPE, CPP_TYPE, NAME) \ + _SWITCH_ON_TYPE(VALUE_TYPE, /* Scalar */ \ + ret += (((sizeof(CPP_TYPE) - 1) / byteAlignment) + 1) * byteAlignment; \ + , /* Column */ \ + ret += (((nElements * sizeof(CPP_TYPE) - 1) / byteAlignment) + 1) * byteAlignment; \ + , /* Eigen column */ \ + ret += (((nElements * sizeof(CPP_TYPE::Scalar) - 1) / byteAlignment) + 1) * byteAlignment * \ + CPP_TYPE::RowsAtCompileTime * CPP_TYPE::ColsAtCompileTime;) + +#define _ACCUMULATE_SOA_ELEMENT(R, DATA, TYPE_NAME) _ACCUMULATE_SOA_ELEMENT_IMPL TYPE_NAME + +/** + * Direct access to column pointer and indexed access + */ +#define _DECLARE_SOA_ACCESSOR_IMPL(VALUE_TYPE, CPP_TYPE, NAME) \ + _SWITCH_ON_TYPE( \ + VALUE_TYPE, /* Scalar */ \ + SOA_HOST_DEVICE_INLINE CPP_TYPE& NAME() { return *BOOST_PP_CAT(NAME, _); }, /* Column */ \ + SOA_HOST_DEVICE_INLINE CPP_TYPE* NAME() { \ + return BOOST_PP_CAT(NAME, _); \ + } SOA_HOST_DEVICE_INLINE CPP_TYPE& NAME(size_t index) { return BOOST_PP_CAT(NAME, _)[index]; }, \ + /* Eigen column */ /* Unsupported for the moment TODO */ \ + 
BOOST_PP_EMPTY()) + +#define _DECLARE_SOA_ACCESSOR(R, DATA, TYPE_NAME) BOOST_PP_EXPAND(_DECLARE_SOA_ACCESSOR_IMPL TYPE_NAME) + +/** + * Direct access to column pointer (const) and indexed access. + */ +#define _DECLARE_SOA_CONST_ACCESSOR_IMPL(VALUE_TYPE, CPP_TYPE, NAME) \ + _SWITCH_ON_TYPE( \ + VALUE_TYPE, /* Scalar */ \ + SOA_HOST_DEVICE_INLINE CPP_TYPE NAME() const { return *(BOOST_PP_CAT(NAME, _)); }, /* Column */ \ + SOA_HOST_DEVICE_INLINE CPP_TYPE const* NAME() \ + const { return BOOST_PP_CAT(NAME, _); } SOA_HOST_DEVICE_INLINE CPP_TYPE NAME(size_t index) \ + const { return *(BOOST_PP_CAT(NAME, _) + index); }, /* Eigen column */ \ + SOA_HOST_DEVICE_INLINE CPP_TYPE::Scalar const* NAME() \ + const { return BOOST_PP_CAT(NAME, _); } SOA_HOST_DEVICE_INLINE size_t BOOST_PP_CAT( \ + NAME, Stride)() { return BOOST_PP_CAT(NAME, Stride_); }) + +#define _DECLARE_SOA_CONST_ACCESSOR(R, DATA, TYPE_NAME) BOOST_PP_EXPAND(_DECLARE_SOA_CONST_ACCESSOR_IMPL TYPE_NAME) + +/** + * SoA class member declaration (column pointers). + */ +#define _DECLARE_SOA_DATA_MEMBER_IMPL(VALUE_TYPE, CPP_TYPE, NAME) \ + _SWITCH_ON_TYPE(VALUE_TYPE, /* Scalar */ \ + CPP_TYPE* BOOST_PP_CAT(NAME, _) = nullptr; \ + , /* Column */ \ + CPP_TYPE * BOOST_PP_CAT(NAME, _) = nullptr; \ + , /* Eigen column */ \ + CPP_TYPE::Scalar * BOOST_PP_CAT(NAME, _) = nullptr; \ + size_t BOOST_PP_CAT(NAME, Stride_) = 0;) + +#define _DECLARE_SOA_DATA_MEMBER(R, DATA, TYPE_NAME) BOOST_PP_EXPAND(_DECLARE_SOA_DATA_MEMBER_IMPL TYPE_NAME) + +#ifdef DEBUG +#define _DO_RANGECHECK true +#else +#define _DO_RANGECHECK false +#endif + +/* + * A macro defining a SoA layout (collection of scalars and columns of equal lengths) + */ +// clang-format off +#define GENERATE_SOA_LAYOUT(CLASS, ...) \ + template \ + struct CLASS: public cms::soa::BaseLayout { \ + /* these could be moved to an external type trait to free up the symbol names */ \ + using self_type = CLASS; \ + typedef cms::soa::AlignmentEnforcement AlignmentEnforcement; \ + \ + /* For CUDA applications, we align to the 128 bytes of the cache lines. \ + * See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-3-0 this is still valid \ + * up to compute capability 8.X. \ + */ \ + constexpr static size_t defaultAlignment = 128; \ + constexpr static size_t byteAlignment = ALIGNMENT; \ + constexpr static AlignmentEnforcement alignmentEnforcement = ALIGNMENT_ENFORCEMENT; \ + constexpr static size_t conditionalAlignment = \ + alignmentEnforcement == AlignmentEnforcement::Enforced ? 
byteAlignment : 0; \ + /* Those typedefs avoid having commas in macros (which is problematic) */ \ + template \ + using SoAValueWithConf = cms::soa::SoAValue; \ + \ + template \ + using SoAConstValueWithConf = cms::soa::SoAConstValue; \ + \ + /* dump the SoA internal structure */ \ + SOA_HOST_ONLY \ + void toStream(std::ostream & os) const { \ + os << #CLASS "(" << nElements_ << " elements, byte alignement= " << byteAlignment << ", @"<< mem_ <<"): " << std::endl; \ + os << " sizeof(" #CLASS "): " << sizeof(CLASS) << std::endl; \ + size_t offset = 0; \ + _ITERATE_ON_ALL(_DECLARE_SOA_STREAM_INFO, ~, __VA_ARGS__) \ + os << "Final offset = " << offset << " computeDataSize(...): " << computeDataSize(nElements_) \ + << std::endl; \ + os << std::endl; \ + } \ + \ + /* Helper function used by caller to externally allocate the storage */ \ + static size_t computeDataSize(size_t nElements) { \ + size_t ret = 0; \ + _ITERATE_ON_ALL(_ACCUMULATE_SOA_ELEMENT, ~, __VA_ARGS__) \ + return ret; \ + } \ + \ + /** \ + * Helper/friend class allowing SoA introspection. \ + */ \ + struct SoAMetadata { \ + friend CLASS; \ + SOA_HOST_DEVICE_INLINE size_t size() const { return parent_.nElements_; } \ + SOA_HOST_DEVICE_INLINE size_t byteSize() const { return parent_.byteSize_; } \ + SOA_HOST_DEVICE_INLINE size_t byteAlignment() const { return CLASS::byteAlignment; } \ + SOA_HOST_DEVICE_INLINE std::byte* data() { return parent_.mem_; } \ + SOA_HOST_DEVICE_INLINE const std::byte* data() const { return parent_.mem_; } \ + SOA_HOST_DEVICE_INLINE std::byte* nextByte() const { return parent_.mem_ + parent_.byteSize_; } \ + SOA_HOST_DEVICE_INLINE CLASS cloneToNewAddress(std::byte* addr) const { \ + return CLASS(addr, parent_.nElements_); \ + } \ + _ITERATE_ON_ALL(_DEFINE_METADATA_MEMBERS, ~, __VA_ARGS__) \ + \ + SoAMetadata& operator=(const SoAMetadata&) = delete; \ + SoAMetadata(const SoAMetadata&) = delete; \ + \ + private: \ + SOA_HOST_DEVICE_INLINE SoAMetadata(const CLASS& parent) : parent_(parent) {} \ + const CLASS& parent_; \ + typedef CLASS ParentClass; \ + }; \ + friend SoAMetadata; \ + SOA_HOST_DEVICE_INLINE const SoAMetadata soaMetadata() const { return SoAMetadata(*this); } \ + SOA_HOST_DEVICE_INLINE SoAMetadata soaMetadata() { return SoAMetadata(*this); } \ + \ + /* Trivial constuctor */ \ + CLASS() \ + : mem_(nullptr), \ + nElements_(0), \ + byteSize_(0), \ + _ITERATE_ON_ALL_COMMA(_DECLARE_MEMBER_TRIVIAL_CONSTRUCTION, ~, __VA_ARGS__) {} \ + \ + /* Constructor relying on user provided storage */ \ + SOA_HOST_ONLY CLASS(std::byte* mem, size_t nElements) : mem_(mem), nElements_(nElements), byteSize_(0) { \ + if constexpr (alignmentEnforcement == AlignmentEnforcement::Enforced) \ + if (reinterpret_cast(mem) % byteAlignment) \ + throw std::out_of_range("In " #CLASS "::" #CLASS ": misaligned buffer"); \ + auto curMem = mem_; \ + _ITERATE_ON_ALL(_ASSIGN_SOA_COLUMN_OR_SCALAR, ~, __VA_ARGS__) \ + /* Sanity check: we should have reached the computed size, only on host code */ \ + byteSize_ = computeDataSize(nElements_); \ + if (mem_ + byteSize_ != curMem) \ + throw std::out_of_range("In " #CLASS "::" #CLASS ": unexpected end pointer."); \ + } \ + \ + /* Constructor relying on user provided storage */ \ + SOA_DEVICE_ONLY CLASS(bool devConstructor, std::byte* mem, size_t nElements) : mem_(mem), nElements_(nElements) { \ + auto curMem = mem_; \ + _ITERATE_ON_ALL(_ASSIGN_SOA_COLUMN_OR_SCALAR, ~, __VA_ARGS__) \ + } \ + \ + /* dump the SoA internal structure */ \ + template \ + SOA_HOST_ONLY friend void dump(); \ + \ + 
private: \ + /* Range checker conditional to the macro _DO_RANGECHECK */ \ + SOA_HOST_DEVICE_INLINE \ + void rangeCheck(size_t index) const { \ + if constexpr (_DO_RANGECHECK) { \ + if (index >= nElements_) { \ + printf("In " #CLASS "::rangeCheck(): index out of range: %zu with nElements: %zu\n", index, nElements_); \ + assert(false); \ + } \ + } \ + } \ + \ + /* data members */ \ + std::byte* mem_; \ + size_t nElements_; \ + size_t byteSize_; \ + _ITERATE_ON_ALL(_DECLARE_SOA_DATA_MEMBER, ~, __VA_ARGS__) \ + }; +// clang-format on + +#endif // ndef DataStructures_SoALayout_h diff --git a/src/cudadev/DataFormats/SoAView.h b/src/cudadev/DataFormats/SoAView.h new file mode 100644 index 000000000..5b691c579 --- /dev/null +++ b/src/cudadev/DataFormats/SoAView.h @@ -0,0 +1,599 @@ +/* + * Structure-of-Arrays templates allowing access to a selection of scalars and columns from one + * or multiple SoA layouts or views. + * This template generator will allow handling subsets of columns from one or multiple SoA views or layouts. + */ + +#ifndef DataStructures_SoAView_h +#define DataStructures_SoAView_h + +#include "SoACommon.h" + +#define SOA_VIEW_LAYOUT(TYPE, NAME) (TYPE, NAME) + +#define SOA_VIEW_LAYOUT_LIST(...) __VA_ARGS__ + +#define SOA_VIEW_VALUE(LAYOUT_NAME, LAYOUT_MEMBER) (LAYOUT_NAME, LAYOUT_MEMBER, LAYOUT_MEMBER) + +#define SOA_VIEW_VALUE_RENAME(LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME) (LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME) + +#define SOA_VIEW_VALUE_LIST(...) __VA_ARGS__ + +/* + * A macro defining a SoA view (collection of columns from multiple layouts or views.) + * + * Usage: + * GENERATE_SOA_VIEW(PixelXYView, + * SOA_VIEW_LAYOUT_LIST( + * SOA_VIEW_LAYOUT(PixelDigis, pixelDigis), + * SOA_VIEW_LAYOUT(PixelRecHitsLayout, pixelsRecHit) + * ), + * SOA_VIEW_VALUE_LIST( + * SOA_VIEW_VALUE_RENAME(pixelDigis, x, digisX), + * SOA_VIEW_VALUE_RENAME(pixelDigis, y, digisY), + * SOA_VIEW_VALUE_RENAME(pixelsRecHit, x, recHitsX), + * SOA_VIEW_VALUE_RENAME(pixelsRecHit, y, recHitsY) + * ) + * ); + * + */ + +namespace cms::soa { + + /* Traits for the different column type scenarios */ + /* Value traits passes the class as is in the case of column type and return + * an empty class with functions returning non-scalar as accessors. */ + template + struct ConstValueTraits : public C { + using C::C; + }; + + template + struct ConstValueTraits { + // Just take to SoAValue type to generate the right constructor. + SOA_HOST_DEVICE_INLINE ConstValueTraits(size_t, const typename C::valueType*) {} + SOA_HOST_DEVICE_INLINE ConstValueTraits(size_t, const typename C::Params&) {} + SOA_HOST_DEVICE_INLINE ConstValueTraits(size_t, const typename C::ConstParams&) {} + // Any attempt to do anything with the "scalar" value a const element will fail. 
+ }; + +} // namespace cms::soa + +#include +/* + * Members definitions macros for viewa + */ + +/** + * Layout types aliasing for referencing by name + */ +#define _DECLARE_VIEW_LAYOUT_TYPE_ALIAS_IMPL(TYPE, NAME) typedef TYPE BOOST_PP_CAT(TypeOf_, NAME); + +#define _DECLARE_VIEW_LAYOUT_TYPE_ALIAS(R, DATA, TYPE_NAME) \ + BOOST_PP_EXPAND(_DECLARE_VIEW_LAYOUT_TYPE_ALIAS_IMPL TYPE_NAME) + +/** + * Member types aliasing for referencing by name + */ +#define _DECLARE_VIEW_MEMBER_TYPE_ALIAS_IMPL(LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME, DATA) \ + typedef typename BOOST_PP_CAT(TypeOf_, LAYOUT_NAME)::SoAMetadata::BOOST_PP_CAT(TypeOf_, LAYOUT_MEMBER) \ + BOOST_PP_CAT(TypeOf_, LOCAL_NAME); \ + typedef typename BOOST_PP_CAT(TypeOf_, LAYOUT_NAME)::SoAMetadata::BOOST_PP_CAT(ParametersTypeOf_, LAYOUT_MEMBER) \ + BOOST_PP_CAT(ParametersTypeOf_, LOCAL_NAME); \ + constexpr static cms::soa::SoAColumnType BOOST_PP_CAT(ColumnTypeOf_, LOCAL_NAME) = \ + BOOST_PP_CAT(TypeOf_, LAYOUT_NAME)::SoAMetadata::BOOST_PP_CAT(ColumnTypeOf_, LAYOUT_MEMBER); \ + SOA_HOST_DEVICE_INLINE DATA auto* BOOST_PP_CAT(addressOf_, LOCAL_NAME)() const { \ + return parent_.soaMetadata().BOOST_PP_CAT(parametersOf_, LOCAL_NAME)().addr_; \ + }; \ + SOA_HOST_DEVICE_INLINE \ + DATA BOOST_PP_CAT(ParametersTypeOf_, LOCAL_NAME) BOOST_PP_CAT(parametersOf_, LOCAL_NAME)() const { \ + return parent_.BOOST_PP_CAT(LOCAL_NAME, Parameters_); \ + }; + +#define _DECLARE_VIEW_MEMBER_TYPE_ALIAS(R, DATA, LAYOUT_MEMBER_NAME) \ + BOOST_PP_EXPAND(_DECLARE_VIEW_MEMBER_TYPE_ALIAS_IMPL BOOST_PP_TUPLE_PUSH_BACK(LAYOUT_MEMBER_NAME, DATA)) + +/** + * Generator of parameters (layouts/views) for constructor by layouts/views. + */ +#define _DECLARE_VIEW_CONSTRUCTION_PARAMETERS_IMPL(LAYOUT_TYPE, LAYOUT_NAME, DATA) (DATA LAYOUT_TYPE & LAYOUT_NAME) + +#define _DECLARE_VIEW_CONSTRUCTION_PARAMETERS(R, DATA, TYPE_NAME) \ + BOOST_PP_EXPAND(_DECLARE_VIEW_CONSTRUCTION_PARAMETERS_IMPL BOOST_PP_TUPLE_PUSH_BACK(TYPE_NAME, DATA)) + +/** + * Generator of parameters for constructor by column. + */ +#define _DECLARE_VIEW_CONSTRUCTION_BYCOLUMN_PARAMETERS_IMPL(LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME, DATA) \ + (DATA typename BOOST_PP_CAT(SoAMetadata::ParametersTypeOf_, LOCAL_NAME)::TupleOrPointerType LOCAL_NAME) + +#define _DECLARE_VIEW_CONSTRUCTION_BYCOLUMN_PARAMETERS(R, DATA, LAYOUT_MEMBER_NAME) \ + BOOST_PP_EXPAND( \ + _DECLARE_VIEW_CONSTRUCTION_BYCOLUMN_PARAMETERS_IMPL BOOST_PP_TUPLE_PUSH_BACK(LAYOUT_MEMBER_NAME, DATA)) + +/** + * Generator of member initialization from constructor. + * We use a lambda with auto return type to handle multiple possible return types. + */ +#define _DECLARE_VIEW_MEMBER_INITIALIZERS_IMPL(LAYOUT, MEMBER, NAME) \ + (BOOST_PP_CAT(NAME, Parameters_)([&]() -> auto { \ + auto params = LAYOUT.soaMetadata().BOOST_PP_CAT(parametersOf_, MEMBER)(); \ + if constexpr (alignmentEnforcement == AlignmentEnforcement::Enforced) \ + if (reinterpret_cast(params.addr_) % byteAlignment) \ + throw std::out_of_range("In constructor by layout: misaligned column: " #NAME); \ + return params; \ + }())) + +#define _DECLARE_VIEW_MEMBER_INITIALIZERS(R, DATA, LAYOUT_MEMBER_NAME) \ + BOOST_PP_EXPAND(_DECLARE_VIEW_MEMBER_INITIALIZERS_IMPL LAYOUT_MEMBER_NAME) + +/** + * Generator of size computation for constructor. + * This is the per-layout part of the lambda checking they all have the same size. 
+ */ +#define _UPDATE_SIZE_OF_VIEW_IMPL(LAYOUT_TYPE, LAYOUT_NAME) \ + if (set) { \ + if (ret != LAYOUT_NAME.soaMetadata().size()) \ + throw std::out_of_range("In constructor by layout: different sizes from layouts."); \ + } else { \ + ret = LAYOUT_NAME.soaMetadata().size(); \ + set = true; \ + } + +#define _UPDATE_SIZE_OF_VIEW(R, DATA, TYPE_NAME) BOOST_PP_EXPAND(_UPDATE_SIZE_OF_VIEW_IMPL TYPE_NAME) + +/** + * Generator of member initialization from constructor. + * We use a lambda with auto return type to handle multiple possible return types. + */ +// clang-format off +#define _DECLARE_VIEW_MEMBER_INITIALIZERS_BYCOLUMN_IMPL(LAYOUT, MEMBER, NAME) \ + ( \ + BOOST_PP_CAT(NAME, Parameters_)([&]() -> auto { \ + if constexpr (alignmentEnforcement == AlignmentEnforcement::Enforced) \ + if (SoAMetadata:: BOOST_PP_CAT(ParametersTypeOf_, NAME)::checkAlignment(NAME, byteAlignment)) \ + throw std::out_of_range("In constructor by column: misaligned column: " #NAME); \ + return NAME; \ + }()) \ + ) +// clang-format on + +#define _DECLARE_VIEW_MEMBER_INITIALIZERS_BYCOLUMN(R, DATA, LAYOUT_MEMBER_NAME) \ + BOOST_PP_EXPAND(_DECLARE_VIEW_MEMBER_INITIALIZERS_BYCOLUMN_IMPL LAYOUT_MEMBER_NAME) + +/** + * Generator of element members initializer. + */ +#define _DECLARE_VIEW_ELEM_MEMBER_INIT_IMPL(LAYOUT, MEMBER, LOCAL_NAME, DATA) (LOCAL_NAME(DATA, LOCAL_NAME)) + +#define _DECLARE_VIEW_ELEM_MEMBER_INIT(R, DATA, LAYOUT_MEMBER_NAME) \ + BOOST_PP_EXPAND(_DECLARE_VIEW_ELEM_MEMBER_INIT_IMPL BOOST_PP_TUPLE_PUSH_BACK(LAYOUT_MEMBER_NAME, DATA)) + +/** + * Helper macro extracting the data type from metadata of a layout or view + */ +#define _COLUMN_TYPE(LAYOUT_NAME, LAYOUT_MEMBER) \ + typename std::remove_pointer::type + +/** + * Generator of parameters for (non-const) element subclass (expanded comma separated). + */ +#define _DECLARE_VIEW_ELEMENT_VALUE_ARG_IMPL(LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME, DATA) \ + (DATA typename BOOST_PP_CAT(SoAMetadata::ParametersTypeOf_, LOCAL_NAME) LOCAL_NAME) + +#define _DECLARE_VIEW_ELEMENT_VALUE_ARG(R, DATA, LAYOUT_MEMBER_NAME) \ + _DECLARE_VIEW_ELEMENT_VALUE_ARG_IMPL BOOST_PP_TUPLE_PUSH_BACK(LAYOUT_MEMBER_NAME, DATA) + +/** + * Generator of parameters for (const) element subclass (expanded comma separated). 
+ */ +#define _DECLARE_CONST_VIEW_ELEMENT_VALUE_ARG_IMPL(LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME, DATA) \ + (DATA typename BOOST_PP_CAT(SoAMetadata::ParametersTypeOf_, LOCAL_NAME)::ConstType LOCAL_NAME) + +#define _DECLARE_CONST_VIEW_ELEMENT_VALUE_ARG(R, DATA, LAYOUT_MEMBER_NAME) \ + _DECLARE_CONST_VIEW_ELEMENT_VALUE_ARG_IMPL BOOST_PP_TUPLE_PUSH_BACK(LAYOUT_MEMBER_NAME, DATA) + +/** + * Generator of member initialization for constructor of element subclass + */ +#define _DECLARE_VIEW_CONST_ELEM_MEMBER_INIT_IMPL(LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME, DATA) \ + (BOOST_PP_CAT(LOCAL_NAME, _)(DATA, LOCAL_NAME)) + +/* declare AoS-like element value args for contructor; these should expand,for columns only */ +#define _DECLARE_VIEW_CONST_ELEM_MEMBER_INIT(R, DATA, LAYOUT_MEMBER_NAME) \ + BOOST_PP_EXPAND(_DECLARE_VIEW_CONST_ELEM_MEMBER_INIT_IMPL BOOST_PP_TUPLE_PUSH_BACK(LAYOUT_MEMBER_NAME, DATA)) + +/** + * Declaration of the members accessors of the const element subclass + */ +#define _DECLARE_VIEW_CONST_ELEMENT_ACCESSOR_IMPL(LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME) \ + SOA_HOST_DEVICE_INLINE \ + typename SoAConstValueWithConf::RefToConst \ + LOCAL_NAME() const { \ + return BOOST_PP_CAT(LOCAL_NAME, _)(); \ + } + +#define _DECLARE_VIEW_CONST_ELEMENT_ACCESSOR(R, DATA, LAYOUT_MEMBER_NAME) \ + _DECLARE_VIEW_CONST_ELEMENT_ACCESSOR_IMPL LAYOUT_MEMBER_NAME + +/** + * Declaration of the private members of the const element subclass + */ +#define _DECLARE_VIEW_CONST_ELEMENT_VALUE_MEMBER_IMPL(LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME) \ + const cms::soa::ConstValueTraits, \ + BOOST_PP_CAT(SoAMetadata::ColumnTypeOf_, LOCAL_NAME)> \ + BOOST_PP_CAT(LOCAL_NAME, _); + +#define _DECLARE_VIEW_CONST_ELEMENT_VALUE_MEMBER(R, DATA, LAYOUT_MEMBER_NAME) \ + _DECLARE_VIEW_CONST_ELEMENT_VALUE_MEMBER_IMPL LAYOUT_MEMBER_NAME + +/** + * Generator of the member-by-member copy operator of the element subclass. 
+ */ +#define _DECLARE_VIEW_ELEMENT_VALUE_COPY_IMPL(LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME) \ + if constexpr (SoAMetadata::BOOST_PP_CAT(ColumnTypeOf_, LOCAL_NAME) != cms::soa::SoAColumnType::scalar) \ + LOCAL_NAME() = other.LOCAL_NAME(); + +#define _DECLARE_VIEW_ELEMENT_VALUE_COPY(R, DATA, LAYOUT_MEMBER_NAME) \ + BOOST_PP_EXPAND(_DECLARE_VIEW_ELEMENT_VALUE_COPY_IMPL LAYOUT_MEMBER_NAME) + +/** + * Declaration of the private members of the const element subclass + */ +#define _DECLARE_VIEW_ELEMENT_VALUE_MEMBER_IMPL(LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME) \ + SoAValueWithConf \ + LOCAL_NAME; + +#define _DECLARE_VIEW_ELEMENT_VALUE_MEMBER(R, DATA, LAYOUT_MEMBER_NAME) \ + _DECLARE_VIEW_ELEMENT_VALUE_MEMBER_IMPL LAYOUT_MEMBER_NAME + +/** + * Parameters passed to element subclass constructor in operator[] + */ +#define _DECLARE_VIEW_ELEMENT_CONSTR_CALL_IMPL(LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME) \ + (BOOST_PP_CAT(LOCAL_NAME, Parameters_)) + +#define _DECLARE_VIEW_ELEMENT_CONSTR_CALL(R, DATA, LAYOUT_MEMBER_NAME) \ + BOOST_PP_EXPAND(_DECLARE_VIEW_ELEMENT_CONSTR_CALL_IMPL LAYOUT_MEMBER_NAME) + +/** + * Direct access to column pointer and indexed access + */ +#define _DECLARE_VIEW_SOA_ACCESSOR_IMPL(LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME) \ + /* Column or scalar */ \ + SOA_HOST_DEVICE_INLINE \ + typename cms::soa::SoAAccessors:: \ + template ColumnType::template AccessType< \ + cms::soa::SoAAccessType::mutableAccess>::NoParamReturnType \ + LOCAL_NAME() { \ + return typename cms::soa::SoAAccessors:: \ + template ColumnType::template AccessType< \ + cms::soa::SoAAccessType::mutableAccess>(BOOST_PP_CAT(LOCAL_NAME, Parameters_))(); \ + } \ + SOA_HOST_DEVICE_INLINE auto& LOCAL_NAME(size_t index) { \ + return typename cms::soa::SoAAccessors:: \ + template ColumnType::template AccessType< \ + cms::soa::SoAAccessType::mutableAccess>(BOOST_PP_CAT(LOCAL_NAME, Parameters_))(index); \ + } + +#define _DECLARE_VIEW_SOA_ACCESSOR(R, DATA, LAYOUT_MEMBER_NAME) \ + BOOST_PP_EXPAND(_DECLARE_VIEW_SOA_ACCESSOR_IMPL LAYOUT_MEMBER_NAME) + +/** + * Direct access to column pointer (const) and indexed access. + */ +#define _DECLARE_VIEW_SOA_CONST_ACCESSOR_IMPL(LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME) \ + /* Column or scalar */ \ + SOA_HOST_DEVICE_INLINE auto LOCAL_NAME() const { \ + return typename cms::soa::SoAAccessors:: \ + template ColumnType::template AccessType< \ + cms::soa::SoAAccessType::constAccess>(BOOST_PP_CAT(LOCAL_NAME, Parameters_))(); \ + } \ + SOA_HOST_DEVICE_INLINE auto LOCAL_NAME(size_t index) const { \ + return typename cms::soa::SoAAccessors:: \ + template ColumnType::template AccessType< \ + cms::soa::SoAAccessType::constAccess>(BOOST_PP_CAT(LOCAL_NAME, Parameters_))(index); \ + } + +#define _DECLARE_VIEW_SOA_CONST_ACCESSOR(R, DATA, LAYOUT_MEMBER_NAME) \ + BOOST_PP_EXPAND(_DECLARE_VIEW_SOA_CONST_ACCESSOR_IMPL LAYOUT_MEMBER_NAME) + +/** + * SoA class member declaration (column pointers and parameters). + */ +#define _DECLARE_VIEW_SOA_MEMBER_IMPL(LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME, DATA) \ + typename BOOST_PP_CAT(SoAMetadata::ParametersTypeOf_, LOCAL_NAME) BOOST_PP_CAT(LOCAL_NAME, Parameters_); + +#define _DECLARE_VIEW_SOA_MEMBER(R, DATA, LAYOUT_MEMBER_NAME) \ + BOOST_PP_EXPAND(_DECLARE_VIEW_SOA_MEMBER_IMPL BOOST_PP_TUPLE_PUSH_BACK(LAYOUT_MEMBER_NAME, DATA)) + +/** + * Const SoA class member declaration (column pointers and parameters). 
+ */ +#define _DECLARE_CONST_VIEW_SOA_MEMBER_IMPL(LAYOUT_NAME, LAYOUT_MEMBER, LOCAL_NAME, DATA) \ + typename BOOST_PP_CAT(SoAMetadata::ParametersTypeOf_, LOCAL_NAME)::ConstType BOOST_PP_CAT(LOCAL_NAME, Parameters_); + +#define _DECLARE_CONST_VIEW_SOA_MEMBER(R, DATA, LAYOUT_MEMBER_NAME) \ + BOOST_PP_EXPAND(_DECLARE_CONST_VIEW_SOA_MEMBER_IMPL BOOST_PP_TUPLE_PUSH_BACK(LAYOUT_MEMBER_NAME, DATA)) + +/* ---- MUTABLE VIEW -------------------------------------------------------------------------------------------------------------------- */ +// clang-format off +#define GENERATE_SOA_VIEW(CLASS, LAYOUTS_LIST, VALUE_LIST) \ + template \ + struct CLASS { \ + /* these could be moved to an external type trait to free up the symbol names */ \ + using self_type = CLASS; \ + typedef cms::soa::AlignmentEnforcement AlignmentEnforcement; \ + \ + /* For CUDA applications, we align to the 128 bytes of the cache lines. \ + * See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-3-0 this is still valid \ + * up to compute capability 8.X. \ + */ \ + constexpr static size_t defaultAlignment = cms::soa::CacheLineSize::defaultSize; \ + constexpr static size_t byteAlignment = ALIGNMENT; \ + constexpr static AlignmentEnforcement alignmentEnforcement = ALIGNMENT_ENFORCEMENT; \ + constexpr static size_t conditionalAlignment = \ + alignmentEnforcement == AlignmentEnforcement::Enforced ? byteAlignment : 0; \ + constexpr static cms::soa::RestrictQualify restrictQualify = RESTRICT_QUALIFY; \ + constexpr static cms::soa::RangeChecking rangeChecking = RANGE_CHECKING; \ + /* Those typedefs avoid having commas in macros (which is problematic) */ \ + template \ + using SoAValueWithConf = cms::soa::SoAValue; \ + \ + template \ + using SoAConstValueWithConf = cms::soa::SoAConstValue; \ + \ + /** \ + * Helper/friend class allowing SoA introspection. 
\ + */ \ + struct SoAMetadata { \ + friend CLASS; \ + SOA_HOST_DEVICE_INLINE size_t size() const { return parent_.nElements_; } \ + /* Alias layout or view types to name-derived identifyer to allow simpler definitions */ \ + _ITERATE_ON_ALL(_DECLARE_VIEW_LAYOUT_TYPE_ALIAS, ~, LAYOUTS_LIST) \ + \ + /* Alias member types to name-derived identifyer to allow simpler definitions */ \ + _ITERATE_ON_ALL(_DECLARE_VIEW_MEMBER_TYPE_ALIAS, BOOST_PP_EMPTY(), VALUE_LIST) \ + \ + /* Forbid copying to avoid const correctness evasion */ \ + SoAMetadata& operator=(const SoAMetadata&) = delete; \ + SoAMetadata(const SoAMetadata&) = delete; \ + \ + private: \ + SOA_HOST_DEVICE_INLINE SoAMetadata(const CLASS& parent) : parent_(parent) {} \ + const CLASS& parent_; \ + }; \ + friend SoAMetadata; \ + SOA_HOST_DEVICE_INLINE const SoAMetadata soaMetadata() const { return SoAMetadata(*this); } \ + SOA_HOST_DEVICE_INLINE SoAMetadata soaMetadata() { return SoAMetadata(*this); } \ + \ + /* Trivial constuctor */ \ + CLASS() {} \ + \ + /* Constructor relying on user provided layouts or views */ \ + SOA_HOST_ONLY CLASS(_ITERATE_ON_ALL_COMMA(_DECLARE_VIEW_CONSTRUCTION_PARAMETERS, BOOST_PP_EMPTY(), LAYOUTS_LIST)) \ + : nElements_([&]() -> size_t { \ + bool set = false; \ + size_t ret = 0; \ + _ITERATE_ON_ALL(_UPDATE_SIZE_OF_VIEW, BOOST_PP_EMPTY(), LAYOUTS_LIST) \ + return ret; \ + }()), \ + _ITERATE_ON_ALL_COMMA(_DECLARE_VIEW_MEMBER_INITIALIZERS, ~, VALUE_LIST) {} \ + \ + /* Constructor relying on individually provided column addresses */ \ + SOA_HOST_ONLY CLASS(size_t nElements, \ + _ITERATE_ON_ALL_COMMA(_DECLARE_VIEW_CONSTRUCTION_BYCOLUMN_PARAMETERS, \ + BOOST_PP_EMPTY(), \ + VALUE_LIST)) \ + : nElements_(nElements), _ITERATE_ON_ALL_COMMA(_DECLARE_VIEW_MEMBER_INITIALIZERS_BYCOLUMN, ~, VALUE_LIST) {} \ + \ + struct const_element { \ + SOA_HOST_DEVICE_INLINE \ + const_element(size_t index, /* Declare parameters */ \ + _ITERATE_ON_ALL_COMMA(_DECLARE_VIEW_ELEMENT_VALUE_ARG, const, VALUE_LIST)) \ + : _ITERATE_ON_ALL_COMMA(_DECLARE_VIEW_CONST_ELEM_MEMBER_INIT, index, VALUE_LIST) {} \ + _ITERATE_ON_ALL(_DECLARE_VIEW_CONST_ELEMENT_ACCESSOR, ~, VALUE_LIST) \ + \ + private: \ + _ITERATE_ON_ALL(_DECLARE_VIEW_CONST_ELEMENT_VALUE_MEMBER, ~, VALUE_LIST) \ + }; \ + \ + struct element { \ + SOA_HOST_DEVICE_INLINE \ + element(size_t index, /* Declare parameters */ \ + _ITERATE_ON_ALL_COMMA(_DECLARE_VIEW_ELEMENT_VALUE_ARG, BOOST_PP_EMPTY(), VALUE_LIST)) \ + : _ITERATE_ON_ALL_COMMA(_DECLARE_VIEW_ELEM_MEMBER_INIT, index, VALUE_LIST) {} \ + SOA_HOST_DEVICE_INLINE \ + element& operator=(const element& other) { \ + _ITERATE_ON_ALL(_DECLARE_VIEW_ELEMENT_VALUE_COPY, ~, VALUE_LIST) \ + return *this; \ + } \ + _ITERATE_ON_ALL(_DECLARE_VIEW_ELEMENT_VALUE_MEMBER, ~, VALUE_LIST) \ + }; \ + \ + /* AoS-like accessor (non-const) */ \ + SOA_HOST_DEVICE_INLINE \ + element operator[](size_t index) { \ + if constexpr (rangeChecking == cms::soa::RangeChecking::Enabled) { \ + if (index >= nElements_) \ + SOA_THROW_OUT_OF_RANGE("Out of range index in " #CLASS "::operator[]") \ + } \ + return element(index, _ITERATE_ON_ALL_COMMA(_DECLARE_VIEW_ELEMENT_CONSTR_CALL, ~, VALUE_LIST)); \ + } \ + \ + /* AoS-like accessor (const) */ \ + SOA_HOST_DEVICE_INLINE \ + const_element operator[](size_t index) const { \ + if constexpr (rangeChecking == cms::soa::RangeChecking::Enabled) { \ + if (index >= nElements_) \ + SOA_THROW_OUT_OF_RANGE("Out of range index in " #CLASS "::operator[]") \ + } \ + return const_element(index, 
_ITERATE_ON_ALL_COMMA(_DECLARE_VIEW_ELEMENT_CONSTR_CALL, ~, VALUE_LIST)); \ + } \ + \ + /* accessors */ \ + _ITERATE_ON_ALL(_DECLARE_VIEW_SOA_ACCESSOR, ~, VALUE_LIST) \ + _ITERATE_ON_ALL(_DECLARE_VIEW_SOA_CONST_ACCESSOR, ~, VALUE_LIST) \ + \ + /* dump the SoA internal structure */ \ + template \ + SOA_HOST_ONLY friend void dump(); \ + \ + private: \ + size_t nElements_ = 0; \ + _ITERATE_ON_ALL(_DECLARE_VIEW_SOA_MEMBER, BOOST_PP_EMPTY(), VALUE_LIST) \ + }; +// clang-format on + +/* ---- CONST VIEW --------------------------------------------------------------------------------------------------------------------- */ +// clang-format off +#define GENERATE_SOA_CONST_VIEW(CLASS, LAYOUTS_LIST, VALUE_LIST) \ + template \ + struct CLASS { \ + /* these could be moved to an external type trait to free up the symbol names */ \ + using self_type = CLASS; \ + typedef cms::soa::AlignmentEnforcement AlignmentEnforcement; \ + \ + /* For CUDA applications, we align to the 128 bytes of the cache lines. \ + * See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-memory-3-0 this is still valid \ + * up to compute capability 8.X. \ + */ \ + constexpr static size_t defaultAlignment = cms::soa::CacheLineSize::defaultSize; \ + constexpr static size_t byteAlignment = ALIGNMENT; \ + constexpr static AlignmentEnforcement alignmentEnforcement = ALIGNMENT_ENFORCEMENT; \ + constexpr static size_t conditionalAlignment = \ + alignmentEnforcement == AlignmentEnforcement::Enforced ? byteAlignment : 0; \ + constexpr static cms::soa::RestrictQualify restrictQualify = RESTRICT_QUALIFY; \ + constexpr static cms::soa::RangeChecking rangeChecking = RANGE_CHECKING; \ + /* Those typedefs avoid having commas in macros (which is problematic) */ \ + template \ + using SoAValueWithConf = cms::soa::SoAValue; \ + \ + template \ + using SoAConstValueWithConf = cms::soa::SoAConstValue; \ + /** \ + * Helper/friend class allowing SoA introspection. 
\ + */ \ + struct SoAMetadata { \ + friend CLASS; \ + SOA_HOST_DEVICE_INLINE size_t size() const { return parent_.nElements_; } \ + /* Alias layout/view types to name-derived identifyer to allow simpler definitions */ \ + _ITERATE_ON_ALL(_DECLARE_VIEW_LAYOUT_TYPE_ALIAS, ~, LAYOUTS_LIST) \ + \ + /* Alias member types to name-derived identifyer to allow simpler definitions */ \ + _ITERATE_ON_ALL(_DECLARE_VIEW_MEMBER_TYPE_ALIAS, const, VALUE_LIST) \ + \ + SoAMetadata& operator=(const SoAMetadata&) = delete; \ + SoAMetadata(const SoAMetadata&) = delete; \ + \ + private: \ + SOA_HOST_DEVICE_INLINE SoAMetadata(const CLASS& parent) : parent_(parent) {} \ + const CLASS& parent_; \ + }; \ + friend SoAMetadata; \ + SOA_HOST_DEVICE_INLINE const SoAMetadata soaMetadata() const { return SoAMetadata(*this); } \ + \ + /* Trivial constuctor */ \ + CLASS() {} \ + \ + /* Constructor relying on user provided layouts or views */ \ + SOA_HOST_ONLY CLASS(_ITERATE_ON_ALL_COMMA(_DECLARE_VIEW_CONSTRUCTION_PARAMETERS, const, LAYOUTS_LIST)) \ + : nElements_([&]() -> size_t { \ + bool set = false; \ + size_t ret = 0; \ + _ITERATE_ON_ALL(_UPDATE_SIZE_OF_VIEW, BOOST_PP_EMPTY(), LAYOUTS_LIST) \ + return ret; \ + }()), \ + _ITERATE_ON_ALL_COMMA(_DECLARE_VIEW_MEMBER_INITIALIZERS, ~, VALUE_LIST) {} \ + \ + /* Constructor relying on individually provided column addresses */ \ + SOA_HOST_ONLY CLASS(size_t nElements, \ + _ITERATE_ON_ALL_COMMA(_DECLARE_VIEW_CONSTRUCTION_BYCOLUMN_PARAMETERS, const, VALUE_LIST)) \ + : nElements_(nElements), _ITERATE_ON_ALL_COMMA(_DECLARE_VIEW_MEMBER_INITIALIZERS_BYCOLUMN, ~, VALUE_LIST) {} \ + \ + struct const_element { \ + SOA_HOST_DEVICE_INLINE \ + const_element(size_t index, /* Declare parameters */ \ + _ITERATE_ON_ALL_COMMA(_DECLARE_CONST_VIEW_ELEMENT_VALUE_ARG, const, VALUE_LIST)) \ + : _ITERATE_ON_ALL_COMMA(_DECLARE_VIEW_CONST_ELEM_MEMBER_INIT, index, VALUE_LIST) {} \ + _ITERATE_ON_ALL(_DECLARE_VIEW_CONST_ELEMENT_ACCESSOR, ~, VALUE_LIST) \ + \ + private: \ + _ITERATE_ON_ALL(_DECLARE_VIEW_CONST_ELEMENT_VALUE_MEMBER, ~, VALUE_LIST) \ + }; \ + \ + /* AoS-like accessor (const) */ \ + SOA_HOST_DEVICE_INLINE \ + const_element operator[](size_t index) const { \ + if constexpr (rangeChecking == cms::soa::RangeChecking::Enabled) { \ + if (index >= nElements_) \ + SOA_THROW_OUT_OF_RANGE("Out of range index in " #CLASS "::operator[]") \ + } \ + return const_element(index, _ITERATE_ON_ALL_COMMA(_DECLARE_VIEW_ELEMENT_CONSTR_CALL, ~, VALUE_LIST)); \ + } \ + \ + /* accessors */ \ + _ITERATE_ON_ALL(_DECLARE_VIEW_SOA_CONST_ACCESSOR, ~, VALUE_LIST) \ + \ + /* dump the SoA internal structure */ \ + template \ + SOA_HOST_ONLY friend void dump(); \ + \ + private: \ + size_t nElements_ = 0; \ + _ITERATE_ON_ALL(_DECLARE_CONST_VIEW_SOA_MEMBER, const, VALUE_LIST) \ +}; +// clang-format on + +/** + * Helper macro turning layout field declaration into view field declaration. + */ +#define _VIEW_FIELD_FROM_LAYOUT_IMPL(VALUE_TYPE, CPP_TYPE, NAME, DATA) (DATA, NAME, NAME) + +#define _VIEW_FIELD_FROM_LAYOUT(R, DATA, VALUE_TYPE_NAME) \ + BOOST_PP_EXPAND((_VIEW_FIELD_FROM_LAYOUT_IMPL BOOST_PP_TUPLE_PUSH_BACK(VALUE_TYPE_NAME, DATA))) + +/** + * A macro defining both layout and view(s) in one go. + */ + +#define GENERATE_SOA_LAYOUT_VIEW_AND_CONST_VIEW(LAYOUT_NAME, VIEW_NAME, CONST_VIEW_NAME, ...) 
\ + GENERATE_SOA_LAYOUT(LAYOUT_NAME, __VA_ARGS__) \ + using BOOST_PP_CAT(LAYOUT_NAME, _default) = LAYOUT_NAME<>; \ + GENERATE_SOA_VIEW(VIEW_NAME, \ + SOA_VIEW_LAYOUT_LIST((BOOST_PP_CAT(LAYOUT_NAME, _default), BOOST_PP_CAT(instance_, LAYOUT_NAME))), \ + SOA_VIEW_VALUE_LIST(_ITERATE_ON_ALL_COMMA( \ + _VIEW_FIELD_FROM_LAYOUT, BOOST_PP_CAT(instance_, LAYOUT_NAME), __VA_ARGS__))) \ + GENERATE_SOA_CONST_VIEW( \ + CONST_VIEW_NAME, \ + SOA_VIEW_LAYOUT_LIST((BOOST_PP_CAT(LAYOUT_NAME, _default), BOOST_PP_CAT(instance_, LAYOUT_NAME))), \ + SOA_VIEW_VALUE_LIST( \ + _ITERATE_ON_ALL_COMMA(_VIEW_FIELD_FROM_LAYOUT, BOOST_PP_CAT(instance_, LAYOUT_NAME), __VA_ARGS__))) + +#define GENERATE_SOA_LAYOUT_AND_VIEW(LAYOUT_NAME, VIEW_NAME, ...) \ + GENERATE_SOA_LAYOUT(LAYOUT_NAME, __VA_ARGS__); \ + using BOOST_PP_CAT(LAYOUT_NAME, _default) = LAYOUT_NAME<>; \ + GENERATE_SOA_VIEW(VIEW_NAME, \ + SOA_VIEW_LAYOUT_LIST((BOOST_PP_CAT(LAYOUT_NAME, _default), BOOST_PP_CAT(instance_, LAYOUT_NAME))), \ + SOA_VIEW_VALUE_LIST(_ITERATE_ON_ALL_COMMA( \ + _VIEW_FIELD_FROM_LAYOUT, BOOST_PP_CAT(instance_, LAYOUT_NAME), __VA_ARGS__))) + +#define GENERATE_SOA_LAYOUT_AND_CONST_VIEW(LAYOUT_NAME, CONST_VIEW_NAME, ...) \ + GENERATE_SOA_LAYOUT(LAYOUT_NAME, __VA_ARGS__) \ + using BOOST_PP_CAT(LAYOUT_NAME, _default) = LAYOUT_NAME<>; \ + GENERATE_SOA_CONST_VIEW( \ + CONST_VIEW_NAME, \ + SOA_VIEW_LAYOUT_LIST((BOOST_PP_CAT(LAYOUT_NAME, _default), BOOST_PP_CAT(instance_, LAYOUT_NAME))), \ + SOA_VIEW_VALUE_LIST( \ + _ITERATE_ON_ALL_COMMA(_VIEW_FIELD_FROM_LAYOUT, BOOST_PP_CAT(instance_, LAYOUT_NAME), __VA_ARGS__))) + +#endif // ndef DataStructures_SoAView_h diff --git a/src/cudadev/bin/Source.h b/src/cudadev/bin/Source.h index 69d54d336..c29685c07 100644 --- a/src/cudadev/bin/Source.h +++ b/src/cudadev/bin/Source.h @@ -17,11 +17,8 @@ namespace edm { class Source { public: - explicit Source(int maxEvents, - int runForMinutes, - ProductRegistry& reg, - std::filesystem::path const& datadir, - bool validation); + explicit Source( + int maxEvents, int runForMinutes, ProductRegistry& reg, std::filesystem::path const& datadir, bool validation); void startProcessing(); diff --git a/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.h b/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.h index 67831af89..b7b5bfcc2 100644 --- a/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.h +++ b/src/cudadev/plugin-PixelTriplets/BrokenLineFitOnGPU.h @@ -16,7 +16,7 @@ #include "BrokenLine.h" #include "HelixFitOnGPU.h" -using HitsOnGPU = TrackingRecHit2DSOAView; +using HitsOnGPU = TrackingRecHit2DSOAStore; using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; @@ -79,8 +79,8 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, auto hit = hitId[i]; float ge[6]; hhp->cpeParams() - .detParams(hhp->detectorIndex(hit)) - .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); + .detParams((*hhp)[hit].detectorIndex()) + .frame.toGlobal((*hhp)[hit].xerrLocal(), 0, (*hhp)[hit].yerrLocal(), ge); #ifdef BL_DUMP_HITS if (dump) { printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n", @@ -102,7 +102,7 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, ge[5]); } #endif - hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); + hits.col(i) << (*hhp)[hit].xGlobal(), (*hhp)[hit].yGlobal(), (*hhp)[hit].zGlobal(); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } brokenline::fastFit(hits, fast_fit); @@ -181,4 +181,4 @@ __global__ void 
kernel_BLFit(caConstants::TupleMultiplicity const *__restrict__ line.cov(1, 1)); #endif } -} +} \ No newline at end of file diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc index f2805d018..ed7a4776c 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cc @@ -63,7 +63,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr device_nCells_, device_theCellNeighbors_.get(), device_theCellTracks_.get(), - hh.view(), + hh.store(), device_isOuterHitOfCell_.get(), nActualPairs, params_.idealConditions_, @@ -94,7 +94,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_connect(device_hitTuple_apc_, device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), + hh.store(), device_theCells_.get(), device_nCells_, device_theCellNeighbors_.get(), @@ -108,10 +108,10 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * if (nhits > 1 && params_.earlyFishbone_) { gpuPixelDoublets::fishbone( - hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); + hh.store(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); } - kernel_find_ntuplets(hh.view(), + kernel_find_ntuplets(hh.store(), device_theCells_.get(), device_nCells_, device_theCellTracks_.get(), @@ -120,7 +120,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * quality_d, params_.minHitsPerNtuplet_); if (params_.doStats_) - kernel_mark_used(hh.view(), device_theCells_.get(), device_nCells_); + kernel_mark_used(hh.store(), device_theCells_.get(), device_nCells_); cms::cuda::finalizeBulk(device_hitTuple_apc_, tuples_d); @@ -133,7 +133,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, TkSoA * if (nhits > 1 && params_.lateFishbone_) { gpuPixelDoublets::fishbone( - hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); + hh.store(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); } if (params_.doStats_) { @@ -178,7 +178,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA // remove duplicates (tracks that share a hit) if (params_.doSharedHitCut_) { kernel_sharedHitCleaner( - hh.view(), tuples_d, tracks_d, quality_d, params_.minHitsForSharingCut_, device_hitToTuple_.get()); + hh.store(), tuples_d, tracks_d, quality_d, params_.minHitsForSharingCut_, device_hitToTuple_.get()); } if (params_.doStats_) { @@ -190,6 +190,6 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, TkSoA #ifdef DUMP_GPU_TK_TUPLES static std::atomic iev(0); ++iev; - kernel_print_found_ntuplets(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); + kernel_print_found_ntuplets(hh.store(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); #endif -} +} \ No newline at end of file diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu index edc1eb49b..423caf558 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.cu @@ -50,7 +50,7 @@ void 
CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * kernel_connect<<>>( device_hitTuple_apc_, device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), + hh.store(), device_theCells_.get(), device_nCells_, device_theCellNeighbors_.get(), @@ -71,13 +71,13 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); gpuPixelDoublets::fishbone<<>>( - hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); + hh.store(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false); cudaCheck(cudaGetLastError()); } blockSize = 64; numberOfBlocks = (3 * params_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_find_ntuplets<<>>(hh.view(), + kernel_find_ntuplets<<>>(hh.store(), device_theCells_.get(), device_nCells_, device_theCellTracks_.get(), @@ -88,7 +88,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * cudaCheck(cudaGetLastError()); if (params_.doStats_) - kernel_mark_used<<>>(hh.view(), device_theCells_.get(), device_nCells_); + kernel_mark_used<<>>(hh.store(), device_theCells_.get(), device_nCells_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -123,7 +123,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, TkSoA * dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); gpuPixelDoublets::fishbone<<>>( - hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); + hh.store(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true); cudaCheck(cudaGetLastError()); } @@ -205,7 +205,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr device_nCells_, device_theCellNeighbors_.get(), device_theCellTracks_.get(), - hh.view(), + hh.store(), device_isOuterHitOfCell_.get(), nActualPairs, params_.idealConditions_, @@ -275,7 +275,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA // remove duplicates (tracks that share a hit) numberOfBlocks = (hitToTupleView_.offSize + blockSize - 1) / blockSize; kernel_sharedHitCleaner<<>>( - hh.view(), tuples_d, tracks_d, quality_d, params_.minHitsForSharingCut_, device_hitToTuple_.get()); + hh.store(), tuples_d, tracks_d, quality_d, params_.minHitsForSharingCut_, device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); } @@ -314,11 +314,11 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA static std::atomic iev(0); ++iev; kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); + hh.store(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100, iev); #endif } template <> void CAHitNtupletGeneratorKernelsGPU::printCounters(Counters const *counters) { kernel_printCounters<<<1, 1>>>(counters); -} +} \ No newline at end of file diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h index dd87597a4..6f5cdccb0 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernels.h @@ -25,8 +25,8 @@ namespace cAHitNtupletGenerator { unsigned long long nZeroTrackCells; }; - using HitsView = TrackingRecHit2DSOAView; - using HitsOnGPU = TrackingRecHit2DSOAView; + using 
HitsView = TrackingRecHit2DSOAStore; + using HitsOnGPU = TrackingRecHit2DSOAStore; using HitToTuple = caConstants::HitToTuple; using TupleMultiplicity = caConstants::TupleMultiplicity; @@ -157,8 +157,8 @@ class CAHitNtupletGeneratorKernels { template using unique_ptr = typename Traits::template unique_ptr; - using HitsView = TrackingRecHit2DSOAView; - using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsView = TrackingRecHit2DSOAStore; + using HitsOnGPU = TrackingRecHit2DSOAStore; using HitsOnCPU = TrackingRecHit2DHeterogeneous; using HitToTuple = caConstants::HitToTuple; @@ -232,4 +232,4 @@ class CAHitNtupletGeneratorKernels { using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels; -#endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h +#endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h \ No newline at end of file diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h index f14f5d8ea..032cf3a73 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorKernelsImpl.h @@ -20,7 +20,7 @@ #include "gpuFishbone.h" #include "gpuPixelDoublets.h" -using HitsOnGPU = TrackingRecHit2DSOAView; +using HitsOnGPU = TrackingRecHit2DSOAStore; using HitsOnCPU = TrackingRecHit2DCUDA; using HitToTuple = caConstants::HitToTuple; @@ -449,7 +449,7 @@ __global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, } __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, - TrackingRecHit2DSOAView const *__restrict__ hhp, + TrackingRecHit2DSOAStore const *__restrict__ hhp, HitContainer *__restrict__ hitDetIndices) { int first = blockDim.x * blockIdx.x + threadIdx.x; // copy offsets @@ -461,7 +461,7 @@ __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples auto nhits = hh.nHits(); for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { assert(tuples->content[idx] < nhits); - hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]); + hitDetIndices->content[idx] = hh[tuples->content[idx]].detectorIndex(); } } @@ -478,7 +478,7 @@ __global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernelsGPU::Hi } } -__global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp, +__global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAStore const *__restrict__ hhp, HitContainer const *__restrict__ ptuples, TkSoA const *__restrict__ ptracks, Quality *__restrict__ quality, @@ -540,7 +540,7 @@ __global__ void kernel_sharedHitCleaner(TrackingRecHit2DSOAView const *__restric } // loop over hits } -__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp, +__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAStore const *__restrict__ hhp, HitContainer const *__restrict__ ptuples, TkSoA const *__restrict__ ptracks, Quality const *__restrict__ quality, @@ -604,4 +604,4 @@ __global__ void kernel_printCounters(cAHitNtupletGenerator::Counters const *coun c.nKilledCells / double(c.nEvents), c.nEmptyCells / double(c.nCells), c.nZeroTrackCells / double(c.nCells)); -} +} \ No newline at end of file diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc 
b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc index 714748cc1..490184d2b 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.cc @@ -124,14 +124,14 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH kernels.buildDoublets(hits_d, stream); kernels.launchKernels(hits_d, soa, stream); - kernels.fillHitDetIndices(hits_d.view(), soa, stream); // in principle needed only if Hits not "available" + kernels.fillHitDetIndices(hits_d.store(), soa, stream); // in principle needed only if Hits not "available" HelixFitOnGPU fitter(bfield, m_params.fit5as4_); fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { - fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + fitter.launchRiemannKernels(hits_d.store(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); } else { - fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); + fitter.launchBrokenLineKernels(hits_d.store(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets, stream); } kernels.classifyTuples(hits_d, soa, stream); @@ -156,7 +156,7 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC kernels.buildDoublets(hits_d, nullptr); kernels.launchKernels(hits_d, soa, nullptr); - kernels.fillHitDetIndices(hits_d.view(), soa, nullptr); // in principle needed only if Hits not "available" + kernels.fillHitDetIndices(hits_d.store(), soa, nullptr); // in principle needed only if Hits not "available" if (0 == hits_d.nHits()) return tracks; @@ -166,9 +166,9 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); if (m_params.useRiemannFit_) { - fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); + fitter.launchRiemannKernelsOnCPU(hits_d.store(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); } else { - fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); + fitter.launchBrokenLineKernelsOnCPU(hits_d.store(), hits_d.nHits(), caConstants::maxNumberOfQuadruplets); } kernels.classifyTuples(hits_d, soa, nullptr); @@ -178,4 +178,4 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DC #endif return tracks; -} +} \ No newline at end of file diff --git a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h index f42bb301b..978b828a5 100644 --- a/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h +++ b/src/cudadev/plugin-PixelTriplets/CAHitNtupletGeneratorOnGPU.h @@ -19,9 +19,9 @@ namespace edm { class CAHitNtupletGeneratorOnGPU { public: - using HitsOnGPU = TrackingRecHit2DSOAView; + using HitsOnGPU = TrackingRecHit2DSOAStore; using HitsOnCPU = TrackingRecHit2DCUDA; - using hindex_type = TrackingRecHit2DSOAView::hindex_type; + using hindex_type = TrackingRecHit2DSOAStore::hindex_type; using Quality = pixelTrack::Quality; using OutputSoA = pixelTrack::TrackSoA; @@ -53,4 +53,4 @@ class CAHitNtupletGeneratorOnGPU { Counters* m_counters = nullptr; }; -#endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h +#endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h \ No 
newline at end of file diff --git a/src/cudadev/plugin-PixelTriplets/GPUCACell.h b/src/cudadev/plugin-PixelTriplets/GPUCACell.h index 58ef54ae8..84eab621e 100644 --- a/src/cudadev/plugin-PixelTriplets/GPUCACell.h +++ b/src/cudadev/plugin-PixelTriplets/GPUCACell.h @@ -29,7 +29,7 @@ class GPUCACell { using CellNeighborsVector = caConstants::CellNeighborsVector; using CellTracksVector = caConstants::CellTracksVector; - using Hits = TrackingRecHit2DSOAView; + using Hits = TrackingRecHit2DSOAStore; using hindex_type = Hits::hindex_type; using TmpTuple = cms::cuda::VecArray; @@ -54,8 +54,8 @@ class GPUCACell { theUsed_ = 0; // optimization that depends on access pattern - theInnerZ = hh.zGlobal(innerHitId); - theInnerR = hh.rGlobal(innerHitId); + theInnerZ = hh[innerHitId].zGlobal(); + theInnerR = hh[innerHitId].rGlobal(); // link to default empty theOuterNeighbors = &cellNeighbors[0]; @@ -109,22 +109,22 @@ class GPUCACell { __device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; } __device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; } __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; } - __device__ __forceinline__ float inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); } - __device__ __forceinline__ float outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } - __device__ __forceinline__ float inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } - __device__ __forceinline__ float outer_y(Hits const& hh) const { return hh.yGlobal(theOuterHitId); } + __device__ __forceinline__ float inner_x(Hits const& hh) const { return hh[theInnerHitId].xGlobal(); } + __device__ __forceinline__ float outer_x(Hits const& hh) const { return hh[theOuterHitId].xGlobal(); } + __device__ __forceinline__ float inner_y(Hits const& hh) const { return hh[theInnerHitId].yGlobal(); } + __device__ __forceinline__ float outer_y(Hits const& hh) const { return hh[theOuterHitId].yGlobal(); } __device__ __forceinline__ float inner_z(Hits const& hh) const { return theInnerZ; } // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } - __device__ __forceinline__ float outer_z(Hits const& hh) const { return hh.zGlobal(theOuterHitId); } + __device__ __forceinline__ float outer_z(Hits const& hh) const { return hh[theOuterHitId].zGlobal(); } __device__ __forceinline__ float inner_r(Hits const& hh) const { return theInnerR; } // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } - __device__ __forceinline__ float outer_r(Hits const& hh) const { return hh.rGlobal(theOuterHitId); } + __device__ __forceinline__ float outer_r(Hits const& hh) const { return hh[theOuterHitId].rGlobal(); } - __device__ __forceinline__ auto inner_iphi(Hits const& hh) const { return hh.iphi(theInnerHitId); } - __device__ __forceinline__ auto outer_iphi(Hits const& hh) const { return hh.iphi(theOuterHitId); } + __device__ __forceinline__ auto inner_iphi(Hits const& hh) const { return hh[theInnerHitId].iphi(); } + __device__ __forceinline__ auto outer_iphi(Hits const& hh) const { return hh[theOuterHitId].iphi(); } - __device__ __forceinline__ float inner_detIndex(Hits const& hh) const { return hh.detectorIndex(theInnerHitId); } - __device__ __forceinline__ float outer_detIndex(Hits const& hh) const { return hh.detectorIndex(theOuterHitId); } + __device__ __forceinline__ float inner_detIndex(Hits const& hh) const { return hh[theInnerHitId].detectorIndex(); } + __device__ __forceinline__ float 
outer_detIndex(Hits const& hh) const { return hh[theOuterHitId].detectorIndex(); } constexpr unsigned int inner_hit_id() const { return theInnerHitId; } constexpr unsigned int outer_hit_id() const { return theOuterHitId; } diff --git a/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.h b/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.h index fee0f8dae..92f881f24 100644 --- a/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.h +++ b/src/cudadev/plugin-PixelTriplets/HelixFitOnGPU.h @@ -33,7 +33,7 @@ namespace riemannFit { class HelixFitOnGPU { public: - using HitsView = TrackingRecHit2DSOAView; + using HitsView = TrackingRecHit2DSOAStore; using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; @@ -65,4 +65,4 @@ class HelixFitOnGPU { const bool fit5as4_; }; -#endif // RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h +#endif // RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h \ No newline at end of file diff --git a/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.h b/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.h index 12c9856fa..b82cf795f 100644 --- a/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.h +++ b/src/cudadev/plugin-PixelTriplets/RiemannFitOnGPU.h @@ -14,7 +14,7 @@ #include "RiemannFit.h" #include "HelixFitOnGPU.h" -using HitsOnGPU = TrackingRecHit2DSOAView; +using HitsOnGPU = TrackingRecHit2DSOAStore; using Tuples = pixelTrack::HitContainer; using OutputSoA = pixelTrack::TrackSoA; @@ -66,11 +66,11 @@ __global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); float ge[6]; hhp->cpeParams() - .detParams(hhp->detectorIndex(hit)) - .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); + .detParams((*hhp)[hit].detectorIndex()) + .frame.toGlobal((*hhp)[hit].xerrLocal(), 0, (*hhp)[hit].yerrLocal(), ge); // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); - hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); + hits.col(i) << (*hhp)[hit].xGlobal(), (*hhp)[hit].yGlobal(), (*hhp)[hit].zGlobal(); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } riemannFit::fastFit(hits, fast_fit); @@ -184,4 +184,4 @@ __global__ void kernel_LineFit(caConstants::TupleMultiplicity const *__restrict_ line_fit.cov(1, 1)); #endif } -} +} \ No newline at end of file diff --git a/src/cudadev/plugin-PixelTriplets/gpuPixelDoublets.h b/src/cudadev/plugin-PixelTriplets/gpuPixelDoublets.h index cbb374698..4cbbf51c2 100644 --- a/src/cudadev/plugin-PixelTriplets/gpuPixelDoublets.h +++ b/src/cudadev/plugin-PixelTriplets/gpuPixelDoublets.h @@ -99,7 +99,7 @@ namespace gpuPixelDoublets { uint32_t* nCells, CellNeighborsVector* cellNeighbors, CellTracksVector* cellTracks, - TrackingRecHit2DSOAView const* __restrict__ hhp, + TrackingRecHit2DSOAStore const* __restrict__ hhp, GPUCACell::OuterHitOfCell* isOuterHitOfCell, int nActualPairs, bool ideal_cond, @@ -129,4 +129,4 @@ namespace gpuPixelDoublets { } // namespace gpuPixelDoublets -#endif // RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoublets_h +#endif // RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoublets_h \ No newline at end of file diff --git a/src/cudadev/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h b/src/cudadev/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h index 0dcc65203..308ce1f98 100644 --- a/src/cudadev/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h +++ b/src/cudadev/plugin-PixelTriplets/gpuPixelDoubletsAlgos.h @@ -28,7 +28,7 @@ 
namespace gpuPixelDoublets { uint32_t* nCells, CellNeighborsVector* cellNeighbors, CellTracksVector* cellTracks, - TrackingRecHit2DSOAView const& __restrict__ hh, + TrackingRecHit2DSOAStore const& __restrict__ hh, GPUCACell::OuterHitOfCell* isOuterHitOfCell, int16_t const* __restrict__ phicuts, float const* __restrict__ minz, @@ -50,7 +50,7 @@ namespace gpuPixelDoublets { bool isOuterLadder = ideal_cond; - using PhiBinner = TrackingRecHit2DSOAView::PhiBinner; + using PhiBinner = TrackingRecHit2DSOAStore::PhiBinner; auto const& __restrict__ phiBinner = hh.phiBinner(); uint32_t const* __restrict__ offsets = hh.hitsLayerStart(); @@ -104,7 +104,7 @@ namespace gpuPixelDoublets { assert(i < offsets[inner + 1]); // found hit corresponding to our cuda thread, now do the job - auto mi = hh.detectorIndex(i); + auto mi = hh[i].detectorIndex(); if (mi > gpuClustering::maxNumModules) continue; // invalid @@ -114,7 +114,7 @@ namespace gpuPixelDoublets { if ( ((inner<3) & (outer>3)) && bpos!=fpos) continue; */ - auto mez = hh.zGlobal(i); + auto mez = hh[i].zGlobal(); if (mez < minz[pairLayerId] || mez > maxz[pairLayerId]) continue; @@ -127,7 +127,7 @@ namespace gpuPixelDoublets { isOuterLadder = ideal_cond ? true : 0 == (mi / 8) % 2; // only for B1/B2/B3 B4 is opposite, FPIX:noclue... // in any case we always test mes>0 ... - mes = inner > 0 || isOuterLadder ? hh.clusterSizeY(i) : -1; + mes = inner > 0 || isOuterLadder ? hh[i].clusterSizeY() : -1; if (inner == 0 && outer > 3) // B1 and F1 if (mes > 0 && mes < minYsizeB1) @@ -136,8 +136,8 @@ namespace gpuPixelDoublets { if (mes > 0 && mes < minYsizeB2) continue; } - auto mep = hh.iphi(i); - auto mer = hh.rGlobal(i); + auto mep = hh[i].iphi(); + auto mer = hh[i].rGlobal(); // all cuts: true if fails constexpr float z0cut = 12.f; // cm @@ -148,26 +148,26 @@ namespace gpuPixelDoublets { auto ptcut = [&](int j, int16_t idphi) { auto r2t4 = minRadius2T4; auto ri = mer; - auto ro = hh.rGlobal(j); + auto ro = hh[j].rGlobal(); auto dphi = short2phi(idphi); return dphi * dphi * (r2t4 - ri * ro) > (ro - ri) * (ro - ri); }; auto z0cutoff = [&](int j) { - auto zo = hh.zGlobal(j); - auto ro = hh.rGlobal(j); + auto zo = hh[j].zGlobal(); + auto ro = hh[j].rGlobal(); auto dr = ro - mer; return dr > maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr; }; auto zsizeCut = [&](int j) { auto onlyBarrel = outer < 4; - auto so = hh.clusterSizeY(j); + auto so = hh[j].clusterSizeY(); auto dy = inner == 0 ? maxDYsize12 : maxDYsize; // in the barrel cut on difference in size // in the endcap on the prediction on the first layer (actually in the barrel only: happen to be safe for endcap as well) // FIXME move pred cut to z0cutoff to optmize loading of and computaiton ... - auto zo = hh.zGlobal(j); - auto ro = hh.rGlobal(j); + auto zo = hh[j].zGlobal(); + auto ro = hh[j].rGlobal(); return onlyBarrel ? 
mes > 0 && so > 0 && std::abs(so - mes) > dy : (inner < 4) && mes > 0 && std::abs(mes - int(std::abs((mez - zo) / (mer - ro)) * dzdrFact + 0.5f)) > maxDYPred; @@ -199,14 +199,14 @@ namespace gpuPixelDoublets { auto oi = __ldg(p); assert(oi >= offsets[outer]); assert(oi < offsets[outer + 1]); - auto mo = hh.detectorIndex(oi); + auto mo = hh[oi].detectorIndex(); if (mo > gpuClustering::maxNumModules) continue; // invalid if (doZ0Cut && z0cutoff(oi)) continue; - auto mop = hh.iphi(oi); + auto mop = hh[oi].iphi(); uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop))); if (idphi > iphicut) continue; @@ -240,4 +240,4 @@ namespace gpuPixelDoublets { } // namespace gpuPixelDoublets -#endif // RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h +#endif // RecoPixelVertexing_PixelTriplets_plugins_gpuPixelDoubletsAlgos_h \ No newline at end of file diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc index a5229b295..083709768 100644 --- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc +++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc @@ -82,7 +82,7 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, ") differs the one from SiPixelROCsStatusAndMappingWrapper. Please fix your configuration."); } // get the GPU product already here so that the async transfer can begin - const auto* gpuMap = hgpuMap.getGPUProductAsync(ctx.stream()); + auto gpuMap = hgpuMap.getGPUProductAsync(ctx.stream()); const unsigned char* gpuModulesToUnpack = hgpuMap.getModToUnpAllAsync(ctx.stream()); auto const& hgains = iSetup.get(); diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu index aaa72c5e0..331874b4f 100644 --- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu +++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.cu @@ -54,18 +54,12 @@ namespace pixelgpudetails { //////////////////// - __device__ bool isBarrel(uint32_t rawId) { - return (PixelSubdetector::PixelBarrel == ((rawId >> DetId::kSubdetOffset) & DetId::kSubdetMask)); + __device__ uint32_t cablingIndex(uint8_t fed, uint32_t link, uint32_t roc) { + return fed * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + roc; } - __device__ pixelgpudetails::DetIdGPU getRawId(const SiPixelROCsStatusAndMapping *cablingMap, - uint8_t fed, - uint32_t link, - uint32_t roc) { - uint32_t index = fed * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + roc; - pixelgpudetails::DetIdGPU detId = { - cablingMap->rawId[index], cablingMap->rocInDet[index], cablingMap->moduleId[index]}; - return detId; + __device__ bool isBarrel(uint32_t rawId) { + return (PixelSubdetector::PixelBarrel == ((rawId >> DetId::kSubdetOffset) & DetId::kSubdetMask)); } //reference http://cmsdoxygen.web.cern.ch/cmsdoxygen/CMSSW_9_2_0/doc/html/dd/d31/FrameConversion_8cc_source.html @@ -187,7 +181,7 @@ namespace pixelgpudetails { __device__ uint8_t checkROC(uint32_t errorWord, uint8_t fedId, uint32_t link, - const SiPixelROCsStatusAndMapping *cablingMap, + SiPixelROCsStatusAndMappingConstView &cablingMap, bool debug = false) { uint8_t errorType = (errorWord >> sipixelconstants::ROC_shift) & sipixelconstants::ERROR_mask; if (errorType < 25) @@ -197,9 +191,9 @@ namespace pixelgpudetails { switch (errorType) { case (25): { errorFound = true; - uint32_t index = fedId * MAX_LINK * MAX_ROC + (link - 1) 
* MAX_ROC + 1; - if (index > 1 && index <= cablingMap->size) { - if (!(link == cablingMap->link[index] && 1 == cablingMap->roc[index])) + auto index = cablingIndex(fedId, link, 1); + if (index > 1 && index <= cablingMap.size()) { + if (!(link == cablingMap[index].link() && 1 == cablingMap[index].roc())) errorFound = false; } if (debug and errorFound) @@ -267,7 +261,7 @@ namespace pixelgpudetails { __device__ uint32_t getErrRawID(uint8_t fedId, uint32_t errWord, uint32_t errorType, - const SiPixelROCsStatusAndMapping *cablingMap, + SiPixelROCsStatusAndMappingConstView &cablingMap, bool debug = false) { uint32_t rID = 0xffffffff; @@ -279,7 +273,7 @@ namespace pixelgpudetails { case 40: { uint32_t roc = 1; uint32_t link = sipixelconstants::getLink(errWord); - uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId; + uint32_t rID_temp = cablingMap[cablingIndex(fedId, link, roc)].rawId(); if (rID_temp != gpuClustering::invalidModuleId) rID = rID_temp; break; @@ -312,7 +306,7 @@ namespace pixelgpudetails { uint32_t roc = 1; uint32_t link = chanNmbr; - uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId; + uint32_t rID_temp = cablingMap[cablingIndex(fedId, link, roc)].rawId(); if (rID_temp != gpuClustering::invalidModuleId) rID = rID_temp; break; @@ -321,7 +315,7 @@ namespace pixelgpudetails { case 38: { uint32_t roc = sipixelconstants::getROC(errWord); uint32_t link = sipixelconstants::getLink(errWord); - uint32_t rID_temp = getRawId(cablingMap, fedId, link, roc).rawId; + uint32_t rID_temp = cablingMap[cablingIndex(fedId, link, roc)].rawId(); if (rID_temp != gpuClustering::invalidModuleId) rID = rID_temp; break; @@ -334,7 +328,7 @@ namespace pixelgpudetails { } // Kernel to perform Raw to Digi conversion - __global__ void RawToDigi_kernel(const SiPixelROCsStatusAndMapping *cablingMap, + __global__ void RawToDigi_kernel(SiPixelROCsStatusAndMappingConstView cablingMap, const unsigned char *modToUnp, const uint32_t wordCounter, const uint32_t *word, @@ -374,7 +368,8 @@ namespace pixelgpudetails { uint32_t link = sipixelconstants::getLink(ww); // Extract link uint32_t roc = sipixelconstants::getROC(ww); // Extract Roc in link - pixelgpudetails::DetIdGPU detId = getRawId(cablingMap, fedId, link, roc); + auto index = cablingIndex(fedId, link, roc); + auto detId = cablingMap[index]; uint8_t errorType = checkROC(ww, fedId, link, cablingMap, debug); skipROC = (roc < pixelgpudetails::maxROCIndex) ? 
false : (errorType != 0); @@ -384,13 +379,12 @@ namespace pixelgpudetails { continue; } - uint32_t rawId = detId.rawId; - uint32_t rocIdInDetUnit = detId.rocInDet; + auto rawId = detId.rawId(); + auto rocIdInDetUnit = detId.rocInDet(); bool barrel = isBarrel(rawId); - uint32_t index = fedId * MAX_LINK * MAX_ROC + (link - 1) * MAX_ROC + roc; if (useQualityInfo) { - skipROC = cablingMap->badRocs[index]; + skipROC = cablingMap[index].badRocs(); if (skipROC) continue; } @@ -450,7 +444,7 @@ namespace pixelgpudetails { yy[gIndex] = globalPix.col; // origin shifting by 1 0-415 adc[gIndex] = sipixelconstants::getADC(ww); pdigi[gIndex] = pixelgpudetails::pack(globalPix.row, globalPix.col, adc[gIndex]); - moduleId[gIndex] = detId.moduleId; + moduleId[gIndex] = detId.moduleId(); rawIdArr[gIndex] = rawId; } // end of loop (gIndex < end) @@ -499,7 +493,7 @@ namespace pixelgpudetails { // Interface to outside void SiPixelRawToClusterGPUKernel::makeClustersAsync(bool isRun2, const SiPixelClusterThresholds clusterThresholds, - const SiPixelROCsStatusAndMapping *cablingMap, + SiPixelROCsStatusAndMappingConstView &cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, const WordFedAppender &wordFed, diff --git a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h index 04e8b99b9..c3ff57103 100644 --- a/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h +++ b/src/cudadev/plugin-SiPixelClusterizer/SiPixelRawToClusterGPUKernel.h @@ -16,7 +16,7 @@ // local include(s) #include "SiPixelClusterThresholds.h" -struct SiPixelROCsStatusAndMapping; +struct SiPixelROCsStatusAndMappingConstView; class SiPixelGainForHLTonGPU; namespace pixelgpudetails { @@ -44,12 +44,6 @@ namespace pixelgpudetails { const uint32_t MAX_WORD = 2000; - struct DetIdGPU { - uint32_t rawId; - uint32_t rocInDet; - uint32_t moduleId; - }; - struct Pixel { uint32_t row; uint32_t col; @@ -141,7 +135,7 @@ namespace pixelgpudetails { void makeClustersAsync(bool isRun2, const SiPixelClusterThresholds clusterThresholds, - const SiPixelROCsStatusAndMapping* cablingMap, + SiPixelROCsStatusAndMappingConstView& cablingMap, const unsigned char* modToUnp, const SiPixelGainForHLTonGPU* gains, const WordFedAppender& wordFed, diff --git a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc index 448f4b797..bf701d94e 100644 --- a/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc +++ b/src/cudadev/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc @@ -22,10 +22,7 @@ class SiPixelDigisSoAFromCUDA : public edm::EDProducerExternalWork { edm::EDGetTokenT> digiGetToken_; edm::EDPutTokenT digiPutToken_; - cms::cuda::host::unique_ptr pdigi_; - cms::cuda::host::unique_ptr rawIdArr_; - cms::cuda::host::unique_ptr adc_; - cms::cuda::host::unique_ptr clus_; + SiPixelDigisCUDA::HostStore digis_; size_t nDigis_; }; @@ -43,10 +40,7 @@ void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); nDigis_ = gpuDigis.nDigis(); - pdigi_ = gpuDigis.pdigiToHostAsync(ctx.stream()); - rawIdArr_ = gpuDigis.rawIdArrToHostAsync(ctx.stream()); - adc_ = gpuDigis.adcToHostAsync(ctx.stream()); - clus_ = gpuDigis.clusToHostAsync(ctx.stream()); + digis_ = gpuDigis.dataToHostAsync(ctx.stream()); } void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { @@ -60,12 +54,10 @@ void 
SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& // host memory to be allocated without a CUDA stream // - What if a CPU algorithm would produce the same SoA? We can't // use cudaMallocHost without a GPU... - iEvent.emplace(digiPutToken_, nDigis_, pdigi_.get(), rawIdArr_.get(), adc_.get(), clus_.get()); + auto dv = digis_.view(); + iEvent.emplace(digiPutToken_, nDigis_, dv.pdigi(), dv.rawIdArr(), dv.adc(), dv.clus()); - pdigi_.reset(); - rawIdArr_.reset(); - adc_.reset(); - clus_.reset(); + digis_.reset(); } // define as framework plugin diff --git a/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.cu b/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.cu index ba62da1b5..d14d57a5e 100644 --- a/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.cu +++ b/src/cudadev/plugin-SiPixelRecHits/PixelRecHitGPUKernel.cu @@ -50,7 +50,7 @@ namespace pixelgpudetails { // protect from empty events if (blocks) { gpuPixelRecHits::getHits<<>>( - cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.view(), hits_d.view()); + cpeParams, bs_d.data(), digis_d.pixelConstView(), digis_d.nDigis(), clusters_d.view(), hits_d.store()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -74,4 +74,4 @@ namespace pixelgpudetails { return hits_d; } -} // namespace pixelgpudetails +} // namespace pixelgpudetails \ No newline at end of file diff --git a/src/cudadev/plugin-SiPixelRecHits/gpuPixelRecHits.h b/src/cudadev/plugin-SiPixelRecHits/gpuPixelRecHits.h index adddc8b83..9d4bf37f2 100644 --- a/src/cudadev/plugin-SiPixelRecHits/gpuPixelRecHits.h +++ b/src/cudadev/plugin-SiPixelRecHits/gpuPixelRecHits.h @@ -16,10 +16,10 @@ namespace gpuPixelRecHits { __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const* __restrict__ cpeParams, BeamSpotPOD const* __restrict__ bs, - SiPixelDigisCUDA::DeviceConstView const* __restrict__ pdigis, + SiPixelDigisCUDA::DevicePixelConstView digis, int numElements, - SiPixelClustersCUDA::DeviceConstView const* __restrict__ pclusters, - TrackingRecHit2DSOAView* phits) { + SiPixelClustersCUDA::DeviceConstView clusters, + TrackingRecHit2DSOAStore* phits) { // FIXME // the compiler seems NOT to optimize loads from views (even in a simple test case) // The whole gimnastic here of copying or not is a pure heuristic exercise that seems to produce the fastest code with the above signature @@ -30,14 +30,11 @@ namespace gpuPixelRecHits { auto& hits = *phits; - auto const digis = *pdigis; // the copy is intentional! - auto const& clusters = *pclusters; - // copy average geometry corrected by beamspot . FIXME (move it somewhere else???) 
if (0 == blockIdx.x) { auto& agc = hits.averageGeometry(); auto const& ag = cpeParams->averageGeometry(); - for (int il = threadIdx.x, nl = TrackingRecHit2DSOAView::AverageGeometry::numberOfLaddersInBarrel; il < nl; + for (int il = threadIdx.x, nl = TrackingRecHit2DSOAStore::AverageGeometry::numberOfLaddersInBarrel; il < nl; il += blockDim.x) { agc.ladderZ[il] = ag.ladderZ[il] - bs->z; agc.ladderX[il] = ag.ladderX[il] - bs->x; @@ -62,8 +59,8 @@ namespace gpuPixelRecHits { // as usual one block per module __shared__ ClusParams clusParams; - auto me = clusters.moduleId(blockIdx.x); - int nclus = clusters.clusInModule(me); + auto me = clusters[blockIdx.x].moduleId(); + int nclus = clusters[me].clusInModule(); if (0 == nclus) return; @@ -108,21 +105,21 @@ namespace gpuPixelRecHits { __syncthreads(); // one thread per "digi" - auto first = clusters.moduleStart(1 + blockIdx.x) + threadIdx.x; + auto first = clusters[1 + blockIdx.x].moduleStart() + threadIdx.x; for (int i = first; i < numElements; i += blockDim.x) { - auto id = digis.moduleInd(i); + auto id = digis[i].moduleInd(); if (id == invalidModuleId) continue; // not valid if (id != me) break; // end of module - auto cl = digis.clus(i); + auto cl = digis[i].clus(); if (cl < startClus || cl >= lastClus) continue; cl -= startClus; assert(cl >= 0); assert(cl < MaxHitsInIter); - auto x = digis.xx(i); - auto y = digis.yy(i); + auto x = digis[i].xx(); + auto y = digis[i].yy(); atomicMin(&clusParams.minRow[cl], x); atomicMax(&clusParams.maxRow[cl], x); atomicMin(&clusParams.minCol[cl], y); @@ -135,20 +132,20 @@ namespace gpuPixelRecHits { //auto pixmx = cpeParams->detParams(me).pixmx; auto pixmx = std::numeric_limits::max(); for (int i = first; i < numElements; i += blockDim.x) { - auto id = digis.moduleInd(i); + auto id = digis[i].moduleInd(); if (id == invalidModuleId) continue; // not valid if (id != me) break; // end of module - auto cl = digis.clus(i); + auto cl = digis[i].clus(); if (cl < startClus || cl >= lastClus) continue; cl -= startClus; assert(cl >= 0); assert(cl < MaxHitsInIter); - auto x = digis.xx(i); - auto y = digis.yy(i); - auto ch = std::min(digis.adc(i), pixmx); + auto x = digis[i].xx(); + auto y = digis[i].yy(); + auto ch = std::min(digis[i].adc(), pixmx); atomicAdd(&clusParams.charge[cl], ch); if (clusParams.minRow[cl] == x) atomicAdd(&clusParams.q_f_X[cl], ch); @@ -164,29 +161,29 @@ namespace gpuPixelRecHits { // next one cluster per thread... 
- first = clusters.clusModuleStart(me) + startClus; + first = clusters[me].clusModuleStart() + startClus; for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) { auto h = first + ic; // output index in global memory assert(h < hits.nHits()); - assert(h < clusters.clusModuleStart(me + 1)); + assert(h < clusters[me + 1].clusModuleStart()); pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); // store it - hits.charge(h) = clusParams.charge[ic]; - hits.detectorIndex(h) = me; + hits[h].charge = clusParams.charge[ic]; + hits[h].detectorIndex = me; float xl, yl; - hits.xLocal(h) = xl = clusParams.xpos[ic]; - hits.yLocal(h) = yl = clusParams.ypos[ic]; + hits[h].xLocal = xl = clusParams.xpos[ic]; + hits[h].yLocal = yl = clusParams.ypos[ic]; - hits.clusterSizeX(h) = clusParams.xsize[ic]; - hits.clusterSizeY(h) = clusParams.ysize[ic]; + hits[h].clusterSizeX = clusParams.xsize[ic]; + hits[h].clusterSizeY = clusParams.ysize[ic]; - hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic]; - hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic]; + hits[h].xerrLocal = clusParams.xerr[ic] * clusParams.xerr[ic]; + hits[h].yerrLocal = clusParams.yerr[ic] * clusParams.yerr[ic]; // keep it local for computations float xg, yg, zg; @@ -197,12 +194,12 @@ namespace gpuPixelRecHits { yg -= bs->y; zg -= bs->z; - hits.xGlobal(h) = xg; - hits.yGlobal(h) = yg; - hits.zGlobal(h) = zg; + hits[h].xGlobal = xg; + hits[h].yGlobal = yg; + hits[h].zGlobal = zg; - hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg); - hits.iphi(h) = unsafe_atan2s<7>(yg, xg); + hits[h].rGlobal = std::sqrt(xg * xg + yg * yg); + hits[h].iphi = unsafe_atan2s<7>(yg, xg); } __syncthreads(); } // end loop on batches @@ -210,4 +207,4 @@ namespace gpuPixelRecHits { } // namespace gpuPixelRecHits -#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h +#endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelRecHits_h \ No newline at end of file diff --git a/src/cudadev/plugin-Validation/HistoValidator.cc b/src/cudadev/plugin-Validation/HistoValidator.cc index 8a888666b..2ab0c0122 100644 --- a/src/cudadev/plugin-Validation/HistoValidator.cc +++ b/src/cudadev/plugin-Validation/HistoValidator.cc @@ -38,10 +38,7 @@ class HistoValidator : public edm::EDProducerExternalWork { uint32_t nHits; cms::cuda::host::unique_ptr h_adc; cms::cuda::host::unique_ptr h_clusInModule; - cms::cuda::host::unique_ptr h_localCoord; - cms::cuda::host::unique_ptr h_globalCoord; - cms::cuda::host::unique_ptr h_charge; - cms::cuda::host::unique_ptr h_size; + TrackingRecHit2DHostSOAStore h_hits; static std::map histos; }; @@ -107,10 +104,7 @@ void HistoValidator::acquire(const edm::Event& iEvent, h_clusInModule.get(), clusters.clusInModule(), sizeof(uint32_t) * nModules, cudaMemcpyDefault, ctx.stream())); nHits = hits.nHits(); - h_localCoord = hits.localCoordToHostAsync(ctx.stream()); - h_globalCoord = hits.globalCoordToHostAsync(ctx.stream()); - h_charge = hits.chargeToHostAsync(ctx.stream()); - h_size = hits.sizeToHostAsync(ctx.stream()); + h_hits = hits.hitsToHostAsync(ctx.stream()); } void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { @@ -129,23 +123,19 @@ void HistoValidator::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) histos["hit_n"].fill(nHits); for (uint32_t i = 0; i < nHits; ++i) { - histos["hit_lx"].fill(h_localCoord[i]); - 
histos["hit_ly"].fill(h_localCoord[i + nHits]); - histos["hit_lex"].fill(h_localCoord[i + 2 * nHits]); - histos["hit_ley"].fill(h_localCoord[i + 3 * nHits]); - histos["hit_gx"].fill(h_globalCoord[i]); - histos["hit_gy"].fill(h_globalCoord[i + nHits]); - histos["hit_gz"].fill(h_globalCoord[i + 2 * nHits]); - histos["hit_gr"].fill(h_globalCoord[i + 3 * nHits]); - histos["hit_charge"].fill(h_charge[i]); - histos["hit_sizex"].fill(h_size[i]); - histos["hit_sizey"].fill(h_size[i + nHits]); + histos["hit_lx"].fill(h_hits[i].xLocal()); + histos["hit_ly"].fill(h_hits[i].yLocal()); + histos["hit_lex"].fill(h_hits[i].xerrLocal()); + histos["hit_ley"].fill(h_hits[i].yerrLocal()); + histos["hit_gx"].fill(h_hits[i].xGlobal()); + histos["hit_gy"].fill(h_hits[i].yGlobal()); + histos["hit_gz"].fill(h_hits[i].zGlobal()); + histos["hit_gr"].fill(h_hits[i].rGlobal()); + histos["hit_charge"].fill(h_hits[i].charge()); + histos["hit_sizex"].fill(h_hits[i].clusterSizeX()); + histos["hit_sizey"].fill(h_hits[i].clusterSizeY()); } - h_localCoord.reset(); - h_globalCoord.reset(); - h_charge.reset(); - h_size.reset(); - + h_hits.reset(); { auto const& tracks = iEvent.get(trackToken_); @@ -189,4 +179,4 @@ void HistoValidator::endJob() { } } -DEFINE_FWK_MODULE(HistoValidator); +DEFINE_FWK_MODULE(HistoValidator); \ No newline at end of file diff --git a/src/cudadev/test/SoALayoutAndView_t.cu b/src/cudadev/test/SoALayoutAndView_t.cu new file mode 100644 index 000000000..edc848cfa --- /dev/null +++ b/src/cudadev/test/SoALayoutAndView_t.cu @@ -0,0 +1,274 @@ +#include +#include "DataFormats/SoALayout.h" +#include "DataFormats/SoAView.h" +#include "CUDACore/cudaCheck.h" +#include +#include +#include + +// Test SoA stores and view. +// Use cases +// Multiple stores in a buffer +// Scalars, Columns of scalars and of Eigen vectors +// View to each of them, from one and multiple stores. + +GENERATE_SOA_LAYOUT_AND_VIEW(SoAHostDeviceLayoutTemplate, + SoAHostDeviceViewTemplate, + // predefined static scalars + // size_t size; + // size_t alignment; + + // columns: one value per element + SOA_COLUMN(double, x), + SOA_COLUMN(double, y), + SOA_COLUMN(double, z), + SOA_EIGEN_COLUMN(Eigen::Vector3d, a), + SOA_EIGEN_COLUMN(Eigen::Vector3d, b), + SOA_EIGEN_COLUMN(Eigen::Vector3d, r), + // scalars: one value for the whole structure + SOA_SCALAR(const char*, description), + SOA_SCALAR(uint32_t, someNumber)) + +using SoAHostDeviceLayout = SoAHostDeviceLayoutTemplate<>; +using SoAHostDeviceView = + SoAHostDeviceViewTemplate; + +GENERATE_SOA_LAYOUT_AND_VIEW(SoADeviceOnlyLayoutTemplate, + SoADeviceOnlyViewTemplate, + SOA_COLUMN(uint16_t, color), + SOA_COLUMN(double, value), + SOA_COLUMN(double*, py), + SOA_COLUMN(uint32_t, count), + SOA_COLUMN(uint32_t, anotherCount)) + +using SoADeviceOnlyLayout = SoADeviceOnlyLayoutTemplate<>; +using SoADeviceOnlyView = + SoADeviceOnlyViewTemplate; + +// A 1 to 1 view of the store (except for unsupported types). 
+GENERATE_SOA_VIEW(SoAFullDeviceViewTemplate, + SOA_VIEW_LAYOUT_LIST(SOA_VIEW_LAYOUT(SoAHostDeviceLayout, soaHD), + SOA_VIEW_LAYOUT(SoADeviceOnlyLayout, soaDO)), + SOA_VIEW_LAYOUT_LIST(SOA_VIEW_VALUE(soaHD, x), + SOA_VIEW_VALUE(soaHD, y), + SOA_VIEW_VALUE(soaHD, z), + SOA_VIEW_VALUE(soaDO, color), + SOA_VIEW_VALUE(soaDO, value), + SOA_VIEW_VALUE(soaDO, py), + SOA_VIEW_VALUE(soaDO, count), + SOA_VIEW_VALUE(soaDO, anotherCount), + SOA_VIEW_VALUE(soaHD, description), + SOA_VIEW_VALUE(soaHD, someNumber))) + +using SoAFullDeviceView = + SoAFullDeviceViewTemplate; + +// Eigen cross product kernel (on store) +__global__ void crossProduct(SoAHostDeviceView soa, const unsigned int numElements) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i>=numElements) return; + auto si = soa[i]; + si.r() = si.a().cross(si.b()); +} + +// Device-only producer kernel +__global__ void producerKernel(SoAFullDeviceView soa, const unsigned int numElements) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i>=numElements) return; + auto si = soa[i]; + si.color() &= 0x55 << i % (sizeof(si.color()) - sizeof(char)); + si.value() = sqrt(si.x() * si.x() + si.y() * si.y() + si.z() * si.z()); +} + +// Device-only consumer with result in host-device area +__global__ void consumerKernel(SoAFullDeviceView soa, const unsigned int numElements) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i>=numElements) return; + auto si = soa[i]; + si.x() = si.color() * si.value(); +} + +// Get a view like the default, except for range checking +using RangeCheckingHostDeviceView = SoAHostDeviceViewTemplate; + +// We expect to just run one thread. +__global__ void rangeCheckKernel(RangeCheckingHostDeviceView soa) { +#if defined(__CUDACC__) && defined(__CUDA_ARCH__) + printf("About to fail range check in CUDA thread: %d\n", threadIdx.x); +#endif + [[maybe_unused]] auto si = soa[soa.soaMetadata().size()]; + printf("We should not have reached here\n"); +} + +int main(void) { + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + // Non-aligned number of elements to check alignment features. + constexpr unsigned int numElements = 65537; + + // Allocate buffer and store on host + size_t hostDeviceSize = SoAHostDeviceLayout::computeDataSize(numElements); + std::byte * h_buf = nullptr; + cudaCheck(cudaMallocHost(&h_buf, hostDeviceSize)); + SoAHostDeviceLayout h_soahdLayout(h_buf, numElements); + SoAHostDeviceView h_soahd(h_soahdLayout); + + // Alocate buffer, stores and views on the device (single, shared buffer). 
+ size_t deviceOnlySize = SoADeviceOnlyLayout::computeDataSize(numElements); + std::byte * d_buf = nullptr; + cudaCheck(cudaMallocHost(&d_buf, hostDeviceSize + deviceOnlySize)); + SoAHostDeviceLayout d_soahdLayout(d_buf, numElements); + SoADeviceOnlyLayout d_soadoLayout(d_soahdLayout.soaMetadata().nextByte(), numElements); + SoAHostDeviceView d_soahdView(d_soahdLayout); + SoAFullDeviceView d_soaFullView(d_soahdLayout, d_soadoLayout); + + // Assert column alignments + assert(0 == reinterpret_cast(h_soahd.soaMetadata().addressOf_x()) % decltype(h_soahd)::byteAlignment); + assert(0 == reinterpret_cast(h_soahd.soaMetadata().addressOf_y()) % decltype(h_soahd)::byteAlignment); + assert(0 == reinterpret_cast(h_soahd.soaMetadata().addressOf_z()) % decltype(h_soahd)::byteAlignment); + assert(0 == reinterpret_cast(h_soahd.soaMetadata().addressOf_a()) % decltype(h_soahd)::byteAlignment); + assert(0 == reinterpret_cast(h_soahd.soaMetadata().addressOf_b()) % decltype(h_soahd)::byteAlignment); + assert(0 == reinterpret_cast(h_soahd.soaMetadata().addressOf_r()) % decltype(h_soahd)::byteAlignment); + assert(0 == + reinterpret_cast(h_soahd.soaMetadata().addressOf_description()) % decltype(h_soahd)::byteAlignment); + assert(0 == + reinterpret_cast(h_soahd.soaMetadata().addressOf_someNumber()) % decltype(h_soahd)::byteAlignment); + + assert(0 == reinterpret_cast(d_soahdLayout.soaMetadata().addressOf_x()) % + decltype(d_soahdLayout)::byteAlignment); + assert(0 == reinterpret_cast(d_soahdLayout.soaMetadata().addressOf_y()) % + decltype(d_soahdLayout)::byteAlignment); + assert(0 == reinterpret_cast(d_soahdLayout.soaMetadata().addressOf_z()) % + decltype(d_soahdLayout)::byteAlignment); + assert(0 == reinterpret_cast(d_soahdLayout.soaMetadata().addressOf_a()) % + decltype(d_soahdLayout)::byteAlignment); + assert(0 == reinterpret_cast(d_soahdLayout.soaMetadata().addressOf_b()) % + decltype(d_soahdLayout)::byteAlignment); + assert(0 == reinterpret_cast(d_soahdLayout.soaMetadata().addressOf_r()) % + decltype(d_soahdLayout)::byteAlignment); + assert(0 == reinterpret_cast(d_soahdLayout.soaMetadata().addressOf_description()) % + decltype(d_soahdLayout)::byteAlignment); + assert(0 == reinterpret_cast(d_soahdLayout.soaMetadata().addressOf_someNumber()) % + decltype(d_soahdLayout)::byteAlignment); + + assert(0 == reinterpret_cast(d_soadoLayout.soaMetadata().addressOf_color()) % + decltype(d_soadoLayout)::byteAlignment); + assert(0 == reinterpret_cast(d_soadoLayout.soaMetadata().addressOf_value()) % + decltype(d_soadoLayout)::byteAlignment); + assert(0 == reinterpret_cast(d_soadoLayout.soaMetadata().addressOf_py()) % + decltype(d_soadoLayout)::byteAlignment); + assert(0 == reinterpret_cast(d_soadoLayout.soaMetadata().addressOf_count()) % + decltype(d_soadoLayout)::byteAlignment); + assert(0 == reinterpret_cast(d_soadoLayout.soaMetadata().addressOf_anotherCount()) % + decltype(d_soadoLayout)::byteAlignment); + + // Views should get the same alignment as the stores they refer to + assert(0 == reinterpret_cast(d_soaFullView.soaMetadata().addressOf_x()) % + decltype(d_soaFullView)::byteAlignment); + assert(0 == reinterpret_cast(d_soaFullView.soaMetadata().addressOf_y()) % + decltype(d_soaFullView)::byteAlignment); + assert(0 == reinterpret_cast(d_soaFullView.soaMetadata().addressOf_z()) % + decltype(d_soaFullView)::byteAlignment); + // Limitation of views: we have to get scalar member addresses via metadata. 
+ assert(0 == reinterpret_cast<uintptr_t>(d_soaFullView.soaMetadata().addressOf_description()) %
+ decltype(d_soaFullView)::byteAlignment);
+ assert(0 == reinterpret_cast<uintptr_t>(d_soaFullView.soaMetadata().addressOf_someNumber()) %
+ decltype(d_soaFullView)::byteAlignment);
+ assert(0 == reinterpret_cast<uintptr_t>(d_soaFullView.soaMetadata().addressOf_color()) %
+ decltype(d_soaFullView)::byteAlignment);
+ assert(0 == reinterpret_cast<uintptr_t>(d_soaFullView.soaMetadata().addressOf_value()) %
+ decltype(d_soaFullView)::byteAlignment);
+ assert(0 == reinterpret_cast<uintptr_t>(d_soaFullView.soaMetadata().addressOf_py()) %
+ decltype(d_soaFullView)::byteAlignment);
+ assert(0 == reinterpret_cast<uintptr_t>(d_soaFullView.soaMetadata().addressOf_count()) %
+ decltype(d_soaFullView)::byteAlignment);
+ assert(0 == reinterpret_cast<uintptr_t>(d_soaFullView.soaMetadata().addressOf_anotherCount()) %
+ decltype(d_soaFullView)::byteAlignment);
+
+ // Initialize and fill the host buffer
+ std::memset(h_soahdLayout.soaMetadata().data(), 0, hostDeviceSize);
+ for (size_t i = 0; i < numElements; ++i) {
+ auto si = h_soahd[i];
+ si.x() = si.a()(0) = si.b()(2) = 1.0 * i + 1.0;
+ si.y() = si.a()(1) = si.b()(1) = 2.0 * i;
+ si.z() = si.a()(2) = si.b()(0) = 3.0 * i - 1.0;
+ }
+ auto& sn = h_soahd.someNumber();
+ sn = numElements + 2;
+
+ // Push to device
+ cudaCheck(cudaMemcpyAsync(d_buf, h_buf, hostDeviceSize, cudaMemcpyDefault, stream));
+
+ // Process on device
+ crossProduct<<<(numElements + 255) / 256, 256, 0, stream>>>(d_soahdView, numElements);
+
+ // Paint the device-only area with 0xFF initially
+ cudaCheck(cudaMemsetAsync(d_soadoLayout.soaMetadata().data(), 0xFF, d_soadoLayout.soaMetadata().byteSize(), stream));
+
+ // Produce to the device-only area
+ producerKernel<<<(numElements + 255) / 256, 256, 0, stream>>>(d_soaFullView, numElements);
+
+ // Consume the device-only area and generate a result in the host-device area
+ consumerKernel<<<(numElements + 255) / 256, 256, 0, stream>>>(d_soaFullView, numElements);
+
+ // Get result back
+ cudaCheck(cudaMemcpyAsync(h_buf, d_buf, hostDeviceSize, cudaMemcpyDefault, stream));
+
+ // Wait and validate.
+ cudaCheck(cudaStreamSynchronize(stream));
+ for (size_t i = 0; i < numElements; ++i) {
+ auto si = h_soahd[i];
+ assert(si.r() == si.a().cross(si.b()));
+ double initialX = 1.0 * i + 1.0;
+ double initialY = 2.0 * i;
+ double initialZ = 3.0 * i - 1.0;
+ uint16_t expectedColor = 0x55 << i % (sizeof(uint16_t) - sizeof(char));
+ double expectedX = expectedColor * sqrt(initialX * initialX + initialY * initialY + initialZ * initialZ);
+ if (abs(si.x() - expectedX) / expectedX >= 2 * std::numeric_limits<double>::epsilon()) {
+ std::cout << "X failed: for i=" << i << std::endl
+ << "initialX=" << initialX << " initialY=" << initialY << " initialZ=" << initialZ << std::endl
+ << "expectedX=" << expectedX << std::endl
+ << "resultX=" << si.x() << " resultY=" << si.y() << " resultZ=" << si.z() << std::endl
+ << "relativeDiff=" << abs(si.x() - expectedX) / expectedX
+ << " epsilon=" << std::numeric_limits<double>::epsilon() << std::endl;
+ assert(false);
+ }
+ }
+
+ // Validation of range checking
+ try {
+ // Get a view like the default, except for range checking
+ SoAHostDeviceViewTemplate
+ soa1viewRangeChecking(h_soahdLayout);
+ // This should throw an exception
+ [[maybe_unused]] auto si = soa1viewRangeChecking[soa1viewRangeChecking.soaMetadata().size()];
+ assert(false);
+ } catch (const std::out_of_range&) {
+ }
+
+ // Validation of range checking in a kernel
+ // Get a view like the default, except for range checking
+ RangeCheckingHostDeviceView soa1viewRangeChecking(d_soahdLayout);
+ // This should throw an exception in the kernel
+ try {
+ rangeCheckKernel<<<1, 1, 0, stream>>>(soa1viewRangeChecking);
+ } catch (const std::out_of_range&) {
+ std::cout << "Exception received in enqueue." << std::endl;
+ }
+
+ // Wait and validate (that we failed).
+ try {
+ cudaCheck(cudaStreamSynchronize(stream));
+ } catch (const std::runtime_error&) {
+ std::cout << "Exception received in wait." << std::endl;
+ }
+
+ std::cout << "OK" << std::endl;
+}
diff --git a/src/cudadev/test/TrackingRecHit2DCUDA_t.cu b/src/cudadev/test/TrackingRecHit2DCUDA_t.cu
index 5f3a26391..efd76fd70 100644
--- a/src/cudadev/test/TrackingRecHit2DCUDA_t.cu
+++ b/src/cudadev/test/TrackingRecHit2DCUDA_t.cu
@@ -4,9 +4,9 @@
 namespace testTrackingRecHit2D {
- __global__ void fill(TrackingRecHit2DSOAView* phits) {
+ __global__ void fill(TrackingRecHit2DSOAStore* phits) {
 assert(phits);
- auto& hits = *phits;
+ [[maybe_unused]] auto& hits = *phits;
 assert(hits.nHits() == 200);
 int i = threadIdx.x;
@@ -14,9 +14,9 @@ namespace testTrackingRecHit2D {
 return;
 }
- __global__ void verify(TrackingRecHit2DSOAView const* phits) {
+ __global__ void verify(TrackingRecHit2DSOAStore const* phits) {
 assert(phits);
- auto const& hits = *phits;
+ [[maybe_unused]] auto const& hits = *phits;
 assert(hits.nHits() == 200);
 int i = threadIdx.x;
@@ -24,7 +24,7 @@ namespace testTrackingRecHit2D {
 return;
 }
- void runKernels(TrackingRecHit2DSOAView* hits) {
+ void runKernels(TrackingRecHit2DSOAStore* hits) {
 assert(hits);
 fill<<<1, 1024>>>(hits);
 verify<<<1, 1024>>>(hits);
@@ -34,7 +34,7 @@ namespace testTrackingRecHit2D {
- void runKernels(TrackingRecHit2DSOAView* hits);
+ void runKernels(TrackingRecHit2DSOAStore* hits);
 }
@@ -47,10 +47,10 @@ int main() {
 auto nHits = 200;
 TrackingRecHit2DCUDA tkhit(nHits, nullptr, nullptr, stream);
- testTrackingRecHit2D::runKernels(tkhit.view());
+ testTrackingRecHit2D::runKernels(tkhit.store());
 cudaCheck(cudaStreamDestroy(stream));
 return 0;
-}
+}
\ No newline at end of file
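Note (not part of the patch): the new SoA test above exercises a single-shared-buffer pattern that is easy to miss when reading the diff linearly: two layouts are carved out of one device allocation and aggregated into one view. The sketch below condenses that pattern; it reuses only the types and calls that already appear in the test (SoAHostDeviceLayout, SoADeviceOnlyLayout, SoAFullDeviceView, computeDataSize(), soaMetadata().nextByte(), producerKernel, consumerKernel), while the helper function name and its signature are illustrative only.

// Illustrative sketch, not part of the code base.
// Assumes d_buf points to at least
//   SoAHostDeviceLayout::computeDataSize(numElements) +
//   SoADeviceOnlyLayout::computeDataSize(numElements)
// bytes of device memory, owned by the caller.
void buildAndRunFullDeviceView(std::byte* d_buf, unsigned int numElements, cudaStream_t stream) {
  // The first layout starts at the beginning of the shared buffer.
  SoAHostDeviceLayout d_hdLayout(d_buf, numElements);
  // The second layout starts right after it; nextByte() already points past
  // the (aligned) end of the first layout's columns.
  SoADeviceOnlyLayout d_doLayout(d_hdLayout.soaMetadata().nextByte(), numElements);
  // A view aggregating the columns of both layouts, passed to kernels by value.
  SoAFullDeviceView fullView(d_hdLayout, d_doLayout);
  producerKernel<<<(numElements + 255) / 256, 256, 0, stream>>>(fullView, numElements);
  consumerKernel<<<(numElements + 255) / 256, 256, 0, stream>>>(fullView, numElements);
}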