cms-patatrack · ericcano · Oct 12, 2021 · Oct 12, 2021 · Oct 12, 2021 · Oct 12, 2021
diff --git a/src/cudadev/CUDACore/device_unique_ptr.h b/src/cudadev/CUDACore/device_unique_ptr.h
@@ -49,8 +49,8 @@ namespace cms {
 
  template <typename T>
  typename device::impl::make_device_unique_selector<T>::non_array make_device_unique(cudaStream_t stream) {
- static_assert(std::is_trivially_constructible<T>::value,
- "Allocating with non-trivial constructor on the device memory is not supported");
+ static_assert(std::is_trivially_copyable<T>::value,
+ "Allocating with non-trivial copy on the device memory is not supported");
  void *mem = allocate_device(sizeof(T), stream);
  return typename device::impl::make_device_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
  device::impl::DeviceDeleter{stream}};
@@ -60,37 +60,15 @@ namespace cms {
  typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique(size_t n,
  cudaStream_t stream) {
  using element_type = typename std::remove_extent<T>::type;
- static_assert(std::is_trivially_constructible<element_type>::value,
- "Allocating with non-trivial constructor on the device memory is not supported");
+ static_assert(std::is_trivially_copyable<element_type>::value,
+ "Allocating with non-trivial copy on the device memory is not supported");
  void *mem = allocate_device(n * sizeof(element_type), stream);
  return typename device::impl::make_device_unique_selector<T>::unbounded_array{
  reinterpret_cast<element_type *>(mem), device::impl::DeviceDeleter{stream}};
  }
 
  template <typename T, typename... Args>
  typename device::impl::make_device_unique_selector<T>::bounded_array make_device_unique(Args &&...) = delete;
-
- // No check for the trivial constructor, make it clear in the interface
- template <typename T>
- typename device::impl::make_device_unique_selector<T>::non_array make_device_unique_uninitialized(
- cudaStream_t stream) {
- void *mem = allocate_device(sizeof(T), stream);
- return typename device::impl::make_device_unique_selector<T>::non_array{reinterpret_cast<T *>(mem),
- device::impl::DeviceDeleter{stream}};
- }
-
- template <typename T>
- typename device::impl::make_device_unique_selector<T>::unbounded_array make_device_unique_uninitialized(
- size_t n, cudaStream_t stream) {
- using element_type = typename std::remove_extent<T>::type;
- void *mem = allocate_device(n * sizeof(element_type), stream);
- return typename device::impl::make_device_unique_selector<T>::unbounded_array{
- reinterpret_cast<element_type *>(mem), device::impl::DeviceDeleter{stream}};
- }
-
- template <typename T, typename... Args>
- typename device::impl::make_device_unique_selector<T>::bounded_array make_device_unique_uninitialized(Args &&...) =
- delete;
  } // namespace cuda
 } // namespace cms
 

diff --git a/src/cudadev/CUDACore/host_unique_ptr.h b/src/cudadev/CUDACore/host_unique_ptr.h
@@ -39,17 +39,17 @@ namespace cms {
  // Allocate pinned host memory
  template <typename T>
  typename host::impl::make_host_unique_selector<T>::non_array make_host_unique(cudaStream_t stream) {
- static_assert(std::is_trivially_constructible<T>::value,
- "Allocating with non-trivial constructor on the pinned host memory is not supported");
+ static_assert(std::is_trivially_copyable<T>::value,
+ "Allocating with non-trivial copy on the pinned host memory is not supported");
  void *mem = allocate_host(sizeof(T), stream);
  return typename host::impl::make_host_unique_selector<T>::non_array{reinterpret_cast<T *>(mem)};
  }
 
  template <typename T>
  typename host::impl::make_host_unique_selector<T>::unbounded_array make_host_unique(size_t n, cudaStream_t stream) {
  using element_type = typename std::remove_extent<T>::type;
- static_assert(std::is_trivially_constructible<element_type>::value,
- "Allocating with non-trivial constructor on the pinned host memory is not supported");
+ static_assert(std::is_trivially_copyable<element_type>::value,
+ "Allocating with non-trivial copy on the pinned host memory is not supported");
  void *mem = allocate_host(n * sizeof(element_type), stream);
  return typename host::impl::make_host_unique_selector<T>::unbounded_array{reinterpret_cast<element_type *>(mem)};
  }

diff --git a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.cc
@@ -3,17 +3,10 @@
 #include "CUDACore/host_unique_ptr.h"
 #include "CUDADataFormats/SiPixelClustersCUDA.h"
 
-SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream)
- : moduleStart_d(cms::cuda::make_device_unique<uint32_t[]>(maxModules + 1, stream)),
- clusInModule_d(cms::cuda::make_device_unique<uint32_t[]>(maxModules, stream)),
- moduleId_d(cms::cuda::make_device_unique<uint32_t[]>(maxModules, stream)),
- clusModuleStart_d(cms::cuda::make_device_unique<uint32_t[]>(maxModules + 1, stream)) {
- auto view = cms::cuda::make_host_unique<DeviceConstView>(stream);
- view->moduleStart_ = moduleStart_d.get();
- view->clusInModule_ = clusInModule_d.get();
- view->moduleId_ = moduleId_d.get();
- view->clusModuleStart_ = clusModuleStart_d.get();
+SiPixelClustersCUDA::SiPixelClustersCUDA(): data_d(), deviceLayout_(data_d.get(), 0), deviceView_(deviceLayout_) {}
 
- view_d = cms::cuda::make_device_unique<DeviceConstView>(stream);
- cms::cuda::copyAsync(view_d, view, stream);
-}
+// One device buffer for all columns; moduleStart and clusModuleStart need maxModules + 1 entries, so size the layout for that.
+SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream)
+ : data_d(cms::cuda::make_device_unique<std::byte[]>(DeviceLayout::computeDataSize(maxModules + 1), stream)),
+ deviceLayout_(data_d.get(), maxModules + 1),
+ deviceView_(deviceLayout_) {}
diff --git a/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h b/src/cudadev/CUDADataFormats/SiPixelClustersCUDA.h
@@ -4,12 +4,54 @@
 #include "CUDACore/device_unique_ptr.h"
 #include "CUDACore/host_unique_ptr.h"
 #include "CUDACore/cudaCompat.h"
+#include "DataFormats/SoALayout.h"
+#include "DataFormats/SoAView.h"
 
 #include <cuda_runtime.h>
 
 class SiPixelClustersCUDA {
 public:
- SiPixelClustersCUDA() = default;
+ GENERATE_SOA_LAYOUT(DeviceLayoutTemplate,
+ SOA_COLUMN(uint32_t, moduleStart), // index of the first pixel of each module
+ SOA_COLUMN(uint32_t, clusInModule), // number of clusters found in each module
+ SOA_COLUMN(uint32_t, moduleId), // module id of each module
+
+ // originally from rechits
+ SOA_COLUMN(uint32_t, clusModuleStart) // index of the first cluster of each module
+ )
+
+ // We use all defaults for the template parameters.
+ using DeviceLayout = DeviceLayoutTemplate<>;
+
+ GENERATE_SOA_VIEW(DeviceViewTemplate,
+ SOA_VIEW_LAYOUT_LIST(SOA_VIEW_LAYOUT(DeviceLayout, deviceLayout)),
+ SOA_VIEW_VALUE_LIST(
+ SOA_VIEW_VALUE(deviceLayout, moduleStart), // index of the first pixel of each module
+ SOA_VIEW_VALUE(deviceLayout, clusInModule), // number of clusters found in each module
+ SOA_VIEW_VALUE(deviceLayout, moduleId), // module id of each module
+
+ // originally from rechits
+ SOA_VIEW_VALUE(deviceLayout, clusModuleStart) // index of the first cluster of each module
+ )
+ )
+
+ using DeviceView = DeviceViewTemplate<>;
+
+ GENERATE_SOA_CONST_VIEW(DeviceConstViewTemplate,
+ SOA_VIEW_LAYOUT_LIST(SOA_VIEW_LAYOUT(DeviceView, deviceView)),
+ SOA_VIEW_VALUE_LIST(
+ SOA_VIEW_VALUE(deviceView, moduleStart), // index of the first pixel of each module
+ SOA_VIEW_VALUE(deviceView, clusInModule), // number of clusters found in each module
+ SOA_VIEW_VALUE(deviceView, moduleId), // module id of each module
+
+ // originally from rechits
+ SOA_VIEW_VALUE(deviceView, clusModuleStart) // index of the first cluster of each module
+ )
+ )
+
+ using DeviceConstView = DeviceConstViewTemplate<>;
+
+ explicit SiPixelClustersCUDA();
  explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream);
  ~SiPixelClustersCUDA() = default;
 
@@ -22,41 +64,23 @@ class SiPixelClustersCUDA {
 
  uint32_t nClusters() const { return nClusters_h; }
 
- uint32_t *moduleStart() { return moduleStart_d.get(); }
- uint32_t *clusInModule() { return clusInModule_d.get(); }
- uint32_t *moduleId() { return moduleId_d.get(); }
- uint32_t *clusModuleStart() { return clusModuleStart_d.get(); }
+ uint32_t *moduleStart() { return deviceView_.moduleStart(); }
+ uint32_t *clusInModule() { return deviceView_.clusInModule(); }
+ uint32_t *moduleId() { return deviceView_.moduleId(); }
+ uint32_t *clusModuleStart() { return deviceView_.clusModuleStart(); }
 
- uint32_t const *moduleStart() const { return moduleStart_d.get(); }
- uint32_t const *clusInModule() const { return clusInModule_d.get(); }
- uint32_t const *moduleId() const { return moduleId_d.get(); }
- uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); }
+ uint32_t const *moduleStart() const { return deviceView_.moduleStart(); }
+ uint32_t const *clusInModule() const { return deviceView_.clusInModule(); }
+ uint32_t const *moduleId() const { return deviceView_.moduleId(); }
+ uint32_t const *clusModuleStart() const { return deviceView_.clusModuleStart(); }
 
- class DeviceConstView {
- public:
- __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_ + i); }
- __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_ + i); }
- __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_ + i); }
- __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_ + i); }
-
- uint32_t const *moduleStart_;
- uint32_t const *clusInModule_;
- uint32_t const *moduleId_;
- uint32_t const *clusModuleStart_;
- };
-
- DeviceConstView *view() const { return view_d.get(); }
+ DeviceConstView view() const { return DeviceConstView(deviceView_); }
 
 private:
- cms::cuda::device::unique_ptr<uint32_t[]> moduleStart_d; // index of the first pixel of each module
- cms::cuda::device::unique_ptr<uint32_t[]> clusInModule_d; // number of clusters found in each module
- cms::cuda::device::unique_ptr<uint32_t[]> moduleId_d; // module id of each module
-
- // originally from rechits
- cms::cuda::device::unique_ptr<uint32_t[]> clusModuleStart_d; // index of the first cluster of each module
-
- cms::cuda::device::unique_ptr<DeviceConstView> view_d; // "me" pointer
-
+ cms::cuda::device::unique_ptr<std::byte[]> data_d; // Single SoA storage
+ DeviceLayout deviceLayout_;
+ DeviceView deviceView_;
+
  uint32_t nClusters_h = 0;
 };
 

diff --git a/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc b/src/cudadev/CUDADataFormats/SiPixelDigisCUDA.cc
@@ -5,44 +5,57 @@
 #include "CUDACore/host_unique_ptr.h"
 
 SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream)
- : xx_d(cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream)),
- yy_d(cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream)),
- adc_d(cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream)),
- moduleInd_d(cms::cuda::make_device_unique<uint16_t[]>(maxFedWords, stream)),
- clus_d(cms::cuda::make_device_unique<int32_t[]>(maxFedWords, stream)),
- view_d(cms::cuda::make_device_unique<DeviceConstView>(stream)),
- pdigi_d(cms::cuda::make_device_unique<uint32_t[]>(maxFedWords, stream)),
- rawIdArr_d(cms::cuda::make_device_unique<uint32_t[]>(maxFedWords, stream)) {
- auto view = cms::cuda::make_host_unique<DeviceConstView>(stream);
- view->xx_ = xx_d.get();
- view->yy_ = yy_d.get();
- view->adc_ = adc_d.get();
- view->moduleInd_ = moduleInd_d.get();
- view->clus_ = clus_d.get();
-
- cms::cuda::copyAsync(view_d, view, stream);
-}
+ : data_d(cms::cuda::make_device_unique<std::byte[]>(
+ DeviceOnlyLayout::computeDataSize(maxFedWords) + 
+ HostDeviceLayout::computeDataSize(maxFedWords),
+ stream)),
+ deviceOnlyLayout_d(data_d.get(), maxFedWords),
+ hostDeviceLayout_d(deviceOnlyLayout_d.soaMetadata().nextByte(), maxFedWords),
+ deviceFullView_(deviceOnlyLayout_d, hostDeviceLayout_d),
+ devicePixelConstView_(deviceFullView_)
+{}
 
-cms::cuda::host::unique_ptr<uint16_t[]> SiPixelDigisCUDA::adcToHostAsync(cudaStream_t stream) const {
- auto ret = cms::cuda::make_host_unique<uint16_t[]>(nDigis(), stream);
- cms::cuda::copyAsync(ret, adc_d, nDigis(), stream);
- return ret;
-}
+SiPixelDigisCUDA::SiPixelDigisCUDA()
+ : data_d(),deviceOnlyLayout_d(), hostDeviceLayout_d(), deviceFullView_(), devicePixelConstView_()
+{}
 
-cms::cuda::host::unique_ptr<int32_t[]> SiPixelDigisCUDA::clusToHostAsync(cudaStream_t stream) const {
- auto ret = cms::cuda::make_host_unique<int32_t[]>(nDigis(), stream);
- cms::cuda::copyAsync(ret, clus_d, nDigis(), stream);
- return ret;
+SiPixelDigisCUDA::HostStore::HostStore()
+ : data_h(), hostLayout_(nullptr, 0), hostView_(hostLayout_)
+{}
+
+SiPixelDigisCUDA::HostStore::HostStore(size_t maxFedWords, cudaStream_t stream)
+ : data_h(cms::cuda::make_host_unique<std::byte[]>(SiPixelDigisCUDA::HostDeviceLayout::computeDataSize(maxFedWords), stream)),
+ hostLayout_(data_h.get(), maxFedWords),
+ hostView_(hostLayout_)
+{}
+
+void SiPixelDigisCUDA::HostStore::reset() {  // Return the store to an empty state.
+ hostLayout_ = HostDeviceLayout();  // replace the layout with a default-constructed one
+ hostView_ = HostDeviceView(hostLayout_);  // rebuild the view from the reset layout
+ data_h.reset();  // finally release the owned host buffer
+}
 
-cms::cuda::host::unique_ptr<uint32_t[]> SiPixelDigisCUDA::pdigiToHostAsync(cudaStream_t stream) const {
- auto ret = cms::cuda::make_host_unique<uint32_t[]>(nDigis(), stream);
- cms::cuda::copyAsync(ret, pdigi_d, nDigis(), stream);
+cms::cuda::host::unique_ptr<uint16_t[]> SiPixelDigisCUDA::adcToHostAsync(cudaStream_t stream) const {
+ auto ret = cms::cuda::make_host_unique<uint16_t[]>(nDigis(), stream);
+ // TODO: plain cudaMemcpyAsync is used instead of cms::cuda::copyAsync because only the first nDigis() adc values are copied, not a whole allocation.
+ cudaCheck(cudaMemcpyAsync(ret.get(), deviceFullView_.adc(), nDigis() * sizeof(decltype(ret[0])), cudaMemcpyDeviceToHost, stream));
  return ret;
 }
 
-cms::cuda::host::unique_ptr<uint32_t[]> SiPixelDigisCUDA::rawIdArrToHostAsync(cudaStream_t stream) const {
- auto ret = cms::cuda::make_host_unique<uint32_t[]>(nDigis(), stream);
- cms::cuda::copyAsync(ret, rawIdArr_d, nDigis(), stream);
+// Copies adc, clus, pdigi and rawIdArr for the first nDigis() digis into a newly allocated, compact pinned-host HostStore.
+SiPixelDigisCUDA::HostStore SiPixelDigisCUDA::dataToHostAsync(cudaStream_t stream) const {
+ // The host store is sized for nDigis() only, so its column pitches differ from the larger device store; we therefore
+ // work on the layouts (as opposed to the views, which are unaware of the column pitches).
+ HostStore ret(nDigis(), stream);
+ auto rhlsm = ret.hostLayout_.soaMetadata();
+ auto hdlsm_d = hostDeviceLayout_d.soaMetadata();
+ cudaCheck(cudaMemcpyAsync(rhlsm.addressOf_adc(), hdlsm_d.addressOf_adc(), nDigis_h * sizeof(*rhlsm.addressOf_adc()),
+ cudaMemcpyDeviceToHost, stream));
+ // clus, pdigi and rawIdArr share the same geometry: copy them as 3 rows, one per column, each nDigis() elements wide.
+ // cudaMemcpy2DAsync takes width (bytes per row) before height (number of rows).
+ cudaCheck(cudaMemcpy2DAsync(rhlsm.addressOf_clus(), rhlsm.clusPitch(),
+ hdlsm_d.addressOf_clus(), hdlsm_d.clusPitch(),
+ nDigis() * sizeof(decltype (*ret.hostView_.clus())) /* width in bytes */,
+ 3 /* rows */, cudaMemcpyDeviceToHost, stream));
  return ret;
-}
+}