Adapt the EDProducerExternalWork modules to the new interface
fwyzard committed Dec 7, 2020
1 parent dfeeaab commit 1ddc75c
Showing 5 changed files with 149 additions and 140 deletions.
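All of the diffs below follow one pattern: per-event data that each module used to keep in mutable data members (m_soa, ctxState_, gpuAlgo_, pdigi_ and friends) now lives in an AsyncState object that the framework creates and passes explicitly to both acquire() and produce(), so the modules carry no per-event state of their own. The standalone sketch below illustrates that pattern only; ExternalWorkProducer, processEvent() and the toy int state are assumptions made for the example and are not the real edm interface.

// Standalone sketch of the external-work-with-state pattern (assumed names,
// not the real framework API): the caller owns one AsyncState per event in
// flight and hands it to both acquire() and produce().
#include <iostream>
#include <memory>

template <typename AsyncState>
class ExternalWorkProducer {
public:
  virtual ~ExternalWorkProducer() = default;

  // Stand-in for the framework's event loop: create the state, run the
  // asynchronous acquire step, then the produce step that consumes it.
  void processEvent(int eventId) {
    AsyncState state{};
    acquire(eventId, state);
    produce(eventId, state);
  }

private:
  virtual void acquire(int eventId, AsyncState& state) const = 0;
  virtual void produce(int eventId, AsyncState& state) = 0;
};

// Toy analogue of PixelTrackSoAFromCUDA: the state is just a host buffer
// (in the real module, a cms::cuda::host::unique_ptr<pixelTrack::TrackSoA>).
class ToyTrackProducer : public ExternalWorkProducer<std::unique_ptr<int>> {
  void acquire(int eventId, std::unique_ptr<int>& state) const override {
    state = std::make_unique<int>(eventId * 42);  // stand-in for toHostAsync()
  }
  void produce(int eventId, std::unique_ptr<int>& state) override {
    std::cout << "event " << eventId << " -> " << *state << '\n';
    state.reset();  // the state is consumed; nothing lingers in the module
  }
};

int main() {
  ToyTrackProducer producer;
  producer.processEvent(1);
  producer.processEvent(2);
}

Each module below picks its own AsyncState: a single host unique_ptr for the two SoA converters, a unique_ptr to a small struct for SiPixelRawToClusterCUDA, and a plain struct of host buffers for SiPixelDigisSoAFromCUDA.
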
24 changes: 13 additions & 11 deletions src/cuda/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc
@@ -9,21 +9,22 @@
#include "Framework/EDProducer.h"
#include "CUDACore/ScopedContext.h"

class PixelTrackSoAFromCUDA : public edm::EDProducerExternalWork {
using PixelTrackSoAFromCUDA_AsyncState = cms::cuda::host::unique_ptr<pixelTrack::TrackSoA>;

class PixelTrackSoAFromCUDA : public edm::EDProducerExternalWork<PixelTrackSoAFromCUDA_AsyncState> {
public:
explicit PixelTrackSoAFromCUDA(edm::ProductRegistry& reg);
~PixelTrackSoAFromCUDA() override = default;

private:
void acquire(edm::Event const& iEvent,
edm::EventSetup const& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
edm::WaitingTaskWithArenaHolder waitingTaskHolder,
AsyncState& state) const override;
void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, AsyncState& state) override;

edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenCUDA_;
edm::EDPutTokenT<PixelTrackHeterogeneous> tokenSOA_;

cms::cuda::host::unique_ptr<pixelTrack::TrackSoA> m_soa;
};

PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(edm::ProductRegistry& reg)
@@ -32,17 +33,18 @@ PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(edm::ProductRegistry& reg)

void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
edm::EventSetup const& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
edm::WaitingTaskWithArenaHolder waitingTaskHolder,
AsyncState& state) const {
cms::cuda::Product<PixelTrackHeterogeneous> const& inputDataWrapped = iEvent.get(tokenCUDA_);
cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
auto const& inputData = ctx.get(inputDataWrapped);

m_soa = inputData.toHostAsync(ctx.stream());
state = inputData.toHostAsync(ctx.stream());
}

void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, AsyncState& state) {
/*
auto const & tsoa = *m_soa;
auto const & tsoa = *state;
auto maxTracks = tsoa.stride();
std::cout << "size of SoA" << sizeof(tsoa) << " stride " << maxTracks << std::endl;
@@ -57,9 +59,9 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i
*/

// DO NOT make a copy (actually TWO....)
iEvent.emplace(tokenSOA_, PixelTrackHeterogeneous(std::move(m_soa)));
iEvent.emplace(tokenSOA_, PixelTrackHeterogeneous(std::move(state)));

assert(!m_soa);
assert(!state);
}

DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA);
20 changes: 11 additions & 9 deletions src/cuda/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc
@@ -10,21 +10,22 @@
#include "Framework/RunningAverage.h"
#include "CUDACore/ScopedContext.h"

class PixelVertexSoAFromCUDA : public edm::EDProducerExternalWork {
using PixelVertexSoAFromCUDA_AsyncState = cms::cuda::host::unique_ptr<ZVertexSoA>;

class PixelVertexSoAFromCUDA : public edm::EDProducerExternalWork<PixelVertexSoAFromCUDA_AsyncState> {
public:
explicit PixelVertexSoAFromCUDA(edm::ProductRegistry& reg);
~PixelVertexSoAFromCUDA() override = default;

private:
void acquire(edm::Event const& iEvent,
edm::EventSetup const& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
edm::WaitingTaskWithArenaHolder waitingTaskHolder,
AsyncState& state) const override;
void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, AsyncState& state) override;

edm::EDGetTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenCUDA_;
edm::EDPutTokenT<ZVertexHeterogeneous> tokenSOA_;

cms::cuda::host::unique_ptr<ZVertexSoA> m_soa;
};

PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(edm::ProductRegistry& reg)
@@ -33,17 +34,18 @@ PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(edm::ProductRegistry& reg)

void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent,
edm::EventSetup const& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
edm::WaitingTaskWithArenaHolder waitingTaskHolder,
AsyncState& state) const {
auto const& inputDataWrapped = iEvent.get(tokenCUDA_);
cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
auto const& inputData = ctx.get(inputDataWrapped);

m_soa = inputData.toHostAsync(ctx.stream());
state = inputData.toHostAsync(ctx.stream());
}

void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, AsyncState& state) {
// No copies....
iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa)));
iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(state)));
}

DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA);
76 changes: 39 additions & 37 deletions src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
@@ -22,27 +22,28 @@
#include <string>
#include <vector>

class SiPixelRawToClusterCUDA : public edm::EDProducerExternalWork {
struct SiPixelRawToClusterCUDA_AsyncStateImpl {
cms::cuda::ContextState ctx;
pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo;
};

class SiPixelRawToClusterCUDA
: public edm::EDProducerExternalWork<std::unique_ptr<SiPixelRawToClusterCUDA_AsyncStateImpl>> {
public:
explicit SiPixelRawToClusterCUDA(edm::ProductRegistry& reg);
~SiPixelRawToClusterCUDA() override = default;

private:
void acquire(const edm::Event& iEvent,
const edm::EventSetup& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override;

cms::cuda::ContextState ctxState_;
edm::WaitingTaskWithArenaHolder waitingTaskHolder,
AsyncState& state) const override;
void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, AsyncState& state) override;

edm::EDGetTokenT<FEDRawDataCollection> rawGetToken_;
edm::EDPutTokenT<cms::cuda::Product<SiPixelDigisCUDA>> digiPutToken_;
const edm::EDGetTokenT<FEDRawDataCollection> rawGetToken_;
const edm::EDPutTokenT<cms::cuda::Product<SiPixelDigisCUDA>> digiPutToken_;
edm::EDPutTokenT<cms::cuda::Product<SiPixelDigiErrorsCUDA>> digiErrorPutToken_;
edm::EDPutTokenT<cms::cuda::Product<SiPixelClustersCUDA>> clusterPutToken_;

pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_;
std::unique_ptr<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender> wordFedAppender_;
PixelFormatterErrors errors_;
const edm::EDPutTokenT<cms::cuda::Product<SiPixelClustersCUDA>> clusterPutToken_;

const bool isRun2_;
const bool includeErrors_;
@@ -59,14 +60,14 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(edm::ProductRegistry& reg)
if (includeErrors_) {
digiErrorPutToken_ = reg.produces<cms::cuda::Product<SiPixelDigiErrorsCUDA>>();
}

wordFedAppender_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender>();
}

void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,
const edm::EventSetup& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_};
edm::WaitingTaskWithArenaHolder waitingTaskHolder,
AsyncState& state) const {
state = std::make_unique<SiPixelRawToClusterCUDA_AsyncStateImpl>();
cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), state->ctx};

auto const& hgpuMap = iSetup.get<SiPixelFedCablingMapGPUWrapper>();
if (hgpuMap.hasQuality() != useQuality_) {
@@ -85,7 +86,8 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,

const auto& buffers = iEvent.get(rawGetToken_);

errors_.clear();
PixelFormatterErrors errors;
pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender wordFedAppender;

// GPU specific: Data extraction for RawToDigi GPU
unsigned int wordCounterGPU = 0;
@@ -115,7 +117,7 @@

// check CRC bit
const uint64_t* trailer = reinterpret_cast<const uint64_t*>(rawData.data()) + (nWords - 1);
if (not errorcheck.checkCRC(errorsInEvent, fedId, trailer, errors_)) {
if (not errorcheck.checkCRC(errorsInEvent, fedId, trailer, errors)) {
continue;
}

@@ -125,7 +127,7 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,
bool moreHeaders = true;
while (moreHeaders) {
header++;
bool headerStatus = errorcheck.checkHeader(errorsInEvent, fedId, header, errors_);
bool headerStatus = errorcheck.checkHeader(errorsInEvent, fedId, header, errors);
moreHeaders = headerStatus;
}

@@ -134,41 +136,41 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,
trailer++;
while (moreTrailers) {
trailer--;
bool trailerStatus = errorcheck.checkTrailer(errorsInEvent, fedId, nWords, trailer, errors_);
bool trailerStatus = errorcheck.checkTrailer(errorsInEvent, fedId, nWords, trailer, errors);
moreTrailers = trailerStatus;
}

const uint32_t* bw = (const uint32_t*)(header + 1);
const uint32_t* ew = (const uint32_t*)(trailer);

assert(0 == (ew - bw) % 2);
wordFedAppender_->initializeWordFed(fedId, wordCounterGPU, bw, (ew - bw));
wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew - bw));
wordCounterGPU += (ew - bw);

} // end of for loop

gpuAlgo_.makeClustersAsync(isRun2_,
gpuMap,
gpuModulesToUnpack,
gpuGains,
*wordFedAppender_,
std::move(errors_),
wordCounterGPU,
fedCounter,
useQuality_,
includeErrors_,
false, // debug
ctx.stream());
state->gpuAlgo.makeClustersAsync(isRun2_,
gpuMap,
gpuModulesToUnpack,
gpuGains,
wordFedAppender,
std::move(errors),
wordCounterGPU,
fedCounter,
useQuality_,
includeErrors_,
false, // debug
ctx.stream());
}

void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
cms::cuda::ScopedContextProduce ctx{ctxState_};
void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, AsyncState& state) {
cms::cuda::ScopedContextProduce ctx{state->ctx};

auto tmp = gpuAlgo_.getResults();
auto tmp = state->gpuAlgo.getResults();
ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first));
ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second));
if (includeErrors_) {
ctx.emplace(iEvent, digiErrorPutToken_, gpuAlgo_.getErrors());
ctx.emplace(iEvent, digiErrorPutToken_, state->gpuAlgo.getErrors());
}
}

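
Unlike the SoA converters, SiPixelRawToClusterCUDA bundles its per-event state (a cms::cuda::ContextState plus the SiPixelRawToClusterGPUKernel) into a struct that acquire() allocates with std::make_unique. The minimal sketch below shows that wrapping; it assumes (the commit does not say so) that the point is to keep the AsyncState itself cheap to default-construct, move and reset even when its contents are not, and HeavyAsyncState with the free acquire()/produce() functions are invented for the example.

// Sketch: hold heavy or non-movable per-event members behind a unique_ptr,
// as SiPixelRawToClusterCUDA does above. All names here are illustrative.
#include <iostream>
#include <memory>

struct HeavyAsyncState {
  HeavyAsyncState() = default;
  // Deleting the copy operations also suppresses the implicit moves, so this
  // type could not itself serve as a movable AsyncState without the wrapper.
  HeavyAsyncState(const HeavyAsyncState&) = delete;
  HeavyAsyncState& operator=(const HeavyAsyncState&) = delete;

  int context = 0;       // stand-in for cms::cuda::ContextState
  int gpuAlgoState = 0;  // stand-in for SiPixelRawToClusterGPUKernel
};

using AsyncState = std::unique_ptr<HeavyAsyncState>;

void acquire(AsyncState& state) {
  state = std::make_unique<HeavyAsyncState>();  // as in acquire() above
  state->context = 1;
  state->gpuAlgoState = 2;
}

void produce(AsyncState& state) {
  std::cout << state->context + state->gpuAlgoState << '\n';
  state.reset();  // release everything once the products are emplaced
}

int main() {
  AsyncState state;
  acquire(state);
  produce(state);
}
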
48 changes: 22 additions & 26 deletions src/cuda/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc
@@ -8,26 +8,28 @@
#include "CUDACore/ScopedContext.h"
#include "CUDACore/host_unique_ptr.h"

class SiPixelDigisSoAFromCUDA : public edm::EDProducerExternalWork {
struct SiPixelDigisSoAFromCUDA_AsyncState {
cms::cuda::host::unique_ptr<uint32_t[]> pdigi;
cms::cuda::host::unique_ptr<uint32_t[]> rawIdArr;
cms::cuda::host::unique_ptr<uint16_t[]> adc;
cms::cuda::host::unique_ptr<int32_t[]> clus;
size_t nDigis;
};

class SiPixelDigisSoAFromCUDA : public edm::EDProducerExternalWork<SiPixelDigisSoAFromCUDA_AsyncState> {
public:
explicit SiPixelDigisSoAFromCUDA(edm::ProductRegistry& reg);
~SiPixelDigisSoAFromCUDA() override = default;

private:
void acquire(const edm::Event& iEvent,
const edm::EventSetup& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override;

edm::EDGetTokenT<cms::cuda::Product<SiPixelDigisCUDA>> digiGetToken_;
edm::EDPutTokenT<SiPixelDigisSoA> digiPutToken_;
edm::WaitingTaskWithArenaHolder waitingTaskHolder,
AsyncState& state) const override;
void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, AsyncState& state) override;

cms::cuda::host::unique_ptr<uint32_t[]> pdigi_;
cms::cuda::host::unique_ptr<uint32_t[]> rawIdArr_;
cms::cuda::host::unique_ptr<uint16_t[]> adc_;
cms::cuda::host::unique_ptr<int32_t[]> clus_;

size_t nDigis_;
const edm::EDGetTokenT<cms::cuda::Product<SiPixelDigisCUDA>> digiGetToken_;
const edm::EDPutTokenT<SiPixelDigisSoA> digiPutToken_;
};

SiPixelDigisSoAFromCUDA::SiPixelDigisSoAFromCUDA(edm::ProductRegistry& reg)
@@ -36,20 +38,18 @@ SiPixelDigisSoAFromCUDA::SiPixelDigisSoAFromCUDA(edm::ProductRegistry& reg)

void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent,
const edm::EventSetup& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
edm::WaitingTaskWithArenaHolder waitingTaskHolder,
AsyncState& state) const {
// Do the transfer in a CUDA stream parallel to the computation CUDA stream
cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)};

const auto& gpuDigis = ctx.get(iEvent, digiGetToken_);

nDigis_ = gpuDigis.nDigis();
pdigi_ = gpuDigis.pdigiToHostAsync(ctx.stream());
rawIdArr_ = gpuDigis.rawIdArrToHostAsync(ctx.stream());
adc_ = gpuDigis.adcToHostAsync(ctx.stream());
clus_ = gpuDigis.clusToHostAsync(ctx.stream());
state.pdigi = gpuDigis.pdigiToHostAsync(ctx.stream()), state.rawIdArr = gpuDigis.rawIdArrToHostAsync(ctx.stream()),
state.adc = gpuDigis.adcToHostAsync(ctx.stream()), state.clus = gpuDigis.clusToHostAsync(ctx.stream()),
state.nDigis = gpuDigis.nDigis();
}

void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, AsyncState& state) {
// The following line copies the data from the pinned host memory to
// regular host memory. In principle that feels unnecessary (why not
// just use the pinned host memory?). There are a few arguments for
@@ -60,12 +60,8 @@ void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup&
// host memory to be allocated without a CUDA stream
// - What if a CPU algorithm would produce the same SoA? We can't
// use cudaMallocHost without a GPU...
iEvent.emplace(digiPutToken_, nDigis_, pdigi_.get(), rawIdArr_.get(), adc_.get(), clus_.get());

pdigi_.reset();
rawIdArr_.reset();
adc_.reset();
clus_.reset();
iEvent.emplace(
digiPutToken_, state.nDigis, state.pdigi.get(), state.rawIdArr.get(), state.adc.get(), state.clus.get());
}

// define as framework plugin
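
The long comment in SiPixelDigisSoAFromCUDA::produce explains why the digis are copied from pinned host memory into regular host memory when the SiPixelDigisSoA product is emplaced. The sketch below is a rough standalone illustration of that trade-off, with the pinned transfer buffers faked by ordinary unique_ptr arrays; DigisSoA and makeProduct are invented names, and the real buffers are stream-aware cms::cuda::host::unique_ptr allocations.

// Sketch only: copy from short-lived pinned transfer buffers into ordinary
// host containers, so the long-lived event product neither keeps scarce
// pinned memory alive nor depends on CUDA.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

struct DigisSoA {  // stand-in for SiPixelDigisSoA
  std::vector<std::uint32_t> pdigi;
  std::vector<std::uint16_t> adc;
};

DigisSoA makeProduct(std::size_t nDigis, const std::uint32_t* pinnedPdigi, const std::uint16_t* pinnedAdc) {
  // Mirrors iEvent.emplace(digiPutToken_, nDigis, ...): the product owns
  // ordinary memory, and the pinned buffers can be recycled right away.
  DigisSoA product;
  product.pdigi.assign(pinnedPdigi, pinnedPdigi + nDigis);
  product.adc.assign(pinnedAdc, pinnedAdc + nDigis);
  return product;
}

int main() {
  // Pretend these came from cudaMallocHost-backed buffers filled in acquire().
  constexpr std::size_t nDigis = 4;
  auto pdigi = std::make_unique<std::uint32_t[]>(nDigis);
  auto adc = std::make_unique<std::uint16_t[]>(nDigis);
  for (std::size_t i = 0; i < nDigis; ++i) {
    pdigi[i] = static_cast<std::uint32_t>(i);
    adc[i] = static_cast<std::uint16_t>(10 * i);
  }

  DigisSoA product = makeProduct(nDigis, pdigi.get(), adc.get());
  std::cout << product.pdigi.size() << " digis copied into the product\n";
}
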