Adapt the EDProducerExternalWork modules to the new interface
fwyzard committed Dec 7, 2020
1 parent dfeeaab commit 1ddc75c
Showing 5 changed files with 149 additions and 140 deletions.
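All of the diffs below follow one pattern: per-event data that each module used to keep in mutable data members (m_soa, ctxState_, gpuAlgo_, pdigi_ and friends) now lives in an AsyncState object that the framework creates and passes explicitly to both acquire() and produce(), so the modules carry no per-event state of their own. The standalone sketch below illustrates that pattern only; ExternalWorkProducer, processEvent() and the toy int state are assumptions made for the example and are not the real edm interface.

// Standalone sketch of the external-work-with-state pattern (assumed names,
// not the real framework API): the caller owns one AsyncState per event in
// flight and hands it to both acquire() and produce().
#include <iostream>
#include <memory>

template <typename AsyncState>
class ExternalWorkProducer {
public:
  virtual ~ExternalWorkProducer() = default;

  // Stand-in for the framework's event loop: create the state, run the
  // asynchronous acquire step, then the produce step that consumes it.
  void processEvent(int eventId) {
    AsyncState state{};
    acquire(eventId, state);
    produce(eventId, state);
  }

private:
  virtual void acquire(int eventId, AsyncState& state) const = 0;
  virtual void produce(int eventId, AsyncState& state) = 0;
};

// Toy analogue of PixelTrackSoAFromCUDA: the state is just a host buffer
// (in the real module, a cms::cuda::host::unique_ptr<pixelTrack::TrackSoA>).
class ToyTrackProducer : public ExternalWorkProducer<std::unique_ptr<int>> {
  void acquire(int eventId, std::unique_ptr<int>& state) const override {
    state = std::make_unique<int>(eventId * 42);  // stand-in for toHostAsync()
  }
  void produce(int eventId, std::unique_ptr<int>& state) override {
    std::cout << "event " << eventId << " -> " << *state << '\n';
    state.reset();  // the state is consumed; nothing lingers in the module
  }
};

int main() {
  ToyTrackProducer producer;
  producer.processEvent(1);
  producer.processEvent(2);
}

Each module below picks its own AsyncState: a single host unique_ptr for the two SoA converters, a unique_ptr to a small struct for SiPixelRawToClusterCUDA, and a plain struct of host buffers for SiPixelDigisSoAFromCUDA.
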
24 changes: 13 additions & 11 deletions src/cuda/plugin-PixelTrackFitting/PixelTrackSoAFromCUDA.cc
@@ -9,21 +9,22 @@
#include "Framework/EDProducer.h"
#include "CUDACore/ScopedContext.h"

class PixelTrackSoAFromCUDA : public edm::EDProducerExternalWork {
using PixelTrackSoAFromCUDA_AsyncState = cms::cuda::host::unique_ptr<pixelTrack::TrackSoA>;

class PixelTrackSoAFromCUDA : public edm::EDProducerExternalWork<PixelTrackSoAFromCUDA_AsyncState> {
public:
explicit PixelTrackSoAFromCUDA(edm::ProductRegistry& reg);
~PixelTrackSoAFromCUDA() override = default;

private:
void acquire(edm::Event const& iEvent,
edm::EventSetup const& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
edm::WaitingTaskWithArenaHolder waitingTaskHolder,
AsyncState& state) const override;
void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, AsyncState& state) override;

edm::EDGetTokenT<cms::cuda::Product<PixelTrackHeterogeneous>> tokenCUDA_;
edm::EDPutTokenT<PixelTrackHeterogeneous> tokenSOA_;

cms::cuda::host::unique_ptr<pixelTrack::TrackSoA> m_soa;
};

PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(edm::ProductRegistry& reg)
@@ -32,17 +33,18 @@ PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(edm::ProductRegistry& reg)

void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
edm::EventSetup const& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
edm::WaitingTaskWithArenaHolder waitingTaskHolder,
AsyncState& state) const {
cms::cuda::Product<PixelTrackHeterogeneous> const& inputDataWrapped = iEvent.get(tokenCUDA_);
cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
auto const& inputData = ctx.get(inputDataWrapped);

m_soa = inputData.toHostAsync(ctx.stream());
state = inputData.toHostAsync(ctx.stream());
}

void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, AsyncState& state) {
/*
auto const & tsoa = *m_soa;
auto const & tsoa = *state;
auto maxTracks = tsoa.stride();
std::cout << "size of SoA" << sizeof(tsoa) << " stride " << maxTracks << std::endl;
@@ -57,9 +59,9 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i
*/

// DO NOT make a copy (actually TWO....)
iEvent.emplace(tokenSOA_, PixelTrackHeterogeneous(std::move(m_soa)));
iEvent.emplace(tokenSOA_, PixelTrackHeterogeneous(std::move(state)));

assert(!m_soa);
assert(!state);
}

DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA);
20 changes: 11 additions & 9 deletions src/cuda/plugin-PixelVertexFinding/PixelVertexSoAFromCUDA.cc
@@ -10,21 +10,22 @@
#include "Framework/RunningAverage.h"
#include "CUDACore/ScopedContext.h"

class PixelVertexSoAFromCUDA : public edm::EDProducerExternalWork {
using PixelVertexSoAFromCUDA_AsyncState = cms::cuda::host::unique_ptr<ZVertexSoA>;

class PixelVertexSoAFromCUDA : public edm::EDProducerExternalWork<PixelVertexSoAFromCUDA_AsyncState> {
public:
explicit PixelVertexSoAFromCUDA(edm::ProductRegistry& reg);
~PixelVertexSoAFromCUDA() override = default;

private:
void acquire(edm::Event const& iEvent,
edm::EventSetup const& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
edm::WaitingTaskWithArenaHolder waitingTaskHolder,
AsyncState& state) const override;
void produce(edm::Event& iEvent, edm::EventSetup const& iSetup, AsyncState& state) override;

edm::EDGetTokenT<cms::cuda::Product<ZVertexHeterogeneous>> tokenCUDA_;
edm::EDPutTokenT<ZVertexHeterogeneous> tokenSOA_;

cms::cuda::host::unique_ptr<ZVertexSoA> m_soa;
};

PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(edm::ProductRegistry& reg)
@@ -33,17 +34,18 @@ PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(edm::ProductRegistry& reg)

void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent,
edm::EventSetup const& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
edm::WaitingTaskWithArenaHolder waitingTaskHolder,
AsyncState& state) const {
auto const& inputDataWrapped = iEvent.get(tokenCUDA_);
cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
auto const& inputData = ctx.get(inputDataWrapped);

m_soa = inputData.toHostAsync(ctx.stream());
state = inputData.toHostAsync(ctx.stream());
}

void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup, AsyncState& state) {
// No copies....
iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa)));
iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(state)));
}

DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA);
76 changes: 39 additions & 37 deletions src/cuda/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc
@@ -22,27 +22,28 @@
#include <string>
#include <vector>

class SiPixelRawToClusterCUDA : public edm::EDProducerExternalWork {
struct SiPixelRawToClusterCUDA_AsyncStateImpl {
cms::cuda::ContextState ctx;
pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo;
};

class SiPixelRawToClusterCUDA
: public edm::EDProducerExternalWork<std::unique_ptr<SiPixelRawToClusterCUDA_AsyncStateImpl>> {
public:
explicit SiPixelRawToClusterCUDA(edm::ProductRegistry& reg);
~SiPixelRawToClusterCUDA() override = default;

private:
void acquire(const edm::Event& iEvent,
const edm::EventSetup& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override;

cms::cuda::ContextState ctxState_;
edm::WaitingTaskWithArenaHolder waitingTaskHolder,
AsyncState& state) const override;
void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, AsyncState& state) override;

edm::EDGetTokenT<FEDRawDataCollection> rawGetToken_;
edm::EDPutTokenT<cms::cuda::Product<SiPixelDigisCUDA>> digiPutToken_;
const edm::EDGetTokenT<FEDRawDataCollection> rawGetToken_;
const edm::EDPutTokenT<cms::cuda::Product<SiPixelDigisCUDA>> digiPutToken_;
edm::EDPutTokenT<cms::cuda::Product<SiPixelDigiErrorsCUDA>> digiErrorPutToken_;
edm::EDPutTokenT<cms::cuda::Product<SiPixelClustersCUDA>> clusterPutToken_;

pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_;
std::unique_ptr<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender> wordFedAppender_;
PixelFormatterErrors errors_;
const edm::EDPutTokenT<cms::cuda::Product<SiPixelClustersCUDA>> clusterPutToken_;

const bool isRun2_;
const bool includeErrors_;
@@ -59,14 +60,14 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(edm::ProductRegistry& reg)
if (includeErrors_) {
digiErrorPutToken_ = reg.produces<cms::cuda::Product<SiPixelDigiErrorsCUDA>>();
}

wordFedAppender_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender>();
}

void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,
const edm::EventSetup& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_};
edm::WaitingTaskWithArenaHolder waitingTaskHolder,
AsyncState& state) const {
state = std::make_unique<SiPixelRawToClusterCUDA_AsyncStateImpl>();
cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), state->ctx};

auto const& hgpuMap = iSetup.get<SiPixelFedCablingMapGPUWrapper>();
if (hgpuMap.hasQuality() != useQuality_) {
@@ -85,7 +86,8 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,

const auto& buffers = iEvent.get(rawGetToken_);

errors_.clear();
PixelFormatterErrors errors;
pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender wordFedAppender;

// GPU specific: Data extraction for RawToDigi GPU
unsigned int wordCounterGPU = 0;
@@ -115,7 +117,7 @@

// check CRC bit
const uint64_t* trailer = reinterpret_cast<const uint64_t*>(rawData.data()) + (nWords - 1);
if (not errorcheck.checkCRC(errorsInEvent, fedId, trailer, errors_)) {
if (not errorcheck.checkCRC(errorsInEvent, fedId, trailer, errors)) {
continue;
}

@@ -125,7 +127,7 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,
bool moreHeaders = true;
while (moreHeaders) {
header++;
bool headerStatus = errorcheck.checkHeader(errorsInEvent, fedId, header, errors_);
bool headerStatus = errorcheck.checkHeader(errorsInEvent, fedId, header, errors);
moreHeaders = headerStatus;
}

@@ -134,41 +136,41 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent,
trailer++;
while (moreTrailers) {
trailer--;
bool trailerStatus = errorcheck.checkTrailer(errorsInEvent, fedId, nWords, trailer, errors_);
bool trailerStatus = errorcheck.checkTrailer(errorsInEvent, fedId, nWords, trailer, errors);
moreTrailers = trailerStatus;
}

const uint32_t* bw = (const uint32_t*)(header + 1);
const uint32_t* ew = (const uint32_t*)(trailer);

assert(0 == (ew - bw) % 2);
wordFedAppender_->initializeWordFed(fedId, wordCounterGPU, bw, (ew - bw));
wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew - bw));
wordCounterGPU += (ew - bw);

} // end of for loop

gpuAlgo_.makeClustersAsync(isRun2_,
gpuMap,
gpuModulesToUnpack,
gpuGains,
*wordFedAppender_,
std::move(errors_),
wordCounterGPU,
fedCounter,
useQuality_,
includeErrors_,
false, // debug
ctx.stream());
state->gpuAlgo.makeClustersAsync(isRun2_,
gpuMap,
gpuModulesToUnpack,
gpuGains,
wordFedAppender,
std::move(errors),
wordCounterGPU,
fedCounter,
useQuality_,
includeErrors_,
false, // debug
ctx.stream());
}

void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
cms::cuda::ScopedContextProduce ctx{ctxState_};
void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, AsyncState& state) {
cms::cuda::ScopedContextProduce ctx{state->ctx};

auto tmp = gpuAlgo_.getResults();
auto tmp = state->gpuAlgo.getResults();
ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first));
ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second));
if (includeErrors_) {
ctx.emplace(iEvent, digiErrorPutToken_, gpuAlgo_.getErrors());
ctx.emplace(iEvent, digiErrorPutToken_, state->gpuAlgo.getErrors());
}
}

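
Unlike the SoA converters, SiPixelRawToClusterCUDA bundles its per-event state (a cms::cuda::ContextState plus the SiPixelRawToClusterGPUKernel) into a struct that acquire() allocates with std::make_unique. The minimal sketch below shows that wrapping; it assumes (the commit does not say so) that the point is to keep the AsyncState itself cheap to default-construct, move and reset even when its contents are not, and HeavyAsyncState with the free acquire()/produce() functions are invented for the example.

// Sketch: hold heavy or non-movable per-event members behind a unique_ptr,
// as SiPixelRawToClusterCUDA does above. All names here are illustrative.
#include <iostream>
#include <memory>

struct HeavyAsyncState {
  HeavyAsyncState() = default;
  // Deleting the copy operations also suppresses the implicit moves, so this
  // type could not itself serve as a movable AsyncState without the wrapper.
  HeavyAsyncState(const HeavyAsyncState&) = delete;
  HeavyAsyncState& operator=(const HeavyAsyncState&) = delete;

  int context = 0;       // stand-in for cms::cuda::ContextState
  int gpuAlgoState = 0;  // stand-in for SiPixelRawToClusterGPUKernel
};

using AsyncState = std::unique_ptr<HeavyAsyncState>;

void acquire(AsyncState& state) {
  state = std::make_unique<HeavyAsyncState>();  // as in acquire() above
  state->context = 1;
  state->gpuAlgoState = 2;
}

void produce(AsyncState& state) {
  std::cout << state->context + state->gpuAlgoState << '\n';
  state.reset();  // release everything once the products are emplaced
}

int main() {
  AsyncState state;
  acquire(state);
  produce(state);
}
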
48 changes: 22 additions & 26 deletions src/cuda/plugin-SiPixelRawToDigi/SiPixelDigisSoAFromCUDA.cc
@@ -8,26 +8,28 @@
#include "CUDACore/ScopedContext.h"
#include "CUDACore/host_unique_ptr.h"

class SiPixelDigisSoAFromCUDA : public edm::EDProducerExternalWork {
struct SiPixelDigisSoAFromCUDA_AsyncState {
cms::cuda::host::unique_ptr<uint32_t[]> pdigi;
cms::cuda::host::unique_ptr<uint32_t[]> rawIdArr;
cms::cuda::host::unique_ptr<uint16_t[]> adc;
cms::cuda::host::unique_ptr<int32_t[]> clus;
size_t nDigis;
};

class SiPixelDigisSoAFromCUDA : public edm::EDProducerExternalWork<SiPixelDigisSoAFromCUDA_AsyncState> {
public:
explicit SiPixelDigisSoAFromCUDA(edm::ProductRegistry& reg);
~SiPixelDigisSoAFromCUDA() override = default;

private:
void acquire(const edm::Event& iEvent,
const edm::EventSetup& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override;

edm::EDGetTokenT<cms::cuda::Product<SiPixelDigisCUDA>> digiGetToken_;
edm::EDPutTokenT<SiPixelDigisSoA> digiPutToken_;
edm::WaitingTaskWithArenaHolder waitingTaskHolder,
AsyncState& state) const override;
void produce(edm::Event& iEvent, const edm::EventSetup& iSetup, AsyncState& state) override;

cms::cuda::host::unique_ptr<uint32_t[]> pdigi_;
cms::cuda::host::unique_ptr<uint32_t[]> rawIdArr_;
cms::cuda::host::unique_ptr<uint16_t[]> adc_;
cms::cuda::host::unique_ptr<int32_t[]> clus_;

size_t nDigis_;
const edm::EDGetTokenT<cms::cuda::Product<SiPixelDigisCUDA>> digiGetToken_;
const edm::EDPutTokenT<SiPixelDigisSoA> digiPutToken_;
};

SiPixelDigisSoAFromCUDA::SiPixelDigisSoAFromCUDA(edm::ProductRegistry& reg)
@@ -36,20 +38,18 @@ SiPixelDigisSoAFromCUDA::SiPixelDigisSoAFromCUDA(edm::ProductRegistry& reg)

void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent,
const edm::EventSetup& iSetup,
edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
edm::WaitingTaskWithArenaHolder waitingTaskHolder,
AsyncState& state) const {
// Do the transfer in a CUDA stream parallel to the computation CUDA stream
cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)};

const auto& gpuDigis = ctx.get(iEvent, digiGetToken_);

nDigis_ = gpuDigis.nDigis();
pdigi_ = gpuDigis.pdigiToHostAsync(ctx.stream());
rawIdArr_ = gpuDigis.rawIdArrToHostAsync(ctx.stream());
adc_ = gpuDigis.adcToHostAsync(ctx.stream());
clus_ = gpuDigis.clusToHostAsync(ctx.stream());
state.pdigi = gpuDigis.pdigiToHostAsync(ctx.stream()), state.rawIdArr = gpuDigis.rawIdArrToHostAsync(ctx.stream()),
state.adc = gpuDigis.adcToHostAsync(ctx.stream()), state.clus = gpuDigis.clusToHostAsync(ctx.stream()),
state.nDigis = gpuDigis.nDigis();
}

void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) {
void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup, AsyncState& state) {
// The following line copies the data from the pinned host memory to
// regular host memory. In principle that feels unnecessary (why not
// just use the pinned host memory?). There are a few arguments for
@@ -60,12 +60,8 @@ void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup&
// host memory to be allocated without a CUDA stream
// - What if a CPU algorithm would produce the same SoA? We can't
// use cudaMallocHost without a GPU...
iEvent.emplace(digiPutToken_, nDigis_, pdigi_.get(), rawIdArr_.get(), adc_.get(), clus_.get());

pdigi_.reset();
rawIdArr_.reset();
adc_.reset();
clus_.reset();
iEvent.emplace(
digiPutToken_, state.nDigis, state.pdigi.get(), state.rawIdArr.get(), state.adc.get(), state.clus.get());
}

// define as framework plugin
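
The long comment in SiPixelDigisSoAFromCUDA::produce explains why the digis are copied from pinned host memory into regular host memory when the SiPixelDigisSoA product is emplaced. The sketch below is a rough standalone illustration of that trade-off, with the pinned transfer buffers faked by ordinary unique_ptr arrays; DigisSoA and makeProduct are invented names, and the real buffers are stream-aware cms::cuda::host::unique_ptr allocations.

// Sketch only: copy from short-lived pinned transfer buffers into ordinary
// host containers, so the long-lived event product neither keeps scarce
// pinned memory alive nor depends on CUDA.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

struct DigisSoA {  // stand-in for SiPixelDigisSoA
  std::vector<std::uint32_t> pdigi;
  std::vector<std::uint16_t> adc;
};

DigisSoA makeProduct(std::size_t nDigis, const std::uint32_t* pinnedPdigi, const std::uint16_t* pinnedAdc) {
  // Mirrors iEvent.emplace(digiPutToken_, nDigis, ...): the product owns
  // ordinary memory, and the pinned buffers can be recycled right away.
  DigisSoA product;
  product.pdigi.assign(pinnedPdigi, pinnedPdigi + nDigis);
  product.adc.assign(pinnedAdc, pinnedAdc + nDigis);
  return product;
}

int main() {
  // Pretend these came from cudaMallocHost-backed buffers filled in acquire().
  constexpr std::size_t nDigis = 4;
  auto pdigi = std::make_unique<std::uint32_t[]>(nDigis);
  auto adc = std::make_unique<std::uint16_t[]>(nDigis);
  for (std::size_t i = 0; i < nDigis; ++i) {
    pdigi[i] = static_cast<std::uint32_t>(i);
    adc[i] = static_cast<std::uint16_t>(10 * i);
  }

  DigisSoA product = makeProduct(nDigis, pdigi.get(), adc.get());
  std::cout << product.pdigi.size() << " digis copied into the product\n";
}
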