From ee0dead92c27e3591e303176fd451032ea1fea1c Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 23 May 2018 14:18:55 +0200 Subject: [PATCH] Updates to HeterogeneousEDProducer (#50) - remove acquireCPU() as unnecessary - allow omitting device->CPU transfer function - add per-device-type configurability to heterogeneous modules - fix module label in fillDescriptions - move README.md inside HeterogeneousCore/Producer package --- .../CUDACore/interface/GPUCuda.h | 6 +++ HeterogeneousCore/CUDACore/src/GPUCuda.cc | 18 +++++-- HeterogeneousCore/{ => Producer}/README.md | 21 +++++--- .../interface/HeterogeneousEDProducer.h | 35 ++++++++++-- .../Producer/interface/HeterogeneousEvent.h | 2 +- .../Producer/src/HeterogeneousEDProducer.cc | 47 ++++++++++------ .../test/TestHeterogeneousEDProducerGPU.cc | 53 +++++++++---------- .../TestHeterogeneousEDProducerGPUMock.cc | 48 +++++++---------- .../Producer/test/testGPUMock_cfg.py | 3 ++ .../Producer/test/testGPU_cfg.py | 16 +++--- .../Product/interface/HeterogeneousProduct.h | 15 ++++++ 11 files changed, 168 insertions(+), 96 deletions(-) rename HeterogeneousCore/{ => Producer}/README.md (87%) diff --git a/HeterogeneousCore/CUDACore/interface/GPUCuda.h b/HeterogeneousCore/CUDACore/interface/GPUCuda.h index 1413348b3b446..6e02ef83b312c 100644 --- a/HeterogeneousCore/CUDACore/interface/GPUCuda.h +++ b/HeterogeneousCore/CUDACore/interface/GPUCuda.h @@ -3,6 +3,7 @@ #include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h" #include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "HeterogeneousCore/Producer/interface/DeviceWrapper.h" #include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h" @@ -16,12 +17,15 @@ namespace heterogeneous { public: using CallbackType = std::function; + explicit GPUCuda(const edm::ParameterSet& iConfig); virtual ~GPUCuda() noexcept(false); void call_beginStreamGPUCuda(edm::StreamID id); bool 
call_acquireGPUCuda(DeviceBitSet inputLocation, edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder); void call_produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup); + static void fillPSetDescription(edm::ParameterSetDescription& desc); + private: virtual void beginStreamGPUCuda(edm::StreamID id, cuda::stream_t<>& cudaStream) {}; virtual void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) = 0; @@ -29,6 +33,8 @@ namespace heterogeneous { std::unique_ptr> cudaStream_; int deviceId_ = -1; // device assigned to this edm::Stream + bool enabled_; + const bool forced_; }; DEFINE_DEVICE_WRAPPER(GPUCuda, HeterogeneousDevice::kGPUCuda); } diff --git a/HeterogeneousCore/CUDACore/src/GPUCuda.cc b/HeterogeneousCore/CUDACore/src/GPUCuda.cc index 983ae4039c28a..8686c8640a8b8 100644 --- a/HeterogeneousCore/CUDACore/src/GPUCuda.cc +++ b/HeterogeneousCore/CUDACore/src/GPUCuda.cc @@ -9,11 +9,21 @@ #include namespace heterogeneous { + GPUCuda::GPUCuda(const edm::ParameterSet& iConfig): + enabled_(iConfig.getUntrackedParameter("GPUCuda")), + forced_(iConfig.getUntrackedParameter("force") == "GPUCuda") + {} + GPUCuda::~GPUCuda() noexcept(false) {} + void GPUCuda::fillPSetDescription(edm::ParameterSetDescription& desc) { + desc.addUntracked("GPUCuda", true); + } + void GPUCuda::call_beginStreamGPUCuda(edm::StreamID id) { edm::Service cudaService; - if(!cudaService->enabled()) { + enabled_ = (enabled_ && cudaService->enabled()); + if(!enabled_) { return; } @@ -41,11 +51,13 @@ namespace heterogeneous { } bool GPUCuda::call_acquireGPUCuda(DeviceBitSet inputLocation, edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - edm::Service cudaService; - if(!cudaService->enabled()) { + if(!enabled_) { return false; } + // TODO: currently 'forced_ == true' is already assumed. 
When the + // scheduling logic evolves, add explicit treatment of forced_. + cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_); try { diff --git a/HeterogeneousCore/README.md b/HeterogeneousCore/Producer/README.md similarity index 87% rename from HeterogeneousCore/README.md rename to HeterogeneousCore/Producer/README.md index d3bdd413789d3..b66be0f445705 100644 --- a/HeterogeneousCore/README.md +++ b/HeterogeneousCore/Producer/README.md @@ -11,12 +11,12 @@ More details can be found from the sub-package specific README files (when they ## Sub-packages -* [`CUDACore`](CUDACore) CUDA-specific core components +* [`CUDACore`](../CUDACore) CUDA-specific core components - *TODO:* Do we actually need this separate from `CUDAServices`? Which one to keep? -* [`CUDAServices`](CUDAServices) Various edm::Services related to CUDA -* [`CUDAUtilities`](CUDAUtilities) Various utilities for CUDA kernel code -* [`Producer`](Producer) Core of the mini-framework for code organization: a base EDProducer class with algorithm scheduling to devices -* [`Product`](Product) Core of the mini-framework for data products +* [`CUDAServices`](../CUDAServices) Various edm::Services related to CUDA +* [`CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code +* [`Producer`](#heterogeneousedproducer) Core of the mini-framework for code organization: a base EDProducer class with algorithm scheduling to devices +* [`Product`](../Product) Core of the mini-framework for data products ## Design goals @@ -90,11 +90,14 @@ particular order). - Well-performing allocators are typically highly non-trivial to construct * Conditions data on GPU - Currently each module takes care of formatting, transferring, and updating the conditions data to GPU + - This is probably good-enough for the current prototyping phase, but what about longer term? + * How to deal with multiple devices, multiple edm::Streams, and multiple lumi sections in flight? 
+ * Do we need to make EventSetup aware of the devices? How much do the details depend on device type? * Add possibility to initiate the GPU->CPU transfer before the CPU product is needed - This would enable overlapping the GPU->CPU transfer while CPU is busy with other work, so the CPU product requestor would not have to wait -* Add configurability - - E.g. for preferred device order, force specific algorithms to specific device +* Improve configurability + - E.g. for preferred device order? * Add fault tolerance - E.g. in a case of a GPU running out of memory continue with CPU - Should be configurable @@ -114,3 +117,7 @@ particular order). * Explore the implementation of these features into the core CMSSW framework - E.g. HeterogeneousProduct would likely go to edm::Wrapper * Explore how to make core framework/TBB scheduling aware of heterogenous devices + +# HeterogeneousEDProducer + +To be written. \ No newline at end of file diff --git a/HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h b/HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h index a60b47b0f26cd..4f06571e8a70d 100644 --- a/HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h +++ b/HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h @@ -4,6 +4,8 @@ #include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h" #include "FWCore/Framework/interface/stream/EDProducer.h" #include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/Utilities/interface/Exception.h" #include "DataFormats/Common/interface/Handle.h" @@ -15,27 +17,30 @@ namespace heterogeneous { class CPU { public: + explicit CPU(const edm::ParameterSet& iConfig) {} virtual ~CPU() noexcept(false); + static void fillPSetDescription(edm::ParameterSetDescription desc) {} + void call_beginStreamCPU(edm::StreamID id) { beginStreamCPU(id); } bool 
call_acquireCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder); - void call_produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { - produceCPU(iEvent, iSetup); - } + void call_produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup); private: virtual void beginStreamCPU(edm::StreamID id) {}; - virtual void acquireCPU(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) = 0; virtual void produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) = 0; }; DEFINE_DEVICE_WRAPPER(CPU, HeterogeneousDevice::kCPU); class GPUMock { public: + explicit GPUMock(const edm::ParameterSet& iConfig); virtual ~GPUMock() noexcept(false); + static void fillPSetDescription(edm::ParameterSetDescription& desc); + void call_beginStreamGPUMock(edm::StreamID id) { beginStreamGPUMock(id); } @@ -48,6 +53,9 @@ namespace heterogeneous { virtual void beginStreamGPUMock(edm::StreamID id) {}; virtual void acquireGPUMock(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, std::function callback) = 0; virtual void produceGPUMock(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) = 0; + + const bool enabled_; + const bool forced_; }; DEFINE_DEVICE_WRAPPER(GPUMock, HeterogeneousDevice::kGPUMock); } @@ -132,6 +140,15 @@ namespace heterogeneous { template class HeterogeneousDevices: public Devices... { public: + explicit HeterogeneousDevices(const edm::ParameterSet& iConfig): Devices(iConfig)... {} + + static void fillPSetDescription(edm::ParameterSetDescription& desc) { + // The usual trick to expand the parameter pack for function call + using expander = int[]; + (void)expander {0, ((void)Devices::fillPSetDescription(desc), 1)... 
}; + desc.addUntracked("force", ""); + } + void call_beginStream(edm::StreamID id) { CallBeginStream::call(*this, id); } @@ -152,7 +169,9 @@ namespace heterogeneous { template class HeterogeneousEDProducer: public Devices, public edm::stream::EDProducer { public: - HeterogeneousEDProducer() {} + explicit HeterogeneousEDProducer(const edm::ParameterSet& iConfig): + Devices(iConfig.getUntrackedParameter("heterogeneousEnabled_")) + {} ~HeterogeneousEDProducer() = default; protected: @@ -161,6 +180,12 @@ class HeterogeneousEDProducer: public Devices, public edm::stream::EDProducer("heterogeneousEnabled_", nested); + } + private: void beginStream(edm::StreamID id) { Devices::call_beginStream(id); diff --git a/HeterogeneousCore/Producer/interface/HeterogeneousEvent.h b/HeterogeneousCore/Producer/interface/HeterogeneousEvent.h index 98595dbd6baa6..e06e0a8ffa86c 100644 --- a/HeterogeneousCore/Producer/interface/HeterogeneousEvent.h +++ b/HeterogeneousCore/Producer/interface/HeterogeneousEvent.h @@ -48,7 +48,7 @@ namespace edm { CASE(HeterogeneousDevice::kGPUMock); CASE(HeterogeneousDevice::kGPUCuda); default: - throw cms::Exception("LogicError") << "edm::HeterogeneousEvent::getByToken(): no case statement for device " << static_cast(location().deviceType()); + throw cms::Exception("LogicError") << "edm::HeterogeneousEvent::getByToken(): no case statement for device " << static_cast(location().deviceType()) << ". 
If you are calling getByToken() from produceX() where X != CPU, please move the call to acquireX()."; } #undef CASE } diff --git a/HeterogeneousCore/Producer/src/HeterogeneousEDProducer.cc b/HeterogeneousCore/Producer/src/HeterogeneousEDProducer.cc index c7f7f152bd75d..74bce0c8e89a9 100644 --- a/HeterogeneousCore/Producer/src/HeterogeneousEDProducer.cc +++ b/HeterogeneousCore/Producer/src/HeterogeneousEDProducer.cc @@ -11,30 +11,47 @@ namespace heterogeneous { CPU::~CPU() noexcept(false) {} bool CPU::call_acquireCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - std::exception_ptr exc; - try { - iEvent.setInputLocation(HeterogeneousDeviceId(HeterogeneousDevice::kCPU)); - acquireCPU(iEvent, iSetup); - iEvent.locationSetter()(HeterogeneousDeviceId(HeterogeneousDevice::kCPU)); - } catch(...) { - exc = std::current_exception(); - } - waitingTaskHolder.doneWaiting(exc); + // There is no need for acquire in CPU, everything can be done in produceCPU(). + iEvent.locationSetter()(HeterogeneousDeviceId(HeterogeneousDevice::kCPU)); + waitingTaskHolder.doneWaiting(nullptr); return true; } + void CPU::call_produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { + // For CPU we set the heterogeneous input location for produce, because there is no acquire + // For other devices this probably doesn't make sense, because the device code is supposed to be launched from acquire. 
+ iEvent.setInputLocation(HeterogeneousDeviceId(HeterogeneousDevice::kCPU, 0)); + produceCPU(iEvent, iSetup); + } + + GPUMock::GPUMock(const edm::ParameterSet& iConfig): + enabled_(iConfig.getUntrackedParameter("GPUMock")), + forced_(iConfig.getUntrackedParameter("force") == "GPUMock") + {} + GPUMock::~GPUMock() noexcept(false) {} + void GPUMock::fillPSetDescription(edm::ParameterSetDescription& desc) { + desc.addUntracked("GPUMock", true); + } + bool GPUMock::call_acquireGPUMock(DeviceBitSet inputLocation, edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - // Decide randomly whether to run on GPU or CPU to simulate scheduler decisions - std::random_device r; - std::mt19937 gen(r()); - auto dist1 = std::uniform_int_distribution<>(0, 3); // simulate GPU (in)availability - if(dist1(gen) == 0) { - edm::LogPrint("HeterogeneousEDProducer") << "Mock GPU is not available (by chance)"; + if(!enabled_) { + edm::LogPrint("HeterogeneousEDProducer") << "Mock GPU is not available for this module (disabled in configuration)"; return false; } + if(!forced_) { + // Decide randomly whether to run on GPU or CPU to simulate scheduler decisions + std::random_device r; + std::mt19937 gen(r()); + auto dist1 = std::uniform_int_distribution<>(0, 3); // simulate GPU (in)availability + if(dist1(gen) == 0) { + edm::LogPrint("HeterogeneousEDProducer") << "Mock GPU is not available (by chance)"; + return false; + } + } + try { iEvent.setInputLocation(HeterogeneousDeviceId(HeterogeneousDevice::kGPUMock, 0)); acquireGPUMock(iEvent, iSetup, diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPU.cc b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPU.cc index 83afa1e8ca373..b92fa76614e85 100644 --- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPU.cc +++ b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPU.cc @@ -43,7 +43,6 @@ class TestHeterogeneousEDProducerGPU: public 
HeterogeneousEDProducer& cudaStream) override; - void acquireCPU(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) override; void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; void produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) override; @@ -55,12 +54,10 @@ class TestHeterogeneousEDProducerGPU: public HeterogeneousEDProducer gpuAlgo_; TestHeterogeneousEDProducerGPUTask::ResultType gpuOutput_; - - // output - unsigned int output_; }; TestHeterogeneousEDProducerGPU::TestHeterogeneousEDProducerGPU(edm::ParameterSet const& iConfig): + HeterogeneousEDProducer(iConfig), label_(iConfig.getParameter("@module_label")) { auto srcTag = iConfig.getParameter("src"); @@ -74,7 +71,8 @@ TestHeterogeneousEDProducerGPU::TestHeterogeneousEDProducerGPU(edm::ParameterSet void TestHeterogeneousEDProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("src", edm::InputTag()); - descriptions.add("testHeterogeneousEDProducerGPU2", desc); + HeterogeneousEDProducer::fillPSetDescription(desc); + descriptions.add("testHeterogeneousEDProducerGPU", desc); } void TestHeterogeneousEDProducerGPU::beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) { @@ -87,28 +85,6 @@ void TestHeterogeneousEDProducerGPU::beginStreamGPUCuda(edm::StreamID streamId, edm::LogPrint("TestHeterogeneousEDProducerGPU") << " " << label_ << " TestHeterogeneousEDProducerGPU::beginStreamGPUCuda end stream " << streamId << " device " << cs->getCurrentDevice(); } -void TestHeterogeneousEDProducerGPU::acquireCPU(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { - edm::LogPrint("TestHeterogeneousEDProducerGPU") << " " << label_ << " TestHeterogeneousEDProducerGPU::acquireCPU begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - - unsigned int input = 0; - 
if(!srcToken_.isUninitialized()) { - edm::Handle hin; - iEvent.getByToken(srcToken_, hin); - input = *hin; - } - - std::random_device r; - std::mt19937 gen(r()); - auto dist = std::uniform_real_distribution<>(1.0, 3.0); - auto dur = dist(gen); - edm::LogPrint("TestHeterogeneousEDProducerGPU") << " Task (CPU) for event " << iEvent.id().event() << " in stream " << iEvent.streamID() << " will take " << dur << " seconds"; - std::this_thread::sleep_for(std::chrono::seconds(1)*dur); - - output_ = input + iEvent.streamID()*100 + iEvent.id().event(); - - edm::LogPrint("TestHeterogeneousEDProducerGPU") << " " << label_ << " TestHeterogeneousEDProducerGPU::acquireCPU end event " << iEvent.id().event() << " stream " << iEvent.streamID(); -} - void TestHeterogeneousEDProducerGPU::acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) { edm::Service cs; edm::LogPrint("TestHeterogeneousEDProducerGPU") << " " << label_ << " TestHeterogeneousEDProducerGPU::acquireGPUCuda begin event " << iEvent.id().event() << " stream " << iEvent.streamID() << " device " << cs->getCurrentDevice(); @@ -131,9 +107,25 @@ void TestHeterogeneousEDProducerGPU::acquireGPUCuda(const edm::HeterogeneousEven void TestHeterogeneousEDProducerGPU::produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { edm::LogPrint("TestHeterogeneousEDProducerGPU") << label_ << " TestHeterogeneousEDProducerGPU::produceCPU begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - iEvent.put(std::make_unique(output_)); + unsigned int input = 0; + if(!srcToken_.isUninitialized()) { + edm::Handle hin; + iEvent.getByToken(srcToken_, hin); + input = *hin; + } - edm::LogPrint("TestHeterogeneousEDProducerGPU") << label_ << " TestHeterogeneousEDProducerGPU::produceCPU end event " << iEvent.id().event() << " stream " << iEvent.streamID() << " result " << output_; + std::random_device r; + std::mt19937 gen(r()); + auto dist = 
std::uniform_real_distribution<>(1.0, 3.0); + auto dur = dist(gen); + edm::LogPrint("TestHeterogeneousEDProducerGPU") << " Task (CPU) for event " << iEvent.id().event() << " in stream " << iEvent.streamID() << " will take " << dur << " seconds"; + std::this_thread::sleep_for(std::chrono::seconds(1)*dur); + + const unsigned int output = input + iEvent.streamID()*100 + iEvent.id().event(); + + iEvent.put(std::make_unique(output)); + + edm::LogPrint("TestHeterogeneousEDProducerGPU") << label_ << " TestHeterogeneousEDProducerGPU::produceCPU end event " << iEvent.id().event() << " stream " << iEvent.streamID() << " result " << output; } void TestHeterogeneousEDProducerGPU::produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) { @@ -155,6 +147,9 @@ void TestHeterogeneousEDProducerGPU::produceGPUCuda(edm::HeterogeneousEvent& iEv dst = TestHeterogeneousEDProducerGPUTask::getResult(src, cudaStream); }); + // If, for any reason, you want to disable the automatic GPU->CPU transfer, pass heterogeneous::DisableTransfer{} instead of the function, i.e. 
+ //iEvent.put(std::make_unique(gpuOutput_.first.get(), gpuOutput_.second.get()), heterogeneous::DisableTransfer{}); + edm::LogPrint("TestHeterogeneousEDProducerGPU") << label_ << " TestHeterogeneousEDProducerGPU::produceGPUCuda end event " << iEvent.id().event() << " stream " << iEvent.streamID() << " device " << cs->getCurrentDevice(); } diff --git a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUMock.cc b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUMock.cc index 697810ed36dab..716b05ecc165c 100644 --- a/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUMock.cc +++ b/HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPUMock.cc @@ -43,10 +43,6 @@ class TestHeterogeneousEDProducerGPUMock: public HeterogeneousEDProducer callback); void produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) override; @@ -54,6 +50,7 @@ class TestHeterogeneousEDProducerGPUMock: public HeterogeneousEDProducer("@module_label")) { auto srcTag = iConfig.getParameter("src"); @@ -68,31 +65,10 @@ TestHeterogeneousEDProducerGPUMock::TestHeterogeneousEDProducerGPUMock(edm::Para void TestHeterogeneousEDProducerGPUMock::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("src", edm::InputTag()); + HeterogeneousEDProducer::fillPSetDescription(desc); descriptions.add("testHeterogeneousEDProducerGPUMock", desc); } -void TestHeterogeneousEDProducerGPUMock::acquireCPU(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { - edm::LogPrint("TestHeterogeneousEDProducerGPUMock") << label_ << " TestHeterogeneousEDProducerGPUMock::acquireCPU event " << iEvent.id().event() << " stream " << iEvent.streamID(); - - unsigned int input = 0; - if(!srcToken_.isUninitialized()) { - edm::Handle hin; - iEvent.getByToken(srcToken_, hin); - input = *hin; - } - - std::random_device r; - std::mt19937 gen(r()); - auto dist = std::uniform_real_distribution<>(1.0, 3.0); - auto 
dur = dist(gen); - edm::LogPrint("TestHeterogeneousEDProducerGPUMock") << " Task (CPU) for event " << iEvent.id().event() << " in stream " << iEvent.streamID() << " will take " << dur << " seconds"; - std::this_thread::sleep_for(std::chrono::seconds(1)*dur); - - output_ = input+ iEvent.streamID()*100 + iEvent.id().event(); - - edm::LogPrint("TestHeterogeneousEDProducerGPUMock") << " " << label_ << " TestHeterogeneousEDProducerGPUMock::acquireCPU end event " << iEvent.id().event() << " stream " << iEvent.streamID(); -} - void TestHeterogeneousEDProducerGPUMock::acquireGPUMock(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, std::function callback) { edm::LogPrint("TestHeterogeneousEDProducerGPUMock") << " " << label_ << " TestHeterogeneousEDProducerGPUMock::acquireGPUMock begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); @@ -129,10 +105,26 @@ void TestHeterogeneousEDProducerGPUMock::acquireGPUMock(const edm::Heterogeneous void TestHeterogeneousEDProducerGPUMock::produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { edm::LogPrint("TestHeterogeneousEDProducerGPUMock") << label_ << " TestHeterogeneousEDProducerGPUMock::produceCPU begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - iEvent.put(std::make_unique(output_)); + unsigned int input = 0; + if(!srcToken_.isUninitialized()) { + edm::Handle hin; + iEvent.getByToken(srcToken_, hin); + input = *hin; + } + + std::random_device r; + std::mt19937 gen(r()); + auto dist = std::uniform_real_distribution<>(1.0, 3.0); + auto dur = dist(gen); + edm::LogPrint("TestHeterogeneousEDProducerGPUMock") << " Task (CPU) for event " << iEvent.id().event() << " in stream " << iEvent.streamID() << " will take " << dur << " seconds"; + std::this_thread::sleep_for(std::chrono::seconds(1)*dur); + + const unsigned int output = input+ iEvent.streamID()*100 + iEvent.id().event(); + + iEvent.put(std::make_unique(output)); iEvent.put(std::make_unique(1)); - 
edm::LogPrint("TestHeterogeneousEDProducerGPUMock") << label_ << " TestHeterogeneousEDProducerGPUMock::produceCPU end event " << iEvent.id().event() << " stream " << iEvent.streamID() << " result " << output_; + edm::LogPrint("TestHeterogeneousEDProducerGPUMock") << label_ << " TestHeterogeneousEDProducerGPUMock::produceCPU end event " << iEvent.id().event() << " stream " << iEvent.streamID() << " result " << output; } void TestHeterogeneousEDProducerGPUMock::produceGPUMock(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { diff --git a/HeterogeneousCore/Producer/test/testGPUMock_cfg.py b/HeterogeneousCore/Producer/test/testGPUMock_cfg.py index c1fe7f60db33a..f6d4125f792df 100644 --- a/HeterogeneousCore/Producer/test/testGPUMock_cfg.py +++ b/HeterogeneousCore/Producer/test/testGPUMock_cfg.py @@ -28,3 +28,6 @@ ) process.p = cms.Path(process.prod1+process.prod2)#+process.eca) #process.p.associate(process.t) + +# Example of forcing one module to run on a specific device via configuration +#process.prod1.heterogeneousEnabled_ = cms.untracked.PSet(force = cms.untracked.string("GPUMock")) diff --git a/HeterogeneousCore/Producer/test/testGPU_cfg.py b/HeterogeneousCore/Producer/test/testGPU_cfg.py index 9b916a685b88c..73042fc799d05 100644 --- a/HeterogeneousCore/Producer/test/testGPU_cfg.py +++ b/HeterogeneousCore/Producer/test/testGPU_cfg.py @@ -12,17 +12,14 @@ numberOfStreams = cms.untracked.uint32(0) ) +from HeterogeneousCore.Producer.testHeterogeneousEDProducerGPU_cfi import testHeterogeneousEDProducerGPU as prod #process.Tracer = cms.Service("Tracer") process.CUDAService = cms.Service("CUDAService") -process.prod1 = cms.EDProducer('TestHeterogeneousEDProducerGPU') -process.prod2 = cms.EDProducer('TestHeterogeneousEDProducerGPU', - src = cms.InputTag("prod1"), -) -process.prod3 = cms.EDProducer('TestHeterogeneousEDProducerGPU', - src = cms.InputTag("prod1"), -) -process.prod4 = cms.EDProducer('TestHeterogeneousEDProducerGPU') +process.prod1 = 
prod.clone() +process.prod2 = prod.clone(src = "prod1") +process.prod3 = prod.clone(src = "prod1") +process.prod4 = prod.clone() process.ana = cms.EDAnalyzer("TestHeterogeneousEDProducerAnalyzer", src = cms.VInputTag("prod2", "prod3", "prod4") ) @@ -30,3 +27,6 @@ process.t = cms.Task(process.prod1, process.prod2, process.prod3, process.prod4) process.p = cms.Path(process.ana) process.p.associate(process.t) + +# Example of disabling CUDA device type for one module via configuration +#process.prod4.heterogeneousEnabled_.GPUCuda = False diff --git a/HeterogeneousCore/Product/interface/HeterogeneousProduct.h b/HeterogeneousCore/Product/interface/HeterogeneousProduct.h index c5317ba2acbc3..9e0b5a2b0588c 100644 --- a/HeterogeneousCore/Product/interface/HeterogeneousProduct.h +++ b/HeterogeneousCore/Product/interface/HeterogeneousProduct.h @@ -55,6 +55,9 @@ namespace heterogeneous { DEFINE_DEVICE_PRODUCT(GPUCuda); #undef DEFINE_DEVICE_PRODUCT + // Tag class to allow disabling automatic device->CPU transfers + struct DisableTransfer {}; + /** * Below are various helpers * @@ -262,6 +265,18 @@ class HeterogeneousProductImpl: public HeterogeneousProductBase { location_[index].set(location.deviceId()); } + /** + * Generic constructor for device data, but without the transfer function(!). + */ + template + HeterogeneousProductImpl(heterogeneous::HeterogeneousDeviceTag, D&& data, HeterogeneousDeviceId location, heterogeneous::DisableTransfer) { + // TODO: try to avoid code duplication between the other device data + constexpr const auto index = static_cast(Device); + assert(location.deviceType() == Device); + std::get(products_) = std::move(data); + location_[index].set(location.deviceId()); + } + template const auto& getProduct() const { constexpr const auto index = static_cast(device);