Skip to content

Commit

Permalink
Updates to HeterogeneousEDProducer (cms-sw#50)
Browse files Browse the repository at this point in the history
  - remove acquireCPU() as unnecessary
  - allow omitting device->CPU transfer function
  - add per-device-type configurability to heterogeneous modules
  - fix module label in fillDescriptions
  - move README.md inside HeterogeneousCore/Producer package
  • Loading branch information
makortel authored and fwyzard committed May 23, 2018
1 parent 6c80eaf commit ee0dead
Show file tree
Hide file tree
Showing 11 changed files with 168 additions and 96 deletions.
6 changes: 6 additions & 0 deletions HeterogeneousCore/CUDACore/interface/GPUCuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h"
#include "FWCore/Framework/interface/Frameworkfwd.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"

#include "HeterogeneousCore/Producer/interface/DeviceWrapper.h"
#include "HeterogeneousCore/Producer/interface/HeterogeneousEvent.h"
Expand All @@ -16,19 +17,24 @@ namespace heterogeneous {
public:
using CallbackType = std::function<void(cuda::device::id_t, cuda::stream::id_t, cuda::status_t)>;

explicit GPUCuda(const edm::ParameterSet& iConfig);
virtual ~GPUCuda() noexcept(false);

void call_beginStreamGPUCuda(edm::StreamID id);
bool call_acquireGPUCuda(DeviceBitSet inputLocation, edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder);
void call_produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup);

static void fillPSetDescription(edm::ParameterSetDescription& desc);

private:
virtual void beginStreamGPUCuda(edm::StreamID id, cuda::stream_t<>& cudaStream) {};
virtual void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) = 0;
virtual void produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) = 0;

std::unique_ptr<cuda::stream_t<>> cudaStream_;
int deviceId_ = -1; // device assigned to this edm::Stream
bool enabled_;
const bool forced_;
};
DEFINE_DEVICE_WRAPPER(GPUCuda, HeterogeneousDevice::kGPUCuda);
}
Expand Down
18 changes: 15 additions & 3 deletions HeterogeneousCore/CUDACore/src/GPUCuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,21 @@
#include <exception>

namespace heterogeneous {
GPUCuda::GPUCuda(const edm::ParameterSet& iConfig):
enabled_(iConfig.getUntrackedParameter<bool>("GPUCuda")),
forced_(iConfig.getUntrackedParameter<std::string>("force") == "GPUCuda")
{}

GPUCuda::~GPUCuda() noexcept(false) {}

void GPUCuda::fillPSetDescription(edm::ParameterSetDescription& desc) {
desc.addUntracked<bool>("GPUCuda", true);
}

void GPUCuda::call_beginStreamGPUCuda(edm::StreamID id) {
edm::Service<CUDAService> cudaService;
if(!cudaService->enabled()) {
enabled_ = (enabled_ && cudaService->enabled());
if(!enabled_) {
return;
}

Expand Down Expand Up @@ -41,11 +51,13 @@ namespace heterogeneous {
}

bool GPUCuda::call_acquireGPUCuda(DeviceBitSet inputLocation, edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
edm::Service<CUDAService> cudaService;
if(!cudaService->enabled()) {
if(!enabled_) {
return false;
}

// TODO: currently 'forced_ == true' is already assumed. When the
// scheduling logic evolves, add explicit treatment of forced_.

cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_);

try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ More details can be found from the sub-package specific README files (when they

## Sub-packages

* [`CUDACore`](CUDACore) CUDA-specific core components
* [`CUDACore`](../CUDACore) CUDA-specific core components
- *TODO:* Do we actually need this separate from `CUDAServices`? Which one to keep?
* [`CUDAServices`](CUDAServices) Various edm::Services related to CUDA
* [`CUDAUtilities`](CUDAUtilities) Various utilities for CUDA kernel code
* [`Producer`](Producer) Core of the mini-framework for code organization: a base EDProducer class with algorithm scheduling to devices
* [`Product`](Product) Core of the mini-framework for data products
* [`CUDAServices`](../CUDAServices) Various edm::Services related to CUDA
* [`CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code
* [`Producer`](#heterogeneousedproducer) Core of the mini-framework for code organization: a base EDProducer class with algorithm scheduling to devices
* [`Product`](../Product) Core of the mini-framework for data products

## Design goals

Expand Down Expand Up @@ -90,11 +90,14 @@ particular order).
- Well-performing allocators are typically highly non-trivial to construct
* Conditions data on GPU
- Currently each module takes care of formatting, transferring, and updating the conditions data to GPU
- This is probably good-enough for the current prototyping phase, but what about longer term?
* How to deal with multiple devices, multiple edm::Streams, and multiple lumi sections in flight?
* Do we need to make EventSetup aware of the devices? How much do the details depend on device type?
* Add possibility to initiate the GPU->CPU transfer before the CPU product is needed
- This would enable overlapping the GPU->CPU transfer while CPU is busy
with other work, so the CPU product requestor would not have to wait
* Add configurability
- E.g. for preferred device order, force specific algorithms to specific device
* Improve configurability
- E.g. for preferred device order?
* Add fault tolerance
- E.g. in a case of a GPU running out of memory continue with CPU
- Should be configurable
Expand All @@ -114,3 +117,7 @@ particular order).
* Explore the implementation of these features into the core CMSSW framework
- E.g. HeterogeneousProduct would likely go to edm::Wrapper
* Explore how to make core framework/TBB scheduling aware of heterogenous devices

# HeterogeneousEDProducer

To be written.
35 changes: 30 additions & 5 deletions HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h"
#include "FWCore/Framework/interface/stream/EDProducer.h"
#include "FWCore/Framework/interface/Frameworkfwd.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/Utilities/interface/Exception.h"

#include "DataFormats/Common/interface/Handle.h"
Expand All @@ -15,27 +17,30 @@
namespace heterogeneous {
class CPU {
public:
explicit CPU(const edm::ParameterSet& iConfig) {}
virtual ~CPU() noexcept(false);

static void fillPSetDescription(edm::ParameterSetDescription desc) {}

void call_beginStreamCPU(edm::StreamID id) {
beginStreamCPU(id);
}
bool call_acquireCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder);
void call_produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) {
produceCPU(iEvent, iSetup);
}
void call_produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup);

private:
virtual void beginStreamCPU(edm::StreamID id) {};
virtual void acquireCPU(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) = 0;
virtual void produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) = 0;
};
DEFINE_DEVICE_WRAPPER(CPU, HeterogeneousDevice::kCPU);

class GPUMock {
public:
explicit GPUMock(const edm::ParameterSet& iConfig);
virtual ~GPUMock() noexcept(false);

static void fillPSetDescription(edm::ParameterSetDescription& desc);

void call_beginStreamGPUMock(edm::StreamID id) {
beginStreamGPUMock(id);
}
Expand All @@ -48,6 +53,9 @@ namespace heterogeneous {
virtual void beginStreamGPUMock(edm::StreamID id) {};
virtual void acquireGPUMock(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, std::function<void()> callback) = 0;
virtual void produceGPUMock(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) = 0;

const bool enabled_;
const bool forced_;
};
DEFINE_DEVICE_WRAPPER(GPUMock, HeterogeneousDevice::kGPUMock);
}
Expand Down Expand Up @@ -132,6 +140,15 @@ namespace heterogeneous {
template <typename ...Devices>
class HeterogeneousDevices: public Devices... {
public:
explicit HeterogeneousDevices(const edm::ParameterSet& iConfig): Devices(iConfig)... {}

static void fillPSetDescription(edm::ParameterSetDescription& desc) {
// The usual trick to expand the parameter pack for function call
using expander = int[];
(void)expander {0, ((void)Devices::fillPSetDescription(desc), 1)... };
desc.addUntracked<std::string>("force", "");
}

void call_beginStream(edm::StreamID id) {
CallBeginStream<HeterogeneousDevices, Devices...>::call(*this, id);
}
Expand All @@ -152,7 +169,9 @@ namespace heterogeneous {
template <typename Devices, typename ...Capabilities>
class HeterogeneousEDProducer: public Devices, public edm::stream::EDProducer<edm::ExternalWork, Capabilities...> {
public:
HeterogeneousEDProducer() {}
explicit HeterogeneousEDProducer(const edm::ParameterSet& iConfig):
Devices(iConfig.getUntrackedParameter<edm::ParameterSet>("heterogeneousEnabled_"))
{}
~HeterogeneousEDProducer() = default;

protected:
Expand All @@ -161,6 +180,12 @@ class HeterogeneousEDProducer: public Devices, public edm::stream::EDProducer<ed
return tokens_.back();
}

static void fillPSetDescription(edm::ParameterSetDescription& desc) {
edm::ParameterSetDescription nested;
Devices::fillPSetDescription(nested);
desc.addUntracked<edm::ParameterSetDescription>("heterogeneousEnabled_", nested);
}

private:
void beginStream(edm::StreamID id) {
Devices::call_beginStream(id);
Expand Down
2 changes: 1 addition & 1 deletion HeterogeneousCore/Producer/interface/HeterogeneousEvent.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ namespace edm {
CASE(HeterogeneousDevice::kGPUMock);
CASE(HeterogeneousDevice::kGPUCuda);
default:
throw cms::Exception("LogicError") << "edm::HeterogeneousEvent::getByToken(): no case statement for device " << static_cast<unsigned int>(location().deviceType());
throw cms::Exception("LogicError") << "edm::HeterogeneousEvent::getByToken(): no case statement for device " << static_cast<unsigned int>(location().deviceType()) << ". If you are calling getByToken() from produceX() where X != CPU, please move the call to acquireX().";
}
#undef CASE
}
Expand Down
47 changes: 32 additions & 15 deletions HeterogeneousCore/Producer/src/HeterogeneousEDProducer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,30 +11,47 @@ namespace heterogeneous {
CPU::~CPU() noexcept(false) {}

bool CPU::call_acquireCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
std::exception_ptr exc;
try {
iEvent.setInputLocation(HeterogeneousDeviceId(HeterogeneousDevice::kCPU));
acquireCPU(iEvent, iSetup);
iEvent.locationSetter()(HeterogeneousDeviceId(HeterogeneousDevice::kCPU));
} catch(...) {
exc = std::current_exception();
}
waitingTaskHolder.doneWaiting(exc);
// There is no need for acquire in CPU, everything can be done in produceCPU().
iEvent.locationSetter()(HeterogeneousDeviceId(HeterogeneousDevice::kCPU));
waitingTaskHolder.doneWaiting(nullptr);
return true;
}

void CPU::call_produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) {
// For CPU we set the heterogeneous input location for produce, because there is no acquire
// For other devices this probably doesn't make sense, because the device code is supposed to be launched from acquire.
iEvent.setInputLocation(HeterogeneousDeviceId(HeterogeneousDevice::kCPU, 0));
produceCPU(iEvent, iSetup);
}

GPUMock::GPUMock(const edm::ParameterSet& iConfig):
enabled_(iConfig.getUntrackedParameter<bool>("GPUMock")),
forced_(iConfig.getUntrackedParameter<std::string>("force") == "GPUMock")
{}

GPUMock::~GPUMock() noexcept(false) {}

void GPUMock::fillPSetDescription(edm::ParameterSetDescription& desc) {
desc.addUntracked<bool>("GPUMock", true);
}

bool GPUMock::call_acquireGPUMock(DeviceBitSet inputLocation, edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
// Decide randomly whether to run on GPU or CPU to simulate scheduler decisions
std::random_device r;
std::mt19937 gen(r());
auto dist1 = std::uniform_int_distribution<>(0, 3); // simulate GPU (in)availability
if(dist1(gen) == 0) {
edm::LogPrint("HeterogeneousEDProducer") << "Mock GPU is not available (by chance)";
if(!enabled_) {
edm::LogPrint("HeterogeneousEDProducer") << "Mock GPU is not available for this module (disabled in configuration)";
return false;
}

if(!forced_) {
// Decide randomly whether to run on GPU or CPU to simulate scheduler decisions
std::random_device r;
std::mt19937 gen(r());
auto dist1 = std::uniform_int_distribution<>(0, 3); // simulate GPU (in)availability
if(dist1(gen) == 0) {
edm::LogPrint("HeterogeneousEDProducer") << "Mock GPU is not available (by chance)";
return false;
}
}

try {
iEvent.setInputLocation(HeterogeneousDeviceId(HeterogeneousDevice::kGPUMock, 0));
acquireGPUMock(iEvent, iSetup,
Expand Down
53 changes: 24 additions & 29 deletions HeterogeneousCore/Producer/test/TestHeterogeneousEDProducerGPU.cc
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ class TestHeterogeneousEDProducerGPU: public HeterogeneousEDProducer<heterogeneo

void beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) override;

void acquireCPU(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) override;
void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override;

void produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) override;
Expand All @@ -55,12 +54,10 @@ class TestHeterogeneousEDProducerGPU: public HeterogeneousEDProducer<heterogeneo
// GPU stuff
std::unique_ptr<TestHeterogeneousEDProducerGPUTask> gpuAlgo_;
TestHeterogeneousEDProducerGPUTask::ResultType gpuOutput_;

// output
unsigned int output_;
};

TestHeterogeneousEDProducerGPU::TestHeterogeneousEDProducerGPU(edm::ParameterSet const& iConfig):
HeterogeneousEDProducer(iConfig),
label_(iConfig.getParameter<std::string>("@module_label"))
{
auto srcTag = iConfig.getParameter<edm::InputTag>("src");
Expand All @@ -74,7 +71,8 @@ TestHeterogeneousEDProducerGPU::TestHeterogeneousEDProducerGPU(edm::ParameterSet
void TestHeterogeneousEDProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
edm::ParameterSetDescription desc;
desc.add<edm::InputTag>("src", edm::InputTag());
descriptions.add("testHeterogeneousEDProducerGPU2", desc);
HeterogeneousEDProducer::fillPSetDescription(desc);
descriptions.add("testHeterogeneousEDProducerGPU", desc);
}

void TestHeterogeneousEDProducerGPU::beginStreamGPUCuda(edm::StreamID streamId, cuda::stream_t<>& cudaStream) {
Expand All @@ -87,28 +85,6 @@ void TestHeterogeneousEDProducerGPU::beginStreamGPUCuda(edm::StreamID streamId,
edm::LogPrint("TestHeterogeneousEDProducerGPU") << " " << label_ << " TestHeterogeneousEDProducerGPU::beginStreamGPUCuda end stream " << streamId << " device " << cs->getCurrentDevice();
}

void TestHeterogeneousEDProducerGPU::acquireCPU(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) {
edm::LogPrint("TestHeterogeneousEDProducerGPU") << " " << label_ << " TestHeterogeneousEDProducerGPU::acquireCPU begin event " << iEvent.id().event() << " stream " << iEvent.streamID();

unsigned int input = 0;
if(!srcToken_.isUninitialized()) {
edm::Handle<unsigned int> hin;
iEvent.getByToken<OutputType>(srcToken_, hin);
input = *hin;
}

std::random_device r;
std::mt19937 gen(r());
auto dist = std::uniform_real_distribution<>(1.0, 3.0);
auto dur = dist(gen);
edm::LogPrint("TestHeterogeneousEDProducerGPU") << " Task (CPU) for event " << iEvent.id().event() << " in stream " << iEvent.streamID() << " will take " << dur << " seconds";
std::this_thread::sleep_for(std::chrono::seconds(1)*dur);

output_ = input + iEvent.streamID()*100 + iEvent.id().event();

edm::LogPrint("TestHeterogeneousEDProducerGPU") << " " << label_ << " TestHeterogeneousEDProducerGPU::acquireCPU end event " << iEvent.id().event() << " stream " << iEvent.streamID();
}

void TestHeterogeneousEDProducerGPU::acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) {
edm::Service<CUDAService> cs;
edm::LogPrint("TestHeterogeneousEDProducerGPU") << " " << label_ << " TestHeterogeneousEDProducerGPU::acquireGPUCuda begin event " << iEvent.id().event() << " stream " << iEvent.streamID() << " device " << cs->getCurrentDevice();
Expand All @@ -131,9 +107,25 @@ void TestHeterogeneousEDProducerGPU::acquireGPUCuda(const edm::HeterogeneousEven
void TestHeterogeneousEDProducerGPU::produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) {
edm::LogPrint("TestHeterogeneousEDProducerGPU") << label_ << " TestHeterogeneousEDProducerGPU::produceCPU begin event " << iEvent.id().event() << " stream " << iEvent.streamID();

iEvent.put<OutputType>(std::make_unique<unsigned int>(output_));
unsigned int input = 0;
if(!srcToken_.isUninitialized()) {
edm::Handle<unsigned int> hin;
iEvent.getByToken<OutputType>(srcToken_, hin);
input = *hin;
}

edm::LogPrint("TestHeterogeneousEDProducerGPU") << label_ << " TestHeterogeneousEDProducerGPU::produceCPU end event " << iEvent.id().event() << " stream " << iEvent.streamID() << " result " << output_;
std::random_device r;
std::mt19937 gen(r());
auto dist = std::uniform_real_distribution<>(1.0, 3.0);
auto dur = dist(gen);
edm::LogPrint("TestHeterogeneousEDProducerGPU") << " Task (CPU) for event " << iEvent.id().event() << " in stream " << iEvent.streamID() << " will take " << dur << " seconds";
std::this_thread::sleep_for(std::chrono::seconds(1)*dur);

const unsigned int output = input + iEvent.streamID()*100 + iEvent.id().event();

iEvent.put<OutputType>(std::make_unique<unsigned int>(output));

edm::LogPrint("TestHeterogeneousEDProducerGPU") << label_ << " TestHeterogeneousEDProducerGPU::produceCPU end event " << iEvent.id().event() << " stream " << iEvent.streamID() << " result " << output;
}

void TestHeterogeneousEDProducerGPU::produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) {
Expand All @@ -155,6 +147,9 @@ void TestHeterogeneousEDProducerGPU::produceGPUCuda(edm::HeterogeneousEvent& iEv
dst = TestHeterogeneousEDProducerGPUTask::getResult(src, cudaStream);
});

// If, for any reason, you want to disable the automatic GPU->CPU transfer, pass heterogeneous::DisableTransfer{} insteads of the function, i.e.
//iEvent.put<OutputType>(std::make_unique<TestHeterogeneousEDProducerGPUTask::ResultTypeRaw>(gpuOutput_.first.get(), gpuOutput_.second.get()), heterogeneous::DisableTransfer{});

edm::LogPrint("TestHeterogeneousEDProducerGPU") << label_ << " TestHeterogeneousEDProducerGPU::produceGPUCuda end event " << iEvent.id().event() << " stream " << iEvent.streamID() << " device " << cs->getCurrentDevice();
}

Expand Down
Loading

0 comments on commit ee0dead

Please sign in to comment.