
Commit

f cleanup gpudfg
antoniupop committed Sep 15, 2023
1 parent bacd4ea commit 28d9e3b
Showing 1 changed file with 24 additions and 34 deletions.
58 changes: 24 additions & 34 deletions compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
@@ -27,8 +27,6 @@
#include "keyswitch.h"
#include "linear_algebra.h"

#define GPU_COMPUTE_FACTOR 24

using MemRef2 = concretelang::clientlib::MemRefDescriptor<2>;
using RuntimeContext = mlir::concretelang::RuntimeContext;

@@ -38,21 +36,16 @@ namespace gpu_dfg {
namespace {

// When not using all accelerators on the machine, we distribute work
// by assigning the default accelerator for each SDFG to next
// by assigning the default accelerator for each SDFG to next_device
// round-robin.
static std::atomic<size_t> next_device = {0};

// Resources available (or set as requested by user through
// environment variables) on the machine. Defaults to using all
// available.
static size_t num_devices = 0; // Set SDFG_NUM_GPUS to configure
static size_t num_cores = 1; // Set OMP_NUM_THREADS to configure (as
// this is linked to loop parallelism)

// By default we distribute batched ops across all available GPUs
// (or value of environment variable SDFG_NUM_GPUS whichever is
// lower). Set SDFG_DISTRIBUTE_BATCH_OPS=OFF to inhibit this.
static bool dont_distribute_batched_ops = false;
// available resources.
static size_t num_devices = 0; // Set SDFG_NUM_GPUS to configure
static size_t num_cores = 1; // Set SDFG_NUM_THREADS to configure
static size_t device_compute_factor = 10; // Set SDFG_DEVICE_TO_CORE_RATIO
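// --- Editor's illustrative sketch, not part of this commit's diff ---
// A minimal example of the round-robin default-device assignment the
// comment above describes: each new SDFG takes the next device index
// from the shared atomic counter. The names pick_default_device,
// next_device_example and num_devices_example are invented for this
// sketch only; the real code uses next_device and num_devices.
#include <atomic>
#include <cstddef>

static std::atomic<size_t> next_device_example = {0};
static size_t num_devices_example = 4; // e.g. SDFG_NUM_GPUS=4

static size_t pick_default_device() {
  // fetch_add hands out 0, 1, 2, 3, 0, 1, ... across successive SDFGs.
  return next_device_example.fetch_add(1) % num_devices_example;
}
// --- end of sketch ---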

// Get the byte size of a rank 2 MemRef
static inline size_t memref_get_data_size(MemRef2 &m) {
@@ -279,8 +272,8 @@ struct Dependence {
return;
}
size_t chunk_size =
num_samples / (num_chunks + num_gpu_chunks * GPU_COMPUTE_FACTOR);
size_t gpu_chunk_size = chunk_size * GPU_COMPUTE_FACTOR;
num_samples / (num_chunks + num_gpu_chunks * device_compute_factor);
size_t gpu_chunk_size = chunk_size * device_compute_factor;
chunk_size = (num_samples - gpu_chunk_size * num_gpu_chunks) / num_chunks;
size_t chunk_remainder =
(num_samples - gpu_chunk_size * num_gpu_chunks) % num_chunks;
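// --- Editor's illustrative sketch, not part of this commit's diff ---
// The split above weights each GPU chunk device_compute_factor times
// heavier than a CPU chunk. Worked through with made-up numbers
// (1000 samples, 8 CPU chunks, 2 GPU chunks, factor 10):
#include <cstddef>
#include <cstdio>

int main() {
  size_t num_samples = 1000, num_chunks = 8, num_gpu_chunks = 2;
  size_t device_compute_factor = 10; // default; override via SDFG_DEVICE_TO_CORE_RATIO
  size_t chunk_size =
      num_samples / (num_chunks + num_gpu_chunks * device_compute_factor); // 1000/28 = 35
  size_t gpu_chunk_size = chunk_size * device_compute_factor;              // 350 per GPU chunk
  chunk_size = (num_samples - gpu_chunk_size * num_gpu_chunks) / num_chunks;  // 300/8 = 37
  size_t chunk_remainder =
      (num_samples - gpu_chunk_size * num_gpu_chunks) % num_chunks;           // 4 left over
  std::printf("gpu=%zu cpu=%zu remainder=%zu\n", gpu_chunk_size, chunk_size,
              chunk_remainder);
  return 0;
}
// --- end of sketch ---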
@@ -525,8 +518,7 @@ struct Stream {
std::list<Process *> queue;
extract_producing_graph(queue);

// TODO : replace with on-cpu execution, see if can be parallelised
// Do this for subgraphs that don't use BSes
// Determine if this subgraph bootstraps
bool is_batched_subgraph = false;
size_t subgraph_bootstraps = 0;
for (auto p : queue) {
@@ -535,7 +527,8 @@
(p->fun == memref_bootstrap_lwe_u64_process) ? 1 : 0;
}
// If this subgraph is not batched, then use this DFG's allocated
// GPU to offload to.
// GPU to offload to. If this does not bootstrap, just execute on
// the host.
if (!is_batched_subgraph) {
for (auto p : queue) {
schedule_kernel(
@@ -608,13 +601,13 @@ struct Stream {
int32_t num_devices_to_use = 0;
// If the subgraph does not have sufficient computational
// intensity (which we approximate by whether it bootstraps), then
// we assume (FIXME- confirm with profiling) that it is not
// we assume (TODO: confirm with profiling) that it is not
// beneficial to offload to GPU.
if (subgraph_bootstraps) {
// Determine maximum GPU granularity
size_t gpu_free_mem;
size_t gpu_total_mem;
// FIXME: this could be improved
// TODO: this could be improved
// Force deallocation with a synchronization point
for (size_t g = 0; g < num_devices; ++g)
cudaStreamSynchronize(*(cudaStream_t *)dfg->get_gpu_stream(g));
@@ -623,20 +616,21 @@
// TODO - for now assume each device on the system has roughly the same
// available memory.
size_t available_mem = gpu_free_mem;
// Further assume (FIXME) that kernel execution requires twice as much
// Further assume (TODO) that kernel execution requires twice as much
// memory per sample
size_t max_samples_per_chunk =
(available_mem - const_mem_per_sample) / (mem_per_sample * 2);

if (num_samples < num_cores + GPU_COMPUTE_FACTOR * num_devices) {
if (num_samples < num_cores + device_compute_factor * num_devices) {
num_devices_to_use = 0;
num_chunks = std::min(num_cores, num_samples);
} else {
num_devices_to_use = num_devices;
size_t compute_resources = num_cores + num_devices * GPU_COMPUTE_FACTOR;
size_t compute_resources =
num_cores + num_devices * device_compute_factor;
size_t gpu_chunk_size =
std::ceil((double)num_samples / compute_resources) *
GPU_COMPUTE_FACTOR;
device_compute_factor;
size_t scale_factor =
std::ceil((double)gpu_chunk_size / max_samples_per_chunk);
num_chunks = num_cores * scale_factor;
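// --- Editor's illustrative sketch, not part of this commit's diff ---
// The offload heuristic above evaluated with made-up numbers: 8 cores,
// 2 GPUs, factor 10, 10000 samples, and an (assumed) memory cap of
// 2000 samples per GPU chunk. Only the lines visible in this hunk are
// reproduced; the rest of the scheduling logic is omitted.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>

int main() {
  size_t num_cores = 8, num_devices = 2, device_compute_factor = 10;
  size_t num_samples = 10000, max_samples_per_chunk = 2000;
  size_t num_devices_to_use, num_chunks;
  if (num_samples < num_cores + device_compute_factor * num_devices) {
    num_devices_to_use = 0; // too little work: keep everything on the host
    num_chunks = std::min(num_cores, num_samples);
  } else {
    num_devices_to_use = num_devices;
    size_t compute_resources =
        num_cores + num_devices * device_compute_factor;              // 28
    size_t gpu_chunk_size =
        std::ceil((double)num_samples / compute_resources) *
        device_compute_factor;                                        // 3580
    size_t scale_factor =
        std::ceil((double)gpu_chunk_size / max_samples_per_chunk);    // 2
    num_chunks = num_cores * scale_factor;                            // 16 CPU chunks
  }
  std::printf("devices=%zu chunks=%zu\n", num_devices_to_use, num_chunks);
  return 0;
}
// --- end of sketch ---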
@@ -673,7 +667,7 @@ struct Stream {
o->dep->chunks.resize(num_chunks + num_gpu_chunks, nullptr);
}

// Execute
// Execute graph
std::list<std::thread> workers;
std::list<std::thread> gpu_schedulers;
int32_t dev = 0;
@@ -721,10 +715,7 @@ struct Stream {
gs.join();
gpu_schedulers.clear();

for (size_t g = 0; g < num_devices; ++g)
cudaStreamSynchronize(*(cudaStream_t *)dfg->get_gpu_stream(g));

// Build output
// Build output out of the separate chunks processed
for (auto o : outputs) {
assert(o->batched_stream && o->ct_stream &&
"Only operations with ciphertext output supported.");
@@ -1037,7 +1028,7 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
};

// If this is a mapped TLU
// FIXME: for now we do not provide more advanced ways of selecting
// TODO: for now we do not provide more advanced ways of selecting
bool mapped = (p->input_streams[1]->dep->host_data.sizes[0] > 1);
std::vector<size_t> lut_indexes;
if (mapped) {
@@ -1548,13 +1539,12 @@ void *stream_emulator_init() {
else
num_devices = requested_gpus;
}
env = getenv("SDFG_DISTRIBUTE_BATCH_OPS");
if (env != nullptr && (!strncmp(env, "off", 3) || !strncmp(env, "OFF", 3) ||
!strncmp(env, "0", 1))) {
dont_distribute_batched_ops = true;
}
assert(num_devices > 0 && "No GPUs available on system.");

env = getenv("SDFG_DEVICE_TO_CORE_RATIO");
if (env != nullptr)
device_compute_factor = strtoul(env, NULL, 10);
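// --- Editor's illustrative sketch, not part of this commit's diff ---
// A defensively-checked variant of the SDFG_DEVICE_TO_CORE_RATIO parsing
// above, rejecting empty, non-numeric or zero values instead of silently
// taking them. Setting the variable to 24 reproduces the ratio of the
// removed GPU_COMPUTE_FACTOR macro; unset, the new default of 10 applies.
// The helper name read_device_to_core_ratio is invented for this sketch.
#include <cstddef>
#include <cstdlib>

static size_t read_device_to_core_ratio(size_t fallback) {
  const char *env = std::getenv("SDFG_DEVICE_TO_CORE_RATIO");
  if (env == nullptr)
    return fallback;
  char *end = nullptr;
  unsigned long v = std::strtoul(env, &end, 10);
  if (end == env || v == 0) // not a number, or zero: keep the default
    return fallback;
  return (size_t)v;
}
// --- end of sketch ---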

hwloc_topology_t topology;
hwloc_topology_init(&topology);
hwloc_topology_set_all_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_NONE);
