
Commit

f cleanup gpudfg
antoniupop committed Sep 15, 2023
1 parent bacd4ea commit 28d9e3b
Showing 1 changed file with 24 additions and 34 deletions.
58 changes: 24 additions & 34 deletions compilers/concrete-compiler/compiler/lib/Runtime/GPUDFG.cpp
@@ -27,8 +27,6 @@
#include "keyswitch.h"
#include "linear_algebra.h"

#define GPU_COMPUTE_FACTOR 24

using MemRef2 = concretelang::clientlib::MemRefDescriptor<2>;
using RuntimeContext = mlir::concretelang::RuntimeContext;

@@ -38,21 +36,16 @@ namespace gpu_dfg {
namespace {

// When not using all accelerators on the machine, we distribute work
// by assigning the default accelerator for each SDFG to next
// by assigning the default accelerator for each SDFG to next_device
// round-robin.
static std::atomic<size_t> next_device = {0};

// Resources available (or set as requested by user through
// environment variables) on the machine. Defaults to using all
// available.
static size_t num_devices = 0; // Set SDFG_NUM_GPUS to configure
static size_t num_cores = 1; // Set OMP_NUM_THREADS to configure (as
// this is linked to loop parallelism)

// By default we distribute batched ops across all available GPUs
// (or value of environment variable SDFG_NUM_GPUS whichever is
// lower). Set SDFG_DISTRIBUTE_BATCH_OPS=OFF to inhibit this.
static bool dont_distribute_batched_ops = false;
// available resources.
static size_t num_devices = 0; // Set SDFG_NUM_GPUS to configure
static size_t num_cores = 1; // Set SDFG_NUM_THREADS to configure
static size_t device_compute_factor = 10; // Set SDFG_DEVICE_TO_CORE_RATIO
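// --- Editor's illustrative sketch, not part of this commit's diff ---
// A minimal example of the round-robin default-device assignment the
// comment above describes: each new SDFG takes the next device index
// from the shared atomic counter. The names pick_default_device,
// next_device_example and num_devices_example are invented for this
// sketch only; the real code uses next_device and num_devices.
#include <atomic>
#include <cstddef>

static std::atomic<size_t> next_device_example = {0};
static size_t num_devices_example = 4; // e.g. SDFG_NUM_GPUS=4

static size_t pick_default_device() {
  // fetch_add hands out 0, 1, 2, 3, 0, 1, ... across successive SDFGs.
  return next_device_example.fetch_add(1) % num_devices_example;
}
// --- end of sketch ---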

// Get the byte size of a rank 2 MemRef
static inline size_t memref_get_data_size(MemRef2 &m) {
@@ -279,8 +272,8 @@ struct Dependence {
return;
}
size_t chunk_size =
num_samples / (num_chunks + num_gpu_chunks * GPU_COMPUTE_FACTOR);
size_t gpu_chunk_size = chunk_size * GPU_COMPUTE_FACTOR;
num_samples / (num_chunks + num_gpu_chunks * device_compute_factor);
size_t gpu_chunk_size = chunk_size * device_compute_factor;
chunk_size = (num_samples - gpu_chunk_size * num_gpu_chunks) / num_chunks;
size_t chunk_remainder =
(num_samples - gpu_chunk_size * num_gpu_chunks) % num_chunks;
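// --- Editor's illustrative sketch, not part of this commit's diff ---
// The split above weights each GPU chunk device_compute_factor times
// heavier than a CPU chunk. Worked through with made-up numbers
// (1000 samples, 8 CPU chunks, 2 GPU chunks, factor 10):
#include <cstddef>
#include <cstdio>

int main() {
  size_t num_samples = 1000, num_chunks = 8, num_gpu_chunks = 2;
  size_t device_compute_factor = 10; // default; override via SDFG_DEVICE_TO_CORE_RATIO
  size_t chunk_size =
      num_samples / (num_chunks + num_gpu_chunks * device_compute_factor); // 1000/28 = 35
  size_t gpu_chunk_size = chunk_size * device_compute_factor;              // 350 per GPU chunk
  chunk_size = (num_samples - gpu_chunk_size * num_gpu_chunks) / num_chunks;  // 300/8 = 37
  size_t chunk_remainder =
      (num_samples - gpu_chunk_size * num_gpu_chunks) % num_chunks;           // 4 left over
  std::printf("gpu=%zu cpu=%zu remainder=%zu\n", gpu_chunk_size, chunk_size,
              chunk_remainder);
  return 0;
}
// --- end of sketch ---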
@@ -525,8 +518,7 @@ struct Stream {
std::list<Process *> queue;
extract_producing_graph(queue);

// TODO : replace with on-cpu execution, see if can be parallelised
// Do this for subgraphs that don't use BSes
// Determine if this subgraph bootstraps
bool is_batched_subgraph = false;
size_t subgraph_bootstraps = 0;
for (auto p : queue) {
@@ -535,7 +527,8 @@
(p->fun == memref_bootstrap_lwe_u64_process) ? 1 : 0;
}
// If this subgraph is not batched, then use this DFG's allocated
// GPU to offload to.
// GPU to offload to. If this does not bootstrap, just execute on
// the host.
if (!is_batched_subgraph) {
for (auto p : queue) {
schedule_kernel(
@@ -608,13 +601,13 @@ struct Stream {
int32_t num_devices_to_use = 0;
// If the subgraph does not have sufficient computational
// intensity (which we approximate by whether it bootstraps), then
// we assume (FIXME- confirm with profiling) that it is not
// we assume (TODO: confirm with profiling) that it is not
// beneficial to offload to GPU.
if (subgraph_bootstraps) {
// Determine maximum GPU granularity
size_t gpu_free_mem;
size_t gpu_total_mem;
// FIXME: this could be improved
// TODO: this could be improved
// Force deallocation with a synchronization point
for (size_t g = 0; g < num_devices; ++g)
cudaStreamSynchronize(*(cudaStream_t *)dfg->get_gpu_stream(g));
@@ -623,20 +616,21 @@
// TODO - for now assume each device on the system has roughly the same
// available memory.
size_t available_mem = gpu_free_mem;
// Further assume (FIXME) that kernel execution requires twice as much
// Further assume (TODO) that kernel execution requires twice as much
// memory per sample
size_t max_samples_per_chunk =
(available_mem - const_mem_per_sample) / (mem_per_sample * 2);

if (num_samples < num_cores + GPU_COMPUTE_FACTOR * num_devices) {
if (num_samples < num_cores + device_compute_factor * num_devices) {
num_devices_to_use = 0;
num_chunks = std::min(num_cores, num_samples);
} else {
num_devices_to_use = num_devices;
size_t compute_resources = num_cores + num_devices * GPU_COMPUTE_FACTOR;
size_t compute_resources =
num_cores + num_devices * device_compute_factor;
size_t gpu_chunk_size =
std::ceil((double)num_samples / compute_resources) *
GPU_COMPUTE_FACTOR;
device_compute_factor;
size_t scale_factor =
std::ceil((double)gpu_chunk_size / max_samples_per_chunk);
num_chunks = num_cores * scale_factor;
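// --- Editor's illustrative sketch, not part of this commit's diff ---
// The offload heuristic above evaluated with made-up numbers: 8 cores,
// 2 GPUs, factor 10, 10000 samples, and an (assumed) memory cap of
// 2000 samples per GPU chunk. Only the lines visible in this hunk are
// reproduced; the rest of the scheduling logic is omitted.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>

int main() {
  size_t num_cores = 8, num_devices = 2, device_compute_factor = 10;
  size_t num_samples = 10000, max_samples_per_chunk = 2000;
  size_t num_devices_to_use, num_chunks;
  if (num_samples < num_cores + device_compute_factor * num_devices) {
    num_devices_to_use = 0; // too little work: keep everything on the host
    num_chunks = std::min(num_cores, num_samples);
  } else {
    num_devices_to_use = num_devices;
    size_t compute_resources =
        num_cores + num_devices * device_compute_factor;              // 28
    size_t gpu_chunk_size =
        std::ceil((double)num_samples / compute_resources) *
        device_compute_factor;                                        // 3580
    size_t scale_factor =
        std::ceil((double)gpu_chunk_size / max_samples_per_chunk);    // 2
    num_chunks = num_cores * scale_factor;                            // 16 CPU chunks
  }
  std::printf("devices=%zu chunks=%zu\n", num_devices_to_use, num_chunks);
  return 0;
}
// --- end of sketch ---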
@@ -673,7 +667,7 @@ struct Stream {
o->dep->chunks.resize(num_chunks + num_gpu_chunks, nullptr);
}

// Execute
// Execute graph
std::list<std::thread> workers;
std::list<std::thread> gpu_schedulers;
int32_t dev = 0;
@@ -721,10 +715,7 @@ struct Stream {
gs.join();
gpu_schedulers.clear();

for (size_t g = 0; g < num_devices; ++g)
cudaStreamSynchronize(*(cudaStream_t *)dfg->get_gpu_stream(g));

// Build output
// Build output out of the separate chunks processed
for (auto o : outputs) {
assert(o->batched_stream && o->ct_stream &&
"Only operations with ciphertext output supported.");
@@ -1037,7 +1028,7 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
};

// If this is a mapped TLU
// FIXME: for now we do not provide more advanced ways of selecting
// TODO: for now we do not provide more advanced ways of selecting
bool mapped = (p->input_streams[1]->dep->host_data.sizes[0] > 1);
std::vector<size_t> lut_indexes;
if (mapped) {
@@ -1548,13 +1539,12 @@ void *stream_emulator_init() {
else
num_devices = requested_gpus;
}
env = getenv("SDFG_DISTRIBUTE_BATCH_OPS");
if (env != nullptr && (!strncmp(env, "off", 3) || !strncmp(env, "OFF", 3) ||
!strncmp(env, "0", 1))) {
dont_distribute_batched_ops = true;
}
assert(num_devices > 0 && "No GPUs available on system.");

env = getenv("SDFG_DEVICE_TO_CORE_RATIO");
if (env != nullptr)
device_compute_factor = strtoul(env, NULL, 10);
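// --- Editor's illustrative sketch, not part of this commit's diff ---
// A defensively-checked variant of the SDFG_DEVICE_TO_CORE_RATIO parsing
// above, rejecting empty, non-numeric or zero values instead of silently
// taking them. Setting the variable to 24 reproduces the ratio of the
// removed GPU_COMPUTE_FACTOR macro; unset, the new default of 10 applies.
// The helper name read_device_to_core_ratio is invented for this sketch.
#include <cstddef>
#include <cstdlib>

static size_t read_device_to_core_ratio(size_t fallback) {
  const char *env = std::getenv("SDFG_DEVICE_TO_CORE_RATIO");
  if (env == nullptr)
    return fallback;
  char *end = nullptr;
  unsigned long v = std::strtoul(env, &end, 10);
  if (end == env || v == 0) // not a number, or zero: keep the default
    return fallback;
  return (size_t)v;
}
// --- end of sketch ---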

hwloc_topology_t topology;
hwloc_topology_init(&topology);
hwloc_topology_set_all_types_filter(topology, HWLOC_TYPE_FILTER_KEEP_NONE);
