fall back to cudaMallocManaged for optimizer states if we're out of memory

ngc92 · ngc92 · commit 0d52d2a3d797 · 2024-08-16T01:42:30.000+03:00
diff --git a/llmc/cuda_common.h b/llmc/cuda_common.h
@@ -49,13 +49,13 @@ constexpr std::bool_constant<true> False;
 // Error checking
 
 // CUDA error checking
-inline void cudaCheck(cudaError_t error, const char *file, int line) {
+inline void cudaCheck_(cudaError_t error, const char *file, int line) {  // on error: print the CUDA error string with file:line and exit; invoked through the cudaCheck macro
   if (error != cudaSuccess) {
     printf("[CUDA ERROR] at file %s:%d:\n%s\n", file, line, cudaGetErrorString(error));
     exit(EXIT_FAILURE);
   }
 };
-#define cudaCheck(err) (cudaCheck(err, __FILE__, __LINE__))
+#define cudaCheck(err) (cudaCheck_(err, __FILE__, __LINE__))
 
 // like cudaFree, but checks for errors _and_ resets the pointer.
 template<class T>
diff --git a/llmc/cuda_utils.cuh b/llmc/cuda_utils.cuh
@@ -205,6 +205,29 @@ void global_sum_deterministic(float* result, const Float* values, int count, cud
     cudaCheck(cudaGetLastError());
 }
 
+// ----------------------------------------------------------------------------
+// memory management
+
+// Allocate `bytes` bytes into `*out`, preferably on the device (cudaMalloc). If the free
+// device memory reported by cudaMemGetInfo cannot hold the request with 1% slack, warn on
+// stderr and fall back to cudaMallocManaged. Allocation failures are reported at `file`:`line`.
+inline void cudaMallocConditionallyManaged(void** out, size_t bytes, const char *file, int line) {
+    size_t free, total;
+    cudaCheck(cudaMemGetInfo(&free, &total));
+    // device allocation only if the request fits in 99% of free memory (overflow-free form)
+    if(bytes <= free - free / 100) {
+        cudaCheck_(cudaMalloc(out, bytes), file, line);
+    } else {
+        // not enough room: a managed allocation is slower, but at least it won't crash.
+        fprintf(stderr, "[WARN] Not enough space to allocate %zu bytes on device.\n"
+                        "      Falling back to managed allocation.\n      Speed may be negatively affected.\n",
+                        bytes);
+        cudaCheck_(cudaMallocManaged(out, bytes), file, line);
+    }
+}
+
+// call-site wrapper: casts `out` to void** and forwards the caller's __FILE__/__LINE__ for error reporting
+#define cudaMallocConditionallyManaged(out, bytes) (cudaMallocConditionallyManaged((void**)(out), (bytes), __FILE__, __LINE__))
+
 // ----------------------------------------------------------------------------
 // Random Number Generation used in Stochastic Rounding
 
diff --git a/train_gpt2.cu b/train_gpt2.cu
@@ -393,13 +393,13 @@ void gpt2_allocate_state(GPT2 *model, int B, int T) {
     printf0("allocating %zu MiB for AdamW optimizer state v\n", (shard_num_parameters * sizeof(float)) >> 20);
     assert(model->m_memory == nullptr);
     assert(model->v_memory == nullptr);
-    cudaCheck(cudaMalloc((void**)&model->m_memory, shard_num_parameters * sizeof(float)));
-    cudaCheck(cudaMalloc((void**)&model->v_memory, shard_num_parameters * sizeof(float)));
+    cudaMallocConditionallyManaged((void**)&model->m_memory, shard_num_parameters * sizeof(float));
+    cudaMallocConditionallyManaged((void**)&model->v_memory, shard_num_parameters * sizeof(float));
 
     if (model->use_master_weights == 1) {
         assert(model->master_weights == nullptr);
         printf0("allocating %zu MiB for master copy of params\n", (shard_num_parameters * sizeof(float)) >> 20);
-        cudaCheck(cudaMalloc((void**) &model->master_weights, shard_num_parameters * sizeof(float)));
+        cudaMallocConditionallyManaged((void**) &model->master_weights, shard_num_parameters * sizeof(float));
     }
 
     size_t free, total;