Improve GPU VRAM cleanup during rebalancing
borzunov committed Dec 12, 2022
1 parent 1d31dd4 commit b251516
Showing 1 changed file with 22 additions and 6 deletions.
src/petals/server/server.py
@@ -250,10 +250,20 @@ def run(self):
     def _clean_memory_and_fds(self):
         del self.module_container
         gc.collect()  # In particular, this closes unused file descriptors
 
         cur_proc = psutil.Process()
-        num_fds = [proc.num_fds() for proc in [cur_proc] + psutil.Process().children(recursive=True)]
-        logger.info(f"Cleanup complete, {sum(num_fds)} open file descriptors left")
+        num_fds = [proc.num_fds() for proc in [cur_proc] + cur_proc.children(recursive=True)]
+        logger.info(f"Cleaning up, left {sum(num_fds)} open file descriptors")
+
+        if self.device.type == "cuda":
+            torch.cuda.empty_cache()
+
+            allocated_vram = torch.cuda.memory_allocated(self.device)
+            reserved_vram = torch.cuda.memory_reserved(self.device)
+            gib = 1024**3
+            logger.info(
+                f"Cleaning up, left {allocated_vram / gib:.1f} GiB allocated memory, "
+                f"{reserved_vram / gib:.1f} GiB reserved memory"
+            )
 
     def _choose_blocks(self) -> List[int]:
         if self.strict_block_indices is not None:
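For context, the following standalone sketch (not part of this commit; the 1 GiB tensor is an arbitrary illustration) shows the two measurements logged above: counting open file descriptors across the process tree with psutil, and the difference between allocated and reserved CUDA memory that torch.cuda.empty_cache() acts on.

import psutil
import torch

# Count open file descriptors for this process and all of its children,
# as _clean_memory_and_fds() does after gc.collect(). num_fds() is Unix-only.
cur_proc = psutil.Process()
num_fds = [proc.num_fds() for proc in [cur_proc] + cur_proc.children(recursive=True)]
print(f"{sum(num_fds)} open file descriptors")

# PyTorch's caching allocator keeps freed blocks reserved for reuse.
# empty_cache() returns unused reserved blocks to the driver, but cannot
# free memory that is still referenced by live tensors.
if torch.cuda.is_available():
    device = torch.device("cuda")
    gib = 1024**3
    x = torch.empty(gib, dtype=torch.uint8, device=device)  # ~1 GiB tensor
    del x  # its block is now cached: reserved, but no longer allocated
    print(f"allocated: {torch.cuda.memory_allocated(device) / gib:.1f} GiB")
    print(f"reserved:  {torch.cuda.memory_reserved(device) / gib:.1f} GiB")
    torch.cuda.empty_cache()  # hand cached blocks back to the CUDA driver
    print(f"reserved:  {torch.cuda.memory_reserved(device) / gib:.1f} GiB")

This is why the new log line reports both numbers: after empty_cache(), reserved memory should drop toward the allocated figure, and anything still allocated points at tensors that survived the cleanup.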
@@ -471,9 +481,8 @@ def ready(self) -> mp.synchronize.Event:
         return self.runtime.ready  # mp.Event that is true if self is ready to process batches
 
     def is_healthy(self) -> bool:
-        return (
-            all(handler.is_alive() for handler in self.conn_handlers) and
-            all(pool.is_alive() for pool in self.runtime.pools)
+        return all(handler.is_alive() for handler in self.conn_handlers) and all(
+            pool.is_alive() for pool in self.runtime.pools
         )
 
     def shutdown(self):
@@ -512,6 +521,13 @@ def shutdown(self):
         logger.debug(f"Shutting down runtime")
         self.runtime.shutdown()
 
+        logger.debug("Cleaning up memory")
+        # Necessary since links to `backend.module` may be left somewhere, so it is not garbage-collected properly
+        dummy = torch.tensor([])
+        for backend in self.module_backends.values():
+            for p in backend.module.parameters():
+                p.data = dummy
+
         logger.info("Module container shut down successfully")
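The cleanup block added above works around a subtlety: del self.module_container frees the weights only if nothing else still references backend.module. Re-pointing each parameter's .data at an empty tensor drops the last reference to the large weight storages even if the module object itself lives on. A minimal standalone sketch of the same idea (not part of the commit; the Linear layer size is arbitrary):

import gc

import torch
from torch import nn

module = nn.Linear(4096, 4096)  # ~64 MiB of fp32 weights
stray_reference = module  # simulates a leftover link to `backend.module`

dummy = torch.tensor([])
for p in module.parameters():
    p.data = dummy  # the original weight storage is now unreferenced

del module
gc.collect()  # `stray_reference` is still alive, but its weights are freed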

