release gpu vram after layer.fwd (ModelCloud#616)
Co-authored-by: LRL-ModelCloud <lrl@modelcloud.ai>
LRL-ModelCloud authored Nov 19, 2024
1 parent ee4ede5 commit 416e47f
Showing 1 changed file with 4 additions and 0 deletions.
4 changes: 4 additions & 0 deletions gptqmodel/models/base.py
@@ -541,6 +541,8 @@ def tmp(_, inp, out):
                     additional_layer_inputs[k] = nested_move_to(v, cur_layer_device)
                 with torch.no_grad():
                     layer(*layer_input, **additional_layer_inputs)
+
+                torch.cuda.empty_cache()
             for h in handles:
                 h.remove()
 
@@ -615,6 +617,8 @@ def tmp(_, inp, out):
                 )
                 layer_outputs.append([layer_output])
 
+                torch.cuda.empty_cache()
+
             layers[i] = move_to(layer, CPU if force_layer_back_to_cpu else cur_layer_device)
             del layer
             del gptq
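
The change itself is small: during per-layer quantization, each layer's forward pass over the calibration batches leaves cached allocations on the GPU, and calling torch.cuda.empty_cache() after the forward (and again after the layer outputs are collected) returns that cached-but-unused VRAM to the driver before the next layer is processed. Below is a minimal, self-contained sketch of the same pattern; the layer stack, activations, and sizes are hypothetical placeholders, not GPTQModel's actual quantization loop.

import torch
import torch.nn as nn

# Illustrative stand-ins; GPTQModel iterates over the real model's decoder
# layers and captured calibration inputs instead.
layers = nn.ModuleList([nn.Linear(1024, 1024) for _ in range(4)])
hidden = torch.randn(8, 1024)
device = "cuda" if torch.cuda.is_available() else "cpu"

for layer in layers:
    layer.to(device)
    with torch.no_grad():
        hidden = layer(hidden.to(device))  # forward pass for this layer only

    hidden = hidden.cpu()  # keep activations off the GPU between layers
    layer.cpu()            # move the finished layer back to CPU

    if torch.cuda.is_available():
        # Return PyTorch's cached-but-unused blocks to the driver so the
        # next layer starts from a smaller VRAM footprint.
        torch.cuda.empty_cache()

Note that empty_cache() does not free tensors that are still referenced; it only releases blocks the caching allocator holds in reserve, so the benefit is a lower reserved-VRAM watermark between layers rather than faster execution.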
