diff --git a/tutorial.ipynb b/tutorial.ipynb
index 8d9c3f8b7a1..b1650627528 100644
--- a/tutorial.ipynb
+++ b/tutorial.ipynb
@@ -1172,11 +1172,11 @@
       },
       "source": [
        "# Profile\n",
-        "from utils.torch_utils import profile \n",
+        "from utils.torch_utils import profile\n",
        "\n",
        "m1 = lambda x: x * torch.sigmoid(x)\n",
        "m2 = torch.nn.SiLU()\n",
-        "profile(x=torch.randn(16, 3, 640, 640), ops=[m1, m2], n=100)"
+        "results = profile(input=torch.randn(16, 3, 640, 640), ops=[m1, m2], n=100)"
       ],
       "execution_count": null,
       "outputs": []
diff --git a/utils/torch_utils.py b/utils/torch_utils.py
index 55a5fd7875b..4956cf95d1c 100644
--- a/utils/torch_utils.py
+++ b/utils/torch_utils.py
@@ -98,42 +98,56 @@ def time_sync():
     return time.time()
 
 
-def profile(x, ops, n=100, device=None):
-    # profile a pytorch module or list of modules. Example usage:
-    #     x = torch.randn(16, 3, 640, 640)  # input
+def profile(input, ops, n=10, device=None):
+    # YOLOv5 speed/memory/FLOPs profiler
+    #
+    # Usage:
+    #     input = torch.randn(16, 3, 640, 640)
     #     m1 = lambda x: x * torch.sigmoid(x)
     #     m2 = nn.SiLU()
-    #     profile(x, [m1, m2], n=100)  # profile speed over 100 iterations
+    #     profile(input, [m1, m2], n=100)  # profile over 100 iterations
 
+    results = []
     device = device or select_device()
-    x = x.to(device)
-    x.requires_grad = True
-    print(f"{'Params':>12s}{'GFLOPs':>12s}{'forward (ms)':>16s}{'backward (ms)':>16s}{'input':>24s}{'output':>24s}")
-    for m in ops if isinstance(ops, list) else [ops]:
-        m = m.to(device) if hasattr(m, 'to') else m  # device
-        m = m.half() if hasattr(m, 'half') and isinstance(x, torch.Tensor) and x.dtype is torch.float16 else m  # type
-        dtf, dtb, t = 0., 0., [0., 0., 0.]  # dt forward, backward
-        try:
-            flops = thop.profile(m, inputs=(x,), verbose=False)[0] / 1E9 * 2  # GFLOPs
-        except:
-            flops = 0
-
-        for _ in range(n):
-            t[0] = time_sync()
-            y = m(x)
-            t[1] = time_sync()
+    print(f"{'Params':>12s}{'GFLOPs':>12s}{'GPU_mem (GB)':>14s}{'forward (ms)':>14s}{'backward (ms)':>14s}"
+          f"{'input':>24s}{'output':>24s}")
+
+    for x in input if isinstance(input, list) else [input]:
+        x = x.to(device)
+        x.requires_grad = True
+        for m in ops if isinstance(ops, list) else [ops]:
+            m = m.to(device) if hasattr(m, 'to') else m  # device
+            m = m.half() if hasattr(m, 'half') and isinstance(x, torch.Tensor) and x.dtype is torch.float16 else m
+            tf, tb, t = 0., 0., [0., 0., 0.]  # dt forward, backward
             try:
-                _ = y.sum().backward()
-                t[2] = time_sync()
-            except:  # no backward method
-                t[2] = float('nan')
-            dtf += (t[1] - t[0]) * 1000 / n  # ms per op forward
-            dtb += (t[2] - t[1]) * 1000 / n  # ms per op backward
-
-        s_in = tuple(x.shape) if isinstance(x, torch.Tensor) else 'list'
-        s_out = tuple(y.shape) if isinstance(y, torch.Tensor) else 'list'
-        p = sum(list(x.numel() for x in m.parameters())) if isinstance(m, nn.Module) else 0  # parameters
-        print(f'{p:12}{flops:12.4g}{dtf:16.4g}{dtb:16.4g}{str(s_in):>24s}{str(s_out):>24s}')
+                flops = thop.profile(m, inputs=(x,), verbose=False)[0] / 1E9 * 2  # GFLOPs
+            except:
+                flops = 0
+
+            try:
+                for _ in range(n):
+                    t[0] = time_sync()
+                    y = m(x)
+                    t[1] = time_sync()
+                    try:
+                        _ = (sum([yi.sum() for yi in y]) if isinstance(y, list) else y).sum().backward()
+                        t[2] = time_sync()
+                    except Exception as e:  # no backward method
+                        print(e)
+                        t[2] = float('nan')
+                    tf += (t[1] - t[0]) * 1000 / n  # ms per op forward
+                    tb += (t[2] - t[1]) * 1000 / n  # ms per op backward
+                mem = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0  # (GB)
+                s_in = tuple(x.shape) if isinstance(x, torch.Tensor) else 'list'
+                s_out = tuple(y.shape) if isinstance(y, torch.Tensor) else 'list'
+                p = sum(list(x.numel() for x in m.parameters())) if isinstance(m, nn.Module) else 0  # parameters
+                print(f'{p:12}{flops:12.4g}{mem:>14.3f}{tf:14.4g}{tb:14.4g}{str(s_in):>24s}{str(s_out):>24s}')
+                results.append([p, flops, mem, tf, tb, s_in, s_out])
+            except Exception as e:
+                print(e)
+                results.append(None)
+            torch.cuda.empty_cache()
+    return results
 
 
 def is_parallel(model):
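
Not part of the diff above — a minimal usage sketch of the refactored API, assuming the torch_utils.py changes are applied as shown. profile() now accepts a single tensor or a list of tensors via `input`, runs every op against every input, and returns one result row per (input, op) pair, or None where profiling failed. The two input resolutions below are illustrative only.

    import torch
    import torch.nn as nn
    from utils.torch_utils import profile

    # two candidate SiLU implementations, profiled over two input sizes
    inputs = [torch.randn(16, 3, 640, 640), torch.randn(16, 3, 320, 320)]
    m1 = lambda x: x * torch.sigmoid(x)  # manual SiLU
    m2 = nn.SiLU()                       # built-in SiLU
    results = profile(input=inputs, ops=[m1, m2], n=10)

    # each row: [params, GFLOPs, GPU_mem (GB), forward (ms), backward (ms), input shape, output shape]
    for row in results:
        print(row)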