Skip to content

Commit

Permalink
Update profile() for CUDA Memory allocation (PaddlePaddle#4239)
Browse files (browse the repository at this point in the history)
* Update profile()

* Update profile()

* Update profile()

* Update profile()

* Update profile()

* Update profile()

* Update profile()

* Update profile()

* Update profile()

* Update profile()

* Update profile()

* Update profile()

* Cleanup
  • Loading branch information
glenn-jocher authored Jul 30, 2021
1 parent bceb57b commit d8f1883
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 33 deletions.
4 changes: 2 additions & 2 deletions tutorial.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1172,11 +1172,11 @@
},
"source": [
"# Profile\n",
"from utils.torch_utils import profile \n",
"from utils.torch_utils import profile\n",
"\n",
"m1 = lambda x: x * torch.sigmoid(x)\n",
"m2 = torch.nn.SiLU()\n",
"profile(x=torch.randn(16, 3, 640, 640), ops=[m1, m2], n=100)"
"results = profile(input=torch.randn(16, 3, 640, 640), ops=[m1, m2], n=100)"
],
"execution_count": null,
"outputs": []
Expand Down
76 changes: 45 additions & 31 deletions utils/torch_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,42 +98,56 @@ def time_sync():
return time.time()


def profile(x, ops, n=100, device=None):
# profile a pytorch module or list of modules. Example usage:
# x = torch.randn(16, 3, 640, 640) # input
def profile(input, ops, n=10, device=None):
# YOLOv5 speed/memory/FLOPs profiler
#
# Usage:
# input = torch.randn(16, 3, 640, 640)
# m1 = lambda x: x * torch.sigmoid(x)
# m2 = nn.SiLU()
# profile(x, [m1, m2], n=100) # profile speed over 100 iterations
# profile(input, [m1, m2], n=100) # profile over 100 iterations

results = []
device = device or select_device()
x = x.to(device)
x.requires_grad = True
print(f"{'Params':>12s}{'GFLOPs':>12s}{'forward (ms)':>16s}{'backward (ms)':>16s}{'input':>24s}{'output':>24s}")
for m in ops if isinstance(ops, list) else [ops]:
m = m.to(device) if hasattr(m, 'to') else m # device
m = m.half() if hasattr(m, 'half') and isinstance(x, torch.Tensor) and x.dtype is torch.float16 else m # type
dtf, dtb, t = 0., 0., [0., 0., 0.] # dt forward, backward
try:
flops = thop.profile(m, inputs=(x,), verbose=False)[0] / 1E9 * 2 # GFLOPs
except:
flops = 0

for _ in range(n):
t[0] = time_sync()
y = m(x)
t[1] = time_sync()
print(f"{'Params':>12s}{'GFLOPs':>12s}{'GPU_mem (GB)':>14s}{'forward (ms)':>14s}{'backward (ms)':>14s}"
f"{'input':>24s}{'output':>24s}")

for x in input if isinstance(input, list) else [input]:
x = x.to(device)
x.requires_grad = True
for m in ops if isinstance(ops, list) else [ops]:
m = m.to(device) if hasattr(m, 'to') else m # device
m = m.half() if hasattr(m, 'half') and isinstance(x, torch.Tensor) and x.dtype is torch.float16 else m
tf, tb, t = 0., 0., [0., 0., 0.] # dt forward, backward
try:
_ = y.sum().backward()
t[2] = time_sync()
except: # no backward method
t[2] = float('nan')
dtf += (t[1] - t[0]) * 1000 / n # ms per op forward
dtb += (t[2] - t[1]) * 1000 / n # ms per op backward

s_in = tuple(x.shape) if isinstance(x, torch.Tensor) else 'list'
s_out = tuple(y.shape) if isinstance(y, torch.Tensor) else 'list'
p = sum(list(x.numel() for x in m.parameters())) if isinstance(m, nn.Module) else 0 # parameters
print(f'{p:12}{flops:12.4g}{dtf:16.4g}{dtb:16.4g}{str(s_in):>24s}{str(s_out):>24s}')
flops = thop.profile(m, inputs=(x,), verbose=False)[0] / 1E9 * 2 # GFLOPs
except:
flops = 0

try:
for _ in range(n):
t[0] = time_sync()
y = m(x)
t[1] = time_sync()
try:
_ = (sum([yi.sum() for yi in y]) if isinstance(y, list) else y).sum().backward()
t[2] = time_sync()
except Exception as e: # no backward method
print(e)
t[2] = float('nan')
tf += (t[1] - t[0]) * 1000 / n # ms per op forward
tb += (t[2] - t[1]) * 1000 / n # ms per op backward
mem = torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0 # (GB)
s_in = tuple(x.shape) if isinstance(x, torch.Tensor) else 'list'
s_out = tuple(y.shape) if isinstance(y, torch.Tensor) else 'list'
p = sum(list(x.numel() for x in m.parameters())) if isinstance(m, nn.Module) else 0 # parameters
print(f'{p:12}{flops:12.4g}{mem:>14.3f}{tf:14.4g}{tb:14.4g}{str(s_in):>24s}{str(s_out):>24s}')
results.append([p, flops, mem, tf, tb, s_in, s_out])
except Exception as e:
print(e)
results.append(None)
torch.cuda.empty_cache()
return results


def is_parallel(model):
Expand Down

0 comments on commit d8f1883

Please sign in to comment.