Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
134 changes: 134 additions & 0 deletions tests/benchmarks/flatten_bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
#!/usr/bin/env python
# run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l)
#
# usage:
# ./flatten_bench.py -t
# ./flatten_bench.py -c
# kernprof -l flatten_bench.py -l; python -m line_profiler flatten_bench.py.lprof

import argparse

import gc

import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from deepspeed.ops.op_builder import UtilsBuilder

from apex_C import flatten as flatten_apex

# Load DeepSpeed's utility ops and expose its flatten/unflatten entry points.
util_ops = UtilsBuilder().load()
flatten, unflatten = util_ops.flatten, util_ops.unflatten

# Fixed seed so every run benchmarks the same random values.
torch.manual_seed(0)

# Emulate the weights of a small, typical model: three random matrices moved
# to the GPU, with the list repeated 30 times to get 90 tensors.
x = [
    torch.rand(shape).cuda()
    for shape in ((512, 512), (512, 1024), (512, 30000))
]
t = x * 30

# Warm up every implementation once, then check that the DeepSpeed and apex
# results are element-wise equal to the torch._utils reference.
flat_py = _flatten_dense_tensors(t)
flat_cpp = flatten(t)
flat_apex = flatten_apex(t)
assert (flat_py == flat_cpp).all(), "both produce the same tensor"
assert (flat_py == flat_apex).all(), "both produce the same tensor"

# Number of calls each benchmarked function performs per run.
TIMES = 1000


# the programs being tested
def py():
    """Call ``_flatten_dense_tensors`` on the module-level list ``t`` ``TIMES`` times."""
    for _ in range(TIMES):
        # The result is bound to a local, so the previous output stays
        # referenced until this assignment replaces it.
        flattened = _flatten_dense_tensors(t)


def cpp():
    """Call the DeepSpeed ``flatten`` op on the module-level list ``t`` ``TIMES`` times."""
    for _ in range(TIMES):
        # The result is bound to a local, so the previous output stays
        # referenced until this assignment replaces it.
        flattened = flatten(t)


def apex():
    """Call apex's ``flatten`` on the module-level list ``t`` ``TIMES`` times."""
    for _ in range(TIMES):
        # The result is bound to a local, so the previous output stays
        # referenced until this assignment replaces it.
        flattened = flatten_apex(t)


#### cProfile ####

import cProfile


def cprofileme():
    """Profile ``py()``, ``cpp()`` and ``apex()`` with cProfile, one after another.

    A label is printed before each run. After each run the Python garbage
    collector is invoked and the CUDA caching allocator is emptied.
    """
    print("--------------- cProfile -----------------")
    for bench_name in ("py", "cpp", "apex"):
        print(bench_name)
        # cProfile.run takes the statement as a string, e.g. "py()".
        cProfile.run(f"{bench_name}()", sort=-1)
        gc.collect()
        torch.cuda.empty_cache()


#### timeit ####

import timeit


def timeme():
    """Time one run of ``py()``, ``cpp()`` and ``apex()`` with timeit and print the results.

    CUDA kernels are launched asynchronously, so a benchmark function can
    return while the GPU is still working. Each measurement therefore:

    * synchronizes before starting, so leftover work from the warm-up or the
      previous benchmark is not billed to this one;
    * synchronizes inside the timed statement, so the reported number covers
      the GPU work itself and not just the time needed to enqueue it.

    After each measurement the Python garbage collector is invoked and the
    CUDA caching allocator is emptied.
    """
    print("--------------- timeit -----------------")
    # (printed label, name of the benchmark function); labels are padded by
    # hand so the output format is unchanged.
    for label, bench_name in (("py =", "py"), ("cpp =", "cpp"), ("apex=", "apex")):
        torch.cuda.synchronize()
        timer = timeit.Timer(
            f"{bench_name}()\ntorch.cuda.synchronize()",
            globals=globals(),
        )
        print(f'{label}{timer.timeit(number=1)}')
        gc.collect()
        torch.cuda.empty_cache()


#### line_profiler ####
# this mode must be launched through kernprof, which provides the `profile` decorator used below
# pip install line_profiler
# kernprof -l flatten_bench.py -l; python -m line_profiler flatten_bench.py.lprof


def line_profileme():
    """Run ``py``, ``cpp`` and ``apex`` wrapped in line_profiler's ``profile`` decorator.

    ``profile`` is not defined or imported anywhere in this script; it is
    expected to be supplied by ``kernprof -l``. The name is resolved up front
    so that a plain ``./flatten_bench.py -l`` stops with an actionable message
    instead of a bare ``NameError`` after part of the output was printed.

    After each run the Python garbage collector is invoked and the CUDA
    caching allocator is emptied.

    Raises:
        SystemExit: if ``profile`` is not available.
    """
    try:
        line_profile = profile
    except NameError:
        raise SystemExit(
            "-l requires line_profiler's `profile` decorator; run it as: "
            "kernprof -l flatten_bench.py -l") from None

    print("--------------- line_profier -----------------")
    for label, bench in (("py", py), ("cpp", cpp), ("apex", apex)):
        print(label)
        line_profile(bench)()
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == "__main__":
    # Command-line entry point: pick one benchmark mode.
    parser = argparse.ArgumentParser(
        description="Benchmark tensor flattening implementations.")
    parser.add_argument("-l",
                        action='store_true',
                        help="run under line_profiler (launch via kernprof -l)")
    parser.add_argument("-c", action='store_true', help="run under cProfile")
    parser.add_argument("-t", action='store_true', help="run under timeit")
    args = parser.parse_args()

    # If several flags are given, the first match in the order -l, -c, -t wins.
    if args.l:
        line_profileme()
    elif args.c:
        cprofileme()
    elif args.t:
        timeme()
    else:
        # Without a mode flag the script used to exit silently after the
        # warm-up; show the available modes instead.
        parser.print_help()
143 changes: 143 additions & 0 deletions tests/benchmarks/unflatten_bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#!/usr/bin/env python

# run the benchmark under timeit (-t), cProfile (-c), line_profiler (-l)
#
# usage:
# ./unflatten_bench.py -t
# ./unflatten_bench.py -c
# kernprof -l unflatten_bench.py -l; python -m line_profiler unflatten_bench.py.lprof

import argparse
import gc
import torch
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
from deepspeed.ops.op_builder import UtilsBuilder

from apex_C import flatten as flatten_apex
from apex_C import unflatten as unflatten_apex

# Load DeepSpeed's utility ops and expose its flatten/unflatten entry points.
util_ops = UtilsBuilder().load()
flatten = util_ops.flatten
unflatten = util_ops.unflatten

# Fixed seed so every run benchmarks the same random values.
torch.manual_seed(0)
# Emulate the weights of a small, typical model: three random matrices moved
# to the GPU, with the list repeated 30 times to get 90 tensors.
x = [
    torch.rand((512, 512)).cuda(),
    torch.rand((512, 1024)).cuda(),
    torch.rand((512, 30000)).cuda(),
]
unflat_t = x * 30

# Warm up the flatten implementations and check that they agree.
flat_py = _flatten_dense_tensors(unflat_t)
flat_cpp = flatten(unflat_t)
flat_apex = flatten_apex(unflat_t)
assert torch.eq(flat_py, flat_cpp).all(), "both produce the same tensor"
assert torch.eq(flat_py, flat_apex).all(), "both produce the same tensor"

# Flat buffer that the benchmarked functions unflatten.
flat_t = flat_py

# Warm up and validate each unflatten implementation with a round trip.
# The cpp and apex checks previously called _unflatten_dense_tensors as well,
# so the two ops being benchmarked were never actually verified.
unflat_py = _unflatten_dense_tensors(flat_py, unflat_t)
unflat_cpp = unflatten(flat_cpp, unflat_t)
unflat_apex = unflatten_apex(flat_apex, unflat_t)
for i in range(len(unflat_t)):
    assert torch.eq(unflat_t[i], unflat_py[i]).all(), "py unflatten round-trips"
    assert torch.eq(unflat_t[i], unflat_cpp[i]).all(), "cpp unflatten round-trips"
    assert torch.eq(unflat_t[i], unflat_apex[i]).all(), "apex unflatten round-trips"


# the programs being tested
def py(times=1000):
    """Call ``_unflatten_dense_tensors(flat_t, unflat_t)`` repeatedly.

    Args:
        times: number of calls to perform. Defaults to 1000, the value that
            was previously hard-coded, so existing ``py()`` callers are
            unaffected.
    """
    for _ in range(times):
        # The result is bound to a local, so the previous output stays
        # referenced until this assignment replaces it.
        unflat = _unflatten_dense_tensors(flat_t, unflat_t)


def cpp(times=1000):
    """Call the DeepSpeed ``unflatten(flat_t, unflat_t)`` op repeatedly.

    Args:
        times: number of calls to perform. Defaults to 1000, the value that
            was previously hard-coded, so existing ``cpp()`` callers are
            unaffected.
    """
    for _ in range(times):
        # The result is bound to a local, so the previous output stays
        # referenced until this assignment replaces it.
        unflat = unflatten(flat_t, unflat_t)


def apex(times=1000):
    """Call apex's ``unflatten(flat_t, unflat_t)`` repeatedly.

    Args:
        times: number of calls to perform. Defaults to 1000, the value that
            was previously hard-coded, so existing ``apex()`` callers are
            unaffected.
    """
    for _ in range(times):
        # The result is bound to a local, so the previous output stays
        # referenced until this assignment replaces it.
        unflat = unflatten_apex(flat_t, unflat_t)


#### cProfile ####

import cProfile


def cprofileme():
    """Profile ``py()``, ``cpp()`` and ``apex()`` with cProfile, one after another.

    A label is printed before each run. After each run the Python garbage
    collector is invoked and the CUDA caching allocator is emptied.
    """
    print("--------------- cProfile -----------------")
    for bench_name in ("py", "cpp", "apex"):
        print(bench_name)
        # cProfile.run takes the statement as a string, e.g. "py()".
        cProfile.run(f"{bench_name}()", sort=-1)
        gc.collect()
        torch.cuda.empty_cache()


#### timeit ####

import timeit


def timeme():
    """Time one run of ``py()``, ``cpp()`` and ``apex()`` with timeit and print the results.

    CUDA work is queued asynchronously, so each measurement synchronizes
    before starting (nothing left over from the warm-up or the previous
    benchmark is billed to it) and again inside the timed statement (any GPU
    work the benchmark queued is finished before the clock stops).

    After each measurement the Python garbage collector is invoked and the
    CUDA caching allocator is emptied.
    """
    print("--------------- timeit -----------------")
    # (printed label, name of the benchmark function); labels are padded by
    # hand so the output format is unchanged.
    for label, bench_name in (("py =", "py"), ("cpp =", "cpp"), ("apex=", "apex")):
        torch.cuda.synchronize()
        timer = timeit.Timer(
            f"{bench_name}()\ntorch.cuda.synchronize()",
            globals=globals(),
        )
        print(f'{label}{timer.timeit(number=1)}')
        gc.collect()
        torch.cuda.empty_cache()


#### line_profiler ####
# this mode must be launched through kernprof, which provides the `profile` decorator used below
# pip install line_profiler
# kernprof -l unflatten_bench.py -l; python -m line_profiler unflatten_bench.py.lprof


def line_profileme():
    """Run ``py``, ``cpp`` and ``apex`` wrapped in line_profiler's ``profile`` decorator.

    ``profile`` is not defined or imported anywhere in this script; it is
    expected to be supplied by ``kernprof -l``. The name is resolved up front
    so that a plain ``./unflatten_bench.py -l`` stops with an actionable
    message instead of a bare ``NameError`` after part of the output was
    printed.

    After each run the Python garbage collector is invoked and the CUDA
    caching allocator is emptied.

    Raises:
        SystemExit: if ``profile`` is not available.
    """
    try:
        line_profile = profile
    except NameError:
        raise SystemExit(
            "-l requires line_profiler's `profile` decorator; run it as: "
            "kernprof -l unflatten_bench.py -l") from None

    print("--------------- line_profier -----------------")
    for label, bench in (("py", py), ("cpp", cpp), ("apex", apex)):
        print(label)
        line_profile(bench)()
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == "__main__":
    # Command-line entry point: pick one benchmark mode.
    parser = argparse.ArgumentParser(
        description="Benchmark tensor unflattening implementations.")
    parser.add_argument("-l",
                        action='store_true',
                        help="run under line_profiler (launch via kernprof -l)")
    parser.add_argument("-c", action='store_true', help="run under cProfile")
    parser.add_argument("-t", action='store_true', help="run under timeit")
    args = parser.parse_args()

    # If several flags are given, the first match in the order -l, -c, -t wins.
    if args.l:
        line_profileme()
    elif args.c:
        cprofileme()
    elif args.t:
        timeme()
    else:
        # Without a mode flag the script used to exit silently after the
        # warm-up; show the available modes instead.
        parser.print_help()