diff --git a/tests/benchmarks/flatten_bench.py b/tests/benchmarks/flatten_bench.py
new file mode 100755
index 000000000000..b3ed3c601492
--- /dev/null
+++ b/tests/benchmarks/flatten_bench.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python
+# run the benchmark under timeit (-t), cProfile (-c), or line_profiler (-l)
+#
+# usage:
+#   ./flatten_bench.py -t
+#   ./flatten_bench.py -c
+#   kernprof -l flatten_bench.py -l; python -m line_profiler flatten_bench.py.lprof
+
+import argparse
+import gc
+
+import torch
+from torch._utils import _flatten_dense_tensors
+from deepspeed.ops.op_builder import UtilsBuilder
+
+from apex_C import flatten as flatten_apex
+
+util_ops = UtilsBuilder().load()
+flatten = util_ops.flatten
+
+torch.manual_seed(0)
+# emulate the weights of a small typical model
+x = [
+    torch.rand((512, 512)).cuda(),
+    torch.rand((512, 1024)).cuda(),
+    torch.rand((512, 30000)).cuda()
+]
+# replicate the list 30 times (the entries alias the same 3 tensors, which is
+# fine for flattening) to get a 90-tensor input
+t = x * 30
+
+# warm up and check that all three implementations produce the same output
+flat_py = _flatten_dense_tensors(t)
+flat_cpp = flatten(t)
+flat_apex = flatten_apex(t)
+assert torch.eq(flat_py, flat_cpp).all(), "py and cpp produced different tensors"
+assert torch.eq(flat_py, flat_apex).all(), "py and apex produced different tensors"
+
+TIMES = 1000
+
+
+# the programs being benchmarked
+def py():
+    for i in range(TIMES):
+        flat = _flatten_dense_tensors(t)
+
+
+def cpp():
+    for i in range(TIMES):
+        flat = flatten(t)
+
+
+def apex():
+    for i in range(TIMES):
+        flat = flatten_apex(t)
+
+
+#### cProfile ####
+
+import cProfile
+
+
+def cprofileme():
+    print("--------------- cProfile -----------------")
+    print("py")
+    cProfile.run("py()", sort=-1)
+    gc.collect()
+    torch.cuda.empty_cache()
+    print("cpp")
+    cProfile.run("cpp()", sort=-1)
+    gc.collect()
+    torch.cuda.empty_cache()
+    print("apex")
+    cProfile.run("apex()", sort=-1)
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+#### timeit ####
+
+import timeit
+
+
+def timeme():
+    print("--------------- timeit -----------------")
+    print(f'py ={timeit.Timer("py()", globals=globals()).timeit(number=1)}')
+    gc.collect()
+    torch.cuda.empty_cache()
+    print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}')
+    gc.collect()
+    torch.cuda.empty_cache()
+    print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}')
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+#### line_profiler ####
+# this one requires a special way to be called:
+# pip install line_profiler
+# kernprof -l flatten_bench.py -l; python -m line_profiler flatten_bench.py.lprof
+
+
+def line_profileme():
+    print("--------------- line_profiler -----------------")
+    # `profile` is injected into builtins by kernprof at runtime
+    print("py")
+    profile(py)()
+    gc.collect()
+    torch.cuda.empty_cache()
+    print("cpp")
+    profile(cpp)()
+    gc.collect()
+    torch.cuda.empty_cache()
+    print("apex")
+    profile(apex)()
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-l", action='store_true')
+    parser.add_argument("-c", action='store_true')
+    parser.add_argument("-t", action='store_true')
+    args = parser.parse_args()
+    if args.l:
+        line_profileme()
+    elif args.c:
+        cprofileme()
+    elif args.t:
+        timeme()
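Reviewer note on flatten_bench.py above (not part of the patch): all three implementations perform the same operation, concatenating a list of dense tensors into a single contiguous 1-D buffer. _flatten_dense_tensors is pure-Python sugar over torch.cat, while the DeepSpeed and apex extensions do the same concatenation in C++. A minimal sketch of the behavior being compared, assuming dense CUDA tensors as in the benchmark (flatten_reference is a hypothetical name, not an API of either library):

    import torch

    def flatten_reference(tensors):
        # hypothetical reference: flatten each tensor to 1-D and concatenate
        # everything into one contiguous buffer
        return torch.cat([t.contiguous().view(-1) for t in tensors], dim=0)

One caveat: torch.cat launches real copy kernels, and CUDA launches are asynchronous, so the timeit figures can under-report GPU time unless a torch.cuda.synchronize() is issued before the timer is read.
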
diff --git a/tests/benchmarks/unflatten_bench.py b/tests/benchmarks/unflatten_bench.py
new file mode 100755
index 000000000000..85baf751ad9c
--- /dev/null
+++ b/tests/benchmarks/unflatten_bench.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+# run the benchmark under timeit (-t), cProfile (-c), or line_profiler (-l)
+#
+# usage:
+#   ./unflatten_bench.py -t
+#   ./unflatten_bench.py -c
+#   kernprof -l unflatten_bench.py -l; python -m line_profiler unflatten_bench.py.lprof
+
+import argparse
+import gc
+
+import torch
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+from deepspeed.ops.op_builder import UtilsBuilder
+
+from apex_C import flatten as flatten_apex
+from apex_C import unflatten as unflatten_apex
+
+util_ops = UtilsBuilder().load()
+flatten = util_ops.flatten
+unflatten = util_ops.unflatten
+
+torch.manual_seed(0)
+# emulate the weights of a small typical model
+x = [
+    torch.rand((512, 512)).cuda(),
+    torch.rand((512, 1024)).cuda(),
+    torch.rand((512, 30000)).cuda()
+]
+# replicate the list 30 times (the entries alias the same 3 tensors, which is
+# fine here) to get a 90-tensor input
+unflat_t = x * 30
+
+# warm up and check that all three flatten implementations produce the same output
+flat_py = _flatten_dense_tensors(unflat_t)
+flat_cpp = flatten(unflat_t)
+flat_apex = flatten_apex(unflat_t)
+assert torch.eq(flat_py, flat_cpp).all(), "py and cpp produced different tensors"
+assert torch.eq(flat_py, flat_apex).all(), "py and apex produced different tensors"
+
+flat_t = flat_py
+
+# warm up each unflatten implementation and check that it round-trips
+unflat_py = _unflatten_dense_tensors(flat_py, unflat_t)
+for i in range(len(unflat_t)):
+    assert torch.eq(unflat_t[i], unflat_py[i]).all()
+unflat_cpp = unflatten(flat_cpp, unflat_t)
+for i in range(len(unflat_t)):
+    assert torch.eq(unflat_t[i], unflat_cpp[i]).all()
+unflat_apex = unflatten_apex(flat_apex, unflat_t)
+for i in range(len(unflat_t)):
+    assert torch.eq(unflat_t[i], unflat_apex[i]).all()
+
+TIMES = 1000
+
+
+# the programs being benchmarked
+def py():
+    for i in range(TIMES):
+        unflat = _unflatten_dense_tensors(flat_t, unflat_t)
+
+
+def cpp():
+    for i in range(TIMES):
+        unflat = unflatten(flat_t, unflat_t)
+
+
+def apex():
+    for i in range(TIMES):
+        unflat = unflatten_apex(flat_t, unflat_t)
+
+
+#### cProfile ####
+
+import cProfile
+
+
+def cprofileme():
+    print("--------------- cProfile -----------------")
+    print("py")
+    cProfile.run("py()", sort=-1)
+    gc.collect()
+    torch.cuda.empty_cache()
+    print("cpp")
+    cProfile.run("cpp()", sort=-1)
+    gc.collect()
+    torch.cuda.empty_cache()
+    print("apex")
+    cProfile.run("apex()", sort=-1)
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+#### timeit ####
+
+import timeit
+
+
+def timeme():
+    print("--------------- timeit -----------------")
+    print(f'py ={timeit.Timer("py()", globals=globals()).timeit(number=1)}')
+    gc.collect()
+    torch.cuda.empty_cache()
+    print(f'cpp ={timeit.Timer("cpp()", globals=globals()).timeit(number=1)}')
+    gc.collect()
+    torch.cuda.empty_cache()
+    print(f'apex={timeit.Timer("apex()", globals=globals()).timeit(number=1)}')
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+#### line_profiler ####
+# this one requires a special way to be called:
+# pip install line_profiler
+# kernprof -l unflatten_bench.py -l; python -m line_profiler unflatten_bench.py.lprof
+
+
+def line_profileme():
+    print("--------------- line_profiler -----------------")
+    # `profile` is injected into builtins by kernprof at runtime
+    print("py")
+    profile(py)()
+    gc.collect()
+    torch.cuda.empty_cache()
+    print("cpp")
+    profile(cpp)()
+    gc.collect()
+    torch.cuda.empty_cache()
+    print("apex")
+    profile(apex)()
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-l", action='store_true')
+    parser.add_argument("-c", action='store_true')
+    parser.add_argument("-t", action='store_true')
+    args = parser.parse_args()
+    if args.l:
+        line_profileme()
+    elif args.c:
+        cprofileme()
+    elif args.t:
+        timeme()
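
Reviewer note on unflatten_bench.py above (not part of the patch): unflattening is the inverse operation; it carves the flat buffer back into per-tensor views, using the second argument only for its shapes. A minimal sketch of what the pure-Python _unflatten_dense_tensors does, assuming dense tensors (unflatten_reference is a hypothetical name):

    import torch

    def unflatten_reference(flat, tensors):
        # hypothetical reference: return one view into `flat` per input
        # tensor, shaped like that tensor
        outputs, offset = [], 0
        for t in tensors:
            n = t.numel()
            outputs.append(flat.narrow(0, offset, n).view_as(t))
            offset += n
        return outputs

Unlike flattening, this is essentially host-side metadata work (no data is copied), so the unflatten timings are expected to be dominated by Python and kernel-launch overhead rather than GPU bandwidth.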