Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test/bench: add bcast benchmark #7157

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions test/mpi/bench/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/*.c
/p2p_bw
/p2p_latency
/bcast
3 changes: 2 additions & 1 deletion test/mpi/bench/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ LDADD += -lm
## correctly
noinst_PROGRAMS = \
p2p_latency \
p2p_bw
p2p_bw \
bcast

.def.c:
mydef_page $<
20 changes: 20 additions & 0 deletions test/mpi/bench/bcast.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
include: macros/bench_frame.def
include: macros/bench_coll.def
include: macros/mtest.def

# Broadcast benchmark page: for each message size, time MPI_Bcast followed by
# MPI_Barrier, subtract the separately measured barrier latency, and report
# the result from rank 0.
page: bcast, bench_frame
data: buf, size, MPI_CHAR

# barrier latency is measured once up front and subtracted from every sample below
tf_barrier = bench_barrier(comm)
# sizes double from 1 up to, but not including, MAX_BUFSIZE
$for int size = 1; size < MAX_BUFSIZE; size *= 2
# warm-up with the same Bcast+Barrier body (coll_warmup is invoked with root 0)
&call coll_warmup, 0
MPI_Bcast($(data), 0, comm)
MPI_Barrier(comm)
# NUM_REPEAT samples of tf_latency are accumulated by run_stat
&call run_stat, NUM_REPEAT, tf_latency
&call measure, iter
MPI_Bcast($(data), 0, comm)
MPI_Barrier(comm)
# per-iteration time of Bcast+Barrier minus the barrier-only latency
tf_latency = (tf_dur / iter) - tf_barrier
# only rank 0 reports
$if grank == 0
$call report_latency, size, 1

32 changes: 32 additions & 0 deletions test/mpi/bench/macros/bench_coll.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Collective warm-up: the root rank runs warm_up to settle on an iteration
# count while every other rank mirrors the same number of BLOCK executions.
#   root  - rank that drives the warm-up; also the root of every MPI_Bcast here
#   BLOCK - the collective operation being benchmarked (supplied by the caller)
# On exit, iter has been broadcast from root so all ranks hold the same value.
subcode: coll_warmup(root)
    # The driving rank must be the broadcast root, so compare against $(root)
    # rather than a hard-coded 0 (identical expansion when root is 0).
    $if grank == $(root)
        &call warm_up, iter, tf_dur
            # publish the trial iteration count so the followers run BLOCK in step
            MPI_Bcast(&iter, 1, MPI_INT, $(root), comm)
            &call measure, iter
                BLOCK
        # a zero count tells the followers that the warm-up is finished
        tn_zero = 0
        MPI_Bcast(&tn_zero, 1, MPI_INT, $(root), comm)
    $else
        $while 1
            MPI_Bcast(&iter, 1, MPI_INT, $(root), comm)
            $if iter == 0
                break
            $for 0:iter
                BLOCK

    # followers left the loop with iter == 0; share the root's final count
    MPI_Bcast(&iter, 1, MPI_INT, $(root), comm)

# Measure the latency of MPI_Barrier on comm.
# Prints the average and deviation on rank 0 and returns the average (sum1).
fncode: bench_barrier(comm)
$local int iter
# warm up with the barrier itself as the benchmarked operation (root 0)
&call coll_warmup, 0
MPI_Barrier(comm)

# NUM_REPEAT samples of per-barrier time; run_stat leaves avg in sum1 and std in sum2
&call run_stat, NUM_REPEAT, tf_latency
&call measure, iter
MPI_Barrier(comm)
tf_latency = (tf_dur / iter)

$if grank == 0
# values are scaled by 1e6 to match the "us" unit in the message
printf("Barrier latency %.3f +/- %.3f us\n", sum1 * 1e6, sum2 * 1e6)
return sum1

11 changes: 10 additions & 1 deletion test/mpi/bench/macros/bench_frame.def
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ subcode: bench_frame
$(else)
MPI_Init(NULL, NULL);

$my grank, gsize: int
$global grank, gsize: int
MPI_Comm_rank(MPI_COMM_WORLD, &grank);
MPI_Comm_size(MPI_COMM_WORLD, &gsize);
$(if:MIN_PROCS)
Expand Down Expand Up @@ -54,21 +54,28 @@ subcode: bench_frame
macros:
use_double: 1

macros:
MAX_BUFSIZE: 5000000 # 5 MB

#----------------------------------------
# Shared setup for benchmark pages: a type registration for the comm prefix
# and the constants MAX_BUFSIZE and NUM_REPEAT.
# NOTE(review): presumably MyDef pulls in _autoload automatically for any page
# that includes this file - confirm against the MyDef documentation.
subcode: _autoload
$register_prefix(comm) MPI_Comm
$define MAX_BUFSIZE 5000000
$define NUM_REPEAT 20

# Run BLOCK once per message size: size takes the values 0, 1, 2, 4, ...
# while size < $(MAX_MSG). The macro MSG_SIZE is set to size for BLOCK.
subcode: foreach_size
# (size==0)?1:size*2 steps from 0 to 1 first, then doubles
$for int size = 0; size < $(MAX_MSG); size = (size==0)?1:size*2
$(set:MSG_SIZE=size)
BLOCK

# measure tf_dur over iter
# Runs BLOCK $(iter) times and stores the total elapsed MPI_Wtime() in tf_dur
# (the total for all iterations, not the per-iteration time).
subcode: measure(iter)
tf_start = MPI_Wtime()
$for 0:$(iter)
BLOCK
tf_dur = MPI_Wtime() - tf_start

# repeat N times and calc avg in sum1 and std in sum2
subcode: run_stat(N, var)
$my double sum1=0, double sum2=0
$for 0:$(N)
Expand All @@ -79,7 +86,9 @@ subcode: run_stat(N, var)
sum2 /= $(N)
sum2 = sqrt(sum2 - sum1 * sum1)

# repeat until dur stabilizes and iter is adjusted so that a run lasts a minimum of 1 ms
subcode: warm_up(iter, dur)
# minimum number of iterations needed to fill a duration of 1 ms
$(set:MIN_ITER=(int) ($(iter) * 0.001 / $(dur)))
$(iter) = 2
$my double last_dur = 1.0
Expand Down
4 changes: 1 addition & 3 deletions test/mpi/bench/macros/bench_p2p.def
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

macros:
MIN_PROCS: 2
MAX_BUFSIZE: 5000000 # 5 MB
MEM_TYPES: sendrecv

subcode: _autoload
$register_name(src) int
Expand All @@ -25,8 +25,6 @@ subcode: _autoload
$register_name(size) int
$define TAG 0
$define SYNC_TAG 100
$define MAX_BUFSIZE 5000000
$define NUM_REPEAT 20

subcode: report_header
$call header_latency
Expand Down
23 changes: 15 additions & 8 deletions test/mpi/bench/macros/mtest.def
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,19 @@ macros:

subcode: mtest_malloc(size)
MTestArgList *head = MTestArgListCreate(argc, argv)
int send_rank = 0, recv_rank = 1;
$(for:a in send,recv)
$if grank == $(a)_rank
$my mtest_mem_type_e $(a)_memtype, int $(a)_device
$(a)_memtype = MTestArgListGetMemType(head, "$(a)mem")
$(a)_device = MTestArgListGetInt_with_default(head, "$(a)dev", 0)
MTestMalloc($(size), $(a)_memtype, NULL, &buf, $(a)_device)
MTestPrintfMsg(1, "Allocating buffer: memtype=%s, device=%d, size=%d\n", MTest_memtype_name($(a)_memtype), $(a)_device, $(size))
$(if:MEM_TYPES=sendrecv)
int send_rank = 0, recv_rank = 1;
$(for:a in send,recv)
$if grank == $(a)_rank
$call alloc_mem_dev, $(a)mem, $(a)dev
$(else)
# all procs allocating the same memory types
$call alloc_mem_dev, memtype, device
MTestArgListDestroy(head)

# Allocate buf through MTestMalloc using a memory type and device id looked up
# in the MTest argument list.
#   memtype - name of the argument holding the memory type
#   memdev  - name of the argument holding the device id; defaults to grank when absent
# Relies on head and $(size) being provided by the calling context.
subcode: alloc_mem_dev(memtype, memdev) # memtype and memdev are parameter names
$my mtest_mem_type_e memtype, int device
# "$(memtype)" / "$(memdev)" expand to the argument names passed by the caller
memtype = MTestArgListGetMemType(head, "$(memtype)")
device = MTestArgListGetInt_with_default(head, "$(memdev)", grank)
MTestMalloc($(size), memtype, NULL, &buf, device)
MTestPrintfMsg(1, "Allocating buffer: memtype=%s, device=%d, size=%d\n", MTest_memtype_name(memtype), device, $(size))
1 change: 1 addition & 0 deletions test/mpi/bench/testlist
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
p2p_latency 2 resultTest=TestBench
p2p_bw 2 resultTest=TestBench
bcast 16 resultTest=TestBench