From 3f4988377b07d03f23fcece8f7346b3845a39ec2 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sat, 1 Apr 2023 10:18:56 -0500 Subject: [PATCH 1/6] modules: add mydef_boot MyDef provides general templating facilities. --- .gitmodules | 3 +++ modules/mydef_boot | 1 + 2 files changed, 4 insertions(+) create mode 160000 modules/mydef_boot diff --git a/.gitmodules b/.gitmodules index 5fd73e70238..3353b602b74 100644 --- a/.gitmodules +++ b/.gitmodules @@ -14,3 +14,6 @@ [submodule "modules/yaksa"] path = modules/yaksa url = https://github.com/pmodels/yaksa +[submodule "modules/mydef_boot"] + path = modules/mydef_boot + url = https://github.com/pmodels/mydef_boot diff --git a/modules/mydef_boot b/modules/mydef_boot new file mode 160000 index 00000000000..ea2d6852486 --- /dev/null +++ b/modules/mydef_boot @@ -0,0 +1 @@ +Subproject commit ea2d6852486755eb12e255f760e2eb62f5446329 From 30f2bbd4389e16e81775aae30b08e779de35ac2e Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Sun, 17 Dec 2023 19:05:38 -0600 Subject: [PATCH 2/6] test/mpi: add p2p benchmarks in test/mpi/bench Add point-to-point benchmark code in MyDef. The tests have automatic warm-ups and adjust the number of iterations for measurement accuracy. They produce latency measurements with standard deviations and equivalent bandwidths. MYDEF_BOOT=[topsrc_dir]/modules/mydef_boot export PATH=$MYDEF_BOOT/bin:$PATH export PERL5LIB=$MYDEF_BOOT/lib/perl5 export MYDEFLIB=$MYDEF_BOOT/lib/MyDef To run: mydef_page p2p_latency.def # -> p2p_latency.c mpicc p2p_latency.c && mpirun -n 2 ./a.out Alternatively use mydef_run (uses settings from config): mydef_run p2p_latency.def A later commit will add "make testing". 
--- test/mpi/bench/config | 3 + test/mpi/bench/macros/bench_frame.def | 93 +++++++++++++++++++++++++++ test/mpi/bench/macros/bench_p2p.def | 79 +++++++++++++++++++++++ test/mpi/bench/p2p_bw.def | 26 ++++++++ test/mpi/bench/p2p_latency.def | 18 ++++++ 5 files changed, 219 insertions(+) create mode 100644 test/mpi/bench/config create mode 100644 test/mpi/bench/macros/bench_frame.def create mode 100644 test/mpi/bench/macros/bench_p2p.def create mode 100644 test/mpi/bench/p2p_bw.def create mode 100644 test/mpi/bench/p2p_latency.def diff --git a/test/mpi/bench/config b/test/mpi/bench/config new file mode 100644 index 00000000000..418c69e8a98 --- /dev/null +++ b/test/mpi/bench/config @@ -0,0 +1,3 @@ +module: c +CC: mpicc +run: mpirun -n 2 diff --git a/test/mpi/bench/macros/bench_frame.def b/test/mpi/bench/macros/bench_frame.def new file mode 100644 index 00000000000..7fb78dbc596 --- /dev/null +++ b/test/mpi/bench/macros/bench_frame.def @@ -0,0 +1,93 @@ +/* + * bench_frame : boilerplate for mpi program + * measure(iter) : measures `tf_dur` for $(iter) iterations + * run_stat(N, var) : run N measurements and obtain (avg, std) in sum1, sum2 + * warm_up(iter, dur): repeat until measurements (iter, dur) stabilize + * report_latency(msgsize, MULTIPLICITY) : print a line of latency result + */ + +subcode: bench_frame + $include stdio + $include stdlib + $include mpi + + $function main + MPI_Init(NULL, NULL); + + $my grank, gsize: int + MPI_Comm_rank(MPI_COMM_WORLD, &grank); + MPI_Comm_size(MPI_COMM_WORLD, &gsize); + $(if:MIN_PROCS) + $if gsize < $(MIN_PROCS) + printf("! Test $(_pagename) requires $(MIN_PROCS) processes !\n"); + return 1 + + MPI_Comm comm = MPI_COMM_WORLD; + char *buf = malloc(MAX_BUFSIZE) + $if !buf + printf("! 
Failed to allocate buffer (size=%d)\n", MAX_BUFSIZE) + return 1 + + $if grank == 0 + printf("TEST $(_pagename):\n") + $call @report_header + $call main + $if grank == 0 + printf("\n") + + MPI_Finalize(); + +macros: + use_double: 1 + +#---------------------------------------- +subcode: _autoload + $register_prefix(comm) MPI_Comm + +subcode: foreach_size + $for int size = 0; size < $(MAX_MSG); size = (size==0)?1:size*2 + $(set:MSG_SIZE=size) + BLOCK + +subcode: measure(iter) + tf_start = MPI_Wtime() + $for 0:$(iter) + BLOCK + tf_dur = MPI_Wtime() - tf_start + +subcode: run_stat(N, var) + $my double sum1=0, double sum2=0 + $for 0:$(N) + BLOCK + sum1 += $(var) + sum2 += $(var) * $(var) + sum1 /= $(N) + sum2 /= $(N) + sum2 = sqrt(sum2 - sum1 * sum1) + +subcode: warm_up(iter, dur) + $(set:MIN_ITER=(int) ($(iter) * 0.001 / $(dur))) + $(iter) = 2 + $my double last_dur = 1.0 + $my int num_best = 0 + $while num_best < 10 + BLOCK + $if $(iter) < $(MIN_ITER) + $(iter) = $(MIN_ITER) + num_best = 0 + continue + # check that t_dur is no longer monotonically decreasing + $if $(dur) > last_dur + num_best++ + last_dur = $(dur) + +subcode: header_latency + printf("%12s %10s(us) %6s(us) %12s(MB/s)\n", "msgsize", "latency", "sigma", "bandwidth") + +subcode: report_latency(MSGSIZE, MULTIPLICITY) + $my tf_latency, tf_sigma, tf_bw + tf_latency = sum1 / ($(MULTIPLICITY)) * 1e6 + tf_sigma = sum2 / ($(MULTIPLICITY)) * 1e6 + tf_bw = $(MSGSIZE) / tf_latency + printf("%12d %10.3f %6.3f %12.3f\n", $(MSGSIZE), tf_latency, tf_sigma, tf_bw) + diff --git a/test/mpi/bench/macros/bench_p2p.def b/test/mpi/bench/macros/bench_p2p.def new file mode 100644 index 00000000000..742a73acfc3 --- /dev/null +++ b/test/mpi/bench/macros/bench_p2p.def @@ -0,0 +1,79 @@ +/* + * Defines following functions: + * bench_p2p + * bench_send, bench_warmup + * bench_recv + * + * For each measurement - + * First sender tells receiver the `iter` parameter. `iter = 0` means to quit. 
+ * Each iteration runs `send_side` and `recv_side`, assuming the measurement on the sender side represents a latency measurement. + * + * Caller page defines - + * subcode: send_side, recv_side + * macro: + * MULTIPLICITY: divisor for each measurement + */ + +macros: + MIN_PROCS: 2 + MAX_BUFSIZE: 5000000 # 5 MB + +subcode: _autoload + $register_name(src) int + $register_name(dst) int + $register_name(buf) void * + $register_name(size) int + $define TAG 0 + $define SYNC_TAG 100 + $define MAX_BUFSIZE 5000000 + $define NUM_REPEAT 20 + +subcode: report_header + $call header_latency + +fncode: bench_p2p(comm, src, dst, buf, size) + int rank; + MPI_Comm_rank(comm, &rank) + + $(if:!MULTIPLICITY) + $(set:MULTIPLICITY=1) + + $if rank == src + iter = bench_warmup(comm, dst, buf, size) + &call run_stat, NUM_REPEAT, tf_latency + tf_latency = bench_send(iter, comm, dst, buf, size) + tf_latency /= iter + $call report_latency, size, $(MULTIPLICITY) + $call send_stop + $elif rank == dst + bench_recv(comm, src, buf, size) + + subcode: send_stop + iter = 0; + MPI_Send(&iter, 1, MPI_INT, dst, SYNC_TAG, comm) + +#---------------------------------------- +fncode: bench_send(int iter, comm, dst, buf, size) + # synchronize with receiver + MPI_Send(&iter, 1, MPI_INT, dst, SYNC_TAG, comm); + + &call measure, iter + $call @send_side + + return tf_dur + +fncode: bench_recv(comm, src, buf, size) + $while 1 + int iter; + # synchronize with sender + MPI_Recv(&iter, 1, MPI_INT, src, SYNC_TAG, comm, MPI_STATUS_IGNORE); + $if iter == 0 + # time to quit + break + $for i=0:iter + $call @recv_side + +fncode: bench_warmup(comm, dst, buf, size): int + &call warm_up, iter, tf_dur + tf_dur = bench_send(iter, comm, dst, buf, size) + return iter diff --git a/test/mpi/bench/p2p_bw.def b/test/mpi/bench/p2p_bw.def new file mode 100644 index 00000000000..b0c55e7fc94 --- /dev/null +++ b/test/mpi/bench/p2p_bw.def @@ -0,0 +1,26 @@ +include: macros/bench_frame.def +include: macros/bench_p2p.def + +subcode: 
_autoload + $define WINDOW_SIZE 64 + +page: p2p_bw, bench_frame + MULTIPLICITY: WINDOW_SIZE + data: buf, size, MPI_CHAR + + $for int size = 1; size < MAX_BUFSIZE; size *= 2 + bench_p2p(comm, 0, 1, buf, size) + + subcode: send_side + $my MPI_Request reqs[WINDOW_SIZE] + $for j=0:WINDOW_SIZE + MPI_Isend($(data), dst, TAG, comm, &reqs[j]) + MPI_Waitall(WINDOW_SIZE, reqs, MPI_STATUSES_IGNORE) + MPI_Recv(NULL, 0, MPI_DATATYPE_NULL, dst, TAG, comm, MPI_STATUS_IGNORE) + + subcode: recv_side + $my MPI_Request reqs[WINDOW_SIZE] + $for j=0:WINDOW_SIZE + MPI_Irecv($(data), src, TAG, comm, &reqs[j]) + MPI_Waitall(WINDOW_SIZE, reqs, MPI_STATUSES_IGNORE) + MPI_Send(NULL, 0, MPI_DATATYPE_NULL, src, TAG, comm) diff --git a/test/mpi/bench/p2p_latency.def b/test/mpi/bench/p2p_latency.def new file mode 100644 index 00000000000..3fa94d547eb --- /dev/null +++ b/test/mpi/bench/p2p_latency.def @@ -0,0 +1,18 @@ +include: macros/bench_frame.def +include: macros/bench_p2p.def + +page: p2p_latency, bench_frame + MULTIPLICITY: 2 + data: buf, size, MPI_CHAR + + bench_p2p(comm, 0, 1, buf, 0) + $for int size = 1; size < MAX_BUFSIZE; size *= 2 + bench_p2p(comm, 0, 1, buf, size) + + subcode: send_side + MPI_Send($(data), dst, TAG, comm); + MPI_Recv($(data), dst, TAG, comm, MPI_STATUS_IGNORE); + + subcode: recv_side + MPI_Recv($(data), src, TAG, comm, MPI_STATUS_IGNORE); + MPI_Send($(data), src, TAG, comm); From 6633f0a0015330b7bbf67fe959ab29c6d02e7803 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 1 Oct 2024 16:57:23 -0500 Subject: [PATCH 3/6] autogen: convert mydef code in autogen We could add rules to work directly with mydef code in the Makefile, but converting the code in autogen removes the mydef dependency. Also fix a spelling error. 
--- autogen.sh | 15 +++++++++++++-- test/mpi/bench/.gitignore | 3 +++ test/mpi/bench/autogen.sh | 3 +++ 3 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 test/mpi/bench/.gitignore create mode 100755 test/mpi/bench/autogen.sh diff --git a/autogen.sh b/autogen.sh index 80494232ce4..5178d70ec1e 100755 --- a/autogen.sh +++ b/autogen.sh @@ -65,6 +65,7 @@ do_hydra=yes do_romio=yes do_pmi=yes do_doc=no +do_mydef=yes yaksa_depth= @@ -536,6 +537,14 @@ fn_json_gen() { echo "done" } +fn_mydef() { + MYDEF_BOOT=$PWD/modules/mydef_boot + export PATH=$MYDEF_BOOT/bin:$PATH + export PERL5LIB=$MYDEF_BOOT/lib/perl5 + export MYDEFLIB=$MYDEF_BOOT/lib/MyDef + (cd test/mpi/bench && ./autogen.sh) +} + # internal _patch_libtool() { _file=$1 @@ -731,9 +740,9 @@ EOF echo ">= $ver" else echo "bad autoconf installation" - echo "--- autoreconf diagnositcs ---" + echo "--- autoreconf diagnostics ---" $(cat autoreconf.err) - echo "--- autoreconf diagnositcs ---" + echo "--- autoreconf diagnostics ---" cat < Date: Tue, 1 Oct 2024 10:54:54 -0500 Subject: [PATCH 4/6] test/runtests: add TestBench result check This check does not capture output (thus test results will show in console log) and only checks for exit code - zero means success and nonzero means failure. We'll use this check for benchmark tests. 
--- test/mpi/runtests | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/test/mpi/runtests b/test/mpi/runtests index 237ba5e22f4..c7a98295fe3 100755 --- a/test/mpi/runtests +++ b/test/mpi/runtests @@ -934,6 +934,8 @@ sub get_resultTest { return \&TestStatusNoErrors; } elsif ($resultTest eq "TestErrFatal") { return \&TestErrFatal; + } elsif ($resultTest eq "TestBench") { + return \&TestBench; } else { die "resultTest $resultTest not defined!\n"; } @@ -1112,6 +1114,22 @@ sub TestErrFatal { return ($found_error, $inline); } +# Echo the program's output to STDOUT (it is not captured) and only check the exit code reported by close(): 0 means success, non-zero means failure +sub TestBench { + my ($MPIOUT, $programname) = @_; + my ($found_error, $inline); + + while (<$MPIOUT>) { + print STDOUT $_; + } + my $rc = close($MPIOUT); + if (!$rc) { + expect_status_zero($programname, $?); + $found_error = 1; + } + return ($found_error, $inline); +} + # ---------------------------------------------------------------------------- # Output routines: # OpenOutputs - Open report files and print initial lines From f2add2bed1a812ce02ee9e292abe59a095004b11 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 1 Oct 2024 17:24:41 -0500 Subject: [PATCH 5/6] test/bench: add Makefile and testlist "make testing" in test/mpi/bench should work. 
--- test/mpi/bench/Makefile.am | 17 +++++++++++++++++ test/mpi/bench/testlist | 2 ++ test/mpi/configure.ac | 1 + 3 files changed, 20 insertions(+) create mode 100644 test/mpi/bench/Makefile.am create mode 100644 test/mpi/bench/testlist diff --git a/test/mpi/bench/Makefile.am b/test/mpi/bench/Makefile.am new file mode 100644 index 00000000000..2341c2c9a78 --- /dev/null +++ b/test/mpi/bench/Makefile.am @@ -0,0 +1,17 @@ +## +## Copyright (C) by Argonne National Laboratory +## See COPYRIGHT in top-level directory +## + +include $(top_srcdir)/Makefile_single.mtest +LDADD += -lm + +## for all programs that are just built from the single corresponding source +## file, we don't need per-target _SOURCES rules, automake will infer them +## correctly +noinst_PROGRAMS = \ + p2p_latency \ + p2p_bw + +.def.c: + mydef_page $< diff --git a/test/mpi/bench/testlist b/test/mpi/bench/testlist new file mode 100644 index 00000000000..af73dca792a --- /dev/null +++ b/test/mpi/bench/testlist @@ -0,0 +1,2 @@ +p2p_latency 2 resultTest=TestBench +p2p_bw 2 resultTest=TestBench diff --git a/test/mpi/configure.ac b/test/mpi/configure.ac index 8bf7b30937f..9a7ed7b50dc 100644 --- a/test/mpi/configure.ac +++ b/test/mpi/configure.ac @@ -1904,5 +1904,6 @@ AC_OUTPUT(maint/testmerge \ impls/mpich/ulfm/Makefile \ impls/mpich/info/Makefile \ impls/mpich/info/testlist \ + bench/Makefile \ ) From 99c6adff2aa093b78f97fe73449390320a40dcf1 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Tue, 1 Oct 2024 18:17:18 -0500 Subject: [PATCH 6/6] test/bench: add support for device memory Add device memory support using mtest_common utilities. This will add a dependency on the utility libraries, which the makefile already imports. However, this will remove the simplicity of building a single source file with mpicc or mydef_run. If one doesn't need to test device memory, one can simply comment out "include: macros/mtest.def" to restore the simplicity. 
--- test/mpi/bench/macros/bench_frame.def | 22 ++++++++++++++++++---- test/mpi/bench/macros/mtest.def | 14 ++++++++++++++ test/mpi/bench/p2p_bw.def | 1 + test/mpi/bench/p2p_latency.def | 1 + 4 files changed, 34 insertions(+), 4 deletions(-) create mode 100644 test/mpi/bench/macros/mtest.def diff --git a/test/mpi/bench/macros/bench_frame.def b/test/mpi/bench/macros/bench_frame.def index 7fb78dbc596..47d19c468e8 100644 --- a/test/mpi/bench/macros/bench_frame.def +++ b/test/mpi/bench/macros/bench_frame.def @@ -9,10 +9,16 @@ subcode: bench_frame $include stdio $include stdlib - $include mpi + $(if:HAS_MTEST) + $include mpitest.h + $(else) + $include mpi $function main - MPI_Init(NULL, NULL); + $(if:HAS_MTEST) + MTest_Init(NULL, NULL); + $(else) + MPI_Init(NULL, NULL); $my grank, gsize: int MPI_Comm_rank(MPI_COMM_WORLD, &grank); @@ -23,7 +29,12 @@ subcode: bench_frame return 1 MPI_Comm comm = MPI_COMM_WORLD; - char *buf = malloc(MAX_BUFSIZE) + + $my void *buf + $(if:HAS_MTEST) + $call mtest_malloc, MAX_BUFSIZE + $(else) + buf = malloc(MAX_BUFSIZE) $if !buf printf("! 
Failed to allocate buffer (size=%d)\n", MAX_BUFSIZE) return 1 @@ -35,7 +46,10 @@ subcode: bench_frame $if grank == 0 printf("\n") - MPI_Finalize(); + $(if:HAS_MTEST) + MTest_Finalize(0); + $(else) + MPI_Finalize(); macros: use_double: 1 diff --git a/test/mpi/bench/macros/mtest.def b/test/mpi/bench/macros/mtest.def new file mode 100644 index 00000000000..185a35bf77e --- /dev/null +++ b/test/mpi/bench/macros/mtest.def @@ -0,0 +1,14 @@ +macros: + HAS_MTEST: 1 + +subcode: mtest_malloc(size) + MTestArgList *head = MTestArgListCreate(argc, argv) + int send_rank = 0, recv_rank = 1; + $(for:a in send,recv) + $if grank == $(a)_rank + $my mtest_mem_type_e $(a)_memtype, int $(a)_device + $(a)_memtype = MTestArgListGetMemType(head, "$(a)mem") + $(a)_device = MTestArgListGetInt_with_default(head, "$(a)dev", 0) + MTestMalloc($(size), $(a)_memtype, NULL, &buf, $(a)_device) + MTestPrintfMsg(1, "Allocating buffer: memtype=%s, device=%d, size=%d\n", MTest_memtype_name($(a)_memtype), $(a)_device, $(size)) + MTestArgListDestroy(head) diff --git a/test/mpi/bench/p2p_bw.def b/test/mpi/bench/p2p_bw.def index b0c55e7fc94..ebd5e94ab55 100644 --- a/test/mpi/bench/p2p_bw.def +++ b/test/mpi/bench/p2p_bw.def @@ -1,5 +1,6 @@ include: macros/bench_frame.def include: macros/bench_p2p.def +include: macros/mtest.def subcode: _autoload $define WINDOW_SIZE 64 diff --git a/test/mpi/bench/p2p_latency.def b/test/mpi/bench/p2p_latency.def index 3fa94d547eb..4d18cce0efd 100644 --- a/test/mpi/bench/p2p_latency.def +++ b/test/mpi/bench/p2p_latency.def @@ -1,5 +1,6 @@ include: macros/bench_frame.def include: macros/bench_p2p.def +include: macros/mtest.def page: p2p_latency, bench_frame MULTIPLICITY: 2