From 3f4988377b07d03f23fcece8f7346b3845a39ec2 Mon Sep 17 00:00:00 2001
From: Hui Zhou <hzhou321@anl.gov>
Date: Sat, 1 Apr 2023 10:18:56 -0500
Subject: [PATCH 1/6] modules: add mydef_boot

MyDef provides general templating facilities.
---
 .gitmodules        | 3 +++
 modules/mydef_boot | 1 +
 2 files changed, 4 insertions(+)
 create mode 160000 modules/mydef_boot

diff --git a/.gitmodules b/.gitmodules
index 5fd73e70238..3353b602b74 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -14,3 +14,6 @@
 [submodule "modules/yaksa"]
 	path = modules/yaksa
 	url = https://github.com/pmodels/yaksa
+[submodule "modules/mydef_boot"]
+	path = modules/mydef_boot
+	url = https://github.com/pmodels/mydef_boot
diff --git a/modules/mydef_boot b/modules/mydef_boot
new file mode 160000
index 00000000000..ea2d6852486
--- /dev/null
+++ b/modules/mydef_boot
@@ -0,0 +1 @@
+Subproject commit ea2d6852486755eb12e255f760e2eb62f5446329

From 30f2bbd4389e16e81775aae30b08e779de35ac2e Mon Sep 17 00:00:00 2001
From: Hui Zhou <hzhou321@anl.gov>
Date: Sun, 17 Dec 2023 19:05:38 -0600
Subject: [PATCH 2/6] test/mpi: add p2p benchmarks in test/mpi/bench

Add point-to-point benchmark code in MyDef. The tests warm up
automatically and adjust the number of iterations for measurement
accuracy. They report latency measurements with standard deviations and
equivalent bandwidths.

MYDEF_BOOT=[topsrc_dir]/modules/mydef_boot
export PATH=$MYDEF_BOOT/bin:$PATH
export PERL5LIB=$MYDEF_BOOT/lib/perl5
export MYDEFLIB=$MYDEF_BOOT/lib/MyDef

To run:
    mydef_page p2p_latency.def  # -> p2p_latency.c
    mpicc p2p_latency.c && mpirun -n 2 ./a.out

Alternatively use mydef_run (uses settings from config):
    mydef_run p2p_latency.def

Next commit will add "make testing".
---
 test/mpi/bench/config                 |  3 +
 test/mpi/bench/macros/bench_frame.def | 93 +++++++++++++++++++++++++++
 test/mpi/bench/macros/bench_p2p.def   | 79 +++++++++++++++++++++++
 test/mpi/bench/p2p_bw.def             | 26 ++++++++
 test/mpi/bench/p2p_latency.def        | 18 ++++++
 5 files changed, 219 insertions(+)
 create mode 100644 test/mpi/bench/config
 create mode 100644 test/mpi/bench/macros/bench_frame.def
 create mode 100644 test/mpi/bench/macros/bench_p2p.def
 create mode 100644 test/mpi/bench/p2p_bw.def
 create mode 100644 test/mpi/bench/p2p_latency.def

diff --git a/test/mpi/bench/config b/test/mpi/bench/config
new file mode 100644
index 00000000000..418c69e8a98
--- /dev/null
+++ b/test/mpi/bench/config
@@ -0,0 +1,3 @@
+module: c
+CC: mpicc
+run: mpirun -n 2
diff --git a/test/mpi/bench/macros/bench_frame.def b/test/mpi/bench/macros/bench_frame.def
new file mode 100644
index 00000000000..7fb78dbc596
--- /dev/null
+++ b/test/mpi/bench/macros/bench_frame.def
@@ -0,0 +1,93 @@
+/*
+ * bench_frame       : boilerplate for mpi program
+ * measure(iter)     : measures `tf_dur` for $(iter) iterations
+ * run_stat(N, var)  : run N measurements and obtain (avg, std) in sum1, sum2
+ * warm_up(iter, dur): repeat until measurements (iter, dur) stabilize
+ * report_latency(msgsize, MULTIPLICITY) : print a line of latency result
+ */
+
+subcode: bench_frame
+    $include stdio
+    $include stdlib
+    $include mpi
+
+    $function main
+        MPI_Init(NULL, NULL);
+
+        $my grank, gsize: int
+        MPI_Comm_rank(MPI_COMM_WORLD, &grank);
+        MPI_Comm_size(MPI_COMM_WORLD, &gsize);
+        $(if:MIN_PROCS)
+            $if gsize < $(MIN_PROCS)
+                printf("! Test $(_pagename) requires $(MIN_PROCS) processes !\n");
+                return 1
+
+        MPI_Comm comm = MPI_COMM_WORLD;
+        char *buf = malloc(MAX_BUFSIZE)
+        $if !buf
+            printf("! Failed to allocate buffer (size=%d)\n", MAX_BUFSIZE)
+            return 1
+
+        $if grank == 0
+            printf("TEST $(_pagename):\n")
+            $call @report_header
+        $call main
+        $if grank == 0
+            printf("\n")
+
+        MPI_Finalize();
+
+macros:
+    use_double: 1
+
+#----------------------------------------
+subcode: _autoload
+    $register_prefix(comm) MPI_Comm
+
+subcode: foreach_size
+    $for int size = 0; size < $(MAX_MSG); size = (size==0)?1:size*2
+        $(set:MSG_SIZE=size)
+        BLOCK
+
+subcode: measure(iter)
+    tf_start = MPI_Wtime()
+    $for 0:$(iter)
+        BLOCK
+    tf_dur = MPI_Wtime() - tf_start
+
+subcode: run_stat(N, var)
+    $my double sum1=0, double sum2=0
+    $for 0:$(N)
+        BLOCK
+        sum1 += $(var)
+        sum2 += $(var) * $(var)
+    sum1 /= $(N)
+    sum2 /= $(N)
+    sum2 = sqrt(sum2 - sum1 * sum1)
+
+subcode: warm_up(iter, dur)
+    $(set:MIN_ITER=(int) ($(iter) * 0.001 / $(dur)))
+    $(iter) = 2
+    $my double last_dur = 1.0
+    $my int num_best = 0
+    $while num_best < 10
+        BLOCK
+        $if $(iter) < $(MIN_ITER)
+            $(iter) = $(MIN_ITER)
+            num_best = 0
+            continue
+        # check that the measured duration is no longer monotonically decreasing
+        $if $(dur) > last_dur
+            num_best++
+        last_dur = $(dur)
+
+subcode: header_latency
+    printf("%12s %10s(us) %6s(us) %12s(MB/s)\n", "msgsize", "latency", "sigma", "bandwidth")
+
+subcode: report_latency(MSGSIZE, MULTIPLICITY)
+    $my tf_latency, tf_sigma, tf_bw
+    tf_latency = sum1 / ($(MULTIPLICITY)) * 1e6
+    tf_sigma = sum2 / ($(MULTIPLICITY)) * 1e6
+    tf_bw = $(MSGSIZE) / tf_latency
+    printf("%12d %10.3f     %6.3f     %12.3f\n", $(MSGSIZE), tf_latency, tf_sigma, tf_bw)
+
diff --git a/test/mpi/bench/macros/bench_p2p.def b/test/mpi/bench/macros/bench_p2p.def
new file mode 100644
index 00000000000..742a73acfc3
--- /dev/null
+++ b/test/mpi/bench/macros/bench_p2p.def
@@ -0,0 +1,79 @@
+/*
+ * Defines following functions:
+ *   bench_p2p
+ *       bench_send, bench_warmup
+ *       bench_recv
+ *
+ * For each measurement -
+ *    First sender tells receiver the `iter` parameter. `iter = 0` means to quit.
+ *    Each iteration runs `send_side` on the sender and `recv_side` on the receiver; the time measured on the sender side is taken as the latency measurement.
+ *
+ * Caller page defines -
+ *     subcode: send_side, recv_side
+ *     macro:
+ *         MULTIPLICITY: divisor for each measurement
+ */
+
+macros:
+    MIN_PROCS: 2
+    MAX_BUFSIZE: 5000000  # 5 MB
+
+subcode: _autoload
+    $register_name(src) int
+    $register_name(dst) int
+    $register_name(buf) void *
+    $register_name(size) int
+    $define TAG 0
+    $define SYNC_TAG 100
+    $define MAX_BUFSIZE 5000000
+    $define NUM_REPEAT 20
+
+subcode: report_header
+        $call header_latency
+
+fncode: bench_p2p(comm, src, dst, buf, size)
+    int rank;
+    MPI_Comm_rank(comm, &rank)
+
+    $(if:!MULTIPLICITY)
+        $(set:MULTIPLICITY=1)
+
+    $if rank == src
+        iter = bench_warmup(comm, dst, buf, size)
+        &call run_stat, NUM_REPEAT, tf_latency
+            tf_latency = bench_send(iter, comm, dst, buf, size)
+            tf_latency /= iter
+        $call report_latency, size, $(MULTIPLICITY)
+        $call send_stop
+    $elif rank == dst
+        bench_recv(comm, src, buf, size)
+
+    subcode: send_stop
+        iter = 0;
+        MPI_Send(&iter, 1, MPI_INT, dst, SYNC_TAG, comm)
+
+#---------------------------------------- 
+fncode: bench_send(int iter, comm, dst, buf, size)
+    # synchronize with receiver
+    MPI_Send(&iter, 1, MPI_INT, dst, SYNC_TAG, comm);
+
+    &call measure, iter
+        $call @send_side
+
+    return tf_dur
+
+fncode: bench_recv(comm, src, buf, size)
+    $while 1
+        int iter;
+        # synchronize with sender
+        MPI_Recv(&iter, 1, MPI_INT, src, SYNC_TAG, comm, MPI_STATUS_IGNORE);
+        $if iter == 0
+            # time to quit
+            break
+        $for i=0:iter
+            $call @recv_side
+
+fncode: bench_warmup(comm, dst, buf, size): int
+    &call warm_up, iter, tf_dur
+        tf_dur = bench_send(iter, comm, dst, buf, size)
+    return iter
diff --git a/test/mpi/bench/p2p_bw.def b/test/mpi/bench/p2p_bw.def
new file mode 100644
index 00000000000..b0c55e7fc94
--- /dev/null
+++ b/test/mpi/bench/p2p_bw.def
@@ -0,0 +1,26 @@
+include: macros/bench_frame.def
+include: macros/bench_p2p.def
+
+subcode: _autoload
+    $define WINDOW_SIZE 64
+
+page: p2p_bw, bench_frame
+    MULTIPLICITY: WINDOW_SIZE
+    data: buf, size, MPI_CHAR
+
+    $for int size = 1; size < MAX_BUFSIZE; size *= 2
+        bench_p2p(comm, 0, 1, buf, size)
+
+    subcode: send_side
+        $my MPI_Request reqs[WINDOW_SIZE]
+        $for j=0:WINDOW_SIZE
+            MPI_Isend($(data), dst, TAG, comm, &reqs[j])
+        MPI_Waitall(WINDOW_SIZE, reqs, MPI_STATUSES_IGNORE)
+        MPI_Recv(NULL, 0, MPI_CHAR, dst, TAG, comm, MPI_STATUS_IGNORE)
+
+    subcode: recv_side
+        $my MPI_Request reqs[WINDOW_SIZE]
+        $for j=0:WINDOW_SIZE
+            MPI_Irecv($(data), src, TAG, comm, &reqs[j])
+        MPI_Waitall(WINDOW_SIZE, reqs, MPI_STATUSES_IGNORE)
+        MPI_Send(NULL, 0, MPI_CHAR, src, TAG, comm)
diff --git a/test/mpi/bench/p2p_latency.def b/test/mpi/bench/p2p_latency.def
new file mode 100644
index 00000000000..3fa94d547eb
--- /dev/null
+++ b/test/mpi/bench/p2p_latency.def
@@ -0,0 +1,18 @@
+include: macros/bench_frame.def
+include: macros/bench_p2p.def
+
+page: p2p_latency, bench_frame
+    MULTIPLICITY: 2
+    data: buf, size, MPI_CHAR
+
+    bench_p2p(comm, 0, 1, buf, 0)
+    $for int size = 1; size < MAX_BUFSIZE; size *= 2
+        bench_p2p(comm, 0, 1, buf, size)
+
+    subcode: send_side
+        MPI_Send($(data), dst, TAG, comm);
+        MPI_Recv($(data), dst, TAG, comm, MPI_STATUS_IGNORE);
+
+    subcode: recv_side
+        MPI_Recv($(data), src, TAG, comm, MPI_STATUS_IGNORE);
+        MPI_Send($(data), src, TAG, comm);

From 6633f0a0015330b7bbf67fe959ab29c6d02e7803 Mon Sep 17 00:00:00 2001
From: Hui Zhou <hzhou321@anl.gov>
Date: Tue, 1 Oct 2024 16:57:23 -0500
Subject: [PATCH 3/6] autogen: convert mydef code in autogen

We could add rules that work with mydef code directly in the Makefile,
but converting the code in autogen removes the mydef dependency.

Also fix a spelling error.
---
 autogen.sh                | 15 +++++++++++++--
 test/mpi/bench/.gitignore |  3 +++
 test/mpi/bench/autogen.sh |  3 +++
 3 files changed, 19 insertions(+), 2 deletions(-)
 create mode 100644 test/mpi/bench/.gitignore
 create mode 100755 test/mpi/bench/autogen.sh

diff --git a/autogen.sh b/autogen.sh
index 80494232ce4..5178d70ec1e 100755
--- a/autogen.sh
+++ b/autogen.sh
@@ -65,6 +65,7 @@ do_hydra=yes
 do_romio=yes
 do_pmi=yes
 do_doc=no
+do_mydef=yes
 
 yaksa_depth=
 
@@ -536,6 +537,14 @@ fn_json_gen() {
     echo "done"
 }
 
+fn_mydef() {
+    MYDEF_BOOT="$PWD/modules/mydef_boot"
+    export PATH="$MYDEF_BOOT/bin:$PATH"
+    export PERL5LIB="$MYDEF_BOOT/lib/perl5"
+    export MYDEFLIB="$MYDEF_BOOT/lib/MyDef"
+    (cd test/mpi/bench && ./autogen.sh)  # runs mydef_page on each *.def there
+}
+
 # internal
 _patch_libtool() {
     _file=$1
@@ -731,9 +740,9 @@ EOF
             echo ">= $ver"
         else
             echo "bad autoconf installation"
-            echo "--- autoreconf diagnositcs ---"
+            echo "--- autoreconf diagnostics ---"
             $(cat autoreconf.err)
-            echo "--- autoreconf diagnositcs ---"
+            echo "--- autoreconf diagnostics ---"
             cat <<EOF
 You either do not have autoconf in your path or it is too old (version
 $ver or higher required). You may be able to use
@@ -1102,3 +1111,5 @@ fn_build_configure
 fn_ch4_api
 
 fn_json_gen
+
+fn_mydef
diff --git a/test/mpi/bench/.gitignore b/test/mpi/bench/.gitignore
new file mode 100644
index 00000000000..611b6ea6684
--- /dev/null
+++ b/test/mpi/bench/.gitignore
@@ -0,0 +1,3 @@
+/*.c
+/p2p_bw
+/p2p_latency
diff --git a/test/mpi/bench/autogen.sh b/test/mpi/bench/autogen.sh
new file mode 100755
index 00000000000..4ba4f265f3c
--- /dev/null
+++ b/test/mpi/bench/autogen.sh
@@ -0,0 +1,3 @@
+#! /bin/sh
+# Run mydef_page on every MyDef page in this directory; stop at the first failure.
+for a in *.def ; do mydef_page "$a" || exit 1 ; done

From e4d96f828e25746a6a2ab4bf7b66b63a4d1df37d Mon Sep 17 00:00:00 2001
From: Hui Zhou <hzhou321@anl.gov>
Date: Tue, 1 Oct 2024 10:54:54 -0500
Subject: [PATCH 4/6] test/runtests: add TestBench result check

This check does not capture output (thus test results will show in
console log) and only checks for exit code - zero means success and
nonzero means failure.

We'll use this check for benchmark tests.
---
 test/mpi/runtests | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/test/mpi/runtests b/test/mpi/runtests
index 237ba5e22f4..c7a98295fe3 100755
--- a/test/mpi/runtests
+++ b/test/mpi/runtests
@@ -934,6 +934,8 @@ sub get_resultTest {
         return \&TestStatusNoErrors;
     } elsif ($resultTest eq "TestErrFatal") {
         return \&TestErrFatal;
+    } elsif ($resultTest eq "TestBench") {
+        return \&TestBench;
     } else {
         die "resultTest $resultTest not defined!\n";
     }
@@ -1112,6 +1114,22 @@ sub TestErrFatal {
     return ($found_error, $inline);
 }
 
+# Echo the program's output to STDOUT; only the exit code decides pass (0) or fail (non-zero)
+sub TestBench {
+    my ($MPIOUT, $programname) = @_;
+    my ($found_error, $inline) = (0, "");
+
+    while (<$MPIOUT>) {
+        print STDOUT $_;
+    }
+    my $rc = close($MPIOUT);
+    if (!$rc) {
+        expect_status_zero($programname, $?);
+        $found_error = 1;
+    }
+    return ($found_error, $inline);
+}
+
 # ----------------------------------------------------------------------------
 # Output routines:
 #  OpenOutputs               - Open report files and print initial lines

From f2add2bed1a812ce02ee9e292abe59a095004b11 Mon Sep 17 00:00:00 2001
From: Hui Zhou <hzhou321@anl.gov>
Date: Tue, 1 Oct 2024 17:24:41 -0500
Subject: [PATCH 5/6] test/bench: add Makefile and testlist

"make testing" in test/mpi/bench should work.
---
 test/mpi/bench/Makefile.am | 17 +++++++++++++++++
 test/mpi/bench/testlist    |  2 ++
 test/mpi/configure.ac      |  1 +
 3 files changed, 20 insertions(+)
 create mode 100644 test/mpi/bench/Makefile.am
 create mode 100644 test/mpi/bench/testlist

diff --git a/test/mpi/bench/Makefile.am b/test/mpi/bench/Makefile.am
new file mode 100644
index 00000000000..2341c2c9a78
--- /dev/null
+++ b/test/mpi/bench/Makefile.am
@@ -0,0 +1,17 @@
+##
+## Copyright (C) by Argonne National Laboratory
+##     See COPYRIGHT in top-level directory
+##
+
+include $(top_srcdir)/Makefile_single.mtest
+LDADD += -lm
+
+## for all programs that are just built from the single corresponding source
+## file, we don't need per-target _SOURCES rules, automake will infer them
+## correctly
+noinst_PROGRAMS = \
+    p2p_latency \
+    p2p_bw
+
+.def.c:
+	mydef_page $<
diff --git a/test/mpi/bench/testlist b/test/mpi/bench/testlist
new file mode 100644
index 00000000000..af73dca792a
--- /dev/null
+++ b/test/mpi/bench/testlist
@@ -0,0 +1,2 @@
+p2p_latency 2 resultTest=TestBench
+p2p_bw 2 resultTest=TestBench
diff --git a/test/mpi/configure.ac b/test/mpi/configure.ac
index 8bf7b30937f..9a7ed7b50dc 100644
--- a/test/mpi/configure.ac
+++ b/test/mpi/configure.ac
@@ -1904,5 +1904,6 @@ AC_OUTPUT(maint/testmerge \
           impls/mpich/ulfm/Makefile \
           impls/mpich/info/Makefile \
           impls/mpich/info/testlist \
+          bench/Makefile \
           )
 

From 99c6adff2aa093b78f97fe73449390320a40dcf1 Mon Sep 17 00:00:00 2001
From: Hui Zhou <hzhou321@anl.gov>
Date: Tue, 1 Oct 2024 18:17:18 -0500
Subject: [PATCH 6/6] test/bench: add support for device memory

Add device memory support using mtest_common utilities. This adds a
dependency on the utility libraries, which the makefile already
imports.

However, this removes the simplicity of building a single source
with mpicc or mydef_run. If one doesn't need to test device memory,
one can simply comment out "include: macros/mtest.def" to restore the
simplicity.
---
 test/mpi/bench/macros/bench_frame.def | 22 ++++++++++++++++++----
 test/mpi/bench/macros/mtest.def       | 14 ++++++++++++++
 test/mpi/bench/p2p_bw.def             |  1 +
 test/mpi/bench/p2p_latency.def        |  1 +
 4 files changed, 34 insertions(+), 4 deletions(-)
 create mode 100644 test/mpi/bench/macros/mtest.def

diff --git a/test/mpi/bench/macros/bench_frame.def b/test/mpi/bench/macros/bench_frame.def
index 7fb78dbc596..47d19c468e8 100644
--- a/test/mpi/bench/macros/bench_frame.def
+++ b/test/mpi/bench/macros/bench_frame.def
@@ -9,10 +9,16 @@
 subcode: bench_frame
     $include stdio
     $include stdlib
-    $include mpi
+    $(if:HAS_MTEST)
+        $include mpitest.h
+    $(else)
+        $include mpi
 
     $function main
-        MPI_Init(NULL, NULL);
+        $(if:HAS_MTEST)
+            MTest_Init(NULL, NULL);
+        $(else)
+            MPI_Init(NULL, NULL);
 
         $my grank, gsize: int
         MPI_Comm_rank(MPI_COMM_WORLD, &grank);
@@ -23,7 +29,12 @@ subcode: bench_frame
                 return 1
 
         MPI_Comm comm = MPI_COMM_WORLD;
-        char *buf = malloc(MAX_BUFSIZE)
+
+        $my void *buf
+        $(if:HAS_MTEST)
+            $call mtest_malloc, MAX_BUFSIZE
+        $(else)
+            buf = malloc(MAX_BUFSIZE)
         $if !buf
             printf("! Failed to allocate buffer (size=%d)\n", MAX_BUFSIZE)
             return 1
@@ -35,7 +46,10 @@ subcode: bench_frame
         $if grank == 0
             printf("\n")
 
-        MPI_Finalize();
+        $(if:HAS_MTEST)
+            MTest_Finalize(0);
+        $(else)
+            MPI_Finalize();
 
 macros:
     use_double: 1
diff --git a/test/mpi/bench/macros/mtest.def b/test/mpi/bench/macros/mtest.def
new file mode 100644
index 00000000000..185a35bf77e
--- /dev/null
+++ b/test/mpi/bench/macros/mtest.def
@@ -0,0 +1,14 @@
+macros:
+    HAS_MTEST: 1
+
+subcode: mtest_malloc(size)
+    MTestArgList *head = MTestArgListCreate(argc, argv)
+    int send_rank = 0, recv_rank = 1;
+    $(for:a in send,recv)
+        $if grank == $(a)_rank
+            $my mtest_mem_type_e $(a)_memtype, int $(a)_device
+            $(a)_memtype = MTestArgListGetMemType(head, "$(a)mem")
+            $(a)_device = MTestArgListGetInt_with_default(head, "$(a)dev", 0)
+            MTestMalloc($(size), $(a)_memtype, NULL, &buf, $(a)_device)
+            MTestPrintfMsg(1, "Allocating buffer: memtype=%s, device=%d, size=%d\n", MTest_memtype_name($(a)_memtype), $(a)_device, $(size))
+    MTestArgListDestroy(head)
diff --git a/test/mpi/bench/p2p_bw.def b/test/mpi/bench/p2p_bw.def
index b0c55e7fc94..ebd5e94ab55 100644
--- a/test/mpi/bench/p2p_bw.def
+++ b/test/mpi/bench/p2p_bw.def
@@ -1,5 +1,6 @@
 include: macros/bench_frame.def
 include: macros/bench_p2p.def
+include: macros/mtest.def
 
 subcode: _autoload
     $define WINDOW_SIZE 64
diff --git a/test/mpi/bench/p2p_latency.def b/test/mpi/bench/p2p_latency.def
index 3fa94d547eb..4d18cce0efd 100644
--- a/test/mpi/bench/p2p_latency.def
+++ b/test/mpi/bench/p2p_latency.def
@@ -1,5 +1,6 @@
 include: macros/bench_frame.def
 include: macros/bench_p2p.def
+include: macros/mtest.def
 
 page: p2p_latency, bench_frame
     MULTIPLICITY: 2