Merge pull request #6907 from hzhou/2311_bench

test: add p2p benchmark code Approved-by: Ken Raffenetti
pmodels · Oct 2, 2024 · 1f359fe · 1f359fe
2 parents 9c907a4 + 99c6adf
commit 1f359fe
Show file tree

Hide file tree

Showing 15 changed files with 310 additions and 2 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -14,3 +14,6 @@
 [submodule "modules/yaksa"]
 	path = modules/yaksa
 	url = https://github.com/pmodels/yaksa
+[submodule "modules/mydef_boot"]
+	path = modules/mydef_boot
+	url = https://github.com/pmodels/mydef_boot
diff --git a/autogen.sh b/autogen.sh
@@ -65,6 +65,7 @@ do_hydra=yes
 do_romio=yes
 do_pmi=yes
 do_doc=no
+do_mydef=yes
 
 yaksa_depth=
 
@@ -536,6 +537,14 @@ fn_json_gen() {
     echo "done"
 }
 
+fn_mydef() {
+    MYDEF_BOOT=$PWD/modules/mydef_boot
+    export PATH=$MYDEF_BOOT/bin:$PATH
+    export PERL5LIB=$MYDEF_BOOT/lib/perl5
+    export MYDEFLIB=$MYDEF_BOOT/lib/MyDef
+    (cd test/mpi/bench && ./autogen.sh)
+}
+
 # internal
 _patch_libtool() {
     _file=$1
@@ -731,9 +740,9 @@ EOF
             echo ">= $ver"
         else
             echo "bad autoconf installation"
-            echo "--- autoreconf diagnositcs ---"
+            echo "--- autoreconf diagnostics ---"
             $(cat autoreconf.err)
-            echo "--- autoreconf diagnositcs ---"
+            echo "--- autoreconf diagnostics ---"
             cat <<EOF
 You either do not have autoconf in your path or it is too old (version
 $ver or higher required). You may be able to use
@@ -1102,3 +1111,5 @@ fn_build_configure
 fn_ch4_api
 
 fn_json_gen
+
+fn_mydef
diff --git a/modules/mydef_boot b/modules/mydef_boot
diff --git a/test/mpi/bench/.gitignore b/test/mpi/bench/.gitignore
@@ -0,0 +1,3 @@
+/*.c
+/p2p_bw
+/p2p_latency
diff --git a/test/mpi/bench/Makefile.am b/test/mpi/bench/Makefile.am
@@ -0,0 +1,17 @@
+##
+## Copyright (C) by Argonne National Laboratory
+##     See COPYRIGHT in top-level directory
+##
+
+include $(top_srcdir)/Makefile_single.mtest
+## libm is needed because the generated benchmark code calls sqrt()
+## (see run_stat in macros/bench_frame.def)
+LDADD += -lm
+
+## for all programs that are just built from the single corresponding source
+## file, we don't need per-target _SOURCES rules, automake will infer them
+## correctly
+noinst_PROGRAMS = \
+    p2p_latency \
+    p2p_bw
+
+## suffix rule: produce a .c file from the MyDef .def source of the same
+## name by running mydef_page on it (mydef_page must be on PATH)
+.def.c:
+	mydef_page $<
diff --git a/test/mpi/bench/autogen.sh b/test/mpi/bench/autogen.sh
@@ -0,0 +1,3 @@
+for a in *.def ; do
+    mydef_page $a
+done
diff --git a/test/mpi/bench/config b/test/mpi/bench/config
@@ -0,0 +1,3 @@
+module: c
+CC: mpicc
+run: mpirun -n 2
diff --git a/test/mpi/bench/macros/bench_frame.def b/test/mpi/bench/macros/bench_frame.def
@@ -0,0 +1,107 @@
+/*
+ * bench_frame       : boilerplate for mpi program
+ * measure(iter)     : measures `tf_dur` for $(iter) iterations
+ * run_stat(N, var)  : run N measurements and obtain (avg, std) in sum1, sum2
+ * warm_up(iter, dur): repeat until measurements (iter, dur) stabilize
+ * report_latency(msgsize, MULTIPLICITY) : print a line of latency result
+ */
+
+# bench_frame: page frame that lays out the whole benchmark program.
+# It includes the headers, defines main(), initializes MPI (through MTest
+# when HAS_MTEST is set), checks the process count against MIN_PROCS when
+# that macro is set, allocates `buf` with size MAX_BUFSIZE, prints a title
+# and header on rank 0, runs `$call main`, and finalizes.
+# NOTE(review): both `return 1` error paths leave main() without calling
+# the finalize routine.
+subcode: bench_frame
+    $include stdio
+    $include stdlib
+    $(if:HAS_MTEST)
+        $include mpitest.h
+    $(else)
+        $include mpi
+
+    $function main
+        $(if:HAS_MTEST)
+            MTest_Init(NULL, NULL);
+        $(else)
+            MPI_Init(NULL, NULL);
+
+        $my grank, gsize: int
+        MPI_Comm_rank(MPI_COMM_WORLD, &grank);
+        MPI_Comm_size(MPI_COMM_WORLD, &gsize);
+        $(if:MIN_PROCS)
+            $if gsize < $(MIN_PROCS)
+                printf("! Test $(_pagename) requires $(MIN_PROCS) processes !\n");
+                return 1
+
+        MPI_Comm comm = MPI_COMM_WORLD;
+
+        # single buffer used by all measurements
+        $my void *buf
+        $(if:HAS_MTEST)
+            $call mtest_malloc, MAX_BUFSIZE
+        $(else)
+            buf = malloc(MAX_BUFSIZE)
+        $if !buf
+            printf("! Failed to allocate buffer (size=%d)\n", MAX_BUFSIZE)
+            return 1
+
+        # only rank 0 prints the title and the column header
+        $if grank == 0
+            printf("TEST $(_pagename):\n")
+            $call @report_header
+        # presumably expands the including page's own body -- confirm
+        $call main
+        $if grank == 0
+            printf("\n")
+
+        $(if:HAS_MTEST)
+            MTest_Finalize(0);
+        $(else)
+            MPI_Finalize();
+
+macros:
+    use_double: 1
+
+#----------------------------------------
+subcode: _autoload
+    $register_prefix(comm) MPI_Comm
+
+# foreach_size: run the caller's block for size = 0, 1, 2, 4, ... while
+# size < MAX_MSG, with the macro MSG_SIZE set to the loop variable.
+# NOTE(review): MAX_MSG is not defined in this file -- the caller must set it.
+subcode: foreach_size
+    $for int size = 0; size < $(MAX_MSG); size = (size==0)?1:size*2
+        $(set:MSG_SIZE=size)
+        BLOCK
+
+# measure(iter): time `iter` back-to-back executions of the caller's block
+# with MPI_Wtime and leave the total elapsed time (not the per-iteration
+# time) in tf_dur.
+subcode: measure(iter)
+    tf_start = MPI_Wtime()
+    $for 0:$(iter)
+        BLOCK
+    tf_dur = MPI_Wtime() - tf_start
+
+# run_stat(N, var): execute the caller's block N times; the block is
+# expected to set `var` each time. Afterwards sum1 holds the mean of `var`
+# and sum2 its standard deviation, computed as sqrt(mean(x*x) - mean(x)^2).
+subcode: run_stat(N, var)
+    $my double sum1=0, double sum2=0
+    $for 0:$(N)
+        BLOCK
+        sum1 += $(var)
+        sum2 += $(var) * $(var)
+    sum1 /= $(N)
+    sum2 /= $(N)
+    sum2 = sum2 - sum1 * sum1
+    # rounding can push the variance of near-identical samples slightly
+    # below zero; clamp it so sqrt() does not produce NaN
+    $if sum2 < 0
+        sum2 = 0
+    sum2 = sqrt(sum2)
+
+# warm_up(iter, dur): pick an iteration count and repeat until timings settle.
+# The caller's block must run one measurement using `iter` iterations and
+# store its duration in `dur`. MIN_ITER scales the current iteration count
+# by 0.001 / dur (presumably aiming at about 1 ms per measurement if `dur`
+# is in seconds -- confirm). The loop stops once a measurement has come out
+# longer than the previous one 10 times.
+# NOTE(review): a measured duration of exactly 0 would divide by zero here.
+subcode: warm_up(iter, dur)
+    $(set:MIN_ITER=(int) ($(iter) * 0.001 / $(dur)))
+    $(iter) = 2
+    $my double last_dur = 1.0
+    $my int num_best = 0
+    $while num_best < 10
+        BLOCK
+        # too few iterations: raise the count and restart the tally
+        $if $(iter) < $(MIN_ITER)
+            $(iter) = $(MIN_ITER)
+            num_best = 0
+            continue
+        # check that the measured duration is no longer monotonically decreasing
+        $if $(dur) > last_dur
+            num_best++
+        last_dur = $(dur)
+
+# header_latency: print the column titles (msgsize, latency, sigma,
+# bandwidth) for the rows printed by report_latency.
+subcode: header_latency
+    printf("%12s %10s(us) %6s(us) %12s(MB/s)\n", "msgsize", "latency", "sigma", "bandwidth")
+
+# report_latency(MSGSIZE, MULTIPLICITY): print one result row. It reads
+# sum1 and sum2 (the values run_stat leaves behind), divides each by
+# MULTIPLICITY and scales by 1e6; bandwidth is MSGSIZE / latency.
+# NOTE(review): the us and MB/s labels in header_latency assume sum1/sum2
+# are in seconds and MSGSIZE is in bytes -- confirm against the caller.
+subcode: report_latency(MSGSIZE, MULTIPLICITY)
+    $my tf_latency, tf_sigma, tf_bw
+    tf_latency = sum1 / ($(MULTIPLICITY)) * 1e6
+    tf_sigma = sum2 / ($(MULTIPLICITY)) * 1e6
+    tf_bw = $(MSGSIZE) / tf_latency
+    printf("%12d %10.3f     %6.3f     %12.3f\n", $(MSGSIZE), tf_latency, tf_sigma, tf_bw)
+
diff --git a/test/mpi/bench/macros/bench_p2p.def b/test/mpi/bench/macros/bench_p2p.def
@@ -0,0 +1,79 @@
+/*
+ * Defines following functions:
+ *   bench_p2p
+ *       bench_send, bench_warmup
+ *       bench_recv
+ *
+ * For each measurement -
+ *    First sender tells receiver the `iter` parameter. `iter = 0` means to quit.
+ *    For each iteration runs `send_side` and `recv_side` assuming the measurement on sender side represents a latency measurement.
+ * 
+ * Caller page defines -
+ *     subcode: send_side, recv_side
+ *     macro:
+ *         MULTIPLICITY: divisor for each measurement
+ */
+
+macros:
+    MIN_PROCS: 2
+    MAX_BUFSIZE: 5000000  # 5 MB
+
+# _autoload: declarations shared by the functions in this file -- the C
+# types associated with the names src, dst, buf and size, plus the
+# constants used by the benchmark.
+# NOTE(review): MAX_BUFSIZE is also set in the macros: section of this
+# file; keep the two values in sync.
+subcode: _autoload
+    $register_name(src) int
+    $register_name(dst) int
+    $register_name(buf) void *
+    $register_name(size) int
+    $define TAG 0
+    $define SYNC_TAG 100
+    $define MAX_BUFSIZE 5000000
+    $define NUM_REPEAT 20
+
+subcode: report_header
+        $call header_latency
+
+# bench_p2p: run one measurement between ranks src and dst of comm for the
+# given message size. The src rank warms up to choose `iter`, takes
+# NUM_REPEAT timed measurements (each normalized to per-iteration time),
+# prints a result row, and then tells the receiver to stop. The dst rank
+# runs bench_recv. Any other rank does nothing.
+# MULTIPLICITY defaults to 1 when the page does not define it.
+fncode: bench_p2p(comm, src, dst, buf, size)
+    int rank;
+    MPI_Comm_rank(comm, &rank)
+
+    $(if:!MULTIPLICITY)
+        $(set:MULTIPLICITY=1)
+
+    $if rank == src
+        iter = bench_warmup(comm, dst, buf, size)
+        &call run_stat, NUM_REPEAT, tf_latency
+            tf_latency = bench_send(iter, comm, dst, buf, size)
+            tf_latency /= iter
+        $call report_latency, size, $(MULTIPLICITY)
+        $call send_stop
+    $elif rank == dst
+        bench_recv(comm, src, buf, size)
+
+    subcode: send_stop
+        iter = 0;
+        MPI_Send(&iter, 1, MPI_INT, dst, SYNC_TAG, comm)
+
+#---------------------------------------- 
+# bench_send: sender half of one measurement. Sends `iter` to dst on
+# SYNC_TAG so the receiver knows how many iterations to run, then times
+# `iter` executions of the page's send_side code and returns the total
+# elapsed time (tf_dur).
+fncode: bench_send(int iter, comm, dst, buf, size)
+    # synchronize with receiver
+    MPI_Send(&iter, 1, MPI_INT, dst, SYNC_TAG, comm);
+
+    &call measure, iter
+        $call @send_side
+
+    return tf_dur
+
+# bench_recv: receiver half. Repeatedly receives an iteration count from
+# src on SYNC_TAG and runs the page's recv_side code that many times; a
+# count of 0 ends the loop.
+fncode: bench_recv(comm, src, buf, size)
+    $while 1
+        int iter;
+        # synchronize with sender
+        MPI_Recv(&iter, 1, MPI_INT, src, SYNC_TAG, comm, MPI_STATUS_IGNORE);
+        $if iter == 0
+            # time to quit
+            break
+        $for i=0:iter
+            $call @recv_side
+
+# bench_warmup: sender-side warm-up. Runs the warm_up loop with bench_send
+# as the measurement and returns the iteration count it settles on.
+fncode: bench_warmup(comm, dst, buf, size): int
+    &call warm_up, iter, tf_dur
+        tf_dur = bench_send(iter, comm, dst, buf, size)
+    return iter
diff --git a/test/mpi/bench/macros/mtest.def b/test/mpi/bench/macros/mtest.def
@@ -0,0 +1,14 @@
+macros:
+    HAS_MTEST: 1
+
+# mtest_malloc(size): allocate `buf` through the MTest helpers. Rank 0
+# (send_rank) and rank 1 (recv_rank) each read a memory type from the
+# "sendmem" / "recvmem" argument and a device id from "senddev" / "recvdev"
+# (default 0), then call MTestMalloc. Uses argc, argv, grank and buf from
+# the enclosing code.
+# NOTE(review): ranks other than 0 and 1 never assign buf here.
+subcode: mtest_malloc(size)
+    MTestArgList *head = MTestArgListCreate(argc, argv)
+    int send_rank = 0, recv_rank = 1;
+    $(for:a in send,recv)
+        $if grank == $(a)_rank
+            $my mtest_mem_type_e $(a)_memtype, int $(a)_device
+            $(a)_memtype = MTestArgListGetMemType(head, "$(a)mem")
+            $(a)_device = MTestArgListGetInt_with_default(head, "$(a)dev", 0)
+            MTestMalloc($(size), $(a)_memtype, NULL, &buf, $(a)_device)
+            MTestPrintfMsg(1, "Allocating buffer: memtype=%s, device=%d, size=%d\n", MTest_memtype_name($(a)_memtype), $(a)_device, $(size))
+    MTestArgListDestroy(head)
diff --git a/test/mpi/bench/p2p_bw.def b/test/mpi/bench/p2p_bw.def
@@ -0,0 +1,27 @@
+include: macros/bench_frame.def
+include: macros/bench_p2p.def
+include: macros/mtest.def
+
+subcode: _autoload
+    $define WINDOW_SIZE 64
+
+# p2p_bw: bandwidth benchmark between ranks 0 and 1, for message sizes
+# 1, 2, 4, ... below MAX_BUFSIZE. Each iteration posts WINDOW_SIZE
+# nonblocking transfers of the same buffer, waits for all of them, and then
+# exchanges a zero-count message as an acknowledgement. MULTIPLICITY is
+# WINDOW_SIZE, so the reported time is divided by the window size.
+page: p2p_bw, bench_frame
+    MULTIPLICITY: WINDOW_SIZE
+    data: buf, size, MPI_CHAR
+
+    $for int size = 1; size < MAX_BUFSIZE; size *= 2
+        bench_p2p(comm, 0, 1, buf, size)
+
+    subcode: send_side
+        $my MPI_Request reqs[WINDOW_SIZE]
+        $for j=0:WINDOW_SIZE
+            MPI_Isend($(data), dst, TAG, comm, &reqs[j])
+        MPI_Waitall(WINDOW_SIZE, reqs, MPI_STATUSES_IGNORE)
+        MPI_Recv(NULL, 0, MPI_DATATYPE_NULL, dst, TAG, comm, MPI_STATUS_IGNORE)
+
+    subcode: recv_side
+        $my MPI_Request reqs[WINDOW_SIZE]
+        $for j=0:WINDOW_SIZE
+            MPI_Irecv($(data), src, TAG, comm, &reqs[j])
+        MPI_Waitall(WINDOW_SIZE, reqs, MPI_STATUSES_IGNORE)
+        MPI_Send(NULL, 0, MPI_DATATYPE_NULL, src, TAG, comm)
diff --git a/test/mpi/bench/p2p_latency.def b/test/mpi/bench/p2p_latency.def
@@ -0,0 +1,19 @@
+include: macros/bench_frame.def
+include: macros/bench_p2p.def
+include: macros/mtest.def
+
+# p2p_latency: ping-pong latency benchmark between ranks 0 and 1, run for
+# size 0 and then for sizes 1, 2, 4, ... below MAX_BUFSIZE. One iteration
+# is a send followed by a receive of the same size, so MULTIPLICITY is 2
+# and the reported time is half the round trip.
+page: p2p_latency, bench_frame
+    MULTIPLICITY: 2
+    data: buf, size, MPI_CHAR
+
+    bench_p2p(comm, 0, 1, buf, 0)
+    $for int size = 1; size < MAX_BUFSIZE; size *= 2
+        bench_p2p(comm, 0, 1, buf, size)
+
+    subcode: send_side
+        MPI_Send($(data), dst, TAG, comm);
+        MPI_Recv($(data), dst, TAG, comm, MPI_STATUS_IGNORE);
+
+    subcode: recv_side
+        MPI_Recv($(data), src, TAG, comm, MPI_STATUS_IGNORE);
+        MPI_Send($(data), src, TAG, comm);
diff --git a/test/mpi/bench/testlist b/test/mpi/bench/testlist
@@ -0,0 +1,2 @@
+p2p_latency 2 resultTest=TestBench
+p2p_bw 2 resultTest=TestBench
diff --git a/test/mpi/configure.ac b/test/mpi/configure.ac
@@ -1904,5 +1904,6 @@ AC_OUTPUT(maint/testmerge \
           impls/mpich/ulfm/Makefile \
           impls/mpich/info/Makefile \
           impls/mpich/info/testlist \
+          bench/Makefile \
           )
 
diff --git a/test/mpi/runtests b/test/mpi/runtests
@@ -934,6 +934,8 @@ sub get_resultTest {
         return \&TestStatusNoErrors;
     } elsif ($resultTest eq "TestErrFatal") {
         return \&TestErrFatal;
+    } elsif ($resultTest eq "TestBench") {
+        return \&TestBench;
     } else {
         die "resultTest $resultTest not defined!\n";
     }
@@ -1112,6 +1114,22 @@ sub TestErrFatal {
     return ($found_error, $inline);
 }
 
+# Only check exit code: 0 means success, non-zero means failure
+# The program's output is echoed to STDOUT rather than collected, so the
+# returned $inline is always the empty string.
+# Returns ($found_error, $inline), with $found_error being 0 or 1.
+sub TestBench {
+    my ($MPIOUT, $programname) = @_;
+    # start from defined values so callers never receive undef
+    my ($found_error, $inline) = (0, "");
+
+    while (<$MPIOUT>) {
+        print STDOUT $_;
+    }
+    # a false return from close() is treated as failure; $? is passed on
+    # so the exit status can be reported
+    my $rc = close($MPIOUT);
+    if (!$rc) {
+        expect_status_zero($programname, $?);
+        $found_error = 1;
+    }
+    return ($found_error, $inline);
+}
+
 # ----------------------------------------------------------------------------
 # Output routines:
 #  OpenOutputs               - Open report files and print initial lines
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		p2p_latency 2 resultTest=TestBench
		p2p_bw 2 resultTest=TestBench