From 0d370abde0e954a9abf6545496354de0871fc934 Mon Sep 17 00:00:00 2001
From: vsoch
Date: Sun, 22 Oct 2023 16:05:18 -0600
Subject: [PATCH 1/2] add mpitrace

Signed-off-by: vsoch
---
 docs/_static/data/addons.json                 |   5 +
 examples/addons/mpitrace-lammps/README.md     |  49 ++++
 .../addons/mpitrace-lammps/metrics-rocky.yaml |  32 +++
 .../addons/mpitrace-lammps/mpi_profile.114.0  | 104 +++++++++
 .../addons/mpitrace-lammps/mpi_profile.114.1  |  86 +++++++
 .../addons/mpitrace-lammps/mpi_profile.114.2  |  85 +++++++
 pkg/addons/mpitrace.go                        | 209 ++++++++++++++++++
 7 files changed, 570 insertions(+)
 create mode 100644 examples/addons/mpitrace-lammps/README.md
 create mode 100644 examples/addons/mpitrace-lammps/metrics-rocky.yaml
 create mode 100644 examples/addons/mpitrace-lammps/mpi_profile.114.0
 create mode 100644 examples/addons/mpitrace-lammps/mpi_profile.114.1
 create mode 100644 examples/addons/mpitrace-lammps/mpi_profile.114.2
 create mode 100644 pkg/addons/mpitrace.go

diff --git a/docs/_static/data/addons.json b/docs/_static/data/addons.json
index 3dd73bf..bbaec7d 100644
--- a/docs/_static/data/addons.json
+++ b/docs/_static/data/addons.json
@@ -19,6 +19,11 @@
     "description": "performance tools for measurement and analysis",
     "family": "performance"
   },
+  {
+    "name": "perf-mpitrace",
+    "description": "library for measuring communication in distributed-memory parallel applications that use MPI",
+    "family": "performance"
+  },
   {
     "name": "volume-cm",
     "description": "config map volume type",
diff --git a/examples/addons/mpitrace-lammps/README.md b/examples/addons/mpitrace-lammps/README.md
new file mode 100644
index 0000000..07e557c
--- /dev/null
+++ b/examples/addons/mpitrace-lammps/README.md
@@ -0,0 +1,49 @@
+# LAMMPS Example
+
+This is an example of a metric app, LAMMPS, which is part of the [coral 2 benchmarks](https://asc.llnl.gov/coral-2-benchmarks). It technically
+isn't a metric itself, but we often use it to assess wall time and thus MPI latency. A Python example (parsing the output data)
+is provided in [python/app-lammps](../../python/app-lammps).
+
+## Usage
+
+Create a cluster and install JobSet into it.
+
+```bash
+kind create cluster
+VERSION=v0.2.0
+kubectl apply --server-side -f https://github.com/kubernetes-sigs/jobset/releases/download/$VERSION/manifests.yaml
+```
+
+Install the operator (from the development manifest here):
+
+```bash
+kubectl apply -f ../../dist/metrics-operator-dev.yaml
+```
+
+To see the metrics operator logs:
+
+```bash
+$ kubectl logs -n metrics-system metrics-controller-manager-859c66464c-7rpbw
+```
+
+Then create the metrics set. This will do a single run of LAMMPS over MPI, with mpitrace preloaded
+so that MPI profiles are written as lammps runs and finishes.
+
+```bash
+kubectl apply -f metrics-rocky.yaml
+```
+
+Wait until you see pods created by the job and then running.
+
+```bash
+kubectl get pods
+```
+
+And then you can shell in and look at the output, which should be named with the pattern `mpi_profile.<id>.<rank>`
+(for example, `mpi_profile.114.0`). I used `kubectl cp` to copy the example profiles here into the present working directory.
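+
+For example, here is a rough sketch of copying rank 0's profile out of the cluster (the pod name below is a
+placeholder, and this assumes the profiles land in the lammps working directory set in metrics-rocky.yaml):
+
+```bash
+# Replace <launcher-pod> with the launcher pod name shown by `kubectl get pods`
+# (add `-c launcher` if kubectl asks for a container name)
+kubectl cp <launcher-pod>:/opt/lammps/examples/reaxff/HNS/mpi_profile.114.0 ./mpi_profile.114.0
+```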
+
+When you are done, cleanup.
+
+```bash
+kubectl delete -f metrics-rocky.yaml
+```
\ No newline at end of file
diff --git a/examples/addons/mpitrace-lammps/metrics-rocky.yaml b/examples/addons/mpitrace-lammps/metrics-rocky.yaml
new file mode 100644
index 0000000..578dec9
--- /dev/null
+++ b/examples/addons/mpitrace-lammps/metrics-rocky.yaml
@@ -0,0 +1,32 @@
+apiVersion: flux-framework.org/v1alpha2
+kind: MetricSet
+metadata:
+  labels:
+    app.kubernetes.io/name: metricset
+    app.kubernetes.io/instance: metricset-sample
+  name: metricset-sample
+spec:
+  # Number of pods for lammps (one launcher, the rest workers)
+  pods: 4
+  logging:
+    interactive: true
+
+  metrics:
+
+    # Running lammps at larger scale is our main goal
+    - name: app-lammps
+
+      # This rocky image is used instead of the default
+      image: ghcr.io/converged-computing/metric-lammps-intel-mpi:rocky
+      options:
+        command: /opt/intel/mpi/2021.8.0/bin/mpirun --hostfile ./hostlist.txt -np 4 --map-by socket lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite
+        workdir: /opt/lammps/examples/reaxff/HNS
+
+      # Add on mpitrace, which will mount a volume and wrap lammps
+      addons:
+        - name: perf-mpitrace
+          options:
+            mount: /opt/mnt
+            image: ghcr.io/converged-computing/metric-mpitrace:rocky
+            workdir: /opt/lammps/examples/reaxff/HNS
+            containerTarget: launcher
\ No newline at end of file
diff --git a/examples/addons/mpitrace-lammps/mpi_profile.114.0 b/examples/addons/mpitrace-lammps/mpi_profile.114.0
new file mode 100644
index 0000000..de36ce0
--- /dev/null
+++ b/examples/addons/mpitrace-lammps/mpi_profile.114.0
@@ -0,0 +1,104 @@
+Data for MPI rank 0 of 4:
+Times from MPI_Init() to MPI_Finalize().
+-----------------------------------------------------------------------
+MPI Routine                        #calls     avg. bytes      time(sec)
+-----------------------------------------------------------------------
+MPI_Comm_rank                          10            0.0          0.000
+MPI_Comm_size                           4            0.0          0.000
+MPI_Send                            20560         9780.7          0.008
+MPI_Irecv                           20560         9781.5          0.003
+MPI_Sendrecv                           36            8.0          0.001
+MPI_Wait                            20560            0.0          7.251
+MPI_Bcast                             129         1044.8          0.001
+MPI_Barrier                             7            0.0          0.003
+MPI_Reduce                              4            7.0          0.000
+MPI_Allreduce                        5167            8.1          0.012
+MPI_Allgather                           1            4.0          0.000
+MPI_Allgatherv                          1         7392.0          0.000
+-----------------------------------------------------------------------
+MPI task 0 of 4 had the maximum communication time.
+total communication time = 7.279 seconds.
+total elapsed time = 14.510 seconds.
+user cpu time = 12.155 seconds.
+system time = 2.337 seconds.
+max resident set size = 137.273 MiB.
+
+-----------------------------------------------------------------
+Message size distributions:
+
+MPI_Send                  #calls    avg. bytes      time(sec)
+                               7           0.0          0.000
+                               3         176.0          0.000
+                               1         352.0          0.000
+                               1         528.0          0.000
+                            9870        2529.2          0.002
+                             392        7584.8          0.000
+                            9870       15538.2          0.004
+                              12       17714.7          0.000
+                             392       46623.9          0.001
+                              12      108532.7          0.000
+
+MPI_Irecv                 #calls    avg. bytes      time(sec)
+                               8           0.0          0.000
+                               1         176.0          0.000
+                               2         352.0          0.000
+                               1         528.0          0.000
+                            9870        2529.1          0.001
+                             392        7585.4          0.000
+                            9870       15539.6          0.001
+                              12       17668.0          0.000
+                             392       46617.9          0.000
+                              12      108990.0          0.000
+
+MPI_Sendrecv              #calls    avg. bytes      time(sec)
+                              36           8.0          0.001
+
+MPI_Bcast                 #calls    avg. bytes      time(sec)
+                              80           3.4          0.000
+                               3           7.7          0.000
+                               4          13.2          0.000
+                              20          24.5          0.001
+                              12          49.0          0.000
+                               2          96.0          0.000
+                               1         312.0          0.000
+                               1         992.0          0.000
+                               1        2048.0          0.000
+                               1        3840.0          0.000
+                               3       24239.7          0.000
+                               1       53248.0          0.000
+
+MPI_Reduce                #calls    avg. bytes      time(sec)
+                               1           4.0          0.000
+                               3           8.0          0.000
+
+MPI_Allreduce             #calls    avg. bytes      time(sec)
+                              14           4.0          0.000
+                            5125           8.0          0.012
+                              13          15.4          0.000
+                              12          24.0          0.000
+                               3          40.0          0.000
+
+MPI_Allgather             #calls    avg. bytes      time(sec)
+                               1           4.0          0.000
+
+MPI_Allgatherv            #calls    avg. bytes      time(sec)
+                               1        7392.0          0.000
+
+-----------------------------------------------------------------
+
+Summary for all tasks:
+
+  Rank 0 reported the largest memory utilization : 137.27 MiB
+  Rank 2 reported the largest elapsed time : 14.51 sec
+
+  minimum communication time = 0.085 sec for task 2
+  median communication time = 1.633 sec for task 1
+  maximum communication time = 7.279 sec for task 0
+
+
+MPI timing summary for all ranks:
+taskid  host                    cpu  comm(s)  elapsed(s)  user(s)  system(s)  size(MiB)  switches
+     0  metricset-sample-l-0-0    1     7.28       14.51    12.16       2.34     137.27       842
+     1  metricset-sample-l-0-0    5     1.63       14.51    13.99       0.51     129.07        63
+     2  metricset-sample-l-0-0    7     0.08       14.51    14.45       0.05     129.86        43
+     3  metricset-sample-l-0-0   10     2.70       14.51    13.73       0.78     131.18        39
diff --git a/examples/addons/mpitrace-lammps/mpi_profile.114.1 b/examples/addons/mpitrace-lammps/mpi_profile.114.1
new file mode 100644
index 0000000..b8ba5fe
--- /dev/null
+++ b/examples/addons/mpitrace-lammps/mpi_profile.114.1
@@ -0,0 +1,86 @@
+Data for MPI rank 1 of 4:
+Times from MPI_Init() to MPI_Finalize().
+-----------------------------------------------------------------------
+MPI Routine                        #calls     avg. bytes      time(sec)
+-----------------------------------------------------------------------
+MPI_Comm_rank                          10            0.0          0.000
+MPI_Comm_size                           4            0.0          0.000
+MPI_Send                            20560         9788.2          0.017
+MPI_Irecv                           20560         9787.4          0.004
+MPI_Sendrecv                           36            8.0          0.000
+MPI_Wait                            20560            0.0          1.601
+MPI_Bcast                             129         1044.8          0.001
+MPI_Barrier                             7            0.0          0.000
+MPI_Reduce                              4            7.0          0.000
+MPI_Allreduce                        5167            8.1          0.009
+MPI_Allgather                           1            4.0          0.000
+MPI_Allgatherv                          1         7296.0          0.000
+-----------------------------------------------------------------------
+MPI task 1 of 4 had the median communication time.
+total communication time = 1.633 seconds.
+total elapsed time = 14.510 seconds.
+user cpu time = 13.993 seconds.
+system time = 0.508 seconds.
+max resident set size = 129.074 MiB.
+
+-----------------------------------------------------------------
+Message size distributions:
+
+MPI_Send                  #calls    avg. bytes      time(sec)
+                               8           0.0          0.000
+                               1         176.0          0.000
+                               2         352.0          0.000
+                               1         528.0          0.000
+                            9870        2541.4          0.003
+                             392        7626.1          0.000
+                            9870       15539.6          0.011
+                              12       17794.0          0.000
+                             392       46617.9          0.002
+                              12      108990.0          0.000
+
+MPI_Irecv                 #calls    avg. bytes      time(sec)
+                               7           0.0          0.000
+                               3         176.0          0.000
+                               1         352.0          0.000
+                               1         528.0          0.000
+                            9870        2541.4          0.002
+                             392        7626.0          0.000
+                            9870       15538.2          0.002
+                              12       17803.3          0.000
+                             392       46623.9          0.000
+                              12      108532.7          0.000
+
+MPI_Sendrecv              #calls    avg. bytes      time(sec)
+                              36           8.0          0.000
+
+MPI_Bcast                 #calls    avg. bytes      time(sec)
+                              80           3.4          0.000
+                               3           7.7          0.000
+                               4          13.2          0.000
+                              20          24.5          0.000
+                              12          49.0          0.000
+                               2          96.0          0.000
+                               1         312.0          0.000
+                               1         992.0          0.000
+                               1        2048.0          0.000
+                               1        3840.0          0.000
+                               3       24239.7          0.000
+                               1       53248.0          0.000
+
+MPI_Reduce                #calls    avg. bytes      time(sec)
+                               1           4.0          0.000
+                               3           8.0          0.000
+
+MPI_Allreduce             #calls    avg. bytes      time(sec)
+                              14           4.0          0.000
+                            5125           8.0          0.009
+                              13          15.4          0.000
+                              12          24.0          0.000
+                               3          40.0          0.000
+
+MPI_Allgather             #calls    avg. bytes      time(sec)
+                               1           4.0          0.000
+
+MPI_Allgatherv            #calls    avg. bytes      time(sec)
+                               1        7296.0          0.000
+
diff --git a/examples/addons/mpitrace-lammps/mpi_profile.114.2 b/examples/addons/mpitrace-lammps/mpi_profile.114.2
new file mode 100644
index 0000000..c6128fb
--- /dev/null
+++ b/examples/addons/mpitrace-lammps/mpi_profile.114.2
@@ -0,0 +1,85 @@
+Data for MPI rank 2 of 4:
+Times from MPI_Init() to MPI_Finalize().
+-----------------------------------------------------------------------
+MPI Routine                        #calls     avg. bytes      time(sec)
+-----------------------------------------------------------------------
+MPI_Comm_rank                          10            0.0          0.000
+MPI_Comm_size                           4            0.0          0.000
+MPI_Send                            20560         9785.9          0.017
+MPI_Irecv                           20560         9786.9          0.004
+MPI_Sendrecv                           36            8.0          0.000
+MPI_Wait                            20560            0.0          0.056
+MPI_Bcast                             129         1044.8          0.001
+MPI_Barrier                             7            0.0          0.000
+MPI_Reduce                              4            7.0          0.000
+MPI_Allreduce                        5167            8.1          0.006
+MPI_Allgather                           1            4.0          0.000
+MPI_Allgatherv                          1         7488.0          0.000
+-----------------------------------------------------------------------
+MPI task 2 of 4 had the minimum communication time.
+total communication time = 0.085 seconds.
+total elapsed time = 14.510 seconds.
+user cpu time = 14.454 seconds.
+system time = 0.055 seconds.
+max resident set size = 129.863 MiB.
+
+-----------------------------------------------------------------
+Message size distributions:
+
+MPI_Send                  #calls    avg. bytes      time(sec)
+                               9           0.0          0.000
+                               1         176.0          0.000
+                               1         352.0          0.000
+                               1         704.0          0.000
+                            9870        2529.1          0.003
+                             392        7585.4          0.000
+                            9870       15547.8          0.012
+                              12       17668.0          0.000
+                             392       46656.8          0.002
+                              12      108593.3          0.000
+
+MPI_Irecv                 #calls    avg. bytes      time(sec)
+                              10           0.0          0.000
+                               1         176.0          0.000
+                               1         352.0          0.000
+                            9870        2529.2          0.002
+                             392        7584.8          0.000
+                            9870       15549.4          0.002
+                              12       17714.7          0.000
+                             392       46650.1          0.000
+                              12      109102.0          0.000
+
+MPI_Sendrecv              #calls    avg. bytes      time(sec)
+                              36           8.0          0.000
+
+MPI_Bcast                 #calls    avg. bytes      time(sec)
+                              80           3.4          0.000
+                               3           7.7          0.000
+                               4          13.2          0.000
+                              20          24.5          0.000
+                              12          49.0          0.000
+                               2          96.0          0.000
+                               1         312.0          0.000
+                               1         992.0          0.000
+                               1        2048.0          0.000
+                               1        3840.0          0.000
+                               3       24239.7          0.000
+                               1       53248.0          0.000
+
+MPI_Reduce                #calls    avg. bytes      time(sec)
+                               1           4.0          0.000
+                               3           8.0          0.000
+
+MPI_Allreduce             #calls    avg. bytes      time(sec)
+                              14           4.0          0.000
+                            5125           8.0          0.006
+                              13          15.4          0.000
+                              12          24.0          0.000
+                               3          40.0          0.000
+
+MPI_Allgather             #calls    avg. bytes      time(sec)
+                               1           4.0          0.000
+
+MPI_Allgatherv            #calls    avg. bytes      time(sec)
+                               1        7488.0          0.000
+
diff --git a/pkg/addons/mpitrace.go b/pkg/addons/mpitrace.go
new file mode 100644
index 0000000..ba2579c
--- /dev/null
+++ b/pkg/addons/mpitrace.go
@@ -0,0 +1,209 @@
+/*
+Copyright 2023 Lawrence Livermore National Security, LLC
+ (c.f. AUTHORS, NOTICE.LLNS, COPYING)
+
+SPDX-License-Identifier: MIT
+*/
+
+package addons
+
+import (
+	"fmt"
+
+	api "github.com/converged-computing/metrics-operator/api/v1alpha2"
+	"github.com/converged-computing/metrics-operator/pkg/metadata"
+	"github.com/converged-computing/metrics-operator/pkg/specs"
+	"k8s.io/apimachinery/pkg/util/intstr"
+	jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2"
+)
+
+// https://github.com/IBM/mpitrace
+const (
+	mpitraceIdentifier = "perf-mpitrace"
+)
+
+type MPITrace struct {
+	SpackView
+
+	// Target is the name of the replicated job to customize entrypoint logic for
+	target string
+
+	// ContainerTarget is the name of the container to add the entrypoint logic to
+	containerTarget string
+}
+
+func (m MPITrace) Family() string {
+	return AddonFamilyPerformance
+}
+
+// AssembleVolumes to provide an empty volume for the application to share
+// We also need to provide a config map volume for our container spec
+func (m MPITrace) AssembleVolumes() []specs.VolumeSpec {
+	return m.GetSpackViewVolumes()
+}
+
+// Validate has no required options for this addon, so it always returns true
+func (a *MPITrace) Validate() bool {
+	return true
+}
+
+// Set custom options / attributes for the metric
+func (a *MPITrace) SetOptions(metric *api.MetricAddon, m *api.MetricSet) {
+
+	a.EntrypointPath = "/metrics_operator/mpitrace-entrypoint.sh"
+	a.image = "ghcr.io/converged-computing/metric-mpitrace:rocky"
+	a.SetDefaultOptions(metric)
+	a.Mount = "/opt/share"
+	a.VolumeName = "mpitrace"
+	a.Identifier = mpitraceIdentifier
+	a.SpackViewContainer = "mpitrace"
+
+	mount, ok := metric.Options["mount"]
+	if ok {
+		a.Mount = mount.StrVal
+	}
+	workdir, ok := metric.Options["workdir"]
+	if ok {
+		a.workdir = workdir.StrVal
+	}
+	target, ok := metric.Options["target"]
+	if ok {
+		a.target = target.StrVal
+	}
+	ctarget, ok := metric.Options["containerTarget"]
+	if ok {
+		a.containerTarget = ctarget.StrVal
+	}
+	image, ok := metric.Options["image"]
+	if ok {
+		a.image = image.StrVal
+	}
+}
+
+// Exported options and list options
+func (a *MPITrace) Options() map[string]intstr.IntOrString {
+	options := a.DefaultOptions()
+	options["mount"] = intstr.FromString(a.Mount)
+	return options
+}
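+
+// For reference, the options parsed in SetOptions above come from a MetricSet addon
+// block like the one in examples/addons/mpitrace-lammps/metrics-rocky.yaml:
+//
+//   addons:
+//     - name: perf-mpitrace
+//       options:
+//         mount: /opt/mnt
+//         image: ghcr.io/converged-computing/metric-mpitrace:rocky
+//         workdir: /opt/lammps/examples/reaxff/HNS
+//         containerTarget: launcher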
+
+// CustomizeEntrypoints customizes the entrypoint scripts for matching replicated jobs
+func (a *MPITrace) CustomizeEntrypoints(
+	cs []*specs.ContainerSpec,
+	rjs []*jobset.ReplicatedJob,
+) {
+	for _, rj := range rjs {
+
+		// Only customize if the replicated job name matches the target
+		if a.target != "" && a.target != rj.Name {
+			continue
+		}
+		a.customizeEntrypoint(cs, rj)
+	}
+
+}
+
+// customizeEntrypoint updates the entrypoint scripts for a single replicated job
+func (a *MPITrace) customizeEntrypoint(
+	cs []*specs.ContainerSpec,
+	rj *jobset.ReplicatedJob,
+) {
+
+	// Generate addon metadata
+	meta := Metadata(a)
+
+	// This should be run after the pre block of the script
+	preBlock := `
+echo "%s"
+# Ensure the mpitrace spack view software exists. This is rough, but should be OK with enough wait time
+wget -q https://github.com/converged-computing/goshare/releases/download/2023-09-06/wait-fs
+chmod +x ./wait-fs
+mv ./wait-fs /usr/bin/goshare-wait-fs
+
+# Ensure spack view is on the path, wherever it is mounted
+viewbase="%s"
+software="${viewbase}/software"
+viewbin="${viewbase}/view/bin"
+libmpitraceso=${viewbase}/view/lib/libmpitrace.so
+
+# Important to add AFTER in case the container duplicates this software
+export PATH=$PATH:${viewbin}
+
+# Wait for software directory, and give it time
+goshare-wait-fs -p ${software}
+
+# Wait for copy to finish
+sleep 10
+
+# Copy mount software to /opt/software
+cp -R %s/software /opt/software
+
+# Wait for file indicator that copy is done
+goshare-wait-fs -p ${viewbase}/metrics-operator-done.txt
+
+# A small extra wait time to be conservative
+sleep 5
+echo "%s"
+echo "%s"
+`
+	preBlock = fmt.Sprintf(
+		preBlock,
+		meta,
+		a.Mount,
+		a.Mount,
+		metadata.CollectionStart,
+		metadata.Separator,
+	)
+
+	// Add the working directory, if defined
+	if a.workdir != "" {
+		preBlock += fmt.Sprintf(`
+workdir="%s"
+echo "Changing directory to ${workdir}"
+cd ${workdir}
+`, a.workdir)
+	}
+
+	// We use container names to target specific entrypoint scripts here
+	for _, containerSpec := range cs {
+
+		// First check - is this the right replicated job?
+		if containerSpec.JobName != rj.Name {
+			continue
+		}
+
+		// Always copy over the pre block - we need the logic to copy software
+		containerSpec.EntrypointScript.Pre += "\n" + preBlock
+
+		// Next check if we have a target set (for the container)
+		if a.containerTarget != "" && containerSpec.Name != "" && a.containerTarget != containerSpec.Name {
+			continue
+		}
+
+		// If the post command ends with sleep infinity, tweak it
+		isInteractive, updatedPost := deriveUpdatedPost(containerSpec.EntrypointScript.Post)
+		containerSpec.EntrypointScript.Post = updatedPost
+
+		// Wrap the main command with LD_PRELOAD so MPI calls are traced by libmpitrace
+		containerSpec.EntrypointScript.Command = fmt.Sprintf(
+			"export LD_PRELOAD=${libmpitraceso}\n%s\nunset LD_PRELOAD",
+			containerSpec.EntrypointScript.Command,
+		)
+
+		// If is interactive, add back sleep infinity
+		if isInteractive {
+			containerSpec.EntrypointScript.Post += "\nsleep infinity\n"
+		}
+	}
+}
+
+func init() {
+	base := AddonBase{
+		Identifier: mpitraceIdentifier,
+		Summary:    "library for measuring communication in distributed-memory parallel applications that use MPI",
+	}
+	app := ApplicationAddon{AddonBase: base}
+	spack := SpackView{ApplicationAddon: app}
+	toolkit := MPITrace{SpackView: spack}
+	Register(&toolkit)
+}

From c960f67e76ce0e028302cb54bfe71ba41798c85f Mon Sep 17 00:00:00 2001
From: vsoch
Date: Sun, 22 Oct 2023 16:09:51 -0600
Subject: [PATCH 2/2] add brief docs for addon mpitrace

Signed-off-by: vsoch
---
 docs/getting_started/addons.md | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/docs/getting_started/addons.md b/docs/getting_started/addons.md
index 51b8db9..2e100d9 100644
--- a/docs/getting_started/addons.md
+++ b/docs/getting_started/addons.md
@@ -218,7 +218,7 @@ environments at this point, which is why I didn't add it.
 
 ### perf-hpctoolkit
 
- - *[perf-hpctoolkit](https://github.com/converged-computing/metrics-operator/tree/main/examples/tests/perf-lammps-hpctoolkit)*
+ - *[perf-hpctoolkit](https://github.com/converged-computing/metrics-operator/tree/main/examples/addons/hpctoolkit-lammps)*
 
 This metric provides [HPCToolkit](https://gitlab.com/hpctoolkit/hpctoolkit) for your application to use. This is the first
 metric of its type to use a shared volume approach. Specifically, we:
@@ -266,3 +266,19 @@ There is a brief listing on [this page](https://hpc.llnl.gov/software/developmen
 We recommend that you do not pair hpctoolkit with another metric, primarily because it is customizing
 the application entrypoint. If you add a process-namespace based metric, you likely need to account for
 the hpcrun command being the wrapper to the actual executable.
+
+
+### perf-mpitrace
+
+ - *[perf-mpitrace](https://github.com/converged-computing/metrics-operator/tree/main/examples/addons/mpitrace-lammps)*
+
+This metric provides [mpitrace](https://github.com/IBM/mpitrace) to wrap an MPI application. The setup is the same as hpctoolkit, and we
+currently only provide a rocky base (please let us know if you need another). It works by wrapping the application command with
+`LD_PRELOAD` set to the mpitrace library, so MPI calls are traced and profiles are written when the application finishes.
+See the link above for an example that uses LAMMPS.
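+
+Roughly, the generated entrypoint wraps your command like the sketch below (the paths assume the default
+`/opt/share` mount for the spack view, and `mpirun ... lmp ...` stands in for your own launcher command):
+
+```bash
+# Sketch of the wrapping added around the application command
+export LD_PRELOAD=/opt/share/view/lib/libmpitrace.so
+mpirun -np 4 lmp -in in.reaxc.hns   # your application runs with mpitrace preloaded
+unset LD_PRELOAD
+```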
+
+Here are the acceptable parameters.
+
+| Name | Description | Type | Default |
+|-----|-------------|------------|------|
+| mount | Path to mount the mpitrace spack view in the application container | string | /opt/share |
+| image | Customize the container image | string | `ghcr.io/converged-computing/metric-mpitrace:rocky` |
+| workdir | Working directory to change to before running the wrapped command | string | unset |
+| target | Name of the replicated job to customize the entrypoint for | string | unset (all replicated jobs) |
+| containerTarget | Name of the container to add the entrypoint logic to | string | unset (all containers) |
\ No newline at end of file