From 769ed3edb18d691ca1bde9e131c33bc49358ea70 Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 27 Sep 2023 00:00:20 -0600 Subject: [PATCH 1/9] installing flux view an on demand volume is almost working! Signed-off-by: vsoch --- docs/_static/data/addons.json | 5 + docs/getting_started/addons.md | 42 ++ examples/addons/flux-lammps/metrics.yaml | 32 + .../metrics-rocky.yaml | 0 .../metrics.yaml | 0 examples/addons/perf-lammps/metrics.yaml | 19 + pkg/addons/addons.go | 9 +- pkg/addons/commands.go | 8 +- pkg/addons/containers.go | 4 +- pkg/addons/flux.go | 599 ++++++++++++++++++ pkg/addons/hpctoolkit.go | 136 +--- pkg/addons/spack.go | 146 +++++ pkg/addons/volumes.go | 10 +- pkg/metrics/metrics.go | 2 +- 14 files changed, 874 insertions(+), 138 deletions(-) create mode 100644 examples/addons/flux-lammps/metrics.yaml rename examples/addons/{perf-lammps-hpctoolkit => hpctoolkit-lammps}/metrics-rocky.yaml (100%) rename examples/addons/{perf-lammps-hpctoolkit => hpctoolkit-lammps}/metrics.yaml (100%) create mode 100644 examples/addons/perf-lammps/metrics.yaml create mode 100644 pkg/addons/flux.go create mode 100644 pkg/addons/spack.go diff --git a/docs/_static/data/addons.json b/docs/_static/data/addons.json index a5a36f3..3dd73bf 100644 --- a/docs/_static/data/addons.json +++ b/docs/_static/data/addons.json @@ -43,5 +43,10 @@ "name": "volume-secret", "description": "secret volume type", "family": "volume" + }, + { + "name": "workload-flux", + "description": "hierarchical graph-based scheduler and resource manager", + "family": "workload" } ] \ No newline at end of file diff --git a/docs/getting_started/addons.md b/docs/getting_started/addons.md index 0190b82..9c845f0 100644 --- a/docs/getting_started/addons.md +++ b/docs/getting_started/addons.md @@ -157,6 +157,48 @@ spec: **Note that we have support for a custom application container, but haven't written any good examples yet!** +## Workload + +### workload-flux + +If you need to "throw in" Flux Framework into your 
container to use as a scheduler, you can do that with an addon! + +> Yes, it's astounding. + +This works by way of the same trick that we use for other addons that have a complex (and/or large) install setup. We: + +- Build the software into an isolated spack "copy" view +- The software is then (generally) at some `/opt/view` and `/opt/software` +- The flux container is added as a sidecar container to your pod for your replicated job + - Additional setup / configuration is done here +- We can then create an empty volume that is shared by your metric or scaled application +- The entire tree is copied over into the empty volume +- When the copy is done, indicated by the final touch of a file, the updated container entrypoint is run +- This typically means we have taken your metric command, and wrapped it in a Flux submit. + +It's really cool because it means you can run a metric / application with Flux without needing +to install it into your container to begin with. The one important detail is that the +general operating system must match. The current view uses rocky; however, the image is customizable +(and we can provide other bases if/when requested). Here are the arguments you can customize +under the addon -> options. 
+ +| Name | Description | Type | Default | +|-----|-------------|------------|------| +| mount | Path to mount flux view in application container | string | /opt/share | +| tasks | Number of tasks `-n` to give to flux (not provided if not set) | int | unset | +| image | Customize the container image | string | `ghcr.io/rse-ops/spack-flux-rocky-view:tag-8` | +| fluxUser | The flux user (currently not used, but TBA) | string | flux | +| fluxUid | The flux user ID (currently not used, but TBA) | string | 1004 | +| interactive | Run flux in interactive mode | string | "false" | +| connectTimeout | How long zeroMQ should wait to retry | string | "5s" | +| quorum | The number of brokers to require before starting the cluster | string | (total brokers or pods) | +| debugZeroMQ | Turn on zeroMQ debugging | string | "false" | +| logLevel | Customize the flux log level | string | "6" | +| queuePolicy | Queue policy for flux to use | string | fcfs | + +Note that the number of pods for flux defaults to the number in your MetricSet, along +with the namespace and service name. 
+ ## Performance ### perf-hpctoolkit diff --git a/examples/addons/flux-lammps/metrics.yaml b/examples/addons/flux-lammps/metrics.yaml new file mode 100644 index 0000000..a099d3f --- /dev/null +++ b/examples/addons/flux-lammps/metrics.yaml @@ -0,0 +1,32 @@ +apiVersion: flux-framework.org/v1alpha2 +kind: MetricSet +metadata: + labels: + app.kubernetes.io/name: metricset + app.kubernetes.io/instance: metricset-sample + name: metricset-sample +spec: + # Number of pods for lammps (one launcher, the rest workers) + pods: 4 + logging: + interactive: true + + metrics: + + # Running more scaled lammps is our main goal + - name: app-lammps + + # This flux addon is built on rocky, and we can provide additional os bases + image: ghcr.io/converged-computing/metric-lammps-intel-mpi:rocky + + options: + command: lmp -v x 2 -v y 2 -v z 2 -in in.reaxc.hns -nocite + workdir: /opt/lammps/examples/reaxff/HNS + + # Add on flux, will mount a volume and wrap lammps + addons: + - name: workload-flux + options: + + # Ensure the working directory is consistent + workdir: /opt/lammps/examples/reaxff/HNS \ No newline at end of file diff --git a/examples/addons/perf-lammps-hpctoolkit/metrics-rocky.yaml b/examples/addons/hpctoolkit-lammps/metrics-rocky.yaml similarity index 100% rename from examples/addons/perf-lammps-hpctoolkit/metrics-rocky.yaml rename to examples/addons/hpctoolkit-lammps/metrics-rocky.yaml diff --git a/examples/addons/perf-lammps-hpctoolkit/metrics.yaml b/examples/addons/hpctoolkit-lammps/metrics.yaml similarity index 100% rename from examples/addons/perf-lammps-hpctoolkit/metrics.yaml rename to examples/addons/hpctoolkit-lammps/metrics.yaml diff --git a/examples/addons/perf-lammps/metrics.yaml b/examples/addons/perf-lammps/metrics.yaml new file mode 100644 index 0000000..502bd87 --- /dev/null +++ b/examples/addons/perf-lammps/metrics.yaml @@ -0,0 +1,19 @@ +apiVersion: flux-framework.org/v1alpha2 +kind: MetricSet +metadata: + labels: + app.kubernetes.io/name: metricset 
+ app.kubernetes.io/instance: metricset-sample + name: metricset-sample +spec: + # Number of pods for lammps (one launcher, the rest workers) + pods: 4 + metrics: + - name: app-lammps + image: ghcr.io/converged-computing/metric-lammps-intel-mpi:rocky + addons: + - name: commands + options: + preBlock: dnf install perf -y && sleep infinity + prefix: /opt/intel/mpi/2021.8.0/bin/mpirun --hostfile ./hostlist.txt -np 2 --map-by socket perf stat + suffix: sleep infinity diff --git a/pkg/addons/addons.go b/pkg/addons/addons.go index f68e1e4..615b1df 100644 --- a/pkg/addons/addons.go +++ b/pkg/addons/addons.go @@ -26,6 +26,7 @@ var ( AddonFamilyPerformance = "performance" AddonFamilyVolume = "volume" AddonFamilyApplication = "application" + AddonFamilyWorkload = "workload" ) // A general metric is a container added to a JobSet @@ -37,7 +38,7 @@ type Addon interface { Description() string // Options and exportable attributes - SetOptions(*api.MetricAddon) + SetOptions(*api.MetricAddon, *api.MetricSet) Options() map[string]intstr.IntOrString ListOptions() map[string][]intstr.IntOrString MapOptions() map[string]map[string]intstr.IntOrString @@ -65,7 +66,7 @@ type AddonBase struct { mapOptions map[string]map[string]intstr.IntOrString } -func (b *AddonBase) SetOptions(metric *api.MetricAddon) {} +func (b *AddonBase) SetOptions(addon *api.MetricAddon, metric *api.MetricSet) {} func (b *AddonBase) CustomizeEntrypoints([]*specs.ContainerSpec, []*jobset.ReplicatedJob) {} func (b *AddonBase) Validate() bool { @@ -97,7 +98,7 @@ func (b *AddonBase) MapOptions() map[string]map[string]intstr.IntOrString { } // GetAddon looks up and validates an addon -func GetAddon(a *api.MetricAddon) (Addon, error) { +func GetAddon(a *api.MetricAddon, set *api.MetricSet) (Addon, error) { // We don't want to change the addon interface/struct itself template, ok := Registry[a.Name] @@ -111,7 +112,7 @@ func GetAddon(a *api.MetricAddon) (Addon, error) { addon := 
reflect.New(templateType.Type()).Interface().(Addon) // Set options before validation - addon.SetOptions(a) + addon.SetOptions(a, set) // Validate the addon if !addon.Validate() { diff --git a/pkg/addons/commands.go b/pkg/addons/commands.go index e56f9d6..8507f6b 100644 --- a/pkg/addons/commands.go +++ b/pkg/addons/commands.go @@ -42,9 +42,9 @@ func (a *PerfAddon) CustomizeEntrypoints( } } -func (a *PerfAddon) SetOptions(metric *api.MetricAddon) { +func (a *PerfAddon) SetOptions(addon *api.MetricAddon, metric *api.MetricSet) { a.Identifier = perfCommandsName - a.SetSharedCommandOptions(metric) + a.SetSharedCommandOptions(addon) } // addContainerCaps adds capabilities to a container spec @@ -102,9 +102,9 @@ func (m CommandAddon) Family() string { return AddonFamilyApplication } -func (a *CommandAddon) SetOptions(metric *api.MetricAddon) { +func (a *CommandAddon) SetOptions(addon *api.MetricAddon, metric *api.MetricSet) { a.Identifier = commandsName - a.SetSharedCommandOptions(metric) + a.SetSharedCommandOptions(addon) } // Set custom options / attributes for the metric diff --git a/pkg/addons/containers.go b/pkg/addons/containers.go index b933290..1faa32a 100644 --- a/pkg/addons/containers.go +++ b/pkg/addons/containers.go @@ -139,8 +139,8 @@ func (a *ApplicationAddon) setDefaultEntrypoint() { } // Calling the default allows a custom application that uses this to do the same -func (a *ApplicationAddon) SetOptions(metric *api.MetricAddon) { - a.SetDefaultOptions(metric) +func (a *ApplicationAddon) SetOptions(addon *api.MetricAddon, metric *api.MetricSet) { + a.SetDefaultOptions(addon) } // Underlying function that can be shared diff --git a/pkg/addons/flux.go b/pkg/addons/flux.go new file mode 100644 index 0000000..6fa3867 --- /dev/null +++ b/pkg/addons/flux.go @@ -0,0 +1,599 @@ +/* +Copyright 2023 Lawrence Livermore National Security, LLC + (c.f. 
AUTHORS, NOTICE.LLNS, COPYING) + +SPDX-License-Identifier: MIT +*/ + +package addons + +import ( + "fmt" + "strings" + + api "github.com/converged-computing/metrics-operator/api/v1alpha2" + "github.com/converged-computing/metrics-operator/pkg/metadata" + "github.com/converged-computing/metrics-operator/pkg/specs" + "k8s.io/apimachinery/pkg/util/intstr" + jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2" +) + +// Flux Framework provides fully hierarchical graphs scheduler and resource manager +const ( + fluxIdentifier = "workload-flux" +) + +type FluxFramework struct { + SpackView + + // Target is the name of the replicated job to customize entrypoint logic for + // This is what determines the size of the cluster + target string + + // ContainerTarget is the name of the container to add flux to + containerTarget string + + // mount is the location to install flux to + mount string + pods int32 + jobname string + namespace string + + // flux user and id need to match between containers (both are created) + fluxUser string + fluxUid string + quorum string + tasks int32 + optionFlags string + submitCommand string + + // volumeName to provide for the empty volume + volumeName string + interactive bool + connectTimeout string + debugZeroMQ bool + logLevel string + queuePolicy string + serviceName string + jobLetter string + workerLetter string + jobIndex string +} + +func (m FluxFramework) Family() string { + return AddonFamilyWorkload +} + +// Validate we have an executable provided, and args and optional +func (a *FluxFramework) Validate() bool { + return true +} + +// GetAddFluxUser gets string text to add the flux user +// this might need to vary depending on the OS +// We might also just be irresponsible and run as root :) +func (a *FluxFramework) getAddFluxUser() string { + return fmt.Sprintf(`useradd -ms /bin/bash -u %s %s`, a.fluxUid, a.fluxUser) +} + +func (m FluxFramework) AssembleVolumes() []specs.VolumeSpec { + return m.GetSpackViewVolumes() +} + +// Set custom 
options / attributes for the metric +func (a *FluxFramework) SetOptions(metric *api.MetricAddon, set *api.MetricSet) { + + a.EntrypointPath = "/metrics_operator/flux-entrypoint.sh" + a.image = "ghcr.io/rse-ops/spack-flux-rocky-view:tag-8" + a.SetDefaultOptions(metric) + a.Mount = "/opt/share" + a.VolumeName = "flux-volume" + a.Identifier = fluxIdentifier + a.fluxUser = "flux" + a.fluxUid = "1004" + a.interactive = false + a.connectTimeout = "5s" + a.debugZeroMQ = false + a.logLevel = "6" + a.pods = set.Spec.Pods + a.jobname = set.Name + a.namespace = set.Namespace + a.serviceName = set.Spec.ServiceName + a.queuePolicy = "fcfs" + a.SpackViewContainer = "flux-framework" + a.jobIndex = "0" + a.jobLetter = "l" + a.workerLetter = "w" + a.quorum = fmt.Sprintf("%d", a.pods) + + // e.g., can be changed to mini submit or run + a.submitCommand = "submit" + + // User overrides: mount must set the SpackView Mount (read by the volumes and scripts), not only a.mount + mount, ok := metric.Options["mount"] + if ok { + a.Mount, a.mount = mount.StrVal, mount.StrVal + } + submit, ok := metric.Options["submit"] + if ok { + a.submitCommand = submit.StrVal + } + tasks, ok := metric.Options["tasks"] + if ok { + a.tasks = tasks.IntVal + } + fluxUid, ok := metric.Options["fluxUid"] + if ok { + a.fluxUid = fluxUid.StrVal + } + fluxuser, ok := metric.Options["fluxUser"] + if ok { + a.fluxUser = fluxuser.StrVal + } + + workdir, ok := metric.Options["workdir"] + if ok { + a.workdir = workdir.StrVal + } + logLevel, ok := metric.Options["logLevel"] + if ok { + a.logLevel = logLevel.StrVal + } + target, ok := metric.Options["target"] + if ok { + a.target = target.StrVal + } + ctarget, ok := metric.Options["containerTarget"] + if ok { + a.containerTarget = ctarget.StrVal + } + image, ok := metric.Options["image"] + if ok { + a.image = image.StrVal + } + quorum, ok := metric.Options["quorum"] + if ok { + a.quorum = quorum.StrVal + } + ct, ok := metric.Options["connectTimeout"] + if ok { + a.connectTimeout = ct.StrVal + } + opts, ok := metric.Options["optionFlags"] + if ok { + 
a.optionFlags = opts.StrVal + } + interactive, ok := metric.Options["interactive"] + if ok { + if interactive.StrVal == "yes" || interactive.StrVal == "true" { + a.interactive = true + } + } + zmq, ok := metric.Options["debugZeroMQ"] + if ok { + if zmq.StrVal == "yes" || zmq.StrVal == "true" { + a.debugZeroMQ = true + } + } + + // Create setup logic for flux from the view + a.setSetup() +} + +// generateRange is a shared function to generate a range string +func generateRange(size int32, start int32) string { + var rangeString string + if size == 1 { + rangeString = fmt.Sprintf("%d", start) + } else { + rangeString = fmt.Sprintf("%d-%d", start, (start+size)-1) + } + return rangeString +} + +// setSetup assumes flux installed in the view (/opt/view/bin)) and runs additional setup +// This includes generating the broker config, the curve certificate, and other config assets +func (a *FluxFramework) setSetup() { + + // fluxRoot for the view is in /opt/view/lib + fluxRoot := "/opt/view" + + // Generate hostlists, this is the lead broker + leadBroker := fmt.Sprintf("%s-%s-%s-0", a.jobname, a.jobLetter, a.jobIndex) + workers := fmt.Sprintf("%s-%s-%s-[%s]", a.jobname, a.workerLetter, a.jobIndex, generateRange(a.pods-1, 0)) + hosts := fmt.Sprintf("%s,%s", leadBroker, workers) + fqdn := fmt.Sprintf("%s.%s.svc.cluster.local", a.serviceName, a.namespace) + + // These shouldn't be formatted in block + defaultBind := "tcp://eth0:%p" + defaultConnect := "tcp://%h" + fmt.Sprintf(".%s:", fqdn) + "%p" + + setup := `#!/bin/sh +fluxuser=%s +fluxuid=%s +fluxroot=%s + +# The mount for the view will be at the user defined mount / view +mount="%s/view" + +# Always use verbose, no reason to not here +echo "Flux username: ${fluxuser}" +echo "Flux install root: ${fluxroot}" +export fluxroot + +# Add flux to the path +export PATH=/opt/view/bin:$PATH + +# Cron directory +mkdir -p $fluxroot/etc/flux/system/cron.d +mkdir -p $fluxroot/var/lib/flux + +# These actions need to happen on all hosts 
+mkdir -p $fluxroot/etc/flux/system +hosts="%s" +echo "flux R encode --hosts=${hosts} --local" +flux R encode --hosts=${hosts} --local > ${fluxroot}/etc/flux/system/R + +echo +echo "📦 Resources" +cat ${fluxroot}/etc/flux/system/R + +mkdir -p $fluxroot/etc/flux/imp/conf.d/ + +cat <<EOT >> ${fluxroot}/etc/flux/imp/conf.d/imp.toml +[exec] +allowed-users = [ "${fluxuser}", "root" ] +allowed-shells = [ "${mount}/libexec/flux/flux-shell" ] +EOT + +echo +echo "🦩 Independent Minister of Privilege" +cat ${fluxroot}/etc/flux/imp/conf.d/imp.toml + +# Write the broker configuration +mkdir -p ${fluxroot}/etc/flux/config +cat <<EOT >> ${fluxroot}/etc/flux/config/broker.toml +[exec] +imp = "${mount}/libexec/flux/flux-imp" + +[access] +allow-guest-user = true +allow-root-owner = true + +# Point to resource definition generated with flux-R(1). +[resource] +path = "${mount}/etc/flux/system/R" + +[bootstrap] +curve_cert = "${mount}/etc/curve/curve.cert" +default_port = 8050 +default_bind = "%s" +default_connect = "%s" +hosts = [ + { host="${hosts}"}, +] +[archive] +dbpath = "${mount}/var/lib/flux/job-archive.sqlite" +period = "1m" +busytimeout = "50s" + +[sched-fluxion-qmanager] +queue-policy = "%s" +EOT + +echo +echo "🐸 Broker Configuration" +cat ${fluxroot}/etc/flux/config/broker.toml + +# If we are communicating via the flux uri this service needs to be started +chmod u+s ${fluxroot}/libexec/flux/flux-imp +chmod 4755 ${fluxroot}/libexec/flux/flux-imp +chmod 0644 ${fluxroot}/etc/flux/imp/conf.d/imp.toml + +# The rundir needs to be created first, and owned by user flux +# Along with the state directory and curve certificate +mkdir -p ${fluxroot}/run/flux ${fluxroot}/etc/curve + +# Generate the certificate +mkdir -p ${fluxroot}/etc/curve +$fluxroot/bin/flux keygen ${fluxroot}/etc/curve/curve.cert + +# Remove group and other read +chmod o-r ${fluxroot}/etc/curve/curve.cert +chmod g-r ${fluxroot}/etc/curve/curve.cert + +# Either the flux user owns the instance, or root +# We must get the 
correct flux user id - this user needs to own +# the run directory and these others +chown -R ${fluxuid} ${fluxroot}/etc/curve/curve.cert + +echo +echo "✨ Curve certificate" +cat ${fluxroot}/etc/curve/curve.cert +` + + setup = fmt.Sprintf( + setup, + a.fluxUser, + a.fluxUid, + fluxRoot, + a.Mount, + hosts, + defaultBind, + defaultConnect, + a.queuePolicy, + ) + a.Setup = setup +} + +// Exported options and list options +func (a *FluxFramework) Options() map[string]intstr.IntOrString { + options := a.DefaultOptions() + options["mount"] = intstr.FromString(a.mount) + return options +} + +// CustomizeEntrypoint scripts +func (a *FluxFramework) CustomizeEntrypoints( + cs []*specs.ContainerSpec, + rjs []*jobset.ReplicatedJob, +) { + for _, rj := range rjs { + + // Only customize if the replicated job name matches the target + if a.target != "" && a.target != rj.Name { + continue + } + a.customizeEntrypoint(cs, rj) + } + +} + +// CustomizeEntrypoint for a single replicated job +// This is the portion that customizes our application to be run / submit by flux instead of by itself :) +func (a *FluxFramework) customizeEntrypoint( + cs []*specs.ContainerSpec, + rj *jobset.ReplicatedJob, +) { + + // Generate addon metadata + meta := Metadata(a) + + interactive := "" + if a.interactive { + interactive = "-Sbroker.rc2_none" + } + zeromq := "" + if a.debugZeroMQ { + zeromq = "-Stbon.zmqdebug=1" + } + + // This assumes a certain launcher letter for now + // TODO allow to customize letter + leadBroker := fmt.Sprintf("%s-%s-%s-0", a.jobname, a.jobLetter, a.jobIndex) + + // Watch only works with submit + watch := "" + if strings.Contains(a.submitCommand, "submit") { + watch = "--watch" + } + + // Prepare flags for flux + flags := "" + if (a.tasks != 0 && a.tasks > a.pods) || a.tasks == 0 { + flags = fmt.Sprintf(" -N %d", a.pods) + } + if a.tasks != 0 { + flags += fmt.Sprintf(" -n %d %s -vvv", a.tasks, a.optionFlags) + } else { + flags += fmt.Sprintf(" %s -vvv", a.optionFlags) + } + 
+ // This should be run after the pre block of the script + preBlock := ` +echo "%s" +# Ensure the flux volume addition is complete. +wget https://github.com/converged-computing/goshare/releases/download/2023-09-06/wait-fs +chmod +x ./wait-fs +mv ./wait-fs /usr/bin/goshare-wait-fs + +# Ensure spack view is on the path, wherever it is mounted +viewbase="%s" +viewroot=${viewbase}/view +software="${viewbase}/software" +viewbin="${viewroot}/bin" +fluxpath=${viewbin}/flux + +# Important to add AFTER in case software in container duplicated +export PATH=$PATH:${viewbin} + +# Wait for software directory, and give it time +goshare-wait-fs -p ${software} + +# Wait for copy to finish +sleep 10 + +# Copy mount software to /opt/software +cp -R ${viewbase}/software /opt/software + +# Wait for marker (from spack.go) to indicate copy is done +goshare-wait-fs -p ${fluxpath} +goshare-wait-fs -p ${viewbase}/metrics-operator-done.txt + +# A small extra wait time to be conservative +sleep 5 + +# Prefix to run as root (which we will do first) +fluxuser="%s" +fluxuid="%s" + +# Add a flux user (required) that should exist before pre-command +# This might vary between OS +adduser --disabled-password --uid ${fluxuid} --gecos "" ${fluxuser} > /dev/null 2>&1 || echo "Issue adding ${fluxuser}" + +# Add view to default LD_LIBRARY_PATH +export LD_LIBRARY_PATH=${viewroot}/lib:${viewroot}/lib64 + +# Ensure we use flux's python (TODO update this to use variable) +export PYTHONPATH=${viewroot}/lib/python3.11:${viewroot}/lib/python3.11/site-packages + +# We could install pip... 
+# /opt/share/view/bin/python3.11 -m ensurepip + +asSudo="sudo -E PYTHONPATH=$PYTHONPATH -E PATH=$PATH -E LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" +asFlux="${asSudo}" + +# If we aren't running as root, TODO +# asFlux="${asSudo} -E HOME=/home/${fluxuser}" +# asFlux="sudo -u ${fluxuser} -E PYTHONPATH=$PYTHONPATH -E PATH=$PATH -E LD_LIBRARY_PATH=${LD_LIBRARY_PATH} -E HOME=/home/${fluxuser}" + +# Add fluxuser to sudoers +echo "${fluxuser} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers + +# Put the state directory in /var/lib on shared view +export STATE_DIR=${viewroot}/var/lib/flux +mkdir -p ${STATE_DIR} + +# Main host -0 and the fully qualified domain name +mainHost="%s" + +echo "👋 Hello, I'm $(hostname)" +echo "The main host is ${mainHost}" + +workdir=$(pwd) +echo "The working directory is ${workdir}, contents include:" +ls -R ${workdir} + +# Try to support debian / rocky flavor +# This is the weakest point - it takes a long time to install with dnf +/usr/bin/yum install munge -y || apt-get install -y munge +systemctl enable munge && systemctl start munge || service munge start + +# Use root for now, easier :) +# fluxuid=$(id -u ${fluxuser}) +# chown -R ${fluxuid} /run/flux ${STATE_DIR} ${viewbin}/etc/curve/curve.cert ${workdir} + +brokerOptions="-Scron.directory=/etc/flux/system/cron.d \ + -Stbon.fanout=256 \ + -Srundir=/run/flux %s \ + -Sstatedir=${STATE_DIR} \ + -Slocal-uri=local:///run/flux/local \ + -Stbon.connect_timeout=%s \ + -Sbroker.quorum=%s %s \ + -Slog-stderr-level=%s \ + -Slog-stderr-mode=local" + +# Run an interactive cluster, giving no command to flux start +function run_interactive_cluster() { + echo "🌀 ${asFlux} flux broker --config-path /etc/flux/config ${brokerOptions}" + ${asFlux} flux broker --config-path /etc/flux/config ${brokerOptions} +} + +flags="%s" +watch="%s" +submit="%s" + +# Start flux with the original entrypoint +if [ $(hostname) == "${mainHost}" ]; then + + echo "Command provided is: ${command}" + if [ "${command}" == "" ]; then + + # An 
interactive job also doesn't require a command + run_interactive_cluster + + else + # TODO we can add --wrap here if needed + echo "🌀 Submit Mode: flux start -o --config ${viewroot}/etc/flux/config ${brokerOptions} flux ${submit} ${flags} --quiet ${watch} -vvv ${command}" + flux start -o --config ${viewroot}/etc/flux/config ${brokerOptions} flux ${submit} ${flags} --quiet ${watch} -vvv ${command} + fi + +# Block run by workers +else + +# We basically sleep/wait until the lead broker is ready +echo "🌀 flux start -o --config ${viewroot}/etc/flux/config ${brokerOptions}" + +while true + do + flux start -o --config ${viewroot}/etc/flux/config ${brokerOptions} + retval=$? + echo "Return value for follower worker is ${retval}" + if [[ "${retval}" -eq 0 ]]; then + echo "The follower worker exited cleanly. Goodbye." + break + fi + echo "😪 Sleeping 15s until broker is ready..." + sleep 15 +done +fi + +echo "%s" +echo "%s" +` + preBlock = fmt.Sprintf( + preBlock, + meta, + a.Mount, + a.fluxUser, + a.fluxUid, + leadBroker, + interactive, + a.connectTimeout, + a.quorum, + zeromq, + a.logLevel, + flags, + watch, + a.submitCommand, + metadata.CollectionStart, + metadata.Separator, + ) + + // Flux needs this set to false + setFQDN := false + + // We use container names to target specific entrypoint scripts here + for _, containerSpec := range cs { + + // First check - is this the right replicated job? + if containerSpec.JobName != rj.Name { + continue + } + rj.Template.Spec.Template.Spec.SetHostnameAsFQDN = &setFQDN + + // Always copy over the pre block - we need the logic to copy software + // Then we need to add the command, and finish with the full preBlock + // The command is given to flux! 
+ command := containerSpec.EntrypointScript.Command + containerSpec.EntrypointScript.Pre += "\n" + fmt.Sprintf("command='%s'", command) + "\n" + preBlock + + // Next check if we have a target set (for the container) + if a.containerTarget != "" && containerSpec.Name != "" && a.containerTarget != containerSpec.Name { + continue + } + + // If the post command ends with sleep infinity, tweak it + isInteractive, updatedPost := deriveUpdatedPost(containerSpec.EntrypointScript.Post) + containerSpec.EntrypointScript.Post = updatedPost + + // We will never get to command, so just make it empty + containerSpec.EntrypointScript.Command = "" + + // If is interactive, add back sleep infinity + if isInteractive { + containerSpec.EntrypointScript.Post += "\nsleep infinity\n" + } + } +} + +func init() { + base := AddonBase{ + Identifier: fluxIdentifier, + Summary: "hierarchical graph-based scheduler and resource manager", + } + app := ApplicationAddon{AddonBase: base} + spack := SpackView{ApplicationAddon: app} + flux := FluxFramework{SpackView: spack} + Register(&flux) +} diff --git a/pkg/addons/hpctoolkit.go b/pkg/addons/hpctoolkit.go index 25f32a6..45452bc 100644 --- a/pkg/addons/hpctoolkit.go +++ b/pkg/addons/hpctoolkit.go @@ -9,13 +9,11 @@ package addons import ( "fmt" - "path/filepath" "strings" api "github.com/converged-computing/metrics-operator/api/v1alpha2" "github.com/converged-computing/metrics-operator/pkg/metadata" "github.com/converged-computing/metrics-operator/pkg/specs" - corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/util/intstr" jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2" ) @@ -30,7 +28,7 @@ const ( ) type HPCToolkit struct { - ApplicationAddon + SpackView // Target is the name of the replicated job to customize entrypoint logic for target string @@ -45,9 +43,6 @@ type HPCToolkit struct { // ContainerTarget is the name of the container to add the entrypoint logic to containerTarget string events string - mount string - entrypointPath string - 
volumeName string // For mpirun and similar, mpirun needs to wrap hpcrun and the command, e.g., // mpirun hpcrun @@ -61,48 +56,7 @@ func (m HPCToolkit) Family() string { // AssembleVolumes to provide an empty volume for the application to share // We also need to provide a config map volume for our container spec func (m HPCToolkit) AssembleVolumes() []specs.VolumeSpec { - volume := corev1.Volume{ - Name: m.volumeName, - VolumeSource: corev1.VolumeSource{ - EmptyDir: &corev1.EmptyDirVolumeSource{}, - }, - } - - // Prepare items as key to path - items := []corev1.KeyToPath{ - { - Key: m.volumeName, - Path: filepath.Base(m.entrypointPath), - }, - } - - // This is a config map volume with items - // It needs to be created in the same metrics operator namespace - // Thus we only need the items! - configVolume := corev1.Volume{ - VolumeSource: corev1.VolumeSource{ - ConfigMap: &corev1.ConfigMapVolumeSource{ - Items: items, - }, - }, - } - - // EmptyDir should be ReadOnly False, and we don't need a mount for it - return []specs.VolumeSpec{ - { - Volume: volume, - Mount: true, - Path: m.mount, - }, - - // Mount is set to false here because we mount via metrics_operator - { - Volume: configVolume, - ReadOnly: true, - Mount: false, - Path: filepath.Dir(m.entrypointPath), - }, - } + return m.GetSpackViewVolumes() } // Validate we have an executable provided, and args and optional @@ -115,16 +69,17 @@ func (a *HPCToolkit) Validate() bool { } // Set custom options / attributes for the metric -func (a *HPCToolkit) SetOptions(metric *api.MetricAddon) { +func (a *HPCToolkit) SetOptions(metric *api.MetricAddon, m *api.MetricSet) { - a.entrypointPath = "/metrics_operator/hpctoolkit-entrypoint.sh" + a.EntrypointPath = "/metrics_operator/hpctoolkit-entrypoint.sh" a.image = "ghcr.io/converged-computing/metric-hpctoolkit-view:ubuntu" a.SetDefaultOptions(metric) - a.mount = "/opt/share" - a.volumeName = "hpctoolkit" + a.Mount = "/opt/share" + a.VolumeName = "hpctoolkit" a.output = 
"hpctoolkit-result" a.postAnalysis = true a.Identifier = hpctoolkitIdentifier + a.SpackViewContainer = "hpctoolkit" // UseColor set to anything means to use it output, ok := metric.Options["output"] @@ -133,7 +88,7 @@ func (a *HPCToolkit) SetOptions(metric *api.MetricAddon) { } mount, ok := metric.Options["mount"] if ok { - a.mount = mount.StrVal + a.Mount = mount.StrVal } prefix, ok := metric.Options["prefix"] if ok { @@ -172,7 +127,7 @@ func (a *HPCToolkit) SetOptions(metric *api.MetricAddon) { func (a *HPCToolkit) Options() map[string]intstr.IntOrString { options := a.DefaultOptions() options["events"] = intstr.FromString(a.events) - options["mount"] = intstr.FromString(a.mount) + options["mount"] = intstr.FromString(a.Mount) options["prefix"] = intstr.FromString(a.prefix) return options } @@ -262,8 +217,8 @@ echo "%s" preBlock = fmt.Sprintf( preBlock, meta, - a.mount, - a.mount, + a.Mount, + a.Mount, a.output, a.events, metadata.CollectionStart, @@ -337,76 +292,13 @@ func deriveUpdatedPost(post string) (bool, string) { return false, post } -// Generate a container spec that will map to a listing of containers for the replicated job -func (a *HPCToolkit) AssembleContainers() []specs.ContainerSpec { - - // The entrypoint script - // This is the addon container entrypoint, we don't care about metadata here - // The sole purpose is just to provide the volume, meaning copying content there - template := `#!/bin/bash - -echo "Moving content from /opt/view to be in shared volume at %s" -view=$(ls /opt/views/._view/) -view="/opt/views/._view/${view}" - -# Give a little extra wait time -sleep 10 - -viewroot="%s" -mkdir -p $viewroot/view -# We have to move both of these paths, *sigh* -cp -R ${view}/* $viewroot/view -cp -R /opt/software $viewroot/ - -# This is a marker to indicate the copy is done -touch $viewroot/metrics-operator-done.txt - -# Sleep forever, the application needs to run and end -echo "Sleeping forever so %s can be shared and use for hpctoolkit." 
-sleep infinity -` - script := fmt.Sprintf( - template, - a.mount, - a.mount, - a.mount, - ) - - // Leave the name empty to generate in the namespace of the metric set (e.g., set.Name) - entrypoint := specs.EntrypointScript{ - Name: a.volumeName, - Path: a.entrypointPath, - Script: filepath.Base(a.entrypointPath), - Pre: script, - } - - // The resource spec and attributes for now are empty (might redo this design) - // We assume they inherit the resources / attributes of the pod for now - // We don't use JobName here because we don't associate addon containers - // with other addon entrypoints - return []specs.ContainerSpec{ - { - Image: a.image, - Name: "hpctoolkit", - EntrypointScript: entrypoint, - Resources: &api.ContainerResources{}, - Attributes: &api.ContainerSpec{ - SecurityContext: api.SecurityContext{ - Privileged: a.privileged, - }, - }, - // We need to write this config map! - NeedsWrite: true, - }, - } -} - func init() { base := AddonBase{ Identifier: hpctoolkitIdentifier, Summary: "performance tools for measurement and analysis", } app := ApplicationAddon{AddonBase: base} - HPCToolkit := HPCToolkit{ApplicationAddon: app} - Register(&HPCToolkit) + spack := SpackView{ApplicationAddon: app} + toolkit := HPCToolkit{SpackView: spack} + Register(&toolkit) } diff --git a/pkg/addons/spack.go b/pkg/addons/spack.go new file mode 100644 index 0000000..636643c --- /dev/null +++ b/pkg/addons/spack.go @@ -0,0 +1,146 @@ +/* +Copyright 2023 Lawrence Livermore National Security, LLC + (c.f. 
AUTHORS, NOTICE.LLNS, COPYING) + +SPDX-License-Identifier: MIT +*/ + +package addons + +import ( + "fmt" + "path/filepath" + + api "github.com/converged-computing/metrics-operator/api/v1alpha2" + "github.com/converged-computing/metrics-operator/pkg/specs" + corev1 "k8s.io/api/core/v1" +) + +// A spack view expects to copy a view from /opt/view into a mount +// This is a virtual struct in that it just provides shared functions for others +type SpackView struct { + ApplicationAddon + + Setup string + SpackViewContainer string + VolumeName string + EntrypointPath string + Mount string +} + +// Generate a container spec that will map to a listing of containers for the replicated job +func (a *SpackView) AssembleContainers() []specs.ContainerSpec { + + // The entrypoint script + // This is the addon container entrypoint, we don't care about metadata here + // The sole purpose is just to provide the volume, meaning copying content there + template := `#!/bin/bash + +# Extra setup (optional) for a spack view +%s + +echo "Moving content from /opt/view to be in shared volume at %s" +view=$(ls /opt/views/._view/) +view="/opt/views/._view/${view}" + +# Give a little extra wait time +sleep 10 + +viewroot="%s" +mkdir -p $viewroot/view +# We have to move both of these paths, *sigh* +cp -R ${view}/* $viewroot/view +cp -R /opt/software $viewroot/ + +# This is a marker to indicate the copy is done +touch $viewroot/metrics-operator-done.txt + +# Sleep forever, the application needs to run and end +echo "Sleeping forever so %s can be shared and use for %s." 
+sleep infinity +` + script := fmt.Sprintf( + template, + a.Setup, + a.Mount, + a.Mount, + a.Mount, + a.Identifier, + ) + + // Leave the name empty to generate in the namespace of the metric set (e.g., set.Name) + entrypoint := specs.EntrypointScript{ + Name: a.VolumeName, + Path: a.EntrypointPath, + Script: filepath.Base(a.EntrypointPath), + Pre: script, + } + + // The resource spec and attributes for now are empty (might redo this design) + // We assume they inherit the resources / attributes of the pod for now + // We don't use JobName here because we don't associate addon containers + // with other addon entrypoints + return []specs.ContainerSpec{ + { + Image: a.image, + Name: a.SpackViewContainer, + EntrypointScript: entrypoint, + Resources: &api.ContainerResources{}, + Attributes: &api.ContainerSpec{ + SecurityContext: api.SecurityContext{ + Privileged: a.privileged, + }, + }, + // We need to write this config map! + NeedsWrite: true, + }, + } +} + +// AssembleVolumes to provide an empty volume for the application to share +// We also need to provide a config map volume for our container spec +func (m *SpackView) GetSpackViewVolumes() []specs.VolumeSpec { + + volume := corev1.Volume{ + Name: m.VolumeName, + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + } + + // Prepare items as key to path + items := []corev1.KeyToPath{ + { + Key: m.VolumeName, + Path: filepath.Base(m.EntrypointPath), + }, + } + + // This is a config map volume with items + // It needs to be created in the same metrics operator namespace + // Thus we only need the items! 
+ configVolume := corev1.Volume{ + VolumeSource: corev1.VolumeSource{ + ConfigMap: &corev1.ConfigMapVolumeSource{ + Items: items, + }, + }, + } + + // EmptyDir should be ReadOnly False, and we don't need a mount for it + return []specs.VolumeSpec{ + { + Volume: volume, + Mount: true, + Path: m.Mount, + }, + + // Mount is set to false here because we mount via metrics_operator + { + Volume: configVolume, + ReadOnly: true, + Mount: false, + Path: filepath.Dir(m.EntrypointPath), + }, + } +} diff --git a/pkg/addons/volumes.go b/pkg/addons/volumes.go index 38825dc..5c434fe 100644 --- a/pkg/addons/volumes.go +++ b/pkg/addons/volumes.go @@ -104,7 +104,7 @@ func (v *ConfigMapVolume) Validate() bool { } // Set custom options / attributes for the metric -func (v *ConfigMapVolume) SetOptions(metric *api.MetricAddon) { +func (v *ConfigMapVolume) SetOptions(metric *api.MetricAddon, m *api.MetricSet) { v.Identifier = cmName @@ -199,7 +199,7 @@ func (v *PersistentVolumeClaim) Validate() bool { } // Set custom options / attributes -func (v *PersistentVolumeClaim) SetOptions(metric *api.MetricAddon) { +func (v *PersistentVolumeClaim) SetOptions(metric *api.MetricAddon, m *api.MetricSet) { v.Identifier = pvcName @@ -246,7 +246,7 @@ func (v *SecretVolume) Validate() bool { } // Set custom options / attributes -func (v *SecretVolume) SetOptions(metric *api.MetricAddon) { +func (v *SecretVolume) SetOptions(metric *api.MetricAddon, m *api.MetricSet) { v.Identifier = secretName secretName, ok := metric.Options["secretName"] @@ -292,7 +292,7 @@ func (v *HostPathVolume) Validate() bool { } // Set custom options / attributes -func (v *HostPathVolume) SetOptions(metric *api.MetricAddon) { +func (v *HostPathVolume) SetOptions(metric *api.MetricAddon, m *api.MetricSet) { v.Identifier = hostPathName @@ -333,7 +333,7 @@ func (v *EmptyVolume) Validate() bool { } // Set custom options / attributes -func (v *EmptyVolume) SetOptions(metric *api.MetricAddon) { +func (v *EmptyVolume) SetOptions(metric 
*api.MetricAddon, m *api.MetricSet) { v.Identifier = emptyName name, ok := metric.Options["name"] if ok { diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index a86db07..d43f083 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -86,7 +86,7 @@ func GetMetric(metric *api.Metric, set *api.MetricSet) (Metric, error) { for _, a := range metric.Addons { logger.Infof("Attempting to add addon %s", a.Name) - addon, err := addons.GetAddon(&a) + addon, err := addons.GetAddon(&a, set) if err != nil { return nil, fmt.Errorf("Addon %s for metric %s did not validate", a.Name, metric.Name) } From 0acf43e829aec884837b96c837d1ae21222e62b6 Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 27 Sep 2023 00:07:35 -0600 Subject: [PATCH 2/9] almost working! Need to debug why workers are joining but not being seen Signed-off-by: vsoch --- config/manager/kustomization.yaml | 2 +- examples/addons/perf-lammps/metrics.yaml | 19 ------------------- pkg/addons/flux.go | 14 ++++++++------ 3 files changed, 9 insertions(+), 26 deletions(-) delete mode 100644 examples/addons/perf-lammps/metrics.yaml diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index efe054b..10a2d40 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -5,4 +5,4 @@ kind: Kustomization images: - name: controller newName: ghcr.io/converged-computing/metrics-operator - newTag: latest + newTag: test diff --git a/examples/addons/perf-lammps/metrics.yaml b/examples/addons/perf-lammps/metrics.yaml deleted file mode 100644 index 502bd87..0000000 --- a/examples/addons/perf-lammps/metrics.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: flux-framework.org/v1alpha2 -kind: MetricSet -metadata: - labels: - app.kubernetes.io/name: metricset - app.kubernetes.io/instance: metricset-sample - name: metricset-sample -spec: - # Number of pods for lammps (one launcher, the rest workers) - pods: 4 - metrics: - - name: app-lammps - image: 
ghcr.io/converged-computing/metric-lammps-intel-mpi:rocky - addons: - - name: commands - options: - preBlock: dnf install perf -y && sleep infinity - prefix: /opt/intel/mpi/2021.8.0/bin/mpirun --hostfile ./hostlist.txt -np 2 --map-by socket perf stat - suffix: sleep infinity diff --git a/pkg/addons/flux.go b/pkg/addons/flux.go index 6fa3867..d614fbc 100644 --- a/pkg/addons/flux.go +++ b/pkg/addons/flux.go @@ -378,11 +378,12 @@ func (a *FluxFramework) customizeEntrypoint( watch = "--watch" } - // Prepare flags for flux + // Prepare flags for flux. First, add big N if it's > pods, OR not set flags := "" if (a.tasks != 0 && a.tasks > a.pods) || a.tasks == 0 { flags = fmt.Sprintf(" -N %d", a.pods) } + // Little n only gets added if it is set if a.tasks != 0 { flags += fmt.Sprintf(" -n %d %s -vvv", a.tasks, a.optionFlags) } else { @@ -392,6 +393,12 @@ func (a *FluxFramework) customizeEntrypoint( // This should be run after the pre block of the script preBlock := ` echo "%s" + +# Try to support debian / rocky flavor +# This is the weakest point - it takes a long time to install with dnf +/usr/bin/yum install munge -y || apt-get install -y munge || echo "Issue installing munge, might already be installed." +systemctl enable munge && systemctl start munge || service munge start || echo "Issue starting munge, might already be started." + # Ensure the flux volume addition is complete. 
wget https://github.com/converged-computing/goshare/releases/download/2023-09-06/wait-fs chmod +x ./wait-fs @@ -464,11 +471,6 @@ workdir=$(pwd) echo "The working directory is ${workdir}, contents include:" ls -R ${workdir} -# Try to support debian / rocky flavor -# This is the weakest point - it takes a long time to install with dnf -/usr/bin/yum install munge -y || apt-get install -y munge -systemctl enable munge && systemctl start munge || service munge start - # Use root for now, easier :) # fluxuid=$(id -u ${fluxuser}) # chown -R ${fluxuid} /run/flux ${STATE_DIR} ${viewbin}/etc/curve/curve.cert ${workdir} From b4b59dc8c5832d935c4ce6cb56b244a0d35bea13 Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 27 Sep 2023 00:24:48 -0600 Subject: [PATCH 3/9] spelling Signed-off-by: vsoch --- config/manager/kustomization.yaml | 2 +- pkg/addons/flux.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 10a2d40..efe054b 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -5,4 +5,4 @@ kind: Kustomization images: - name: controller newName: ghcr.io/converged-computing/metrics-operator - newTag: test + newTag: latest diff --git a/pkg/addons/flux.go b/pkg/addons/flux.go index d614fbc..ac6b872 100644 --- a/pkg/addons/flux.go +++ b/pkg/addons/flux.go @@ -80,7 +80,7 @@ func (m FluxFramework) AssembleVolumes() []specs.VolumeSpec { return m.GetSpackViewVolumes() } -// Set custom options / attributes for the metric +// Set custom options / attributes for the addon metric func (a *FluxFramework) SetOptions(metric *api.MetricAddon, set *api.MetricSet) { a.EntrypointPath = "/metrics_operator/flux-entrypoint.sh" @@ -565,7 +565,7 @@ echo "%s" rj.Template.Spec.Template.Spec.SetHostnameAsFQDN = &setFQDN // Always copy over the pre block - we need the logic to copy software - // Then we need to add the command,a nd finish with the full preBlock + // Then we need 
to add the command, and finish with the full preBlock // The command is given to flux! command := containerSpec.EntrypointScript.Command containerSpec.EntrypointScript.Pre += "\n" + fmt.Sprintf("command='%s'", command) + "\n" + preBlock From ba27b5030881db53b6bba6d84cbb235ec0313654 Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 27 Sep 2023 07:50:46 -0600 Subject: [PATCH 4/9] add support for preCommand and ld library path Signed-off-by: vsoch --- docs/getting_started/addons.md | 12 ++ examples/addons/flux-lammps/metrics.yaml | 8 ++ pkg/addons/flux.go | 138 ++++++++++++++++++----- 3 files changed, 127 insertions(+), 31 deletions(-) diff --git a/docs/getting_started/addons.md b/docs/getting_started/addons.md index 9c845f0..27ee0e8 100644 --- a/docs/getting_started/addons.md +++ b/docs/getting_started/addons.md @@ -195,10 +195,22 @@ under the metric -> options. | debugZeroMQ | Turn on zeroMQ debugging | string | "false" | | logLevel | Customize the flux log level | string | "6" | | queuePolicy | Queue policy for flux to use | string | fcfs | +| workerLetter | The letter that the worker job is expected to have | string | w | +| launcherLetter | The letter that the launcher job is expected to have | string | w | +| workerIndex | The index of the replicated job for the worker | string | 0 | +| launcherIndex | The index of the replicated job for the launcher | string | 0 | +| libraryPath | Extra to add to the start of the `LD_LIBRARY_PATH` | string | unset | +| preCommand | Pre-command logic to run in launcher/workers before flux is started (after setup in flux container) | string | unset | Note that the number of pods for flux defaults to the number in your MetricSet, along with the namespace and service name. +**Important** the flux addon is currently supported for metric types that: + +1. have the launcher / worker design (so the hostlist.txt is present in the PWD) +2. 
Have scp installed, as the shared certificate needs to be copied from the lead broker to all followers +3. Ideally have munge installed - we do try to install it (but better to already be there) + ## Performance ### perf-hpctoolkit diff --git a/examples/addons/flux-lammps/metrics.yaml b/examples/addons/flux-lammps/metrics.yaml index a099d3f..89feaa2 100644 --- a/examples/addons/flux-lammps/metrics.yaml +++ b/examples/addons/flux-lammps/metrics.yaml @@ -27,6 +27,14 @@ spec: addons: - name: workload-flux options: + # Needed for intel MPI I think? + preCommand: | + export PMI_TRACE=0xff + # export I_MPI_PMI_LIBRARY=/opt/intel/mpi/latest/lib/libmpicxx.so + export I_MPI_PMI_LIBRARY=${fluxroot}/lib/flux/libpmi.so + # optionFlags: -ompi=pmix + # Ensure library path (with MPI) is used first + libraryPath: /opt/intel/mpi/latest/lib # Ensure the working directory is consistent workdir: /opt/lammps/examples/reaxff/HNS \ No newline at end of file diff --git a/pkg/addons/flux.go b/pkg/addons/flux.go index ac6b872..574aa87 100644 --- a/pkg/addons/flux.go +++ b/pkg/addons/flux.go @@ -46,6 +46,10 @@ type FluxFramework struct { tasks int32 optionFlags string submitCommand string + preCommand string + + // Extra to add to LD_LIBRARY_PATH first + libraryPath string // volumeName to provide for the empty volume volumeName string @@ -55,9 +59,10 @@ type FluxFramework struct { logLevel string queuePolicy string serviceName string - jobLetter string + launcherLetter string workerLetter string - jobIndex string + workerIndex string + launcherIndex string } func (m FluxFramework) Family() string { @@ -101,15 +106,30 @@ func (a *FluxFramework) SetOptions(metric *api.MetricAddon, set *api.MetricSet) a.serviceName = set.Spec.ServiceName a.queuePolicy = "fcfs" a.SpackViewContainer = "flux-framework" - a.jobIndex = "0" - a.jobLetter = "l" + a.launcherIndex = "0" + a.workerIndex = "0" + a.launcherLetter = "l" a.workerLetter = "w" a.quorum = fmt.Sprintf("%d", a.pods) // e.g., can be changed to 
mini submit or run a.submitCommand = "submit" - - // UseColor set to anything means to use it + lp, ok := metric.Options["libraryPath"] + if ok { + a.libraryPath = lp.StrVal + } + pc, ok := metric.Options["preCommand"] + if ok { + a.preCommand = pc.StrVal + } + wi, ok := metric.Options["workerIndex"] + if ok { + a.workerIndex = wi.StrVal + } + li, ok := metric.Options["launcherIndex"] + if ok { + a.launcherIndex = li.StrVal + } mount, ok := metric.Options["mount"] if ok { a.mount = mount.StrVal @@ -199,8 +219,8 @@ func (a *FluxFramework) setSetup() { fluxRoot := "/opt/view" // Generate hostlists, this is the lead broker - leadBroker := fmt.Sprintf("%s-%s-%s-0", a.jobname, a.jobLetter, a.jobIndex) - workers := fmt.Sprintf("%s-%s-%s-[%s]", a.jobname, a.workerLetter, a.jobIndex, generateRange(a.pods-1, 0)) + leadBroker := fmt.Sprintf("%s-%s-%s-0", a.jobname, a.launcherLetter, a.launcherIndex) + workers := fmt.Sprintf("%s-%s-%s-[%s]", a.jobname, a.workerLetter, a.workerIndex, generateRange(a.pods-1, 0)) hosts := fmt.Sprintf("%s,%s", leadBroker, workers) fqdn := fmt.Sprintf("%s.%s.svc.cluster.local", a.serviceName, a.namespace) @@ -216,6 +236,10 @@ fluxroot=%s # The mount for the view will be at the user defined mount / view mount="%s/view" +echo "Hello I am hostname $(hostname) running setup." 
+# We only want one host to generate a certificate +mainHost="%s" + # Always use verbose, no reason to not here echo "Flux username: ${fluxuser}" echo "Flux install root: ${fluxroot}" @@ -294,22 +318,26 @@ chmod 0644 ${fluxroot}/etc/flux/imp/conf.d/imp.toml # Along with the state directory and curve certificate mkdir -p ${fluxroot}/run/flux ${fluxroot}/etc/curve -# Generate the certificate +# Generate the certificate (ONLY if the lead broker) mkdir -p ${fluxroot}/etc/curve -$fluxroot/bin/flux keygen ${fluxroot}/etc/curve/curve.cert - -# Remove group and other read -chmod o-r ${fluxroot}/etc/curve/curve.cert -chmod g-r ${fluxroot}/etc/curve/curve.cert -# Either the flux user owns the instance, or root -# We must get the correct flux user id - this user needs to own -# the run directory and these others -chown -R ${fluxuid} ${fluxroot}/etc/curve/curve.cert +if [[ "$(hostname)" == "$mainHost" ]]; then + echo "I am the main host, generating shared certificate" + $fluxroot/bin/flux keygen ${fluxroot}/etc/curve/curve.cert -echo -echo "✨ Curve certificate" -cat ${fluxroot}/etc/curve/curve.cert + # Remove group and other read + chmod o-r ${fluxroot}/etc/curve/curve.cert + chmod g-r ${fluxroot}/etc/curve/curve.cert + + # Either the flux user owns the instance, or root + # We must get the correct flux user id - this user needs to own + # the run directory and these others + chown -R ${fluxuid} ${fluxroot}/etc/curve/curve.cert + + echo + echo "✨ Curve certificate" + cat ${fluxroot}/etc/curve/curve.cert +fi ` setup = fmt.Sprintf( @@ -318,6 +346,7 @@ cat ${fluxroot}/etc/curve/curve.cert a.fluxUid, fluxRoot, a.Mount, + leadBroker, hosts, defaultBind, defaultConnect, @@ -330,6 +359,23 @@ cat ${fluxroot}/etc/curve/curve.cert func (a *FluxFramework) Options() map[string]intstr.IntOrString { options := a.DefaultOptions() options["mount"] = intstr.FromString(a.mount) + options["quorum"] = intstr.FromString(a.quorum) + options["fluxUser"] = intstr.FromString(a.fluxUser) + 
options["fluxUid"] = intstr.FromString(a.fluxUid) + options["fluxUid"] = intstr.FromString(a.fluxUid) + options["pods"] = intstr.FromInt(int(a.pods)) + options["connectTimeout"] = intstr.FromString(a.connectTimeout) + options["logLevel"] = intstr.FromString(a.logLevel) + options["jobname"] = intstr.FromString(a.jobname) + options["namespace"] = intstr.FromString(a.namespace) + options["serviceName"] = intstr.FromString(a.serviceName) + options["queuePolicy"] = intstr.FromString(a.queuePolicy) + options["launcherIndex"] = intstr.FromString(a.launcherIndex) + options["launcherLetter"] = intstr.FromString(a.launcherLetter) + options["workerIndex"] = intstr.FromString(a.workerIndex) + options["workerLetter"] = intstr.FromString(a.workerLetter) + options["submitCommand"] = intstr.FromString(a.submitCommand) + options["libraryPath"] = intstr.FromString(a.libraryPath) return options } @@ -370,7 +416,7 @@ func (a *FluxFramework) customizeEntrypoint( // This assumes a certain launcher letter for now // TODO allow to customize letter - leadBroker := fmt.Sprintf("%s-%s-%s-0", a.jobname, a.jobLetter, a.jobIndex) + leadBroker := fmt.Sprintf("%s-%s-%s-0", a.jobname, a.launcherLetter, a.launcherIndex) // Watch only works with submit watch := "" @@ -397,13 +443,16 @@ echo "%s" # Try to support debian / rocky flavor # This is the weakest point - it takes a long time to install with dnf /usr/bin/yum install munge -y || apt-get install -y munge || echo "Issue installing munge, might already be installed." -systemctl enable munge && systemctl start munge || service munge start || echo "Issue starting munge, might already be started." +systemctl enable munge || service munge start || echo "Issue starting munge, might already be started." # Ensure the flux volume addition is complete. 
wget https://github.com/converged-computing/goshare/releases/download/2023-09-06/wait-fs chmod +x ./wait-fs mv ./wait-fs /usr/bin/goshare-wait-fs - + +# Pre-commands +%s + # Ensure spack view is on the path, wherever it is mounted viewbase="%s" viewroot=${viewbase}/view @@ -436,13 +485,20 @@ fluxuid="%s" # Add a flux user (required) that should exist before pre-command # This might vary between OS -adduser --disabled-password --uid ${fluxuid} --gecos "" ${fluxuser} > /dev/null 2>&1 || echo "Issue adding ${fluxuser}" +# adduser --disabled-password --uid ${fluxuid} --gecos "" ${fluxuser} > /dev/null 2>&1 || echo "Issue adding ${fluxuser}" # Add view to default LD_LIBRARY_PATH -export LD_LIBRARY_PATH=${viewroot}/lib:${viewroot}/lib64 +if [ -z ${LD_LIBRARY_PATH+x} ]; then + export LD_LIBRARY_PATH=%s:${viewroot}/lib:${viewroot}/lib64 +else + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:%s:${viewroot}/lib:${viewroot}/lib64 +fi +echo "LD_LIBRARY_PATH is ${LD_LIBRARY_PATH}" # Ensure we use flux's python (TODO update this to use variable) export PYTHONPATH=${viewroot}/lib/python3.11:${viewroot}/lib/python3.11/site-packages +echo "PYTHONPATH is ${PYTHONPATH}" +echo "PATH is $PATH" # We could install pip... 
# /opt/share/view/bin/python3.11 -m ensurepip @@ -495,9 +551,23 @@ flags="%s" watch="%s" submit="%s" +# We will copy the curve certificate if the lead, otherwise wait for it +curvepath=${viewroot}/etc/curve/curve.cert + # Start flux with the original entrypoint if [ $(hostname) == "${mainHost}" ]; then + # The main host needs to scp the curve.cert over to the others + for host in $(cat ./hostlist.txt); do + if [[ "$host" == "" ]]; then + continue + fi + if [[ "$host" == "${mainHost}" ]]; then + continue + fi + echo "Copying curve.cert to $host" + scp ${curvepath} $host:${curvepath} + done echo "Command provided is: ${command}" if [ "${command}" == "" ]; then @@ -516,16 +586,20 @@ else # We basically sleep/wait until the lead broker is ready echo "🌀 flux start -o --config ${viewroot}/etc/flux/config ${brokerOptions}" +until [ -f ${curvepath} ] +do + echo "Waiting for curve certificate to exist." + sleep 5 +done + +# We can keep trying forever, don't care if worker is successful or not +# TODO likely need to tweak success policy here? while true do flux start -o --config ${viewroot}/etc/flux/config ${brokerOptions} retval=$? echo "Return value for follower worker is ${retval}" - if [[ "${retval}" -eq 0 ]]; then - echo "The follower worker exited cleanly. Goodbye." - break - fi - echo "đŸ˜Ē Sleeping 15s until broker is ready..." + echo "đŸ˜Ē Sleeping 15s to try again..." 
sleep 15 done fi @@ -536,9 +610,11 @@ echo "%s" preBlock = fmt.Sprintf( preBlock, meta, + a.preCommand, a.Mount, a.fluxUser, a.fluxUid, + a.libraryPath, leadBroker, interactive, a.connectTimeout, From 1fa3851c3e8a39ca47bbb05b4bf9951f3f170fab Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 27 Sep 2023 08:00:24 -0600 Subject: [PATCH 5/9] bug with ld_library_path Signed-off-by: vsoch --- pkg/addons/flux.go | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/pkg/addons/flux.go b/pkg/addons/flux.go index 574aa87..a86572a 100644 --- a/pkg/addons/flux.go +++ b/pkg/addons/flux.go @@ -233,6 +233,8 @@ fluxuser=%s fluxuid=%s fluxroot=%s +%s + # The mount for the view will be at the user defined mount / view mount="%s/view" @@ -440,6 +442,8 @@ func (a *FluxFramework) customizeEntrypoint( preBlock := ` echo "%s" +%s + # Try to support debian / rocky flavor # This is the weakest point - it takes a long time to install with dnf /usr/bin/yum install munge -y || apt-get install -y munge || echo "Issue installing munge, might already be installed." 
@@ -449,10 +453,7 @@ systemctl enable munge || service munge start || echo "Issue starting munge, mig wget https://github.com/converged-computing/goshare/releases/download/2023-09-06/wait-fs chmod +x ./wait-fs mv ./wait-fs /usr/bin/goshare-wait-fs - -# Pre-commands -%s - + # Ensure spack view is on the path, wherever it is mounted viewbase="%s" viewroot=${viewbase}/view @@ -488,10 +489,11 @@ fluxuid="%s" # adduser --disabled-password --uid ${fluxuid} --gecos "" ${fluxuser} > /dev/null 2>&1 || echo "Issue adding ${fluxuser}" # Add view to default LD_LIBRARY_PATH +extra_ld_path=%s if [ -z ${LD_LIBRARY_PATH+x} ]; then - export LD_LIBRARY_PATH=%s:${viewroot}/lib:${viewroot}/lib64 + export LD_LIBRARY_PATH=${extra_ld_path}:${viewroot}/lib:${viewroot}/lib64 else - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:%s:${viewroot}/lib:${viewroot}/lib64 + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${extra_ld_path}:${viewroot}/lib:${viewroot}/lib64 fi echo "LD_LIBRARY_PATH is ${LD_LIBRARY_PATH}" @@ -585,12 +587,7 @@ else # We basically sleep/wait until the lead broker is ready echo "🌀 flux start -o --config ${viewroot}/etc/flux/config ${brokerOptions}" - -until [ -f ${curvepath} ] -do - echo "Waiting for curve certificate to exist." - sleep 5 -done +goshare-wait-fs -p ${curvepath} # We can keep trying forever, don't care if worker is successful or not # TODO likely need to tweak success policy here? 
From 9f9ae070158999976ffdc3dd533daf6ba1992819 Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 27 Sep 2023 08:09:10 -0600 Subject: [PATCH 6/9] another bug, but adding the mpi envar does not seem to work Signed-off-by: vsoch --- config/manager/kustomization.yaml | 2 +- pkg/addons/flux.go | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index efe054b..10a2d40 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -5,4 +5,4 @@ kind: Kustomization images: - name: controller newName: ghcr.io/converged-computing/metrics-operator - newTag: latest + newTag: test diff --git a/pkg/addons/flux.go b/pkg/addons/flux.go index a86572a..ed0bc76 100644 --- a/pkg/addons/flux.go +++ b/pkg/addons/flux.go @@ -233,8 +233,6 @@ fluxuser=%s fluxuid=%s fluxroot=%s -%s - # The mount for the view will be at the user defined mount / view mount="%s/view" From 25177cf809bf614c6b6b4f6af328ce355070f008 Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 27 Sep 2023 13:54:23 -0600 Subject: [PATCH 7/9] it is all working!!! Signed-off-by: vsoch --- config/manager/kustomization.yaml | 2 +- docs/getting_started/addons.md | 5 +++- examples/addons/flux-lammps/metrics.yaml | 14 ++------- pkg/addons/flux.go | 38 ++---------------------- 4 files changed, 10 insertions(+), 49 deletions(-) diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml index 10a2d40..efe054b 100644 --- a/config/manager/kustomization.yaml +++ b/config/manager/kustomization.yaml @@ -5,4 +5,4 @@ kind: Kustomization images: - name: controller newName: ghcr.io/converged-computing/metrics-operator - newTag: test + newTag: latest diff --git a/docs/getting_started/addons.md b/docs/getting_started/addons.md index 27ee0e8..ef689ae 100644 --- a/docs/getting_started/addons.md +++ b/docs/getting_started/addons.md @@ -199,7 +199,6 @@ under the metric -> options. 
| launcherLetter | The letter that the launcher job is expected to have | string | w | | workerIndex | The index of the replicated job for the worker | string | 0 | | launcherIndex | The index of the replicated job for the launcher | string | 0 | -| libraryPath | Extra to add to the start of the `LD_LIBRARY_PATH` | string | unset | | preCommand | Pre-command logic to run in launcher/workers before flux is started (after setup in flux container) | string | unset | Note that the number of pods for flux defaults to the number in your MetricSet, along @@ -211,6 +210,10 @@ with the namespace and service name. 2. Have scp installed, as the shared certificate needs to be copied from the lead broker to all followers 3. Ideally have munge installed - we do try to install it (but better to already be there) +We also currently run flux as root. This is considered bad practice, but probably OK +for this early development work. We don't see a need to have shared namespace / operator +environments at this point, which is why I didn't add it. + ## Performance ### perf-hpctoolkit diff --git a/examples/addons/flux-lammps/metrics.yaml b/examples/addons/flux-lammps/metrics.yaml index 89feaa2..0ddae9c 100644 --- a/examples/addons/flux-lammps/metrics.yaml +++ b/examples/addons/flux-lammps/metrics.yaml @@ -26,15 +26,7 @@ spec: # Add on hpctoolkit, will mount a volume and wrap lammps addons: - name: workload-flux - options: - # Needed for intel MPI I think? - preCommand: | - export PMI_TRACE=0xff - # export I_MPI_PMI_LIBRARY=/opt/intel/mpi/latest/lib/libmpicxx.so - export I_MPI_PMI_LIBRARY=${fluxroot}/lib/flux/libpmi.so - - # optionFlags: -ompi=pmix - # Ensure library path (with MPI) is used first - libraryPath: /opt/intel/mpi/latest/lib - # Ensure the working directory is consistent + options: + # Ensure intel environment is setup + preCommand: . 
/opt/intel/mpi/latest/env/vars.sh workdir: /opt/lammps/examples/reaxff/HNS \ No newline at end of file diff --git a/pkg/addons/flux.go b/pkg/addons/flux.go index ed0bc76..ea044ac 100644 --- a/pkg/addons/flux.go +++ b/pkg/addons/flux.go @@ -48,9 +48,6 @@ type FluxFramework struct { submitCommand string preCommand string - // Extra to add to LD_LIBRARY_PATH first - libraryPath string - // volumeName to provide for the empty volume volumeName string interactive bool @@ -111,13 +108,8 @@ func (a *FluxFramework) SetOptions(metric *api.MetricAddon, set *api.MetricSet) a.launcherLetter = "l" a.workerLetter = "w" a.quorum = fmt.Sprintf("%d", a.pods) - - // e.g., can be changed to mini submit or run a.submitCommand = "submit" - lp, ok := metric.Options["libraryPath"] - if ok { - a.libraryPath = lp.StrVal - } + pc, ok := metric.Options["preCommand"] if ok { a.preCommand = pc.StrVal @@ -375,7 +367,6 @@ func (a *FluxFramework) Options() map[string]intstr.IntOrString { options["workerIndex"] = intstr.FromString(a.workerIndex) options["workerLetter"] = intstr.FromString(a.workerLetter) options["submitCommand"] = intstr.FromString(a.submitCommand) - options["libraryPath"] = intstr.FromString(a.libraryPath) return options } @@ -486,30 +477,11 @@ fluxuid="%s" # This might vary between OS # adduser --disabled-password --uid ${fluxuid} --gecos "" ${fluxuser} > /dev/null 2>&1 || echo "Issue adding ${fluxuser}" -# Add view to default LD_LIBRARY_PATH -extra_ld_path=%s -if [ -z ${LD_LIBRARY_PATH+x} ]; then - export LD_LIBRARY_PATH=${extra_ld_path}:${viewroot}/lib:${viewroot}/lib64 -else - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${extra_ld_path}:${viewroot}/lib:${viewroot}/lib64 -fi -echo "LD_LIBRARY_PATH is ${LD_LIBRARY_PATH}" - # Ensure we use flux's python (TODO update this to use variable) -export PYTHONPATH=${viewroot}/lib/python3.11:${viewroot}/lib/python3.11/site-packages +export PYTHONPATH=${viewroot}/lib/python3.11/site-packages echo "PYTHONPATH is ${PYTHONPATH}" echo "PATH is 
$PATH" -# We could install pip... -# /opt/share/view/bin/python3.11 -m ensurepip - -asSudo="sudo -E PYTHONPATH=$PYTHONPATH -E PATH=$PATH -E LD_LIBRARY_PATH=${LD_LIBRARY_PATH}" -asFlux="${asSudo}" - -# If we aren't running as root, TODO -# asFlux="${asSudo} -E HOME=/home/${fluxuser}" -# asFlux="sudo -u ${fluxuser} -E PYTHONPATH=$PYTHONPATH -E PATH=$PATH -E LD_LIBRARY_PATH=${LD_LIBRARY_PATH} -E HOME=/home/${fluxuser}" - # Add fluxuser to sudoers echo "${fluxuser} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers @@ -527,10 +499,6 @@ workdir=$(pwd) echo "The working directory is ${workdir}, contents include:" ls -R ${workdir} -# Use root for now, easier :) -# fluxuid=$(id -u ${fluxuser}) -# chown -R ${fluxuid} /run/flux ${STATE_DIR} ${viewbin}/etc/curve/curve.cert ${workdir} - brokerOptions="-Scron.directory=/etc/flux/system/cron.d \ -Stbon.fanout=256 \ -Srundir=/run/flux %s \ @@ -588,7 +556,6 @@ echo "🌀 flux start -o --config ${viewroot}/etc/flux/config ${brokerOptions}" goshare-wait-fs -p ${curvepath} # We can keep trying forever, don't care if worker is successful or not -# TODO likely need to tweak success policy here? while true do flux start -o --config ${viewroot}/etc/flux/config ${brokerOptions} @@ -609,7 +576,6 @@ echo "%s" a.Mount, a.fluxUser, a.fluxUid, - a.libraryPath, leadBroker, interactive, a.connectTimeout, From e1748d08464e3abbafa37a842e5a3fc56d2dace1 Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 27 Sep 2023 14:07:04 -0600 Subject: [PATCH 8/9] wget in quiet mode Signed-off-by: vsoch --- pkg/addons/flux.go | 2 +- pkg/addons/hpctoolkit.go | 2 +- pkg/metrics/perf/sysstat.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/addons/flux.go b/pkg/addons/flux.go index ea044ac..f4279da 100644 --- a/pkg/addons/flux.go +++ b/pkg/addons/flux.go @@ -439,7 +439,7 @@ echo "%s" systemctl enable munge || service munge start || echo "Issue starting munge, might already be started." # Ensure the flux volume addition is complete. 
-wget https://github.com/converged-computing/goshare/releases/download/2023-09-06/wait-fs +wget -q https://github.com/converged-computing/goshare/releases/download/2023-09-06/wait-fs chmod +x ./wait-fs mv ./wait-fs /usr/bin/goshare-wait-fs diff --git a/pkg/addons/hpctoolkit.go b/pkg/addons/hpctoolkit.go index 45452bc..0027bf8 100644 --- a/pkg/addons/hpctoolkit.go +++ b/pkg/addons/hpctoolkit.go @@ -161,7 +161,7 @@ func (a *HPCToolkit) customizeEntrypoint( preBlock := ` echo "%s" # Ensure hpcrun and software exists. This is rough, but should be OK with enough wait time -wget https://github.com/converged-computing/goshare/releases/download/2023-09-06/wait-fs +wget -q https://github.com/converged-computing/goshare/releases/download/2023-09-06/wait-fs chmod +x ./wait-fs mv ./wait-fs /usr/bin/goshare-wait-fs diff --git a/pkg/metrics/perf/sysstat.go b/pkg/metrics/perf/sysstat.go index c6d821e..8e77b0f 100644 --- a/pkg/metrics/perf/sysstat.go +++ b/pkg/metrics/perf/sysstat.go @@ -181,7 +181,7 @@ func (m PidStat) PrepareContainers( echo "%s" # Download the wait binary -wget https://github.com/converged-computing/goshare/releases/download/2023-07-27/wait > /dev/null +wget -q https://github.com/converged-computing/goshare/releases/download/2023-07-27/wait > /dev/null chmod +x ./wait mv ./wait /usr/bin/goshare-wait From 2f205334e06707e7f3a8fc1fb133f6c9a722187e Mon Sep 17 00:00:00 2001 From: vsoch Date: Wed, 27 Sep 2023 14:14:20 -0600 Subject: [PATCH 9/9] flamingo is missing Signed-off-by: vsoch --- docs/getting_started/addons.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting_started/addons.md b/docs/getting_started/addons.md index ef689ae..51b8db9 100644 --- a/docs/getting_started/addons.md +++ b/docs/getting_started/addons.md @@ -163,7 +163,7 @@ spec: If you need to "throw in" Flux Framework into your container to use as a scheduler, you can do that with an addon! -> Yes, it's astounding. +> Yes, it's astounding. 
🦩️ This works by way of the same trick that we use for other addons that have a complex (and/or large) install setup. We: