From fbbc14e7755935fbca6e35f449e515fe7271a793 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 5 Apr 2018 09:34:33 -0400 Subject: [PATCH 1/5] wreck: set FLUX_JOB_KVSPATH Problem: a child instance of Flux doesn't know the path to its KVS directory in the enclosing instance. Set the path in the env variable FLUX_JOB_KVSPATH for each job. For example, in the current exec implementation, this might have the value "lwj.0.0.1". --- src/modules/wreck/wrexecd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/modules/wreck/wrexecd.c b/src/modules/wreck/wrexecd.c index 9d56834139d5..f6d2c9a3e207 100644 --- a/src/modules/wreck/wrexecd.c +++ b/src/modules/wreck/wrexecd.c @@ -1929,6 +1929,7 @@ int exec_commands (struct prog_ctx *ctx) prog_ctx_setenvf (ctx, "FLUX_JOB_NNODES",1, "%d", ctx->nnodes); prog_ctx_setenvf (ctx, "FLUX_NODE_ID", 1, "%d", ctx->rankinfo.nodeid); prog_ctx_setenvf (ctx, "FLUX_JOB_SIZE", 1, "%d", ctx->total_ntasks); + prog_ctx_setenvf (ctx, "FLUX_JOB_KVSPATH", 1, "%s", ctx->kvspath); gtid_list_create (ctx, buf, sizeof (buf)); prog_ctx_setenvf (ctx, "FLUX_LOCAL_RANKS", 1, "%s", buf); From f0dcd90ca7a0a5a15833568c61c8c18c2166d6ba Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 5 Apr 2018 10:18:43 -0400 Subject: [PATCH 2/5] rc1: inform enclosing instance of useful URIs Problem: it is inconvenient to determine the URI to use to connect to a sub-instance. Write URIs to the job's KVS directory in the enclosing instance: lwj.X.X.X.flux.local_uri=local://... lwj.X.X.X.flux.remote_uri=ssh://... 
Fixes #1422 --- configure.ac | 6 ++++++ etc/Makefile.am | 3 +++ etc/rc1.d/01-enclosing-instance | 18 ++++++++++++++++++ 3 files changed, 27 insertions(+) create mode 100755 etc/rc1.d/01-enclosing-instance diff --git a/configure.ac b/configure.ac index d7f6e43e324f..27514efc7d3c 100644 --- a/configure.ac +++ b/configure.ac @@ -235,6 +235,12 @@ AC_PKGCONFIG AS_VAR_SET(fluxrcdir, $sysconfdir/flux) AC_SUBST(fluxrcdir) +AS_VAR_SET(fluxrc1dir, $sysconfdir/flux/rc1.d) +AC_SUBST(fluxrc1dir) + +AS_VAR_SET(fluxrc3dir, $sysconfdir/flux/rc3.d) +AC_SUBST(fluxrc3dir) + AS_VAR_SET(fluxcfdir, $sysconfdir/flux/conf.d) AC_SUBST(fluxcfdir) diff --git a/etc/Makefile.am b/etc/Makefile.am index e41a1132517e..bb8b0970adc3 100644 --- a/etc/Makefile.am +++ b/etc/Makefile.am @@ -11,6 +11,9 @@ dist_fluxrc_SCRIPTS = \ rc1 \ rc3 +dist_fluxrc1_SCRIPTS = \ + rc1.d/01-enclosing-instance + flux/curve: $(AM_V_GEN)$(top_builddir)/src/cmd/flux keygen --force diff --git a/etc/rc1.d/01-enclosing-instance b/etc/rc1.d/01-enclosing-instance new file mode 100755 index 000000000000..6f1db83ed2b0 --- /dev/null +++ b/etc/rc1.d/01-enclosing-instance @@ -0,0 +1,18 @@ +# Inform the enclosing instance (if any) of the URI's for this instance + +update_parent() { + local parent_uri=$(flux getattr parent-uri) + local key_prefix=${FLUX_JOB_KVSPATH}.flux + local local_uri=${FLUX_URI} + local remote_uri="ssh://$(hostname)/$(echo $local_uri|sed 's,^.*://,,')" + + FLUX_URI=${parent_uri} \ + flux kvs put --json ${key_prefix}.local_uri=${local_uri} + FLUX_URI=${parent_uri} \ + flux kvs put --json ${key_prefix}.remote_uri=${remote_uri} +} + +# Only run this on rank 0 +if test -n "${FLUX_JOB_KVSPATH}" -a $(flux getattr rank) -eq 0; then + update_parent +fi From d04d22c53bc2a862a8a6de956f4e229582d9e822 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 5 Apr 2018 10:15:14 -0700 Subject: [PATCH 3/5] cmd/flux-wreck: add "uri" subcommand Add a subcommand that lists the lwj.X.Y.Z.flux.remote_uri value, if available. 
Usage is similar to flux wreck ls, e.g. flux wreck uri [-n, --max=count] [-b, --bare] [JOBIDS...] If called with --bare, only the URI for exactly one job is listed, by itself for easy parsing. It is an error if there is not exactly one job specified, or if the job is not a Flux instance. If called without --bare, each job is listed with minimal state information. The FLUX_URI field is left blank for jobs that are not Flux instances. $ flux wreck uri ID NTASKS STATE FLUX_URI COMMAND 1 1 exited hostname 2 1 exited hostname 3 1 exited ssh://jimbo//tmp/flux-qCbPz5 flux --- src/cmd/flux-wreck | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/cmd/flux-wreck b/src/cmd/flux-wreck index 25fc790ac885..ace70dea5186 100755 --- a/src/cmd/flux-wreck +++ b/src/cmd/flux-wreck @@ -347,6 +347,47 @@ prog:SubCommand { end } +prog:SubCommand { + name = "uri", + usage = "[OPTIONS] [JOBIDs]", + options = { + { name = "max", char = 'n', arg="COUNT", + usage = "Display at most COUNT jobs", + }, + { name = "bare", char = 'b', + usage = "Dispaly only the URI", + } + }, + description = "List FLUX_URI for jobs that are Flux instances", + handler = function (self, arg) + local dirs,err = joblist_from_args (self, arg) + if not dirs then self:die (err) end + if #dirs == 0 then return end + local fmt = "%6s %6s %-9s %-40s %-.13s\n"; + if self.opt.b then + if #dirs > 1 then self:die ("--bare only works with one job") end + else + printf (fmt, "ID", "NTASKS", "STATE", "FLUX_URI", "COMMAND") + end + for _,dir in pairs (dirs) do + local id = dir:match ("(%d+)$") + if tonumber (id) then + local j, err = LWJ.open (f, id, dir) + if not j then self:die ("job%d: %s", id, err) end + local uri, err = f:kvs_get (kvs_path (id, "flux.remote_uri")) + if self.opt.b then + if err then self:die ("job%d: not a Flux instance", id) end + printf ("%s\n", uri) + else + if err then uri = "" end + printf (fmt, id, j.ntasks, j:state_string(), uri, + j.command:match 
("([^/]+)$")) + end + end + end + end +} + prog:SubCommand { name = "timing", usage = "[OPTIONS] [JOBIDs]...", From bd81b1e299bb241df708fa5a1088e2382279592c Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 5 Apr 2018 10:46:39 -0700 Subject: [PATCH 4/5] t2003-recurse.t: verify URI in enclosing instance Add a few tests to ensure that Flux running Flux results in KVS content that can be found by "flux wreck uri". N.B. this sharness script needed to drop the "wreck" personality and use the default full personality in order to execute the rc1.d script that updates the enclosing KVS. --- t/t2003-recurse.t | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/t/t2003-recurse.t b/t/t2003-recurse.t index 646068942c6d..7675d758bb72 100755 --- a/t/t2003-recurse.t +++ b/t/t2003-recurse.t @@ -6,7 +6,7 @@ test_description='Test that Flux can launch Flux' . `dirname $0`/sharness.sh mock_bootstrap_instance -test_under_flux 4 wreck +test_under_flux 4 test_expect_success 'recurse: Flux launches Flux ' ' printenv FLUX_URI >old_uri && @@ -55,4 +55,30 @@ test_expect_success 'recurse: Flux launches Flux launches Flux' ' test_cmp hello_expected hello_out ' +test_expect_success 'recurse: FLUX_JOB_KVSPATH is set in child job' ' + flux wreckrun -n1 -N1 flux start \ + printenv FLUX_JOB_KVSPATH >kvspath && + test -s kvspath +' + +test_expect_success 'recurse: flux.local_uri is set in enclosing KVS' ' + flux wreckrun -n1 -N1 flux start flux getattr local-uri >curi && + key=$(flux wreck kvs-path $(flux wreck last-jobid)).flux.local_uri && + flux kvs get --json $key >curi.out && + test_cmp curi curi.out +' + +test_expect_success 'recurse: flux.remote_uri is set in enclosing KVS' ' + flux wreckrun -n1 -N1 flux start flux getattr local-uri >curi2 && + key=$(flux wreck kvs-path $(flux wreck last-jobid)).flux.remote_uri && + flux kvs get --json $key >ruri.out && + grep -q "$(sed -e ,local://,, child_uri && + flux wreck uri --bare $(flux wreck last-jobid) 
>list_uri && + grep -q "$(sed -e ,local://,, Date: Thu, 5 Apr 2018 10:52:04 -0700 Subject: [PATCH 5/5] doc/flux-wreck(1): add uri subcommand --- doc/man1/flux-wreck.adoc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/doc/man1/flux-wreck.adoc b/doc/man1/flux-wreck.adoc index 1eb9096c4cb2..8ff000308336 100644 --- a/doc/man1/flux-wreck.adoc +++ b/doc/man1/flux-wreck.adoc @@ -33,6 +33,15 @@ states. If '-n, --max' option is provided, then display at most 'COUNT' jobs (default: 25). If an optional list of 'JOBIDS' is provided on the command line, then display only those jobs. +*uri* [-n, --max=COUNT] [-b, --bare] [JOBIDS...]:: +Display a list of wreck jobs currently in kvs, with abbreviated job info, +and a URI that can be used to contact the rank 0 broker if the job is a +Flux instance. The field is blank for other types of jobs. +If '-n, --max' option is provided, then display at most 'COUNT' +jobs (default: 25). If '-b, --bare' option is provided, display only the +URI for a single job so that it can be parsed by scripts. If an optional +list of 'JOBIDS' is provided on the command line, then display only those jobs. + *attach* [--status] [--label-io] 'jobid':: Attach to output of a running or completed job. If input was not previously connected, also attach to stdin. With '--status', also report job status