Skip to content

Commit

Permalink
wreck: set job state to failed if first rank fails
Browse files Browse the repository at this point in the history
In some cases a call to `wlog_fatal` would cause jobs to get
stuck in the `reserved` or `starting` state, especially if the
rank 0 wrexecd exited with a failure.

Try harder in `wlog_fatal` to update the job state to "failed"
before the rank 0 wrexecd exits on error.
  • Loading branch information
grondo committed Mar 30, 2018
1 parent da0dd00 commit 1042efe
Showing 1 changed file with 9 additions and 0 deletions.
9 changes: 9 additions & 0 deletions src/modules/wreck/wrexecd.c
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,8 @@ static void wlog_msg (struct prog_ctx *ctx, const char *fmt, ...)
static void wlog_debug (struct prog_ctx *ctx, const char *fmt, ...)
__attribute__ ((format (printf, 2, 3)));

/* Forward declarations: wlog_fatal() calls both of these on its error
 * path to mark the job "failed". Presumably they are defined later in
 * this file -- confirm against the full source.
 */
void send_job_state_event (struct prog_ctx *ctx, const char *state);
int update_job_state (struct prog_ctx *ctx, const char *state);

void *lsd_nomem_error (const char *file, int line, char *msg)
{
Expand Down Expand Up @@ -273,6 +275,13 @@ static void wlog_fatal (struct prog_ctx *ctx, int code, const char *format, ...)
if (archive_lwj (ctx) < 0)
flux_log_error (ctx->flux, "wlog_fatal: archive_lwj");
}
/* Attempt to update job state to failed so commands do not hang */
if (ctx->rankinfo.nodeid == 0 && ctx->flux) {
update_job_state (ctx, "failed");
send_job_state_event (ctx, "failed");
flux_kvs_commit_anon (ctx->flux, 0);
}

if (code > 0)
exit (code);
}
Expand Down

0 comments on commit 1042efe

Please sign in to comment.