From ebe159acef07f62f6138f9fd12ddffd74e40c0c4 Mon Sep 17 00:00:00 2001 From: Ralph Castain Date: Fri, 27 May 2016 09:45:34 -0700 Subject: [PATCH] Add a timeout cmd line option and an option to report state info upon timeout to assist with debugging Jenkins tests If requested, obtain stacktraces for each application process and report it to stderr upon timeout stack traces: minor improvements - Also include the hostname and PID of the each process for which we're sending the stack traces (vs. just including the ORTE process name) - Send a specific error message if we couldn't find "gstack" in the $PATH (e.g., on OS X) - Send a sepcific error message if gstack fails to run - Print a message that obtaining the stack traces may take a few seconds so that users don't wonder what's happening Signed-off-by: Jeff Squyres help-orterun.txt: minor tweaks Trivial update: show "--timeout" (instead of "-timeout") in the help message, just to encourage the use of double-dash options. Signed-off-by: Jeff Squyres trivial: stacktrace -> stack trace Trivial word smything. Signed-off-by: Jeff Squyres --- orte/mca/odls/odls_types.h | 5 +- orte/mca/rml/rml_types.h | 3 + orte/mca/schizo/ompi/schizo_ompi.c | 14 ++- orte/orted/orted_comm.c | 85 +++++++++++++- orte/orted/orted_submit.c | 164 ++++++++++++++++++++++++++-- orte/orted/orted_submit.h | 4 + orte/tools/orterun/help-orterun.txt | 18 ++- 7 files changed, 278 insertions(+), 15 deletions(-) diff --git a/orte/mca/odls/odls_types.h b/orte/mca/odls/odls_types.h index bd3115f254..51c9afc363 100644 --- a/orte/mca/odls/odls_types.h +++ b/orte/mca/odls/odls_types.h @@ -9,7 +9,7 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2010-2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2011 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011-2012 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2014-2016 Intel, Inc. All rights reserved. @@ -80,6 +80,9 @@ typedef uint8_t orte_daemon_cmd_flag_t; /* add procs for the DVM */ #define ORTE_DAEMON_DVM_ADD_PROCS (orte_daemon_cmd_flag_t) 30 +/* for debug purposes, get stack traces from all application procs */ +#define ORTE_DAEMON_GET_STACK_TRACES (orte_daemon_cmd_flag_t) 31 + /* * Struct written up the pipe from the child to the parent. */ diff --git a/orte/mca/rml/rml_types.h b/orte/mca/rml/rml_types.h index f0271b60e8..a8714ad24a 100644 --- a/orte/mca/rml/rml_types.h +++ b/orte/mca/rml/rml_types.h @@ -163,6 +163,9 @@ BEGIN_C_DECLS /* error notifications */ #define ORTE_RML_TAG_NOTIFICATION 59 +/* stacktrace for debug */ +#define ORTE_RML_TAG_STACK_TRACE 60 + #define ORTE_RML_TAG_MAX 100 #define ORTE_RML_TAG_NTOH(t) ntohl(t) diff --git a/orte/mca/schizo/ompi/schizo_ompi.c b/orte/mca/schizo/ompi/schizo_ompi.c index 296cdfaa3b..f197d0a33d 100644 --- a/orte/mca/schizo/ompi/schizo_ompi.c +++ b/orte/mca/schizo/ompi/schizo_ompi.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006-2013 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. * Copyright (c) 2013-2016 Intel, Inc. All rights reserved. * Copyright (c) 2015 Research Organization for Information Science @@ -92,6 +92,18 @@ static opal_cmd_line_init_t cmd_line_init[] = { &orte_cmd_options.report_uri, OPAL_CMD_LINE_TYPE_STRING, "Printout URI on stdout [-], stderr [+], or a file [anything else]" }, + /* testing options */ + { NULL, '\0', "timeout", "timeout", 1, + &orte_cmd_options.timeout, OPAL_CMD_LINE_TYPE_INT, + "Timeout the job after the specified number of seconds" }, + { NULL, '\0', "report-state-on-timeout", "report-state-on-timeout", 0, + &orte_cmd_options.report_state_on_timeout, OPAL_CMD_LINE_TYPE_BOOL, + "Report all job and process states upon timeout" }, + { NULL, '\0', "get-stack-traces", "get-stack-traces", 0, + &orte_cmd_options.get_stack_traces, OPAL_CMD_LINE_TYPE_BOOL, + "Get stack traces of all application procs on timeout" }, + + /* exit status reporting */ { "orte_report_child_jobs_separately", '\0', "report-child-jobs-separately", "report-child-jobs-separately", 0, NULL, OPAL_CMD_LINE_TYPE_BOOL, diff --git a/orte/orted/orted_comm.c b/orte/orted/orted_comm.c index adf62431f8..64146d5cde 100644 --- a/orte/orted/orted_comm.c +++ b/orte/orted/orted_comm.c @@ -9,7 +9,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2009 Sun Microsystems, Inc. All rights reserved. @@ -47,6 +47,7 @@ #include "opal/mca/base/base.h" #include "opal/util/output.h" #include "opal/util/opal_environ.h" +#include "opal/util/path.h" #include "opal/runtime/opal.h" #include "opal/runtime/opal_progress.h" #include "opal/dss/dss.h" @@ -111,6 +112,9 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, bool found = false; orte_node_t *node; orte_grpcomm_signature_t *sig; + FILE *fp; + char gscmd[256], path[1035], *pathptr; + char string[256], *string_ptr = string; /* unpack the command */ n = 1; @@ -1071,6 +1075,82 @@ void orte_daemon_recv(int status, orte_process_name_t* sender, } break; + case ORTE_DAEMON_GET_STACK_TRACES: + /* prep the response */ + answer = OBJ_NEW(opal_buffer_t); + pathptr = path; + + // Try to find the "gstack" executable. Failure to find the + // executable will be handled below, because the receiver + // expects to have the process name, hostname, and PID in the + // buffer before finding an error message. + char *gstack_exec; + gstack_exec = opal_find_absolute_path("gstack"); + + /* hit each local process with a gstack command */ + for (i=0; i < orte_local_children->size; i++) { + if (NULL != (proct = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && + ORTE_FLAG_TEST(proct, ORTE_PROC_FLAG_ALIVE)) { + relay_msg = OBJ_NEW(opal_buffer_t); + if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->name, 1, ORTE_NAME) || + OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->node->name, 1, OPAL_STRING) || + OPAL_SUCCESS != opal_dss.pack(relay_msg, &proct->pid, 1, OPAL_PID)) { + OBJ_RELEASE(relay_msg); + break; + } + + // If we were able to find the gstack executable, + // above, then run the command here. + fp = NULL; + if (NULL != gstack_exec) { + (void) snprintf(gscmd, sizeof(gscmd), "%s %lu", + gstack_exec, (unsigned long) proct->pid); + fp = popen(gscmd, "r"); + } + + // If either we weren't able to find or run the gstack + // exectuable, send back a nice error message here. + if (NULL == gstack_exec || NULL == fp) { + (void) snprintf(string, sizeof(string), + "Failed to %s \"%s\" on %s to obtain stack traces", + (NULL == gstack_exec) ? "find" : "run", + (NULL == gstack_exec) ? "gstack" : gstack_exec, + proct->node->name); + if (OPAL_SUCCESS == + opal_dss.pack(relay_msg, &string_ptr, 1, OPAL_STRING)) { + opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER); + } + OBJ_RELEASE(relay_msg); + break; + } + /* Read the output a line at a time and pack it for transmission */ + memset(path, 0, sizeof(path)); + while (fgets(path, sizeof(path)-1, fp) != NULL) { + if (OPAL_SUCCESS != opal_dss.pack(relay_msg, &pathptr, 1, OPAL_STRING)) { + OBJ_RELEASE(relay_msg); + break; + } + memset(path, 0, sizeof(path)); + } + /* close */ + pclose(fp); + /* transfer this load */ + if (OPAL_SUCCESS != opal_dss.pack(answer, &relay_msg, 1, OPAL_BUFFER)) { + OBJ_RELEASE(relay_msg); + break; + } + OBJ_RELEASE(relay_msg); + } + } + /* always send our response */ + if (0 > (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, answer, + ORTE_RML_TAG_STACK_TRACE, + orte_rml_send_callback, NULL))) { + ORTE_ERROR_LOG(ret); + OBJ_RELEASE(answer); + } + break; + default: ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); } @@ -1139,6 +1219,9 @@ static char *get_orted_comm_cmd_str(int command) case ORTE_DAEMON_DVM_ADD_PROCS: return strdup("ORTE_DAEMON_DVM_ADD_PROCS"); + case ORTE_DAEMON_GET_STACK_TRACES: + return strdup("ORTE_DAEMON_GET_STACK_TRACES"); + default: return strdup("Unknown Command!"); } diff --git a/orte/orted/orted_submit.c b/orte/orted/orted_submit.c index d004585cef..e55994fc59 100644 --- a/orte/orted/orted_submit.c +++ b/orte/orted/orted_submit.c @@ -10,7 +10,7 @@ * University of Stuttgart. All rights reserved. * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. - * Copyright (c) 2006-2014 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2006-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2007-2009 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights * reserved. @@ -88,6 +88,7 @@ #include "orte/mca/rmaps/base/base.h" #include "orte/mca/errmgr/errmgr.h" +#include "orte/mca/grpcomm/grpcomm.h" #include "orte/mca/plm/base/plm_private.h" #include "orte/mca/rml/rml.h" #include "orte/mca/rml/base/rml_contact.h" @@ -114,6 +115,8 @@ static char **global_mca_env = NULL; static orte_std_cntr_t total_num_apps = 0; static bool want_prefix_by_default = (bool) ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT; static opal_pointer_array_t tool_jobs; +static int timeout_seconds; + int orte_debugger_attach_fd = -1; bool orte_debugger_fifo_active=false; opal_event_t *orte_debugger_attach=NULL; @@ -853,13 +856,26 @@ int orte_submit_job(char *argv[], int *index, /* check for a job timeout specification, to be provided in seconds * as that is what MPICH used */ - if (NULL != (param = getenv("MPIEXEC_TIMEOUT"))) { + param = NULL; + if (0 < orte_cmd_options.timeout || + NULL != (param = getenv("MPIEXEC_TIMEOUT"))) { + if (NULL != param) { + timeout_seconds = strtol(param, NULL, 10); + /* both cannot be present, or they must agree */ + if (0 < orte_cmd_options.timeout && timeout_seconds != orte_cmd_options.timeout) { + orte_show_help("help-orterun.txt", "orterun:timeoutconflict", false, + orte_basename, orte_cmd_options.timeout, param); + exit(1); + } + } else { + timeout_seconds = orte_cmd_options.timeout; + } if (NULL == (orte_mpiexec_timeout = OBJ_NEW(orte_timer_t))) { ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE); ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_OUT_OF_RESOURCE); //goto DONE; } - orte_mpiexec_timeout->tv.tv_sec = strtol(param, NULL, 10); + orte_mpiexec_timeout->tv.tv_sec = timeout_seconds; orte_mpiexec_timeout->tv.tv_usec = 0; opal_event_evtimer_set(orte_event_base, orte_mpiexec_timeout->ev, orte_timeout_wakeup, jdata); @@ -887,7 +903,7 @@ int orte_submit_job(char *argv[], int *index, if (NULL == fp) { orte_show_help("help-orterun.txt", "orterun:write_file", false, orte_basename, "uri", orte_cmd_options.report_uri); - exit(0); + exit(1); } fprintf(fp, "%s\n", (NULL == rml_uri) ? "NULL" : rml_uri); fclose(fp); @@ -2824,16 +2840,77 @@ void orte_debugger_detached(int fd, short event, void *cbdata) mpir_breakpoint_fired = false; } +static uint32_t ntraces = 0; +static orte_timer_t stack_trace_timer; + +static void stack_trace_recv(int status, orte_process_name_t* sender, + opal_buffer_t *buffer, orte_rml_tag_t tag, + void* cbdata) +{ + opal_buffer_t *blob; + char *st; + int32_t cnt; + orte_process_name_t name; + char *hostname; + pid_t pid; + + /* unpack the stack_trace blob */ + cnt = 1; + while (OPAL_SUCCESS == opal_dss.unpack(buffer, &blob, &cnt, OPAL_BUFFER)) { + /* first piece is the name of the process */ + cnt = 1; + if (OPAL_SUCCESS != opal_dss.unpack(blob, &name, &cnt, ORTE_NAME) || + OPAL_SUCCESS != opal_dss.unpack(blob, &hostname, &cnt, OPAL_STRING) || + OPAL_SUCCESS != opal_dss.unpack(blob, &pid, &cnt, OPAL_PID)) { + OBJ_RELEASE(blob); + continue; + } + fprintf(stderr, "STACK TRACE FOR PROC %s (%s, PID %lu)\n", ORTE_NAME_PRINT(&name), hostname, (unsigned long) pid); + free(hostname); + /* unpack the stack_trace until complete */ + cnt = 1; + while (OPAL_SUCCESS == opal_dss.unpack(blob, &st, &cnt, OPAL_STRING)) { + fprintf(stderr, "\t%s", st); // has its own newline + free(st); + cnt = 1; + } + fprintf(stderr, "\n"); + OBJ_RELEASE(blob); + cnt = 1; + } + ++ntraces; + if (orte_process_info.num_procs == ntraces) { + /* cancel the timeout */ + OBJ_DESTRUCT(&stack_trace_timer); + /* abort the job */ + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE); + /* set the global abnormal exit flag */ + orte_abnormal_term_ordered = true; + } +} + +static void stack_trace_timeout(int sd, short args, void *cbdata) +{ + /* abort the job */ + ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE); + /* set the global abnormal exit flag */ + orte_abnormal_term_ordered = true; +} + void orte_timeout_wakeup(int sd, short args, void *cbdata) { - char *tm; + orte_job_t *jdata; + orte_proc_t *proc; + int i; + int rc; + uint32_t key; + void *nptr; /* this function gets called when the job execution time * has hit a prescribed limit - so just abort */ - tm = getenv("MPIEXEC_TIMEOUT"); orte_show_help("help-orterun.txt", "orterun:timeout", - true, (NULL == tm) ? "NULL" : tm); + true, timeout_seconds); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); /* if we are testing HNP suicide, then just exit */ if (ORTE_PROC_IS_HNP && @@ -2841,6 +2918,79 @@ void orte_timeout_wakeup(int sd, short args, void *cbdata) opal_output(0, "HNP exiting w/o cleanup"); exit(1); } + if (orte_cmd_options.report_state_on_timeout) { + /* cycle across all the jobs and report their state */ + rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jdata, &nptr); + while (OPAL_SUCCESS == rc) { + /* don't use the opal_output system as it may be borked */ + fprintf(stderr, "DATA FOR JOB: %s\n", ORTE_JOBID_PRINT(jdata->jobid)); + fprintf(stderr, "\tNum apps: %d\tNum procs: %d\tJobState: %s\tAbort: %s\n", + (int)jdata->num_apps, (int)jdata->num_procs, + orte_job_state_to_str(jdata->state), + (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) ? "True" : "False"); + fprintf(stderr, "\tNum launched: %ld\tNum reported: %ld\tNum terminated: %ld\n", + (long)jdata->num_launched, (long)jdata->num_reported, (long)jdata->num_terminated); + fprintf(stderr, "\n\tProcs:\n"); + for (i=0; i < jdata->procs->size; i++) { + if (NULL != (proc = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, i))) { + fprintf(stderr, "\t\tRank: %s\tNode: %s\tPID: %u\tState: %s\tExitCode %d\n", + ORTE_VPID_PRINT(proc->name.vpid), + (NULL == proc->node) ? "UNKNOWN" : proc->node->name, + (unsigned int)proc->pid, + orte_proc_state_to_str(proc->state), proc->exit_code); + } + } + fprintf(stderr, "\n"); + rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jdata, nptr, &nptr); + } + } + /* if they asked for stack_traces, attempt to get them, but timeout + * if we cannot do so */ + if (orte_cmd_options.get_stack_traces) { + orte_daemon_cmd_flag_t command = ORTE_DAEMON_GET_STACK_TRACES; + opal_buffer_t *buffer; + orte_grpcomm_signature_t *sig; + + fprintf(stderr, "Waiting for stack traces (this may take a few moments)...\n"); + + /* set the recv */ + orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_STACK_TRACE, + ORTE_RML_PERSISTENT, stack_trace_recv, NULL); + + /* setup the buffer */ + buffer = OBJ_NEW(opal_buffer_t); + /* pack the command */ + if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buffer); + goto giveup; + } + /* goes to all daemons */ + sig = OBJ_NEW(orte_grpcomm_signature_t); + sig->signature = (orte_process_name_t*)malloc(sizeof(orte_process_name_t)); + sig->signature[0].jobid = ORTE_PROC_MY_NAME->jobid; + sig->signature[0].vpid = ORTE_VPID_WILDCARD; + sig->sz = 1; + if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(sig, ORTE_RML_TAG_DAEMON, buffer))) { + ORTE_ERROR_LOG(rc); + OBJ_RELEASE(buffer); + OBJ_RELEASE(sig); + goto giveup; + } + OBJ_RELEASE(buffer); + /* maintain accounting */ + OBJ_RELEASE(sig); + /* we will terminate after we get the stack_traces, but set a timeout + * just in case we never hear back from everyone */ + OBJ_CONSTRUCT(&stack_trace_timer, orte_timer_t); + opal_event_evtimer_set(orte_event_base, + stack_trace_timer.ev, stack_trace_timeout, NULL); + opal_event_set_priority(stack_trace_timer.ev, ORTE_ERROR_PRI); + stack_trace_timer.tv.tv_sec = 30; + opal_event_evtimer_add(stack_trace_timer.ev, &stack_trace_timer.tv); + return; + } + giveup: /* abort the job */ ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE); /* set the global abnormal exit flag */ diff --git a/orte/orted/orted_submit.h b/orte/orted/orted_submit.h index f26b14aa9b..99d61ffcb8 100644 --- a/orte/orted/orted_submit.h +++ b/orte/orted/orted_submit.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2016 Intel, Inc. All rights reserved. + * Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -92,6 +93,9 @@ struct orte_cmd_options_t { bool enable_recovery; char *hnp; bool staged_exec; + int timeout; + bool report_state_on_timeout; + bool get_stack_traces; }; typedef struct orte_cmd_options_t orte_cmd_options_t; ORTE_DECLSPEC extern orte_cmd_options_t orte_cmd_options; diff --git a/orte/tools/orterun/help-orterun.txt b/orte/tools/orterun/help-orterun.txt index d862ac5656..272bccdb03 100644 --- a/orte/tools/orterun/help-orterun.txt +++ b/orte/tools/orterun/help-orterun.txt @@ -10,7 +10,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. -# Copyright (c) 2007-2010 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2007-2016 Cisco Systems, Inc. All rights reserved. # Copyright (c) 2012 Oak Ridge National Labs. All rights reserved. # $COPYRIGHT$ # @@ -645,11 +645,11 @@ Please correct this value and try again. The user-provided time limit for job execution has been reached: - MPIEXEC_TIMEOUT: %s seconds + MPIEXEC_TIMEOUT: %d seconds -The job will now be aborted. Please check your code and/or -adjust/remove the job execution time limit (as specified -by MPIEXEC_TIMEOUT in your environment). +The job will now be aborted. Please check your code and/or +adjust/remove the job execution time limit (as specified by +MPIEXEC_TIMEOUT in your environment or --timeout on the command line). # [orterun:conflict-env-set] ERROR: You have attempted to pass environment variables to Open MPI @@ -669,3 +669,11 @@ system's restriction for Unix's socket's path-length. Please try to set TMPDIR to something short (like /tmp) or change Your computer's name (see uname -n). # +[orterun:timeoutconflict] +Conflicting requests for timeout were given: + + --timeout command line option: %d + MPIEXEC_TIMEOUT envar: %s + +Only one method should be provided, or else they must agree. Please +correct and retry.