Skip to content

Commit

Permalink
Merge pull request #2837 from jjhursey/topic/ibm/v2.x/orted-timeout-i…
Browse files Browse the repository at this point in the history
…mprov

orterun: Add parameter to control when we give up on stack traces
  • Loading branch information
hppritcha authored Feb 6, 2017
2 parents 74440de + b858344 commit b7e45ef
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 8 deletions.
3 changes: 3 additions & 0 deletions orte/runtime/orte_globals.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -129,6 +130,8 @@ orte_timer_t *orte_mpiexec_timeout = NULL;

opal_buffer_t *orte_tree_launch_cmd = NULL;

/* Seconds to wait for stack traces to return before terminating the job.
 * A value <= 0 means wait forever (no timeout timer is armed).
 * Defaults to 30; also registered as an MCA parameter in orte_register_params. */
int orte_stack_trace_wait_timeout = 30;

/* global arrays for data storage */
opal_pointer_array_t *orte_job_data = NULL;
opal_pointer_array_t *orte_node_pool = NULL;
Expand Down
4 changes: 4 additions & 0 deletions orte/runtime/orte_globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -587,6 +588,9 @@ ORTE_DECLSPEC extern char *orte_daemon_cores;
/* cutoff for collective modex */
ORTE_DECLSPEC extern uint32_t orte_direct_modex_cutoff;

/* Max time (in seconds) to wait for stack traces to return */
ORTE_DECLSPEC extern int orte_stack_trace_wait_timeout;

END_C_DECLS

#endif /* ORTE_RUNTIME_ORTE_GLOBALS_H */
10 changes: 10 additions & 0 deletions orte/runtime/orte_mca_params.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -743,5 +744,14 @@ int orte_register_params(void)
/* register a synonym for old name */
mca_base_var_register_synonym (id, "ompi", "ompi", "hostname", "cutoff", MCA_BASE_VAR_SYN_FLAG_DEPRECATED);


/* Amount of time to wait for a stack trace to return from the daemons */
orte_stack_trace_wait_timeout = 30;
(void) mca_base_var_register ("orte", "orte", NULL, "timeout_for_stack_trace",
"Seconds to wait for stack traces to return before terminating the job (<= 0 wait forever)",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&orte_stack_trace_wait_timeout);

return ORTE_SUCCESS;
}
22 changes: 14 additions & 8 deletions orte/tools/orterun/orterun.c
Original file line number Diff line number Diff line change
Expand Up @@ -2860,8 +2860,10 @@ static void stack_trace_recv(int status, orte_process_name_t* sender,
}
++ntraces;
if (orte_process_info.num_procs == ntraces) {
/* cancel the timeout */
OBJ_DESTRUCT(&stack_trace_timer);
if( orte_stack_trace_wait_timeout > 0 ) {
/* cancel the timeout */
OBJ_DESTRUCT(&stack_trace_timer);
}
/* abort the job */
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
/* set the global abnormal exit flag */
Expand All @@ -2871,6 +2873,8 @@ static void stack_trace_recv(int status, orte_process_name_t* sender,

static void stack_trace_timeout(int sd, short args, void *cbdata)
{
fprintf(stderr, "Timed out waiting for stack traces. Job will now terminate. orte_stack_trace_wait_timeout = %d\n", orte_stack_trace_wait_timeout);

/* abort the job */
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
/* set the global abnormal exit flag */
Expand Down Expand Up @@ -2963,12 +2967,14 @@ void orte_timeout_wakeup(int sd, short args, void *cbdata)
OBJ_RELEASE(sig);
/* we will terminate after we get the stack_traces, but set a timeout
* just in case we never hear back from everyone */
OBJ_CONSTRUCT(&stack_trace_timer, orte_timer_t);
opal_event_evtimer_set(orte_event_base,
stack_trace_timer.ev, stack_trace_timeout, NULL);
opal_event_set_priority(stack_trace_timer.ev, ORTE_ERROR_PRI);
stack_trace_timer.tv.tv_sec = 30;
opal_event_evtimer_add(stack_trace_timer.ev, &stack_trace_timer.tv);
if( orte_stack_trace_wait_timeout > 0 ) {
OBJ_CONSTRUCT(&stack_trace_timer, orte_timer_t);
opal_event_evtimer_set(orte_event_base,
stack_trace_timer.ev, stack_trace_timeout, NULL);
opal_event_set_priority(stack_trace_timer.ev, ORTE_ERROR_PRI);
stack_trace_timer.tv.tv_sec = orte_stack_trace_wait_timeout;
opal_event_evtimer_add(stack_trace_timer.ev, &stack_trace_timer.tv);
}
return;
}
giveup:
Expand Down

0 comments on commit b7e45ef

Please sign in to comment.