Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

orterun: Add parameter to control when we give up on stack traces #2837

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions orte/runtime/orte_globals.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2014-2015 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -129,6 +130,8 @@ orte_timer_t *orte_mpiexec_timeout = NULL;

opal_buffer_t *orte_tree_launch_cmd = NULL;

/* Seconds to wait for stack traces to return before terminating the job;
 * a value <= 0 means no timeout is armed (wait forever). Defaults to 30. */
int orte_stack_trace_wait_timeout = 30;

/* global arrays for data storage */
opal_pointer_array_t *orte_job_data = NULL;
opal_pointer_array_t *orte_node_pool = NULL;
Expand Down
4 changes: 4 additions & 0 deletions orte/runtime/orte_globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* Copyright (c) 2011-2013 Los Alamos National Security, LLC.
* All rights reserved.
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -587,6 +588,9 @@ ORTE_DECLSPEC extern char *orte_daemon_cores;
/* cutoff for collective modex */
ORTE_DECLSPEC extern uint32_t orte_direct_modex_cutoff;

/* Max time (in seconds) to wait for stack traces to return; <= 0 waits forever */
ORTE_DECLSPEC extern int orte_stack_trace_wait_timeout;

END_C_DECLS

#endif /* ORTE_RUNTIME_ORTE_GLOBALS_H */
10 changes: 10 additions & 0 deletions orte/runtime/orte_mca_params.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* Copyright (c) 2013-2015 Intel, Inc. All rights reserved
* Copyright (c) 2014 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2017 IBM Corporation. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -743,5 +744,14 @@ int orte_register_params(void)
/* register a synonym for old name */
mca_base_var_register_synonym (id, "ompi", "ompi", "hostname", "cutoff", MCA_BASE_VAR_SYN_FLAG_DEPRECATED);


/* Amount of time to wait for a stack trace to return from the daemons */
orte_stack_trace_wait_timeout = 30;
(void) mca_base_var_register ("orte", "orte", NULL, "timeout_for_stack_trace",
"Seconds to wait for stack traces to return before terminating the job (<= 0 wait forever)",
MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&orte_stack_trace_wait_timeout);

return ORTE_SUCCESS;
}
22 changes: 14 additions & 8 deletions orte/tools/orterun/orterun.c
Original file line number Diff line number Diff line change
Expand Up @@ -2860,8 +2860,10 @@ static void stack_trace_recv(int status, orte_process_name_t* sender,
}
++ntraces;
if (orte_process_info.num_procs == ntraces) {
/* cancel the timeout */
OBJ_DESTRUCT(&stack_trace_timer);
if( orte_stack_trace_wait_timeout > 0 ) {
/* cancel the timeout */
OBJ_DESTRUCT(&stack_trace_timer);
}
/* abort the job */
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
/* set the global abnormal exit flag */
Expand All @@ -2871,6 +2873,8 @@ static void stack_trace_recv(int status, orte_process_name_t* sender,

static void stack_trace_timeout(int sd, short args, void *cbdata)
{
fprintf(stderr, "Timed out waiting for stack traces. Job will now terminate. orte_stack_trace_wait_timeout = %d\n", orte_stack_trace_wait_timeout);

/* abort the job */
ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE);
/* set the global abnormal exit flag */
Expand Down Expand Up @@ -2963,12 +2967,14 @@ void orte_timeout_wakeup(int sd, short args, void *cbdata)
OBJ_RELEASE(sig);
/* we will terminate after we get the stack_traces, but set a timeout
* just in case we never hear back from everyone */
OBJ_CONSTRUCT(&stack_trace_timer, orte_timer_t);
opal_event_evtimer_set(orte_event_base,
stack_trace_timer.ev, stack_trace_timeout, NULL);
opal_event_set_priority(stack_trace_timer.ev, ORTE_ERROR_PRI);
stack_trace_timer.tv.tv_sec = 30;
opal_event_evtimer_add(stack_trace_timer.ev, &stack_trace_timer.tv);
if( orte_stack_trace_wait_timeout > 0 ) {
OBJ_CONSTRUCT(&stack_trace_timer, orte_timer_t);
opal_event_evtimer_set(orte_event_base,
stack_trace_timer.ev, stack_trace_timeout, NULL);
opal_event_set_priority(stack_trace_timer.ev, ORTE_ERROR_PRI);
stack_trace_timer.tv.tv_sec = orte_stack_trace_wait_timeout;
opal_event_evtimer_add(stack_trace_timer.ev, &stack_trace_timer.tv);
}
return;
}
giveup:
Expand Down