From b858344b3e21b6c12c36fa285d983b4fc27c5b65 Mon Sep 17 00:00:00 2001 From: Joshua Hursey Date: Thu, 1 Dec 2016 13:23:52 -0500 Subject: [PATCH] orterun: Add parameter to control when we give up on stack traces * MCA option to control how long we wait for stack traces: - orte_timeout_for_stack_trace INTEGER Default: 30 Setting to <= 0 will cause it to wait forever * Useful when gathering stack traces from large jobs which might take a long time. Signed-off-by: Joshua Hursey --- orte/runtime/orte_globals.c | 3 +++ orte/runtime/orte_globals.h | 4 ++++ orte/runtime/orte_mca_params.c | 10 ++++++++++ orte/tools/orterun/orterun.c | 22 ++++++++++++++-------- 4 files changed, 31 insertions(+), 8 deletions(-) diff --git a/orte/runtime/orte_globals.c b/orte/runtime/orte_globals.c index b571b127dde..2d374c8e5ad 100644 --- a/orte/runtime/orte_globals.c +++ b/orte/runtime/orte_globals.c @@ -16,6 +16,7 @@ * Copyright (c) 2013-2015 Intel, Inc. All rights reserved * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -129,6 +130,8 @@ orte_timer_t *orte_mpiexec_timeout = NULL; opal_buffer_t *orte_tree_launch_cmd = NULL; +int orte_stack_trace_wait_timeout = 30; + /* global arrays for data storage */ opal_pointer_array_t *orte_job_data = NULL; opal_pointer_array_t *orte_node_pool = NULL; diff --git a/orte/runtime/orte_globals.h b/orte/runtime/orte_globals.h index bfa4cd63afb..e04f63ea045 100644 --- a/orte/runtime/orte_globals.h +++ b/orte/runtime/orte_globals.h @@ -14,6 +14,7 @@ * Copyright (c) 2011-2013 Los Alamos National Security, LLC. * All rights reserved. * Copyright (c) 2013-2015 Intel, Inc. All rights reserved + * Copyright (c) 2017 IBM Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -587,6 +588,9 @@ ORTE_DECLSPEC extern char *orte_daemon_cores; /* cutoff for collective modex */ ORTE_DECLSPEC extern uint32_t orte_direct_modex_cutoff; +/* Max time to wait for stack traces to return */ +ORTE_DECLSPEC extern int orte_stack_trace_wait_timeout; + END_C_DECLS #endif /* ORTE_RUNTIME_ORTE_GLOBALS_H */ diff --git a/orte/runtime/orte_mca_params.c b/orte/runtime/orte_mca_params.c index 69c0c7ee02d..9cc3979b4f0 100644 --- a/orte/runtime/orte_mca_params.c +++ b/orte/runtime/orte_mca_params.c @@ -16,6 +16,7 @@ * Copyright (c) 2013-2015 Intel, Inc. All rights reserved * Copyright (c) 2014 Research Organization for Information Science * and Technology (RIST). All rights reserved. + * Copyright (c) 2017 IBM Corporation. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -743,5 +744,14 @@ int orte_register_params(void) /* register a synonym for old name */ mca_base_var_register_synonym (id, "ompi", "ompi", "hostname", "cutoff", MCA_BASE_VAR_SYN_FLAG_DEPRECATED); + + /* Amount of time to wait for a stack trace to return from the daemons */ + orte_stack_trace_wait_timeout = 30; + (void) mca_base_var_register ("orte", "orte", NULL, "timeout_for_stack_trace", + "Seconds to wait for stack traces to return before terminating the job (<= 0 wait forever)", + MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, + OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY, + &orte_stack_trace_wait_timeout); + return ORTE_SUCCESS; } diff --git a/orte/tools/orterun/orterun.c b/orte/tools/orterun/orterun.c index 346080ed809..1db7538bdcc 100644 --- a/orte/tools/orterun/orterun.c +++ b/orte/tools/orterun/orterun.c @@ -2860,8 +2860,10 @@ static void stack_trace_recv(int status, orte_process_name_t* sender, } ++ntraces; if (orte_process_info.num_procs == ntraces) { - /* cancel the timeout */ - OBJ_DESTRUCT(&stack_trace_timer); + if( orte_stack_trace_wait_timeout > 0 ) { + /* cancel the timeout */ + OBJ_DESTRUCT(&stack_trace_timer); 
+ } /* abort the job */ ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE); /* set the global abnormal exit flag */ @@ -2871,6 +2873,8 @@ static void stack_trace_recv(int status, orte_process_name_t* sender, static void stack_trace_timeout(int sd, short args, void *cbdata) { + fprintf(stderr, "Timed out waiting for stack traces. Job will now terminate. orte_stack_trace_wait_timeout = %d\n", orte_stack_trace_wait_timeout); + /* abort the job */ ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_ALL_JOBS_COMPLETE); /* set the global abnormal exit flag */ @@ -2963,12 +2967,14 @@ void orte_timeout_wakeup(int sd, short args, void *cbdata) OBJ_RELEASE(sig); /* we will terminate after we get the stack_traces, but set a timeout * just in case we never hear back from everyone */ - OBJ_CONSTRUCT(&stack_trace_timer, orte_timer_t); - opal_event_evtimer_set(orte_event_base, - stack_trace_timer.ev, stack_trace_timeout, NULL); - opal_event_set_priority(stack_trace_timer.ev, ORTE_ERROR_PRI); - stack_trace_timer.tv.tv_sec = 30; - opal_event_evtimer_add(stack_trace_timer.ev, &stack_trace_timer.tv); + if( orte_stack_trace_wait_timeout > 0 ) { + OBJ_CONSTRUCT(&stack_trace_timer, orte_timer_t); + opal_event_evtimer_set(orte_event_base, + stack_trace_timer.ev, stack_trace_timeout, NULL); + opal_event_set_priority(stack_trace_timer.ev, ORTE_ERROR_PRI); + stack_trace_timer.tv.tv_sec = orte_stack_trace_wait_timeout; + opal_event_evtimer_add(stack_trace_timer.ev, &stack_trace_timer.tv); + } return; } giveup: