Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not push child processes into separate process groups so that any … #1431

Merged
merged 1 commit into from
Mar 7, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions orte/mca/ess/hnp/ess_hnp_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -786,10 +786,8 @@ static int rte_finalize(void)
/** Remove the USR signal handlers */
opal_event_signal_del(&sigusr1_handler);
opal_event_signal_del(&sigusr2_handler);
if (orte_forward_job_control) {
opal_event_signal_del(&sigtstp_handler);
opal_event_signal_del(&sigcont_handler);
}
opal_event_signal_del(&sigtstp_handler);
opal_event_signal_del(&sigcont_handler);
signals_set = false;
}

Expand Down
12 changes: 0 additions & 12 deletions orte/mca/odls/alps/odls_alps_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -416,13 +416,6 @@ static int do_child(orte_app_context_t* context,
sigset_t sigs;
char *param, *msg;

if (orte_forward_job_control) {
/* Set a new process group for this child, so that a
SIGSTOP can be sent to it without being sent to the
orted. */
setpgid(0, 0);
}

/* Setup the pipe to be close-on-exec */
opal_fd_set_cloexec(write_fd);

Expand Down Expand Up @@ -798,11 +791,6 @@ static int send_signal(pid_t pid, int signal)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
signal, (long)pid));

if (orte_forward_job_control) {
/* Send the signal to the process group rather than the
process. The child is the leader of its process group. */
pid = -pid;
}
if (kill(pid, signal) != 0) {
switch(errno) {
case EINVAL:
Expand Down
50 changes: 11 additions & 39 deletions orte/mca/odls/default/odls_default_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -193,18 +193,18 @@ static bool odls_default_child_died(orte_proc_t *child)
* that occasionally causes us to incorrectly report a proc
* as refusing to die. Unfortunately, errno may not be reset
* by waitpid in this case, so we cannot check it.
*
* (note the previous fix to this, to return 'process dead'
* here, fixes the race condition at the cost of reporting
* all live processes have immediately died! Better to
* occasionally report a dead process as still living -
* which will occasionally trip the timeout for cases that
* are right on the edge.)
*
* (note the previous fix to this, to return 'process dead'
* here, fixes the race condition at the cost of reporting
* all live processes have immediately died! Better to
* occasionally report a dead process as still living -
* which will occasionally trip the timeout for cases that
* are right on the edge.)
*/
OPAL_OUTPUT_VERBOSE((20, orte_odls_base_framework.framework_output,
"%s odls:default:WAITPID INDICATES PID %d MAY HAVE ALREADY EXITED",
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)(child->pid)));
/* Do nothing, process still alive */
/* Do nothing, process still alive */
} else if (-1 == ret && ECHILD == errno) {
/* The pid no longer exists, so we'll call this "good
enough for government work" */
Expand All @@ -228,23 +228,10 @@ static bool odls_default_child_died(orte_proc_t *child)
return false;
}


/* deliver a signal to a specified pid. */
static int odls_default_kill_local(pid_t pid, int signum)
{
pid_t pgrp;

#if HAVE_SETPGID
pgrp = getpgid(pid);
if (-1 != pgrp) {
/* target the lead process of the process
* group so we ensure that the signal is
* seen by all members of that group. This
* ensures that the signal is seen by any
* child processes our child may have
* started
*/
pid = pgrp;
}
#endif
if (0 != kill(pid, signum)) {
if (ESRCH != errno) {
OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
Expand Down Expand Up @@ -391,13 +378,6 @@ static int do_child(orte_app_context_t* context,
long fd, fdmax = sysconf(_SC_OPEN_MAX);
char *param, *msg;

if (orte_forward_job_control) {
/* Set a new process group for this child, so that a
SIGSTOP can be sent to it without being sent to the
orted. */
setpgid(0, 0);
}

/* Setup the pipe to be close-on-exec */
opal_fd_set_cloexec(write_fd);

Expand Down Expand Up @@ -720,10 +700,7 @@ static int odls_default_fork_local_proc(orte_app_context_t* context,
}

if (pid == 0) {
close(p[0]);
#if HAVE_SETPGID
setpgid(0, 0);
#endif
close(p[0]);
do_child(context, child, environ_copy, jobdat, p[1], opts);
/* Does not return */
}
Expand Down Expand Up @@ -770,11 +747,6 @@ static int send_signal(pid_t pid, int signal)
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
signal, (long)pid));

if (orte_forward_job_control) {
/* Send the signal to the process group rather than the
process. The child is the leader of its process group. */
pid = -pid;
}
if (kill(pid, signal) != 0) {
switch(errno) {
case EINVAL:
Expand Down
3 changes: 0 additions & 3 deletions orte/runtime/orte_globals.c
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,6 @@ char *orte_output_filename = NULL;
/* generate new xterm windows to display output from specified ranks */
char *orte_xterm = NULL;

/* whether or not to forward SIGTSTP and SIGCONT signals */
bool orte_forward_job_control = false;

/* report launch progress */
bool orte_report_launch_progress = false;

Expand Down
3 changes: 0 additions & 3 deletions orte/runtime/orte_globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -521,9 +521,6 @@ ORTE_DECLSPEC extern opal_pointer_array_t *orte_node_topologies;
ORTE_DECLSPEC extern opal_pointer_array_t *orte_local_children;
ORTE_DECLSPEC extern orte_vpid_t orte_total_procs;

/* whether or not to forward SIGTSTP and SIGCONT signals */
ORTE_DECLSPEC extern bool orte_forward_job_control;

/* IOF controls */
ORTE_DECLSPEC extern bool orte_tag_output;
ORTE_DECLSPEC extern bool orte_timestamp_output;
Expand Down
8 changes: 0 additions & 8 deletions orte/runtime/orte_mca_params.c
Original file line number Diff line number Diff line change
Expand Up @@ -543,14 +543,6 @@ int orte_register_params(void)
orte_map_stddiag_to_stderr = true;
}

/* whether or not to forward SIGTSTP and SIGCONT signals */
orte_forward_job_control = false;
(void) mca_base_var_register ("orte", "orte", NULL, "forward_job_control",
"Forward SIGTSTP (after converting to SIGSTOP) and SIGCONT signals to the application procs [default: no]",
MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_READONLY,
&orte_forward_job_control);

/* whether or not to report launch progress */
orte_report_launch_progress = false;
(void) mca_base_var_register ("orte", "orte", NULL, "report_launch_progress",
Expand Down
4 changes: 1 addition & 3 deletions orte/tools/orte-submit/orte-submit.1in
Original file line number Diff line number Diff line change
Expand Up @@ -1133,9 +1133,7 @@ SIGUSR1 and SIGUSR2 signals received by orte-submit are propagated to
all processes in the job.
.
.PP
One can turn on forwarding of SIGSTOP and SIGCONT to the program executed
by ompi-submit by setting the MCA parameter orte_forward_job_control to 1.
A SIGTSTOP signal to ompi-submit will then cause a SIGSTOP signal to be sent
A SIGTSTP signal to ompi-submit will cause a SIGSTOP signal to be sent
to all of the programs started by ompi-submit, and likewise a SIGCONT signal
to ompi-submit will cause a SIGCONT signal to be sent.
.
Expand Down
4 changes: 1 addition & 3 deletions orte/tools/orterun/orterun.1in
Original file line number Diff line number Diff line change
Expand Up @@ -1240,9 +1240,7 @@ SIGUSR1 and SIGUSR2 signals received by orterun are propagated to
all processes in the job.
.
.PP
One can turn on forwarding of SIGSTOP and SIGCONT to the program executed
by mpirun by setting the MCA parameter orte_forward_job_control to 1.
A SIGTSTOP signal to mpirun will then cause a SIGSTOP signal to be sent
A SIGTSTP signal to mpirun will cause a SIGSTOP signal to be sent
to all of the programs started by mpirun, and likewise a SIGCONT signal
to mpirun will cause a SIGCONT signal to be sent.
.
Expand Down