52
52
53
53
extern ompi_rte_orte_component_t mca_rte_orte_component ;
54
54
55
/*
 * Tracking object for one error-handler registration request.
 *
 * ompi_rte_init fills it in, passes its address as the callback data of
 * opal_pmix.register_errhandler, and waits on 'active'; register_cbfunc
 * stores the results and clears 'active'.
 */
typedef struct {
    volatile bool active;   /* true while the registration is outstanding */
    int status;             /* status value delivered to register_cbfunc */
    int errhandler;         /* errhandler value delivered to register_cbfunc */
} errhandler_t;
60
-
61
- static void register_cbfunc (int status , int errhndler , void * cbdata )
62
- {
63
- errhandler_t * cd = (errhandler_t * )cbdata ;
64
- cd -> status = status ;
65
- cd -> errhandler = errhndler ;
66
- cd -> active = false;
67
- }
68
-
69
/* Remains true until notify_cbfunc runs and sets it to false;
 * ompi_rte_wait_for_debugger waits on it. */
static volatile bool wait_for_release = true;

/* Errhandler value saved by ompi_rte_init after a successful
 * registration; -1 until then. */
static int errhandler = -1;
71
-
72
- static void notify_cbfunc (int status ,
73
- opal_list_t * procs ,
74
- opal_list_t * info ,
75
- opal_pmix_release_cbfunc_t cbfunc ,
76
- void * cbdata )
77
- {
78
- if (NULL != cbfunc ) {
79
- cbfunc (cbdata );
80
- }
81
- wait_for_release = false;
82
- }
83
-
84
-
85
- int ompi_rte_init (int * pargc , char * * * pargv )
86
- {
87
- int rc ;
88
- opal_list_t info ;
89
- opal_value_t val ;
90
- errhandler_t cd ;
91
-
92
- if (ORTE_SUCCESS != (rc = orte_init (pargc , pargv , ORTE_PROC_MPI ))) {
93
- return rc ;
94
- }
95
-
96
- if (!orte_standalone_operation ) {
97
- /* register to receive any debugger release */
98
- OBJ_CONSTRUCT (& info , opal_list_t );
99
- OBJ_CONSTRUCT (& val , opal_value_t );
100
- val .key = strdup (OPAL_PMIX_ERROR_NAME );
101
- val .type = OPAL_INT ;
102
- val .data .integer = OPAL_ERR_DEBUGGER_RELEASE ;
103
- opal_list_append (& info , & val .super );
104
- cd .status = ORTE_ERROR ;
105
- cd .errhandler = -1 ;
106
- cd .active = true;
107
-
108
- opal_pmix .register_errhandler (& info , notify_cbfunc , register_cbfunc , & cd );
109
-
110
- /* let the MPI progress engine run while we wait for
111
- * registration to complete */
112
- OMPI_WAIT_FOR_COMPLETION (cd .active );
113
- /* safely deconstruct the list */
114
- opal_list_remove_first (& info );
115
- OBJ_DESTRUCT (& val );
116
- OBJ_DESTRUCT (& info );
117
- if (OPAL_SUCCESS != cd .status ) {
118
- /* ouch - we are doomed */
119
- ORTE_ERROR_LOG (cd .status );
120
- return OMPI_ERROR ;
121
- }
122
- errhandler = cd .errhandler ;
123
- }
124
-
125
- return OMPI_SUCCESS ;
126
- }
127
-
128
55
void ompi_rte_abort (int error_code , char * fmt , ...)
129
56
{
130
57
va_list arglist ;
@@ -173,10 +100,10 @@ void ompi_rte_abort(int error_code, char *fmt, ...)
173
100
* attaching debuggers -- see big comment in
174
101
* orte/tools/orterun/debuggers.c explaining the two scenarios.
175
102
*/
176
-
177
103
void ompi_rte_wait_for_debugger (void )
178
104
{
179
105
int debugger ;
106
+ orte_rml_recv_cb_t xfer ;
180
107
181
108
/* See lengthy comment in orte/tools/orterun/debuggers.c about
182
109
orte_in_parallel_debugger */
@@ -186,16 +113,16 @@ void ompi_rte_wait_for_debugger(void)
186
113
debugger = 1 ;
187
114
}
188
115
189
- if (!debugger ) {
116
+ if (!debugger && NULL == getenv ( "ORTE_TEST_DEBUGGER_ATTACH" ) ) {
190
117
/* if not, just return */
191
118
return ;
192
119
}
120
+
193
121
/* if we are being debugged, then we need to find
194
122
* the correct plug-ins
195
123
*/
196
124
ompi_debugger_setup_dlls ();
197
125
198
- /* wait for the debugger to attach */
199
126
if (orte_standalone_operation ) {
200
127
/* spin until debugger attaches and releases us */
201
128
while (MPIR_debug_gate == 0 ) {
@@ -206,9 +133,23 @@ void ompi_rte_wait_for_debugger(void)
206
133
#endif
207
134
}
208
135
} else {
209
- /* now wait for the notification to occur */
210
- OMPI_WAIT_FOR_COMPLETION (wait_for_release );
211
- /* deregister the errhandler */
212
- opal_pmix .deregister_errhandler (errhandler , NULL , NULL );
136
+ /* only the rank=0 proc waits for either a message from the
137
+ * HNP or for the debugger to attach - everyone else will just
138
+ * spin in the grpcomm barrier in ompi_mpi_init until rank=0
139
+ * joins them.
140
+ */
141
+ if (0 != ORTE_PROC_MY_NAME -> vpid ) {
142
+ return ;
143
+ }
144
+
145
+ /* VPID 0 waits for a message from the HNP */
146
+ OBJ_CONSTRUCT (& xfer , orte_rml_recv_cb_t );
147
+ xfer .active = true;
148
+ orte_rml .recv_buffer_nb (OMPI_NAME_WILDCARD ,
149
+ ORTE_RML_TAG_DEBUGGER_RELEASE ,
150
+ ORTE_RML_NON_PERSISTENT ,
151
+ orte_rml_recv_callback , & xfer );
152
+ /* let the MPI progress engine run while we wait */
153
+ OMPI_WAIT_FOR_COMPLETION (xfer .active );
213
154
}
214
155
}
0 commit comments