Skip to content

Commit ff293ec

Browse files
authored
Merge pull request #6998 from jsquyres/pr/v3.0.x/vader-do-not-use-cma
v3.0.x: Do not use CMA in user namespaces
2 parents dc61f8f + 9de3d0f commit ff293ec

File tree

4 files changed

+151
-5
lines changed

4 files changed

+151
-5
lines changed

opal/mca/btl/vader/btl_vader.h

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,12 @@ union vader_modex_t {
8181
void *segment_base;
8282
} xpmem;
8383
#endif
84-
opal_shmem_ds_t seg_ds;
84+
struct vader_modex_other_t {
85+
ino_t user_ns_id;
86+
int seg_ds_size;
87+
/* seg_ds needs to be the last element */
88+
opal_shmem_ds_t seg_ds;
89+
} other;
8590
};
8691

8792
/**
@@ -261,6 +266,31 @@ int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
261266
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
262267
#endif
263268

269+
/**
 * Get the ID of the user namespace of the current process.
 *
 * @returns the user namespace ID, or 0 if the ID could not be determined
 */
ino_t mca_btl_vader_get_user_ns_id(void);
270+
271+
int mca_btl_vader_get_sc_emu (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
272+
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
273+
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
274+
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
275+
276+
int mca_btl_vader_emu_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
277+
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
278+
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
279+
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
280+
281+
int mca_btl_vader_emu_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
282+
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
283+
mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
284+
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
285+
void *cbcontext, void *cbdata);
286+
287+
int mca_btl_vader_emu_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
288+
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
289+
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
290+
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
291+
292+
void mca_btl_vader_sc_emu_init (void);
293+
264294
/**
265295
* Allocate a segment.
266296
*

opal/mca/btl/vader/btl_vader_component.c

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@
3737
#include "btl_vader_fbox.h"
3838
#include "btl_vader_xpmem.h"
3939

40+
#ifdef HAVE_SYS_STAT_H
41+
#include <sys/stat.h>
42+
#endif
43+
4044
#include <sys/mman.h>
4145
#include <fcntl.h>
4246

@@ -324,6 +328,25 @@ static int mca_btl_vader_component_close(void)
324328
return OPAL_SUCCESS;
325329
}
326330

331+
/*
 * mca_btl_vader_get_user_ns_id() tries to get the user namespace ID
 * of the current process.
 *
 * The ID is the inode number that stat() reports for /proc/self/ns/user.
 *
 * Returns the ID of the user namespace. In the case of an error '0' is returned.
 */
ino_t mca_btl_vader_get_user_ns_id(void)
{
    struct stat buf;

    if (0 > stat("/proc/self/ns/user", &buf)) {
        /*
         * Something went wrong, probably an old kernel that does not support namespaces
         * simply assume all processes are in the same user namespace and return 0
         */
        return 0;
    }

    return buf.st_ino;
}
327350
static int mca_btl_base_vader_modex_send (void)
328351
{
329352
union vader_modex_t modex;
@@ -337,8 +360,16 @@ static int mca_btl_base_vader_modex_send (void)
337360
modex_size = sizeof (modex.xpmem);
338361
} else {
339362
#endif
340-
modex_size = opal_shmem_sizeof_shmem_ds (&mca_btl_vader_component.seg_ds);
341-
memmove (&modex.seg_ds, &mca_btl_vader_component.seg_ds, modex_size);
363+
modex.other.seg_ds_size = opal_shmem_sizeof_shmem_ds (&mca_btl_vader_component.seg_ds);
364+
memmove (&modex.other.seg_ds, &mca_btl_vader_component.seg_ds, modex.other.seg_ds_size);
365+
modex.other.user_ns_id = mca_btl_vader_get_user_ns_id();
366+
/*
367+
* If modex.other.user_ns_id is '0' something did not work out
368+
* during user namespace detection. Assuming there are no
369+
* namespaces available it will return '0' for all processes and
370+
* the check later will see '0' everywhere and not disable CMA.
371+
*/
372+
modex_size = sizeof (modex.other);
342373

343374
#if OPAL_BTL_VADER_HAVE_XPMEM
344375
}

opal/mca/btl/vader/btl_vader_module.c

Lines changed: 68 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
*/
2626

2727
#include "opal_config.h"
28+
#include "opal/util/show_help.h"
2829

2930
#include "btl_vader.h"
3031
#include "btl_vader_endpoint.h"
@@ -77,6 +78,28 @@ mca_btl_vader_t mca_btl_vader = {
7778
}
7879
};
7980

81+
/*
82+
* Exit function copied from btl_usnic_util.c
83+
*
84+
* The following comment tells Coverity that this function does not return.
85+
* See https://scan.coverity.com/tune.
86+
*/
87+
88+
/* coverity[+kill] */
89+
static void vader_btl_exit(mca_btl_vader_t *btl)
{
    /* Report a fatal error through the module's error callback, but only
     * when both the module and a registered callback are present. */
    const int have_error_cb = (NULL != btl) && (NULL != btl->error_cb);

    if (have_error_cb) {
        btl->error_cb(&btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
                      (opal_proc_t*) opal_proc_local_get(),
                      "The vader BTL is aborting the MPI job (via PML error callback).");
    }

    /* Reached when there was no callback or the callback returned:
     * announce the abort on stderr and terminate the process directly. */
    fputs("*** The Open MPI vader BTL is aborting the MPI job (via exit(3)).\n", stderr);
    fflush(stderr);
    exit(1);
}
102+
80103
static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
81104
{
82105
mca_btl_vader_component_t *component = &mca_btl_vader_component;
@@ -158,6 +181,7 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
158181
static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_proc_t *proc, int remote_rank) {
159182
mca_btl_vader_component_t *component = &mca_btl_vader_component;
160183
union vader_modex_t *modex;
184+
ino_t my_user_ns_id;
161185
size_t msg_size;
162186
int rc;
163187

@@ -182,17 +206,59 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_
182206
} else {
183207
#endif
184208
/* store a copy of the segment information for detach */
185-
ep->segment_data.other.seg_ds = malloc (msg_size);
209+
ep->segment_data.other.seg_ds = malloc (modex->other.seg_ds_size);
186210
if (NULL == ep->segment_data.other.seg_ds) {
187211
return OPAL_ERR_OUT_OF_RESOURCE;
188212
}
189213

190-
memcpy (ep->segment_data.other.seg_ds, &modex->seg_ds, msg_size);
214+
memcpy (ep->segment_data.other.seg_ds, &modex->other.seg_ds, modex->other.seg_ds_size);
191215

192216
ep->segment_base = opal_shmem_segment_attach (ep->segment_data.other.seg_ds);
193217
if (NULL == ep->segment_base) {
194218
return OPAL_ERROR;
195219
}
220+
221+
if (MCA_BTL_VADER_CMA == mca_btl_vader_component.single_copy_mechanism) {
222+
my_user_ns_id = mca_btl_vader_get_user_ns_id();
223+
if (my_user_ns_id != modex->other.user_ns_id) {
224+
mca_base_var_source_t source;
225+
int vari;
226+
rc = mca_base_var_find_by_name("btl_vader_single_copy_mechanism", &vari);
227+
if (OPAL_ERROR == rc) {
228+
return OPAL_ERROR;
229+
}
230+
rc = mca_base_var_get_value(vari, NULL, &source, NULL);
231+
if (OPAL_ERROR == rc) {
232+
return OPAL_ERROR;
233+
}
234+
/*
235+
* CMA is not possible as different user namespaces are in use.
236+
* Currently the kernel does not allow process_vm_{read,write}v()
237+
* for processes running in different user namespaces even if
238+
* all involved user IDs are mapped to the same user ID.
239+
*
240+
* Fall back to MCA_BTL_VADER_NONE.
241+
*/
242+
if (MCA_BASE_VAR_SOURCE_DEFAULT != source) {
243+
/* If CMA has been explicitly selected we want to error out */
244+
opal_show_help("help-btl-vader.txt", "cma-different-user-namespace-error",
245+
true, opal_process_info.nodename);
246+
vader_btl_exit(&mca_btl_vader);
247+
}
248+
/*
249+
* If CMA has been selected because it is the default or
250+
* some fallback, this falls back even further.
251+
*/
252+
opal_show_help("help-btl-vader.txt", "cma-different-user-namespace-warning",
253+
true, opal_process_info.nodename);
254+
mca_btl_vader_component.single_copy_mechanism = MCA_BTL_VADER_NONE;
255+
mca_btl_vader.super.btl_flags &= ~MCA_BTL_FLAGS_RDMA;
256+
mca_btl_vader.super.btl_get = NULL;
257+
mca_btl_vader.super.btl_put = NULL;
258+
mca_btl_vader.super.btl_put_limit = 0;
259+
mca_btl_vader.super.btl_get_limit = 0;
260+
}
261+
}
196262
#if OPAL_BTL_VADER_HAVE_XPMEM
197263
}
198264
#endif

opal/mca/btl/vader/help-btl-vader.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,25 @@ WARNING: Linux kernel CMA support was requested via the
121121
btl_vader_single_copy_mechanism MCA variable, but CMA support is
122122
not available due to restrictive ptrace settings.
123123

124+
The vader shared memory BTL will fall back on another single-copy
125+
mechanism if one is available. This may result in lower performance.
126+
127+
Local host: %s
128+
#
129+
[cma-different-user-namespace-error]
130+
ERROR: Linux kernel CMA support was requested via the
131+
btl_vader_single_copy_mechanism MCA variable, but CMA support is
132+
not available due to different user namespaces.
133+
134+
Your MPI job will abort now. Please select another value for
135+
btl_vader_single_copy_mechanism.
136+
137+
Local host: %s
138+
#
139+
[cma-different-user-namespace-warning]
140+
WARNING: The default btl_vader_single_copy_mechanism CMA is
141+
not available due to different user namespaces.
142+
124143
The vader shared memory BTL will fall back on another single-copy
125144
mechanism if one is available. This may result in lower performance.
126145

0 commit comments

Comments
 (0)