Skip to content

Commit e19e210

Browse files
adrianreber authored and jsquyres committed
Do not use CMA in user namespaces
Trying to run processes via mpirun in Podman containers has shown that the CMA btl_vader_single_copy_mechanism does not work when user namespaces are involved. Creating containers with Podman requires at least user namespaces to be able to do unprivileged mounts in a container. Even when running the containers with user namespace user ID mappings which result in the same user ID on the inside and outside of all involved containers, the check in the kernel to allow ptrace (and thus process_vm_{read,write}v()) fails if the same IDs are not in the same user namespace. One workaround is to specify '--mca btl_vader_single_copy_mechanism none' and this commit adds code to automatically skip CMA if user namespaces are detected and fall back to MCA_BTL_VADER_NONE (as opposed to MCA_BTL_VADER_EMUL on master as of 2019-09-21 and the v4.0.x branch). Signed-off-by: Adrian Reber <areber@redhat.com> Signed-off-by: Jeff Squyres <jsquyres@cisco.com> (cherry picked from commit fc68d8a)
1 parent 231bcda commit e19e210

File tree

4 files changed

+151
-5
lines changed

4 files changed

+151
-5
lines changed

opal/mca/btl/vader/btl_vader.h

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,12 @@ union vader_modex_t {
8181
void *segment_base;
8282
} xpmem;
8383
#endif
84-
opal_shmem_ds_t seg_ds;
84+
struct vader_modex_other_t {
85+
ino_t user_ns_id;
86+
int seg_ds_size;
87+
/* seg_ds needs to be the last element */
88+
opal_shmem_ds_t seg_ds;
89+
} other;
8590
};
8691

8792
/**
@@ -261,6 +266,31 @@ int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
261266
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
262267
#endif
263268

269+
ino_t mca_btl_vader_get_user_ns_id(void);
270+
271+
int mca_btl_vader_get_sc_emu (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
272+
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
273+
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
274+
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
275+
276+
int mca_btl_vader_emu_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
277+
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
278+
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
279+
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
280+
281+
int mca_btl_vader_emu_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
282+
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
283+
mca_btl_base_registration_handle_t *remote_handle, mca_btl_base_atomic_op_t op,
284+
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
285+
void *cbcontext, void *cbdata);
286+
287+
int mca_btl_vader_emu_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
288+
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
289+
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
290+
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
291+
292+
void mca_btl_vader_sc_emu_init (void);
293+
264294
/**
265295
* Allocate a segment.
266296
*

opal/mca/btl/vader/btl_vader_component.c

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@
3737
#include "btl_vader_fbox.h"
3838
#include "btl_vader_xpmem.h"
3939

40+
#ifdef HAVE_SYS_STAT_H
41+
#include <sys/stat.h>
42+
#endif
43+
4044
#include <sys/mman.h>
4145
#include <fcntl.h>
4246

@@ -324,6 +328,25 @@ static int mca_btl_vader_component_close(void)
324328
return OPAL_SUCCESS;
325329
}
326330

331+
/*
332+
* mca_btl_vader_get_user_ns_id() tries to get the user namespace ID
333+
* of the current process by calling stat() on /proc/self/ns/user.
334+
* Returns the inode number (st_ino) of that entry as the user namespace ID. If stat() fails, '0' is returned.
335+
*/
336+
ino_t mca_btl_vader_get_user_ns_id(void)
337+
{
338+
struct stat buf;
339+
340+
if (0 > stat("/proc/self/ns/user", &buf)) {
341+
/*
342+
* Something went wrong, probably an old kernel that does not support namespaces.
343+
* Simply assume all processes are in the same user namespace and return 0.
344+
*/
345+
return 0;
346+
}
347+
348+
return buf.st_ino;
349+
}
327350
static int mca_btl_base_vader_modex_send (void)
328351
{
329352
union vader_modex_t modex;
@@ -337,8 +360,16 @@ static int mca_btl_base_vader_modex_send (void)
337360
modex_size = sizeof (modex.xpmem);
338361
} else {
339362
#endif
340-
modex_size = opal_shmem_sizeof_shmem_ds (&mca_btl_vader_component.seg_ds);
341-
memmove (&modex.seg_ds, &mca_btl_vader_component.seg_ds, modex_size);
363+
modex.other.seg_ds_size = opal_shmem_sizeof_shmem_ds (&mca_btl_vader_component.seg_ds);
364+
memmove (&modex.other.seg_ds, &mca_btl_vader_component.seg_ds, modex.other.seg_ds_size);
365+
modex.other.user_ns_id = mca_btl_vader_get_user_ns_id();
366+
/*
367+
* If modex.other.user_ns_id is '0' something did not work out
368+
* during user namespace detection. Assuming there are no
369+
* namespaces available it will return '0' for all processes and
370+
* the check later will see '0' everywhere and not disable CMA.
371+
*/
372+
modex_size = sizeof (modex.other);
342373

343374
#if OPAL_BTL_VADER_HAVE_XPMEM
344375
}

opal/mca/btl/vader/btl_vader_module.c

Lines changed: 68 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
*/
2626

2727
#include "opal_config.h"
28+
#include "opal/util/show_help.h"
2829

2930
#include "btl_vader.h"
3031
#include "btl_vader_endpoint.h"
@@ -77,6 +78,28 @@ mca_btl_vader_t mca_btl_vader = {
7778
}
7879
};
7980

81+
/*
82+
* Abort the MPI job from the vader BTL (exit function copied from btl_usnic_util.c).
83+
* If btl is non-NULL and has an error_cb set, that callback is invoked with MCA_BTL_ERROR_FLAGS_FATAL; afterwards a message is written to stderr and exit(1) is called.
84+
* The following comment tells Coverity that this function does not return.
85+
* See https://scan.coverity.com/tune.
86+
*/
87+
88+
/* coverity[+kill] */
89+
static void vader_btl_exit(mca_btl_vader_t *btl)
90+
{
91+
if (NULL != btl && NULL != btl->error_cb) {
92+
btl->error_cb(&btl->super, MCA_BTL_ERROR_FLAGS_FATAL,
93+
(opal_proc_t*) opal_proc_local_get(),
94+
"The vader BTL is aborting the MPI job (via PML error callback).");
95+
}
96+
97+
/* If the error callback returns (or if there wasn't one), fall through and terminate the process via exit(1). */
98+
fprintf(stderr, "*** The Open MPI vader BTL is aborting the MPI job (via exit(3)).\n");
99+
fflush(stderr);
100+
exit(1);
101+
}
102+
80103
static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
81104
{
82105
mca_btl_vader_component_t *component = &mca_btl_vader_component;
@@ -158,6 +181,7 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
158181
static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_proc_t *proc, int remote_rank) {
159182
mca_btl_vader_component_t *component = &mca_btl_vader_component;
160183
union vader_modex_t *modex;
184+
ino_t my_user_ns_id;
161185
size_t msg_size;
162186
int rc;
163187

@@ -182,17 +206,59 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_
182206
} else {
183207
#endif
184208
/* store a copy of the segment information for detach */
185-
ep->segment_data.other.seg_ds = malloc (msg_size);
209+
ep->segment_data.other.seg_ds = malloc (modex->other.seg_ds_size);
186210
if (NULL == ep->segment_data.other.seg_ds) {
187211
return OPAL_ERR_OUT_OF_RESOURCE;
188212
}
189213

190-
memcpy (ep->segment_data.other.seg_ds, &modex->seg_ds, msg_size);
214+
memcpy (ep->segment_data.other.seg_ds, &modex->other.seg_ds, modex->other.seg_ds_size);
191215

192216
ep->segment_base = opal_shmem_segment_attach (ep->segment_data.other.seg_ds);
193217
if (NULL == ep->segment_base) {
194218
return OPAL_ERROR;
195219
}
220+
221+
if (MCA_BTL_VADER_CMA == mca_btl_vader_component.single_copy_mechanism) {
222+
my_user_ns_id = mca_btl_vader_get_user_ns_id();
223+
if (my_user_ns_id != modex->other.user_ns_id) {
224+
mca_base_var_source_t source;
225+
int vari;
226+
rc = mca_base_var_find_by_name("btl_vader_single_copy_mechanism", &vari);
227+
if (OPAL_ERROR == rc) {
228+
return OPAL_ERROR;
229+
}
230+
rc = mca_base_var_get_value(vari, NULL, &source, NULL);
231+
if (OPAL_ERROR == rc) {
232+
return OPAL_ERROR;
233+
}
234+
/*
235+
* CMA is not possible as different user namespaces are in use.
236+
* Currently the kernel does not allow process_vm_{read,write}v()
237+
* for processes running in different user namespaces even if
238+
* all involved user IDs are mapped to the same user ID.
239+
*
240+
* Fallback to MCA_BTL_VADER_NONE.
241+
*/
242+
if (MCA_BASE_VAR_SOURCE_DEFAULT != source) {
243+
/* If CMA has been explicitly selected we want to error out */
244+
opal_show_help("help-btl-vader.txt", "cma-different-user-namespace-error",
245+
true, opal_process_info.nodename);
246+
vader_btl_exit(&mca_btl_vader);
247+
}
248+
/*
249+
* If CMA has been selected because it is the default or
250+
* some fallback, this falls back even further.
251+
*/
252+
opal_show_help("help-btl-vader.txt", "cma-different-user-namespace-warning",
253+
true, opal_process_info.nodename);
254+
mca_btl_vader_component.single_copy_mechanism = MCA_BTL_VADER_NONE;
255+
mca_btl_vader.super.btl_flags &= ~MCA_BTL_FLAGS_RDMA;
256+
mca_btl_vader.super.btl_get = NULL;
257+
mca_btl_vader.super.btl_put = NULL;
258+
mca_btl_vader.super.btl_put_limit = 0;
259+
mca_btl_vader.super.btl_get_limit = 0;
260+
}
261+
}
196262
#if OPAL_BTL_VADER_HAVE_XPMEM
197263
}
198264
#endif

opal/mca/btl/vader/help-btl-vader.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,25 @@ WARNING: Linux kernel CMA support was requested via the
121121
btl_vader_single_copy_mechanism MCA variable, but CMA support is
122122
not available due to restrictive ptrace settings.
123123

124+
The vader shared memory BTL will fall back on another single-copy
125+
mechanism if one is available. This may result in lower performance.
126+
127+
Local host: %s
128+
#
129+
[cma-different-user-namespace-error]
130+
ERROR: Linux kernel CMA support was requested via the
131+
btl_vader_single_copy_mechanism MCA variable, but CMA support is
132+
not available due to different user namespaces.
133+
134+
Your MPI job will abort now. Please select another value for
135+
btl_vader_single_copy_mechanism.
136+
137+
Local host: %s
138+
#
139+
[cma-different-user-namespace-warning]
140+
WARNING: The default btl_vader_single_copy_mechanism CMA is
141+
not available due to different user namespaces.
142+
124143
The vader shared memory BTL will fall back on another single-copy
125144
mechanism if one is available. This may result in lower performance.
126145

0 commit comments

Comments
 (0)