Skip to content

Commit

Permalink
Merge pull request #6844 from adrianreber/check_for_user_ns
Browse files Browse the repository at this point in the history
Do not use CMA in user namespaces
  • Loading branch information
jsquyres authored Sep 21, 2019
2 parents a7da93f + fc68d8a commit 8038fac
Show file tree
Hide file tree
Showing 4 changed files with 127 additions and 5 deletions.
9 changes: 8 additions & 1 deletion opal/mca/btl/vader/btl_vader.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,12 @@ union vader_modex_t {
void *segment_base;
} xpmem;
#endif
opal_shmem_ds_t seg_ds;
/*
 * Modex payload for the non-xpmem case: the sender's user namespace ID
 * together with its shared-memory segment descriptor.
 */
struct vader_modex_other_t {
/* user namespace ID of the sending process as returned by
 * mca_btl_vader_get_user_ns_id(); '0' means it could not be determined */
ino_t user_ns_id;
/* number of bytes of seg_ds copied in by the sender, as reported by
 * opal_shmem_sizeof_shmem_ds() */
int seg_ds_size;
/* seg_ds needs to be the last element */
opal_shmem_ds_t seg_ds;
} other;
};

/**
Expand Down Expand Up @@ -270,6 +275,8 @@ int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
#endif

ino_t mca_btl_vader_get_user_ns_id(void);

int mca_btl_vader_get_sc_emu (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
Expand Down
35 changes: 33 additions & 2 deletions opal/mca/btl/vader/btl_vader_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@
#include "btl_vader_fbox.h"
#include "btl_vader_xpmem.h"

#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif

#include <sys/mman.h>
#include <fcntl.h>

Expand Down Expand Up @@ -351,6 +355,25 @@ static int mca_btl_vader_component_close(void)
return OPAL_SUCCESS;
}

/*
 * mca_btl_vader_get_user_ns_id() identifies the user namespace of the
 * calling process by the inode number of /proc/self/ns/user.
 * Returns that inode number, or '0' if it could not be obtained.
 */
ino_t mca_btl_vader_get_user_ns_id(void)
{
    struct stat ns_stat;
    int rc = stat("/proc/self/ns/user", &ns_stat);

    /*
     * A failing stat() most likely means the kernel is too old to support
     * namespaces. In that case treat every process as living in the same
     * user namespace and report '0'.
     */
    return (rc < 0) ? (ino_t) 0 : ns_stat.st_ino;
}
static int mca_btl_base_vader_modex_send (void)
{
union vader_modex_t modex;
Expand All @@ -364,8 +387,16 @@ static int mca_btl_base_vader_modex_send (void)
modex_size = sizeof (modex.xpmem);
} else {
#endif
modex_size = opal_shmem_sizeof_shmem_ds (&mca_btl_vader_component.seg_ds);
memmove (&modex.seg_ds, &mca_btl_vader_component.seg_ds, modex_size);
modex.other.seg_ds_size = opal_shmem_sizeof_shmem_ds (&mca_btl_vader_component.seg_ds);
memmove (&modex.other.seg_ds, &mca_btl_vader_component.seg_ds, modex.other.seg_ds_size);
modex.other.user_ns_id = mca_btl_vader_get_user_ns_id();
/*
* If modex.other.user_ns_id is '0' something did not work out
* during user namespace detection. Assuming there are no
* namespaces available it will return '0' for all processes and
* the check later will see '0' everywhere and not disable CMA.
*/
modex_size = sizeof (modex.other);

#if OPAL_BTL_VADER_HAVE_XPMEM
}
Expand Down
69 changes: 67 additions & 2 deletions opal/mca/btl/vader/btl_vader_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
*/

#include "opal_config.h"
#include "opal/util/show_help.h"

#include "btl_vader.h"
#include "btl_vader_endpoint.h"
Expand Down Expand Up @@ -79,6 +80,28 @@ mca_btl_vader_t mca_btl_vader = {
}
};

/*
 * Fatal-error exit path for the vader BTL (copied from btl_usnic_util.c).
 *
 * If the module has a PML error callback registered, it is invoked first
 * with MCA_BTL_ERROR_FLAGS_FATAL. Should that callback return, or if none
 * is registered, a message is written to stderr and the process exits
 * with status 1.
 *
 * The marker comment right above the function tells Coverity that this
 * function does not return. See https://scan.coverity.com/tune.
 */

/* coverity[+kill] */
static void vader_btl_exit(mca_btl_vader_t *btl)
{
    if (btl && btl->error_cb) {
        opal_proc_t *local_proc = (opal_proc_t*) opal_proc_local_get();

        btl->error_cb(&btl->super, MCA_BTL_ERROR_FLAGS_FATAL, local_proc,
                      "The vader BTL is aborting the MPI job (via PML error callback).");
    }

    /* Either there was no callback or it came back to us: bail out ourselves. */
    fputs("*** The Open MPI vader BTL is aborting the MPI job (via exit(3)).\n", stderr);
    fflush(stderr);
    exit(1);
}

static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
{
mca_btl_vader_component_t *component = &mca_btl_vader_component;
Expand Down Expand Up @@ -173,6 +196,7 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_proc_t *proc, int remote_rank) {
mca_btl_vader_component_t *component = &mca_btl_vader_component;
union vader_modex_t *modex;
ino_t my_user_ns_id;
size_t msg_size;
int rc;

Expand All @@ -197,17 +221,58 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_
} else {
#endif
/* store a copy of the segment information for detach */
ep->segment_data.other.seg_ds = malloc (msg_size);
ep->segment_data.other.seg_ds = malloc (modex->other.seg_ds_size);
if (NULL == ep->segment_data.other.seg_ds) {
return OPAL_ERR_OUT_OF_RESOURCE;
}

memcpy (ep->segment_data.other.seg_ds, &modex->seg_ds, msg_size);
memcpy (ep->segment_data.other.seg_ds, &modex->other.seg_ds, modex->other.seg_ds_size);

ep->segment_base = opal_shmem_segment_attach (ep->segment_data.other.seg_ds);
if (NULL == ep->segment_base) {
return OPAL_ERROR;
}

if (MCA_BTL_VADER_CMA == mca_btl_vader_component.single_copy_mechanism) {
my_user_ns_id = mca_btl_vader_get_user_ns_id();
if (my_user_ns_id != modex->other.user_ns_id) {
mca_base_var_source_t source;
int vari;
rc = mca_base_var_find_by_name("btl_vader_single_copy_mechanism", &vari);
if (OPAL_ERROR == rc) {
return OPAL_ERROR;
}
rc = mca_base_var_get_value(vari, NULL, &source, NULL);
if (OPAL_ERROR == rc) {
return OPAL_ERROR;
}
/*
* CMA is not possible as different user namespaces are in use.
* Currently the kernel does not allow process_vm_{read,write}v()
* for processes running in different user namespaces even if
* all involved user IDs are mapped to the same user ID.
*
* Fall back to MCA_BTL_VADER_EMUL.
*/
if (MCA_BASE_VAR_SOURCE_DEFAULT != source) {
/* If CMA has been explicitly selected we want to error out */
opal_show_help("help-btl-vader.txt", "cma-different-user-namespace-error",
true, opal_process_info.nodename);
vader_btl_exit(&mca_btl_vader);
}
/*
* If CMA has been selected because it is the default or
* some fallback, this falls back even further.
*/
opal_show_help("help-btl-vader.txt", "cma-different-user-namespace-warning",
true, opal_process_info.nodename);
mca_btl_vader_component.single_copy_mechanism = MCA_BTL_VADER_EMUL;
mca_btl_vader.super.btl_get = mca_btl_vader_get_sc_emu;
mca_btl_vader.super.btl_put = mca_btl_vader_put_sc_emu;
mca_btl_vader.super.btl_put_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t);
mca_btl_vader.super.btl_get_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t);
}
}
#if OPAL_BTL_VADER_HAVE_XPMEM
}
#endif
Expand Down
19 changes: 19 additions & 0 deletions opal/mca/btl/vader/help-btl-vader.txt
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,25 @@ WARNING: Linux kernel CMA support was requested via the
btl_vader_single_copy_mechanism MCA variable, but CMA support is
not available due to restrictive ptrace settings.

The vader shared memory BTL will fall back on another single-copy
mechanism if one is available. This may result in lower performance.

Local host: %s
#
[cma-different-user-namespace-error]
ERROR: Linux kernel CMA support was requested via the
btl_vader_single_copy_mechanism MCA variable, but CMA support is
not available because the communicating processes are running in
different user namespaces.

Your MPI job will abort now. Please select another value for
btl_vader_single_copy_mechanism.

Local host: %s
#
[cma-different-user-namespace-warning]
WARNING: CMA, the default btl_vader_single_copy_mechanism, is not
available because the communicating processes are running in
different user namespaces.

The vader shared memory BTL will fall back on another single-copy
mechanism if one is available. This may result in lower performance.

Expand Down

0 comments on commit 8038fac

Please sign in to comment.