Skip to content

Commit

Permalink
Do not use CMA in user namespaces
Browse files Browse the repository at this point in the history
Trying to run processes via mpirun in Podman containers has shown
that the CMA btl_vader_single_copy_mechanism does not work when user
namespaces are involved.

Creating containers with Podman requires at least user namespaces to be
able to do unprivileged mounts in a container.

Even if running the container with user namespace user ID mappings which
result in the same user ID on the inside and outside of all involved
containers, the check in the kernel to allow ptrace (and thus
process_vm_{read,write}v()), fails if the same IDs are not in the same
user namespace.

One workaround is to specify '--mca btl_vader_single_copy_mechanism none'
and this commit adds code to automatically skip CMA if user namespaces
are detected and fall back to MCA_BTL_VADER_EMUL.

Signed-off-by: Adrian Reber <areber@redhat.com>
  • Loading branch information
adrianreber committed Jul 30, 2019
1 parent 5bd90ee commit 7686c6a
Show file tree
Hide file tree
Showing 4 changed files with 124 additions and 5 deletions.
9 changes: 8 additions & 1 deletion opal/mca/btl/vader/btl_vader.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,12 @@ union vader_modex_t {
void *segment_base;
} xpmem;
#endif
opal_shmem_ds_t seg_ds;
struct vader_modex_other_t {
int seg_ds_size;
uint64_t user_ns_id;
/* seg_ds needs to be the last element */
opal_shmem_ds_t seg_ds;
} other;
};

/**
Expand Down Expand Up @@ -270,6 +275,8 @@ int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
#endif

uint64_t mca_btl_vader_parse_proc_ns_user(void);

int mca_btl_vader_get_sc_emu (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
Expand Down
54 changes: 52 additions & 2 deletions opal/mca/btl/vader/btl_vader_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,48 @@ static int mca_btl_vader_component_close(void)
return OPAL_SUCCESS;
}

/*
 * mca_btl_vader_parse_proc_ns_user() tries to parse the user namespace ID
 * of the current process.
 *
 * The ID is taken from the target of the '/proc/self/ns/user' symbolic
 * link, which is expected to look like 'user:[<inode-number>]'.
 *
 * Returns the ID of the user namespace. In the case of an error '0' is
 * returned. A failing readlink() or allocation is reported only through
 * the '0' return value; a malformed link target is additionally logged.
 */
uint64_t mca_btl_vader_parse_proc_ns_user(void)
{
    uint64_t user_ns_id = 0;
    pid_t pid = getpid();
    ssize_t len;
    char *link;
    char *tmp;

    link = malloc(PATH_MAX);
    if (NULL == link) {
        return 0;
    }

    /*
     * readlink() does not append a terminating NUL; 'len' is the number
     * of bytes placed in the buffer.
     */
    len = readlink("/proc/self/ns/user", link, PATH_MAX);
    if (-1 == len) {
        /* link must still be released on this path */
        goto out;
    }

    /*
     * Result in link should look like 'user:[<inode-number>]', so at least
     * 8 characters long: 'user:[?]'.
     */
    if (8 > len) {
        opal_output(0, "Error reading user namespace ID of process %d\n", pid);
        goto out;
    }

    /* remove trailing ']' (this also NUL-terminates the buffer) */
    link[len - 1] = '\0';
    tmp = strchr(link, '[');
    if (NULL == tmp) {
        opal_output(0, "Error reading user namespace ID of process %d\n", pid);
        goto out;
    }

    /* the digits following '[' are the namespace inode number */
    user_ns_id = strtoul(tmp + 1, NULL, 10);

out:
    free(link);
    return user_ns_id;
}
static int mca_btl_base_vader_modex_send (void)
{
union vader_modex_t modex;
Expand All @@ -363,8 +405,16 @@ static int mca_btl_base_vader_modex_send (void)
modex_size = sizeof (modex.xpmem);
} else {
#endif
modex_size = opal_shmem_sizeof_shmem_ds (&mca_btl_vader_component.seg_ds);
memmove (&modex.seg_ds, &mca_btl_vader_component.seg_ds, modex_size);
modex.other.seg_ds_size = opal_shmem_sizeof_shmem_ds (&mca_btl_vader_component.seg_ds);
memmove (&modex.other.seg_ds, &mca_btl_vader_component.seg_ds, modex.other.seg_ds_size);
modex.other.user_ns_id = mca_btl_vader_parse_proc_ns_user();
/*
* If modex.other.user_ns_id is '0' something did not work out
* during user namespace detection. Assuming there are no
* namespaces available it will return '0' for all processes and
* the check later will see '0' everywhere and not disable CMA.
*/
modex_size = sizeof (modex.other);

#if OPAL_BTL_VADER_HAVE_XPMEM
}
Expand Down
47 changes: 45 additions & 2 deletions opal/mca/btl/vader/btl_vader_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
*/

#include "opal_config.h"
#include "opal/util/show_help.h"

#include "btl_vader.h"
#include "btl_vader_endpoint.h"
Expand Down Expand Up @@ -173,6 +174,7 @@ static int vader_btl_first_time_init(mca_btl_vader_t *vader_btl, int n)
static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_proc_t *proc, int remote_rank) {
mca_btl_vader_component_t *component = &mca_btl_vader_component;
union vader_modex_t *modex;
uint64_t my_user_ns_id;
size_t msg_size;
int rc;

Expand All @@ -197,17 +199,58 @@ static int init_vader_endpoint (struct mca_btl_base_endpoint_t *ep, struct opal_
} else {
#endif
/* store a copy of the segment information for detach */
ep->segment_data.other.seg_ds = malloc (msg_size);
ep->segment_data.other.seg_ds = malloc (modex->other.seg_ds_size);
if (NULL == ep->segment_data.other.seg_ds) {
return OPAL_ERR_OUT_OF_RESOURCE;
}

memcpy (ep->segment_data.other.seg_ds, &modex->seg_ds, msg_size);
memcpy (ep->segment_data.other.seg_ds, &modex->other.seg_ds, modex->other.seg_ds_size);

ep->segment_base = opal_shmem_segment_attach (ep->segment_data.other.seg_ds);
if (NULL == ep->segment_base) {
return OPAL_ERROR;
}

if (MCA_BTL_VADER_CMA == mca_btl_vader_component.single_copy_mechanism) {
my_user_ns_id = mca_btl_vader_parse_proc_ns_user();
if (my_user_ns_id != modex->other.user_ns_id) {
mca_base_var_source_t source;
int vari;
rc = mca_base_var_find_by_name("btl_vader_single_copy_mechanism", &vari);
if (OPAL_ERROR == rc) {
return OPAL_ERROR;
}
rc = mca_base_var_get_value(vari, NULL, &source, NULL);
if (OPAL_ERROR == rc) {
return OPAL_ERROR;
}
/*
* CMA is not possible as different user namespaces are in use.
* Currently the kernel does not allow process_vm_{read,write}v()
* for processes running in different user namespaces even if
* all involved user IDs are mapped to the same user ID.
*
* Fallback to MCA_BTL_VADER_EMUL.
*/
if (MCA_BASE_VAR_SOURCE_DEFAULT != source) {
/* If CMA has been explicitly selected we want to error out */
opal_show_help("help-btl-vader.txt", "cma-different-user-namespace-error",
true, opal_process_info.nodename);
return OPAL_ERROR;
}
/*
* If CMA has been selected because it is the default or
* some fallback, this falls back even further.
*/
opal_show_help("help-btl-vader.txt", "cma-different-user-namespace-warning",
true, opal_process_info.nodename);
mca_btl_vader_component.single_copy_mechanism = MCA_BTL_VADER_EMUL;
mca_btl_vader.super.btl_get = mca_btl_vader_get_sc_emu;
mca_btl_vader.super.btl_put = mca_btl_vader_put_sc_emu;
mca_btl_vader.super.btl_put_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t);
mca_btl_vader.super.btl_get_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t);
}
}
#if OPAL_BTL_VADER_HAVE_XPMEM
}
#endif
Expand Down
19 changes: 19 additions & 0 deletions opal/mca/btl/vader/help-btl-vader.txt
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,25 @@ WARNING: Linux kernel CMA support was requested via the
btl_vader_single_copy_mechanism MCA variable, but CMA support is
not available due to restrictive ptrace settings.

The vader shared memory BTL will fall back on another single-copy
mechanism if one is available. This may result in lower performance.

Local host: %s
#
[cma-different-user-namespace-error]
ERROR: Linux kernel CMA support was requested via the
btl_vader_single_copy_mechanism MCA variable, but CMA support is
not available due to different user namespaces.

Your MPI job will abort now. Please select another value for
btl_vader_single_copy_mechanism.

Local host: %s
#
[cma-different-user-namespace-warning]
WARNING: The default btl_vader_single_copy_mechanism CMA is
not available due to different user namespaces.

The vader shared memory BTL will fall back on another single-copy
mechanism if one is available. This may result in lower performance.

Expand Down

0 comments on commit 7686c6a

Please sign in to comment.