Skip to content

Commit

Permalink
Do not use CMA in user namespaces
Browse files Browse the repository at this point in the history
Trying to run processes via mpirun in Podman containers has shown
that the CMA btl_vader_single_copy_mechanism does not work when user
namespaces are involved.

Creating containers with Podman requires at least user namespaces to be
able to do unprivileged mounts in a container.

Even if running the container with user namespace user ID mappings which
result in the same user ID on the inside and outside of all involved
containers, the check in the kernel to allow ptrace (and thus
process_vm_{read,write}v()), fails if the same IDs are not in the same
user namespace.

One workaround is to specify '--mca btl_vader_single_copy_mechanism none'
and this commit adds code to automatically skip CMA if user namespaces
are detected.

Preferred implementation would have been to detect if the other local
processes are running in different user namespaces, but it was not
clear how to get the PIDs of the other involved processes in
mca_btl_vader_check_single_copy(). This is even more complicated if some
processes would be running in the same user namespace, but not all of
them. If one different user namespace is detected, CMA should be
disabled for all involved processes. So if one local process detects
that CMA is not working it would need to communicate this information to
all local processes.

This implementation now checks during the first access of
mca_btl_vader_{put,get}_cma() if the destination process is running in
another user namespace and switches to MCA_BTL_VADER_EMUL if this is
true.

So if the first access to process_vm_{read,write}v() fails, all further
accesses automatically no longer try to use CMA.

Signed-off-by: Adrian Reber <areber@redhat.com>
  • Loading branch information
adrianreber committed Jul 25, 2019
1 parent 5bd90ee commit 8c9ad46
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 0 deletions.
2 changes: 2 additions & 0 deletions opal/mca/btl/vader/btl_vader.h
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,8 @@ int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata);
#endif

/*
 * Check whether the process identified by 'pid' runs in a different user
 * namespace than the calling process.
 *
 * Returns -1 on failure, 0 if both processes are in the same user
 * namespace, and 1 if they are in different user namespaces.
 */
int mca_btl_vader_check_for_user_ns(pid_t pid);

int mca_btl_vader_get_sc_emu (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
Expand Down
127 changes: 127 additions & 0 deletions opal/mca/btl/vader/btl_vader_get.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
*/

#include "opal_config.h"
#include "opal/util/printf.h"
#include "opal/util/show_help.h"

#include "btl_vader.h"
#include "btl_vader_frag.h"
Expand Down Expand Up @@ -65,6 +67,108 @@ int mca_btl_vader_get_xpmem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
#endif

#if OPAL_BTL_VADER_HAVE_CMA
/*
 * mca_btl_vader_parse_proc_ns_user() reads the symbolic link
 * /proc/<pid>/ns/user and extracts the user namespace ID from its target,
 * which is expected to look like 'user:[<inode-number>]'.
 *
 * pid    process whose user namespace ID should be looked up
 * error  set to true on failure; it is never reset to false here, so the
 *        caller has to initialize it
 *
 * Returns the ID of the user namespace. If the link cannot be read because
 * of EACCES, 0 is returned and 'error' is left untouched: the process is
 * then assumed to be running in another user namespace. On any other
 * failure 0 is returned and 'error' is set to true.
 */
static uint64_t mca_btl_vader_parse_proc_ns_user(pid_t pid, bool *error)
{
    /* On the stack, so no exit path can leak the buffer. */
    char link[PATH_MAX];
    char *path = NULL;
    char *bracket;
    char *digits;
    char *end = NULL;
    unsigned long long id;
    ssize_t len;
    int saved_errno;

    if (0 > opal_asprintf(&path, "/proc/%d/ns/user", pid)) {
        goto fail;
    }

    len = readlink(path, link, sizeof(link));
    /* Capture errno before free() gets a chance to overwrite it. */
    saved_errno = errno;
    free(path);

    if ((-1 == len) && (EACCES == saved_errno)) {
        /*
         * At this point we are pretty sure that the PID of the process we
         * are looking at exists. If we cannot read the user namespace ID,
         * the process is probably running in another user namespace.
         */
        return 0;
    }

    /*
     * The result in link should look like 'user:[<inode-number>]', so it
     * is at least 8 characters long ('user:[?]') and ends with ']'.
     * This also catches every other readlink() failure (len == -1).
     */
    if ((8 > len) || (']' != link[len - 1])) {
        goto fail;
    }

    /* readlink() does not NUL-terminate; overwrite the trailing ']'. */
    link[len - 1] = '\0';
    bracket = strchr(link, '[');
    if (NULL == bracket) {
        goto fail;
    }

    /*
     * strtoull() reports overflow only through errno, so clear it first.
     * The end pointer check rejects empty input and trailing garbage.
     */
    digits = bracket + 1;
    errno = 0;
    id = strtoull(digits, &end, 10);
    if ((end == digits) || ('\0' != *end) || (ERANGE == errno)) {
        goto fail;
    }

    return (uint64_t) id;

fail:
    opal_output(0, "Error reading user namespace ID of process %d\n", pid);
    *error = true;
    return 0;
}
/*
 * mca_btl_vader_check_for_user_ns() compares the user namespace of the
 * calling process with the one of the process given by 'pid'. Currently
 * the kernel does not allow process_vm_{read,write}v() between processes
 * in different user namespaces, even if all involved user IDs are mapped
 * to the same user ID.
 *
 * Returns -1 if a namespace ID could not be determined, 0 if both
 * processes share a user namespace, and 1 if the namespaces differ.
 */
int mca_btl_vader_check_for_user_ns(pid_t pid)
{
    /* One flag shared by both lookups; the parser only ever sets it. */
    bool failed = false;
    uint64_t local_ns;
    uint64_t peer_ns;

    /* Namespace of the calling process. */
    local_ns = mca_btl_vader_parse_proc_ns_user(getpid(), &failed);
    if (failed && (0 == local_ns)) {
        return -1;
    }

    /* Namespace of the peer process. */
    peer_ns = mca_btl_vader_parse_proc_ns_user(pid, &failed);
    if (failed && (0 == peer_ns)) {
        return -1;
    }

    return (peer_ns == local_ns) ? 0 : 1;
}

int mca_btl_vader_get_cma (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
Expand All @@ -73,6 +177,29 @@ int mca_btl_vader_get_cma (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *
struct iovec src_iov = {.iov_base = (void *)(intptr_t) remote_address, .iov_len = size};
struct iovec dst_iov = {.iov_base = local_address, .iov_len = size};
ssize_t ret;
int rc;

/* First make sure, that the remote process is not running in a different user namespace. */
rc = mca_btl_vader_check_for_user_ns(endpoint->segment_data.other.seg_ds->seg_cpid);
if (-1 == rc) {
return OPAL_ERROR;
} else if (1 == rc) {
opal_show_help("help-btl-vader.txt", "cma-different-user-namespace",
true, opal_process_info.nodename);
/*
* The involved processes are running in different user namespaces.
* Disable CMA and use MCA_BTL_VADER_EMUL.
*/
mca_btl_vader_component.single_copy_mechanism = MCA_BTL_VADER_EMUL;
mca_btl_vader.super.btl_get = mca_btl_vader_get_sc_emu;
mca_btl_vader.super.btl_put = mca_btl_vader_put_sc_emu;
mca_btl_vader.super.btl_put_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t);
mca_btl_vader.super.btl_get_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t);

return mca_btl_vader_get_sc_emu(btl, endpoint, local_address, remote_address, local_handle,
remote_handle, size, flags, order, cbfunc, cbcontext, cbdata);
}


/*
* According to the man page :
Expand Down
23 changes: 23 additions & 0 deletions opal/mca/btl/vader/btl_vader_put.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
*/

#include "opal_config.h"
#include "opal/util/show_help.h"

#include "btl_vader.h"
#include "btl_vader_frag.h"
Expand Down Expand Up @@ -68,6 +69,28 @@ int mca_btl_vader_put_cma (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *
struct iovec src_iov = {.iov_base = local_address, .iov_len = size};
struct iovec dst_iov = {.iov_base = (void *)(intptr_t) remote_address, .iov_len = size};
ssize_t ret;
int rc;

/* First make sure, that the remote process is not running in a different user namespace. */
rc = mca_btl_vader_check_for_user_ns(endpoint->segment_data.other.seg_ds->seg_cpid);
if (-1 == rc) {
return OPAL_ERROR;
} else if (1 == rc) {
opal_show_help("help-btl-vader.txt", "cma-different-user-namespace",
true, opal_process_info.nodename);
/*
* The involved processes are running in different user namespaces.
* Disable CMA and use MCA_BTL_VADER_EMUL.
*/
mca_btl_vader_component.single_copy_mechanism = MCA_BTL_VADER_EMUL;
mca_btl_vader.super.btl_get = mca_btl_vader_get_sc_emu;
mca_btl_vader.super.btl_put = mca_btl_vader_put_sc_emu;
mca_btl_vader.super.btl_put_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t);
mca_btl_vader.super.btl_get_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t);

return mca_btl_vader_put_sc_emu(btl, endpoint, local_address, remote_address, local_handle,
remote_handle, size, flags, order, cbfunc, cbcontext, cbdata);
}

/* This should not be needed, see the rationale in mca_btl_vader_get_cma() */
do {
Expand Down
10 changes: 10 additions & 0 deletions opal/mca/btl/vader/help-btl-vader.txt
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,16 @@ WARNING: Linux kernel CMA support was requested via the
btl_vader_single_copy_mechanism MCA variable, but CMA support is
not available due to restrictive ptrace settings.

The vader shared memory BTL will fall back on another single-copy
mechanism if one is available. This may result in lower performance.

Local host: %s
#
[cma-different-user-namespace]
WARNING: Linux kernel CMA support was requested via the
btl_vader_single_copy_mechanism MCA variable, but CMA support is
not available due to different user namespaces.

The vader shared memory BTL will fall back on another single-copy
mechanism if one is available. This may result in lower performance.

Expand Down

0 comments on commit 8c9ad46

Please sign in to comment.