Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

oshmem/shmem: Allocate and exchange base segment address beforehand #12889

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
147 changes: 142 additions & 5 deletions oshmem/mca/memheap/base/memheap_base_select.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
#include "oshmem/mca/sshmem/base/base.h"
#include "ompi/util/timings.h"

#include <sys/mman.h>

mca_memheap_base_config_t mca_memheap_base_config = {
.device_nic_mem_seg_size = 0
};
Expand Down Expand Up @@ -106,6 +108,136 @@ static size_t _memheap_size(void)
return (size_t) memheap_align(oshmem_shmem_info_env.symmetric_heap_size);
}

static void *memheap_mmap_get(void *hint, size_t size)
{
void *addr;

addr = mmap(hint, size,
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (addr == MAP_FAILED) {
return NULL;
}

return addr;
}

static int memheap_exchange_base_address(size_t size, void **address)
{
int nprocs = oshmem_num_procs();
int need_sync = (*address == NULL);
void *base = NULL;
void *ptr = NULL;
int rc, i;
void **bases;

bases = calloc(nprocs, sizeof(*bases));
if (NULL == bases) {
return OSHMEM_ERROR;
}

if (oshmem_my_proc_id() == 0) {
ptr = memheap_mmap_get(NULL, size);
base = ptr;
}

rc = oshmem_shmem_bcast(&base, sizeof(base), 0);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@brminich, tried the patch below where they all do the mmap(). the mmap() returned address is randomized like below, so we need some form of synchronization of the base adddress.

memheap_exchange_base_address() #1: exchange base address: base 0x7fa7d9dff000: ok
memheap_exchange_base_address() #3: exchange base address: base 0x7fdc5a15b000: ok
memheap_exchange_base_address() #2: exchange base address: base 0x7fe8aa56a000: ok
memheap_exchange_base_address() #0: exchange base address: base 0x7f3d1736b000: ok
diff --git a/oshmem/mca/memheap/base/memheap_base_select.c b/oshmem/mca/memheap/base/memheap_base_select.c
index 0ec74de6aa..0b0cfe4bee 100644
--- a/oshmem/mca/memheap/base/memheap_base_select.c
+++ b/oshmem/mca/memheap/base/memheap_base_select.c
@@ -134,21 +134,8 @@ static int memheap_exchange_base_address(size_t size, void **address)
         return OSHMEM_ERROR;
     }

-    if (oshmem_my_proc_id() == 0) {
-        ptr = memheap_mmap_get(NULL, size);
-        base = ptr;
-    }
-
-    rc = oshmem_shmem_bcast(&base, sizeof(base), 0);
-    if (OSHMEM_SUCCESS != rc) {
-        MEMHEAP_ERROR("Failed to exchange allocated vma for base segment "
-                      "(error %d)", rc);
-        goto out;
-    }
-
-    if (oshmem_my_proc_id() != 0) {
-        ptr = memheap_mmap_get(base, size);
-    }
+    ptr = memheap_mmap_get(NULL, size);
+    base = ptr;

     MEMHEAP_VERBOSE(100, "#%d: exchange base address: base %p: %s",
                     oshmem_my_proc_id(), base,

if (OSHMEM_SUCCESS != rc) {
MEMHEAP_ERROR("Failed to exchange allocated vma for base segment "
"(error %d)", rc);
goto out;
}

if (oshmem_my_proc_id() != 0) {
ptr = memheap_mmap_get(base, size);
}

MEMHEAP_VERBOSE(100, "#%d: exchange base address: base %p: %s",
oshmem_my_proc_id(), base,
(base == ptr)? "ok" : "unavailable");

*address = base;
if (need_sync) {
/* They all succeed or fail to allow fallback */
rc = oshmem_shmem_allgather(&ptr, bases, sizeof(ptr));
if (OSHMEM_SUCCESS != rc) {
MEMHEAP_ERROR("Failed to exchange selected vma for base segment "
"(error %d)", rc);
goto out;
}

for (i = 0; i < nprocs; i++) {
if ((NULL == bases[i]) || (bases[i] != base)) {
*address = NULL;
break;
}
}
} else if (ptr != base) {
/* Any failure terminates the rank and others start teardown */
rc = OSHMEM_ERROR;
}
Comment on lines +174 to +177
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe use this as default flow (i mean setting mca_sshmem_base_start_address = NULL by default)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed


out:
if (((OSHMEM_SUCCESS != rc) || (*address == NULL)) && (ptr != NULL)) {
(void)munmap(ptr, size);
}

free(bases);
return rc;
}


/*
* The returned mca_sshmem_base_start_address value is reserved by using
* mmap() for the expected size.
*/
static int memheap_base_segment_setup(size_t size)
{
int rc;

if ((mca_sshmem_base_start_address == (void *)UINTPTR_MAX) ||
(mca_sshmem_base_start_address == NULL)) {
if (UINTPTR_MAX == 0xFFFFFFFF) {
/**
* if 32 bit we set sshmem_base_start_adress to 0
* to let OS allocate segment automatically
*/
mca_sshmem_base_start_address = NULL;
return OSHMEM_SUCCESS;
}

rc = memheap_exchange_base_address(size, &mca_sshmem_base_start_address);
if (OSHMEM_SUCCESS != rc) {
MEMHEAP_ERROR("Failed to setup base segment address (error %d)", rc);
return rc;
}

if (NULL != mca_sshmem_base_start_address) {
goto done; /* Region is reserved */
}

#if defined(__aarch64__)
mca_sshmem_base_start_address = (void*)0xAB0000000000;
#else
mca_sshmem_base_start_address = (void*)0xFF000000;
#endif
}

if (mca_sshmem_base_start_address != memheap_mmap_get(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is based on mmap() behavior where it always creates vma at the hint position if possible. If this not always true (kernel vesions..), this could regress existing behavior and even fail to honor command line parameter.

Shall we remove that confirmation check and proceed regardless? Or maybe only ignore that check when address was passed from command line?

mca_sshmem_base_start_address, size)) {
MEMHEAP_ERROR("Failed to create segment address %p/%zu",
mca_sshmem_base_start_address, size);
return OSHMEM_ERROR;
}

done:
if (oshmem_my_proc_id() == 0) {
MEMHEAP_VERBOSE(10, "Using symmetric segment address %p/%zu",
mca_sshmem_base_start_address, size);
}

return OSHMEM_SUCCESS;
}

static memheap_context_t* _memheap_create(void)
{
int rc = OSHMEM_SUCCESS;
Expand All @@ -123,13 +255,18 @@ static memheap_context_t* _memheap_create(void)

OPAL_TIMING_ENV_NEXT(timing, "_memheap_size()");

/* Inititialize symmetric area */
if (OSHMEM_SUCCESS == rc) {
rc = mca_memheap_base_alloc_init(&mca_memheap_base_map,
user_size + MEMHEAP_BASE_PRIVATE_SIZE, 0,
"regular_mem");
/* Locate and reserve symmetric area */
rc = memheap_base_segment_setup(user_size + MEMHEAP_BASE_PRIVATE_SIZE);
if (OSHMEM_SUCCESS != rc) {
MEMHEAP_ERROR("Failed to negotiate base segment addres");
return NULL;
}

/* Initialize symmetric area */
rc = mca_memheap_base_alloc_init(&mca_memheap_base_map,
user_size + MEMHEAP_BASE_PRIVATE_SIZE, 0,
"regular_mem");

OPAL_TIMING_ENV_NEXT(timing, "mca_memheap_base_alloc_init()");

/* Initialize atomic symmetric area */
Expand Down
44 changes: 32 additions & 12 deletions oshmem/mca/memheap/base/memheap_base_static.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,17 @@
#include <pthread.h>

static int _check_perms(const char *perm);
static int _check_non_static_segment(const map_segment_t *mem_segs,
int n_segment,
const void *start, const void *end);
static int _check_address(void *start, void **end);
static int _check_pathname(uint64_t inode, const char *pathname);

int mca_memheap_base_static_init(mca_memheap_map_t *map)
{
/* read and parse segments from /proc/self/maps */
int ret = OSHMEM_SUCCESS;
int n_segments = map->n_segments;
uint64_t total_mem = 0;
void* start;
void* end;
Expand All @@ -54,14 +58,6 @@ int mca_memheap_base_static_init(mca_memheap_map_t *map)
return OSHMEM_ERROR;
}

#ifdef __linux__
extern unsigned _end;
if (mca_sshmem_base_start_address < (uintptr_t)&_end) {
MEMHEAP_WARN("sshmem base start address is inside data region"
" (%p < %p)", mca_sshmem_base_start_address, &_end);
}
#endif

while (NULL != fgets(line, sizeof(line), fp)) {
if (3 > sscanf(line,
"%llx-%llx %s %llx %s %llx %s",
Expand All @@ -77,6 +73,12 @@ int mca_memheap_base_static_init(mca_memheap_map_t *map)
goto out;
}

if (OSHMEM_ERROR == _check_non_static_segment(
map->mem_segs, n_segments,
start, end)) {
continue;
}

if (OSHMEM_ERROR == _check_address(start, &end))
continue;

Expand Down Expand Up @@ -138,6 +140,26 @@ static int _check_perms(const char *perms)
return OSHMEM_ERROR;
}

static int _check_non_static_segment(const map_segment_t *mem_segs,
int n_segment,
const void *start, const void *end)
{
int i;

for (i = 0; i < n_segment; i++) {
if ((start <= mem_segs[i].super.va_base) &&
(mem_segs[i].super.va_base < end)) {
MEMHEAP_VERBOSE(100,
"non static segment: %p-%p already exists as %p-%p",
start, end, mem_segs[i].super.va_base,
mem_segs[i].super.va_end);
return OSHMEM_ERROR;
}
}

return OSHMEM_SUCCESS;
}

brminich marked this conversation as resolved.
Show resolved Hide resolved
static int _check_address(void *start, void **end)
{
/* FIXME Linux specific code */
Expand All @@ -148,11 +170,9 @@ static int _check_address(void *start, void **end)
/**
* SGI shmem only supports globals&static in main program.
* It does not support them in shared objects or in dlopen()
* (Clarified on PGAS 2011 tutorial)
* (Clarified on PGAS 2011 tutorial).
*
* So ignored any maps that start higher then process _end
* FIXME: make sure we do not register symmetric heap twice
* if we decide to allow shared objects
* So ignored any maps that start higher then process _end.
*/
if ((uintptr_t)start > data_end) {
MEMHEAP_VERBOSE(100,
Expand Down
12 changes: 1 addition & 11 deletions oshmem/mca/sshmem/base/sshmem_base_open.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,7 @@
* globals
*/

/**
* if 32 bit we set sshmem_base_start_address to 0
* to let OS allocate segment automatically
*/
#if UINTPTR_MAX == 0xFFFFFFFF
void *mca_sshmem_base_start_address = (void*)0;
#elif defined(__aarch64__)
void* mca_sshmem_base_start_address = (void*)0xAB0000000000;
#else
void* mca_sshmem_base_start_address = (void*)0xFF000000;
#endif
void *mca_sshmem_base_start_address = UINTPTR_MAX;

char * mca_sshmem_base_backing_file_dir = NULL;

Expand Down
1 change: 1 addition & 0 deletions oshmem/mca/sshmem/sysv/sshmem_sysv_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ segment_create(map_segment_t *ds_buf,
}

/* Attach to the segment */
(void)munmap(mca_sshmem_base_start_address, size);
brminich marked this conversation as resolved.
Show resolved Hide resolved
addr = shmat(shmid, (void *) mca_sshmem_base_start_address, 0);
if (addr == (void *) -1L) {
opal_show_help("help-oshmem-sshmem.txt",
Expand Down
9 changes: 9 additions & 0 deletions oshmem/runtime/oshmem_shmem_exchange.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@
#include "oshmem/runtime/runtime.h"
#include "oshmem/runtime/params.h"

int oshmem_shmem_bcast(void *buf, int elem_size, int root)
{
int rc;

rc = PMPI_Bcast(buf, elem_size, MPI_BYTE, root, oshmem_comm_world);

return rc;
}

int oshmem_shmem_allgather(void *send_buf, void *rcv_buf, int elem_size)
{
int rc;
Expand Down
5 changes: 5 additions & 0 deletions oshmem/runtime/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,11 @@ int oshmem_shmem_finalize(void);
*/
OSHMEM_DECLSPEC int oshmem_shmem_abort(int errcode);

/**
* Broadcast between all PEs
*/
OSHMEM_DECLSPEC int oshmem_shmem_bcast(void *buf, int elem_size, int root);

/**
* Allgather between all PEs
*/
Expand Down
Loading