From dbfd9a44baca1a6ec08556e40ae5a556f39c8419 Mon Sep 17 00:00:00 2001 From: Sean Hefty <sean.hefty@intel.com> Date: Fri, 26 Jul 2019 07:26:13 -0700 Subject: [PATCH] core/util: Adding MR cache malloc hooking mechanism Integrate hooking of memory-allocation-related calls into the MR cache. This allows us to support older Linux kernels that do not support userfaultfd. The core of the code is adapted from OpenMPI. Signed-off-by: Nikita Gusev <nikita.gusev@intel.com> Signed-off-by: Sean Hefty <sean.hefty@intel.com> --- Makefile.am | 1 + include/ofi_mr.h | 19 +- prov/util/src/util_mem_hooks.c | 408 +++++++++++++++++++++++++++++++ prov/util/src/util_mem_monitor.c | 33 ++- prov/verbs/src/verbs_domain.c | 2 +- 5 files changed, 458 insertions(+), 5 deletions(-) create mode 100644 prov/util/src/util_mem_hooks.c diff --git a/Makefile.am b/Makefile.am index a48dda167ec..fff7c2c01d2 100644 --- a/Makefile.am +++ b/Makefile.am @@ -66,6 +66,7 @@ common_srcs = \ prov/util/src/util_ns.c \ prov/util/src/util_shm.c \ prov/util/src/util_mem_monitor.c\ + prov/util/src/util_mem_hooks.c \ prov/util/src/util_mr_cache.c diff --git a/include/ofi_mr.h b/include/ofi_mr.h index 56cf75ac281..e4edca14eaa 100644 --- a/include/ofi_mr.h +++ b/include/ofi_mr.h @@ -46,7 +46,6 @@ #include <ofi_list.h> #include <ofi_tree.h> - struct ofi_mr_info { struct iovec iov; }; @@ -125,6 +124,8 @@ int ofi_monitor_subscribe(struct ofi_mem_monitor *monitor, void ofi_monitor_unsubscribe(struct ofi_mem_monitor *monitor, const void *addr, size_t len); +extern struct ofi_mem_monitor *default_monitor; + /* * Userfault fd memory monitor */ @@ -139,6 +140,19 @@ void ofi_uffd_cleanup(void); extern struct ofi_mem_monitor *uffd_monitor; +/* + * Memory intercept call memory monitor + */ +struct ofi_memhooks { + struct ofi_mem_monitor monitor; + struct dlist_entry intercept_list; +}; + +int ofi_memhooks_init(void); +void ofi_memhooks_cleanup(void); + +extern struct ofi_mem_monitor *memhooks_monitor; + /* * Used to store registered 
memory regions into a lookup map. This
@@ -184,7 +198,7 @@ int ofi_mr_close(struct fid *fid);
 int ofi_mr_regattr(struct fid *fid, const struct fi_mr_attr *attr,
 		   uint64_t flags, struct fid_mr **mr_fid);
 int ofi_mr_regv(struct fid *fid, const struct iovec *iov,
-	        size_t count, uint64_t access, uint64_t offset,
+		size_t count, uint64_t access, uint64_t offset,
 		uint64_t requested_key, uint64_t flags, struct fid_mr **mr_fid,
 		void *context);
 int ofi_mr_reg(struct fid *fid, const void *buf, size_t len,
@@ -201,6 +215,7 @@ struct ofi_mr_cache_params {
 	size_t				max_cnt;
 	size_t				max_size;
 	int				merge_regions;
+	char				*monitor;
 };
 
 extern struct ofi_mr_cache_params cache_params;
diff --git a/prov/util/src/util_mem_hooks.c b/prov/util/src/util_mem_hooks.c
new file mode 100644
index 00000000000..05598a259a0
--- /dev/null
+++ b/prov/util/src/util_mem_hooks.c
@@ -0,0 +1,489 @@
+/*
+ * Copyright (c) 2016 Los Alamos National Security, LLC. All rights reserved.
+ * Copyright (c) 2019 Intel Corporation, Inc. All rights reserved.
+ *
+ * License text from Open-MPI (www.open-mpi.org/community/license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer listed
+ * in this license in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * - Neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * The copyright holders provide no reassurances that the source code
+ * provided does not infringe any patent, copyright, or any other
+ * intellectual property rights of third parties. The copyright holders
+ * disclaim any liability to any recipient for claims brought against
+ * recipient by any third party for infringement of that parties
+ * intellectual property rights.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*#if HAVE_PATCH_UNMAP*/
+#include <elf.h>
+#include <sys/auxv.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <link.h>
+#include <assert.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ofi_mr.h>
+
+/*
+ * Describes one intercepted symbol.  Every loaded object that imports the
+ * symbol has its relocation slot redirected to our_func; the original slot
+ * contents are saved on dl_intercept_list so they can be restored later.
+ */
+struct ofi_intercept {
+	struct dlist_entry		entry;
+	const char			*symbol;
+	void				*our_func;
+	struct dlist_entry		dl_intercept_list;
+};
+
+/* Saved original contents of one patched relocation slot. */
+struct ofi_dl_intercept {
+	struct dlist_entry		entry;
+	void				**dl_func_addr;
+	void				*dl_func;
+};
+
+/* Indices into intercepts[]. */
+enum {
+	OFI_INTERCEPT_DLOPEN,
+	OFI_INTERCEPT_MMAP,
+	OFI_INTERCEPT_MUNMAP
+};
+
+static void *ofi_intercept_dlopen(const char *filename, int flag);
+static void *ofi_intercept_mmap(void *start, size_t length,
+				int prot, int flags, int fd, off_t offset);
+static int ofi_intercept_munmap(void *start, size_t length);
+
+static struct ofi_intercept intercepts[] = {
+	[OFI_INTERCEPT_DLOPEN] = { .symbol = "dlopen",
+				.our_func = ofi_intercept_dlopen},
+	[OFI_INTERCEPT_MMAP] = { .symbol = "mmap",
+				.our_func = ofi_intercept_mmap},
+	[OFI_INTERCEPT_MUNMAP] = { .symbol = "munmap",
+				.our_func = ofi_intercept_munmap}
+};
+
+/* Real implementations, resolved with dlsym() when the hooks are installed. */
+struct ofi_mem_calls {
+	void *(*dlopen) (const char *, int);
+	void *(*mmap)(void *, size_t, int, int, int, off_t);
+	int (*munmap)(void *, size_t);
+};
+
+static struct ofi_mem_calls real_calls;
+
+struct ofi_memhooks memhooks;
+struct ofi_mem_monitor *memhooks_monitor = &memhooks.monitor;
+
+/* Return the PT_DYNAMIC program header of an object, or NULL if none. */
+static const ElfW(Phdr) *
+ofi_get_phdr_dynamic(const ElfW(Phdr) *phdr, uint16_t phnum, int phent)
+{
+	for (uint16_t i = 0 ; i < phnum ; ++i,
+	     phdr = (ElfW(Phdr)*)((intptr_t) phdr + phent)) {
+		if (phdr->p_type == PT_DYNAMIC)
+			return phdr;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the value recorded for dynamic tag 'type' in the object's dynamic
+ * section, or NULL if the tag is not present.
+ */
+static void *ofi_get_dynentry(ElfW(Addr) base, const ElfW(Phdr) *pdyn,
+			      ElfW(Sxword) type)
+{
+	for (ElfW(Dyn) *dyn = (ElfW(Dyn)*) (base + pdyn->p_vaddr);
+	     dyn->d_tag; ++dyn) {
+		if (dyn->d_tag == type)
+			return (void *) (uintptr_t) dyn->d_un.d_val;
+	}
+
+	return NULL;
+}
+
+#if SIZE_MAX > UINT_MAX
+#define OFI_ELF_R_SYM ELF64_R_SYM
+#else
+#define OFI_ELF_R_SYM ELF32_R_SYM
+#endif
+
+/*
+ * Find the jump-relocation slot an object uses for 'symbol'.  Returns the
+ * slot address, or NULL if the object has no dynamic section, lacks the
+ * needed tables, or does not import the symbol.
+ */
+static void *ofi_dl_func_addr(ElfW(Addr) base, const ElfW(Phdr) *phdr,
+			      uint16_t phnum, int phent, const char *symbol)
+{
+	const ElfW(Phdr) *dphdr;
+	void *jmprel, *strtab;
+	char *elf_sym;
+	uint32_t relsymidx;
+	ElfW(Sym) *symtab;
+	size_t pltrelsz;
+
+	/* Objects without a PT_DYNAMIC segment have nothing to patch. */
+	dphdr = ofi_get_phdr_dynamic(phdr, phnum, phent);
+	if (!dphdr)
+		return NULL;
+
+	jmprel = ofi_get_dynentry(base, dphdr, DT_JMPREL);
+	symtab = (ElfW(Sym) *) ofi_get_dynentry(base, dphdr, DT_SYMTAB);
+	strtab = ofi_get_dynentry(base, dphdr, DT_STRTAB);
+	pltrelsz = (uintptr_t) ofi_get_dynentry(base, dphdr, DT_PLTRELSZ);
+	if (!jmprel || !symtab || !strtab)
+		return NULL;
+
+	/* NOTE(review): assumes DT_PLTREL is DT_RELA -- confirm. */
+	for (ElfW(Rela) *reloc = jmprel;
+	     (uintptr_t) reloc < (uintptr_t) jmprel + pltrelsz; ++reloc) {
+		relsymidx = OFI_ELF_R_SYM(reloc->r_info);
+		elf_sym = (char *) strtab + symtab[relsymidx].st_name;
+		if (!strcmp(symbol, elf_sym))
+			return (void *) (base + reloc->r_offset);
+	}
+
+	return NULL;
+}
+
+/*
+ * Redirect one object's relocation slot for intercept->symbol to our
+ * replacement and remember the previous target so it can be restored.
+ * Objects that do not import the symbol are skipped.
+ */
+static int ofi_intercept_dl_calls(ElfW(Addr) base, const ElfW(Phdr) *phdr,
+				  const char *phname, uint16_t phnum, int phent,
+				  struct ofi_intercept *intercept)
+{
+	struct ofi_dl_intercept *dl_entry;
+	long page_size = ofi_get_page_size();
+	void **func_addr, *page;
+	int ret;
+
+	FI_DBG(&core_prov, FI_LOG_MR,
+	       "intercepting symbol %s from dl\n", intercept->symbol);
+	func_addr = ofi_dl_func_addr(base, phdr, phnum, phent, intercept->symbol);
+	if (!func_addr)
+		return FI_SUCCESS;
+
+	/* The slot may live on a read-only page; make it writable first. */
+	page = (void *) ((intptr_t) func_addr & ~(page_size - 1));
+	ret = mprotect(page, page_size, PROT_READ | PROT_WRITE);
+	if (ret < 0)
+		return -FI_ENOSYS;
+
+	if (*func_addr != intercept->our_func) {
+		dl_entry = malloc(sizeof(*dl_entry));
+		if (!dl_entry)
+			return -FI_ENOMEM;
+
+		dl_entry->dl_func_addr = func_addr;
+		dl_entry->dl_func = *func_addr;
+		*func_addr = intercept->our_func;
+		dlist_insert_tail(&dl_entry->entry, &intercept->dl_intercept_list);
+	}
+
+	return FI_SUCCESS;
+}
+
+/* dl_iterate_phdr() callback: patch one loaded object for 'data'. */
+static int ofi_intercept_phdr_handler(struct dl_phdr_info *info,
+				      size_t size, void *data)
+{
+	struct ofi_intercept *intercept = data;
+	int phent, ret;
+
+	phent = getauxval(AT_PHENT);
+	if (phent <= 0) {
+		FI_DBG(&core_prov, FI_LOG_MR, "failed to read phent size\n");
+		return -FI_EINVAL;
+	}
+
+	ret = ofi_intercept_dl_calls(info->dlpi_addr, info->dlpi_phdr,
+				     info->dlpi_name, info->dlpi_phnum,
+				     phent, intercept);
+	return ret;
+}
+
+/*
+ * dlopen() replacement: load the object with the real dlopen(), then
+ * re-apply every active intercept so the new object is patched too.
+ */
+static void *ofi_intercept_dlopen(const char *filename, int flag)
+{
+	struct ofi_intercept *intercept;
+	void *handle;
+
+	handle = real_calls.dlopen(filename, flag);
+	if (!handle)
+		return NULL;
+
+	fastlock_acquire(&memhooks_monitor->lock);
+	dlist_foreach_container(&memhooks.intercept_list, struct ofi_intercept,
+		intercept, entry) {
+		dl_iterate_phdr(ofi_intercept_phdr_handler, intercept);
+	}
+	fastlock_release(&memhooks_monitor->lock);
+	return handle;
+}
+
+/*
+ * Put back the original target of one object's relocation slot for
+ * intercept->symbol, if this file patched it.
+ */
+static int ofi_restore_dl_calls(ElfW(Addr) base, const ElfW(Phdr) *phdr,
+				const char *phname, uint16_t phnum, int phent,
+				struct ofi_intercept *intercept)
+{
+	struct ofi_dl_intercept *dl_entry;
+	long page_size = ofi_get_page_size();
+	void **func_addr, *page;
+	int ret;
+
+	FI_DBG(&core_prov, FI_LOG_MR,
+	       "releasing symbol %s from dl\n", intercept->symbol);
+	func_addr = ofi_dl_func_addr(base, phdr, phnum, phent, intercept->symbol);
+	if (!func_addr)
+		return FI_SUCCESS;
+
+	page = (void *) ((intptr_t) func_addr & ~(page_size - 1));
+	ret = mprotect(page, page_size, PROT_READ | PROT_WRITE);
+	if (ret < 0)
+		return -FI_ENOSYS;
+
+	dlist_foreach_container_reverse(&intercept->dl_intercept_list,
+		struct ofi_dl_intercept, dl_entry, entry) {
+
+		if (dl_entry->dl_func_addr != func_addr)
+			continue;
+
+		assert(*func_addr == intercept->our_func);
+		*func_addr = dl_entry->dl_func;
+		dlist_remove(&dl_entry->entry);
+		free(dl_entry);
+		FI_DBG(&core_prov, FI_LOG_MR,
+		       "dl symbol %s restored\n", intercept->symbol);
+		break;
+	}
+
+	return FI_SUCCESS;
+}
+
+/* dl_iterate_phdr() callback: unpatch one loaded object for 'data'. */
+static int ofi_restore_phdr_handler(struct dl_phdr_info *info,
+				    size_t size, void *data)
+{
+	struct ofi_intercept *intercept = data;
+	int phent, ret;
+
+	phent = getauxval(AT_PHENT);
+	if (phent <= 0) {
+		FI_DBG(&core_prov, FI_LOG_MR, "failed to read phent size\n");
+		return -FI_EINVAL;
+	}
+
+	ret = ofi_restore_dl_calls(info->dlpi_addr, info->dlpi_phdr,
+				   info->dlpi_name, info->dlpi_phnum,
+				   phent, intercept);
+	return ret;
+}
+
+/*
+ * Restore every patched slot of every tracked intercept.  The caller must
+ * hold memhooks_monitor->lock; it is not taken here so that a caller that
+ * already owns the monitor lock does not acquire it a second time.
+ */
+static void ofi_restore_intercepts(void)
+{
+	struct ofi_intercept *intercept;
+
+	dlist_foreach_container(&memhooks.intercept_list, struct ofi_intercept,
+		intercept, entry) {
+		dl_iterate_phdr(ofi_restore_phdr_handler, intercept);
+	}
+}
+
+/*
+ * Install the hook for one symbol and return the real implementation
+ * through real_func.  The caller must hold memhooks_monitor->lock, which
+ * also keeps a concurrent ofi_intercept_dlopen() from racing with us.
+ */
+static int ofi_intercept_symbol(struct ofi_intercept *intercept, void **real_func)
+{
+	FI_DBG(&core_prov, FI_LOG_MR,
+	       "intercepting symbol %s\n", intercept->symbol);
+
+	/*
+	 * Resolve the real function before patching any slot, so a hook can
+	 * never run while its real_calls pointer is still NULL.
+	 */
+	*real_func = dlsym(RTLD_DEFAULT, intercept->symbol);
+	if (*real_func == intercept->our_func) {
+		(void) dlerror();
+		*real_func = dlsym(RTLD_NEXT, intercept->symbol);
+	}
+
+	if (!*real_func) {
+		FI_DBG(&core_prov, FI_LOG_MR,
+		       "could not find symbol %s\n", intercept->symbol);
+		return -FI_ENOMEM;
+	}
+
+	/*
+	 * Track the intercept before patching so that a partial failure can
+	 * be rolled back by ofi_restore_intercepts().
+	 */
+	dlist_init(&intercept->dl_intercept_list);
+	dlist_insert_tail(&intercept->entry, &memhooks.intercept_list);
+
+	return dl_iterate_phdr(ofi_intercept_phdr_handler, intercept);
+}
+
+/* Forward an address range seen by a hook to the monitor's caches. */
+void ofi_intercept_handler(const void *addr, size_t len)
+{
+	fastlock_acquire(&memhooks_monitor->lock);
+	ofi_monitor_notify(memhooks_monitor, addr, len);
+	fastlock_release(&memhooks_monitor->lock);
+}
+
+/* mmap() replacement: notify the monitor, then call the real mmap(). */
+static void *ofi_intercept_mmap(void *start, size_t length,
+				int prot, int flags, int fd, off_t offset)
+{
+	FI_DBG(&core_prov, FI_LOG_MR,
+	       "intercepted mmap start %p len %zu\n", start, length);
+	ofi_intercept_handler(start, length);
+
+	return real_calls.mmap(start, length, prot, flags, fd, offset);
+}
+
+/* munmap() replacement: notify the monitor, then call the real munmap(). */
+static int ofi_intercept_munmap(void *start, size_t length)
+{
+	FI_DBG(&core_prov, FI_LOG_MR,
+	       "intercepted munmap start %p len %zu\n", start, length);
+	ofi_intercept_handler(start, length);
+
+	return real_calls.munmap(start, length);
+}
+
+/* Nothing to register per range; notifications come from the hooks. */
+static int ofi_memhooks_subscribe(struct ofi_mem_monitor *monitor,
+				  const void *addr, size_t len)
+{
+	/* no-op */
+	return FI_SUCCESS;
+}
+
+static void ofi_memhooks_unsubscribe(struct ofi_mem_monitor *monitor,
+				     const void *addr, size_t len)
+{
+	/* no-op */
+}
+
+/* Remove all hooks and return the monitor to its uninitialized state. */
+static void ofi_memhooks_uninstall(void)
+{
+	ofi_restore_intercepts();
+	memhooks_monitor->subscribe = NULL;
+	memhooks_monitor->unsubscribe = NULL;
+}
+
+/*
+ * Install the dlopen/mmap/munmap hooks.  Returns 0 on success or if the
+ * hooks are already installed, else a negative fabric error code; on
+ * failure any hooks installed so far are removed again.  Must be called
+ * with memhooks_monitor->lock held.
+ */
+int ofi_memhooks_init(void)
+{
+	int ret;
+
+	/* Already installed; ofi_memhooks_cleanup() clears this marker. */
+	if (memhooks_monitor->subscribe == ofi_memhooks_subscribe)
+		return 0;
+
+	memhooks_monitor->subscribe = ofi_memhooks_subscribe;
+	memhooks_monitor->unsubscribe = ofi_memhooks_unsubscribe;
+	dlist_init(&memhooks.intercept_list);
+
+	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_DLOPEN],
+				   (void **) &real_calls.dlopen);
+	if (ret) {
+		FI_WARN(&core_prov, FI_LOG_MR,
+			"intercept dlopen failed %d %s\n", ret, fi_strerror(ret));
+		goto err;
+	}
+
+	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MMAP],
+				   (void **) &real_calls.mmap);
+	if (ret) {
+		FI_WARN(&core_prov, FI_LOG_MR,
+			"intercept mmap failed %d %s\n", ret, fi_strerror(ret));
+		goto err;
+	}
+
+	ret = ofi_intercept_symbol(&intercepts[OFI_INTERCEPT_MUNMAP],
+				   (void **) &real_calls.munmap);
+	if (ret) {
+		FI_WARN(&core_prov, FI_LOG_MR,
+			"intercept munmap failed %d %s\n", ret, fi_strerror(ret));
+		goto err;
+	}
+
+	return 0;
+
+err:
+	ofi_memhooks_uninstall();
+	return ret;
+}
+
+/*
+ * Remove the hooks once no cache uses this monitor.  Callers may invoke
+ * this when the hooks were never installed or while caches are still
+ * attached; both cases are ignored.
+ */
+void ofi_memhooks_cleanup(void)
+{
+	if (memhooks_monitor->subscribe != ofi_memhooks_subscribe ||
+	    !dlist_empty(&memhooks_monitor->list))
+		return;
+
+	ofi_memhooks_uninstall();
+}
diff --git a/prov/util/src/util_mem_monitor.c b/prov/util/src/util_mem_monitor.c
index 5899d20079b..44e3277fe9d 100644
--- a/prov/util/src/util_mem_monitor.c
+++ b/prov/util/src/util_mem_monitor.c
@@ -38,6 +38,8 @@
 
 static struct ofi_uffd uffd;
 struct ofi_mem_monitor *uffd_monitor = &uffd.monitor;
 
+struct ofi_mem_monitor *default_monitor;
+
 /*
  *
Initialize all available memory monitors
@@ -47,6 +49,19 @@ void ofi_monitor_init(void)
 	fastlock_init(&uffd_monitor->lock);
 	dlist_init(&uffd_monitor->list);
 
+	fastlock_init(&memhooks_monitor->lock);
+	dlist_init(&memhooks_monitor->list);
+
+	/*
+	 * Set the file-scope default_monitor.  Declaring a variable of the
+	 * same name here would only create a local and leave the global NULL.
+	 */
+#if HAVE_UFFD_UNMAP
+	default_monitor = uffd_monitor;
+#else
+	default_monitor = memhooks_monitor;
+#endif
+
 	fi_param_define(NULL, "mr_cache_max_size", FI_PARAM_SIZE_T,
 			"Defines the total number of bytes for all memory"
 			" regions that may be tracked by the MR cache."
@@ -65,11 +80,21 @@ void ofi_monitor_init(void)
 			" region. Merging regions can reduce the cache"
 			" memory footprint, but can negatively impact"
 			" performance in some situations. (default: false)");
+	fi_param_define(NULL, "mr_cache_monitor", FI_PARAM_STRING,
+			"Define a default memory registration monitor."
+			" The monitor checks for virtual to physical memory"
+			" address changes. Options are: userfaultfd and"
+			" memhooks. Userfaultfd is a Linux kernel feature."
+			" Memhooks operates by intercepting memory allocation"
+			" and free calls. Userfaultfd is the default if"
+			" available on the system.");
 
 	fi_param_get_size_t(NULL, "mr_cache_max_size", &cache_params.max_size);
 	fi_param_get_size_t(NULL, "mr_cache_max_count", &cache_params.max_cnt);
 	fi_param_get_bool(NULL, "mr_cache_merge_regions",
 			  &cache_params.merge_regions);
+	/* NOTE(review): cache_params.monitor is not consumed yet -- confirm. */
+	fi_param_get_str(NULL, "mr_cache_monitor", &cache_params.monitor);
 
 	if (!cache_params.max_size)
 		cache_params.max_size = SIZE_MAX;
@@ -79,6 +104,9 @@ void ofi_monitor_cleanup(void)
 {
 	assert(dlist_empty(&uffd_monitor->list));
 	fastlock_destroy(&uffd_monitor->lock);
+
+	assert(dlist_empty(&memhooks_monitor->list));
+	fastlock_destroy(&memhooks_monitor->lock);
 }
 
 int ofi_monitor_add_cache(struct ofi_mem_monitor *monitor,
@@ -88,8 +116,10 @@ int ofi_monitor_add_cache(struct ofi_mem_monitor *monitor,
 
 	fastlock_acquire(&monitor->lock);
 	if (dlist_empty(&monitor->list)) {
 		if (monitor == uffd_monitor)
 			ret = ofi_uffd_init();
+		else if (monitor == memhooks_monitor)
+			ret = ofi_memhooks_init();
 		else
 			ret = -FI_ENOSYS;
 
@@ -113,8 +143,14 @@ void ofi_monitor_del_cache(struct ofi_mr_cache *cache)
 	fastlock_acquire(&monitor->lock);
 	dlist_remove(&cache->notify_entry);
 
-	if (dlist_empty(&monitor->list) && (monitor == uffd_monitor))
-		ofi_uffd_cleanup();
+	/* Tear down only the monitor whose last cache was just removed. */
+	if (dlist_empty(&monitor->list)) {
+		if (monitor == uffd_monitor)
+			ofi_uffd_cleanup();
+		else if (monitor == memhooks_monitor)
+			ofi_memhooks_cleanup();
+	}
+
 	fastlock_release(&monitor->lock);
 }
 
@@ -125,7 +161,7 @@ void ofi_monitor_notify(struct ofi_mem_monitor *monitor,
 	struct ofi_mr_cache *cache;
 
 	dlist_foreach_container(&monitor->list, struct ofi_mr_cache,
-			cache, notify_entry) {
+				cache, notify_entry) {
 		ofi_mr_cache_notify(cache, addr, len);
 	}
 }
diff --git a/prov/verbs/src/verbs_domain.c b/prov/verbs/src/verbs_domain.c
index 6245bdebabc..8534db3a272 100644
--- a/prov/verbs/src/verbs_domain.c
+++ b/prov/verbs/src/verbs_domain.c
@@ -282,7 +282,7 @@ fi_ibv_domain(struct fid_fabric *fabric, struct fi_info *info,
 	_domain->cache.entry_data_size = sizeof(struct fi_ibv_mem_desc);
 	_domain->cache.add_region = fi_ibv_mr_cache_add_region;
 	_domain->cache.delete_region = fi_ibv_mr_cache_delete_region;
-	ret = ofi_mr_cache_init(&_domain->util_domain, uffd_monitor,
+	ret = ofi_mr_cache_init(&_domain->util_domain, default_monitor,
 				&_domain->cache);
 	if (!ret)
 		_domain->util_domain.domain_fid.mr = &fi_ibv_mr_cache_ops;