Skip to content

Commit

Permalink
FB8-132: NUMA support (#987)
Browse files Browse the repository at this point in the history
Summary:
Jira ticket: https://jira.percona.com/browse/FB8-132

Reference commit: e50c754
Reference commit: 286e975

WebScaleSQL Feature: NUMA Support

Credits for research and implementation: Jeremy Cole and Davi Arnaut

This patch provides startup options:
* flush-caches: Flush and purge buffers/caches
* numa-nodebind: Run mysqld with CPU affinity

Note: the original patch also provided numa-interleave, but 8.0 now supports that natively.

It also provides a config option:
* innodb_buffer_pool_populate: pre-allocate (pre-fault) the buffer pool
memory at startup:
  -- Use MAP_POPULATE if supported (Linux 2.6.23 and higher)
  -- Otherwise, force pre-allocation by zero-filling the memory with memset

Automation will pass in the right value to this option if
it needs to bind mysqld to a particular socket. That will happen
via systemd scripts.

This change only adds the capability on the mysqld_safe side to launch the
server under numactl. Enabling the feature will require more testing and a slow
rollout, which will happen in separate diffs in the ops repo.

Pull Request resolved: #987

Reviewed By: lth

Differential Revision: D14652411

Pulled By: lth

fbshipit-source-id: c51a5a3
  • Loading branch information
dutow authored and facebook-github-bot committed May 9, 2019
1 parent 757c279 commit b5d4a67
Show file tree
Hide file tree
Showing 16 changed files with 175 additions and 20 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
CALL mtr.add_suppression(".* Forcing preallocation by faulting in pages.");
SELECT @@GLOBAL.innodb_buffer_pool_populate;
@@GLOBAL.innodb_buffer_pool_populate
1
1 Expected
SET @@GLOBAL.innodb_buffer_pool_populate=0;
ERROR HY000: Variable 'innodb_buffer_pool_populate' is a read only variable
Expected error 'Read only variable'
SELECT @@GLOBAL.innodb_buffer_pool_populate;
@@GLOBAL.innodb_buffer_pool_populate
1
1 Expected
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
--innodb-buffer-pool-populate=true
13 changes: 13 additions & 0 deletions mysql-test/suite/sys_vars/t/innodb_buffer_pool_populate_basic.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Basic sys_vars test for innodb_buffer_pool_populate.
#
# Checks that the variable is visible at GLOBAL scope, reports the value the
# server was started with (1 is expected here; presumably supplied by the
# accompanying option file passing --innodb-buffer-pool-populate=true), and
# rejects runtime modification because it is read-only.

# When population is requested, the server may log a warning ending in
# "Forcing preallocation by faulting in pages."; suppress it so the test does
# not fail on an unexpected error-log entry.
CALL mtr.add_suppression(".* Forcing preallocation by faulting in pages.");

# Display current value of innodb_buffer_pool_populate
SELECT @@GLOBAL.innodb_buffer_pool_populate;
--echo 1 Expected

# Variable should be read-only: SET must fail with the "is a read only
# variable" error rather than change the value.
--error ER_INCORRECT_GLOBAL_LOCAL_VAR
SET @@GLOBAL.innodb_buffer_pool_populate=0;
--echo Expected error 'Read only variable'

# The failed SET must have left the value untouched.
SELECT @@GLOBAL.innodb_buffer_pool_populate;
--echo 1 Expected
66 changes: 66 additions & 0 deletions scripts/mysqld_safe.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ MYSQLD=
niceness=0
mysqld_ld_preload=
mysqld_ld_library_path=
flush_caches=0
numa_nodebind=

# Initial logging status: error log is not open, and not using syslog
logging=init
Expand Down Expand Up @@ -94,6 +96,8 @@ Usage: $0 [OPTIONS]
timestamps=TYPE system (ISO 8601 local time), hyphen
(hyphenated date a la mysqld 5.6), legacy
(legacy non-ISO 8601 mysqld_safe timestamps)
--flush-caches Flush and purge buffers/caches
--numa-nodebind Run mysqld with numa binding to one socket
All other options are passed to the mysqld program.
Expand Down Expand Up @@ -288,6 +292,8 @@ parse_arguments() {
--skip-syslog) want_syslog=0 ;;
--syslog-tag=*) syslog_tag="$val" ;;
--timezone=*) TZ="$val"; export TZ; ;;
--flush-caches) flush_caches=1 ;;
--numa-nodebind=*) numa_nodebind="$val" ;;

--help) usage ;;

Expand Down Expand Up @@ -878,6 +884,41 @@ mysqld daemon not started"
fi
fi

#
# Flush and purge buffers/caches.
#
# Runs only when --flush-caches was given (flush_caches=1; it defaults to 0).
# @TARGET_LINUX@ is a placeholder replaced when this script is generated
# (presumably with a command that succeeds only on Linux targets); on any
# other platform the option is rejected with an error instead of being
# silently ignored.
#

if @TARGET_LINUX@ && test $flush_caches -eq 1
then
# Locate sync, ensure it exists.
if ! my_which sync > /dev/null 2>&1
then
log_error "sync command not found, required for --flush-caches"
exit 1
# Flush file system buffers.
elif ! sync
then
# A failing sync is only logged, not treated as fatal: the underlying
# sync() function is always successful, so a non-zero status here most
# likely points at a broken sync binary rather than at unflushed data.
log_error "sync failed, check if sync is properly installed"
fi

# Locate sysctl, ensure it exists.
if ! my_which sysctl > /dev/null 2>&1
then
log_error "sysctl command not found, required for --flush-caches"
exit 1
# Purge page cache, dentries and inodes.
# Unlike sync, a failure here is fatal: the caches were not dropped, so the
# request made via --flush-caches could not be honoured.
elif ! sysctl -q -w vm.drop_caches=3
then
log_error "sysctl failed, check the error message for details"
exit 1
fi
elif test $flush_caches -eq 1
then
# Option was requested but this is not a Linux target.
log_error "--flush-caches is not supported on this platform"
exit 1
fi

#
# Uncomment the following lines if you want all tables to be automatically
# checked and repaired during startup. You should add sensible key_buffer
Expand Down Expand Up @@ -907,6 +948,31 @@ do
cmd="$cmd "`shell_quote_string "$i"`
done
cmd="$cmd $args"

# Optionally launch mysqld under numactl so that it is bound to the NUMA
# node(s) given with --numa-nodebind=VALUE. Nothing happens when the option
# was not supplied. @TARGET_LINUX@ is a placeholder replaced when this script
# is generated (presumably with a command that succeeds only on Linux
# targets); on any other platform the option is rejected with an error.
if @TARGET_LINUX@ && test ! -z "$numa_nodebind"
then
  # Locate numactl, ensure it exists.
  if ! my_which numactl > /dev/null 2>&1
  then
    log_error "numactl command not found, required for --numa-nodebind"
    exit 1
  fi

  # Dry run with a no-op command to validate the value before it is used to
  # start the server. The value is quoted so it always reaches numactl as a
  # single argument, whatever characters it contains.
  if ! numactl --cpunodebind="$numa_nodebind" --preferred="$numa_nodebind" true
  then
    log_error "numactl failed, check if numa-nodebind value is correct"
    exit 1
  fi

  # Launch mysqld with numactl.
  # $cmd is a command *string* that the shell interprets later (note the
  # "< /dev/null" redirection appended to it as text below), so the
  # user-supplied value is escaped the same way as every other argument
  # added to $cmd, instead of being spliced in raw.
  numa_nodebind_quoted=`shell_quote_string "$numa_nodebind"`
  cmd="numactl --cpunodebind=$numa_nodebind_quoted --preferred=$numa_nodebind_quoted $cmd"
elif test ! -z "$numa_nodebind"
then
  # Option was requested but this is not a Linux target.
  log_error "--numa-nodebind is not supported on this platform"
  exit 1
fi

# Avoid 'nohup: ignoring input' warning
test -n "$NOHUP_NICENESS" && cmd="$cmd < /dev/null"

Expand Down
19 changes: 11 additions & 8 deletions storage/innobase/buf/buf0buf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -797,7 +797,8 @@ static buf_chunk_t *buf_chunk_init(
buf_pool_t *buf_pool, /*!< in: buffer pool instance */
buf_chunk_t *chunk, /*!< out: chunk of buffers */
ulonglong mem_size, /*!< in: requested size in bytes */
std::mutex *mutex) /*!< in,out: Mutex protecting chunk map. */
std::mutex *mutex, /*!< in,out: Mutex protecting chunk map. */
bool populate) /*!< in: virtual page preallocation */
{
buf_block_t *block;
byte *frame;
Expand All @@ -813,7 +814,8 @@ static buf_chunk_t *buf_chunk_init(

DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return (NULL););

chunk->mem = buf_pool->allocator.allocate_large(mem_size, &chunk->mem_pfx);
chunk->mem =
buf_pool->allocator.allocate_large(mem_size, &chunk->mem_pfx, populate);

if (chunk->mem == NULL) {
return (NULL);
Expand Down Expand Up @@ -1012,8 +1014,8 @@ static void buf_pool_set_sizes(void) {
@param[in,out] mutex Mutex to protect common data structures
@param[out] err DB_SUCCESS if all goes well */
static void buf_pool_create(buf_pool_t *buf_pool, ulint buf_pool_size,
ulint instance_no, std::mutex *mutex,
dberr_t &err) {
ulint instance_no, std::mutex *mutex, dberr_t &err,
bool populate) {
ulint i;
ulint chunk_size;
buf_chunk_t *chunk;
Expand Down Expand Up @@ -1079,7 +1081,7 @@ static void buf_pool_create(buf_pool_t *buf_pool, ulint buf_pool_size,
chunk = buf_pool->chunks;

do {
if (!buf_chunk_init(buf_pool, chunk, chunk_size, mutex)) {
if (!buf_chunk_init(buf_pool, chunk, chunk_size, mutex, populate)) {
while (--chunk >= buf_pool->chunks) {
buf_block_t *block = chunk->blocks;

Expand Down Expand Up @@ -1245,7 +1247,7 @@ static void buf_pool_free() {
@param[in] total_size Size of the total pool in bytes.
@param[in] n_instances Number of buffer pool instances to create.
@return DB_SUCCESS if success, DB_ERROR if not enough memory or error */
dberr_t buf_pool_init(ulint total_size, ulint n_instances) {
dberr_t buf_pool_init(ulint total_size, ulint n_instances, bool populate) {
ulint i;
const ulint size = total_size / n_instances;

Expand Down Expand Up @@ -1298,7 +1300,7 @@ dberr_t buf_pool_init(ulint total_size, ulint n_instances) {

for (ulint id = i; id < n; ++id) {
threads.emplace_back(std::thread(buf_pool_create, &buf_pool_ptr[id], size,
id, &m, std::ref(errs[id])));
id, &m, std::ref(errs[id]), populate));
}

for (ulint id = i; id < n; ++id) {
Expand Down Expand Up @@ -2101,7 +2103,8 @@ static void buf_pool_resize() {
while (chunk < echunk) {
ulonglong unit = srv_buf_pool_chunk_unit;

if (!buf_chunk_init(buf_pool, chunk, unit, nullptr)) {
if (!buf_chunk_init(buf_pool, chunk, unit, nullptr,
srv_buf_pool_populate)) {
ib::error(ER_IB_MSG_65) << "buffer pool " << i
<< " : failed to allocate"
" new memory.";
Expand Down
8 changes: 8 additions & 0 deletions storage/innobase/handler/ha_innodb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19826,6 +19826,13 @@ static MYSQL_SYSVAR_ULONG(
NULL, 120, 1, 127, 0);
#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */

/* System variable innodb_buffer_pool_populate, backed by the global
srv_buf_pool_populate. It is declared read-only (PLUGIN_VAR_READONLY), so it
can only be chosen at server startup. The final argument (false) is presumably
the default value, which matches the "Disabled by default" help text. No check
or update callbacks are installed (both nullptr). */
static MYSQL_SYSVAR_BOOL(
buffer_pool_populate, srv_buf_pool_populate,
PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
"Preallocate (pre-fault) the page frames required for the mapping "
"established by the buffer pool memory region. Disabled by default.",
nullptr, nullptr, false);

static MYSQL_SYSVAR_ULONG(buffer_pool_instances, srv_buf_pool_instances,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Number of buffer pool instances, set to higher "
Expand Down Expand Up @@ -20610,6 +20617,7 @@ static SYS_VAR *innobase_system_variables[] = {
MYSQL_SYSVAR(dedicated_server),
MYSQL_SYSVAR(buffer_pool_size),
MYSQL_SYSVAR(buffer_pool_chunk_size),
MYSQL_SYSVAR(buffer_pool_populate),
MYSQL_SYSVAR(buffer_pool_instances),
MYSQL_SYSVAR(buffer_pool_filename),
MYSQL_SYSVAR(buffer_pool_dump_now),
Expand Down
2 changes: 1 addition & 1 deletion storage/innobase/include/buf0buf.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ struct buf_pools_list_size_t {
@param[in] total_size Size of the total pool in bytes.
@param[in] n_instances Number of buffer pool instances to create.
@return DB_SUCCESS if success, DB_ERROR if not enough memory or error */
dberr_t buf_pool_init(ulint total_size, ulint n_instances);
dberr_t buf_pool_init(ulint total_size, ulint n_instances, bool populate);

/** Frees the buffer pool at shutdown. This must not be invoked before
freeing all mutexes. */
Expand Down
2 changes: 1 addition & 1 deletion storage/innobase/include/os0proc.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ ulint os_proc_get_number(void);
/** Allocates large pages memory.
@param[in,out] n Number of bytes to allocate
@return allocated memory */
void *os_mem_alloc_large(ulint *n);
void *os_mem_alloc_large(ulint *n, bool populate);

/** Frees large pages memory.
@param[in] ptr pointer returned by os_mem_alloc_large()
Expand Down
2 changes: 2 additions & 0 deletions storage/innobase/include/srv0srv.h
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,8 @@ extern bool srv_load_corrupted;
extern bool srv_dedicated_server;
/** Requested size in bytes */
extern ulint srv_buf_pool_size;
/** virtual page preallocation */
extern bool srv_buf_pool_populate;
/** Minimum pool size in bytes */
extern const ulint srv_buf_pool_min_size;
/** Default pool size in bytes */
Expand Down
6 changes: 4 additions & 2 deletions storage/innobase/include/ut0new.h
Original file line number Diff line number Diff line change
Expand Up @@ -813,14 +813,16 @@ class ut_allocator {
it until the memory is no longer needed and then pass it to
deallocate_large().
@return pointer to the allocated memory or NULL */
pointer allocate_large(size_type n_elements, ut_new_pfx_t *pfx) {
pointer allocate_large(size_type n_elements, ut_new_pfx_t *pfx,
bool populate) {
if (n_elements == 0 || n_elements > max_size()) {
return (NULL);
}

ulint n_bytes = n_elements * sizeof(T);

pointer ptr = reinterpret_cast<pointer>(os_mem_alloc_large(&n_bytes));
pointer ptr =
reinterpret_cast<pointer>(os_mem_alloc_large(&n_bytes, populate));

#ifdef UNIV_PFS_MEMORY
if (ptr != NULL) {
Expand Down
50 changes: 47 additions & 3 deletions storage/innobase/os/os0proc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "ut0byte.h"
#include "ut0mem.h"

/* Linux release version */
#if defined(UNIV_LINUX) && defined(_GNU_SOURCE)
#include <string.h> /* strverscmp() */
#include <sys/utsname.h> /* uname() */
#endif

/* FreeBSD for example has only MAP_ANON, Linux has MAP_ANONYMOUS and
MAP_ANON but MAP_ANON is marked as deprecated */
#if defined(MAP_ANONYMOUS)
Expand All @@ -50,6 +56,13 @@ MAP_ANON but MAP_ANON is marked as deprecated */
#define OS_MAP_ANON MAP_ANON
#endif

/* Linux's MAP_POPULATE */
#if defined(MAP_POPULATE)
#define OS_MAP_POPULATE MAP_POPULATE
#else
#define OS_MAP_POPULATE 0
#endif

/** The total amount of memory currently allocated from the operating
system with os_mem_alloc_large(). */
ulint os_total_large_mem_allocated = 0;
Expand All @@ -70,10 +83,21 @@ ulint os_proc_get_number(void) {
#endif
}

/** Checks whether the running operating system release is at least the
given one. The comparison is a version-aware string comparison
(strverscmp()) against the release string reported by uname(). It is only
performed on Linux builds with _GNU_SOURCE; every other build reports false.
@param[in]	release		release string to compare against, e.g. "2.6.23"
@return true if the running OS release is equal to, or later than release;
false if it is older, if uname() fails, or if the check is unavailable. */
static bool os_compare_release(const char *release) {
#if defined(UNIV_LINUX) && defined(_GNU_SOURCE)
  struct utsname sys_info;

  if (uname(&sys_info) != 0) {
    /* Could not query the kernel: treat the release as unknown/too old. */
    return false;
  }

  return strverscmp(sys_info.release, release) >= 0;
#else
  return false;
#endif
}

/** Allocates large pages memory.
@param[in,out] n Number of bytes to allocate
@return allocated memory */
void *os_mem_alloc_large(ulint *n) {
void *os_mem_alloc_large(ulint *n, bool populate) {
void *ptr;
ulint size;
#if defined HAVE_LINUX_LARGE_PAGES && defined UNIV_LINUX
Expand Down Expand Up @@ -145,8 +169,9 @@ void *os_mem_alloc_large(ulint *n) {
/* Align block size to system page size */
ut_ad(ut_is_2pow(size));
size = *n = ut_2pow_round(*n + (size - 1), size);
ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | OS_MAP_ANON, -1,
0);
ptr =
mmap(NULL, size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | OS_MAP_ANON | (populate ? OS_MAP_POPULATE : 0), -1, 0);
if (UNIV_UNLIKELY(ptr == (void *)-1)) {
ib::error(ER_IB_MSG_856) << "mmap(" << size
<< " bytes) failed;"
Expand All @@ -158,6 +183,25 @@ void *os_mem_alloc_large(ulint *n) {
UNIV_MEM_ALLOC(ptr, size);
}
#endif

#if OS_MAP_ANON && OS_MAP_POPULATE
/* MAP_POPULATE is only supported for private mappings
since Linux 2.6.23. */
populate = populate && !os_compare_release("2.6.23");

if (ptr && populate) {
ib::warn() << "InnoDB: Warning: mmap(MAP_POPULATE) "
"is not supported for private mappings. "
"Forcing preallocation by faulting in pages.";
}
#endif

/* Initialize the entire buffer to force the allocation
of physical memory page frames. */
if (ptr && populate) {
memset(ptr, '\0', size);
}

return (ptr);
}

Expand Down
5 changes: 3 additions & 2 deletions storage/innobase/row/row0log.cc
Original file line number Diff line number Diff line change
Expand Up @@ -237,8 +237,9 @@ static MY_ATTRIBUTE((warn_unused_result)) bool row_log_block_allocate(
if (log_buf.block == NULL) {
DBUG_EXECUTE_IF("simulate_row_log_allocation_failure", DBUG_RETURN(false););

log_buf.block = ut_allocator<byte>(mem_key_row_log_buf)
.allocate_large(srv_sort_buf_size, &log_buf.block_pfx);
log_buf.block =
ut_allocator<byte>(mem_key_row_log_buf)
.allocate_large(srv_sort_buf_size, &log_buf.block_pfx, false);

if (log_buf.block == NULL) {
DBUG_RETURN(false);
Expand Down
2 changes: 1 addition & 1 deletion storage/innobase/row/row0merge.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3522,7 +3522,7 @@ dberr_t row_merge_build_indexes(

/* This will allocate "3 * srv_sort_buf_size" elements of type
row_merge_block_t. The latter is defined as byte. */
block = alloc.allocate_large(3 * srv_sort_buf_size, &block_pfx);
block = alloc.allocate_large(3 * srv_sort_buf_size, &block_pfx, false);

if (block == NULL) {
DBUG_RETURN(DB_OUT_OF_MEMORY);
Expand Down
2 changes: 2 additions & 0 deletions storage/innobase/srv/srv0srv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,8 @@ with mutex_enter(), which will wait until it gets the mutex. */
bool srv_dedicated_server = true;
/** Requested size in bytes */
ulint srv_buf_pool_size = ULINT_MAX;
/* force virtual page preallocation (prefault) */
bool srv_buf_pool_populate = false;
/** Minimum pool size in bytes */
const ulint srv_buf_pool_min_size = 5 * 1024 * 1024;
/** Default pool size in bytes */
Expand Down
3 changes: 2 additions & 1 deletion storage/innobase/srv/srv0start.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1816,7 +1816,8 @@ dberr_t srv_start(bool create_new_db, const std::string &scan_directories) {
ib::info(ER_IB_MSG_1130, size, unit, srv_buf_pool_instances, chunk_size,
chunk_unit);

err = buf_pool_init(srv_buf_pool_size, srv_buf_pool_instances);
err = buf_pool_init(srv_buf_pool_size, srv_buf_pool_instances,
srv_buf_pool_populate);

if (err != DB_SUCCESS) {
ib::error(ER_IB_MSG_1131);
Expand Down
Loading

0 comments on commit b5d4a67

Please sign in to comment.