Skip to content

Commit

Permalink
FB8-132: NUMA support (facebook#987) (facebook#987)
Browse files Browse the repository at this point in the history
Summary:
Jira ticket: https://jira.percona.com/browse/FB8-132

Reference commit: facebook@e50c754
Reference commit: facebook@286e975

WebScaleSQL Feature: NUMA Support

Credits for research and implementation: Jeremy Cole and Davi Arnaut

This patch provides startup options:
* flush-caches: Flush and purge buffers/caches
* numa-nodebind: Run mysqld with CPU affinity

Note, the original patch provided numa-interleave, but that is supported natively by 8.0 now.

It also provides a config option:
* innodb_buffer_pool_populate: pre-allocation of buffer pool
memory at start up:
  -- Use MAP_POPULATE if supported (Linux 2.6.23 and higher)
  -- Otherwise, force pre-allocation using memset

Automation will pass the right value to the numa-nodebind option if
it needs to bind mysqld to a particular socket. That will happen
via systemd scripts.

This change only adds the capability on the mysqld_safe side to launch the
server under numactl. Turning the feature on will need more testing and a
slow rollout, which will happen in separate diffs on the ops repo.

Pull Request resolved: facebook#987

Reviewed By: lloyd

Differential Revision: D14652411

Pulled By: lth

-----------------------------------------------------------------------------

Fix InnoDB large_page_aligned_alloc function signature on macOS (facebook#1207)

Summary:
On Linux, it takes an extra bool parameter for populating mmap'ed memory. This
is not available on macOS, so just add an unnamed argument.

Pull Request resolved: facebook#1207

Reviewed By: luqun

Differential Revision: D38956552

Pulled By: hermanlee
  • Loading branch information
dutow authored and inikep committed May 21, 2024
1 parent b2bfc0f commit dc01b7d
Show file tree
Hide file tree
Showing 16 changed files with 205 additions and 48 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
CALL mtr.add_suppression(".* Forcing preallocation by faulting in pages.");
SELECT @@GLOBAL.innodb_buffer_pool_populate;
@@GLOBAL.innodb_buffer_pool_populate
1
1 Expected
SET @@GLOBAL.innodb_buffer_pool_populate=0;
ERROR HY000: Variable 'innodb_buffer_pool_populate' is a read only variable
Expected error 'Read only variable'
SELECT @@GLOBAL.innodb_buffer_pool_populate;
@@GLOBAL.innodb_buffer_pool_populate
1
1 Expected
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
--innodb-buffer-pool-populate=true
13 changes: 13 additions & 0 deletions mysql-test/suite/sys_vars/t/innodb_buffer_pool_populate_basic.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Basic checks for the innodb_buffer_pool_populate system variable.
# NOTE(review): the expected value of 1 presumably relies on the server being
# started with --innodb-buffer-pool-populate=true from this test's option
# file - confirm.

# InnoDB may warn that it is forcing preallocation by faulting in pages;
# keep that message from being treated as an unexpected server warning.
CALL mtr.add_suppression(".* Forcing preallocation by faulting in pages.");

# Display current value of innodb_buffer_pool_populate
SELECT @@GLOBAL.innodb_buffer_pool_populate;
--echo 1 Expected

# Variable should be read-only
--error ER_INCORRECT_GLOBAL_LOCAL_VAR
SET @@GLOBAL.innodb_buffer_pool_populate=0;
--echo Expected error 'Read only variable'

# The rejected SET must have left the value unchanged
SELECT @@GLOBAL.innodb_buffer_pool_populate;
--echo 1 Expected
1 change: 1 addition & 0 deletions mysql-test/t/all_persisted_variables.test
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ let $total_persistent_vars= `SELECT $total_global_vars - 214`;
let $total_excluded_vars=`SELECT COUNT(*) FROM performance_schema.global_variables WHERE variable_name in (
'binlog_file_basedir',
'binlog_index_basedir',
'innodb_buffer_pool_populate',
'innodb_log_flush_events',
'innodb_log_recent_closed_size',
'innodb_log_recent_written_size',
Expand Down
66 changes: 66 additions & 0 deletions scripts/mysqld_safe.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ MYSQLD=
niceness=0
mysqld_ld_preload=
mysqld_ld_library_path=
flush_caches=0
numa_nodebind=

# Initial logging status: error log is not open, and not using syslog
logging=init
Expand Down Expand Up @@ -99,6 +101,8 @@ Usage: $0 [OPTIONS]
timestamps=TYPE system (ISO 8601 local time), hyphen
(hyphenated date a la mysqld 5.6), legacy
(legacy non-ISO 8601 mysqld_safe timestamps)
--flush-caches Flush and purge buffers/caches
--numa-nodebind Run mysqld with numa binding to one socket
All other options are passed to the mysqld program.
Expand Down Expand Up @@ -297,6 +301,8 @@ parse_arguments() {
--skip-syslog) want_syslog=0 ;;
--syslog-tag=*) syslog_tag="$val" ;;
--timezone=*) TZ="$val"; export TZ; ;;
--flush-caches) flush_caches=1 ;;
--numa-nodebind=*) numa_nodebind="$val" ;;

--help) usage ;;

Expand Down Expand Up @@ -866,6 +872,41 @@ mysqld daemon not started"
fi
fi

#
# Optional pre-start cache flush (--flush-caches): write out file system
# buffers, then ask the kernel to drop its caches.
#

if @TARGET_LINUX@ && test "$flush_caches" -eq 1
then
  # sync has to be resolvable before it can be used.
  my_which sync > /dev/null 2>&1 || {
    log_error "sync command not found, required for --flush-caches"
    exit 1
  }

  # Flush file system buffers. sync() itself is always successful, so a
  # failure here is only reported and startup carries on.
  sync || log_error "sync failed, check if sync is properly installed"

  # sysctl has to be resolvable before it can be used.
  my_which sysctl > /dev/null 2>&1 || {
    log_error "sysctl command not found, required for --flush-caches"
    exit 1
  }

  # Purge page cache, dentries and inodes; unlike sync, a failure is fatal.
  sysctl -q -w vm.drop_caches=3 || {
    log_error "sysctl failed, check the error message for details"
    exit 1
  }
elif test "$flush_caches" -eq 1
then
  log_error "--flush-caches is not supported on this platform"
  exit 1
fi

#
# Uncomment the following lines if you want all tables to be automatically
# checked and repaired during startup. You should add sensible key_buffer
Expand Down Expand Up @@ -895,6 +936,31 @@ do
cmd="$cmd "`shell_quote_string "$i"`
done
cmd="$cmd $args"

#
# Optionally launch mysqld under numactl, bound to the NUMA node given with
# --numa-nodebind.
# NOTE(review): @TARGET_LINUX@ is presumably replaced at build time by a
# command that succeeds only on Linux - confirm.
#

if @TARGET_LINUX@ && test -n "$numa_nodebind"
then
  # Locate numactl, ensure it exists.
  if ! my_which numactl > /dev/null 2>&1
  then
    log_error "numactl command not found, required for --numa-nodebind"
    exit 1
  fi

  # Dry-run the binding against a no-op command so that a bad node value is
  # reported here rather than when mysqld is started. The value is quoted so
  # that it always reaches numactl as a single word.
  if ! numactl --cpunodebind="$numa_nodebind" --preferred="$numa_nodebind" true
  then
    log_error "numactl failed, check if numa-nodebind value is correct"
    exit 1
  fi

  # Launch mysqld with numactl. The arguments appended to $cmd above go
  # through shell_quote_string, so the user-supplied node value is escaped
  # the same way before it is spliced into the command line.
  numa_nodebind_quoted=`shell_quote_string "$numa_nodebind"`
  cmd="numactl --cpunodebind=$numa_nodebind_quoted --preferred=$numa_nodebind_quoted $cmd"
elif test -n "$numa_nodebind"
then
  log_error "--numa-nodebind is not supported on this platform"
  exit 1
fi

if [ -n "$malloc_conf_options" ]
then
cmd="MALLOC_CONF=$malloc_conf_options $cmd"
Expand Down
23 changes: 13 additions & 10 deletions storage/innobase/buf/buf0buf.cc
Original file line number Diff line number Diff line change
Expand Up @@ -879,11 +879,12 @@ bool buf_chunk_t::madvise_dont_dump() {

/* Implementation of buf_pool_t's methods */

bool buf_pool_t::allocate_chunk(ulonglong mem_size, buf_chunk_t *chunk) {
bool buf_pool_t::allocate_chunk(ulonglong mem_size, buf_chunk_t *chunk,
bool populate) {
ut_ad(mutex_own(&chunks_mutex));
chunk->mem = static_cast<uint8_t *>(ut::malloc_large_page_withkey(
ut::make_psi_memory_key(mem_key_buf_buf_pool), mem_size,
ut::fallback_to_normal_page_t{}));
ut::fallback_to_normal_page_t{}, os_use_large_pages, populate));
if (chunk->mem == nullptr) {
return false;
}
Expand Down Expand Up @@ -993,7 +994,8 @@ static buf_chunk_t *buf_chunk_init(
buf_pool_t *buf_pool, /*!< in: buffer pool instance */
buf_chunk_t *chunk, /*!< out: chunk of buffers */
ulonglong mem_size, /*!< in: requested size in bytes */
std::mutex *mutex) /*!< in,out: Mutex protecting chunk map. */
std::mutex *mutex, /*!< in,out: Mutex protecting chunk map. */
bool populate) /*!< in: virtual page preallocation */
{
buf_block_t *block;
byte *frame;
Expand All @@ -1011,7 +1013,7 @@ static buf_chunk_t *buf_chunk_init(

DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return (nullptr););

if (!buf_pool->allocate_chunk(mem_size, chunk)) {
if (!buf_pool->allocate_chunk(mem_size, chunk, populate)) {
return (nullptr);
}

Expand Down Expand Up @@ -1196,8 +1198,8 @@ static void buf_pool_set_sizes(void) {
@param[in,out] mutex Mutex to protect common data structures
@param[out] err DB_SUCCESS if all goes well */
static void buf_pool_create(buf_pool_t *buf_pool, ulint buf_pool_size,
ulint instance_no, std::mutex *mutex,
dberr_t &err) {
ulint instance_no, std::mutex *mutex, dberr_t &err,
bool populate) {
ulint i;
ulint chunk_size;
buf_chunk_t *chunk;
Expand Down Expand Up @@ -1262,7 +1264,7 @@ static void buf_pool_create(buf_pool_t *buf_pool, ulint buf_pool_size,
chunk = buf_pool->chunks;

do {
if (!buf_chunk_init(buf_pool, chunk, chunk_size, mutex)) {
if (!buf_chunk_init(buf_pool, chunk, chunk_size, mutex, populate)) {
while (--chunk >= buf_pool->chunks) {
buf_block_t *block = chunk->blocks;

Expand Down Expand Up @@ -1437,7 +1439,7 @@ static void buf_pool_free() {
@param[in] total_size Size of the total pool in bytes.
@param[in] n_instances Number of buffer pool instances to create.
@return DB_SUCCESS if success, DB_ERROR if not enough memory or error */
dberr_t buf_pool_init(ulint total_size, ulint n_instances) {
dberr_t buf_pool_init(ulint total_size, ulint n_instances, bool populate) {
ulint i;
const ulint size = total_size / n_instances;

Expand Down Expand Up @@ -1495,7 +1497,7 @@ dberr_t buf_pool_init(ulint total_size, ulint n_instances) {

for (ulint id = i; id < n; ++id) {
threads.emplace_back(std::thread(buf_pool_create, &buf_pool_ptr[id], size,
id, &m, std::ref(errs[id])));
id, &m, std::ref(errs[id]), populate));
}

for (ulint id = i; id < n; ++id) {
Expand Down Expand Up @@ -2455,7 +2457,8 @@ static void buf_pool_resize() {
while (chunk < echunk) {
ulonglong unit = srv_buf_pool_chunk_unit;

if (!buf_chunk_init(buf_pool, chunk, unit, nullptr)) {
if (!buf_chunk_init(buf_pool, chunk, unit, nullptr,
srv_buf_pool_populate)) {
ib::error(ER_IB_MSG_65) << "buffer pool " << i
<< " : failed to allocate"
" new memory.";
Expand Down
8 changes: 8 additions & 0 deletions storage/innobase/handler/ha_innodb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22404,6 +22404,13 @@ static MYSQL_SYSVAR_ULONG(
0, 0, 256, 0);
// clang-format on

/* Read-only boolean system variable buffer_pool_populate, stored in
srv_buf_pool_populate and defaulting to false.
NOTE(review): the two nullptr arguments are presumably the check and update
callbacks of the sysvar macro - confirm against the plugin API. */
static MYSQL_SYSVAR_BOOL(
buffer_pool_populate, srv_buf_pool_populate,
PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
"Preallocate (pre-fault) the page frames required for the mapping "
"established by the buffer pool memory region. Disabled by default.",
nullptr, nullptr, false);

static MYSQL_SYSVAR_ULONG(buffer_pool_instances, srv_buf_pool_instances,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Number of buffer pool instances, set to higher "
Expand Down Expand Up @@ -23263,6 +23270,7 @@ static SYS_VAR *innobase_system_variables[] = {
MYSQL_SYSVAR(dedicated_server),
MYSQL_SYSVAR(buffer_pool_size),
MYSQL_SYSVAR(buffer_pool_chunk_size),
MYSQL_SYSVAR(buffer_pool_populate),
MYSQL_SYSVAR(buffer_pool_instances),
MYSQL_SYSVAR(buffer_pool_filename),
MYSQL_SYSVAR(buffer_pool_dump_now),
Expand Down
4 changes: 2 additions & 2 deletions storage/innobase/include/buf0buf.h
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ struct buf_pools_list_size_t {
@param[in] total_size Size of the total pool in bytes.
@param[in] n_instances Number of buffer pool instances to create.
@return DB_SUCCESS if success, DB_ERROR if not enough memory or error */
dberr_t buf_pool_init(ulint total_size, ulint n_instances);
dberr_t buf_pool_init(ulint total_size, ulint n_instances, bool populate);

/** Frees the buffer pool at shutdown. This must not be invoked before
freeing all mutexes. */
Expand Down Expand Up @@ -2399,7 +2399,7 @@ struct buf_pool_t {
@param[in,out] chunk mem and mem_pfx fields of this chunk will be updated
to contain information about allocated memory region
@return true iff allocated successfully */
bool allocate_chunk(ulonglong mem_size, buf_chunk_t *chunk);
bool allocate_chunk(ulonglong mem_size, buf_chunk_t *chunk, bool populate);

/** A wrapper for buf_pool_t::allocator.deallocate_large which also advises
the OS that this chunk can be dumped to a core file.
Expand Down
49 changes: 46 additions & 3 deletions storage/innobase/include/detail/ut/large_page_alloc-linux.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,20 +38,43 @@ this program; if not, write to the Free Software Foundation, Inc.,
#include "storage/innobase/include/detail/ut/helper.h"
#include "storage/innobase/include/ut0log.h"

/* Linux release version */
#if defined(UNIV_LINUX) && defined(_GNU_SOURCE)
#include <string.h> /* strverscmp() */
#include <sys/utsname.h> /* uname() */
#endif

/* Linux's MAP_POPULATE. OS_MAP_POPULATE is MAP_POPULATE when the platform
headers define it and 0 otherwise, so it can be OR-ed into the mmap() flags
unconditionally and also tested with #if. */
#if defined(MAP_POPULATE)
#define OS_MAP_POPULATE MAP_POPULATE
#else
#define OS_MAP_POPULATE 0
#endif

extern const size_t large_page_default_size;

namespace ut {
namespace detail {
/** Checks whether the running operating system release is at least a given
release.

The release string reported by uname() is compared against the requested one
with strverscmp().

Declared inline rather than static: this helper is defined in a header and is
called from the inline large_page_aligned_alloc(). With internal linkage,
every translation unit's copy of that caller would refer to a different
function.

@param[in] release Minimum release to accept, e.g. "2.6.23".
@return true if uname() succeeded and the running release is equal to, or
later than, release. false if uname() failed, or if the build does not define
both UNIV_LINUX and _GNU_SOURCE, in which case no comparison is made. */
inline bool os_compare_release([[maybe_unused]] const char *release) {
#if defined(UNIV_LINUX) && defined(_GNU_SOURCE)
  struct utsname name;
  if (uname(&name) != 0) {
    return false;
  }
  return strverscmp(name.release, release) >= 0;
#else
  return false;
#endif
}

/** Allocates memory backed by large (huge) pages.

@param[in] n_bytes Size of storage (in bytes) requested to be allocated.
@return Pointer to the allocated storage. nullptr if allocation failed.
*/
inline void *large_page_aligned_alloc(size_t n_bytes) {
inline void *large_page_aligned_alloc(size_t n_bytes, bool populate) {
// mmap will internally round n_bytes to the multiple of huge-page size if it
// is not already
int mmap_flags = MAP_PRIVATE | MAP_ANON;
int mmap_flags = MAP_PRIVATE | MAP_ANON | (populate ? OS_MAP_POPULATE : 0);
#ifndef __FreeBSD__
mmap_flags |= MAP_HUGETLB;
#endif
Expand All @@ -61,8 +84,28 @@ inline void *large_page_aligned_alloc(size_t n_bytes) {
<< " bytes) failed;"
" errno "
<< errno;
ptr = nullptr;
}
return (ptr != (void *)-1) ? ptr : nullptr;

#if MAP_ANONYMOUS && OS_MAP_POPULATE
/* MAP_POPULATE is only supported for private mappings
since Linux 2.6.23. */
populate = populate && !os_compare_release("2.6.23");

if (ptr && populate) {
ib::warn() << "InnoDB: Warning: mmap(MAP_POPULATE) "
"is not supported for private mappings. "
"Forcing preallocation by faulting in pages.";
}
#endif

/* Initialize the entire buffer to force the allocation
of physical memory page frames. */
if (ptr && populate) {
memset(ptr, '\0', n_bytes);
}

return ptr;
}

/** Releases memory backed by large (huge) pages.
Expand Down
2 changes: 1 addition & 1 deletion storage/innobase/include/detail/ut/large_page_alloc-osx.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ static constexpr auto SUPER_PAGE_SIZE = VM_FLAGS_SUPERPAGE_SIZE_2MB;
@param[in] n_bytes Size of storage (in bytes) requested to be allocated.
@return Pointer to the allocated storage. nullptr if allocation failed.
*/
inline void *large_page_aligned_alloc(size_t n_bytes) {
inline void *large_page_aligned_alloc(size_t n_bytes, bool) {
// mmap on OSX requires for n_bytes to be a multiple of large-page size
size_t n_bytes_rounded = pow2_round(n_bytes + (large_page_default_size - 1),
large_page_default_size);
Expand Down
17 changes: 9 additions & 8 deletions storage/innobase/include/detail/ut/large_page_alloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,10 @@ struct Large_page_alloc : public allocator_traits<false> {
@param[in] size Size of storage (in bytes) requested to be allocated.
@return Pointer to the allocated storage. nullptr if allocation failed.
*/
static inline void *alloc(std::size_t size) {
static inline void *alloc(std::size_t size, bool populate) {
auto total_len = round_to_next_multiple(
size + page_allocation_metadata::len, large_page_default_size);
auto mem = large_page_aligned_alloc(total_len);
auto mem = large_page_aligned_alloc(total_len, populate);
if (unlikely(!mem)) return nullptr;
page_allocation_metadata::datalen(mem, total_len);
page_allocation_metadata::page_type(mem, Page_type::large_page);
Expand Down Expand Up @@ -245,10 +245,11 @@ struct Large_page_alloc_pfs : public allocator_traits<true> {
*/
static inline void *alloc(
std::size_t size,
page_allocation_metadata::pfs_metadata::pfs_memory_key_t key) {
page_allocation_metadata::pfs_metadata::pfs_memory_key_t key,
bool populate) {
auto total_len = round_to_next_multiple(
size + page_allocation_metadata::len, large_page_default_size);
auto mem = large_page_aligned_alloc(total_len);
auto mem = large_page_aligned_alloc(total_len, populate);
if (unlikely(!mem)) return nullptr;

#ifdef HAVE_PSI_MEMORY_INTERFACE
Expand Down Expand Up @@ -372,13 +373,13 @@ template <typename Impl>
struct Large_alloc_ {
template <typename T = Impl>
static inline typename std::enable_if<T::is_pfs_instrumented_v, void *>::type
alloc(size_t size, PSI_memory_key key) {
return Impl::alloc(size, key);
alloc(size_t size, PSI_memory_key key, bool populate) {
return Impl::alloc(size, key, populate);
}
template <typename T = Impl>
static inline typename std::enable_if<!T::is_pfs_instrumented_v, void *>::type
alloc(size_t size, PSI_memory_key /*key*/) {
return Impl::alloc(size);
alloc(size_t size, PSI_memory_key /*key*/, bool populate) {
return Impl::alloc(size, populate);
}
static inline bool free(void *ptr) { return Impl::free(ptr); }
static inline size_t datalen(void *ptr) { return Impl::datalen(ptr); }
Expand Down
2 changes: 2 additions & 0 deletions storage/innobase/include/srv0srv.h
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,8 @@ extern bool srv_load_corrupted;
extern bool srv_dedicated_server;
/** Requested size in bytes */
extern ulint srv_buf_pool_size;
/** virtual page preallocation */
extern bool srv_buf_pool_populate;
/** Minimum pool size in bytes */
extern const ulint srv_buf_pool_min_size;
/** Default pool size in bytes */
Expand Down
Loading

0 comments on commit dc01b7d

Please sign in to comment.