Skip to content

Commit

Permalink
Recalculate index statistics periodically
Browse files Browse the repository at this point in the history
Upstream commit ID : fb-mysql-5.6.35/ee8ca237a15c63ca98904183fd16fb79fb17e1ef
PS-4476 : Merge prod--------

Summary:
Persisted stats sometimes drift away from the actual sum from sst files. To mitigate this problem, recalculate stats periodically. This is controlled by the rocksdb_stats_recalc_rate global variable which configures the number of indexes to recalculate per second.

Implementation details:
- Refactor calculate_stats to not depend on handler.
- Track indexes left to recalculate in rdb_indexes_to_recalc, and refill when it empties.
- Extend existing background thread to recalculate stats on every wake-up.

Removed a call to calculate_stats from alter table. This was originally added in facebook/mysql-5.6@3442d47 because uncommitted indexes could not be found in the data dictionary yet, but a subsequent commit facebook/mysql-5.6@5d2b953 solved this.

Also remove fake stats from Rdb_index_stats::merge. This could cause underflow if 1. during recovery, we write sst files with no index stats and 2. on compaction of these sst files, we subtract index stats. Underflow happens because the global count was not incremented during recovery, yet it is subtracted on compaction.

Reviewed By: hermanlee

Differential Revision: D7482233

fbshipit-source-id: 47286bd
  • Loading branch information
lth authored and George O. Lorch III committed May 28, 2018
1 parent b9cf3ed commit 687f8d2
Show file tree
Hide file tree
Showing 6 changed files with 173 additions and 106 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
CREATE TABLE valid_values (value varchar(255)) ENGINE=myisam;
INSERT INTO valid_values VALUES(100);
INSERT INTO valid_values VALUES(1);
CREATE TABLE invalid_values (value varchar(255)) ENGINE=myisam;
INSERT INTO invalid_values VALUES('\'aaa\'');
INSERT INTO invalid_values VALUES('\'123\'');
SET @start_global_value = @@global.ROCKSDB_STATS_RECALC_RATE;
SELECT @start_global_value;
@start_global_value
0
'# Setting to valid values in global scope#'
"Trying to set variable @@global.ROCKSDB_STATS_RECALC_RATE to 100"
SET @@global.ROCKSDB_STATS_RECALC_RATE = 100;
SELECT @@global.ROCKSDB_STATS_RECALC_RATE;
@@global.ROCKSDB_STATS_RECALC_RATE
100
"Setting the global scope variable back to default"
SET @@global.ROCKSDB_STATS_RECALC_RATE = DEFAULT;
SELECT @@global.ROCKSDB_STATS_RECALC_RATE;
@@global.ROCKSDB_STATS_RECALC_RATE
0
"Trying to set variable @@global.ROCKSDB_STATS_RECALC_RATE to 1"
SET @@global.ROCKSDB_STATS_RECALC_RATE = 1;
SELECT @@global.ROCKSDB_STATS_RECALC_RATE;
@@global.ROCKSDB_STATS_RECALC_RATE
1
"Setting the global scope variable back to default"
SET @@global.ROCKSDB_STATS_RECALC_RATE = DEFAULT;
SELECT @@global.ROCKSDB_STATS_RECALC_RATE;
@@global.ROCKSDB_STATS_RECALC_RATE
0
"Trying to set variable @@session.ROCKSDB_STATS_RECALC_RATE to 444. It should fail because it is not session."
SET @@session.ROCKSDB_STATS_RECALC_RATE = 444;
ERROR HY000: Variable 'rocksdb_stats_recalc_rate' is a GLOBAL variable and should be set with SET GLOBAL
'# Testing with invalid values in global scope #'
"Trying to set variable @@global.ROCKSDB_STATS_RECALC_RATE to 'aaa'"
SET @@global.ROCKSDB_STATS_RECALC_RATE = 'aaa';
Got one of the listed errors
SELECT @@global.ROCKSDB_STATS_RECALC_RATE;
@@global.ROCKSDB_STATS_RECALC_RATE
0
"Trying to set variable @@global.ROCKSDB_STATS_RECALC_RATE to '123'"
SET @@global.ROCKSDB_STATS_RECALC_RATE = '123';
Got one of the listed errors
SELECT @@global.ROCKSDB_STATS_RECALC_RATE;
@@global.ROCKSDB_STATS_RECALC_RATE
0
SET @@global.ROCKSDB_STATS_RECALC_RATE = @start_global_value;
SELECT @@global.ROCKSDB_STATS_RECALC_RATE;
@@global.ROCKSDB_STATS_RECALC_RATE
0
DROP TABLE valid_values;
DROP TABLE invalid_values;
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
--source include/have_rocksdb.inc
--source include/have_myisam.inc

# Valid values for rocksdb_stats_recalc_rate: unsigned integers
# (number of indexes recalculated per second; 0 disables recalculation).
CREATE TABLE valid_values (value varchar(255)) ENGINE=myisam;
INSERT INTO valid_values VALUES(100);
INSERT INTO valid_values VALUES(1);

# Invalid values: strings must be rejected by the UINT sysvar check.
CREATE TABLE invalid_values (value varchar(255)) ENGINE=myisam;
INSERT INTO invalid_values VALUES('\'aaa\'');
INSERT INTO invalid_values VALUES('\'123\'');

# Drive the shared sysvar test harness: global-only, writable variable.
--let $sys_var=ROCKSDB_STATS_RECALC_RATE
--let $read_only=0
--let $session=0
--source ../include/rocksdb_sys_var.inc

DROP TABLE valid_values;
DROP TABLE invalid_values;
1 change: 1 addition & 0 deletions mysql-test/suite/rocksdb/r/rocksdb.result
Original file line number Diff line number Diff line change
Expand Up @@ -987,6 +987,7 @@ rocksdb_skip_bloom_filter_on_read OFF
rocksdb_skip_fill_cache OFF
rocksdb_sst_mgr_rate_bytes_per_sec 0
rocksdb_stats_dump_period_sec 600
rocksdb_stats_recalc_rate 0
rocksdb_store_row_debug_checksums OFF
rocksdb_strict_collation_check OFF
rocksdb_strict_collation_exceptions
Expand Down
182 changes: 97 additions & 85 deletions storage/rocksdb/ha_rocksdb.cc
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ static st_io_stall_stats io_stall_stats;
const std::string DEFAULT_CF_NAME("default");
const std::string DEFAULT_SYSTEM_CF_NAME("__system__");
const std::string PER_INDEX_CF_NAME("$per_index_cf");
static std::vector<GL_INDEX_ID> rdb_indexes_to_recalc;

/**
Updates row counters based on the table type and operation type.
Expand Down Expand Up @@ -510,6 +511,7 @@ static my_bool rocksdb_allow_to_start_after_corruption = FALSE;
static uint64_t rocksdb_write_policy =
rocksdb::TxnDBWritePolicy::WRITE_COMMITTED;
static my_bool rocksdb_error_on_suboptimal_collation = FALSE;
static uint32_t rocksdb_stats_recalc_rate = 0;

std::atomic<uint64_t> rocksdb_row_lock_deadlocks(0);
std::atomic<uint64_t> rocksdb_row_lock_wait_timeouts(0);
Expand Down Expand Up @@ -1497,6 +1499,13 @@ static MYSQL_SYSVAR_UINT(
RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0,
/* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0);

static MYSQL_SYSVAR_UINT(
stats_recalc_rate, rocksdb_stats_recalc_rate, PLUGIN_VAR_RQCMDARG,
"The number of indexes per second to recalculate statistics for. 0 to "
"disable background recalculation.",
nullptr, nullptr, 0 /* default value */, 0 /* min value */,
UINT_MAX /* max value */, 0);

static MYSQL_SYSVAR_BOOL(
large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG,
"Support large index prefix length of 3072 bytes. If off, the maximum "
Expand Down Expand Up @@ -1658,6 +1667,7 @@ static struct st_mysql_sys_var *rocksdb_system_variables[] = {
MYSQL_SYSVAR(large_prefix),
MYSQL_SYSVAR(allow_to_start_after_corruption),
MYSQL_SYSVAR(error_on_suboptimal_collation),
MYSQL_SYSVAR(stats_recalc_rate),
nullptr};

static rocksdb::WriteOptions
Expand Down Expand Up @@ -10411,39 +10421,26 @@ int ha_rocksdb::optimize(THD *const thd, HA_CHECK_OPT *const check_opt) {
DBUG_RETURN(HA_EXIT_SUCCESS);
}

int ha_rocksdb::calculate_stats(
const TABLE *const table_arg,
const std::unordered_set<GL_INDEX_ID> &to_recalc) {
static int calculate_stats(
const std::unordered_set<std::shared_ptr<const Rdb_key_def>> &to_recalc,
bool include_memtables) {
DBUG_ENTER_FUNC();

// find per column family key ranges which need to be queried
std::unordered_map<rocksdb::ColumnFamilyHandle *, std::vector<rocksdb::Range>>
ranges;
std::unordered_map<GL_INDEX_ID, Rdb_index_stats> stats;
std::vector<uint> to_recalc_indexes;
std::vector<uchar> buf(table_arg->s->keys * 2 *
Rdb_key_def::INDEX_NUMBER_SIZE);
for (uint i = 0; i < table_arg->s->keys; i++) {
const Rdb_key_def &kd = *m_key_descr_arr[i];
const GL_INDEX_ID index_id = kd.get_gl_index_id();
if (to_recalc.find(index_id) == to_recalc.end()) {
continue;
}
std::vector<uchar> buf(to_recalc.size() * 2 * Rdb_key_def::INDEX_NUMBER_SIZE);

to_recalc_indexes.push_back(i);
}

for (uint i : to_recalc_indexes) {
const auto bufp = &buf[i * 2 * Rdb_key_def::INDEX_NUMBER_SIZE];
const Rdb_key_def &kd = *m_key_descr_arr[i];
const GL_INDEX_ID index_id = kd.get_gl_index_id();
ranges[kd.get_cf()].push_back(get_range(i, bufp));
uchar *bufp = buf.data();
for (const auto &kd : to_recalc) {
const GL_INDEX_ID index_id = kd->get_gl_index_id();
ranges[kd->get_cf()].push_back(myrocks::get_range(*kd, bufp));
bufp += 2 * Rdb_key_def::INDEX_NUMBER_SIZE;

// Initialize the stats to 0. If there are no files that contain
// this gl_index_id, then 0 should be stored for the cached stats.
stats[index_id] = Rdb_index_stats(index_id);
DBUG_ASSERT(kd.get_key_parts() > 0);
stats[index_id].m_distinct_keys_per_prefix.resize(kd.get_key_parts());
DBUG_ASSERT(kd->get_key_parts() > 0);
stats[index_id].m_distinct_keys_per_prefix.resize(kd->get_key_parts());
}

// get RocksDB table properties for these ranges
Expand All @@ -10454,8 +10451,8 @@ int ha_rocksdb::calculate_stats(
it.first, &it.second[0], it.second.size(), &props);
DBUG_ASSERT(props.size() >= old_size);
if (!status.ok()) {
DBUG_RETURN(
rdb_error_to_mysql(status, "Could not access RocksDB properties"));
DBUG_RETURN(ha_rocksdb::rdb_error_to_mysql(
status, "Could not access RocksDB properties"));
}
}

Expand All @@ -10476,7 +10473,7 @@ int ha_rocksdb::calculate_stats(
other SQL tables, it can be that we're only seeing a small fraction
of table's entries (and so we can't update statistics based on that).
*/
if (to_recalc.find(it1.m_gl_index_id) == to_recalc.end()) {
if (stats.find(it1.m_gl_index_id) == stats.end()) {
continue;
}

Expand All @@ -10487,52 +10484,46 @@ int ha_rocksdb::calculate_stats(
num_sst++;
}

// calculate memtable cardinality
Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct);
auto read_opts = rocksdb::ReadOptions();
read_opts.read_tier = rocksdb::ReadTier::kMemtableTier;
for (const uint i : to_recalc_indexes) {
const Rdb_key_def &kd = *m_key_descr_arr[i];

Rdb_index_stats &stat = stats[kd.get_gl_index_id()];

uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
auto r = get_range(i, r_buf);
uint64_t memtableCount;
uint64_t memtableSize;
rdb->GetApproximateMemTableStats(kd.get_cf(), r, &memtableCount,
&memtableSize);
if (memtableCount < (uint64_t)stat.m_rows / 10) {
// skip tables that already have enough stats from SST files to reduce
// overhead and avoid degradation of big tables stats by sampling from
// relatively tiny (less than 10% of full data set) memtable dataset
continue;
}
if (include_memtables) {
// calculate memtable cardinality
Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct);
auto read_opts = rocksdb::ReadOptions();
read_opts.read_tier = rocksdb::ReadTier::kMemtableTier;
for (const auto &kd : to_recalc) {
Rdb_index_stats &stat = stats[kd->get_gl_index_id()];

uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
auto r = myrocks::get_range(*kd, r_buf);
uint64_t memtableCount;
uint64_t memtableSize;
rdb->GetApproximateMemTableStats(kd->get_cf(), r, &memtableCount,
&memtableSize);
if (memtableCount < (uint64_t)stat.m_rows / 10) {
// skip tables that already have enough stats from SST files to reduce
// overhead and avoid degradation of big tables stats by sampling from
// relatively tiny (less than 10% of full data set) memtable dataset
continue;
}

std::unique_ptr<rocksdb::Iterator> it = std::unique_ptr<rocksdb::Iterator>(
rdb->NewIterator(read_opts, kd.get_cf()));
std::unique_ptr<rocksdb::Iterator> it =
std::unique_ptr<rocksdb::Iterator>(
rdb->NewIterator(read_opts, kd->get_cf()));

uchar *first_key;
uint key_size;
if (is_pk(i, table, m_tbl_def)) {
first_key = m_pk_packed_tuple;
} else {
first_key = m_sk_packed_tuple;
}
kd.get_first_key(first_key, &key_size);
rocksdb::Slice first_index_key((const char *)first_key, key_size);
rocksdb::Slice first_index_key((const char *)r_buf,
Rdb_key_def::INDEX_NUMBER_SIZE);

cardinality_collector.Reset();
for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) {
const rocksdb::Slice key = it->key();
if (!kd.covers_key(key)) {
break; // end of this index
}
stat.m_rows++;
cardinality_collector.Reset();
for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) {
const rocksdb::Slice key = it->key();
if (!kd->covers_key(key)) {
break; // end of this index
}
stat.m_rows++;

cardinality_collector.ProcessKey(key, &kd, &stat);
cardinality_collector.ProcessKey(key, kd.get(), &stat);
}
cardinality_collector.AdjustStats(&stat);
}
cardinality_collector.AdjustStats(&stat);
}

// set and persist new stats
Expand All @@ -10551,14 +10542,12 @@ int ha_rocksdb::analyze(THD *const thd, HA_CHECK_OPT *const check_opt) {
DBUG_ENTER_FUNC();

if (table) {
std::unordered_set<GL_INDEX_ID> ids_to_check;
std::unordered_set<std::shared_ptr<const Rdb_key_def>> ids_to_check;
for (uint i = 0; i < table->s->keys; i++) {
const Rdb_key_def &kd = *m_key_descr_arr[i];
const GL_INDEX_ID index_id = kd.get_gl_index_id();
ids_to_check.insert(index_id);
ids_to_check.insert(m_key_descr_arr[i]);
}

int res = calculate_stats(table, ids_to_check);
int res = calculate_stats(ids_to_check, true);
if (res != HA_EXIT_SUCCESS) {
DBUG_RETURN(HA_ADMIN_FAILED);
}
Expand Down Expand Up @@ -11419,18 +11408,6 @@ bool ha_rocksdb::commit_inplace_alter_table(
dict_manager.finish_indexes_operation(
create_index_ids, Rdb_key_def::DDL_CREATE_INDEX_ONGOING);

/*
We need to recalculate the index stats here manually. The reason is that
the secondary index does not exist inside
m_index_num_to_keydef until it is committed to the data dictionary, which
prevents us from updating the stats normally as the ddl_manager cannot
find the proper gl_index_ids yet during adjust_stats calls.
*/
if (calculate_stats(altered_table, create_index_ids)) {
/* Failed to update index statistics, should never happen */
DBUG_ASSERT(0);
}

rdb_drop_idx_thread.signal();
}

Expand Down Expand Up @@ -11939,6 +11916,41 @@ void Rdb_background_thread::run() {
}
}

// Recalculate statistics for indexes.
if (rocksdb_stats_recalc_rate) {
std::unordered_set<std::shared_ptr<const Rdb_key_def>> to_recalc;

if (rdb_indexes_to_recalc.empty()) {
struct Rdb_index_collector : public Rdb_tables_scanner {
int add_table(Rdb_tbl_def *tdef) override {
for (uint i = 0; i < tdef->m_key_count; i++) {
rdb_indexes_to_recalc.push_back(
tdef->m_key_descr_arr[i]->get_gl_index_id());
}
return HA_EXIT_SUCCESS;
}
} collector;
ddl_manager.scan_for_tables(&collector);
}

while (to_recalc.size() < rocksdb_stats_recalc_rate &&
!rdb_indexes_to_recalc.empty()) {
const auto index_id = rdb_indexes_to_recalc.back();
rdb_indexes_to_recalc.pop_back();

std::shared_ptr<const Rdb_key_def> keydef =
ddl_manager.safe_find(index_id);

if (keydef) {
to_recalc.insert(keydef);
}
}

if (!to_recalc.empty()) {
calculate_stats(to_recalc, false);
}
}

// Set the next timestamp for mysql_cond_timedwait() (which ends up calling
// pthread_cond_timedwait()) to wait on.
ts_next_sync.tv_sec = ts.tv_sec + WAKE_UP_INTERVAL;
Expand Down
3 changes: 0 additions & 3 deletions storage/rocksdb/ha_rocksdb.h
Original file line number Diff line number Diff line change
Expand Up @@ -1320,9 +1320,6 @@ class ha_rocksdb : public my_core::handler {
MY_ATTRIBUTE((__warn_unused_result__));
int analyze(THD *const thd, HA_CHECK_OPT *const check_opt) override
MY_ATTRIBUTE((__warn_unused_result__));
int calculate_stats(const TABLE *const table_arg,
const std::unordered_set<GL_INDEX_ID> &to_recalc)
MY_ATTRIBUTE((__warn_unused_result__));

enum_alter_inplace_result check_if_supported_inplace_alter(
TABLE *altered_table,
Expand Down
Loading

0 comments on commit 687f8d2

Please sign in to comment.