From 8cd3673d10d71e5ad38e859a42981e4434feca5a Mon Sep 17 00:00:00 2001
From: Yi Zhang
Date: Sat, 12 Dec 2020 23:47:51 -0800
Subject: [PATCH] Improve count(*) performance and fix index merge bug

Upstream commit ID : fb-mysql-5.6.35/25b42b5d4365b6d2deb6bf40da9e776cd2e56698
PS-7780 : Merge fb-prod202101

Summary:
Today in `SELECT count(*)` MyRocks would still decode every single column due
to this check, despite the read set being empty:

```
  // bitmap is cleared on index merge, but it still needs to decode columns
  bool field_requested =
      decode_all_fields || m_verify_row_debug_checksums ||
      bitmap_is_set(field_map, m_table->field[i]->field_index);
```

As a result, MyRocks is significantly slower than InnoDB in this particular
scenario.

It turns out that during index merge, when it tries to reset, it calls
ha_index_init with an empty column_bitmap, so our field decoders didn't know
they needed to decode anything, and the entire query would return nothing.
This is discussed in
[this commit](https://github.com/facebook/mysql-5.6/commit/70f2bcd777b0fb58b72e9dbe5e5477e34248d7dd),
[issue 624](https://github.com/facebook/mysql-5.6/issues/624) and
[PR 626](https://github.com/facebook/mysql-5.6/pull/626). The workaround we
had at the time was to simply treat an empty map as implicitly meaning
"decode everything", and the side effect was a massively slowed-down
count(*).

We have a few options to address this:

1. Fix the index merge optimizer - looking at the code in
QUICK_RANGE_SELECT::init_ror_merged_scan, it actually fixes up column_bitmap
properly, but only after init/reset, so the fix would simply be moving the
bitmap setup code up. For secondary keys, prepare_for_position will
automatically call `mark_columns_used_by_index_no_reset(s->primary_key,
read_set)` if HA_PRIMARY_KEY_REQUIRED_FOR_POSITION is set (true for both
InnoDB and MyRocks), so we would know correctly that we need to unpack the
PK when walking the SK during index merge.

2. Override `column_bitmaps_signal` and set up decoders whenever the bitmap
changes - however, this doesn't work by itself. Because no storage engine
today actually uses handler::column_bitmaps_signal, this path hasn't been
tested properly in index merge. In this case,
QUICK_RANGE_SELECT::init_ror_merged_scan should call
set_column_bitmaps_no_signal to avoid resetting the correct read/write set
of head, since head is used both as the first handler (reuse_handler=true)
and as a placeholder for subsequent read/write set updates
(reuse_handler=false).

3. Follow InnoDB's solution - InnoDB delays actually initializing its
template until index_read is called the second time (relying on
`prebuilt->sql_stat_start`), and by the time index_read runs,
`QUICK_RANGE_SELECT::column_bitmap` has already been fixed up and the table
read/write set has been switched to it, so the new template is built
correctly.

In order to make the change easier to maintain and port, after discussing
with Manuel, I'm going with a simplified version of #3 that delays decoder
creation until the first read operation (index_*, rnd_*, read_range_*,
multi_range_read_*), setting the delay flag in index_init / rnd_init /
multi_range_read_init.

Also, I ran into a bug with truncation_partition where Rdb_converter's
tbl_def is stale (we only update ha_rocksdb::m_tbl_def). Previously this was
fine because it was not used after table open, but my change moves the
lookup_bitmap initialization into Rdb_converter, which takes a dependency on
Rdb_converter::m_tbl_def, so now we need to reset it properly.
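To make the chosen approach concrete, here is a minimal, self-contained C++
sketch of the delayed-decoder pattern (a sketch only: `Handler`, `ReadSet`,
`init_scan`, and `read_row` are illustrative stand-ins, not the actual
MyRocks API). The init call merely marks the decoders as stale; they are
built from whatever read set is current at the first read, after index merge
has had a chance to fix up the bitmap:

```
#include <cstdio>
#include <vector>

// Illustrative stand-ins; not the real MyRocks classes or signatures.
struct ReadSet {
  std::vector<bool> bits;  // one bit per column
  bool is_set(size_t i) const { return bits[i]; }
};

class Handler {
 public:
  // index_init()/rnd_init() analogue: only record that decoders are stale.
  void init_scan() { m_need_build_decoder = true; }

  // The optimizer (e.g. index merge) may swap in the fixed-up bitmap
  // between init and the first read; lazy building picks up the final one.
  void set_read_set(const ReadSet *rs) { m_read_set = rs; }

  // index_read()/rnd_next() analogue: build decoders on first use only.
  void read_row() {
    if (m_need_build_decoder) {
      build_decoder();
      m_need_build_decoder = false;
    }
    std::printf("decoding %zu column(s)\n", m_decoders.size());
  }

 private:
  void build_decoder() {
    m_decoders.clear();
    for (size_t i = 0; i < m_read_set->bits.size(); ++i)
      if (m_read_set->is_set(i)) m_decoders.push_back(i);
  }

  const ReadSet *m_read_set = nullptr;
  bool m_need_build_decoder = false;
  std::vector<size_t> m_decoders;  // column indexes we actually decode
};

int main() {
  ReadSet empty{{false, false, false}};  // what the reset path passes in
  ReadSet fixed{{true, false, true}};    // bitmap after index merge fix-up

  Handler h;
  h.set_read_set(&empty);
  h.init_scan();           // decoders not built yet
  h.set_read_set(&fixed);  // fix-up happens after init, before first read
  h.read_row();            // builds decoders now: 2 columns, not all 3
}
```

With the old eager setup the decoders would have been built from the empty
bitmap (hence the decode-everything workaround); with the lazy flag they are
built from the fixed-up one.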
Reviewed By: lth

Differential Revision: D25521518

fbshipit-source-id: a46ed3e71fa
---
 storage/rocksdb/ha_rocksdb.cc    | 75 ++++++++++++++++++++++----------
 storage/rocksdb/ha_rocksdb.h     | 17 ++++----
 storage/rocksdb/rdb_converter.cc | 30 ++++++++++---
 storage/rocksdb/rdb_converter.h  | 18 +++++---
 4 files changed, 97 insertions(+), 43 deletions(-)

diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc
index 70811f65b8c6..ca47f2ba1665 100644
--- a/storage/rocksdb/ha_rocksdb.cc
+++ b/storage/rocksdb/ha_rocksdb.cc
@@ -5982,13 +5982,12 @@ ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
       m_lock_rows(RDB_LOCK_NONE),
       m_keyread_only(false),
       m_insert_with_update(false),
-      m_dup_key_found(false)
+      m_dup_key_found(false),
 #if defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
-      ,
-      m_in_rpl_delete_rows(false),
-      m_in_rpl_update_rows(false)
+      m_in_rpl_delete_rows(false),
+      m_in_rpl_update_rows(false),
 #endif  // defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
-{
+      m_need_build_decoder(false) {
 }
 
 ha_rocksdb::~ha_rocksdb() {
@@ -7622,7 +7621,7 @@ int ha_rocksdb::create(const char *const name, TABLE *const table_arg,
     HA_EXIT_SUCCESS  OK
     other            HA_ERR error code (can be SE-specific)
 */
-int ha_rocksdb::truncate_table(Rdb_tbl_def *tbl_def_arg, const TABLE *table_arg,
+int ha_rocksdb::truncate_table(Rdb_tbl_def *tbl_def_arg, TABLE *table_arg,
                                ulonglong auto_increment_value) {
   DBUG_ENTER_FUNC();
 
@@ -7708,6 +7707,7 @@ int ha_rocksdb::truncate_table(Rdb_tbl_def *tbl_def_arg, const TABLE *table_arg,
 
   /* Update the local m_tbl_def reference */
   m_tbl_def = ddl_manager.find(orig_tablename);
+  m_converter.reset(new Rdb_converter(ha_thd(), m_tbl_def, table_arg));
   DBUG_RETURN(err);
 }
 
@@ -8018,8 +8018,9 @@ int ha_rocksdb::read_row_from_secondary_key(uchar *const buf,
 #endif  // !defined(DBUG_OFF)
 
   DBUG_EXECUTE_IF("dbug.rocksdb.HA_EXTRA_KEYREAD", { m_keyread_only = true; });
-  bool covered_lookup = (m_keyread_only && kd.can_cover_lookup()) ||
-                        kd.covers_lookup(&value, &m_lookup_bitmap);
+  bool covered_lookup =
+      (m_keyread_only && kd.can_cover_lookup()) ||
+      kd.covers_lookup(&value, m_converter->get_lookup_bitmap());
 
 #if !defined(DBUG_OFF)
   m_keyread_only = save_keyread_only;
@@ -8156,7 +8157,8 @@ int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
       rocksdb::Slice value = m_scan_it->value();
       bool covered_lookup =
           (m_keyread_only && m_key_descr_arr[keyno]->can_cover_lookup()) ||
-          m_key_descr_arr[keyno]->covers_lookup(&value, &m_lookup_bitmap);
+          m_key_descr_arr[keyno]->covers_lookup(
+              &value, m_converter->get_lookup_bitmap());
       if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
         rc = m_key_descr_arr[keyno]->unpack_record(
             table, buf, &key, &value,
@@ -8192,6 +8194,8 @@ int ha_rocksdb::read_range_first(const key_range *const start_key,
                                  bool eq_range_arg, bool sorted) {
   DBUG_ENTER_FUNC();
 
+  check_build_decoder();
+
   int result;
 
   eq_range = eq_range_arg;
@@ -8239,6 +8243,8 @@ int ha_rocksdb::index_read_map(uchar *const buf, const uchar *const key,
                                enum ha_rkey_function find_flag) {
   DBUG_ENTER_FUNC();
 
+  check_build_decoder();
+
   DBUG_RETURN(index_read_map_impl(buf, key, keypart_map, find_flag, nullptr));
 }
 
@@ -8536,6 +8542,8 @@ int ha_rocksdb::index_read_last_map(uchar *const buf, const uchar *const key,
                                     key_part_map keypart_map) {
   DBUG_ENTER_FUNC();
 
+  check_build_decoder();
+
   DBUG_RETURN(index_read_map(buf, key, keypart_map, HA_READ_PREFIX_LAST));
 }
 
@@ -8875,6 +8883,8 @@ int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
 int ha_rocksdb::index_next(uchar *const buf) {
   DBUG_ENTER_FUNC();
 
+  check_build_decoder();
+
   bool moves_forward = true;
   ha_statistic_increment(&SSV::ha_read_next_count);
   if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
@@ -8895,6 +8905,8 @@ int ha_rocksdb::index_next(uchar *const buf) {
 int ha_rocksdb::index_prev(uchar *const buf) {
   DBUG_ENTER_FUNC();
 
+  check_build_decoder();
+
   bool moves_forward = false;
   ha_statistic_increment(&SSV::ha_read_prev_count);
   if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
@@ -8957,6 +8969,8 @@ int ha_rocksdb::index_next_with_direction(uchar *const buf, bool move_forward) {
 int ha_rocksdb::index_first(uchar *const buf) {
   DBUG_ENTER_FUNC();
 
+  check_build_decoder();
+
   m_sk_match_prefix = nullptr;
   ha_statistic_increment(&SSV::ha_read_first_count);
   int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
@@ -8975,6 +8989,8 @@ int ha_rocksdb::index_first(uchar *const buf) {
 int ha_rocksdb::index_last(uchar *const buf) {
   DBUG_ENTER_FUNC();
 
+  check_build_decoder();
+
   m_sk_match_prefix = nullptr;
   ha_statistic_increment(&SSV::ha_read_last_count);
   int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
@@ -10363,6 +10379,9 @@ void ha_rocksdb::setup_iterator_for_rnd_scan() {
 int ha_rocksdb::rnd_init(bool scan) {
   DBUG_ENTER_FUNC();
 
+  m_need_build_decoder = true;
+  active_index = table->s->primary_key;
+
   THD *thd = ha_thd();
   if (thd && thd->killed) {
     DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
@@ -10370,10 +10389,6 @@ int ha_rocksdb::rnd_init(bool scan) {
 
   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
 
-  // when this table is being updated, decode all fields
-  m_converter->setup_field_decoders(table->read_set,
-                                    m_lock_rows == RDB_LOCK_WRITE);
-
   if (scan) {
     m_rnd_scan_is_new_snapshot = !tx->has_snapshot();
     setup_iterator_for_rnd_scan();
@@ -10397,6 +10412,8 @@
 int ha_rocksdb::rnd_next(uchar *const buf) {
   DBUG_ENTER_FUNC();
 
+  check_build_decoder();
+
   int rc;
   ha_statistic_increment(&SSV::ha_read_rnd_next_count);
   for (;;) {
@@ -10532,11 +10549,26 @@ int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) {
 int ha_rocksdb::rnd_end() {
   DBUG_ENTER_FUNC();
 
+  m_need_build_decoder = false;
+
   release_scan_iterator();
 
   DBUG_RETURN(HA_EXIT_SUCCESS);
 }
 
+void ha_rocksdb::build_decoder() {
+  m_converter->setup_field_decoders(table->read_set, active_index,
+                                    m_keyread_only,
+                                    m_lock_rows == RDB_LOCK_WRITE);
+}
+
+void ha_rocksdb::check_build_decoder() {
+  if (m_need_build_decoder) {
+    build_decoder();
+    m_need_build_decoder = false;
+  }
+}
+
 /**
   @return
     HA_EXIT_SUCCESS  OK
@@ -10545,6 +10577,9 @@ int ha_rocksdb::rnd_end() {
 int ha_rocksdb::index_init(uint idx, bool sorted) {
   DBUG_ENTER_FUNC();
 
+  m_need_build_decoder = true;
+  active_index = idx;
+
   THD *thd = ha_thd();
   if (thd && thd->killed) {
     DBUG_RETURN(HA_ERR_QUERY_INTERRUPTED);
@@ -10553,21 +10588,11 @@ int ha_rocksdb::index_init(uint idx, bool sorted) {
   Rdb_transaction *const tx = get_or_create_tx(table->in_use);
   DBUG_ASSERT(tx != nullptr);
 
-  // when this table is being updated, decode all fields
-  m_converter->setup_field_decoders(table->read_set,
-                                    m_lock_rows == RDB_LOCK_WRITE);
-
-  if (!m_keyread_only) {
-    m_key_descr_arr[idx]->get_lookup_bitmap(table, &m_lookup_bitmap);
-  }
-
   // If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
   // when accessing the index, so don't acquire the snapshot right away.
   // Otherwise acquire the snapshot immediately.
   tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
 
-  active_index = idx;
-
   DBUG_RETURN(HA_EXIT_SUCCESS);
 }
 
@@ -10578,9 +10603,9 @@ int ha_rocksdb::index_init(uint idx, bool sorted) {
 int ha_rocksdb::index_end() {
   DBUG_ENTER_FUNC();
 
-  release_scan_iterator();
+  m_need_build_decoder = false;
 
-  bitmap_free(&m_lookup_bitmap);
+  release_scan_iterator();
 
   active_index = MAX_KEY;
   in_range_check_pushed_down = false;
@@ -10901,6 +10926,8 @@ void ha_rocksdb::position(const uchar *const record) {
 int ha_rocksdb::rnd_pos(uchar *const buf, uchar *const pos) {
   DBUG_ENTER_FUNC();
 
+  check_build_decoder();
+
   int rc;
   size_t len;
 
diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h
index d6f29c577e99..55cd701c210d 100644
--- a/storage/rocksdb/ha_rocksdb.h
+++ b/storage/rocksdb/ha_rocksdb.h
@@ -373,13 +373,6 @@ class ha_rocksdb : public my_core::handler {
 
   void set_last_rowkey(const uchar *const old_data);
 
-  /*
-    For the active index, indicates which columns must be covered for the
-    current lookup to be covered. If the bitmap field is null, that means this
-    index does not cover the current lookup for any record.
-  */
-  MY_BITMAP m_lookup_bitmap = {nullptr, 0, 0, nullptr, nullptr};
-
   int alloc_key_buffers(const TABLE *const table_arg,
                         const Rdb_tbl_def *const tbl_def_arg,
                         bool alloc_alter_buffers = false)
@@ -925,7 +918,7 @@ class ha_rocksdb : public my_core::handler {
       MY_ATTRIBUTE((__warn_unused_result__));
   int create_table(const std::string &table_name, const TABLE *table_arg,
                    ulonglong auto_increment_value);
-  int truncate_table(Rdb_tbl_def *tbl_def, const TABLE *table_arg,
+  int truncate_table(Rdb_tbl_def *tbl_def, TABLE *table_arg,
                      ulonglong auto_increment_value);
   bool check_if_incompatible_data(HA_CREATE_INFO *const info,
                                   uint table_changes) override
@@ -981,6 +974,9 @@ class ha_rocksdb : public my_core::handler {
   int adjust_handler_stats_sst_and_memtable();
   int adjust_handler_stats_table_scan();
 
+  void build_decoder();
+  void check_build_decoder();
+
 #if defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
  public:
   virtual void rpl_before_delete_rows() override;
@@ -996,6 +992,11 @@ class ha_rocksdb : public my_core::handler {
   bool m_in_rpl_delete_rows;
   bool m_in_rpl_update_rows;
 #endif  // defined(ROCKSDB_INCLUDE_RFR) && ROCKSDB_INCLUDE_RFR
+
+  bool m_force_skip_unique_check;
+
+  /* Need to build decoder on next read operation */
+  bool m_need_build_decoder;
 };
 
 /*
diff --git a/storage/rocksdb/rdb_converter.cc b/storage/rocksdb/rdb_converter.cc
index 93d1955ecbdc..833f14039057 100644
--- a/storage/rocksdb/rdb_converter.cc
+++ b/storage/rocksdb/rdb_converter.cc
@@ -309,6 +309,7 @@ Rdb_converter::Rdb_converter(const THD *thd, const Rdb_tbl_def *tbl_def,
   m_row_checksums_checked = 0;
   m_null_bytes = nullptr;
   setup_field_encoders();
+  m_lookup_bitmap = {nullptr, 0, 0, nullptr, nullptr};
 }
 
 Rdb_converter::~Rdb_converter() {
@@ -316,6 +317,7 @@ Rdb_converter::~Rdb_converter() {
   m_encoder_arr = nullptr;
   // These are needed to suppress valgrind errors in rocksdb.partition
   m_storage_record.mem_free();
+  bitmap_free(&m_lookup_bitmap);
 }
 
 /*
@@ -340,30 +342,29 @@ void Rdb_converter::get_storage_type(Rdb_field_encoder *const encoder,
   Setup which fields will be unpacked when reading rows
 
   @detail
-    Three special cases when we still unpack all fields:
+    Two special cases when we still unpack all fields:
     - When client requires decode_all_fields, such as this table is being
      updated (m_lock_rows==RDB_LOCK_WRITE).
    - When @@rocksdb_verify_row_debug_checksums is ON (In this mode, we need
      to read all fields to find whether there is a row checksum at the end.
      We could skip the fields instead of decoding them, but currently we do
      decoding.)
-    - On index merge as bitmap is cleared during that operation
 
   @seealso
     Rdb_converter::setup_field_encoders()
     Rdb_converter::convert_record_from_storage_format()
 */
 void Rdb_converter::setup_field_decoders(const MY_BITMAP *field_map,
+                                         uint active_index, bool keyread_only,
                                          bool decode_all_fields) {
   m_key_requested = false;
   m_decoders_vect.clear();
+  bitmap_free(&m_lookup_bitmap);
   int last_useful = 0;
   int skip_size = 0;
 
   for (uint i = 0; i < m_table->s->fields; i++) {
-    // bitmap is cleared on index merge, but it still needs to decode columns
     bool field_requested =
         decode_all_fields || m_verify_row_debug_checksums ||
-        bitmap_is_clear_all(field_map) ||
         bitmap_is_set(field_map, m_table->field[i]->field_index);
 
     // We only need the decoder if the whole record is stored.
@@ -399,6 +400,11 @@ void Rdb_converter::setup_field_decoders(const MY_BITMAP *field_map,
   // skipping. Remove them.
   m_decoders_vect.erase(m_decoders_vect.begin() + last_useful,
                         m_decoders_vect.end());
+
+  if (!keyread_only && active_index != m_table->s->primary_key) {
+    m_tbl_def->m_key_descr_arr[active_index]->get_lookup_bitmap(
+        m_table, &m_lookup_bitmap);
+  }
 }
 
 void Rdb_converter::setup_field_encoders() {
@@ -510,7 +516,8 @@ void Rdb_converter::setup_field_encoders() {
 int Rdb_converter::decode(const std::shared_ptr<Rdb_key_def> &key_def,
                           uchar *dst,  // address to fill data
                           const rocksdb::Slice *key_slice,
-                          const rocksdb::Slice *value_slice) {
+                          const rocksdb::Slice *value_slice,
+                          bool decode_value) {
   // Currently only support decode primary key, Will add decode secondary later
   DBUG_ASSERT(key_def->m_index_type == Rdb_key_def::INDEX_TYPE_PRIMARY ||
               key_def->m_index_type == Rdb_key_def::INDEX_TYPE_HIDDEN_PRIMARY);
@@ -587,7 +594,13 @@ int Rdb_converter::decode_value_header(
 int Rdb_converter::convert_record_from_storage_format(
     const std::shared_ptr<Rdb_key_def> &pk_def,
     const rocksdb::Slice *const key_slice,
-    const rocksdb::Slice *const value_slice, uchar *const dst) {
+    const rocksdb::Slice *const value_slice, uchar *const dst,
+    bool decode_value) {
+  bool skip_value = !decode_value || get_decode_fields()->size() == 0;
+  if (!m_key_requested && skip_value) {
+    return HA_EXIT_SUCCESS;
+  }
+
   int err = HA_EXIT_SUCCESS;
 
   Rdb_string_reader value_slice_reader(value_slice);
@@ -609,6 +622,11 @@ int Rdb_converter::convert_record_from_storage_format(
     }
   }
 
+  if (skip_value) {
+    // We are done
+    return HA_EXIT_SUCCESS;
+  }
+
   Rdb_value_field_iterator<Rdb_convert_to_record_value_decoder>
       value_field_iterator(m_table, &value_slice_reader, this, dst);
 
diff --git a/storage/rocksdb/rdb_converter.h b/storage/rocksdb/rdb_converter.h
index 58df7c1c2022..52e107cee1b2 100644
--- a/storage/rocksdb/rdb_converter.h
+++ b/storage/rocksdb/rdb_converter.h
@@ -135,12 +135,12 @@ class Rdb_converter {
   Rdb_converter &operator=(const Rdb_converter &decoder) = delete;
   ~Rdb_converter();
 
-  void setup_field_decoders(const MY_BITMAP *field_map,
-                            bool decode_all_fields = false);
+  void setup_field_decoders(const MY_BITMAP *field_map, uint active_index,
+                            bool keyread_only, bool decode_all_fields = false);
 
   int decode(const std::shared_ptr<Rdb_key_def> &key_def, uchar *dst,
-             const rocksdb::Slice *key_slice,
-             const rocksdb::Slice *value_slice);
+             const rocksdb::Slice *key_slice, const rocksdb::Slice *value_slice,
+             bool decode_value = true);
 
   int encode_value_slice(const std::shared_ptr<Rdb_key_def> &pk_def,
                         const rocksdb::Slice &pk_packed_slice,
@@ -173,6 +173,8 @@ class Rdb_converter {
     return &m_decoders_vect;
  }
 
+  const MY_BITMAP *get_lookup_bitmap() { return &m_lookup_bitmap; }
+
  private:
   int decode_value_header(Rdb_string_reader *reader,
                           const std::shared_ptr<Rdb_key_def> &pk_def,
@@ -185,7 +187,7 @@ class Rdb_converter {
   int convert_record_from_storage_format(
       const std::shared_ptr<Rdb_key_def> &pk_def,
       const rocksdb::Slice *const key, const rocksdb::Slice *const value,
-      uchar *const buf);
+      uchar *const buf, bool decode_value = true);
 
   int verify_row_debug_checksum(const std::shared_ptr<Rdb_key_def> &pk_def,
                                 Rdb_string_reader *reader,
@@ -242,5 +244,11 @@ class Rdb_converter {
   my_core::ha_rows m_row_checksums_checked;
   // buffer to hold data during encode_value_slice
   String m_storage_record;
+  /*
+    For the active index, indicates which columns must be covered for the
+    current lookup to be covered. If the bitmap field is null, that means this
+    index does not cover the current lookup for any record.
+  */
+  MY_BITMAP m_lookup_bitmap;
 };
 }  // namespace myrocks
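As a final illustration of why the new decode_value / skip-value path makes
`SELECT count(*)` cheap, here is a hedged, standalone model of the early
return added in convert_record_from_storage_format above (the `Converter`
type and its members are simplified stand-ins for the real Rdb_converter):

```
#include <cstdio>
#include <vector>

// Simplified stand-in for Rdb_converter; not the real class or API.
struct Converter {
  bool key_requested = false;  // m_key_requested analogue
  std::vector<int> decoders;   // m_decoders_vect analogue

  int convert_record(bool decode_value) {
    // When the caller does not want the value decoded (or nothing in the
    // read set maps to a value field) and no key column was requested,
    // the row can be acknowledged without unpacking anything.
    bool skip_value = !decode_value || decoders.empty();
    if (!key_requested && skip_value) {
      return 0;  // HA_EXIT_SUCCESS analogue: row exists, nothing to decode
    }
    // ... unpack key and/or value fields here ...
    return 0;
  }
};

int main() {
  Converter c;  // SELECT count(*): empty read set, so no decoders were built
  std::printf("rc=%d (row counted without decoding)\n",
              c.convert_record(/*decode_value=*/true));
}
```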