From 2b72e946688935553c9a7648a9b2a916a71b87e7 Mon Sep 17 00:00:00 2001 From: Luqun Lou Date: Wed, 27 Mar 2019 22:24:18 -0700 Subject: [PATCH] Simplify TTL value fetch for ttl column Upstream commit ID : fb-mysql-5.6.35/77032004ad23d21a4c386f8136ecfbb071ea42d6 PS-6865 : Merge fb-prod201903 Summary: Currently, during the primary key's value encode, its ttl value can come from one of these 3 cases: 1. ttl column in primary key 2. non-ttl column a. old record (update case) b. current timestamp 3. ttl column in non-key field Workflow #1: first, in Rdb_key_def::pack_record(), find and store pk_offset; then, in value encode, try to parse the key slice to fetch the ttl value by using pk_offset. Workflow #3: fetch the ttl value from the ttl column. The change is to merge #1 and #3 by always fetching the TTL value from the ttl column, no matter whether the ttl column is in the primary key or not. Of course, remove pk_offset, since it is no longer used. BTW, for secondary keys, the ttl value always comes from m_ttl_bytes, which is stored by the primary value encoding. 
Reviewed By: yizhang82 Differential Revision: D14662716 fbshipit-source-id: 6b4e5f044fd --- storage/rocksdb/ha_rocksdb.cc | 95 ++++++++++++---------------------- storage/rocksdb/ha_rocksdb.h | 10 ---- storage/rocksdb/rdb_datadic.cc | 41 ++++++--------- storage/rocksdb/rdb_datadic.h | 7 ++- 4 files changed, 52 insertions(+), 101 deletions(-) diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index 5f0a92b2b813..e24a014d10ad 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -5649,7 +5649,6 @@ int ha_rocksdb::convert_record_to_storage_format( Rdb_string_writer *const pk_unpack_info = row_info.new_pk_unpack_info; bool has_ttl = m_pk_descr->has_ttl(); bool has_ttl_column = !m_pk_descr->m_ttl_column.empty(); - bool ttl_in_pk = has_ttl_column && (row_info.ttl_pk_offset != UINT_MAX); int null_bytes_in_record = m_converter->get_null_bytes_in_record(); m_storage_record.length(0); @@ -5657,40 +5656,40 @@ int ha_rocksdb::convert_record_to_storage_format( if (has_ttl) { /* If it's a TTL record, reserve space for 8 byte TTL value in front. */ m_storage_record.fill(ROCKSDB_SIZEOF_TTL_RECORD + null_bytes_in_record, 0); + // NOTE: m_ttl_bytes_updated is only used for update case + // During update, skip update sk key/values slice iff none of sk fields + // have changed and ttl bytes isn't changed. see + // ha_rocksdb::update_write_sk() for more info m_ttl_bytes_updated = false; /* - If the TTL is contained within the key, we use the offset to find the - TTL value and place it in the beginning of the value record. + If the TTL is contained within table columns, we use the field index to + find the TTL value and place it in the beginning of the value record. 
*/ - if (ttl_in_pk) { - Rdb_string_reader reader(&pk_packed_slice); - const char *ts; - if (!reader.read(row_info.ttl_pk_offset) || - !(ts = reader.read(ROCKSDB_SIZEOF_TTL_RECORD))) { - std::string buf; - buf = rdb_hexdump(pk_packed_slice.data(), pk_packed_slice.size(), - RDB_MAX_HEXDUMP_LEN); - const GL_INDEX_ID gl_index_id = m_pk_descr->get_gl_index_id(); - LogPluginErrMsg(ERROR_LEVEL, 0, - "Decoding ttl from PK failed during insert, for index " - "(%u,%u), key: %s", - gl_index_id.cf_id, gl_index_id.index_id, buf.c_str()); - return HA_EXIT_FAILURE; - } + if (has_ttl_column) { + uint ttl_field_index = m_pk_descr->get_ttl_field_index(); + DBUG_ASSERT(ttl_field_index != UINT_MAX); + + Field *const field = table->field[ttl_field_index]; + DBUG_ASSERT(field->pack_length_in_rec() == ROCKSDB_SIZEOF_TTL_RECORD); + DBUG_ASSERT(field->real_type() == MYSQL_TYPE_LONGLONG); char *const data = const_cast(m_storage_record.ptr()); - memcpy(data, ts, ROCKSDB_SIZEOF_TTL_RECORD); + uint64 ts = uint8korr(field->ptr); #if !defined(DBUG_OFF) - // Adjust for test case if needed - rdb_netbuf_store_uint64( - reinterpret_cast(data), - rdb_netbuf_to_uint64(reinterpret_cast(data)) + - rdb_dbug_set_ttl_rec_ts()); + ts += rdb_dbug_set_ttl_rec_ts(); #endif // !defined(DBUG_OFF) - // Also store in m_ttl_bytes to propagate to update_write_sk + rdb_netbuf_store_uint64(reinterpret_cast(data), ts); + + // If this is an update and the timestamp has been updated, take note + // so we can avoid updating SKs unnecessarily. + if (!row_info.old_pk_slice.empty()) { + m_ttl_bytes_updated = + memcmp(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD); + } + // Store timestamp in m_ttl_bytes to propagate to update_write_sk memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD); - } else if (!has_ttl_column) { + } else { /* For implicitly generated TTL records we need to copy over the old TTL value from the old record in the event of an update. 
It was stored @@ -5773,35 +5772,8 @@ int ha_rocksdb::convert_record_to_storage_format( field_var->length_bytes + data_len); } else { /* Copy the field data */ - const uint len = field->pack_length_in_rec(); - m_storage_record.append(reinterpret_cast(field->ptr), len); - - /* - Check if this is the TTL field within the table, if so store the TTL - in the front of the record as well here. - */ - if (has_ttl && has_ttl_column && - i == m_pk_descr->get_ttl_field_offset()) { - DBUG_ASSERT(len == ROCKSDB_SIZEOF_TTL_RECORD); - DBUG_ASSERT(field->real_type() == MYSQL_TYPE_LONGLONG); - DBUG_ASSERT(m_pk_descr->get_ttl_field_offset() != UINT_MAX); - - char *const data = const_cast(m_storage_record.ptr()); - uint64 ts = uint8korr(field->ptr); -#if !defined(DBUG_OFF) - ts += rdb_dbug_set_ttl_rec_ts(); -#endif // !defined(DBUG_OFF) - rdb_netbuf_store_uint64(reinterpret_cast(data), ts); - - // If this is an update and the timestamp has been updated, take note - // so we can avoid updating SKs unnecessarily. 
- if (!row_info.old_pk_slice.empty()) { - m_ttl_bytes_updated = - memcmp(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD); - } - // Store timestamp in m_ttl_bytes to propagate to update_write_sk - memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD); - } + m_storage_record.append(reinterpret_cast(field->ptr), + field->pack_length_in_rec()); } } @@ -8799,10 +8771,9 @@ int ha_rocksdb::get_pk_for_update(struct update_row_info *const row_info) { row_info->new_pk_unpack_info = &m_pk_unpack_info; - size = - m_pk_descr->pack_record(table, m_pack_buffer, row_info->new_data, - m_pk_packed_tuple, row_info->new_pk_unpack_info, - false, 0, 0, nullptr, &row_info->ttl_pk_offset); + size = m_pk_descr->pack_record( + table, m_pack_buffer, row_info->new_data, m_pk_packed_tuple, + row_info->new_pk_unpack_info, false, 0, 0, nullptr); } else if (row_info->old_data == nullptr) { row_info->hidden_pk_id = update_hidden_pk_val(); size = @@ -9314,14 +9285,14 @@ int ha_rocksdb::update_write_sk(const TABLE *const table_arg, new_packed_size = kd.pack_record(table_arg, m_pack_buffer, row_info.new_data, m_sk_packed_tuple, &m_sk_tails, store_row_debug_checksums, - row_info.hidden_pk_id, 0, nullptr, nullptr, m_ttl_bytes); + row_info.hidden_pk_id, 0, nullptr, m_ttl_bytes); if (row_info.old_data != nullptr) { // The old value old_packed_size = kd.pack_record( table_arg, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old, &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id, 0, - nullptr, nullptr, m_ttl_bytes); + nullptr, m_ttl_bytes); /* Check if we are going to write the same value. 
This can happen when @@ -11763,7 +11734,7 @@ int ha_rocksdb::inplace_populate_sk( const int new_packed_size = index->pack_record( new_table_arg, m_pack_buffer, table->record[0], m_sk_packed_tuple, &m_sk_tails, should_store_row_debug_checksums(), hidden_pk_id, 0, - nullptr, nullptr, m_ttl_bytes); + nullptr, m_ttl_bytes); const rocksdb::Slice key = rocksdb::Slice( reinterpret_cast(m_sk_packed_tuple), new_packed_size); diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h index e652ebe036eb..59c8d396f153 100644 --- a/storage/rocksdb/ha_rocksdb.h +++ b/storage/rocksdb/ha_rocksdb.h @@ -643,16 +643,6 @@ class ha_rocksdb : public my_core::handler { longlong hidden_pk_id; bool skip_unique_check; - - // In certain cases, TTL is enabled on a table, as well as an explicit TTL - // column. The TTL column can be part of either the key or the value part - // of the record. If it is part of the key, we store the offset here. - // - // Later on, we use this offset to store the TTL in the value part of the - // record, which we can then access in the compaction filter. - // - // Set to UINT_MAX by default to indicate that the TTL is not in key. 
- uint ttl_pk_offset = UINT_MAX; }; /* diff --git a/storage/rocksdb/rdb_datadic.cc b/storage/rocksdb/rdb_datadic.cc index 85e068647c09..362b058e044f 100644 --- a/storage/rocksdb/rdb_datadic.cc +++ b/storage/rocksdb/rdb_datadic.cc @@ -95,7 +95,7 @@ Rdb_key_def::Rdb_key_def(uint indexnr_arg, uint keyno_arg, m_ttl_rec_offset(ttl_rec_offset), m_ttl_duration(ttl_duration), m_ttl_column(""), m_pk_part_no(nullptr), m_pack_info(nullptr), m_keyno(keyno_arg), m_key_parts(0), m_ttl_pk_key_part_offset(UINT_MAX), - m_ttl_field_offset(UINT_MAX), m_prefix_extractor(nullptr), + m_ttl_field_index(UINT_MAX), m_prefix_extractor(nullptr), m_maxlength(0) // means 'not intialized' { mysql_mutex_init(0, &m_mutex, MY_MUTEX_INIT_FAST); @@ -120,7 +120,7 @@ Rdb_key_def::Rdb_key_def(const Rdb_key_def &k) m_ttl_column(k.m_ttl_column), m_pk_part_no(k.m_pk_part_no), m_pack_info(nullptr), m_keyno(k.m_keyno), m_key_parts(k.m_key_parts), m_ttl_pk_key_part_offset(k.m_ttl_pk_key_part_offset), - m_ttl_field_offset(UINT_MAX), m_prefix_extractor(k.m_prefix_extractor), + m_ttl_field_index(UINT_MAX), m_prefix_extractor(k.m_prefix_extractor), m_maxlength(k.m_maxlength) { mysql_mutex_init(0, &m_mutex, MY_MUTEX_INIT_FAST); rdb_netbuf_store_index(m_index_number_storage_form, m_index_number); @@ -250,7 +250,7 @@ void Rdb_key_def::setup(const TABLE *const tbl, table creation. 
*/ Rdb_key_def::extract_ttl_col(tbl, tbl_def, &m_ttl_column, - &m_ttl_field_offset, true); + &m_ttl_field_index, true); size_t max_len = INDEX_NUMBER_SIZE; int unpack_len = 0; @@ -429,7 +429,7 @@ uint Rdb_key_def::extract_ttl_duration(const TABLE *const table_arg, uint Rdb_key_def::extract_ttl_col(const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg, std::string *ttl_column, - uint *ttl_field_offset, bool skip_checks) { + uint *ttl_field_index, bool skip_checks) { std::string table_comment(table_arg->s->comment.str, table_arg->s->comment.length); /* @@ -448,7 +448,7 @@ uint Rdb_key_def::extract_ttl_col(const TABLE *const table_arg, if (my_strcasecmp(system_charset_info, field->field_name, ttl_col_str.c_str()) == 0) { *ttl_column = ttl_col_str; - *ttl_field_offset = i; + *ttl_field_index = i; } } return HA_EXIT_SUCCESS; @@ -465,7 +465,7 @@ uint Rdb_key_def::extract_ttl_col(const TABLE *const table_arg, field->key_type() == HA_KEYTYPE_ULONGLONG && !field->real_maybe_null()) { *ttl_column = ttl_col_str; - *ttl_field_offset = i; + *ttl_field_index = i; found = true; break; } @@ -1050,8 +1050,8 @@ uchar *Rdb_key_def::pack_field(Field *const field, Rdb_field_packing *pack_info, unpack_info_len OUT Unpack data length n_key_parts Number of keyparts to process. 0 means all of them. n_null_fields OUT Number of key fields with NULL value. - ttl_pk_offset OUT Offset of the ttl column if specified and in the key - + ttl_bytes IN Previous ttl bytes from old record for update case or + current ttl bytes from just packed primary key/value @detail Some callers do not need the unpack information, they can pass unpack_info=nullptr, unpack_info_len=nullptr. 
@@ -1060,12 +1060,14 @@ uchar *Rdb_key_def::pack_field(Field *const field, Rdb_field_packing *pack_info, Length of the packed tuple */ -uint Rdb_key_def::pack_record( - const TABLE *const tbl, uchar *const pack_buffer, const uchar *const record, - uchar *const packed_tuple, Rdb_string_writer *const unpack_info, - const bool should_store_row_debug_checksums, const longlong hidden_pk_id, - uint n_key_parts, uint *const n_null_fields, uint *const ttl_pk_offset, - const char *const ttl_bytes) const { +uint Rdb_key_def::pack_record(const TABLE *const tbl, uchar *const pack_buffer, + const uchar *const record, + uchar *const packed_tuple, + Rdb_string_writer *const unpack_info, + const bool should_store_row_debug_checksums, + const longlong hidden_pk_id, uint n_key_parts, + uint *const n_null_fields, + const char *const ttl_bytes) const { DBUG_ASSERT(tbl != nullptr); DBUG_ASSERT(pack_buffer != nullptr); DBUG_ASSERT(record != nullptr); @@ -1166,17 +1168,6 @@ uint Rdb_key_def::pack_record( uint null_offset = field->null_offset(tbl->record[0]); bool maybe_null = field->real_maybe_null(); - // Save the ttl duration offset in the key so we can store it in front of - // the record later. - if (ttl_pk_offset && m_ttl_duration > 0 && i == m_ttl_pk_key_part_offset) { - DBUG_ASSERT(my_strcasecmp(system_charset_info, field->field_name, - m_ttl_column.c_str()) == 0); - DBUG_ASSERT(field->real_type() == MYSQL_TYPE_LONGLONG); - DBUG_ASSERT(field->key_type() == HA_KEYTYPE_ULONGLONG); - DBUG_ASSERT(!field->real_maybe_null()); - *ttl_pk_offset = tuple - packed_tuple; - } - field->move_field(const_cast(record) + field_offset, maybe_null ? 
const_cast(record) + null_offset : nullptr, diff --git a/storage/rocksdb/rdb_datadic.h b/storage/rocksdb/rdb_datadic.h index 1ef05b0b4697..a8ac357fd364 100644 --- a/storage/rocksdb/rdb_datadic.h +++ b/storage/rocksdb/rdb_datadic.h @@ -203,7 +203,6 @@ class Rdb_key_def { const bool should_store_row_debug_checksums, const longlong hidden_pk_id = 0, uint n_key_parts = 0, uint *const n_null_fields = nullptr, - uint *const ttl_pk_offset = nullptr, const char *const ttl_bytes = nullptr) const; /* Pack the hidden primary key into mem-comparable form. */ uint pack_hidden_pk(const longlong hidden_pk_id, @@ -371,7 +370,7 @@ class Rdb_key_def { uint get_key_parts() const { return m_key_parts; } - uint get_ttl_field_offset() const { return m_ttl_field_offset; } + uint get_ttl_field_index() const { return m_ttl_field_index; } /* Get a field object for key part #part_no @@ -537,7 +536,7 @@ class Rdb_key_def { uint64 *ttl_duration); static uint extract_ttl_col(const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg, - std::string *ttl_column, uint *ttl_field_offset, + std::string *ttl_column, uint *ttl_field_index, bool skip_checks = false); inline bool has_ttl() const { return m_ttl_duration > 0; } @@ -869,7 +868,7 @@ class Rdb_key_def { Index of the TTL column in table->s->fields, if it exists. Default is UINT_MAX to denote that it does not exist. */ - uint m_ttl_field_offset; + uint m_ttl_field_index; /* Prefix extractor for the column family of the key definiton */ std::shared_ptr m_prefix_extractor;