Skip to content

Commit

Permalink
recover raft logs by removing partial trxs
Browse files Browse the repository at this point in the history
Summary:
mysqld removes partial trxs in the tail of trx log (named binary-logs on
primaries and apply-logs on secondaries) during startup. However, relay logs
were not of much importance since it was anyways discarded and a new one would
be created.

However, with raft, this is not ideal. Relay logs are raft logs on secondaries
and have to be kept around (and kept sane and consistent). This diff adds the
ability to remove partial trxs from raft/relay logs.

Much of the code to open the last relay log (based on relay log index) and
identify partial trxs is borrowed from existing logic in
MYSQL_BIN_LOG::open_binlog() and MYSQL_BIN_LOG::recover()

Reviewed By: anirbanr-fb

Differential Revision: D24628821

fbshipit-source-id: 5694a593d91
  • Loading branch information
bhatvinay authored and facebook-github-bot committed Mar 13, 2021
1 parent 6b1db2c commit 6ab1c7c
Show file tree
Hide file tree
Showing 13 changed files with 337 additions and 7 deletions.
10 changes: 10 additions & 0 deletions mysql-test/suite/rpl_raft/my.cnf
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ enable_binlog_hlc=1
rpl_wait_for_semi_sync_ack=1
# To bypass some strict raft config checks only for MTR
override_enable_raft_check=1
# Info repo should be file
master_info_repository=FILE

[mysqld.2]
# Run the <testname>-slave.sh script before starting this process
Expand Down Expand Up @@ -56,6 +58,8 @@ enable_binlog_hlc=1
rpl_wait_for_semi_sync_ack=1
# To bypass some strict raft config checks only for MTR
override_enable_raft_check=1
# Info repo should be file
master_info_repository=FILE

[mysqld.3]
# Run the <testname>-slave.sh script before starting this process
Expand Down Expand Up @@ -86,6 +90,8 @@ enable_binlog_hlc=1
rpl_wait_for_semi_sync_ack=1
# To bypass some strict raft config checks only for MTR
override_enable_raft_check=1
# Info repo should be file
master_info_repository=FILE

[mysqld.4]
# Run the <testname>-slave.sh script before starting this process
Expand Down Expand Up @@ -115,6 +121,8 @@ enable_binlog_hlc=1
rpl_wait_for_semi_sync_ack=1
# To bypass some strict raft config checks only for MTR
override_enable_raft_check=1
# Info repo should be file
master_info_repository=FILE

[mysqld.5]
# Run the <testname>-slave.sh script before starting this process
Expand Down Expand Up @@ -144,6 +152,8 @@ enable_binlog_hlc=1
rpl_wait_for_semi_sync_ack=1
# To bypass some strict raft config checks only for MTR
override_enable_raft_check=1
# Info repo should be file
master_info_repository=FILE

[ENV]
MASTER_MYPORT=@mysqld.1.port
Expand Down
47 changes: 47 additions & 0 deletions mysql-test/suite/rpl_raft/r/rpl_raft_recover_raft_log.result
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
include/raft_3_node.inc
Warnings:
Note #### Sending passwords in plain text without SSL/TLS is extremely insecure.
Note #### Storing MySQL user name or password information in the master info repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START SLAVE; see the 'START SLAVE Syntax' in the MySQL Manual for more information.
Warnings:
Note #### Sending passwords in plain text without SSL/TLS is extremely insecure.
Note #### Storing MySQL user name or password information in the master info repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START SLAVE; see the 'START SLAVE Syntax' in the MySQL Manual for more information.
[connection master]
call mtr.add_suppression(".*");
create table t1 (a int primary key) engine = innodb;
insert into t1 values(1);
insert into t1 values(2);
include/sync_slave_sql_with_master.inc
include/sync_slave_sql_with_master.inc
select * from t1;
a
1
2
select * from t1;
a
1
2
select sleep(10);
sleep(10)
0
"raft file: binary-logs-13001.000002"
"raft file pos: 1362"
"Restarting server_2"
include/rpl_restart_server.inc [server_number=2]
start slave sql_thread;
insert into t1 values(3);
include/sync_slave_sql_with_master.inc
include/sync_slave_sql_with_master.inc
select * from t1;
a
1
2
3
select * from t1;
a
1
2
3
drop table t1;
include/sync_slave_sql_with_master.inc
include/sync_slave_sql_with_master.inc
include/rpl_end.inc
63 changes: 63 additions & 0 deletions mysql-test/suite/rpl_raft/t/rpl_raft_recover_raft_log.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
source ../include/raft_3_node.inc;

call mtr.add_suppression(".*");

connection server_1;
create table t1 (a int primary key) engine = innodb;
insert into t1 values(1);
insert into t1 values(2);

let $sync_slave_connection= server_2;
source include/sync_slave_sql_with_master.inc;
let $sync_slave_connection= server_3;
source include/sync_slave_sql_with_master.inc;

connection server_2;
select * from t1;

connection server_3;
select * from t1;

connection server_2;
select sleep(10);

# Expand this test later to cover multiple scenarios
let $server2_datadir = `select @@datadir`;
let $server2_raft_file = query_get_value("SHOW SLAVE STATUS", "Relay_Log_File", 1);
let $server2_raft_file_pos = query_get_value("SHOW SLAVE STATUS", "Relay_Log_Pos", 1);
echo "raft file: $server2_raft_file";
echo "raft file pos: $server2_raft_file_pos";
let $half = `select ROUND($server2_raft_file_pos / 2)`;
exec truncate -s $half $server2_datadir/$server2_raft_file;

echo "Restarting server_2";
let $rpl_server_number = 2;
source include/rpl_restart_server.inc;

connection server_2;
start slave sql_thread;

connection server_1;
insert into t1 values(3);

let $sync_slave_connection= server_2;
source include/sync_slave_sql_with_master.inc;
let $sync_slave_connection= server_3;
source include/sync_slave_sql_with_master.inc;

connection server_2;
select * from t1;

connection server_3;
select * from t1;

# cleanup
connection server_1;
drop table t1;

let $sync_slave_connection= server_2;
source include/sync_slave_sql_with_master.inc;
let $sync_slave_connection= server_3;
source include/sync_slave_sql_with_master.inc;

source include/rpl_end.inc;
2 changes: 2 additions & 0 deletions mysql-test/suite/sys_vars/r/all_vars.result
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ There should be *no* long test name listed below:
select variable_name as `There should be *no* variables listed below:` from t2
left join t1 on variable_name=test_name where test_name is null ORDER BY variable_name;
There should be *no* variables listed below:
RECOVER_RAFT_LOG
RECOVER_RAFT_LOG
WAIT_FOR_HLC_SLEEP_SCALING_FACTOR
WAIT_FOR_HLC_SLEEP_SCALING_FACTOR
WAIT_FOR_HLC_SLEEP_THRESHOLD_MS
Expand Down
183 changes: 183 additions & 0 deletions sql/binlog.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10800,6 +10800,189 @@ int MYSQL_BIN_LOG::ordered_commit(THD *thd, bool all, bool skip_commit,
DBUG_RETURN(thd->commit_error == THD::CE_COMMIT_ERROR);
}

/**
* Recover raft log. This is primarily for relay logs in the raft world since
* trx logs (binary logs or apply logs) are already recovered by mysqld as part
* of trx log recovery. This method tries to get rid of partial trxs in the tal
* of the raft log. Much has been borrowed from
* MYSQL_BIN_LOG::open_binlog(const char *opt_name) and
* MYSQL_BIN_LOG::recover(...). Refactoring the components is rather hard and
* adds unnecessary complexity with additional params and if() {} else {}
* branches. Hence a separate method.
*/
int MYSQL_BIN_LOG::recover_raft_log()
{
int error= 0;
const char *errmsg;
IO_CACHE log;
File file;
Log_event *ev= 0;
Format_description_log_event fdle(BINLOG_VERSION);
Format_description_log_event *fdle_ev= 0;
bool in_operation= FALSE;
char log_name[FN_REFLEN];
my_off_t valid_pos= 0;
my_off_t binlog_size= 0;
MY_STAT s;
LOG_INFO log_info;
bool pending_gtid= FALSE;
std::string error_message;
int status= 0;

if (!mysql_bin_log.is_apply_log)
goto err; // raft log already recovered as part of trx log recovery

if (!fdle.is_valid())
{
error= 1;
error_message= "BINLOG_VERSION is not valid for format descriptor";
goto err;
}

if (!my_b_inited(&index_file))
{
error_message= "Index file is not inited in recover_raft_log";
error= 1;
goto err;
}

if ((status= find_log_pos(&log_info, NullS, true/*need_lock_index=true*/)))
{
if (status != LOG_INFO_EOF)
{
error_message= "find_log_pos() failed in recover_raft_log with error: "
+ std::to_string(error);
error= 1;
}
goto err;
}

do
{
strmake(log_name, log_info.log_file_name, sizeof(log_name)-1);
} while (!(status= find_next_log(&log_info, true/*need_lock_index=true*/)));

if (status != LOG_INFO_EOF)
{
error_message= "find_log_pos() failed in recover_raft_log with error: "
+ std::to_string(error);
error= 1;
goto err;
}

if ((file= open_binlog_file(&log, log_name, &errmsg)) < 0)
{
error= 1;
error_message= "open_binlog_file() failed in recover_raft_log with " +
std::string(errmsg);
goto err;
}

// Get the current size of the file
my_stat(log_name, &s, MYF(0));
binlog_size= s.st_size;

// Get the format description event from relay log
if ((ev= Log_event::read_log_event(&log, 0, &fdle,
opt_master_verify_checksum, NULL)))
{
if (ev->get_type_code() == FORMAT_DESCRIPTION_EVENT)
fdle_ev= (Format_description_log_event *)ev;
else
{
error_message= "Could not find format description event in raft log " +
std::string(log_name);
error= 1;
goto err;
}
}
else
{
error_message= "Could not read from raft log " + std::string(log_name);
error= 1;
goto err;
}

// This logic is borrowed from MYSQL_BIN_LOG::recover() which has to do
// additional things and refactoring it will simply add more branches. Hence
// the code duplication
while ((ev= Log_event::read_log_event(&log, 0, fdle_ev, TRUE, NULL))
&& ev->is_valid())
{
if (ev->get_type_code() == QUERY_EVENT &&
!strcmp(((Query_log_event*)ev)->query, "BEGIN"))
in_operation= TRUE;
else if (is_gtid_event(ev))
{
pending_gtid= TRUE;
}
else if (ev->get_type_code() == XID_EVENT ||
(ev->get_type_code() == QUERY_EVENT &&
!strcmp(((Query_log_event*)ev)->query, "COMMIT")))
{
if (!in_operation)
{
// When we see a commit message, we should already be parsing a valid
// transaction
error_message= "Saw a XID/COMMIT event without a begin. Corrupted log: "
+ std::string(log_name);
error= 1;
delete ev;
break;
}
in_operation= FALSE;
}

if (!(ev->get_type_code() == METADATA_EVENT && pending_gtid))
{
if (!log.error && !in_operation && !is_gtid_event(ev))
{
valid_pos= my_b_tell(&log);
pending_gtid= FALSE;
}
}

delete ev;
}

delete fdle_ev;
end_io_cache(&log);
mysql_file_close(file, MYF(MY_WME));

// No partial trxs found in the raft log or error parsing the log
if (error || (valid_pos == 0 || valid_pos >= binlog_size))
goto err;

// NO_LINT_DEBUG
sql_print_information("Raft log %s with a size of %llu will be trimmed to "
"%llu bytes based on valid transactions in the file",
log_name, binlog_size, valid_pos);

if ((file= mysql_file_open(key_file_relaylog, log_name,
O_RDWR | O_BINARY, MYF(MY_WME))) < 0)
{
error_message= "Failed to remove partial transactions from raft log file "
+ std::string(log_name);
error= 1;
goto err;
}

if (my_chsize(file, valid_pos, 0, MYF(MY_WME)))
{
error_message= "Failed to remove partial transactions from raft log file "
+ std::string(log_name);
error= 1;
}

mysql_file_close(file, MYF(MY_WME));

err:
if (error && !error_message.empty())
// NO_LINT_DEBUG
sql_print_error("%s", error_message.c_str());

return error;
}

/**
MYSQLD server recovers from last crashed binlog.
Expand Down
1 change: 1 addition & 0 deletions sql/binlog.h
Original file line number Diff line number Diff line change
Expand Up @@ -940,6 +940,7 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
int recover(IO_CACHE *log, Format_description_log_event *fdle,
my_off_t *valid_pos, const std::string& cur_binlog_file);
int recover(IO_CACHE *log, Format_description_log_event *fdle);
int recover_raft_log();
int set_valid_pos(
my_off_t* valid_pos,
const std::string& cur_binlog_file,
Expand Down
1 change: 1 addition & 0 deletions sql/mysqld.cc
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,7 @@ ulonglong max_tmp_disk_usage;
ulonglong tmp_table_disk_usage_period_peak = 0;
ulonglong filesort_disk_usage_period_peak = 0;
bool enable_raft_plugin= 0;
bool recover_raft_log= 0;
bool disable_raft_log_repointing= 0;
bool override_enable_raft_check= false;
ulong opt_raft_signal_async_dump_threads= 0;
Expand Down
1 change: 1 addition & 0 deletions sql/mysqld.h
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,7 @@ extern ulonglong minimum_hlc_ns;
/* Maximum allowed forward drift in the HLC as compared to wall clock */
extern ulonglong maximum_hlc_drift_ns;
extern bool enable_raft_plugin;
extern bool recover_raft_log;
extern bool disable_raft_log_repointing;
extern bool override_enable_raft_check;
extern ulong opt_raft_signal_async_dump_threads;
Expand Down
Loading

0 comments on commit 6ab1c7c

Please sign in to comment.