Skip to content

Commit

Permalink
MDEV-35806: Error in read_log_event() corrupts relay log writer, cras…
Browse files Browse the repository at this point in the history
…hes server

In Log_event::read_log_event(), don't use IO_CACHE::error of the relay log's
IO_CACHE to signal an error back to the caller. When reading the active
relay log, this flag is also being used by the IO thread, and setting it can
randomly cause the IO thread to wrongly detect IO error on writing and
permanently disable the relay log.

This was seen sporadically in test case rpl.rpl_from_mysql80. The read
error set by the SQL thread in the IO_CACHE would be interpreted as a
write error by the IO thread, which would cause it to throw a fatal
error and close the relay log. And this would later cause CHANGE
MASTER to try to purge a closed relay log, resulting in nullptr crash.

Also fix another sporadic failure in rpl.rpl_from_mysql80 where the outout
of MASTER_POS_WAIT() depended on timing of SQL and IO thread.

Signed-off-by: Kristian Nielsen <knielsen@knielsen-hq.org>
  • Loading branch information
knielsen committed Jan 10, 2025
1 parent 43233fe commit 777c351
Show file tree
Hide file tree
Showing 10 changed files with 50 additions and 28 deletions.
17 changes: 11 additions & 6 deletions client/mysqlbinlog.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2698,6 +2698,7 @@ static Exit_status check_header(IO_CACHE* file,
uchar buf[PROBE_HEADER_LEN];
my_off_t tmp_pos, pos;
MY_STAT my_file_stat;
int read_error;

delete glob_description_event;
if (!(glob_description_event= new Format_description_log_event(3)))
Expand Down Expand Up @@ -2798,7 +2799,8 @@ static Exit_status check_header(IO_CACHE* file,
Format_description_log_event *new_description_event;
my_b_seek(file, tmp_pos); /* seek back to event's start */
if (!(new_description_event= (Format_description_log_event*)
Log_event::read_log_event(file, glob_description_event,
Log_event::read_log_event(file, &read_error,
glob_description_event,
opt_verify_binlog_checksum)))
/* EOF can't be hit here normally, so it's a real error */
{
Expand Down Expand Up @@ -2831,7 +2833,8 @@ static Exit_status check_header(IO_CACHE* file,
{
Log_event *ev;
my_b_seek(file, tmp_pos); /* seek back to event's start */
if (!(ev= Log_event::read_log_event(file, glob_description_event,
if (!(ev= Log_event::read_log_event(file, &read_error,
glob_description_event,
opt_verify_binlog_checksum)))
{
/* EOF can't be hit here normally, so it's a real error */
Expand Down Expand Up @@ -2945,8 +2948,10 @@ static Exit_status dump_local_log_entries(PRINT_EVENT_INFO *print_event_info,
{
char llbuff[21];
my_off_t old_off = my_b_tell(file);
int read_error;

Log_event* ev = Log_event::read_log_event(file, glob_description_event,
Log_event* ev = Log_event::read_log_event(file, &read_error,
glob_description_event,
opt_verify_binlog_checksum);
if (!ev)
{
Expand All @@ -2955,15 +2960,15 @@ static Exit_status dump_local_log_entries(PRINT_EVENT_INFO *print_event_info,
about a corruption, but treat it as EOF and move to the next binlog.
*/
if (glob_description_event->flags & LOG_EVENT_BINLOG_IN_USE_F)
file->error= 0;
else if (file->error)
read_error= 0;
else if (read_error)
{
error("Could not read entry at offset %s: "
"Error in log format or read error.",
llstr(old_off,llbuff));
goto err;
}
// else file->error == 0 means EOF, that's OK, we break in this case
// else read_error == 0 means EOF, that's OK, we break in this case

/*
Emit a warning in the event that we finished processing input
Expand Down
2 changes: 0 additions & 2 deletions mysql-test/suite/rpl/r/rpl_from_mysql80.result
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ START SLAVE IO_THREAD;
include/wait_for_slave_io_to_start.inc
START SLAVE UNTIL Master_log_file='master-bin.000001', Master_log_pos= 1178;
SELECT MASTER_POS_WAIT('master-bin.000001', 1178, 60);
MASTER_POS_WAIT('master-bin.000001', 1178, 60)
NULL
SELECT * FROM t1 ORDER BY a;
a b c
1 0
Expand Down
2 changes: 2 additions & 0 deletions mysql-test/suite/rpl/t/rpl_from_mysql80.test
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,9 @@ START SLAVE IO_THREAD;
# The position 1178 is the start of: INSERT INTO t1 VALUES (4, 0, 'skip');
# After that comes unknown MySQL 8.0 events, which we test error for below.
START SLAVE UNTIL Master_log_file='master-bin.000001', Master_log_pos= 1178;
--disable_result_log
SELECT MASTER_POS_WAIT('master-bin.000001', 1178, 60);
--enable_result_log
SELECT * FROM t1 ORDER BY a;

--source include/wait_for_slave_sql_to_stop.inc
Expand Down
19 changes: 15 additions & 4 deletions sql/log.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5390,6 +5390,13 @@ int MYSQL_BIN_LOG::new_file_impl()
DBUG_RETURN(error);
}

if (log_file.error)
{
my_error(ER_SLAVE_RELAY_LOG_WRITE_FAILURE, MYF(ME_ERROR_LOG),
"Writing to the relay log prevented by an earlier error");
DBUG_RETURN(1);
}

mysql_mutex_lock(&LOCK_index);

/* Reuse old name if not binlog and not update log */
Expand Down Expand Up @@ -11385,7 +11392,8 @@ int TC_LOG_BINLOG::recover(LOG_INFO *linfo, const char *last_log_name,
binlog_checkpoint_found= false;
for (round= 1;;)
{
while ((ev= Log_event::read_log_event(round == 1 ? first_log : &log,
int error;
while ((ev= Log_event::read_log_event(round == 1 ? first_log : &log, &error,
fdle, opt_master_verify_checksum))
&& ev->is_valid())
{
Expand Down Expand Up @@ -11671,7 +11679,8 @@ MYSQL_BIN_LOG::do_binlog_recovery(const char *opt_name, bool do_xa_recovery)
return 1;
}

if ((ev= Log_event::read_log_event(&log, &fdle,
int read_error;
if ((ev= Log_event::read_log_event(&log, &read_error, &fdle,
opt_master_verify_checksum)) &&
ev->get_type_code() == FORMAT_DESCRIPTION_EVENT)
{
Expand Down Expand Up @@ -11895,10 +11904,11 @@ get_gtid_list_event(IO_CACHE *cache, Gtid_list_log_event **out_gtid_list)
Format_description_log_event *fdle;
Log_event *ev;
const char *errormsg = NULL;
int read_error;

*out_gtid_list= NULL;

if (!(ev= Log_event::read_log_event(cache, &init_fdle,
if (!(ev= Log_event::read_log_event(cache, &read_error, &init_fdle,
opt_master_verify_checksum)) ||
ev->get_type_code() != FORMAT_DESCRIPTION_EVENT)
{
Expand All @@ -11914,7 +11924,8 @@ get_gtid_list_event(IO_CACHE *cache, Gtid_list_log_event **out_gtid_list)
{
Log_event_type typ;

ev= Log_event::read_log_event(cache, fdle, opt_master_verify_checksum);
ev= Log_event::read_log_event(cache, &read_error, fdle,
opt_master_verify_checksum);
if (!ev)
{
errormsg= "Could not read GTID list event while looking for GTID "
Expand Down
7 changes: 4 additions & 3 deletions sql/log_event.cc
Original file line number Diff line number Diff line change
Expand Up @@ -916,7 +916,7 @@ int Log_event::read_log_event(IO_CACHE* file, String* packet,
DBUG_RETURN(0);
}

Log_event* Log_event::read_log_event(IO_CACHE* file,
Log_event* Log_event::read_log_event(IO_CACHE* file, int *out_error,
const Format_description_log_event *fdle,
my_bool crc_check,
my_bool print_errors)
Expand All @@ -927,6 +927,7 @@ Log_event* Log_event::read_log_event(IO_CACHE* file,
const char *error= 0;
Log_event *res= 0;

*out_error= 0;
switch (read_log_event(file, &event, fdle, BINLOG_CHECKSUM_ALG_OFF))
{
case 0:
Expand Down Expand Up @@ -976,14 +977,14 @@ Log_event* Log_event::read_log_event(IO_CACHE* file,
#endif

/*
The SQL slave thread will check if file->error<0 to know
The SQL slave thread will check *out_error to know
if there was an I/O error. Even if there is no "low-level" I/O errors
with 'file', any of the high-level above errors is worrying
enough to stop the SQL thread now ; as we are skipping the current event,
going on with reading and successfully executing other events can
only corrupt the slave's databases. So stop.
*/
file->error= -1;
*out_error= 1;

#ifndef MYSQL_CLIENT
if (!print_errors)
Expand Down
2 changes: 1 addition & 1 deletion sql/log_event.h
Original file line number Diff line number Diff line change
Expand Up @@ -1344,7 +1344,7 @@ class Log_event
we detect the event's type, then call the specific event's
constructor and pass description_event as an argument.
*/
static Log_event* read_log_event(IO_CACHE* file,
static Log_event* read_log_event(IO_CACHE* file, int *out_error,
const Format_description_log_event
*description_event,
my_bool crc_check,
Expand Down
5 changes: 3 additions & 2 deletions sql/rpl_parallel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1037,14 +1037,15 @@ retry_event_group(rpl_group_info *rgi, rpl_parallel_thread *rpt,
/* The loop is here so we can try again the next relay log file on EOF. */
for (;;)
{
int error;
old_offset= cur_offset;
ev= Log_event::read_log_event(&rlog, description_event,
ev= Log_event::read_log_event(&rlog, &error, description_event,
opt_slave_sql_verify_checksum);
cur_offset= my_b_tell(&rlog);

if (ev)
break;
if (unlikely(rlog.error < 0))
if (unlikely(error))
{
errmsg= "slave SQL thread aborted because of I/O error";
err= 1;
Expand Down
9 changes: 5 additions & 4 deletions sql/rpl_rli.cc
Original file line number Diff line number Diff line change
Expand Up @@ -544,12 +544,13 @@ read_relay_log_description_event(IO_CACHE *cur_log, ulonglong start_pos,
if (my_b_tell(cur_log) >= start_pos)
break;

if (!(ev= Log_event::read_log_event(cur_log, fdev,
int read_error;
if (!(ev= Log_event::read_log_event(cur_log, &read_error, fdev,
opt_slave_sql_verify_checksum)))
{
DBUG_PRINT("info",("could not read event, cur_log->error=%d",
cur_log->error));
if (cur_log->error) /* not EOF */
DBUG_PRINT("info",("could not read event, read_error=%d",
read_error));
if (read_error) /* not EOF */
{
*errmsg= "I/O error reading event at position 4";
delete fdev;
Expand Down
7 changes: 4 additions & 3 deletions sql/slave.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7845,7 +7845,8 @@ static Log_event* next_event(rpl_group_info *rgi, ulonglong *event_size)
MYSQL_BIN_LOG::open() will write the buffered description event.
*/
old_pos= rli->event_relay_log_pos;
if ((ev= Log_event::read_log_event(cur_log,
int error;
if ((ev= Log_event::read_log_event(cur_log, &error,
rli->relay_log.description_event_for_exec,
opt_slave_sql_verify_checksum)))

Expand All @@ -7862,8 +7863,8 @@ static Log_event* next_event(rpl_group_info *rgi, ulonglong *event_size)
DBUG_RETURN(ev);
}
if (opt_reckless_slave) // For mysql-test
cur_log->error = 0;
if (unlikely(cur_log->error < 0))
error = 0;
if (unlikely(error))
{
errmsg = "slave SQL thread aborted because of I/O error";
if (hot_log)
Expand Down
8 changes: 5 additions & 3 deletions sql/sql_repl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4188,7 +4188,8 @@ bool mysql_show_binlog_events(THD* thd)
my_off_t scan_pos = BIN_LOG_HEADER_SIZE;
while (scan_pos < pos)
{
ev= Log_event::read_log_event(&log, description_event,
int error;
ev= Log_event::read_log_event(&log, &error, description_event,
opt_master_verify_checksum);
scan_pos = my_b_tell(&log);
if (ev == NULL || !ev->is_valid())
Expand Down Expand Up @@ -4263,8 +4264,9 @@ bool mysql_show_binlog_events(THD* thd)
writing about this in the server log would be confusing as it isn't
related to server operational status.
*/
int error;
for (event_count = 0;
(ev = Log_event::read_log_event(&log,
(ev = Log_event::read_log_event(&log, &error,
description_event,
(opt_master_verify_checksum ||
verify_checksum_once), false)); )
Expand Down Expand Up @@ -4308,7 +4310,7 @@ bool mysql_show_binlog_events(THD* thd)
break;
}

if (unlikely(event_count < unit->lim.get_select_limit() && log.error))
if (unlikely(event_count < unit->lim.get_select_limit() && error))
{
errmsg = "Wrong offset or I/O error";
mysql_mutex_unlock(log_lock);
Expand Down

0 comments on commit 777c351

Please sign in to comment.