Skip to content

Commit

Permalink
Update tablet healthcheck test based on new replica lag logic
Browse files Browse the repository at this point in the history
Signed-off-by: Matt Lord <mattalord@gmail.com>
  • Loading branch information
mattlord committed Dec 2, 2021
1 parent c5143d8 commit 31560fd
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 6 deletions.
2 changes: 1 addition & 1 deletion go/mysql/replication_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ type ReplicationStatus struct {
// However, some MySQL flavors don't expose this information,
// in which case RelayLogPosition.IsZero() will be true.
// If ReplicationLagUnknown is true then we should not rely on the seconds
// behind value and we can instead try to calcuate the lag ourselves when
// behind value and we can instead try to calculate the lag ourselves when
// appropriate.
RelayLogPosition Position
FilePosition Position
Expand Down
7 changes: 6 additions & 1 deletion go/test/endtoend/tabletmanager/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ var (
username = "vt_dba"
cell = "zone1"
tabletHealthcheckRefreshInterval = 5 * time.Second
tabletUnhealthyThreshold = tabletHealthcheckRefreshInterval * 2
sqlSchema = `
create table t1(
id bigint,
Expand Down Expand Up @@ -94,13 +95,17 @@ func TestMain(m *testing.M) {
}

// List of users authorized to execute vschema ddl operations
clusterInstance.VtGateExtraArgs = []string{"-vschema_ddl_authorized_users=%"}
clusterInstance.VtGateExtraArgs = []string{
"-vschema_ddl_authorized_users=%",
"-discovery_low_replication_lag", tabletUnhealthyThreshold.String(),
}
// Set extra tablet args for the lock timeout, replication reporting, and health checks
clusterInstance.VtTabletExtraArgs = []string{
"-lock_tables_timeout", "5s",
"-watch_replication_stream",
"-enable_replication_reporter",
"-health_check_interval", tabletHealthcheckRefreshInterval.String(),
"-unhealthy_threshold", tabletUnhealthyThreshold.String(),
}
// We do not need semiSync for this test case.
clusterInstance.EnableSemiSync = false
Expand Down
8 changes: 4 additions & 4 deletions go/test/endtoend/tabletmanager/tablet_health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,11 +153,12 @@ func TestHealthCheck(t *testing.T) {
}

// stop the replica's source mysqld instance to break replication
// and test that the tablet becomes unhealthy and non-serving
// and test that the replica tablet becomes unhealthy and non-serving after crossing
// the tablet's -unhealthy_threshold and the gateway's -discovery_low_replication_lag
err = primaryTablet.MysqlctlProcess.Stop()
require.NoError(t, err)

time.Sleep(tabletHealthcheckRefreshInterval)
time.Sleep(tabletUnhealthyThreshold + tabletHealthcheckRefreshInterval)

// now the replica's VtTabletStreamHealth should show it as unhealthy
result, err = clusterInstance.VtctlclientProcess.ExecuteCommandWithOutput("VtTabletStreamHealth", "-count", "1", rTablet.Alias)
Expand Down Expand Up @@ -238,8 +239,7 @@ func verifyStreamHealth(t *testing.T, result string, expectHealthy bool) {
// replicationLagSeconds can be as high as 7200, so use a safe upper limit
assert.True(t, replicationLagSeconds < 10000, "replica should not be behind primary")
} else {
assert.False(t, serving, "Tablet should not be in serving state")
assert.False(t, replicationLagSeconds < 10000, "replica should be behind primary")
assert.True(t, (!serving || replicationLagSeconds >= uint32(tabletUnhealthyThreshold.Seconds())), "Tablet should not be in serving and healthy state")
}
}

Expand Down

0 comments on commit 31560fd

Please sign in to comment.