Skip to content

Commit

Permalink
apply patch from vitessio#9106
Browse files Browse the repository at this point in the history
Signed-off-by: Priya Bibra <pbibra@slack-corp.com>
  • Loading branch information
pbibra committed Feb 13, 2023
1 parent c40ac25 commit 8c0bdd0
Show file tree
Hide file tree
Showing 6 changed files with 316 additions and 8 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# DO NOT MODIFY: THIS FILE IS GENERATED USING "make generate_ci_workflows"

name: Cluster (vtgate_tablet_healthcheck_cache)
on: [push, pull_request]
concurrency:
group: format('{0}-{1}', ${{ github.ref }}, 'Cluster (vtgate_tablet_healthcheck_cache)')
cancel-in-progress: true

jobs:
build:
name: Run endtoend tests on Cluster (vtgate_tablet_healthcheck_cache)
runs-on: ubuntu-18.04

steps:
- name: Set up Go
uses: actions/setup-go@v2
with:
go-version: 1.17

- name: Tune the OS
run: |
echo '1024 65535' | sudo tee -a /proc/sys/net/ipv4/ip_local_port_range
# TEMPORARY WHILE GITHUB FIXES THIS https://github.com/actions/virtual-environments/issues/3185
- name: Add the current IP address, long hostname and short hostname record to /etc/hosts file
run: |
echo -e "$(ip addr show eth0 | grep "inet\b" | awk '{print $2}' | cut -d/ -f1)\t$(hostname -f) $(hostname -s)" | sudo tee -a /etc/hosts
# DON'T FORGET TO REMOVE CODE ABOVE WHEN ISSUE IS ADDRESSED!

- name: Check out code
uses: actions/checkout@v2

- name: Get dependencies
run: |
sudo apt-get update
sudo apt-get install -y mysql-server mysql-client make unzip g++ etcd curl git wget eatmydata
sudo service mysql stop
sudo service etcd stop
sudo ln -s /etc/apparmor.d/usr.sbin.mysqld /etc/apparmor.d/disable/
sudo apparmor_parser -R /etc/apparmor.d/usr.sbin.mysqld
go mod download
wget https://repo.percona.com/apt/percona-release_latest.$(lsb_release -sc)_all.deb
sudo apt-get install -y gnupg2
sudo dpkg -i percona-release_latest.$(lsb_release -sc)_all.deb
sudo apt-get update
sudo apt-get install percona-xtrabackup-24
- name: Run cluster endtoend test
timeout-minutes: 30
run: |
source build.env
eatmydata -- go run test.go -docker=false -print-log -follow -shard vtgate_tablet_healthcheck_cache
10 changes: 8 additions & 2 deletions go/test/endtoend/cluster/vttablet_process.go
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,14 @@ func (vttablet *VttabletProcess) GetStatusDetails() string {
}

// WaitForStatus waits till desired status of tablet is reached
func (vttablet *VttabletProcess) WaitForStatus(status string) bool {
return vttablet.GetTabletStatus() == status
// WaitForStatus polls the tablet status until it equals status or until
// howLong has elapsed, whichever comes first.
//
// It returns true as soon as the desired status is observed and false when the
// deadline passes without the tablet ever reporting that status.
func (vttablet *VttabletProcess) WaitForStatus(status string, howLong time.Duration) bool {
	// How often the status is re-read while waiting for the deadline.
	const pollInterval = 300 * time.Millisecond

	// Check right away so a tablet that is already in the desired state does
	// not cost the caller any waiting time.
	if vttablet.GetTabletStatus() == status {
		return true
	}

	deadline := time.NewTimer(howLong)
	defer deadline.Stop()
	ticker := time.NewTicker(pollInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			if vttablet.GetTabletStatus() == status {
				return true
			}
		case <-deadline.C:
			// Give up: the status was not reached within howLong.
			return false
		}
	}
}

// GetTabletStatus returns the tablet state as seen in /debug/vars TabletStateName
Expand Down
15 changes: 10 additions & 5 deletions go/test/endtoend/tabletgateway/vtgate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,11 +196,16 @@ func TestReplicaTransactions(t *testing.T) {
time.Sleep(2 * time.Second)
exec(t, readConn, fetchAllCustomers, "is either down or nonexistent")

// bring up tablet again
// query using same transaction will fail
_ = replicaTablet.VttabletProcess.Setup()
exec(t, readConn, fetchAllCustomers, "not found")
exec(t, readConn, "commit", "")
// bring up the tablet again
// trying to use the same session/transaction should fail as the vtgate has
// been restarted and the session lost
replicaTablet.VttabletProcess.ServingStatus = "SERVING"
err = replicaTablet.VttabletProcess.Setup()
require.Nil(t, err)
serving := replicaTablet.VttabletProcess.WaitForStatus("SERVING", time.Duration(60*time.Second))
assert.Equal(t, serving, true, "Tablet did not become ready within a reasonable time")
exec(t, readConn, fetchAllCustomers, "is either down or nonexistent")

// create a new connection, should be able to query again
readConn, err = mysql.Connect(ctx, &vtParams)
require.NoError(t, err)
Expand Down
230 changes: 230 additions & 0 deletions go/test/endtoend/vtgate/tablet_healthcheck_cache/correctness_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
/*
Copyright 2021 The Vitess Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package tablethealthcheckcache

import (
"context"
"flag"
"fmt"
"os"
"sync"
"testing"
"time"

"github.com/stretchr/testify/require"
"gotest.tools/assert"

"vitess.io/vitess/go/mysql"
"vitess.io/vitess/go/test/endtoend/cluster"
)

// Package-level fixtures shared by TestMain and the tests in this file.
var (
// clusterInstance is the local process cluster; it is assigned in TestMain.
clusterInstance *cluster.LocalProcessCluster
// vtParams holds the MySQL connection parameters (host and port of the
// vtgate); it is filled in by TestMain once the vtgate has been started.
vtParams mysql.ConnParams
// keyspaceName is the name given to the keyspace started in TestMain.
keyspaceName = "healthcheck_test_ks"
// cell is the cell name passed to cluster.NewCluster and used to build tablet aliases.
cell = "healthcheck_test_cell"
// shards lists the shard names the keyspace is started with.
shards = []string{"-80", "80-"}
// schemaSQL is the schema passed as the keyspace's SchemaSQL: the customer
// and corder tables.
schemaSQL = `
create table customer(
customer_id bigint not null auto_increment,
email varbinary(128),
primary key(customer_id)
) ENGINE=InnoDB;
create table corder(
order_id bigint not null auto_increment,
customer_id bigint,
sku varbinary(128),
price bigint,
primary key(order_id)
) ENGINE=InnoDB;
`

// vSchema is the vschema passed as the keyspace's VSchema: it is marked
// sharded and maps customer_id of both tables to the "hash" vindex.
vSchema = `
{
"sharded": true,
"vindexes": {
"hash": {
"type": "hash"
}
},
"tables": {
"customer": {
"column_vindexes": [
{
"column": "customer_id",
"name": "hash"
}
]
},
"corder": {
"column_vindexes": [
{
"column": "customer_id",
"name": "hash"
}
]
}
}
}
`
)

// TestMain sets up the vitess cluster for any subsequent tests: it starts the
// topo server, the test keyspace (with a 1s tablet health check interval) and
// a vtgate, then runs the tests.
//
// The setup runs inside a closure so that the deferred cluster teardown
// executes before os.Exit is called (os.Exit does not run deferred functions).
// Any setup failure is reported on stderr and results in exit code 1.
func TestMain(m *testing.M) {
	defer cluster.PanicHandler(nil)
	flag.Parse()

	exitCode := func() int {
		clusterInstance = cluster.NewCluster(cell, "localhost")
		defer clusterInstance.Teardown()

		// Start topo server
		if err := clusterInstance.StartTopo(); err != nil {
			fmt.Fprintf(os.Stderr, "starting topo server: %v\n", err)
			return 1
		}

		// Start keyspace
		keyspace := cluster.Keyspace{
			Name:      keyspaceName,
			SchemaSQL: schemaSQL,
			VSchema:   vSchema,
		}
		clusterInstance.VtTabletExtraArgs = append(clusterInstance.VtTabletExtraArgs, "-health_check_interval", "1s")
		if err := clusterInstance.StartKeyspace(keyspace, shards, 1, false); err != nil {
			fmt.Fprintf(os.Stderr, "starting keyspace %q: %v\n", keyspaceName, err)
			return 1
		}

		// Start vtgate without any extra arguments
		clusterInstance.VtGateExtraArgs = []string{}
		if err := clusterInstance.StartVtgate(); err != nil {
			fmt.Fprintf(os.Stderr, "starting vtgate: %v\n", err)
			return 1
		}

		vtParams = mysql.ConnParams{
			Host: clusterInstance.Hostname,
			Port: clusterInstance.VtgateMySQLPort,
		}
		return m.Run()
	}()
	os.Exit(exitCode)
}

// TestHealthCheckCacheWithTabletChurn verifies that the tablet healthcheck cache has the correct number of records
// after many rounds of adding and removing tablets in quick succession. This verifies that we don't have any race
// conditions with these operations and their interactions with the cache.
//
// The number of cache records is observed through the number of rows returned
// by "show vitess_tablets" on a vtgate connection.
func TestHealthCheckCacheWithTabletChurn(t *testing.T) {
	const (
		tries           = 10
		churnTabletUID  = 9999
		churnTabletType = "rdonly"
		query           = "show vitess_tablets"
	)

	ctx := context.Background()
	numShards := len(shards)
	// each shard starts with 2 tablets: 1 primary and 1 replica
	expectedTabletHCcacheEntries := numShards * 2

	vtgateConn, err := mysql.Connect(ctx, &vtParams)
	require.NoError(t, err)
	defer vtgateConn.Close()

	// checkTabletCount runs the query and compares the number of returned
	// rows with the current expectation. A failed query aborts the test
	// instead of dereferencing a nil result.
	checkTabletCount := func() {
		t.Helper()
		qr, err := vtgateConn.ExecuteFetch(query, 100, true)
		require.NoError(t, err)
		assert.Equal(t, expectedTabletHCcacheEntries, len(qr.Rows), "wrong number of tablet records in healthcheck cache, expected %d but had %d. Results: %v", expectedTabletHCcacheEntries, len(qr.Rows), qr.Rows)
	}

	// starting with two shards, each with 1 primary and 1 replica tablet,
	// we'll be adding and removing a tablet of type churnTabletType with churnTabletUID
	checkTabletCount()

	for i := 0; i < tries; i++ {
		tablet := addTablet(t, churnTabletUID, churnTabletType)
		expectedTabletHCcacheEntries++
		checkTabletCount()

		killTablet(t, tablet)
		expectedTabletHCcacheEntries--
		checkTabletCount()
	}

	// one final time, w/o the churning tablet
	checkTabletCount()
}

// addTablet starts a mysqld and a vttablet of the given type with the given
// UID in the first shard of the test keyspace, waits for the tablet to report
// SERVING and for the vtgate to see one tablet of that type in the shard, and
// returns the new tablet. Any failure along the way fails the test.
func addTablet(t *testing.T, tabletUID int, tabletType string) *cluster.Vttablet {
	// Reserve the ports in the same order as the fields they are assigned to.
	httpPort := clusterInstance.GetAndReservePort()
	grpcPort := clusterInstance.GetAndReservePort()
	mysqlPort := clusterInstance.GetAndReservePort()

	newTablet := &cluster.Vttablet{
		TabletUID: tabletUID,
		Type:      tabletType,
		HTTPPort:  httpPort,
		GrpcPort:  grpcPort,
		MySQLPort: mysqlPort,
		Alias:     fmt.Sprintf("%s-%010d", cell, tabletUID),
	}

	// Kick off mysqld; it keeps starting in the background while the
	// vttablet process is being configured below.
	newTablet.MysqlctlProcess = *cluster.MysqlCtlProcessInstanceOptionalInit(
		newTablet.TabletUID,
		newTablet.MySQLPort,
		clusterInstance.TmpDirectory,
		!clusterInstance.ReusingVTDATAROOT,
	)
	mysqlProc, startErr := newTablet.MysqlctlProcess.StartProcess()
	require.Nil(t, startErr)

	// Configure the vttablet process for the first shard.
	newTablet.VttabletProcess = cluster.VttabletProcessInstance(
		newTablet.HTTPPort,
		newTablet.GrpcPort,
		tabletUID,
		cell,
		shards[0],
		keyspaceName,
		clusterInstance.VtctldProcess.Port,
		newTablet.Type,
		clusterInstance.TopoProcess.Port,
		clusterInstance.Hostname,
		clusterInstance.TmpDirectory,
		clusterInstance.VtTabletExtraArgs,
		clusterInstance.EnableSemiSync,
	)

	// mysqld must be ready before the vttablet is brought up.
	require.Nil(t, mysqlProc.Wait())
	require.Nil(t, newTablet.VttabletProcess.Setup())

	isServing := newTablet.VttabletProcess.WaitForStatus("SERVING", 60*time.Second)
	assert.Equal(t, isServing, true, "Tablet did not become ready within a reasonable time")

	// The vtgate should now see exactly one tablet of this type in the shard.
	target := fmt.Sprintf("%s.%s.%s",
		newTablet.VttabletProcess.Keyspace, newTablet.VttabletProcess.Shard, newTablet.Type)
	require.Nil(t, clusterInstance.VtgateProcess.WaitForStatusOfTabletInShard(target, 1))

	t.Logf("Added tablet: %s", newTablet.Alias)
	return newTablet
}

// killTablet tears down the given tablet's vttablet and mysqld processes,
// cleans up the tablet's files and then rebuilds the keyspace graph. A
// failure to rebuild the keyspace graph fails the test.
func killTablet(t *testing.T, tablet *cluster.Vttablet) {
	t.Logf("Killing tablet: %s", tablet.Alias)

	// The teardown runs in its own goroutine, but we block until it is done.
	var teardown sync.WaitGroup
	teardown.Add(1)
	go func() {
		defer teardown.Done()
		// Errors from stopping the processes are ignored here.
		_ = tablet.VttabletProcess.TearDown()
		_ = tablet.MysqlctlProcess.Stop()
		tablet.MysqlctlProcess.CleanupFiles(tablet.TabletUID)
	}()
	teardown.Wait()

	rebuildErr := clusterInstance.VtctlclientProcess.ExecuteCommand("RebuildKeyspaceGraph", keyspaceName)
	require.Nil(t, rebuildErr)
}
7 changes: 6 additions & 1 deletion go/vt/discovery/tablet_health_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,12 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) {

if err != nil {
hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 1)
if strings.Contains(err.Error(), "health stats mismatch") {
// We have reason to suspect the tablet healthcheck record is corrupted or invalid so let's remove the tablet's record
// from the healthcheck cache and it will get re-added again if the tablet is reachable
if strings.Contains(err.Error(), "health stats mismatch") ||
strings.HasSuffix(err.Error(), context.Canceled.Error()) ||
strings.Contains(err.Error(), `"error reading from server: EOF", received prior goaway`) {
log.Warningf("tablet %s had a suspect healthcheck error: %s -- clearing cache record", thc.Tablet.Alias, err.Error())
hc.deleteTablet(thc.Tablet)
return
}
Expand Down
9 changes: 9 additions & 0 deletions test/config.json
Original file line number Diff line number Diff line change
Expand Up @@ -777,6 +777,15 @@
"RetryMax": 1,
"Tags": []
},
"vtgate_tablet_healthcheck_cache": {
"File": "unused.go",
"Args": ["vitess.io/vitess/go/test/endtoend/vtgate/tablet_healthcheck_cache", "-timeout", "45m"],
"Command": [],
"Manual": false,
"Shard": "vtgate_tablet_healthcheck_cache",
"RetryMax": 2,
"Tags": []
},
"vtgate_transaction": {
"File": "unused.go",
"Args": ["vitess.io/vitess/go/test/endtoend/vtgate/transaction"],
Expand Down

0 comments on commit 8c0bdd0

Please sign in to comment.