Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stability: retry truncating sst files upon failure #484

Merged
merged 11 commits into from
May 27, 2019
4 changes: 4 additions & 0 deletions tests/failover.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon
glog.Infof("deleting pod: [%s/%s] and wait 1 minute for the pod to terminate", info.Namespace, store.PodName)
err = cli.CoreV1().Pods(info.Namespace).Delete(store.PodName, nil)
if err != nil {
glog.Errorf("failed to get delete the pod: ns=%s tc=%s pod=%s err=%s",
info.Namespace, info.ClusterName, store.PodName, err.Error())
return err
}

Expand All @@ -77,6 +79,8 @@ func (oa *operatorActions) TruncateSSTFileThenCheckFailover(info *TidbClusterCon
Store: store.ID,
})
if err != nil {
glog.Errorf("failed to truncate the sst file: ns=%s tc=%s store=%s err=%s",
info.Namespace, info.ClusterName, store.ID, err.Error())
return err
}
oa.EmitEvent(info, fmt.Sprintf("TruncateSSTFile: tikv: %s/%s", info.Namespace, store.PodName))
Expand Down
84 changes: 56 additions & 28 deletions tests/pkg/ops/tikv.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,21 @@
package ops

import (
"fmt"
"strconv"
"strings"
"time"

"github.com/golang/glog"
"github.com/pingcap/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

const (
retryLimit = 15
maxSSTFilesToTruncate = 20
)

type TruncateOptions struct {
Namespace string
Cluster string
Expand All @@ -32,7 +40,7 @@ type TiKVOps struct {
}

func (ops *TiKVOps) TruncateSSTFile(opts TruncateOptions) error {
glog.Infof("truncate sst option: %+v", opts)
logHdr := fmt.Sprintf("store: %s cluster: [%s/%s] ", opts.Store, opts.Namespace, opts.Cluster)

tc, err := ops.PingcapV1alpha1().TidbClusters(opts.Namespace).Get(opts.Cluster, metav1.GetOptions{})
if err != nil {
Expand All @@ -54,41 +62,61 @@ func (ops *TiKVOps) TruncateSSTFile(opts TruncateOptions) error {
})
}

stdout, stderr, err := exec("find", "/var/lib/tikv/db", "-name", "*.sst", "-o", "-name", "*.save")
if err != nil {
glog.Errorf("list sst files: stderr=%s err=%s", stderr, err.Error())
return errors.Annotate(err, "list sst files")
}
retryCount := 0
for ; retryCount < retryLimit; retryCount++ {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
for ; retryCount < retryLimit; retryCount++ {
for ; retryCount < retryLimit; retryCount++ {
time.Sleep(10*time.Second)

if retryCount > 0 {
time.Sleep(10 * time.Second)
}
stdout, stderr, err := exec("find", "/var/lib/tikv/db", "-name", "*.sst", "-o", "-name", "*.save")
if err != nil {
glog.Warningf(logHdr+"list sst files: stderr=%s err=%s", stderr, err.Error())
continue
}

sstCandidates := make(map[string]bool)
sstCandidates := make(map[string]bool)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

delete the blanks

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the one more indent because it's in retry loop now.


for _, f := range strings.Split(stdout, "\n") {
f = strings.TrimSpace(f)
if len(f) > 0 {
sstCandidates[f] = true
for _, f := range strings.Split(stdout, "\n") {
f = strings.TrimSpace(f)
if len(f) > 0 {
sstCandidates[f] = true
}
}
}

sst := ""
for k := range sstCandidates {
if strings.HasSuffix(k, ".sst") && !sstCandidates[k+".save"] {
sst = k
ssts := make([]string, 0, maxSSTFilesToTruncate)
for k := range sstCandidates {
if len(ssts) >= maxSSTFilesToTruncate {
break
}
if strings.HasSuffix(k, ".sst") && !sstCandidates[k+".save"] {
ssts = append(ssts, k)
}
}
if len(ssts) == 0 {
glog.Warning(logHdr + "cannot find a sst file")
continue
}
}
if len(sst) == 0 {
return errors.New("cannot find a sst file")
}

_, stderr, err = exec("cp", sst, sst+".save")
if err != nil {
glog.Errorf("backup sst file: stderr=%s err=%s", stderr, err.Error())
return errors.Annotate(err, "backup sst file")
truncated := 0
for _, sst := range ssts {
_, stderr, err = exec("sh", "-c",
fmt.Sprintf("cp %s %s.save && truncate -s 0 %s", sst, sst, sst))
if err != nil {
glog.Warningf(logHdr+"truncate sst file: sst=%s stderr=%s err=%s", sst, stderr, err.Error())
continue
}
truncated++
}
if truncated == 0 {
glog.Warningf(logHdr + "no sst file has been truncated")
continue
}

glog.Infof(logHdr+"%d sst files got truncated", truncated)
break
}

_, stderr, err = exec("truncate", "-s", "0", sst)
if err != nil {
glog.Errorf("truncate sst file: stderr=%s err=%s", stderr, err.Error())
return errors.Annotate(err, "truncate sst file")
if retryCount == retryLimit {
return errors.New("failed to truncate sst file after " + strconv.Itoa(retryLimit) + " trials")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the error log needs opts.Namespace and opts.Cluster field

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added annotations to log methods. however, it's caller who pass the opts arg, that is, the caller must know these info, so I don't think there is a need for adding these fields to returned error.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

but the caller has not added these fields to error log too
ref: https://github.com/pingcap/tidb-operator/blob/master/tests/failover.go#L74-L81

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added some error logs to the case func. If you think every error should be annotated with additional fields, it would be better to create an another PR to do it, because most of methods of operatorActions failed to do so.

}

return nil
Expand Down