Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Owner: Fix gcttl bug #1861

Merged
merged 11 commits into from
May 29, 2021
17 changes: 15 additions & 2 deletions cdc/owner.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,15 @@ func (o *Owner) getMinGCSafePointCache(ctx context.Context) model.Ts {
return o.minGCSafePointCache.ts
}
o.minGCSafePointCache.ts = oracle.ComposeTS(physicalTs-(o.gcTTL*1000), logicalTs)
// get minimum safepoint across all services
actualGCSafePoint, err := o.pdClient.UpdateServiceGCSafePoint(ctx, CDCMinGCSafePointCacheServiceID, CDCMinGCSafePointCacheServiceTTL, o.minGCSafePointCache.ts)
zier-one marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
log.Warn("Fail to get actualGCSafePoint from PD.", zap.Error(err))
asddongmen marked this conversation as resolved.
Show resolved Hide resolved
}
if actualGCSafePoint < o.minGCSafePointCache.ts {
o.minGCSafePointCache.ts = actualGCSafePoint
}

o.minGCSafePointCache.lastUpdated = time.Now()
}
return o.minGCSafePointCache.ts
Expand Down Expand Up @@ -136,6 +145,10 @@ type Owner struct {
const (
// CDCServiceSafePointID is the ID of CDC service in pd.UpdateServiceGCSafePoint.
CDCServiceSafePointID = "ticdc"
// CDCMinGCSafePointCacheServiceID is this service GC safe point ID
CDCMinGCSafePointCacheServiceID = "ticdc-minGCSafePoint-cache"
asddongmen marked this conversation as resolved.
Show resolved Hide resolved
// CDCMinGCSafePointCacheServiceTTL is this service GC safe point TTL
CDCMinGCSafePointCacheServiceTTL = 10 * 60 // 10mins
// GCSafepointUpdateInterval is the minimual interval that CDC can update gc safepoint
GCSafepointUpdateInterval = time.Duration(2 * time.Second)
// MinGCSafePointCacheUpdateInterval is the interval that update minGCSafePointCache
Expand Down Expand Up @@ -758,7 +771,7 @@ func (o *Owner) flushChangeFeedInfos(ctx context.Context) error {
// A changefeed will not enter the map twice, because in run(),
// handleAdminJob() will always be executed before flushChangeFeedInfos(),
// ensuring that the previous changefeed in staleChangeFeeds has been stopped and removed from o.changeFeeds.
if changefeed.status.CheckpointTs < minGCSafePoint {
if changefeed.status.CheckpointTs <= minGCSafePoint {
staleChangeFeeds[id] = changefeed.status
}

Expand All @@ -784,7 +797,7 @@ func (o *Owner) flushChangeFeedInfos(ctx context.Context) error {
// If a stopped changefeed's CheckpoinTs < minGCSafePoint, means this changefeed is stagnant.
// It should never be resumed. This part of the logic is in newChangeFeed()
// So here we can skip it.
if status.CheckpointTs < minGCSafePoint {
if status.CheckpointTs <= minGCSafePoint {
continue
}

Expand Down
86 changes: 79 additions & 7 deletions cdc/owner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ import (
"golang.org/x/sync/errgroup"
)

const TiKVGCLifeTime = 10 * 60 * time.Second // 10 min

type ownerSuite struct {
e *embed.Etcd
clientURL *url.URL
Expand Down Expand Up @@ -87,9 +89,10 @@ func (s *ownerSuite) TearDownTest(c *check.C) {

type mockPDClient struct {
pd.Client
invokeCounter int
mockSafePointLost bool
mockPDFailure bool
invokeCounter int
mockSafePointLost bool
mockPDFailure bool
mockTiKVGCLifeTime bool
}

func (m *mockPDClient) GetTS(ctx context.Context) (int64, int64, error) {
Expand All @@ -111,7 +114,10 @@ func (m *mockPDClient) UpdateServiceGCSafePoint(ctx context.Context, serviceID s
if m.mockPDFailure {
return 0, errors.New("injected PD failure")
}

if m.mockTiKVGCLifeTime {
Ts := oracle.GoTimeToTS(time.Now().Add(-TiKVGCLifeTime))
return Ts, nil
}
return safePoint, nil
}

Expand Down Expand Up @@ -186,6 +192,72 @@ func (s *ownerSuite) TestOwnerFlushChangeFeedInfosFailed(c *check.C) {
s.TearDownTest(c)
}

// Test whether it is possible to successfully create a changefeed
// with startTs less than currentTs - gcTTL when tikv_gc_life_time is greater than gc-ttl
func (s *ownerSuite) TestTiKVGCLifeTimeLargeThanGCTTL(c *check.C) {
defer testleak.AfterTest(c)
mockPDCli := &mockPDClient{}
mockPDCli.mockTiKVGCLifeTime = true

changeFeeds := map[model.ChangeFeedID]*changeFeed{
"test_change_feed_1": {
info: &model.ChangeFeedInfo{State: model.StateNormal},
etcdCli: s.client,
status: &model.ChangeFeedStatus{
CheckpointTs: oracle.GoTimeToTS(time.Now().Add(-6 * time.Second)),
},
targetTs: 2000,
ddlState: model.ChangeFeedSyncDML,
taskStatus: model.ProcessorsInfos{
"capture_1": {},
"capture_2": {},
},
taskPositions: map[string]*model.TaskPosition{
"capture_1": {},
"capture_2": {},
},
},
}

session, err := concurrency.NewSession(s.client.Client.Unwrap(),
concurrency.WithTTL(config.GetDefaultServerConfig().CaptureSessionTTL))
c.Assert(err, check.IsNil)

mockOwner := Owner{
session: session,
pdClient: mockPDCli,
etcdClient: s.client,
lastFlushChangefeeds: time.Now(),
flushChangefeedInterval: 1 * time.Hour,
// gcSafepointLastUpdate: time.Now(),
gcTTL: 6, // 6 seconds
changeFeeds: changeFeeds,
cfRWriter: s.client,
stoppedFeeds: make(map[model.ChangeFeedID]*model.ChangeFeedStatus),
minGCSafePointCache: minGCSafePointCacheEntry{},
}

err = mockOwner.flushChangeFeedInfos(s.ctx)
c.Assert(err, check.IsNil)
c.Assert(mockPDCli.invokeCounter, check.Equals, 2)

err = mockOwner.handleAdminJob(s.ctx)
c.Assert(err, check.IsNil)
c.Assert(mockOwner.stoppedFeeds["test_change_feed_1"], check.IsNil)
c.Assert(mockOwner.changeFeeds["test_change_feed_1"].info.State, check.Equals, model.StateNormal)

time.Sleep(7 * time.Second) // wait for gcTTL time pass
err = mockOwner.flushChangeFeedInfos(s.ctx)
c.Assert(err, check.IsNil)
c.Assert(mockPDCli.invokeCounter, check.Equals, 4)

err = mockOwner.handleAdminJob(s.ctx)
c.Assert(err, check.IsNil)
c.Assert(mockOwner.stoppedFeeds["test_change_feed_1"], check.IsNil)

s.TearDownTest(c)
}

// Test whether the owner handles the stagnant task correctly, so that it can't block the update of gcSafePoint.
// If a changefeed is put into the stop queue due to stagnation, it can no longer affect the update of gcSafePoint.
// So we just need to test whether the stagnant changefeed is put into the stop queue.
Expand Down Expand Up @@ -267,7 +339,7 @@ func (s *ownerSuite) TestOwnerHandleStaleChangeFeed(c *check.C) {
time.Sleep(3 * time.Second)
err = mockOwner.flushChangeFeedInfos(s.ctx)
c.Assert(err, check.IsNil)
c.Assert(mockPDCli.invokeCounter, check.Equals, 1)
c.Assert(mockPDCli.invokeCounter, check.Equals, 2)

err = mockOwner.handleAdminJob(s.ctx)
c.Assert(err, check.IsNil)
Expand All @@ -280,7 +352,7 @@ func (s *ownerSuite) TestOwnerHandleStaleChangeFeed(c *check.C) {
time.Sleep(6 * time.Second)
err = mockOwner.flushChangeFeedInfos(s.ctx)
c.Assert(err, check.IsNil)
c.Assert(mockPDCli.invokeCounter, check.Equals, 2)
c.Assert(mockPDCli.invokeCounter, check.Equals, 4)

err = mockOwner.handleAdminJob(s.ctx)
c.Assert(err, check.IsNil)
Expand Down Expand Up @@ -349,7 +421,7 @@ func (s *ownerSuite) TestOwnerUploadGCSafePointOutdated(c *check.C) {

err = mockOwner.flushChangeFeedInfos(s.ctx)
c.Assert(err, check.IsNil)
c.Assert(mockPDCli.invokeCounter, check.Equals, 1)
c.Assert(mockPDCli.invokeCounter, check.Equals, 2)

err = mockOwner.handleAdminJob(s.ctx)
c.Assert(err, check.IsNil)
Expand Down