From 14f5a7e1b98ec6dea91ca45321d3be90854fa6d2 Mon Sep 17 00:00:00 2001 From: BornChanger <97348524+BornChanger@users.noreply.github.com> Date: Mon, 18 Dec 2023 22:16:41 +0800 Subject: [PATCH] ebs br: make sure backup can be resumed even after being paused for a long time (#5464) --- .../backupschedule/backup_schedule_manager.go | 14 ++++++-------- .../backup_schedule_manager_test.go | 6 +++++- .../backupschedule/backup_schedule_manager.go | 15 +++++++-------- .../backup_schedule_manager_test.go | 6 +++++- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/pkg/backup/backupschedule/backup_schedule_manager.go b/pkg/backup/backupschedule/backup_schedule_manager.go index e1330e916e..1f4ff7a8c6 100644 --- a/pkg/backup/backupschedule/backup_schedule_manager.go +++ b/pkg/backup/backupschedule/backup_schedule_manager.go @@ -192,20 +192,18 @@ func getLastScheduledTime(bs *v1alpha1.BackupSchedule, nowFn nowFn) (*time.Time, // If there is a bug somewhere, or incorrect clock // on controller's server or apiservers (for setting creationTimestamp) // then there could be so many missed start times (it could be off - // by decades or more), that it would eat up all the CPU and memory - // of this controller. In that case, we want to not try to list - // all the missed start times. - // - // I've somewhat arbitrarily picked 100, as more than 80, - // but less than "lots". - if len(scheduledTimes) > 100 { + // by decades or more). So, we need to set LastBackupTime to now() in order to let + // next reconcile succeed. + if len(scheduledTimes) > 1000 { // We can't get the last backup schedule time if bs.Status.LastBackupTime == nil && bs.Status.AllBackupCleanTime != nil { // Recovery backup schedule from pause status, should refresh AllBackupCleanTime to avoid unschedulable problem bs.Status.AllBackupCleanTime = &metav1.Time{Time: nowFn()} return nil, controller.RequeueErrorf("recovery backup schedule %s/%s from pause status, refresh AllBackupCleanTime.", ns, bsName) } - klog.Error("Too many missed start backup schedule time (> 100). Check the clock.") + klog.Warning("Too many missed start backup schedule time (> 1000). Fail current one.") + offset := sched.Next(t).Sub(t) + bs.Status.LastBackupTime = &metav1.Time{Time: time.Now().Add(-offset)} return nil, nil } } diff --git a/pkg/backup/backupschedule/backup_schedule_manager_test.go b/pkg/backup/backupschedule/backup_schedule_manager_test.go index 26f6a80310..4573ccff1a 100644 --- a/pkg/backup/backupschedule/backup_schedule_manager_test.go +++ b/pkg/backup/backupschedule/backup_schedule_manager_test.go @@ -176,10 +176,14 @@ func TestGetLastScheduledTime(t *testing.T) { } // test too many miss - bs.Status.LastBackupTime.Time = now.AddDate(-1000, 0, 0) + bs.Status.LastBackupTime.Time = now.AddDate(-10, 0, 0) getTime, err = getLastScheduledTime(bs, time.Now) g.Expect(err).Should(BeNil()) g.Expect(getTime).Should(BeNil()) + // next reconcile should succeed + getTime, err = getLastScheduledTime(bs, time.Now) + g.Expect(err).Should(BeNil()) + g.Expect(getTime).ShouldNot(BeNil()) } func TestBuildBackup(t *testing.T) { diff --git a/pkg/fedvolumebackup/backupschedule/backup_schedule_manager.go b/pkg/fedvolumebackup/backupschedule/backup_schedule_manager.go index 618097a175..8f4853a6fc 100644 --- a/pkg/fedvolumebackup/backupschedule/backup_schedule_manager.go +++ b/pkg/fedvolumebackup/backupschedule/backup_schedule_manager.go @@ -117,20 +117,19 @@ func getLastScheduledTime(vbs *v1alpha1.VolumeBackupSchedule, nowFn nowFn) (*tim // If there is a bug somewhere, or incorrect clock // on controller's server or apiservers (for setting creationTimestamp) // then there could be so many missed start times (it could be off - // by decades or more), that it would eat up all the CPU and memory - // of this controller. In that case, we want to not try to list - // all the missed start times. - // - // I've somewhat arbitrarily picked 100, as more than 80, - // but less than "lots". - if len(scheduledTimes) > 100 { + // by decades or more). So, we need to set LastBackupTime to now() in order to let + // next reconcile succeed. + if len(scheduledTimes) > 1000 { // We can't get the last backup schedule time if vbs.Status.LastBackupTime == nil && vbs.Status.AllBackupCleanTime != nil { // Recovery backup schedule from pause status, should refresh AllBackupCleanTime to avoid unschedulable problem vbs.Status.AllBackupCleanTime = &metav1.Time{Time: nowFn()} return nil, controller.RequeueErrorf("recovery backup schedule %s/%s from pause status, refresh AllBackupCleanTime.", ns, bsName) } - klog.Error("Too many missed start backup schedule time (> 100). Check the clock.") + + klog.Warning("Too many missed start backup schedule time (> 1000). Fail current one.") + offset := sched.Next(t).Sub(t) + vbs.Status.LastBackupTime = &metav1.Time{Time: time.Now().Add(-offset)} return nil, nil } } diff --git a/pkg/fedvolumebackup/backupschedule/backup_schedule_manager_test.go b/pkg/fedvolumebackup/backupschedule/backup_schedule_manager_test.go index 6ab06eb1b5..90d6039559 100644 --- a/pkg/fedvolumebackup/backupschedule/backup_schedule_manager_test.go +++ b/pkg/fedvolumebackup/backupschedule/backup_schedule_manager_test.go @@ -170,10 +170,14 @@ func TestGetLastScheduledTime(t *testing.T) { } // test too many miss - bs.Status.LastBackupTime.Time = now.AddDate(-1000, 0, 0) + bs.Status.LastBackupTime.Time = now.AddDate(-10, 0, 0) getTime, err = getLastScheduledTime(bs, time.Now) g.Expect(err).Should(BeNil()) g.Expect(getTime).Should(BeNil()) + getTime, err = getLastScheduledTime(bs, time.Now) + // next reconcile should succeed + g.Expect(err).Should(BeNil()) + g.Expect(getTime).ShouldNot(BeNil()) } func TestBuildBackup(t *testing.T) {