Skip to content

Commit

Permalink
ebs br: control the snapshots batch size for fsr enable/disable (#48506
Browse files Browse the repository at this point in the history
…) (#48526)

close #48505
  • Loading branch information
ti-chi-bot committed Dec 11, 2023
1 parent 23796a4 commit 80f96ed
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 33 deletions.
79 changes: 48 additions & 31 deletions br/pkg/aws/ebs.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
const (
pollingPendingSnapshotInterval = 30 * time.Second
errCodeTooManyPendingSnapshots = "PendingSnapshotLimitExceeded"
FsrApiSnapshotsThreshold = 10
)

type EC2Session struct {
Expand Down Expand Up @@ -294,24 +295,32 @@ func (e *EC2Session) EnableDataFSR(meta *config.EBSBasedBRMeta, targetAZ string)

for availableZone := range snapshotsIDsMap {
targetAZ := availableZone
eg.Go(func() error {
log.Info("enable fsr for snapshots", zap.String("available zone", targetAZ))
resp, err := e.ec2.EnableFastSnapshotRestores(&ec2.EnableFastSnapshotRestoresInput{
AvailabilityZones: []*string{&targetAZ},
SourceSnapshotIds: snapshotsIDsMap[targetAZ],
})

if err != nil {
return errors.Trace(err)
// We have to control the batch size to avoid the error of "parameter SourceSnapshotIds must be less than or equal to 10"
for i := 0; i < len(snapshotsIDsMap[targetAZ]); i += FsrApiSnapshotsThreshold {
start := i
end := i + FsrApiSnapshotsThreshold
if end > len(snapshotsIDsMap[targetAZ]) {
end = len(snapshotsIDsMap[targetAZ])
}
eg.Go(func() error {
log.Info("enable fsr for snapshots", zap.String("available zone", targetAZ), zap.Any("snapshots", snapshotsIDsMap[targetAZ][start:end]))
resp, err := e.ec2.EnableFastSnapshotRestores(&ec2.EnableFastSnapshotRestoresInput{
AvailabilityZones: []*string{&targetAZ},
SourceSnapshotIds: snapshotsIDsMap[targetAZ][start:end],
})

if len(resp.Unsuccessful) > 0 {
log.Warn("not all snapshots enabled FSR")
return errors.Errorf("Some snapshot fails to enable FSR for available zone %s, such as %s, error code is %v", targetAZ, *resp.Unsuccessful[0].SnapshotId, resp.Unsuccessful[0].FastSnapshotRestoreStateErrors)
}
if err != nil {
return errors.Trace(err)
}

return e.waitDataFSREnabled(snapshotsIDsMap[targetAZ], targetAZ)
})
if len(resp.Unsuccessful) > 0 {
log.Warn("not all snapshots enabled FSR")
return errors.Errorf("Some snapshot fails to enable FSR for available zone %s, such as %s, error code is %v", targetAZ, *resp.Unsuccessful[0].SnapshotId, resp.Unsuccessful[0].FastSnapshotRestoreStateErrors)
}

return e.waitDataFSREnabled(snapshotsIDsMap[targetAZ][start:end], targetAZ)
})
}
}
return snapshotsIDsMap, eg.Wait()
}
Expand All @@ -329,7 +338,7 @@ func (e *EC2Session) waitDataFSREnabled(snapShotIDs []*string, targetAZ string)
log.Info("starts check fsr pending snapshots", zap.Any("snapshots", pendingSnapshots), zap.String("available zone", targetAZ))
for {
if len(pendingSnapshots) == 0 {
log.Info("all snapshots fsr enablement is finished", zap.String("available zone", targetAZ))
log.Info("all snapshots in current batch fsr enablement is finished", zap.String("available zone", targetAZ), zap.Any("snapshots", snapShotIDs))
return nil
}

Expand Down Expand Up @@ -380,25 +389,33 @@ func (e *EC2Session) DisableDataFSR(snapshotsIDsMap map[string][]*string) error

for availableZone := range snapshotsIDsMap {
targetAZ := availableZone
eg.Go(func() error {
resp, err := e.ec2.DisableFastSnapshotRestores(&ec2.DisableFastSnapshotRestoresInput{
AvailabilityZones: []*string{&targetAZ},
SourceSnapshotIds: snapshotsIDsMap[targetAZ],
})

if err != nil {
return errors.Trace(err)
// We have to control the batch size to avoid the error of "parameter SourceSnapshotIds must be less than or equal to 10"
for i := 0; i < len(snapshotsIDsMap[targetAZ]); i += FsrApiSnapshotsThreshold {
start := i
end := i + FsrApiSnapshotsThreshold
if end > len(snapshotsIDsMap[targetAZ]) {
end = len(snapshotsIDsMap[targetAZ])
}
eg.Go(func() error {
resp, err := e.ec2.DisableFastSnapshotRestores(&ec2.DisableFastSnapshotRestoresInput{
AvailabilityZones: []*string{&targetAZ},
SourceSnapshotIds: snapshotsIDsMap[targetAZ][start:end],
})

if len(resp.Unsuccessful) > 0 {
log.Warn("not all snapshots disabled FSR", zap.String("available zone", targetAZ))
return errors.Errorf("Some snapshot fails to disable FSR for available zone %s, such as %s, error code is %v", targetAZ, *resp.Unsuccessful[0].SnapshotId, resp.Unsuccessful[0].FastSnapshotRestoreStateErrors)
}
if err != nil {
return errors.Trace(err)
}

log.Info("Disable FSR issued", zap.String("available zone", targetAZ))
if len(resp.Unsuccessful) > 0 {
log.Warn("not all snapshots disabled FSR", zap.String("available zone", targetAZ))
return errors.Errorf("Some snapshot fails to disable FSR for available zone %s, such as %s, error code is %v", targetAZ, *resp.Unsuccessful[0].SnapshotId, resp.Unsuccessful[0].FastSnapshotRestoreStateErrors)
}

return nil
})
log.Info("Disable FSR issued", zap.String("available zone", targetAZ), zap.Any("snapshots", snapshotsIDsMap[targetAZ][start:end]))

return nil
})
}
}
return eg.Wait()
}
Expand Down
5 changes: 3 additions & 2 deletions br/pkg/task/restore_ebs_meta.go
Original file line number Diff line number Diff line change
Expand Up @@ -241,10 +241,11 @@ func (h *restoreEBSMetaHelper) restoreVolumes(progress glue.Progress) (map[strin
log.Error("failed to create all volumes, cleaning up created volume")
ec2Session.DeleteVolumes(volumeIDMap)
}

if h.cfg.UseFSR {
err = ec2Session.DisableDataFSR(snapshotsIDsMap)
log.Error("disable fsr failed", zap.Error(err))
if err != nil {
log.Error("disable fsr failed", zap.Error(err))
}
}
}()

Expand Down

0 comments on commit 80f96ed

Please sign in to comment.