Skip to content

Commit

Permalink
roachprod: run scheduled backup init without timeout
Browse files Browse the repository at this point in the history
Previously, several roachtests failed during a cluster restart because a node
serving the default scheduled backup command was not ready to serve requests.
Currently, when roachprod start returns, some nodes may not yet be ready to
serve requests.

To prevent this failure mode, this patch changes the scheduled backup cmd
during roachprod.Start() to run with an infinite timeout and only on the first
node in the cluster.

Fixes cockroachdb#97010, cockroachdb#97232

Release note: None

Epic: none
  • Loading branch information
msbutler committed Feb 22, 2023
1 parent 46631a4 commit c1a3eed
Showing 1 changed file with 14 additions and 14 deletions.
28 changes: 14 additions & 14 deletions pkg/roachprod/install/cockroach.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,14 +210,15 @@ func (c *SyncedCluster) Start(ctx context.Context, l *logger.Logger, startOpts S
return res, nil
}

// For single node clusters, this can be skipped because during the c.StartNode call above,
// the `--start-single-node` flag will handle all of this for us.
shouldInit := !c.useStartSingleNode()
if shouldInit {
if err := c.initializeCluster(ctx, l, node); err != nil {
res.Err = err
return res, errors.Wrap(err, "failed to initialize cluster")
}
}

if err := c.setClusterSettings(ctx, l, node); err != nil {
res.Err = err
return res, errors.Wrap(err, "failed to set cluster settings")
Expand Down Expand Up @@ -805,10 +806,6 @@ func (c *SyncedCluster) createFixedBackupSchedule(
ctx context.Context, l *logger.Logger, scheduledBackupArgs string,
) error {
externalStoragePath := `gs://cockroachdb-backup-testing`

if c.IsLocal() {
externalStoragePath = `nodelocal://1`
}
for _, cloud := range c.Clouds() {
if !strings.Contains(cloud, gce.ProviderName) {
l.Printf(`no scheduled backup created as there exists a vm not on google cloud`)
Expand All @@ -817,24 +814,27 @@ func (c *SyncedCluster) createFixedBackupSchedule(
}
l.Printf("%s: creating backup schedule", c.Name)
auth := "AUTH=implicit"

collectionPath := fmt.Sprintf(`%s/roachprod-scheduled-backups/%s/%v?%s`,
externalStoragePath, c.Name, timeutil.Now().UnixNano(), auth)

// Default scheduled backup runs a full backup every hour and an incremental
// every 15 minutes.
scheduleArgs := `RECURRING '*/15 * * * *'
FULL BACKUP '@hourly'
WITH SCHEDULE OPTIONS first_run = 'now'`

scheduleArgs := `RECURRING '*/15 * * * *' FULL BACKUP '@hourly' WITH SCHEDULE OPTIONS first_run = 'now'`
if scheduledBackupArgs != "" {
scheduleArgs = scheduledBackupArgs
}

createScheduleCmd := fmt.Sprintf(`-e
CREATE SCHEDULE IF NOT EXISTS test_only_backup FOR BACKUP INTO '%s' %s`,
createScheduleCmd := fmt.Sprintf(`CREATE SCHEDULE IF NOT EXISTS test_only_backup FOR BACKUP INTO '%s' %s`,
collectionPath, scheduleArgs)
return c.ExecSQL(ctx, l, "" /*tenantName*/, []string{createScheduleCmd})

node := Node(1)
binary := cockroachNodeBinary(c, node)
url := c.NodeURL("localhost", c.NodePort(node), "" /* tenantName */)
fullCmd := fmt.Sprintf(`COCKROACH_CONNECT_TIMEOUT=0 %s sql --url %s -e %q`,
binary, url, createScheduleCmd)
// Instead of using `c.ExecSQL()`, use the more flexible c.Run(), which allows us to
// 1) prefix the schedule backup cmd with COCKROACH_CONNECT_TIMEOUT=0.
// 2) run the command against the first node on the cluster.
return c.Run(ctx, l, l.Stdout, l.Stderr, Nodes{node}, "init scheduled backup", fullCmd)
}

// getEnvVars returns all COCKROACH_* environment variables, in the form
Expand Down

0 comments on commit c1a3eed

Please sign in to comment.