Make DBWaitReadyTimeout configurable
This value is currently hardcoded to 60 seconds, and it is the time that
stolon will wait for Postgres to boot. In some circumstances (e.g. if
there is a lot of WAL to recover) Postgres can take longer than 60
seconds to boot. Allowing this value to be configured lets users cater
for that scenario. We do this by adding it to the cluster spec, so that
this value is consistent across all nodes in the cluster.
Harry Maclean committed May 23, 2019
1 parent a27bcae commit 14bc547
Showing 3 changed files with 13 additions and 4 deletions.
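Because the new field lives in the cluster spec, it is expressed as a JSON duration string (the same shape could be applied to a running cluster with something like `stolonctl update --patch`). Below is a minimal, self-contained Go sketch of how such a value round-trips; the `Duration` wrapper here is an illustrative stand-in for the one in `internal/cluster`, not stolon's actual type.

```go
package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// Duration mimics the cluster.Duration wrapper used in the spec: a
// time.Duration that unmarshals from a string such as "60s" or "5m".
// Illustrative stand-in only, not stolon's actual type.
type Duration struct {
	time.Duration
}

func (d *Duration) UnmarshalJSON(b []byte) error {
	var s string
	if err := json.Unmarshal(b, &s); err != nil {
		return err
	}
	parsed, err := time.ParseDuration(s)
	if err != nil {
		return err
	}
	d.Duration = parsed
	return nil
}

func main() {
	// Hypothetical patch raising the timeout to 5 minutes, e.g. for a
	// standby that must replay a large amount of WAL before accepting
	// connections.
	patch := []byte(`{"dbWaitReadyTimeout": "300s"}`)

	var spec struct {
		DBWaitReadyTimeout *Duration `json:"dbWaitReadyTimeout,omitempty"`
	}
	if err := json.Unmarshal(patch, &spec); err != nil {
		panic(err)
	}
	fmt.Println(spec.DBWaitReadyTimeout.Duration) // 5m0s
}
```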
8 changes: 4 additions & 4 deletions cmd/keeper/cmd/keeper.go
```diff
@@ -1101,7 +1101,7 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
         log.Errorw("failed to start instance", zap.Error(err))
         return
     }
-    if err = pgm.WaitReady(cluster.DefaultDBWaitReadyTimeout); err != nil {
+    if err = pgm.WaitReady(cd.Cluster.DefSpec().DBWaitReadyTimeout.Duration); err != nil {
         log.Errorw("timeout waiting for instance to be ready", zap.Error(err))
         return
     }
```
```diff
@@ -1280,7 +1280,7 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
     fullResync := false
     // if not accepting connection assume that it's blocked waiting for missing wal
     // (see above TODO), so do a full resync using pg_basebackup.
-    if err = pgm.WaitReady(cluster.DefaultDBWaitReadyTimeout); err != nil {
+    if err = pgm.WaitReady(cd.Cluster.DefSpec().DBWaitReadyTimeout.Duration); err != nil {
         log.Errorw("pg_rewinded standby is not accepting connection. it's probably waiting for unavailable wals. Forcing a full resync")
         fullResync = true
     } else {
```
```diff
@@ -1334,7 +1334,7 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
         log.Errorw("failed to start instance", zap.Error(err))
         return
     }
-    if err = pgm.WaitReady(cluster.DefaultDBWaitReadyTimeout); err != nil {
+    if err = pgm.WaitReady(cd.Cluster.DefSpec().DBWaitReadyTimeout.Duration); err != nil {
         log.Errorw("timeout waiting for instance to be ready", zap.Error(err))
         return
     }
```
```diff
@@ -1430,7 +1430,7 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
         log.Errorw("failed to start postgres", zap.Error(err))
         return
     }
-    if err = pgm.WaitReady(cluster.DefaultDBWaitReadyTimeout); err != nil {
+    if err = pgm.WaitReady(cd.Cluster.DefSpec().DBWaitReadyTimeout.Duration); err != nil {
         log.Errorw("timeout waiting for instance to be ready", zap.Error(err))
         return
     }
```
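For context, `pgm.WaitReady` is the call whose timeout this commit makes configurable. A rough sketch of what a WaitReady-style loop does, under the assumption that it polls until the instance accepts connections (the name, signature, and 1-second poll interval are assumptions, not stolon's actual implementation in `internal/postgresql`):

```go
package postgresql

import (
	"fmt"
	"time"
)

// waitReady polls the instance until it accepts connections or the timeout
// elapses. Sketch only: ping stands in for whatever connection check the
// real manager performs.
func waitReady(timeout time.Duration, ping func() error) error {
	start := time.Now()
	for time.Since(start) < timeout {
		if err := ping(); err == nil {
			return nil // the database answered; it is ready
		}
		time.Sleep(1 * time.Second)
	}
	return fmt.Errorf("timeout waiting for db to become ready")
}
```

With the commit applied, the keeper passes `cd.Cluster.DefSpec().DBWaitReadyTimeout.Duration` as this timeout instead of the fixed `cluster.DefaultDBWaitReadyTimeout` of 60 seconds.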
1 change: 1 addition & 0 deletions doc/cluster_spec.md
```diff
@@ -38,6 +38,7 @@ Some options in a running cluster specification can be changed to update the des
 | pgParameters | a map containing the postgres server parameters and their values. The parameters value don't have to be quoted and single quotes don't have to be doubled since this is already done by the keeper when writing the postgresql.conf file | no | map[string]string | |
 | pgHBA | a list containing additional pg_hba.conf entries. They will be added to the pg_hba.conf generated by stolon. **NOTE**: these lines aren't validated so if some of them are wrong postgres will refuse to start or, on reload, will log a warning and ignore the updated pg_hba.conf file | no | []string | null. Will use the default behavior of accepting connections from all hosts for all dbs and users with md5 password authentication |
 | automaticPgRestart | restart postgres automatically after changing the pgParameters that requires restart. Refer `pending_restart` in [pg_settings](https://www.postgresql.org/docs/9.5/static/view-pg-settings.html) | no | bool | false |
+| dbWaitReadyTimeout | Time to wait for the database to become ready after starting. Increase this value if your Postgres takes longer to boot, e.g. because it has to recover a lot of WAL. | no | string (duration) | 60s |
 
 #### ExistingConfig
 
```
8 changes: 8 additions & 0 deletions internal/cluster/cluster.go
```diff
@@ -222,6 +222,8 @@ type ClusterSpec struct {
     InitTimeout *Duration `json:"initTimeout,omitempty"`
     // Interval to wait for a db to be synced with a master
     SyncTimeout *Duration `json:"syncTimeout,omitempty"`
+    // Interval to wait for a db to boot and become ready
+    DBWaitReadyTimeout *Duration `json:"dbWaitReadyTimeout,omitempty"`
     // Interval after the first fail to declare a keeper or a db as not healthy.
     FailInterval *Duration `json:"failInterval,omitempty"`
     // Interval after which a dead keeper will be removed from the cluster data
```
```diff
@@ -353,6 +355,9 @@ func (os *ClusterSpec) WithDefaults() *ClusterSpec {
     if s.SyncTimeout == nil {
         s.SyncTimeout = &Duration{Duration: DefaultSyncTimeout}
     }
+    if s.DBWaitReadyTimeout == nil {
+        s.DBWaitReadyTimeout = &Duration{Duration: DefaultDBWaitReadyTimeout}
+    }
     if s.FailInterval == nil {
         s.FailInterval = &Duration{Duration: DefaultFailInterval}
     }
```
```diff
@@ -418,6 +423,9 @@ func (os *ClusterSpec) Validate() error {
     if s.SyncTimeout.Duration < 0 {
         return fmt.Errorf("syncTimeout must be positive")
     }
+    if s.DBWaitReadyTimeout.Duration < 0 {
+        return fmt.Errorf("dbWaitReadyTimeout must be positive")
+    }
     if s.DeadKeeperRemovalInterval.Duration < 0 {
         return fmt.Errorf("deadKeeperRemovalInterval must be positive")
     }
```
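One detail the keeper-side calls rely on: `DefSpec()` returns the spec with defaults applied, which is why `DBWaitReadyTimeout` can be dereferenced without a nil check even on clusters whose spec never set the field. A plausible sketch of that accessor, assumed here and consistent with the `WithDefaults` hunk above:

```go
// DefSpec returns the cluster spec with all unset fields filled in with
// their defaults, so *Duration fields like DBWaitReadyTimeout are never nil.
// Assumed sketch of the accessor invoked as cd.Cluster.DefSpec() in keeper.go.
func (c *Cluster) DefSpec() *ClusterSpec {
	return c.Spec.WithDefaults()
}
```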
