Make DBWaitReadyTimeout configurable
This value is currently hardcoded to 60 seconds, and it is the time that
stolon will wait for Postgres to boot. In some circumstances (e.g. if
there is a lot of WAL to recover) Postgres can take longer than 60
seconds to boot. Allowing this value to be configured lets users cater
for that scenario. We do this by adding it to the cluster spec, so that
this value is consistent across all nodes in the cluster.
Harry Maclean committed May 23, 2019
1 parent a27bcae commit 14bc547
Showing 3 changed files with 13 additions and 4 deletions.
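Because the new field lives in the cluster spec, it is expressed as a JSON duration string (the same shape could be applied to a running cluster with something like `stolonctl update --patch`). Below is a minimal, self-contained Go sketch of how such a value round-trips; the `Duration` wrapper here is an illustrative stand-in for the one in `internal/cluster`, not stolon's actual type.

```go
package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// Duration mimics the cluster.Duration wrapper used in the spec: a
// time.Duration that unmarshals from a string such as "60s" or "5m".
// Illustrative stand-in only, not stolon's actual type.
type Duration struct {
	time.Duration
}

func (d *Duration) UnmarshalJSON(b []byte) error {
	var s string
	if err := json.Unmarshal(b, &s); err != nil {
		return err
	}
	parsed, err := time.ParseDuration(s)
	if err != nil {
		return err
	}
	d.Duration = parsed
	return nil
}

func main() {
	// Hypothetical patch raising the timeout to 5 minutes, e.g. for a
	// standby that must replay a large amount of WAL before accepting
	// connections.
	patch := []byte(`{"dbWaitReadyTimeout": "300s"}`)

	var spec struct {
		DBWaitReadyTimeout *Duration `json:"dbWaitReadyTimeout,omitempty"`
	}
	if err := json.Unmarshal(patch, &spec); err != nil {
		panic(err)
	}
	fmt.Println(spec.DBWaitReadyTimeout.Duration) // 5m0s
}
```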
8 changes: 4 additions & 4 deletions cmd/keeper/cmd/keeper.go
```diff
@@ -1101,7 +1101,7 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
         log.Errorw("failed to start instance", zap.Error(err))
         return
     }
-    if err = pgm.WaitReady(cluster.DefaultDBWaitReadyTimeout); err != nil {
+    if err = pgm.WaitReady(cd.Cluster.DefSpec().DBWaitReadyTimeout.Duration); err != nil {
         log.Errorw("timeout waiting for instance to be ready", zap.Error(err))
         return
     }
```
```diff
@@ -1280,7 +1280,7 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
     fullResync := false
     // if not accepting connection assume that it's blocked waiting for missing wal
     // (see above TODO), so do a full resync using pg_basebackup.
-    if err = pgm.WaitReady(cluster.DefaultDBWaitReadyTimeout); err != nil {
+    if err = pgm.WaitReady(cd.Cluster.DefSpec().DBWaitReadyTimeout.Duration); err != nil {
         log.Errorw("pg_rewinded standby is not accepting connection. it's probably waiting for unavailable wals. Forcing a full resync")
         fullResync = true
     } else {
```
```diff
@@ -1334,7 +1334,7 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
         log.Errorw("failed to start instance", zap.Error(err))
         return
     }
-    if err = pgm.WaitReady(cluster.DefaultDBWaitReadyTimeout); err != nil {
+    if err = pgm.WaitReady(cd.Cluster.DefSpec().DBWaitReadyTimeout.Duration); err != nil {
         log.Errorw("timeout waiting for instance to be ready", zap.Error(err))
         return
     }
```
```diff
@@ -1430,7 +1430,7 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
         log.Errorw("failed to start postgres", zap.Error(err))
         return
     }
-    if err = pgm.WaitReady(cluster.DefaultDBWaitReadyTimeout); err != nil {
+    if err = pgm.WaitReady(cd.Cluster.DefSpec().DBWaitReadyTimeout.Duration); err != nil {
         log.Errorw("timeout waiting for instance to be ready", zap.Error(err))
         return
     }
```
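For context, `pgm.WaitReady` is the call whose timeout this commit makes configurable. A rough sketch of what a WaitReady-style loop does, under the assumption that it polls until the instance accepts connections (the name, signature, and 1-second poll interval are assumptions, not stolon's actual implementation in `internal/postgresql`):

```go
package postgresql

import (
	"fmt"
	"time"
)

// waitReady polls the instance until it accepts connections or the timeout
// elapses. Sketch only: ping stands in for whatever connection check the
// real manager performs.
func waitReady(timeout time.Duration, ping func() error) error {
	start := time.Now()
	for time.Since(start) < timeout {
		if err := ping(); err == nil {
			return nil // the database answered; it is ready
		}
		time.Sleep(1 * time.Second)
	}
	return fmt.Errorf("timeout waiting for db to become ready")
}
```

With the commit applied, the keeper passes `cd.Cluster.DefSpec().DBWaitReadyTimeout.Duration` as this timeout instead of the fixed `cluster.DefaultDBWaitReadyTimeout` of 60 seconds.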
1 change: 1 addition & 0 deletions doc/cluster_spec.md
```diff
@@ -38,6 +38,7 @@ Some options in a running cluster specification can be changed to update the des
 | pgParameters | a map containing the postgres server parameters and their values. The parameters value don't have to be quoted and single quotes don't have to be doubled since this is already done by the keeper when writing the postgresql.conf file | no | map[string]string | |
 | pgHBA | a list containing additional pg_hba.conf entries. They will be added to the pg_hba.conf generated by stolon. **NOTE**: these lines aren't validated so if some of them are wrong postgres will refuse to start or, on reload, will log a warning and ignore the updated pg_hba.conf file | no | []string | null. Will use the default behavior of accepting connections from all hosts for all dbs and users with md5 password authentication |
 | automaticPgRestart | restart postgres automatically after changing the pgParameters that requires restart. Refer `pending_restart` in [pg_settings](https://www.postgresql.org/docs/9.5/static/view-pg-settings.html) | no | bool | false |
+| dbWaitReadyTimeout | Time to wait for the database to become ready after starting. Increase this value if your Postgres takes longer to boot, e.g. because it has to recover a lot of WAL. | no | string (duration) | 60s |
 
 #### ExistingConfig
 
```
8 changes: 8 additions & 0 deletions internal/cluster/cluster.go
```diff
@@ -222,6 +222,8 @@ type ClusterSpec struct {
     InitTimeout *Duration `json:"initTimeout,omitempty"`
     // Interval to wait for a db to be synced with a master
     SyncTimeout *Duration `json:"syncTimeout,omitempty"`
+    // Interval to wait for a db to boot and become ready
+    DBWaitReadyTimeout *Duration `json:"dbWaitReadyTimeout,omitempty"`
     // Interval after the first fail to declare a keeper or a db as not healthy.
     FailInterval *Duration `json:"failInterval,omitempty"`
     // Interval after which a dead keeper will be removed from the cluster data
```
```diff
@@ -353,6 +355,9 @@ func (os *ClusterSpec) WithDefaults() *ClusterSpec {
     if s.SyncTimeout == nil {
         s.SyncTimeout = &Duration{Duration: DefaultSyncTimeout}
     }
+    if s.DBWaitReadyTimeout == nil {
+        s.DBWaitReadyTimeout = &Duration{Duration: DefaultDBWaitReadyTimeout}
+    }
     if s.FailInterval == nil {
         s.FailInterval = &Duration{Duration: DefaultFailInterval}
     }
```
```diff
@@ -418,6 +423,9 @@ func (os *ClusterSpec) Validate() error {
     if s.SyncTimeout.Duration < 0 {
         return fmt.Errorf("syncTimeout must be positive")
     }
+    if s.DBWaitReadyTimeout.Duration < 0 {
+        return fmt.Errorf("dbWaitReadyTimeout must be positive")
+    }
     if s.DeadKeeperRemovalInterval.Duration < 0 {
         return fmt.Errorf("deadKeeperRemovalInterval must be positive")
     }
```
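One detail the keeper-side calls rely on: `DefSpec()` returns the spec with defaults applied, which is why `DBWaitReadyTimeout` can be dereferenced without a nil check even on clusters whose spec never set the field. A plausible sketch of that accessor, assumed here and consistent with the `WithDefaults` hunk above:

```go
// DefSpec returns the cluster spec with all unset fields filled in with
// their defaults, so *Duration fields like DBWaitReadyTimeout are never nil.
// Assumed sketch of the accessor invoked as cd.Cluster.DefSpec() in keeper.go.
func (c *Cluster) DefSpec() *ClusterSpec {
	return c.Spec.WithDefaults()
}
```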
