Skip to content

Commit bd0582a

Browse files
committed
spec.CheckpointBeforePgrewind
Provide a configuration parameter that enables issuing of a CHECKPOINT prior to starting a pg_rewind, ensuring the control file is up-to-date with the latest timeline information before the rewind process starts. When this setting is disabled, it's possible for a newly promoted master to have a control file with a timeline from before the promotion occurred. If a follower tries to resync using rewind against this database it will quit without doing anything as it will see no timeline deviation when it reads the control file, stating "no rewind required". Some users run large databases where the cost of a basebackup is many times more expensive than rewind. These users would probably trade the performance impact of immediate checkpointing for faster recovery of the standby, especially for those using synchronous replication who may be unable to accept writes until the standby is recovered. For now, this change will at least enable a "rewind at all costs" option as opposed to the "maybe rewind if possible" that existed before. Future work might support a retry timeout for rewind operations, for those users who don't want the performance hit but do value rewinding over basebackups. We could also use the backup API to trigger a checkpoint with the target Postgres spread configuration, then retry rewinding until the checkpoint_timeout has elapsed.
1 parent dead601 commit bd0582a

File tree

7 files changed

+302
-45
lines changed

7 files changed

+302
-45
lines changed

cmd/keeper/cmd/keeper.go

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -840,8 +840,10 @@ func (p *PostgresKeeper) resync(db, followedDB *cluster.DB, tryPgrewind bool) er
840840
// fallback to pg_basebackup
841841
if tryPgrewind && p.usePgrewind(db) {
842842
connParams := p.getSUConnParams(db, followedDB)
843-
log.Infow("syncing using pg_rewind", "followedDB", followedDB.UID, "keeper", followedDB.Spec.KeeperUID)
844-
if err := pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword); err != nil {
843+
checkpointBeforePgrewind := db.Spec.CheckpointBeforePgrewind
844+
log.Infow("syncing using pg_rewind", "followedDB", followedDB.UID,
845+
"keeper", followedDB.Spec.KeeperUID, "forcingCheckpoint", checkpointBeforePgrewind)
846+
if err := pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword, checkpointBeforePgrewind); err != nil {
845847
// log pg_rewind error and fallback to pg_basebackup
846848
log.Errorw("error syncing with pg_rewind", zap.Error(err))
847849
} else {
@@ -1284,19 +1286,18 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
12841286
tryPgrewind = false
12851287
}
12861288

1287-
// TODO(sgotti) pg_rewind considers databases on the same timeline
1288-
// as in sync and doesn't check if they diverged at different
1289-
// position in previous timelines.
1290-
// So check that the db as been synced or resync again with
1291-
// pg_rewind disabled. Will need to report this upstream.
1292-
1293-
// TODO(sgotti) The rewinded standby needs wal from the master
1294-
// starting from the common ancestor, if they aren't available the
1295-
// instance will keep waiting for them, now we assume that if the
1296-
// instance isn't ready after the start timeout, it's waiting for
1297-
// wals and we'll force a full resync.
1298-
// We have to find a better way to detect if a standby is waiting
1299-
// for unavailable wals.
1289+
// TODO(sgotti) pg_rewind considers databases on the same timeline as in sync and
1290+
// doesn't check if they diverged at different position in previous timelines. So
1291+
// check that the db has been synced or resync again with pg_rewind disabled. Will
1292+
// need to report this upstream.
1293+
1294+
// TODO(sgotti) The rewinded standby needs wal from the master starting from the
1295+
// common ancestor, if they aren't available the instance will keep waiting for
1296+
// them, now we assume that if the instance isn't ready after the start timeout,
1297+
// it's waiting for wals and we'll force a full resync.
1298+
//
1299+
// We have to find a better way to detect if a standby is waiting for unavailable
1300+
// wals.
13001301
if err = p.resync(db, followedDB, tryPgrewind); err != nil {
13011302
log.Errorw("failed to resync from followed instance", zap.Error(err))
13021303
return

cmd/sentinel/cmd/sentinel.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,7 @@ func (s *Sentinel) setDBSpecFromClusterSpec(cd *cluster.ClusterData) {
377377
db.Spec.RequestTimeout = *clusterSpec.RequestTimeout
378378
db.Spec.MaxStandbys = *clusterSpec.MaxStandbys
379379
db.Spec.UsePgrewind = *clusterSpec.UsePgrewind
380+
db.Spec.CheckpointBeforePgrewind = *clusterSpec.CheckpointBeforePgrewind
380381
db.Spec.PGParameters = clusterSpec.PGParameters
381382
db.Spec.PGHBA = clusterSpec.PGHBA
382383
if db.Spec.FollowConfig != nil && db.Spec.FollowConfig.Type == cluster.FollowTypeExternal {

doc/cluster_spec.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ Some options in a running cluster specification can be changed to update the des
2727
| additionalWalSenders | number of additional wal_senders in addition to the ones internally defined by stolon, useful to provide enough wal senders for external standbys (changing this value requires an instance restart) | no | uint16 | 5 |
2828
| additionalMasterReplicationSlots | a list of additional physical replication slots to be created on the master postgres instance. They will be prefixed with `stolon_` (like internal replication slots used for standby replication) to make them "namespaced" from other replication slots. Replication slots starting with `stolon_` and not defined here (and not used for standby replication) will be dropped from the master instance. | no | []string | null |
2929
| usePgrewind | try to use pg_rewind for faster instance resyncronization. | no | bool | false |
30+
| checkpointBeforePgrewind | Force a checkpoint before pg_rewind to prevent the rewind racing the checkpointer process after a standby is newly promoted. This will cause increased IO on whatever Postgres node the currently resync'ing Postgres is following, as the checkpoint will be immediate and will not respect spread configuration. | no | bool | false |
3031
| initMode | The cluster initialization mode. Can be *new* or *existing*. *new* means that a new db cluster will be created on a random keeper and the other keepers will sync with it. *existing* means that a keeper (that needs to have an already created db cluster) will be choosed as the initial master and the other keepers will sync with it. In this case the `existingConfig` object needs to be populated. | yes | string | |
3132
| existingConfig | configuration for initMode of type "existing" | if initMode is "existing" | ExistingConfig | |
3233
| mergePgParameters | merge pgParameters of the initialized db cluster, useful the retain initdb generated parameters when InitMode is new, retain current parameters when initMode is existing or pitr. | no | bool | true |

internal/cluster/cluster.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ const (
6666
DefaultMaxSynchronousStandbys uint16 = 1
6767
DefaultAdditionalWalSenders = 5
6868
DefaultUsePgrewind = false
69+
DefaultCheckpointBeforePgrewind = false
6970
DefaultMergePGParameter = true
7071
DefaultRole ClusterRole = ClusterRoleMaster
7172
DefaultSUReplAccess SUReplAccessMode = SUReplAccessAll
@@ -261,6 +262,8 @@ type ClusterSpec struct {
261262
AdditionalMasterReplicationSlots []string `json:"additionalMasterReplicationSlots"`
262263
// Whether to use pg_rewind
263264
UsePgrewind *bool `json:"usePgrewind,omitempty"`
265+
// Whether to issue a CHECKPOINT before attempting a rewind
266+
CheckpointBeforePgrewind *bool `json:"checkpointBeforePgrewind,omitempty"`
264267
// InitMode defines the cluster initialization mode. Current modes are: new, existing, pitr
265268
InitMode *ClusterInitMode `json:"initMode,omitempty"`
266269
// Whether to merge pgParameters of the initialized db cluster, useful
@@ -379,6 +382,9 @@ func (os *ClusterSpec) WithDefaults() *ClusterSpec {
379382
if s.UsePgrewind == nil {
380383
s.UsePgrewind = BoolP(DefaultUsePgrewind)
381384
}
385+
if s.CheckpointBeforePgrewind == nil {
386+
s.CheckpointBeforePgrewind = BoolP(DefaultCheckpointBeforePgrewind)
387+
}
382388
if s.MinSynchronousStandbys == nil {
383389
s.MinSynchronousStandbys = Uint16P(DefaultMinSynchronousStandbys)
384390
}
@@ -607,6 +613,8 @@ type DBSpec struct {
607613
SynchronousReplication bool `json:"synchronousReplication,omitempty"`
608614
// Whether to use pg_rewind
609615
UsePgrewind bool `json:"usePgrewind,omitempty"`
616+
// Whether to issue a CHECKPOINT before attempting a rewind
617+
CheckpointBeforePgrewind bool `json:"checkpointBeforePgrewind,omitempty"`
610618
// AdditionalWalSenders defines the number of additional wal_senders in
611619
// addition to the ones internally defined by stolon
612620
AdditionalWalSenders uint16 `json:"additionalWalSenders"`

internal/postgresql/postgresql.go

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -760,7 +760,7 @@ func (p *Manager) createPostgresqlAutoConf() error {
760760
return nil
761761
}
762762

763-
func (p *Manager) SyncFromFollowedPGRewind(followedConnParams ConnParams, password string) error {
763+
func (p *Manager) SyncFromFollowedPGRewind(followedConnParams ConnParams, password string, forceCheckpoint bool) error {
764764
// Remove postgresql.auto.conf since pg_rewind will error if it's a symlink to /dev/null
765765
pgAutoConfPath := filepath.Join(p.dataDir, postgresAutoConf)
766766
if err := os.Remove(pgAutoConfPath); err != nil && !os.IsNotExist(err) {
@@ -786,6 +786,32 @@ func (p *Manager) SyncFromFollowedPGRewind(followedConnParams ConnParams, passwo
786786
followedConnParams.Set("options", "-c synchronous_commit=off")
787787
followedConnString := followedConnParams.ConnString()
788788

789+
// We need to issue a checkpoint on the source before pg_rewind'ing as until the primary
790+
// checkpoints the global/pg_control file won't contain up-to-date information about
791+
// what timeline the primary exists in.
792+
//
793+
// Imagine everyone is on timeline 1, then we promote a node to timeline 2. Standbys
794+
// attempt to replicate from the newly promoted node but fail due to diverged timelines.
795+
// pg_rewind is then used to resync the standbys, but if the new primary hasn't yet
796+
// checkpointed, the pg_control file will tell us we're both on the same timeline (1)
797+
// and pg_rewind will exit without performing any action.
798+
//
799+
// If we checkpoint before invoking pg_rewind we will avoid this problem, at the slight
800+
// cost of forcing a checkpoint on a newly promoted node, which might hurt performance.
801+
// We (GoCardless) can't afford this, so we take the performance penalty to avoid hours
802+
// of downtime.
803+
if forceCheckpoint {
804+
log.Infow("issuing checkpoint on primary")
805+
psqlName := filepath.Join(p.pgBinPath, "psql")
806+
cmd := exec.Command(psqlName, followedConnString, "-c", "CHECKPOINT;")
807+
cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSFILE=%s", pgpass.Name()))
808+
cmd.Stdout = os.Stdout
809+
cmd.Stderr = os.Stderr
810+
if err := cmd.Run(); err != nil {
811+
return fmt.Errorf("error: %v", err)
812+
}
813+
}
814+
789815
log.Infow("running pg_rewind")
790816
name := filepath.Join(p.pgBinPath, "pg_rewind")
791817
cmd := exec.Command(name, "--debug", "-D", p.dataDir, "--source-server="+followedConnString)

0 commit comments

Comments
 (0)