Skip to content

Commit bd0582a

Browse files
committed
spec.CheckpointBeforePgrewind
Provide a configuration parameter that enables issuing of a CHECKPOINT prior to starting a pg_rewind, ensuring the control file is up-to-date with the latest timeline information before the rewind process starts. When this setting is disabled, it's possible for a newly promoted master to have a control file with a timeline from before the promotion occurred. If a follower tries to resync using rewind against this database it will quit without doing anything as it will see no timeline deviation when it reads the control file, stating "no rewind required". Some users run large databases where the cost of a basebackup is many times more expensive than rewind. These users would probably trade the performance impact of immediate checkpointing for faster recovery of the standby, especially for those using synchronous replication who may be unable to accept writes until the standby is recovered. For now, this change will at least enable a "rewind at all costs" option as opposed to the "maybe rewind if possible" that existed before. Future work might support a retry timeout for rewind operations, for those users who don't want the performance hit but do value rewinding over basebackups. We could also use the backup API to trigger a checkpoint with the target Postgres spread configuration, then retry rewinding until the checkpoint_timeout has elapsed.
1 parent dead601 commit bd0582a

File tree

7 files changed

+302
-45
lines changed

7 files changed

+302
-45
lines changed

cmd/keeper/cmd/keeper.go

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -840,8 +840,10 @@ func (p *PostgresKeeper) resync(db, followedDB *cluster.DB, tryPgrewind bool) er
840840
// fallback to pg_basebackup
841841
if tryPgrewind && p.usePgrewind(db) {
842842
connParams := p.getSUConnParams(db, followedDB)
843-
log.Infow("syncing using pg_rewind", "followedDB", followedDB.UID, "keeper", followedDB.Spec.KeeperUID)
844-
if err := pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword); err != nil {
843+
checkpointBeforePgrewind := db.Spec.CheckpointBeforePgrewind
844+
log.Infow("syncing using pg_rewind", "followedDB", followedDB.UID,
845+
"keeper", followedDB.Spec.KeeperUID, "forcingCheckpoint", checkpointBeforePgrewind)
846+
if err := pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword, checkpointBeforePgrewind); err != nil {
845847
// log pg_rewind error and fallback to pg_basebackup
846848
log.Errorw("error syncing with pg_rewind", zap.Error(err))
847849
} else {
@@ -1284,19 +1286,18 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
12841286
tryPgrewind = false
12851287
}
12861288

1287-
// TODO(sgotti) pg_rewind considers databases on the same timeline
1288-
// as in sync and doesn't check if they diverged at different
1289-
// position in previous timelines.
1290-
// So check that the db as been synced or resync again with
1291-
// pg_rewind disabled. Will need to report this upstream.
1292-
1293-
// TODO(sgotti) The rewinded standby needs wal from the master
1294-
// starting from the common ancestor, if they aren't available the
1295-
// instance will keep waiting for them, now we assume that if the
1296-
// instance isn't ready after the start timeout, it's waiting for
1297-
// wals and we'll force a full resync.
1298-
// We have to find a better way to detect if a standby is waiting
1299-
// for unavailable wals.
1289+
// TODO(sgotti) pg_rewind considers databases on the same timeline as in sync and
1290+
// doesn't check if they diverged at different position in previous timelines. So
1291+
// check that the db has been synced or resync again with pg_rewind disabled. Will
1292+
// need to report this upstream.
1293+
1294+
// TODO(sgotti) The rewinded standby needs wal from the master starting from the
1295+
// common ancestor, if they aren't available the instance will keep waiting for
1296+
// them, now we assume that if the instance isn't ready after the start timeout,
1297+
// it's waiting for wals and we'll force a full resync.
1298+
//
1299+
// We have to find a better way to detect if a standby is waiting for unavailable
1300+
// wals.
13001301
if err = p.resync(db, followedDB, tryPgrewind); err != nil {
13011302
log.Errorw("failed to resync from followed instance", zap.Error(err))
13021303
return

cmd/sentinel/cmd/sentinel.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,7 @@ func (s *Sentinel) setDBSpecFromClusterSpec(cd *cluster.ClusterData) {
377377
db.Spec.RequestTimeout = *clusterSpec.RequestTimeout
378378
db.Spec.MaxStandbys = *clusterSpec.MaxStandbys
379379
db.Spec.UsePgrewind = *clusterSpec.UsePgrewind
380+
db.Spec.CheckpointBeforePgrewind = *clusterSpec.CheckpointBeforePgrewind
380381
db.Spec.PGParameters = clusterSpec.PGParameters
381382
db.Spec.PGHBA = clusterSpec.PGHBA
382383
if db.Spec.FollowConfig != nil && db.Spec.FollowConfig.Type == cluster.FollowTypeExternal {

doc/cluster_spec.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ Some options in a running cluster specification can be changed to update the des
2727
| additionalWalSenders | number of additional wal_senders in addition to the ones internally defined by stolon, useful to provide enough wal senders for external standbys (changing this value requires an instance restart) | no | uint16 | 5 |
2828
| additionalMasterReplicationSlots | a list of additional physical replication slots to be created on the master postgres instance. They will be prefixed with `stolon_` (like internal replication slots used for standby replication) to make them "namespaced" from other replication slots. Replication slots starting with `stolon_` and not defined here (and not used for standby replication) will be dropped from the master instance. | no | []string | null |
2929
| usePgrewind | try to use pg_rewind for faster instance resyncronization. | no | bool | false |
30+
| checkpointBeforePgrewind | Force a checkpoint before pg_rewind to prevent the rewind racing the checkpointer process after a standby is newly promoted. This will cause increased IO on whatever Postgres node the currently resync'ing Postgres is following, as the checkpoint will be immediate and will not respect spread configuration. | no | bool | false |
3031
| initMode | The cluster initialization mode. Can be *new* or *existing*. *new* means that a new db cluster will be created on a random keeper and the other keepers will sync with it. *existing* means that a keeper (that needs to have an already created db cluster) will be choosed as the initial master and the other keepers will sync with it. In this case the `existingConfig` object needs to be populated. | yes | string | |
3132
| existingConfig | configuration for initMode of type "existing" | if initMode is "existing" | ExistingConfig | |
3233
| mergePgParameters | merge pgParameters of the initialized db cluster, useful the retain initdb generated parameters when InitMode is new, retain current parameters when initMode is existing or pitr. | no | bool | true |

internal/cluster/cluster.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ const (
6666
DefaultMaxSynchronousStandbys uint16 = 1
6767
DefaultAdditionalWalSenders = 5
6868
DefaultUsePgrewind = false
69+
DefaultCheckpointBeforePgrewind = false
6970
DefaultMergePGParameter = true
7071
DefaultRole ClusterRole = ClusterRoleMaster
7172
DefaultSUReplAccess SUReplAccessMode = SUReplAccessAll
@@ -261,6 +262,8 @@ type ClusterSpec struct {
261262
AdditionalMasterReplicationSlots []string `json:"additionalMasterReplicationSlots"`
262263
// Whether to use pg_rewind
263264
UsePgrewind *bool `json:"usePgrewind,omitempty"`
265+
// Whether to issue a CHECKPOINT before attempting a rewind
266+
CheckpointBeforePgrewind *bool `json:"checkpointBeforePgrewind,omitempty"`
264267
// InitMode defines the cluster initialization mode. Current modes are: new, existing, pitr
265268
InitMode *ClusterInitMode `json:"initMode,omitempty"`
266269
// Whether to merge pgParameters of the initialized db cluster, useful
@@ -379,6 +382,9 @@ func (os *ClusterSpec) WithDefaults() *ClusterSpec {
379382
if s.UsePgrewind == nil {
380383
s.UsePgrewind = BoolP(DefaultUsePgrewind)
381384
}
385+
if s.CheckpointBeforePgrewind == nil {
386+
s.CheckpointBeforePgrewind = BoolP(DefaultCheckpointBeforePgrewind)
387+
}
382388
if s.MinSynchronousStandbys == nil {
383389
s.MinSynchronousStandbys = Uint16P(DefaultMinSynchronousStandbys)
384390
}
@@ -607,6 +613,8 @@ type DBSpec struct {
607613
SynchronousReplication bool `json:"synchronousReplication,omitempty"`
608614
// Whether to use pg_rewind
609615
UsePgrewind bool `json:"usePgrewind,omitempty"`
616+
// Whether to issue a CHECKPOINT before attempting a rewind
617+
CheckpointBeforePgrewind bool `json:"checkpointBeforePgrewind,omitempty"`
610618
// AdditionalWalSenders defines the number of additional wal_senders in
611619
// addition to the ones internally defined by stolon
612620
AdditionalWalSenders uint16 `json:"additionalWalSenders"`

internal/postgresql/postgresql.go

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -760,7 +760,7 @@ func (p *Manager) createPostgresqlAutoConf() error {
760760
return nil
761761
}
762762

763-
func (p *Manager) SyncFromFollowedPGRewind(followedConnParams ConnParams, password string) error {
763+
func (p *Manager) SyncFromFollowedPGRewind(followedConnParams ConnParams, password string, forceCheckpoint bool) error {
764764
// Remove postgresql.auto.conf since pg_rewind will error if it's a symlink to /dev/null
765765
pgAutoConfPath := filepath.Join(p.dataDir, postgresAutoConf)
766766
if err := os.Remove(pgAutoConfPath); err != nil && !os.IsNotExist(err) {
@@ -786,6 +786,32 @@ func (p *Manager) SyncFromFollowedPGRewind(followedConnParams ConnParams, passwo
786786
followedConnParams.Set("options", "-c synchronous_commit=off")
787787
followedConnString := followedConnParams.ConnString()
788788

789+
// We need to issue a checkpoint on the source before pg_rewind'ing as until the primary
790+
// checkpoints the global/pg_control file won't contain up-to-date information about
791+
// what timeline the primary exists in.
792+
//
793+
// Imagine everyone is on timeline 1, then we promote a node to timeline 2. Standbys
794+
// attempt to replicate from the newly promoted node but fail due to diverged timelines.
795+
// pg_rewind is then used to resync the standbys, but if the new primary hasn't yet
796+
// checkpointed, the pg_control file will tell us we're both on the same timeline (1)
797+
// and pg_rewind will exit without performing any action.
798+
//
799+
// If we checkpoint before invoking pg_rewind we will avoid this problem, at the slight
800+
// cost of forcing a checkpoint on a newly promoted node, which might hurt performance.
801+
// We (GoCardless) can't afford this, so we take the performance penalty to avoid hours
802+
// of downtime.
803+
if forceCheckpoint {
804+
log.Infow("issuing checkpoint on primary")
805+
psqlName := filepath.Join(p.pgBinPath, "psql")
806+
cmd := exec.Command(psqlName, followedConnString, "-c", "CHECKPOINT;")
807+
cmd.Env = append(os.Environ(), fmt.Sprintf("PGPASSFILE=%s", pgpass.Name()))
808+
cmd.Stdout = os.Stdout
809+
cmd.Stderr = os.Stderr
810+
if err := cmd.Run(); err != nil {
811+
return fmt.Errorf("error: %v", err)
812+
}
813+
}
814+
789815
log.Infow("running pg_rewind")
790816
name := filepath.Join(p.pgBinPath, "pg_rewind")
791817
cmd := exec.Command(name, "--debug", "-D", p.dataDir, "--source-server="+followedConnString)

0 commit comments

Comments
 (0)