Skip to content

Commit f1c214b

Browse files
author
Shlomi Noach
committed
Merge pull request #179 from outbrain/merge-downstream-gh
added force-master-takeover: planned master switch onto a direct child
2 parents 18ddffa + 617f859 commit f1c214b

File tree

4 files changed

+85
-10
lines changed

4 files changed

+85
-10
lines changed

build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#
77
set -e
88

9-
RELEASE_VERSION="1.4.579"
9+
RELEASE_VERSION="1.4.580"
1010
TOPDIR=/tmp/orchestrator-release
1111
export RELEASE_VERSION TOPDIR
1212
export GO15VENDOREXPERIMENT=1

go/app/cli.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1117,6 +1117,44 @@ func Cli(command string, strict bool, instance string, destination string, owner
11171117
fmt.Println(promotedInstanceKey.DisplayString())
11181118
}
11191119
}
1120+
case registerCliCommand("force-master-takeover", "Recovery", `Forcibly discard master and promote another (direct child) instance instead, even if everything is running well`):
1121+
{
1122+
clusterName := getClusterName(clusterAlias, instanceKey)
1123+
clusterMasters, err := inst.ReadClusterWriteableMaster(clusterName)
1124+
if err != nil {
1125+
log.Fatalf("Cannot deduce cluster master for %+v", clusterName)
1126+
}
1127+
var clusterMaster *inst.Instance
1128+
if len(clusterMasters) == 1 {
1129+
clusterMaster = clusterMasters[0]
1130+
} else {
1131+
log.Fatalf("Cannot deduce cluster master for %+v", clusterName)
1132+
}
1133+
1134+
if destinationKey == nil {
1135+
log.Fatal("Cannot deduce destination, the instance to promote in place of the master. Please provide with -d")
1136+
}
1137+
destination := validateInstanceIsFound(destinationKey)
1138+
if !destination.MasterKey.Equals(&clusterMaster.Key) {
1139+
log.Fatalf("You may only promote a direct child of the master %+v. The master of %+v is %+v.", clusterMaster.Key, destination.Key, destination.MasterKey)
1140+
}
1141+
log.Debugf("Will demote %+v and promote %+v instead", clusterMaster.Key, *destinationKey)
1142+
1143+
recoveryAttempted, topologyRecovery, err := logic.ForceExecuteRecovery(clusterName, inst.DeadMaster, &clusterMaster.Key, destinationKey, false)
1144+
if err != nil {
1145+
log.Fatale(err)
1146+
}
1147+
if !recoveryAttempted {
1148+
log.Fatalf("Unexpected error: recovery not attempted. This should not happen")
1149+
}
1150+
if topologyRecovery == nil {
1151+
log.Fatalf("Recovery attempted but with no results. This should not happen")
1152+
}
1153+
if topologyRecovery.SuccessorKey == nil {
1154+
log.Fatalf("Recovery attempted yet no slave promoted")
1155+
}
1156+
fmt.Println(topologyRecovery.SuccessorKey.DisplayString())
1157+
}
11201158
case registerCliCommand("replication-analysis", "Recovery", `Request an analysis of potential crash incidents in all known topologies`):
11211159
{
11221160
analysis, err := inst.GetReplicationAnalysis("", false, false)

go/cmd/orchestrator/main.go

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -761,6 +761,30 @@ Cheatsheet:
761761
762762
orchestrator -c recover-lite -i dead.instance.com --debug
763763
764+
force-master-takeover
765+
Forcibly discard master and promote another (direct child) instance instead, even if everything is running well.
766+
This allows for planned switchover.
767+
NOTE:
768+
- You must specify the instance to promote via "-d"
769+
- Promoted instance must be a direct child of the existing master
770+
- This will not work in a master-master configuration
771+
- Orchestrator just treats this command as a DeadMaster failover scenario
772+
- It is STRONGLY suggested that you first relocate everything below your chosen instance-to-promote.
773+
It *is* a planned failover thing.
774+
- Otherwise orchestrator will do its thing in moving instances around, hopefully promoting your requested
775+
server on top.
776+
- Orchestrator will issue all relevant pre-failover and post-failover external processes.
777+
- At this time orchestrator will not issue 'SET GLOBAL read_only=1' on the existing master, nor will
778+
it issue a 'FLUSH TABLES WITH READ LOCK'. This is being investigated.
779+
Examples:
780+
781+
orchestrator -c force-master-takeover -alias mycluster -d immediate.child.of.master.com
782+
Indicate cluster by alias. Orchestrator automatically figures out the master
783+
784+
orchestrator -c force-master-takeover -i instance.in.relevant.cluster.com -d immediate.child.of.master.com
785+
Indicate cluster by an instance. You don't strictly need to specify the master, orchestrator
786+
will infer the master's identity.
787+
764788
replication-analysis
765789
Request an analysis of potential crash incidents in all known topologies.
766790
Output format is not yet stabilized and may change in the future. Do not trust the output
@@ -870,15 +894,11 @@ Cheatsheet:
870894
871895
orchestrator -c resolve -i cname.to.resolve
872896
873-
reset-internal-db-deployment
874-
Clear internal db deployment history, use if somehow corrupted internal deployment history.
875-
When configured with '"SmartOrchestratorDatabaseUpdate": true', Orchestrator does housekeeping for its
876-
own database schema, and verifies proposed deployment vs deployment history.
877-
In case of contradiction between the two orchestrator bails out. Such a contradiction should not occur, and may
878-
signify an inconsistency in the orchestrator code itself.
879-
By resetting history orchestrator redeploys its schema (without causing data loss) and accepts the new instructions
880-
as the de-factor deployment rule.
881-
897+
redeploy-internal-db
898+
Force internal schema migration to current backend structure. Orchestrator keeps track of the deployed
899+
versions and will not reissue a migration for a version already deployed. Normally you should not use
900+
this command, and it is provided mostly for building and testing purposes. Nonetheless it is safe to
901+
use and at most it wastes some cycles.
882902
`
883903

884904
// main is the application's entry point. It will spawn either a CLI or an HTTP interface.

go/logic/topology_recovery.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1020,3 +1020,20 @@ func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *i
10201020
}
10211021
return recoveryAttempted, promotedSlaveKey, err
10221022
}
1023+
1024+
// ForceExecuteRecovery can be called to issue a recovery process even if analysis says there is no recovery case.
1025+
// The caller of this function injects the type of analysis it wishes the function to assume.
1026+
// By calling this function one takes responsibility for one's actions.
//
// clusterName is used to load the cluster details via inst.ReadClusterInfo; if that read fails the
// error is returned together with the zero-valued results (false, nil) and nothing else is executed.
// analysisCode and failedInstanceKey are copied into a synthetic inst.ReplicationAnalysis entry;
// failedInstanceKey is dereferenced, so it must not be nil.
// candidateInstanceKey and skipProcesses are handed unchanged to executeCheckAndRecoverFunction,
// whose three results are returned as-is.
1027+
func ForceExecuteRecovery(clusterName string, analysisCode inst.AnalysisCode, failedInstanceKey *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) {
1028+
// Load the cluster details that get embedded in the synthetic analysis entry below.
clusterInfo, err := inst.ReadClusterInfo(clusterName)
1029+
if err != nil {
1030+
// Named results are still at their zero values here: no recovery was attempted.
return recoveryAttempted, topologyRecovery, err
1031+
}
1032+
1033+
// Build the analysis entry by hand from the caller's arguments instead of taking it from an
// actual replication analysis; both pointers are dereferenced (copied by value).
analysisEntry := inst.ReplicationAnalysis{
1034+
Analysis: analysisCode,
1035+
ClusterDetails: *clusterInfo,
1036+
AnalyzedInstanceKey: *failedInstanceKey,
1037+
}
1038+
// NOTE(review): the literal `true` is presumably the "force" flag of
// executeCheckAndRecoverFunction — confirm against its signature.
return executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, true, skipProcesses)
1039+
}

0 commit comments

Comments
 (0)