Skip to content

Commit 557bad3

Browse files
committed
ensure a stale Postgres does not become leader
1 parent a1bc07d commit 557bad3

File tree

5 files changed

+33
-7
lines changed

5 files changed

+33
-7
lines changed

helpers/etcd.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,13 +79,22 @@ def attempt_to_acquire_leader(self, value):
7979
logger.info("Could not take out TTL lock: %s" % e)
8080
return False
8181

82-
def update_leader(self, value):
82+
def update_leader(self, state_handler):
# Rewrite the "/leader" key with this node's name and the configured TTL,
# then publish the node's last operation position under "/optime/leader".
# state_handler: object exposing `name` and `last_operation()`; it replaces
# the plain `value` argument of the removed signature.
# Returns False after an urllib2.HTTPError; no value is returned explicitly
# on the success path shown here.
8383
try:
84-
self.put_client_path("/leader", {"value": value, "ttl": self.ttl, "prevValue": value})
84+
# NOTE(review): "prevValue" presumably makes the write conditional on the
# key still holding this node's name (etcd compare-and-swap) - confirm.
self.put_client_path("/leader", {"value": state_handler.name, "ttl": self.ttl, "prevValue": state_handler.name})
85+
# Both writes share one try block, so a failure of this second write is
# reported the same way as a failure of the first.
self.put_client_path("/optime/leader", {"value": state_handler.last_operation()})
8586
except urllib2.HTTPError:
86-
logger.error("Error updating TTL on ETCD for primary.")
87+
logger.error("Error updating leader lock and optime on ETCD for primary.")
8788
return False
8889

90+
def last_leader_operation(self):
    """Return the leader's last recorded operation position from etcd.

    Reads the "/optime/leader" key and converts its node value to an int.

    Returns:
        The stored position as an int, or None when etcd answers with an
        HTTP error. A 404 means the key has not been written yet; any
        other HTTP error is logged with its status code instead of being
        swallowed silently. None is returned in both cases, exactly as
        before, so callers that treat None as "no position recorded"
        keep working.
    """
    try:
        return int(self.get_client_path("/optime/leader")["node"]["value"])
    except urllib2.HTTPError as e:
        if e.code == 404:
            # The previous message here was copied from the TTL update path
            # and described the wrong operation.
            logger.error("No leader optime found on ETCD.")
        else:
            logger.error("Error fetching leader optime from ETCD: HTTP %s", e.code)
        return None
97+
8998
def leader_unlocked(self):
9099
try:
91100
self.get_client_path("/leader")

helpers/ha.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@ def acquire_lock(self):
2222
return self.etcd.attempt_to_acquire_leader(self.state_handler.name)
2323

2424
def update_lock(self):
# Delegate to the etcd helper's update_leader and return its result. The
# added line hands over the whole state handler; the removed line passed
# only its name.
25-
return self.etcd.update_leader(self.state_handler.name)
25+
return self.etcd.update_leader(self.state_handler)
26+
27+
def update_last_leader_operation(self):
# Forward the state handler's `last_operation` attribute to the etcd helper.
# NOTE(review): `last_operation` is passed here without being called, and no
# `update_last_leader_operation` method on the etcd helper is visible in this
# change - confirm that the method exists and whether it expects a callable
# or a value.
28+
return self.etcd.update_last_leader_operation(self.state_handler.last_operation)
2629

2730
def is_unlocked(self):
2831
return self.etcd.leader_unlocked()
@@ -35,7 +38,7 @@ def fetch_current_leader(self):
3538

3639
def run_cycle(self):
3740
try:
38-
if self.state_handler.is_healthy():
41+
if self.state_handler.is_healthy(self.etcd.last_leader_operation()):
3942
if self.is_unlocked():
4043
if self.state_handler.is_healthiest_node(self.etcd.members()):
4144
if self.acquire_lock():

helpers/postgresql.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77
logger = logging.getLogger(__name__)
88

9-
109
class Postgresql:
1110

1211
def __init__(self, config):
@@ -111,11 +110,21 @@ def server_options(self):
111110
options += " -c \"%s=%s\"" % (setting, value)
112111
return options
113112

114-
def is_healthy(self):
113+
def is_healthy(self, last_leader_operation):
# Decide whether this node counts as healthy.
# last_leader_operation: the leader's last recorded position, or None when
# none is available; the caller in helpers/ha.py passes the result of
# etcd.last_leader_operation().
# Returns False when Postgres is not running or when this node lags the
# leader by more than config["maximum_lag_on_failover"]; True otherwise.
115114
if not self.is_running():
116115
logger.warning("Postgresql is not running.")
117116
return False
118117

118+
# A node that is currently the leader is not subject to the lag check.
if self.is_leader():
119+
return True
120+
121+
# this should only happen on initialization
122+
if last_leader_operation is None:
123+
return True
124+
125+
# Lag is the leader's recorded position minus this node's xlog_position().
# NOTE(review): the sample YAML configs describe the threshold as a byte
# count - confirm both operands are in bytes.
if (last_leader_operation - self.xlog_position()) > self.config["maximum_lag_on_failover"]:
126+
return False
127+
119128
return True
120129

121130
def is_healthiest_node(self, members):
@@ -182,3 +191,6 @@ def create_replication_user(self):
182191

183192
def xlog_position(self):
# Return the first column of the first row of a query that subtracts the
# pg_lsn '0/0000000' from pg_last_xlog_replay_location().
# NOTE(review): the Python type of the result depends on the database
# driver - confirm before comparing it with plain ints.
184193
return self.query("SELECT pg_last_xlog_replay_location() - '0/0000000'::pg_lsn;").fetchone()[0]
194+
195+
def last_operation(self):
# Return the first column of the first row of a query that subtracts the
# pg_lsn '0/00000' from pg_current_xlog_location().
# NOTE(review): the zero literal is written with fewer digits than the one
# in xlog_position; both presumably denote the same zero LSN - confirm.
196+
return self.query("SELECT pg_current_xlog_location() - '0/00000'::pg_lsn;").fetchone()[0]

postgres0.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ postgresql:
77
name: postgresql0
88
listen: 127.0.0.1:5432
99
data_dir: data/postgresql0
10+
maximum_lag_on_failover: 1048576 # 1 megabyte in bytes
1011
replication:
1112
username: replicator
1213
password: rep-pass

postgres1.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ postgresql:
77
name: postgresql1
88
listen: 127.0.0.1:5433
99
data_dir: data/postgresql1
10+
maximum_lag_on_failover: 1048576 # 1 megabyte in bytes
1011
replication:
1112
username: replicator
1213
password: rep-pass

0 commit comments

Comments
 (0)