Skip to content

Commit f9ff9d2

Browse files
committed
podman-etcd: enhance etcd data backup with snapshots and retention
Replace the basic data directory backup with proper etcd database snapshot functionality.

The new implementation:
- Creates timestamped snapshot files instead of moving the entire data directory
- Stores backups in a non-volatile location (backup_location parameter) instead of the previous volatile HA_RSCTMP directory
- Validates backup file existence and size after creation
- Implements a configurable retention policy via the max_backup_snapshots parameter
- Automatically cleans up old snapshots to control storage usage

Default retention is set to 3 snapshots, with backups stored in /var/lib/etcd by default. This provides better backup reliability, persistence across reboots, and storage management for etcd databases.
1 parent 08c6416 commit f9ff9d2

File tree

1 file changed

+128
-8
lines changed

1 file changed

+128
-8
lines changed

heartbeat/podman-etcd

Lines changed: 128 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ OCF_RESKEY_reuse_default="0"
4949
OCF_RESKEY_oom_default="-997"
5050
OCF_RESKEY_config_location_default="/var/lib/etcd"
# Default directory for the agent's backups; same path as the
# config_location default above.
5151
OCF_RESKEY_backup_location_default="/var/lib/etcd"
# Default retention: how many etcd-snapshot-*.db files cleanup_old_backups
# keeps in the backup directory.
52+
OCF_RESKEY_max_backup_snapshots_default="3"
5253

5354
: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
5455
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
@@ -61,6 +62,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd"
# The `: ${VAR=default}` idiom assigns the default only when VAR is unset;
# a value already present in the environment (even an empty one) is kept.
6162
: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}}
6263
: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}}
6364
: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}}
# Retention limit read by cleanup_old_backups.
65+
: ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default}}
6466

6567

6668
#######################################################################
@@ -275,6 +277,16 @@ The directory where the resource agent stores its backups.
275277
<content type="string" default="${OCF_RESKEY_backup_location_default}"/>
276278
</parameter>
277279
280+
<parameter name="max_backup_snapshots" required="0" unique="0">
281+
<longdesc lang="en">
282+
Maximum number of etcd database snapshots to retain. When a new snapshot is created,
283+
older snapshots will be automatically removed to maintain this limit. This helps
284+
control storage usage while ensuring recent backups are available for recovery.
285+
</longdesc>
286+
<shortdesc lang="en">Maximum number of backup snapshots to retain</shortdesc>
287+
<content type="integer" default="${OCF_RESKEY_max_backup_snapshots_default}"/>
288+
</parameter>
289+
278290
</parameters>
279291
280292
<actions>
@@ -702,20 +714,127 @@ EOF
702714
} >> "$ETCD_CONFIGURATION_FILE"
703715
}
704716

717+
# Copy the etcd database file into the backup directory (best effort).
#
# Globals:
#   ETCD_MEMBER_DIR            (read) directory holding snap/db
#   OCF_RESKEY_backup_location (read) destination directory for snapshots
# Returns:
#   0 - a validated snapshot was written and retention cleanup was run
#   1 - no snapshot was produced (the reason has been logged)
archive_copy_etcd_db()
{
	local etcd_db_path="$ETCD_MEMBER_DIR/snap/db"
	local backup_dir="$OCF_RESKEY_backup_location"
	local backup_file
	local backup_size
	local rc=0

	# Check if the etcd database file exists
	if [ ! -f "$etcd_db_path" ]; then
		ocf_log warn "etcd database file not found at $etcd_db_path, skipping backup"
		return 1
	fi

	# Ensure backup directory exists
	if [ ! -d "$backup_dir" ]; then
		ocf_log info "creating backup directory: $backup_dir"
		if ! mkdir -p "$backup_dir"; then
			ocf_log err "failed to create backup directory $backup_dir, skipping backup"
			return 1
		fi
	fi

	backup_file="$backup_dir/etcd-snapshot-$(date +%Y%m%d-%H%M%S).db"
	ocf_log info "creating etcd database backup: $backup_file"

	# Capture the status explicitly: inside "if ! cmd; then" the value of $?
	# is the negated status (always 0), which made the logged code useless.
	cp "$etcd_db_path" "$backup_file" || rc=$?
	if [ "$rc" -ne 0 ]; then
		ocf_log err "failed to create backup file $backup_file, error code: $rc"
		# Do not leave a partial snapshot behind: it would count towards the
		# retention limit and could later push out a good backup.
		rm -f "$backup_file"
		return 1
	fi

	# Validate the backup file exists and has non-zero size
	if [ ! -f "$backup_file" ]; then
		ocf_log err "backup validation failed: file $backup_file does not exist"
		return 1
	fi

	backup_size=$(stat -c %s "$backup_file" 2>/dev/null || echo "0")
	if [ "$backup_size" -eq 0 ]; then
		ocf_log err "backup validation failed: file $backup_file has zero size"
		rm -f "$backup_file"
		return 1
	fi

	ocf_log info "backup created successfully: $backup_file (size: $backup_size bytes)"

	# Cleanup old backups based on retention policy
	cleanup_old_backups "$backup_dir"
	return 0
}

# Archive etcd database with backup and cleanup
#
# Takes a timestamped copy of the etcd database file and then removes the
# member directory to allow the node to rejoin the cluster as a learner.
#
# Error handling strategy:
# - Backup failures (missing database, mkdir, copy, validation) are logged
#   but do not stop the archive. Backups are beneficial but not critical for
#   recovery, so the member directory is removed even when no snapshot could
#   be taken; leaving it in place would block the rejoin this function
#   exists to enable.
# - Member directory removal failure returns OCF_ERR_GENERIC because this
#   operation is critical for cluster rejoin.
#
# NOTE: the agent cannot use etcdctl/etcdutl utilities as there
# is no running etcd server when this backup is performed.
#
# Globals:
#   ETCD_MEMBER_DIR            (read) directory to back up from and remove
#   OCF_RESKEY_backup_location (read) where snapshots are stored
# Returns:
#   OCF_SUCCESS     - member directory removed (with or without a backup)
#   OCF_ERR_GENERIC - ETCD_MEMBER_DIR is empty or could not be removed
archive_data_folder()
{
	local rc=0

	# Never hand an empty path to "rm -rf".
	if [ -z "$ETCD_MEMBER_DIR" ]; then
		ocf_log err "ETCD_MEMBER_DIR is not set, cannot archive etcd data"
		return $OCF_ERR_GENERIC
	fi

	if ! archive_copy_etcd_db; then
		ocf_log info "no new etcd backup was created, continuing with member directory removal"
	fi

	# Remove Etcd "member" folder to allow the client to rejoin as learner
	ocf_log info "Removing Etcd members directory to allow rejoin as learner"
	rm -rf "$ETCD_MEMBER_DIR" || rc=$?
	if [ "$rc" -ne 0 ]; then
		ocf_log err "could not remove $ETCD_MEMBER_DIR, error code: $rc"
		return $OCF_ERR_GENERIC
	fi

	sync
	return $OCF_SUCCESS
}
797+
798+
# Enforce the snapshot retention policy in a backup directory.
#
# Keeps the newest OCF_RESKEY_max_backup_snapshots files matching
# etcd-snapshot-*.db (ordered by modification time) directly inside the given
# directory and deletes the rest. Cleanup is best effort: problems are logged
# and never reported as a failure.
#
# Globals:
#   OCF_RESKEY_max_backup_snapshots (read) number of snapshots to keep
# Arguments:
#   $1 - directory containing the snapshots
# Returns:
#   OCF_SUCCESS always
cleanup_old_backups()
{
	local backup_dir="$1"
	local max_snapshots="$OCF_RESKEY_max_backup_snapshots"
	local all_backups
	local backup_count
	local backups_to_remove
	local old_backups
	local old_backup
	local rc

	# Validate max_snapshots is a positive integer (no sign, no leading zero)
	case "$max_snapshots" in
		''|*[!0-9]*|0*)
			ocf_log warn "invalid max_backup_snapshots value: $max_snapshots, skipping cleanup"
			return $OCF_SUCCESS
			;;
	esac

	# List the snapshots once, oldest first (sorted by modification time)
	all_backups=$(find "$backup_dir" -maxdepth 1 -name "etcd-snapshot-*.db" -type f -printf '%T@ %p\n' 2>/dev/null | \
		sort -n | \
		cut -d' ' -f2-)

	if [ -z "$all_backups" ]; then
		backup_count=0
	else
		backup_count=$(printf '%s\n' "$all_backups" | wc -l)
	fi

	if [ "$backup_count" -le "$max_snapshots" ]; then
		ocf_log info "backup count ($backup_count) is within retention limit ($max_snapshots), no cleanup needed"
		return $OCF_SUCCESS
	fi

	# Calculate how many backups to remove
	backups_to_remove=$((backup_count - max_snapshots))
	ocf_log info "removing $backups_to_remove old backup(s) to maintain retention limit of $max_snapshots"

	old_backups=$(printf '%s\n' "$all_backups" | head -n "$backups_to_remove")
	ocf_log info "removing old backups: $old_backups"

	# Remove one path per line. Feeding the list to xargs would split paths on
	# whitespace and delete the wrong files when the directory name has a space.
	while IFS= read -r old_backup; do
		[ -n "$old_backup" ] || continue
		rc=0
		rm -f -- "$old_backup" || rc=$?
		if [ "$rc" -ne 0 ]; then
			ocf_log warn "failed to remove old backup $old_backup, error code: $rc"
		fi
	done <<EOF
$old_backups
EOF

	return $OCF_SUCCESS
}
720839

721840
etcd_pod_container_exists() {
@@ -2189,6 +2308,7 @@ CONTAINER=$OCF_RESKEY_name
21892308
POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
21902309
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
21912310
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
# etcd member directory: archive_data_folder copies its snap/db file and then
# removes the whole directory.
# NOTE(review): this path is hard-coded and does not follow config_location or
# backup_location - confirm that is intended.
2311+
ETCD_MEMBER_DIR="/var/lib/etcd/member"
21922312
ETCD_REVISION_JSON="/var/lib/etcd/revision.json"
21932313
ETCD_REVISION_BUMP_PERCENTAGE=0.2
21942314
ETCD_BUMP_REV_DEFAULT=1000000000

0 commit comments

Comments
 (0)