Skip to content

Commit 11e1dfa

Browse files
committed
podman-etcd: enhance etcd data backup with snapshots and retention
Replace basic data directory backup with proper etcd database snapshot functionality.

The new implementation:
- Creates timestamped snapshot files instead of moving the entire data directory
- Stores backups in a non-volatile location (backup_location parameter) instead of the previous volatile HA_RSCTMP directory
- Validates backup file existence and size after creation
- Implements a configurable retention policy via the max_backup_snapshots parameter
- Automatically cleans up old snapshots to control storage usage

Default retention is set to 3 snapshots, with backups stored in /var/lib/etcd by default. This provides better backup reliability, persistence across reboots, and storage management for etcd databases.
1 parent 192b0ec commit 11e1dfa

File tree

1 file changed

+196
-9
lines changed

1 file changed

+196
-9
lines changed

heartbeat/podman-etcd

Lines changed: 196 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ OCF_RESKEY_reuse_default="0"
4949
OCF_RESKEY_oom_default="-997"
5050
OCF_RESKEY_config_location_default="/var/lib/etcd"
5151
OCF_RESKEY_backup_location_default="/var/lib/etcd"
52+
OCF_RESKEY_max_backup_snapshots_default="3"
5253

5354
: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
5455
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
@@ -61,6 +62,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd"
6162
: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}}
6263
: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}}
6364
: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}}
65+
: ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default}}
6466

6567

6668
#######################################################################
@@ -275,6 +277,17 @@ The directory where the resource agent stores its backups.
275277
<content type="string" default="${OCF_RESKEY_backup_location_default}"/>
276278
</parameter>
277279
280+
<parameter name="max_backup_snapshots" required="0" unique="0">
281+
<longdesc lang="en">
282+
Maximum number of etcd database snapshots to retain. When a new snapshot is created,
283+
older snapshots will be automatically removed to maintain this limit. This helps
284+
control storage usage while ensuring recent backups are available for recovery.
285+
Set max_backup_snapshots=0 to disable backups.
286+
</longdesc>
287+
<shortdesc lang="en">Maximum number of backup snapshots to retain</shortdesc>
288+
<content type="integer" default="${OCF_RESKEY_max_backup_snapshots_default}"/>
289+
</parameter>
290+
278291
</parameters>
279292
280293
<actions>
@@ -719,20 +732,190 @@ EOF
719732
return $OCF_SUCCESS
720733
}
721734

735+
# Remove etcd member directory to allow the node to rejoin the cluster as a learner.
#
# When a node rejoins an etcd cluster, it must start fresh as a learner to prevent
# data inconsistencies. This function removes the member directory and syncs to disk.
#
# Globals:
#   ETCD_MEMBER_DIR - path of the directory to delete (read)
#
# Returns:
#   OCF_SUCCESS     - Member directory successfully removed
#   OCF_ERR_GENERIC - Failed to remove member directory (critical error)
wipe_data_folder_for_learner()
{
	local rc

	ocf_log info "deleting etcd member directory ($ETCD_MEMBER_DIR) to enable learner rejoin"

	# Capture rm's exit status directly: inside "if ! cmd; then" the value of
	# $? is the status of the negation (always 0), so the real error code
	# would never reach the log.
	rm -rf "$ETCD_MEMBER_DIR"
	rc=$?
	if [ "$rc" -ne 0 ]; then
		ocf_log err "could not delete etcd member directory ($ETCD_MEMBER_DIR), error code: $rc"
		return $OCF_ERR_GENERIC
	fi

	# Flush the deletion to disk before reporting success.
	sync
	return $OCF_SUCCESS
}
753+
754+
755+
# Calculate available disk space in bytes for a given directory.
#
# This function queries the filesystem and returns available space in bytes.
# It converts df output (KB) to bytes for consistent size comparisons.
#
# Arguments:
#   $1 - Target directory path to check
#
# Returns:
#   OCF_SUCCESS     - Available space in bytes (via stdout)
#   OCF_ERR_GENERIC - Failed to determine available space (error message via stdout)
get_available_space_in_directory()
{
	local target_dir="$1"
	local df_output
	local available_space_kb
	local available_space_bytes

	# -k pins the unit to 1024-byte blocks (plain -P may report 512-byte
	# blocks when POSIXLY_CORRECT is set). stderr is captured from df itself
	# so the caller receives the actual failure reason.
	if ! df_output=$(df -Pk "$target_dir" 2>&1); then
		echo "df command failed or returned invalid value: $df_output"
		return $OCF_ERR_GENERIC
	fi

	# Line 1 is the header; column 4 of line 2 is the "Available" field.
	available_space_kb=$(echo "$df_output" | awk 'NR==2 {print $4}')

	# Validate output is numeric
	case "$available_space_kb" in
	''|*[!0-9]*)
		echo "df command failed or returned invalid value: $available_space_kb"
		return $OCF_ERR_GENERIC
		;;
	esac

	available_space_bytes=$((available_space_kb*1024))
	echo "$available_space_bytes"
	return $OCF_SUCCESS
}
784+
785+
# Archive etcd database with backup and cleanup
#
# This function creates a backup copy of the etcd database, validates it, and
# removes old backups according to the retention policy. Backups are optional
# and can be disabled by setting max_backup_snapshots=0.
#
# Globals:
#   OCF_RESKEY_backup_location      - directory receiving the snapshot (read)
#   OCF_RESKEY_max_backup_snapshots - retention limit, 0 disables backups (read)
#   ETCD_MEMBER_DIR                 - etcd member directory holding snap/db (read)
#
# Error handling strategy:
#   All backup failures return OCF_SUCCESS to prevent blocking cluster recovery.
#   Backups are beneficial but not critical for recovery operations.
#
# NOTE: This function cannot use etcdctl/etcdutl utilities because the etcd
# server is not running when this backup is performed.
archive_data_folder()
{
	local backup_dir="$OCF_RESKEY_backup_location"
	local etcd_db_path="$ETCD_MEMBER_DIR/snap/db"
	local available_space_bytes
	local required_space_bytes
	local timestamp
	local backup_file
	local backup_size_bytes
	local rc

	# A non-numeric value makes this test fail rather than match, so the
	# backup still proceeds; stderr is silenced to avoid the "integer
	# expression expected" noise. cleanup_old_backups validates the value.
	if [ "$OCF_RESKEY_max_backup_snapshots" -eq 0 ] 2>/dev/null; then
		ocf_log debug "etcd backup disabled (max_backup_snapshots=0)"
		return $OCF_SUCCESS
	fi

	# Check if the etcd database file exists
	if [ ! -f "$etcd_db_path" ]; then
		ocf_log warn "backup skipped: etcd database file not found at '$etcd_db_path'"
		return $OCF_SUCCESS
	fi

	# Ensure backup directory exists
	if [ ! -d "$backup_dir" ]; then
		ocf_log debug "creating backup directory: '$backup_dir'"
		if ! mkdir -p "$backup_dir"; then
			ocf_log warn "backup skipped: failed to create backup directory '$backup_dir'"
			return $OCF_SUCCESS
		fi
	fi

	ocf_log debug "checking disk space: backup_dir=$backup_dir"
	if ! available_space_bytes=$(get_available_space_in_directory "$backup_dir"); then
		ocf_log warn "backup skipped: could not compute available disk space in '$backup_dir', error msg: $available_space_bytes"
		return $OCF_SUCCESS
	fi

	# On failure stat's error text lands in the variable and fails the
	# numeric check below, so it can be reported in the warning.
	required_space_bytes=$(stat -c %s "$etcd_db_path" 2>&1)
	if ! echo "$required_space_bytes" | grep -q '^[0-9]\+$'; then
		ocf_log warn "backup skipped: could not compute etcd database size at '$etcd_db_path', error msg: $required_space_bytes"
		return $OCF_SUCCESS
	fi

	if [ "$required_space_bytes" -gt "$available_space_bytes" ]; then
		ocf_log warn "backup skipped: insufficient disk space (required: ${required_space_bytes}B, available: ${available_space_bytes}B)"
		return $OCF_SUCCESS
	fi

	# Generate timestamp and backup filename
	timestamp=$(date +%Y%m%d-%H%M%S)
	backup_file="$backup_dir/snapshot-$timestamp.db"

	ocf_log info "creating etcd database backup: '$backup_file'"

	# Create the backup by copying the database file (--reflink=auto lets cp
	# use a Copy-on-Write clone where the filesystem supports it).
	# The exit status is captured explicitly: after "if ! cmd; then", $? is
	# always 0 and the real error code would be lost.
	cp --reflink=auto "$etcd_db_path" "$backup_file"
	rc=$?
	if [ "$rc" -ne 0 ]; then
		ocf_log warn "backup creation failed: could not copy '$etcd_db_path' to '$backup_file', error code: $rc"
		# Do not leave a truncated snapshot behind: it would look like a
		# valid backup and count toward the retention limit.
		rm -f "$backup_file"
		return $OCF_SUCCESS
	fi

	# Validate the backup file exists and has the expected size
	if [ ! -f "$backup_file" ]; then
		ocf_log warn "backup validation failed: snapshot file '$backup_file' does not exist"
		return $OCF_SUCCESS
	fi

	backup_size_bytes=$(stat -c %s "$backup_file" 2>/dev/null || echo "0")
	if [ "$backup_size_bytes" -ne "$required_space_bytes" ]; then
		ocf_log warn "backup validation failed: size mismatch (expected: ${required_space_bytes}B, got: ${backup_size_bytes}B)"
		rm -f "$backup_file"
		return $OCF_SUCCESS
	fi

	ocf_log info "backup created successfully: $backup_file (${backup_size_bytes}B)"

	# Cleanup old backups based on retention policy
	cleanup_old_backups "$backup_dir"

	return $OCF_SUCCESS
}
877+
878+
# Enforce the snapshot retention policy in a backup directory.
#
# Counts the regular files named snapshot-*.db directly inside the given
# directory and, when there are more than max_backup_snapshots, deletes the
# ones with the oldest modification time until the limit is met.
#
# Globals:
#   OCF_RESKEY_max_backup_snapshots - number of snapshots to keep (read)
#
# Arguments:
#   $1 - Backup directory to clean up
#
# Returns:
#   OCF_SUCCESS - always; an invalid limit or a failed removal is only logged
cleanup_old_backups()
{
	local backup_dir="$1"
	local max_snapshots="$OCF_RESKEY_max_backup_snapshots"
	local backup_count
	local backups_to_remove
	local old_backups
	local old_backup
	local rc

	# Validate max_snapshots is a positive integer
	if ! echo "$max_snapshots" | grep -q '^[1-9][0-9]*$'; then
		ocf_log warn "invalid max_backup_snapshots value. Positive integer expected, got '$max_snapshots' instead, skipping cleanup"
		return $OCF_SUCCESS
	fi

	# Count existing backup files
	backup_count=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f 2>/dev/null | wc -l)

	if [ "$backup_count" -le "$max_snapshots" ]; then
		ocf_log info "backup count ($backup_count) is within retention limit ($max_snapshots), no cleanup needed"
		return $OCF_SUCCESS
	fi

	# Calculate how many backups to remove
	backups_to_remove=$((backup_count - max_snapshots))
	ocf_log info "removing $backups_to_remove old backup(s) to maintain retention limit of $max_snapshots"

	# Find oldest backups sorted by modification time
	# -t sorts by modification time, -r reverses (oldest first)
	# -print0 and -0 pass the paths to ls without word splitting; the result
	# is one path per line.
	old_backups=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f -print0 2>/dev/null | \
		xargs -0 -r ls -tr -- | \
		head -n "$backups_to_remove")

	if [ -n "$old_backups" ]; then
		ocf_log info "removing old backups: $old_backups"
		# Remove line by line instead of piping through a plain xargs, which
		# would split paths on whitespace. The status of rm is captured
		# directly, because $? after "if ! cmd; then" is always 0.
		while IFS= read -r old_backup; do
			[ -n "$old_backup" ] || continue
			rm -f -- "$old_backup"
			rc=$?
			if [ "$rc" -ne 0 ]; then
				ocf_log warn "failed to remove old backup '$old_backup', error code: $rc"
			fi
		done <<EOF
$old_backups
EOF
	fi

	return $OCF_SUCCESS
}
737920

738921
etcd_pod_container_exists() {
@@ -1901,6 +2084,9 @@ podman_start()
19012084
fi
19022085

19032086
archive_data_folder
2087+
if ! wipe_data_folder_for_learner; then
2088+
return "$OCF_ERR_GENERIC"
2089+
fi
19042090
fi
19052091

19062092
ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced"
@@ -2250,6 +2436,7 @@ CONTAINER=$OCF_RESKEY_name
22502436
POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
22512437
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
22522438
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
2439+
ETCD_MEMBER_DIR="/var/lib/etcd/member"
22532440
ETCD_REVISION_JSON="/var/lib/etcd/revision.json"
22542441
ETCD_REVISION_BUMP_PERCENTAGE=0.2
22552442
ETCD_BUMP_REV_DEFAULT=1000000000

0 commit comments

Comments
 (0)