Skip to content

Commit 8ae83e6

Browse files
committed
podman-etcd: enhance etcd data backup with snapshots and retention
Replace the basic data-directory backup with proper etcd database snapshot functionality. The new implementation:
- Creates timestamped snapshot files instead of moving the entire data directory.
- Stores backups in a non-volatile location (the backup_location parameter) instead of the previous volatile HA_RSCTMP directory.
- Validates the backup file's existence and size after creation.
- Implements a configurable retention policy via the max_backup_snapshots parameter.
- Automatically cleans up old snapshots to control storage usage.
Default retention is 3 snapshots, and backups are stored in /var/lib/etcd by default. This provides better backup reliability, persistence across reboots, and storage management for etcd databases.
1 parent 08c6416 commit 8ae83e6

File tree

1 file changed

+196
-9
lines changed

1 file changed

+196
-9
lines changed

heartbeat/podman-etcd

Lines changed: 196 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ OCF_RESKEY_reuse_default="0"
4949
OCF_RESKEY_oom_default="-997"
5050
OCF_RESKEY_config_location_default="/var/lib/etcd"
5151
OCF_RESKEY_backup_location_default="/var/lib/etcd"
52+
OCF_RESKEY_max_backup_snapshots_default="3"
5253

5354
: ${OCF_RESKEY_image=${OCF_RESKEY_image_default}}
5455
: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default}}
@@ -61,6 +62,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd"
6162
: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default}}
6263
: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default}}
6364
: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default}}
65+
: ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default}}
6466

6567

6668
#######################################################################
@@ -275,6 +277,17 @@ The directory where the resource agent stores its backups.
275277
<content type="string" default="${OCF_RESKEY_backup_location_default}"/>
276278
</parameter>
277279
280+
<parameter name="max_backup_snapshots" required="0" unique="0">
281+
<longdesc lang="en">
282+
Maximum number of etcd database snapshots to retain. When a new snapshot is created,
283+
older snapshots will be automatically removed to maintain this limit. This helps
284+
control storage usage while ensuring recent backups are available for recovery.
285+
Set max_backup_snapshots=0 to disable backups.
286+
</longdesc>
287+
<shortdesc lang="en">Maximum number of backup snapshots to retain</shortdesc>
288+
<content type="integer" default="${OCF_RESKEY_max_backup_snapshots_default}"/>
289+
</parameter>
290+
278291
</parameters>
279292
280293
<actions>
@@ -702,20 +715,190 @@ EOF
702715
} >> "$ETCD_CONFIGURATION_FILE"
703716
}
704717

718+
# Remove etcd member directory to allow the node to rejoin the cluster as a learner.
#
# When a node rejoins an etcd cluster, it must start fresh as a learner to prevent
# data inconsistencies. This function removes the member directory and syncs to disk.
#
# Globals:
#   ETCD_MEMBER_DIR - path of the etcd member directory to delete (read)
#
# Returns:
#   OCF_SUCCESS     - Member directory successfully removed
#   OCF_ERR_GENERIC - ETCD_MEMBER_DIR is empty, or the member directory could
#                     not be removed (critical error)
wipe_data_folder_for_learner()
{
	local rc=0

	# Never hand an empty path to a recursive delete.
	if [ -z "$ETCD_MEMBER_DIR" ]; then
		ocf_log err "could not delete etcd member directory: ETCD_MEMBER_DIR is empty"
		return $OCF_ERR_GENERIC
	fi

	ocf_log info "deleting etcd member directory ($ETCD_MEMBER_DIR) to enable learner rejoin"

	# Capture rm's own exit status: inside "if ! cmd; then", $? holds the
	# status of the negated pipeline (always 0), not the status of cmd.
	rm -rf -- "$ETCD_MEMBER_DIR" || rc=$?
	if [ "$rc" -ne 0 ]; then
		ocf_log err "could not delete etcd member directory ($ETCD_MEMBER_DIR), error code: $rc"
		return $OCF_ERR_GENERIC
	fi

	# Flush the removal to disk before reporting success.
	sync
	return $OCF_SUCCESS
}
736+
737+
738+
# Calculate available disk space in bytes for a given directory.
#
# This function queries the filesystem with df and returns the available space
# in bytes. df is asked for 1024-byte units explicitly (-k) so that the
# KB-to-bytes conversion is correct regardless of the df default block size.
#
# Arguments:
#   $1 - Target directory path to check
#
# Returns:
#   OCF_SUCCESS     - Available space in bytes (via stdout)
#   OCF_ERR_GENERIC - Failed to determine available space (error message via stdout)
get_available_space_in_directory()
{
	local target_dir=$1
	local df_output
	local available_space_kb
	local available_space_bytes

	# Capture df's stdout and stderr together so that, on failure, the
	# caller receives the actual df error text instead of an empty string.
	if ! df_output=$(df -Pk -- "$target_dir" 2>&1); then
		echo "df command failed or returned invalid value: $df_output"
		return $OCF_ERR_GENERIC
	fi

	# In the POSIX (-P) output format the second line describes the
	# filesystem and its 4th field is the available space.
	available_space_kb=$(printf '%s\n' "$df_output" | awk 'NR==2 {print $4}')

	# Validate output is numeric (non-empty, digits only)
	case "$available_space_kb" in
		''|*[!0-9]*)
			echo "df command failed or returned invalid value: $available_space_kb"
			return $OCF_ERR_GENERIC
			;;
	esac

	available_space_bytes=$((available_space_kb * 1024))
	echo "$available_space_bytes"
	return $OCF_SUCCESS
}
767+
768+
# Archive etcd database with backup and cleanup
#
# This function creates a backup copy of the etcd database, validates it, and
# removes old backups according to the retention policy. Backups are optional
# and can be disabled by setting max_backup_snapshots=0.
#
# Globals:
#   OCF_RESKEY_backup_location      - directory receiving the snapshot files (read)
#   OCF_RESKEY_max_backup_snapshots - retention limit; 0 disables backups (read)
#   ETCD_MEMBER_DIR                 - etcd member directory holding snap/db (read)
#
# Error handling strategy:
#   All backup failures return OCF_SUCCESS to prevent blocking cluster recovery.
#   Backups are beneficial but not critical for recovery operations.
#
# Returns:
#   OCF_SUCCESS - always (failures are logged as warnings)
#
# NOTE: This function cannot use etcdctl/etcdutl utilities because the etcd
# server is not running when this backup is performed.
archive_data_folder()
{
	local backup_dir="$OCF_RESKEY_backup_location"
	local etcd_db_path="$ETCD_MEMBER_DIR/snap/db"
	local max_snapshots="$OCF_RESKEY_max_backup_snapshots"
	local available_space_bytes
	local required_space_bytes
	local timestamp
	local backup_file
	local backup_size_bytes
	local rc

	# Only compare numerically once the value is known to be an integer,
	# otherwise "[ ... -eq 0 ]" aborts with a shell error. An invalid value
	# does not prevent the backup itself; it is only reported.
	case "$max_snapshots" in
		''|*[!0-9]*)
			ocf_log warn "invalid max_backup_snapshots value. Non-negative integer expected, got '$max_snapshots' instead, retention cannot be enforced"
			;;
		*)
			if [ "$max_snapshots" -eq 0 ]; then
				ocf_log debug "etcd backup disabled (max_backup_snapshots=0)"
				return $OCF_SUCCESS
			fi
			;;
	esac

	# Check if the etcd database file exists
	if [ ! -f "$etcd_db_path" ]; then
		ocf_log warn "backup skipped: etcd database file not found at '$etcd_db_path'"
		return $OCF_SUCCESS
	fi

	# Ensure backup directory exists
	if [ ! -d "$backup_dir" ]; then
		ocf_log debug "creating backup directory: '$backup_dir'"
		if ! mkdir -p "$backup_dir"; then
			ocf_log warn "backup skipped: failed to create backup directory '$backup_dir'"
			return $OCF_SUCCESS
		fi
	fi

	ocf_log debug "checking disk space: backup_dir=$backup_dir"
	# On failure the helper prints its error message on stdout, so the
	# captured value doubles as the diagnostic text.
	if ! available_space_bytes=$(get_available_space_in_directory "$backup_dir"); then
		ocf_log warn "backup skipped: could not compute available disk space in '$backup_dir', error msg: $available_space_bytes"
		return $OCF_SUCCESS
	fi

	required_space_bytes=$(stat -c %s "$etcd_db_path" 2>&1)
	case "$required_space_bytes" in
		''|*[!0-9]*)
			ocf_log warn "backup skipped: could not compute etcd database size at '$etcd_db_path', error msg: $required_space_bytes"
			return $OCF_SUCCESS
			;;
	esac

	if [ "$required_space_bytes" -gt "$available_space_bytes" ]; then
		ocf_log warn "backup skipped: insufficient disk space (required: ${required_space_bytes}B, available: ${available_space_bytes}B)"
		return $OCF_SUCCESS
	fi

	# Generate timestamp and backup filename
	timestamp=$(date +%Y%m%d-%H%M%S)
	backup_file="$backup_dir/snapshot-$timestamp.db"

	ocf_log info "creating etcd database backup: '$backup_file'"

	# Create the backup by copying the database file (--reflink=auto is
	# passed so cp may use a lightweight clone where available).
	# Capture cp's own exit status: inside "if ! cmd; then", $? is the
	# status of the negated pipeline (always 0), not the status of cmd.
	rc=0
	cp --reflink=auto -- "$etcd_db_path" "$backup_file" || rc=$?
	if [ "$rc" -ne 0 ]; then
		ocf_log warn "backup creation failed: could not copy '$etcd_db_path' to '$backup_file', error code: $rc"
		# Do not leave a partial file behind: it would look like a valid
		# snapshot and count toward the retention limit.
		rm -f -- "$backup_file"
		return $OCF_SUCCESS
	fi

	# Validate the backup file exists and has non-zero size
	if [ ! -f "$backup_file" ]; then
		ocf_log warn "backup validation failed: snapshot file '$backup_file' does not exist"
		return $OCF_SUCCESS
	fi

	backup_size_bytes=$(stat -c %s "$backup_file" 2>/dev/null || echo "0")
	if [ "$backup_size_bytes" -eq 0 ]; then
		ocf_log warn "backup validation failed: backup file '$backup_file' has zero size"
		rm -f -- "$backup_file"
		return $OCF_SUCCESS
	fi

	ocf_log info "backup created successfully: $backup_file (${backup_size_bytes}B)"

	# Cleanup old backups based on retention policy
	cleanup_old_backups "$backup_dir"

	return $OCF_SUCCESS
}
861+
862+
# Enforce the snapshot retention policy in a backup directory.
#
# Keeps the newest OCF_RESKEY_max_backup_snapshots files matching
# "snapshot-*.db" (ordered by modification time) directly inside the given
# directory and deletes the older ones. Failures are logged but never
# propagated.
#
# Arguments:
#   $1 - Backup directory to clean up
#
# Globals:
#   OCF_RESKEY_max_backup_snapshots - number of snapshots to keep (read)
#
# Returns:
#   OCF_SUCCESS - always
cleanup_old_backups()
{
	local backup_dir="$1"
	local max_snapshots="$OCF_RESKEY_max_backup_snapshots"
	local backup_list
	local backup_count
	local backups_to_remove
	local old_backups
	local old_backup
	local rc

	# Validate max_snapshots is a positive integer (no leading zero)
	case "$max_snapshots" in
		''|0*|*[!0-9]*)
			ocf_log warn "invalid max_backup_snapshots value. Positive integer expected, got '$max_snapshots' instead, skipping cleanup"
			return $OCF_SUCCESS
			;;
	esac

	# Scan the directory once: "<mtime> <path>" per line, oldest first.
	backup_list=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f -printf '%T@ %p\n' 2>/dev/null | sort -n)

	# Count existing backup files (non-empty lines; yields 0 for an empty list)
	backup_count=$(printf '%s\n' "$backup_list" | grep -c .)

	if [ "$backup_count" -le "$max_snapshots" ]; then
		ocf_log info "backup count ($backup_count) is within retention limit ($max_snapshots), no cleanup needed"
		return $OCF_SUCCESS
	fi

	# Calculate how many backups to remove
	backups_to_remove=$((backup_count - max_snapshots))
	ocf_log info "removing $backups_to_remove old backup(s) to maintain retention limit of $max_snapshots"

	# Oldest entries come first; strip the leading mtime column to get paths.
	old_backups=$(printf '%s\n' "$backup_list" | head -n "$backups_to_remove" | cut -d' ' -f2-)

	if [ -n "$old_backups" ]; then
		ocf_log info "removing old backups: $old_backups"
		# Delete line by line instead of piping into xargs, which would
		# word-split paths containing whitespace or quotes. The loop reads
		# from a here-document so it runs in the current shell.
		while IFS= read -r old_backup; do
			[ -n "$old_backup" ] || continue
			rc=0
			rm -f -- "$old_backup" || rc=$?
			if [ "$rc" -ne 0 ]; then
				ocf_log warn "failed to remove old backup '$old_backup', error code: $rc"
			fi
		done <<EOF
$old_backups
EOF
	fi

	return $OCF_SUCCESS
}
720903

721904
etcd_pod_container_exists() {
@@ -1886,6 +2069,9 @@ podman_start()
18862069
fi
18872070

18882071
archive_data_folder
2072+
if ! wipe_data_folder_for_learner; then
2073+
return "$OCF_ERR_GENERIC"
2074+
fi
18892075
fi
18902076

18912077
ocf_log info "check for changes in pod manifest to decide if the container should be reused or replaced"
@@ -2189,6 +2375,7 @@ CONTAINER=$OCF_RESKEY_name
21892375
POD_MANIFEST_COPY="${OCF_RESKEY_config_location}/pod.yaml"
21902376
ETCD_CONFIGURATION_FILE="${OCF_RESKEY_config_location}/config.yaml"
21912377
ETCD_BACKUP_FILE="${OCF_RESKEY_backup_location}/config-previous.tar.gz"
2378+
ETCD_MEMBER_DIR="/var/lib/etcd/member"
21922379
ETCD_REVISION_JSON="/var/lib/etcd/revision.json"
21932380
ETCD_REVISION_BUMP_PERCENTAGE=0.2
21942381
ETCD_BUMP_REV_DEFAULT=1000000000

0 commit comments

Comments
 (0)