@@ -49,6 +49,7 @@ OCF_RESKEY_reuse_default="0"
4949OCF_RESKEY_oom_default=" -997"
5050OCF_RESKEY_config_location_default=" /var/lib/etcd"
5151OCF_RESKEY_backup_location_default=" /var/lib/etcd"
52+ OCF_RESKEY_max_backup_snapshots_default=" 3"
5253
5354: ${OCF_RESKEY_image=${OCF_RESKEY_image_default} }
5455: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default} }
@@ -61,6 +62,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd"
6162: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default} }
6263: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default} }
6364: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default} }
65+ : ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default} }
6466
6567
6668# ######################################################################
@@ -275,6 +277,17 @@ The directory where the resource agent stores its backups.
275277<content type="string" default="${OCF_RESKEY_backup_location_default} "/>
276278</parameter>
277279
280+ <parameter name="max_backup_snapshots" required="0" unique="0">
281+ <longdesc lang="en">
282+ Maximum number of etcd database snapshots to retain. When a new snapshot is created,
283+ older snapshots will be automatically removed to maintain this limit. This helps
284+ control storage usage while ensuring recent backups are available for recovery.
285+ Set max_backup_snapshots=0 to disable backups.
286+ </longdesc>
287+ <shortdesc lang="en">Maximum number of backup snapshots to retain</shortdesc>
288+ <content type="integer" default="${OCF_RESKEY_max_backup_snapshots_default} "/>
289+ </parameter>
290+
278291</parameters>
279292
280293<actions>
@@ -702,20 +715,190 @@ EOF
702715 } >> " $ETCD_CONFIGURATION_FILE "
703716}
704717
# Remove the etcd member directory to allow the node to rejoin the cluster as a learner.
#
# When a node rejoins an etcd cluster, it must start fresh as a learner to prevent
# data inconsistencies. This function deletes $ETCD_MEMBER_DIR recursively and then
# calls sync so the removal is flushed to disk before the caller continues.
#
# Globals read:
#   ETCD_MEMBER_DIR - path of the etcd member directory to delete
#
# Returns:
#   OCF_SUCCESS     - Member directory successfully removed
#   OCF_ERR_GENERIC - Failed to remove member directory (critical error)
wipe_data_folder_for_learner()
{
	local rc

	ocf_log info "deleting etcd member directory ($ETCD_MEMBER_DIR) to enable learner rejoin"

	# Capture rm's own exit status: inside "if ! cmd; then", $? holds the
	# status of the negated pipeline (always 0), not the status of cmd.
	rm -rf "$ETCD_MEMBER_DIR"
	rc=$?
	if [ "$rc" -ne 0 ]; then
		ocf_log err "could not delete etcd member directory ($ETCD_MEMBER_DIR), error code: $rc"
		return $OCF_ERR_GENERIC
	fi

	sync
	return $OCF_SUCCESS
}
736+
737+
# Calculate available disk space in bytes for a given directory.
#
# This function queries the filesystem holding the directory with df and prints
# the available space in bytes. df is asked for 1024-byte units explicitly (-k),
# because the POSIX output format (-P) on its own reports 512-byte units on
# strictly conforming implementations; the value is then converted to bytes.
#
# Arguments:
#   $1 - Target directory path to check
#
# Output (stdout):
#   On success, the available space in bytes.
#   On failure, a human-readable error message.
#
# Returns:
#   OCF_SUCCESS     - Available space was determined
#   OCF_ERR_GENERIC - df failed or its output could not be parsed
get_available_space_in_directory()
{
	local target_dir=$1
	local df_output
	local rc
	local available_space_kb
	local available_space_bytes

	# Check df's own exit status instead of relying on the pipeline, whose
	# status would be awk's. stderr is discarded so that warnings cannot shift
	# the line that awk parses.
	df_output=$(df -P -k "$target_dir" 2>/dev/null)
	rc=$?
	if [ "$rc" -ne 0 ]; then
		echo "df command failed for '$target_dir', error code: $rc"
		return $OCF_ERR_GENERIC
	fi

	# Line 1 is the header; line 2, column 4 is the "Available" figure.
	available_space_kb=$(echo "$df_output" | awk 'NR==2 {print $4}')

	# Validate output is numeric
	if ! echo "$available_space_kb" | grep -q '^[0-9][0-9]*$'; then
		echo "df command failed or returned invalid value: $available_space_kb"
		return $OCF_ERR_GENERIC
	fi

	available_space_bytes=$((available_space_kb * 1024))
	echo "$available_space_bytes"
	return $OCF_SUCCESS
}
767+
# Archive etcd database with backup and cleanup
#
# This function creates a backup copy of the etcd database file
# ($ETCD_MEMBER_DIR/snap/db) under $OCF_RESKEY_backup_location, validates it,
# and removes old backups according to the retention policy. Backups are
# optional and can be disabled by setting max_backup_snapshots=0.
#
# Steps:
#   1. Skip when backups are disabled or the database file is missing.
#   2. Create the backup directory if needed.
#   3. Skip when the backup directory has less free space than the database size.
#   4. Copy the database to snapshot-<YYYYmmdd-HHMMSS>.db and check it is non-empty.
#   5. Call cleanup_old_backups to enforce the retention limit.
#
# Error handling strategy:
#   All backup failures return OCF_SUCCESS to prevent blocking cluster recovery.
#   Backups are beneficial but not critical for recovery operations.
#
# NOTE: This function cannot use etcdctl/etcdutl utilities because the etcd
# server is not running when this backup is performed.
#
# Returns:
#   OCF_SUCCESS - always (failures are logged as warnings)
archive_data_folder()
{
	local backup_dir="$OCF_RESKEY_backup_location"
	local etcd_db_path="$ETCD_MEMBER_DIR/snap/db"
	local rc

	# A non-numeric value makes the test fail (backup proceeds); stderr is
	# silenced so the shell's "integer expression expected" noise does not
	# leak. cleanup_old_backups reports the invalid value in that case.
	if [ "$OCF_RESKEY_max_backup_snapshots" -eq 0 ] 2>/dev/null; then
		ocf_log debug "etcd backup disabled (max_backup_snapshots=0)"
		return $OCF_SUCCESS
	fi

	# Check if the etcd database file exists
	if [ ! -f "$etcd_db_path" ]; then
		ocf_log warn "backup skipped: etcd database file not found at '$etcd_db_path'"
		return $OCF_SUCCESS
	fi

	# Ensure backup directory exists
	if [ ! -d "$backup_dir" ]; then
		ocf_log debug "creating backup directory: '$backup_dir'"
		if ! mkdir -p "$backup_dir"; then
			ocf_log warn "backup skipped: failed to create backup directory '$backup_dir'"
			return $OCF_SUCCESS
		fi
	fi

	ocf_log debug "checking disk space: backup_dir=$backup_dir"
	local available_space_bytes
	# On failure the helper prints its error message on stdout, so the
	# variable holds the message rather than a byte count.
	if ! available_space_bytes=$(get_available_space_in_directory "$backup_dir"); then
		ocf_log warn "backup skipped: could not compute available disk space in '$backup_dir', error msg: $available_space_bytes"
		return $OCF_SUCCESS
	fi

	local required_space_bytes
	required_space_bytes=$(stat -c %s "$etcd_db_path" 2>&1)
	if ! echo "$required_space_bytes" | grep -q '^[0-9][0-9]*$'; then
		ocf_log warn "backup skipped: could not compute etcd database size at '$etcd_db_path', error msg: $required_space_bytes"
		return $OCF_SUCCESS
	fi

	if [ "$required_space_bytes" -gt "$available_space_bytes" ]; then
		ocf_log warn "backup skipped: insufficient disk space (required: ${required_space_bytes}B, available: ${available_space_bytes}B)"
		return $OCF_SUCCESS
	fi

	# Generate timestamp and backup filename
	local timestamp
	timestamp=$(date +%Y%m%d-%H%M%S)

	local backup_file
	backup_file="$backup_dir/snapshot-$timestamp.db"

	ocf_log info "creating etcd database backup: '$backup_file'"

	# Create the backup by copying the database file. --reflink=auto lets cp
	# use a copy-on-write clone where the filesystem supports it and fall back
	# to a regular copy otherwise.
	cp --reflink=auto "$etcd_db_path" "$backup_file"
	rc=$?
	if [ "$rc" -ne 0 ]; then
		ocf_log warn "backup creation failed: could not copy '$etcd_db_path' to '$backup_file', error code: $rc"
		# Do not leave a truncated snapshot behind: it would match the
		# snapshot-*.db pattern and count toward the retention limit.
		rm -f "$backup_file"
		return $OCF_SUCCESS
	fi

	# Validate the backup file exists and has non-zero size
	if [ ! -f "$backup_file" ]; then
		ocf_log warn "backup validation failed: snapshot file '$backup_file' does not exist"
		return $OCF_SUCCESS
	fi

	local backup_size_bytes
	backup_size_bytes=$(stat -c %s "$backup_file" 2>/dev/null || echo "0")
	if [ "$backup_size_bytes" -eq 0 ]; then
		ocf_log warn "backup validation failed: backup file '$backup_file' has zero size"
		rm -f "$backup_file"
		return $OCF_SUCCESS
	fi

	ocf_log info "backup created successfully: $backup_file (${backup_size_bytes}B)"

	# Cleanup old backups based on retention policy
	cleanup_old_backups "$backup_dir"

	return $OCF_SUCCESS
}
861+
# Enforce the snapshot retention limit in a backup directory.
#
# Counts the regular files named snapshot-*.db directly inside the given
# directory and, when there are more than $OCF_RESKEY_max_backup_snapshots,
# deletes the oldest ones (by modification time) until the limit is met.
#
# Arguments:
#   $1 - Backup directory to clean up
#
# Globals read:
#   OCF_RESKEY_max_backup_snapshots - number of snapshots to keep (positive integer)
#
# Returns:
#   OCF_SUCCESS - always; an invalid limit or a failed removal is logged as a
#                 warning and does not fail the caller.
cleanup_old_backups()
{
	local backup_dir="$1"
	local max_snapshots="$OCF_RESKEY_max_backup_snapshots"
	local backup_count
	local backups_to_remove
	local old_backups
	local rc

	# Validate max_snapshots is a positive integer
	if ! echo "$max_snapshots" | grep -q '^[1-9][0-9]*$'; then
		ocf_log warn "invalid max_backup_snapshots value. Positive integer expected, got '$max_snapshots' instead, skipping cleanup"
		return $OCF_SUCCESS
	fi

	# Count existing backup files
	backup_count=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f 2>/dev/null | wc -l)

	if [ "$backup_count" -le "$max_snapshots" ]; then
		ocf_log info "backup count ($backup_count) is within retention limit ($max_snapshots), no cleanup needed"
		return $OCF_SUCCESS
	fi

	# Calculate how many backups to remove
	backups_to_remove=$((backup_count - max_snapshots))
	ocf_log info "removing $backups_to_remove old backup(s) to maintain retention limit of $max_snapshots"

	# Find oldest backups sorted by modification time: each line is
	# "<mtime epoch> <path>"; after the numeric sort, keep the first N paths.
	old_backups=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f -printf '%T@ %p\n' 2>/dev/null |
		sort -n |
		head -n "$backups_to_remove" |
		cut -d' ' -f2-)

	if [ -n "$old_backups" ]; then
		ocf_log info "removing old backups: $old_backups"
		# One path per line: -d '\n' stops xargs from splitting paths on
		# spaces or interpreting quotes, and "--" protects against names
		# starting with a dash. The pipeline status is xargs' own status,
		# captured directly because "$?" after "if ! ..." is always 0.
		echo "$old_backups" | xargs -r -d '\n' rm -f --
		rc=$?
		if [ "$rc" -ne 0 ]; then
			ocf_log warn "failed to remove some old backups, error code: $rc"
		fi
	fi

	return $OCF_SUCCESS
}
720903
721904etcd_pod_container_exists () {
@@ -1886,6 +2069,9 @@ podman_start()
18862069 fi
18872070
18882071 archive_data_folder
2072+ if ! wipe_data_folder_for_learner; then
2073+ return " $OCF_ERR_GENERIC "
2074+ fi
18892075 fi
18902076
18912077 ocf_log info " check for changes in pod manifest to decide if the container should be reused or replaced"
@@ -2189,6 +2375,7 @@ CONTAINER=$OCF_RESKEY_name
21892375POD_MANIFEST_COPY=" ${OCF_RESKEY_config_location} /pod.yaml"
21902376ETCD_CONFIGURATION_FILE=" ${OCF_RESKEY_config_location} /config.yaml"
21912377ETCD_BACKUP_FILE=" ${OCF_RESKEY_backup_location} /config-previous.tar.gz"
2378+ ETCD_MEMBER_DIR=" /var/lib/etcd/member"
21922379ETCD_REVISION_JSON=" /var/lib/etcd/revision.json"
21932380ETCD_REVISION_BUMP_PERCENTAGE=0.2
21942381ETCD_BUMP_REV_DEFAULT=1000000000
0 commit comments