@@ -49,6 +49,7 @@ OCF_RESKEY_reuse_default="0"
4949OCF_RESKEY_oom_default=" -997"
5050OCF_RESKEY_config_location_default=" /var/lib/etcd"
5151OCF_RESKEY_backup_location_default=" /var/lib/etcd"
52+ OCF_RESKEY_max_backup_snapshots_default=" 3"
5253
5354: ${OCF_RESKEY_image=${OCF_RESKEY_image_default} }
5455: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default} }
@@ -61,6 +62,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd"
6162: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default} }
6263: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default} }
6364: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default} }
65+ : ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default} }
6466
6567
6668# ######################################################################
@@ -275,6 +277,17 @@ The directory where the resource agent stores its backups.
275277<content type="string" default="${OCF_RESKEY_backup_location_default} "/>
276278</parameter>
277279
280+ <parameter name="max_backup_snapshots" required="0" unique="0">
281+ <longdesc lang="en">
282+ Maximum number of etcd database snapshots to retain. When a new snapshot is created,
283+ older snapshots will be automatically removed to maintain this limit. This helps
284+ control storage usage while ensuring recent backups are available for recovery.
285+ Set max_backup_snapshots=0 to disable backups.
286+ </longdesc>
287+ <shortdesc lang="en">Maximum number of backup snapshots to retain</shortdesc>
288+ <content type="integer" default="${OCF_RESKEY_max_backup_snapshots_default} "/>
289+ </parameter>
290+
278291</parameters>
279292
280293<actions>
@@ -719,20 +732,190 @@ EOF
719732 return $OCF_SUCCESS
720733}
721734
# Remove etcd member directory to allow the node to rejoin the cluster as a learner.
#
# When a node rejoins an etcd cluster, it must start fresh as a learner to prevent
# data inconsistencies. This function recursively removes $ETCD_MEMBER_DIR and
# then calls sync to flush the change to disk.
#
# Returns:
#   OCF_SUCCESS     - Member directory successfully removed
#   OCF_ERR_GENERIC - Failed to remove member directory (critical error)
wipe_data_folder_for_learner()
{
	local rc

	ocf_log info "deleting etcd member directory ($ETCD_MEMBER_DIR) to enable learner rejoin"

	# Capture the exit status right after rm: inside "if ! cmd; then" the value
	# of $? is the status of the negated pipeline (always 0), not rm's status.
	rm -rf "$ETCD_MEMBER_DIR"
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log err "could not delete etcd member directory ($ETCD_MEMBER_DIR), error code: $rc"
		return $OCF_ERR_GENERIC
	fi

	sync
	return $OCF_SUCCESS
}
753+
754+
# Calculate available disk space in bytes for a given directory.
#
# This function queries the filesystem holding the directory with df and
# prints the available space in bytes. df is forced to report 1024-byte
# units (-k) so the conversion to bytes does not depend on the environment.
#
# Arguments:
#   $1 - Target directory path to check
#
# Returns:
#   OCF_SUCCESS     - Available space in bytes (via stdout)
#   OCF_ERR_GENERIC - Failed to determine available space (error message via stdout)
get_available_space_in_directory()
{
	local target_dir=$1
	local df_output
	local available_space_kb
	local available_space_bytes

	# Capture df's own stderr and exit status. Redirecting on the awk side of a
	# pipe would drop the df error text from the returned message.
	# -P keeps each filesystem on a single line; -k pins the unit to 1024 bytes.
	if ! df_output=$(df -Pk "$target_dir" 2>&1); then
		echo "df command failed or returned invalid value: $df_output"
		return $OCF_ERR_GENERIC
	fi

	# Line 2 is the first data row; column 4 is the "Available" column.
	available_space_kb=$(echo "$df_output" | awk 'NR==2 {print $4}')

	# Validate output is numeric
	if ! echo "$available_space_kb" | grep -q '^[0-9][0-9]*$'; then
		echo "df command failed or returned invalid value: $available_space_kb"
		return $OCF_ERR_GENERIC
	fi

	available_space_bytes=$((available_space_kb * 1024))
	echo "$available_space_bytes"
	return $OCF_SUCCESS
}
784+
# Archive etcd database with backup and cleanup
#
# This function creates a backup copy of the etcd database file
# ($ETCD_MEMBER_DIR/snap/db) under $OCF_RESKEY_backup_location, validates the
# copy by size, and removes old backups according to the retention policy.
# Backups are optional and can be disabled by setting max_backup_snapshots=0.
#
# Error handling strategy:
#   All backup failures return OCF_SUCCESS to prevent blocking cluster recovery.
#   Backups are beneficial but not critical for recovery operations.
#
# NOTE: This function cannot use etcdctl/etcdutl utilities because the etcd
#       server is not running when this backup is performed.
#
# Returns:
#   OCF_SUCCESS - always (see error handling strategy above)
archive_data_folder()
{
	local backup_dir="$OCF_RESKEY_backup_location"
	local etcd_db_path="$ETCD_MEMBER_DIR/snap/db"
	local available_space_bytes
	local required_space_bytes
	local backup_size_bytes
	local timestamp
	local backup_file
	local rc

	# Reject non-numeric values up front. Otherwise the numeric test below
	# errors out and falls through, creating backups that the retention
	# cleanup then refuses to prune.
	if ! echo "$OCF_RESKEY_max_backup_snapshots" | grep -q '^[0-9][0-9]*$'; then
		ocf_log warn "backup skipped: invalid max_backup_snapshots value. Non-negative integer expected, got '$OCF_RESKEY_max_backup_snapshots' instead"
		return $OCF_SUCCESS
	fi

	if [ "$OCF_RESKEY_max_backup_snapshots" -eq 0 ]; then
		ocf_log debug "etcd backup disabled (max_backup_snapshots=0)"
		return $OCF_SUCCESS
	fi

	# Check if the etcd database file exists
	if [ ! -f "$etcd_db_path" ]; then
		ocf_log warn "backup skipped: etcd database file not found at '$etcd_db_path'"
		return $OCF_SUCCESS
	fi

	# Ensure backup directory exists
	if [ ! -d "$backup_dir" ]; then
		ocf_log debug "creating backup directory: '$backup_dir'"
		if ! mkdir -p "$backup_dir"; then
			ocf_log warn "backup skipped: failed to create backup directory '$backup_dir'"
			return $OCF_SUCCESS
		fi
	fi

	ocf_log debug "checking disk space: backup_dir=$backup_dir"
	if ! available_space_bytes=$(get_available_space_in_directory "$backup_dir"); then
		# On failure the helper prints its error message on stdout.
		ocf_log warn "backup skipped: could not compute available disk space in '$backup_dir', error msg: $available_space_bytes"
		return $OCF_SUCCESS
	fi

	required_space_bytes=$(stat -c %s "$etcd_db_path" 2>&1)
	if ! echo "$required_space_bytes" | grep -q '^[0-9][0-9]*$'; then
		ocf_log warn "backup skipped: could not compute etcd database size at '$etcd_db_path', error msg: $required_space_bytes"
		return $OCF_SUCCESS
	fi

	if [ "$required_space_bytes" -gt "$available_space_bytes" ]; then
		ocf_log warn "backup skipped: insufficient disk space (required: ${required_space_bytes}B, available: ${available_space_bytes}B)"
		return $OCF_SUCCESS
	fi

	# Generate timestamp and backup filename
	timestamp=$(date +%Y%m%d-%H%M%S)
	backup_file="$backup_dir/snapshot-$timestamp.db"

	ocf_log info "creating etcd database backup: '$backup_file'"

	# Create the backup by copying the database file (--reflink=auto asks cp
	# for a copy-on-write clone when the filesystem supports it).
	# The status is captured explicitly: inside "if ! cmd; then" the value of
	# $? would be the negated pipeline's status (always 0).
	cp --reflink=auto "$etcd_db_path" "$backup_file"
	rc=$?
	if [ $rc -ne 0 ]; then
		ocf_log warn "backup creation failed: could not copy '$etcd_db_path' to '$backup_file', error code: $rc"
		# Do not leave a partial snapshot behind: it would be counted by the
		# retention policy and could be mistaken for a valid backup.
		rm -f "$backup_file"
		return $OCF_SUCCESS
	fi

	# Validate the backup file exists and has the expected size
	if [ ! -f "$backup_file" ]; then
		ocf_log warn "backup validation failed: snapshot file '$backup_file' does not exist"
		return $OCF_SUCCESS
	fi

	backup_size_bytes=$(stat -c %s "$backup_file" 2>/dev/null || echo "0")
	if [ "$backup_size_bytes" -ne "$required_space_bytes" ]; then
		ocf_log warn "backup validation failed: size mismatch (expected: ${required_space_bytes}B, got: ${backup_size_bytes}B)"
		rm -f "$backup_file"
		return $OCF_SUCCESS
	fi

	ocf_log info "backup created successfully: $backup_file (${backup_size_bytes}B)"

	# Cleanup old backups based on retention policy
	cleanup_old_backups "$backup_dir"

	return $OCF_SUCCESS
}
877+
# Enforce the snapshot retention policy in a backup directory.
#
# Counts the regular files named snapshot-*.db directly inside the given
# directory and, when there are more than $OCF_RESKEY_max_backup_snapshots,
# removes the oldest ones (by modification time) until the limit is met.
#
# Arguments:
#   $1 - Backup directory to clean up
#
# Returns:
#   OCF_SUCCESS - always; invalid configuration and removal failures are
#                 logged as warnings and do not fail the caller
cleanup_old_backups()
{
	local backup_dir="$1"
	local max_snapshots="$OCF_RESKEY_max_backup_snapshots"
	local backup_count
	local backups_to_remove
	local old_backups
	local old_backup
	local rc

	# Validate max_snapshots is a positive integer
	if ! echo "$max_snapshots" | grep -q '^[1-9][0-9]*$'; then
		ocf_log warn "invalid max_backup_snapshots value. Positive integer expected, got '$max_snapshots' instead, skipping cleanup"
		return $OCF_SUCCESS
	fi

	# Count existing backup files
	backup_count=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f 2>/dev/null | wc -l)

	if [ "$backup_count" -le "$max_snapshots" ]; then
		ocf_log info "backup count ($backup_count) is within retention limit ($max_snapshots), no cleanup needed"
		return $OCF_SUCCESS
	fi

	# Calculate how many backups to remove
	backups_to_remove=$((backup_count - max_snapshots))
	ocf_log info "removing $backups_to_remove old backup(s) to maintain retention limit of $max_snapshots"

	# Find oldest backups sorted by modification time
	# -t sorts by modification time, -r reverses (oldest first)
	# -print0 and -0 pass the paths to ls without word splitting
	old_backups=$(find "$backup_dir" -maxdepth 1 -name "snapshot-*.db" -type f -print0 2>/dev/null | \
		xargs -0 -r ls -tr | \
		head -n "$backups_to_remove")

	if [ -z "$old_backups" ]; then
		return $OCF_SUCCESS
	fi

	ocf_log info "removing old backups: $old_backups"

	# Remove one path per line. Feeding the list to a plain xargs would split
	# paths on whitespace and break when the backup directory contains spaces.
	# The status is captured right after rm so the real value is logged.
	printf '%s\n' "$old_backups" | while IFS= read -r old_backup; do
		[ -n "$old_backup" ] || continue
		rm -f "$old_backup"
		rc=$?
		if [ $rc -ne 0 ]; then
			ocf_log warn "failed to remove old backup '$old_backup', error code: $rc"
		fi
	done

	return $OCF_SUCCESS
}
737920
738921etcd_pod_container_exists () {
@@ -1901,6 +2084,9 @@ podman_start()
19012084 fi
19022085
19032086 archive_data_folder
2087+ if ! wipe_data_folder_for_learner; then
2088+ return " $OCF_ERR_GENERIC "
2089+ fi
19042090 fi
19052091
19062092 ocf_log info " check for changes in pod manifest to decide if the container should be reused or replaced"
@@ -2250,6 +2436,7 @@ CONTAINER=$OCF_RESKEY_name
22502436POD_MANIFEST_COPY=" ${OCF_RESKEY_config_location} /pod.yaml"
22512437ETCD_CONFIGURATION_FILE=" ${OCF_RESKEY_config_location} /config.yaml"
22522438ETCD_BACKUP_FILE=" ${OCF_RESKEY_backup_location} /config-previous.tar.gz"
2439+ ETCD_MEMBER_DIR=" /var/lib/etcd/member"
22532440ETCD_REVISION_JSON=" /var/lib/etcd/revision.json"
22542441ETCD_REVISION_BUMP_PERCENTAGE=0.2
22552442ETCD_BUMP_REV_DEFAULT=1000000000
0 commit comments