@@ -49,6 +49,7 @@ OCF_RESKEY_reuse_default="0"
4949OCF_RESKEY_oom_default=" -997"
5050OCF_RESKEY_config_location_default=" /var/lib/etcd"
5151OCF_RESKEY_backup_location_default=" /var/lib/etcd"
52+ OCF_RESKEY_max_backup_snapshots_default=" 3"
5253
5354: ${OCF_RESKEY_image=${OCF_RESKEY_image_default} }
5455: ${OCF_RESKEY_pod_manifest=${OCF_RESKEY_pod_manifest_default} }
@@ -61,6 +62,7 @@ OCF_RESKEY_backup_location_default="/var/lib/etcd"
6162: ${OCF_RESKEY_oom=${OCF_RESKEY_oom_default} }
6263: ${OCF_RESKEY_config_location=${OCF_RESKEY_config_location_default} }
6364: ${OCF_RESKEY_backup_location=${OCF_RESKEY_backup_location_default} }
65+ : ${OCF_RESKEY_max_backup_snapshots=${OCF_RESKEY_max_backup_snapshots_default} }
6466
6567
6668# ######################################################################
@@ -275,6 +277,16 @@ The directory where the resource agent stores its backups.
275277<content type="string" default="${OCF_RESKEY_backup_location_default} "/>
276278</parameter>
277279
280+ <parameter name="max_backup_snapshots" required="0" unique="0">
281+ <longdesc lang="en">
282+ Maximum number of etcd database snapshots to retain. When a new snapshot is created,
283+ older snapshots will be automatically removed to maintain this limit. This helps
284+ control storage usage while ensuring recent backups are available for recovery.
285+ </longdesc>
286+ <shortdesc lang="en">Maximum number of backup snapshots to retain</shortdesc>
287+ <content type="integer" default="${OCF_RESKEY_max_backup_snapshots_default} "/>
288+ </parameter>
289+
278290</parameters>
279291
280292<actions>
@@ -702,20 +714,127 @@ EOF
702714 } >> " $ETCD_CONFIGURATION_FILE "
703715}
704716
# Archive etcd database with backup and cleanup
#
# Copies the etcd database file ($ETCD_MEMBER_DIR/snap/db) to a timestamped
# snapshot under $OCF_RESKEY_backup_location, prunes old snapshots via
# cleanup_old_backups, then removes the member directory so the node can
# rejoin the cluster as a learner.
#
# Error handling strategy:
# - Backup failures (missing database, mkdir, copy, validation) return
#   OCF_SUCCESS so that a failed backup is never reported as an agent error.
#   Backups are beneficial but not critical for recovery.
# - Member directory removal failure returns OCF_ERR_GENERIC because this
#   operation is critical for cluster rejoin.
#
# NOTE(review): every backup-failure path returns before the member directory
# is removed, so in those cases $ETCD_MEMBER_DIR is left in place. Confirm
# that callers expect this.
#
# NOTE: the agent cannot use etcdctl/etcdutl utilities as there
# is no running etcd server when this backup is performed.
#
# Globals read:
#   ETCD_MEMBER_DIR, OCF_RESKEY_backup_location
#
# Returns:
#   OCF_SUCCESS     - Archive completed or backup failed non-critically
#   OCF_ERR_GENERIC - Critical failure removing member directory
archive_data_folder()
{
	local etcd_db_path="$ETCD_MEMBER_DIR/snap/db"
	local backup_dir="$OCF_RESKEY_backup_location"
	local timestamp
	local backup_file
	local backup_size
	local rc

	# Check if the etcd database file exists
	if [ ! -f "$etcd_db_path" ]; then
		ocf_log warn "etcd database file not found at $etcd_db_path, skipping backup"
		return $OCF_SUCCESS
	fi

	# Ensure backup directory exists
	if [ ! -d "$backup_dir" ]; then
		ocf_log info "creating backup directory: $backup_dir"
		if ! mkdir -p "$backup_dir"; then
			ocf_log err "failed to create backup directory $backup_dir, skipping backup"
			return $OCF_SUCCESS
		fi
	fi

	# Generate timestamp and backup filename
	timestamp=$(date +%Y%m%d-%H%M%S)
	backup_file="$backup_dir/etcd-snapshot-$timestamp.db"

	ocf_log info "creating etcd database backup: $backup_file"

	# Create the backup by copying the database file.
	# The exit status is captured explicitly: inside "if ! cmd; then", $? holds
	# the status of the negated pipeline (always 0), not the status of cmd.
	cp "$etcd_db_path" "$backup_file"
	rc=$?
	if [ "$rc" -ne 0 ]; then
		ocf_log err "failed to create backup file $backup_file, error code: $rc"
		# Do not leave a truncated snapshot behind: the retention logic would
		# count it as a valid backup and could evict a good one in its favour.
		rm -f "$backup_file"
		return $OCF_SUCCESS
	fi

	# Validate the backup file exists and has non-zero size
	if [ ! -f "$backup_file" ]; then
		ocf_log err "backup validation failed: file $backup_file does not exist"
		return $OCF_SUCCESS
	fi

	backup_size=$(stat -c %s "$backup_file" 2>/dev/null || echo "0")
	if [ "$backup_size" -eq 0 ]; then
		ocf_log err "backup validation failed: file $backup_file has zero size"
		rm -f "$backup_file"
		return $OCF_SUCCESS
	fi

	ocf_log info "backup created successfully: $backup_file (size: $backup_size bytes)"

	# Cleanup old backups based on retention policy
	cleanup_old_backups "$backup_dir"

	# Remove Etcd "member" folder to allow the client to rejoin as learner
	ocf_log info "Removing Etcd members directory to allow rejoin as learner"
	rm -rf "$ETCD_MEMBER_DIR"
	rc=$?
	if [ "$rc" -ne 0 ]; then
		ocf_log err "could not remove $ETCD_MEMBER_DIR, error code: $rc"
		return $OCF_ERR_GENERIC
	fi

	sync
	return $OCF_SUCCESS
}
797+
# Enforce the snapshot retention policy in a backup directory.
#
# Counts the regular files named "etcd-snapshot-*.db" directly inside the
# given directory and, when there are more than
# $OCF_RESKEY_max_backup_snapshots, deletes the ones with the oldest
# modification time until the limit is met.
#
# Arguments:
#   $1 - directory that holds the snapshot files
#
# Globals read:
#   OCF_RESKEY_max_backup_snapshots - must be a positive integer without
#   leading zeros; any other value skips the cleanup with a warning.
#
# Returns:
#   OCF_SUCCESS always; removal failures are logged as warnings only.
cleanup_old_backups()
{
	local backup_dir="$1"
	local max_snapshots="$OCF_RESKEY_max_backup_snapshots"
	local backup_count
	local backups_to_remove
	local old_backups
	local old_backup

	# Validate max_snapshots is a positive integer. A pattern match on the
	# whole value (rather than a line-based grep) also rejects multi-line input.
	case "$max_snapshots" in
		''|0*|*[!0-9]*)
			ocf_log warn "invalid max_backup_snapshots value: $max_snapshots, skipping cleanup"
			return $OCF_SUCCESS
			;;
	esac

	# Count existing backup files
	backup_count=$(find "$backup_dir" -maxdepth 1 -name "etcd-snapshot-*.db" -type f 2>/dev/null | wc -l)

	if [ "$backup_count" -le "$max_snapshots" ]; then
		ocf_log info "backup count ($backup_count) is within retention limit ($max_snapshots), no cleanup needed"
		return $OCF_SUCCESS
	fi

	# Calculate how many backups to remove
	backups_to_remove=$((backup_count - max_snapshots))
	ocf_log info "removing $backups_to_remove old backup(s) to maintain retention limit of $max_snapshots"

	# Find oldest backups sorted by modification time (epoch seconds first,
	# then the path; cut drops the timestamp column again).
	old_backups=$(find "$backup_dir" -maxdepth 1 -name "etcd-snapshot-*.db" -type f -printf '%T@ %p\n' 2>/dev/null | \
		sort -n | \
		head -n "$backups_to_remove" | \
		cut -d' ' -f2-)

	if [ -n "$old_backups" ]; then
		ocf_log info "removing old backups: $old_backups"
		# Remove one path per line instead of piping through xargs, which
		# splits on any whitespace and breaks when the backup location
		# contains spaces.
		printf '%s\n' "$old_backups" | while IFS= read -r old_backup; do
			[ -n "$old_backup" ] || continue
			# $? here is rm's own status because it is expanded before
			# ocf_log runs on the right-hand side of ||.
			rm -f -- "$old_backup" || \
				ocf_log warn "failed to remove old backup $old_backup, error code: $?"
		done
	fi

	return $OCF_SUCCESS
}
720839
721840etcd_pod_container_exists () {
@@ -2189,6 +2308,7 @@ CONTAINER=$OCF_RESKEY_name
21892308POD_MANIFEST_COPY=" ${OCF_RESKEY_config_location} /pod.yaml"
21902309ETCD_CONFIGURATION_FILE=" ${OCF_RESKEY_config_location} /config.yaml"
21912310ETCD_BACKUP_FILE=" ${OCF_RESKEY_backup_location} /config-previous.tar.gz"
2311+ ETCD_MEMBER_DIR=" /var/lib/etcd/member"
21922312ETCD_REVISION_JSON=" /var/lib/etcd/revision.json"
21932313ETCD_REVISION_BUMP_PERCENTAGE=0.2
21942314ETCD_BUMP_REV_DEFAULT=1000000000
0 commit comments