diff --git a/08-snakemake/Snakefile_1 b/08-snakemake/Snakefile_1 index aacaff2..c971f0d 100644 --- a/08-snakemake/Snakefile_1 +++ b/08-snakemake/Snakefile_1 @@ -5,7 +5,7 @@ config = { "apptainer_bind": "/agr/scratch,/agr/persist", "apptainer_image": "/agr/persist/projects/2024_apsim_improvements/apsim-simulations/container/apsim-2024.09.7579.0.aimg", "excluded_txt_files": ["ExampleConfig.txt"], - "max_consecutive_failures": 10, + "max_consecutive_failures": 3, "slurm_logdir": "slurmlogs" } @@ -22,7 +22,9 @@ rule process_txt_files: output: "txt_files_processed" params: - logfile = lambda wildcards, output: f"{config['slurm_logdir']}/txt_files_%j.out" + logfile = lambda wildcards, output: f"{config['slurm_logdir']}/txt_files_%j.out", + processed_files_log = "processed_txt_files.log", + failed_files_log = "failed_txt_files.log" resources: mem_mb = 8000, time = "16:00:00" @@ -36,26 +38,45 @@ rule process_txt_files: export APPTAINER_CMD="apptainer exec {config[apptainer_image]}" mkdir -p FAILED_CONFIG - consecutive_failures=0 + rm -f {params.processed_files_log} {params.failed_files_log} + touch {params.processed_files_log} {params.failed_files_log} + + total_files=$(echo "{input.txt_files}" | wc -w) + processed_files=0 + failed_files=0 max_consecutive_failures={config[max_consecutive_failures]} + consecutive_failures=0 for file in {input.txt_files}; do + echo "Processing $file" + if $APPTAINER_CMD Models --cpu-count {threads} --apply "$file"; then - echo "Successfully processed $file" + echo "$file" >> {params.processed_files_log} + processed_files=$((processed_files + 1)) consecutive_failures=0 else echo "Failed to process $file" + echo "$file" >> {params.failed_files_log} mv "$file" FAILED_CONFIG/ - ((consecutive_failures++)) + failed_files=$((failed_files + 1)) + consecutive_failures=$((consecutive_failures + 1)) if [ $consecutive_failures -ge $max_consecutive_failures ]; then - echo "Error: $max_consecutive_failures consecutive failures reached. Terminating job." >&2 + echo "Error: $max_consecutive_failures consecutive failures reached." >&2 exit 1 fi fi done - touch {output} + echo "Total files: $total_files" + echo "Processed files: $processed_files" + echo "Failed files: $failed_files" + + if [ $failed_files -eq 0 ]; then + touch {output} + else + exit 1 + fi """ rule create_logdir: diff --git a/08-snakemake/run_snakefile.sh b/08-snakemake/run_snakefile.sh index efbba31..fe42b6b 100755 --- a/08-snakemake/run_snakefile.sh +++ b/08-snakemake/run_snakefile.sh @@ -1,44 +1,79 @@ #!/bin/bash -# Set the number of jobs for parallel processing of APSIM files +# Exit on error +set -e + +# Configuration APSIM_JOBS=100 +MAX_WORKFLOW_RETRIES=3 +RETRY_DELAY=60 # seconds + +# Logging function +log_message() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" +} + +# Function to run a Snakefile with comprehensive retry and recovery logic +run_snakefile() { + local snakefile=$1 + local jobs=$2 + local retry_count=0 + local success=false + + while [ $retry_count -lt $MAX_WORKFLOW_RETRIES ] && [ "$success" = false ]; do + log_message "Attempting to run $snakefile (Attempt $((retry_count + 1)))" + + # Run Snakemake with full verbosity and rerun-incomplete + if snakemake -s "$snakefile" --profile slurm --jobs "$jobs" --rerun-incomplete -p; then + success=true + log_message "$snakefile completed successfully" + else + ((retry_count++)) + + # Check for specific failure conditions + if [ -f failed_txt_files.log ]; then + failed_count=$(wc -l < failed_txt_files.log) + log_message "Number of failed files: $failed_count" + + # Optional: Add logic to handle specific failure scenarios + if [ "$failed_count" -gt 0 ]; then + log_message "Examining failed files in FAILED_CONFIG" + ls -l FAILED_CONFIG/ + fi + fi + + if [ $retry_count -lt $MAX_WORKFLOW_RETRIES ]; then + log_message "Workflow failed. Waiting $RETRY_DELAY seconds before retry..." + sleep $RETRY_DELAY + else + log_message "Workflow failed after $MAX_WORKFLOW_RETRIES attempts" + return 1 + fi + fi + done -# Run Snakefile_txt -echo "Processing text files..." -snakemake -s Snakefile_1 --profile slurm --jobs 1 - -# Check if the previous command was successful -if [ $? -eq 0 ]; then - echo "Text file processing completed successfully." - - # Run Snakefile_apsimx - echo "Processing APSIM files..." - snakemake -s Snakefile_2 --profile slurm --jobs $APSIM_JOBS - - if [ $? -eq 0 ]; then - echo "APSIM file processing completed successfully." - else - echo "Error: APSIM file processing failed." + return 0 +} + +# Main workflow execution +main() { + # Process Snakefile_1 + if ! run_snakefile "Snakefile_1" 1; then + log_message "Error: Text file processing failed" + exit 1 + fi + + # Process Snakefile_2 + if ! run_snakefile "Snakefile_2" "$APSIM_JOBS"; then + log_message "Error: APSIM file processing failed" exit 1 fi -else - echo "Error: Text file processing failed." - exit 1 -fi - -echo "All processing completed." - -# Check if FAILED_CONFIG is empty and FAILED_DB has at most one file -if [ -z "$(ls -A FAILED_CONFIG 2>/dev/null)" ] && [ $(ls -1 FAILED_DB 2>/dev/null | wc -l) -le 1 ]; then - echo "FAILED_CONFIG is empty and FAILED_DB has at most one file. Cleaning up files..." - - # Delete files with .processed and .apsimx extensions - find . -maxdepth 1 -type f \( -name "*.processed" -o -name "*.apsimx" -o -name "*.txt" -o -name "*.met" \) -delete - - # Delete txt_files_processed and db_files_sorted files - rm -f txt_files_processed db_files_sorted - - echo "Cleanup completed." -else - echo "FAILED_CONFIG is not empty or FAILED_DB has more than one file. Skipping cleanup." -fi + + log_message "Workflow completed successfully" +} + +# Trap for handling interrupts +trap 'log_message "Workflow interrupted"' INT TERM + +# Execute main workflow +main