Skip to content

Commit

Permalink
Merge pull request #90 from nesi/dini-dev
Browse files Browse the repository at this point in the history
update run_snakefile.sh and Snakefile_1 for better restarts
  • Loading branch information
Ollehar authored Nov 26, 2024
2 parents 6e3237e + 5306114 commit dd36d6c
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 45 deletions.
35 changes: 28 additions & 7 deletions 08-snakemake/Snakefile_1
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ config = {
"apptainer_bind": "/agr/scratch,/agr/persist",
"apptainer_image": "/agr/persist/projects/2024_apsim_improvements/apsim-simulations/container/apsim-2024.09.7579.0.aimg",
"excluded_txt_files": ["ExampleConfig.txt"],
"max_consecutive_failures": 10,
"max_consecutive_failures": 3,
"slurm_logdir": "slurmlogs"
}

Expand All @@ -22,7 +22,9 @@ rule process_txt_files:
output:
"txt_files_processed"
params:
logfile = lambda wildcards, output: f"{config['slurm_logdir']}/txt_files_%j.out"
logfile = lambda wildcards, output: f"{config['slurm_logdir']}/txt_files_%j.out",
processed_files_log = "processed_txt_files.log",
failed_files_log = "failed_txt_files.log"
resources:
mem_mb = 8000,
time = "16:00:00"
Expand All @@ -36,26 +38,45 @@ rule process_txt_files:
export APPTAINER_CMD="apptainer exec {config[apptainer_image]}"

mkdir -p FAILED_CONFIG
consecutive_failures=0
rm -f {params.processed_files_log} {params.failed_files_log}
touch {params.processed_files_log} {params.failed_files_log}

total_files=$(echo "{input.txt_files}" | wc -w)
processed_files=0
failed_files=0
max_consecutive_failures={config[max_consecutive_failures]}
consecutive_failures=0

for file in {input.txt_files}; do
echo "Processing $file"

if $APPTAINER_CMD Models --cpu-count {threads} --apply "$file"; then
echo "Successfully processed $file"
echo "$file" >> {params.processed_files_log}
processed_files=$((processed_files + 1))
consecutive_failures=0
else
echo "Failed to process $file"
echo "$file" >> {params.failed_files_log}
mv "$file" FAILED_CONFIG/
((consecutive_failures++))
failed_files=$((failed_files + 1))
consecutive_failures=$((consecutive_failures + 1))

if [ $consecutive_failures -ge $max_consecutive_failures ]; then
echo "Error: $max_consecutive_failures consecutive failures reached. Terminating job." >&2
echo "Error: $max_consecutive_failures consecutive failures reached." >&2
exit 1
fi
fi
done

touch {output}
echo "Total files: $total_files"
echo "Processed files: $processed_files"
echo "Failed files: $failed_files"

if [ $failed_files -eq 0 ]; then
touch {output}
else
exit 1
fi
"""

rule create_logdir:
Expand Down
111 changes: 73 additions & 38 deletions 08-snakemake/run_snakefile.sh
Original file line number Diff line number Diff line change
@@ -1,44 +1,79 @@
#!/bin/bash
#
# Driver for the two-stage Snakemake workflow: process configuration text
# files (Snakefile_1), then run APSIM simulations (Snakefile_2), with
# per-stage retry logic and post-run cleanup.

# Fail fast: abort on errors, on unset variables, and on failures anywhere
# in a pipeline (plain `set -e` misses the last two).
set -euo pipefail

# Configuration
readonly APSIM_JOBS=100           # parallel jobs for APSIM file processing
readonly MAX_WORKFLOW_RETRIES=3   # attempts per Snakefile before giving up
readonly RETRY_DELAY=60           # seconds to wait between retries

# Print a timestamped log line to stdout.
# Arguments: $1 - message text
log_message() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1"
}

#######################################
# Run a Snakefile with retry and recovery logic.
# Globals:   MAX_WORKFLOW_RETRIES, RETRY_DELAY (read)
# Arguments: $1 - path to the Snakefile
#            $2 - number of parallel jobs passed to snakemake
# Outputs:   progress/diagnostic messages via log_message
# Returns:   0 on success, 1 after MAX_WORKFLOW_RETRIES failed attempts
#######################################
run_snakefile() {
    local snakefile=$1
    local jobs=$2
    local retry_count=0
    local failed_count

    while [ "$retry_count" -lt "$MAX_WORKFLOW_RETRIES" ]; do
        log_message "Attempting to run $snakefile (Attempt $((retry_count + 1)))"

        # --rerun-incomplete lets a restarted workflow resume interrupted jobs.
        if snakemake -s "$snakefile" --profile slurm --jobs "$jobs" --rerun-incomplete -p; then
            log_message "$snakefile completed successfully"
            return 0
        fi

        # Use plain arithmetic assignment, not ((retry_count++)): the latter
        # returns status 1 when the pre-increment value is 0, which kills the
        # script under `set -e` when this function runs outside a condition.
        retry_count=$((retry_count + 1))

        # Surface per-file failure details recorded by the Snakefile rule.
        if [ -f failed_txt_files.log ]; then
            failed_count=$(wc -l < failed_txt_files.log)
            log_message "Number of failed files: $failed_count"

            if [ "$failed_count" -gt 0 ]; then
                log_message "Examining failed files in FAILED_CONFIG"
                ls -l FAILED_CONFIG/
            fi
        fi

        if [ "$retry_count" -lt "$MAX_WORKFLOW_RETRIES" ]; then
            log_message "Workflow failed. Waiting $RETRY_DELAY seconds before retry..."
            sleep "$RETRY_DELAY"
        fi
    done

    log_message "Workflow failed after $MAX_WORKFLOW_RETRIES attempts"
    return 1
}

#######################################
# Orchestrate the full workflow: text-file processing (Snakefile_1, serial),
# APSIM runs (Snakefile_2, parallel), then cleanup of intermediates when the
# failure directories are (nearly) empty.
# Globals:   APSIM_JOBS (read)
# Arguments: none
# Returns:   exits 1 if either stage fails after retries
#######################################
main() {
    # Stage 1: process configuration text files, one job at a time.
    if ! run_snakefile "Snakefile_1" 1; then
        log_message "Error: Text file processing failed"
        exit 1
    fi

    # Stage 2: run APSIM simulations in parallel.
    if ! run_snakefile "Snakefile_2" "$APSIM_JOBS"; then
        log_message "Error: APSIM file processing failed"
        exit 1
    fi

    log_message "All processing completed."

    # Clean up only when FAILED_CONFIG is empty and FAILED_DB holds at most
    # one file; otherwise keep all intermediates for inspection.
    # Count FAILED_DB entries with find rather than parsing `ls` output.
    if [ -z "$(ls -A FAILED_CONFIG 2>/dev/null)" ] \
        && [ "$(find FAILED_DB -mindepth 1 -maxdepth 1 2>/dev/null | wc -l)" -le 1 ]; then
        log_message "FAILED_CONFIG is empty and FAILED_DB has at most one file. Cleaning up files..."

        # Delete generated intermediates in the working directory.
        find . -maxdepth 1 -type f \( -name "*.processed" -o -name "*.apsimx" -o -name "*.txt" -o -name "*.met" \) -delete

        # Delete the Snakemake sentinel files so the next run starts fresh.
        rm -f txt_files_processed db_files_sorted

        log_message "Cleanup completed."
    else
        log_message "FAILED_CONFIG is not empty or FAILED_DB has more than one file. Skipping cleanup."
    fi

    log_message "Workflow completed successfully"
}

# Log interrupts (Ctrl-C / SIGTERM) before the shell exits.
trap 'log_message "Workflow interrupted"' INT TERM

# Hand control to the main workflow.
main "$@"

0 comments on commit dd36d6c

Please sign in to comment.