-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #90 from nesi/dini-dev
update run_snakefile.sh and Snakefile_1 for better restarts
- Loading branch information
Showing
2 changed files
with
101 additions
and
45 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,44 +1,79 @@ | ||
#!/bin/bash | ||
|
||
# Set the number of jobs for parallel processing of APSIM files | ||
# Exit on error | ||
set -e | ||
|
||
# Configuration | ||
APSIM_JOBS=100 | ||
MAX_WORKFLOW_RETRIES=3 | ||
RETRY_DELAY=60 # seconds | ||
|
||
# Logging function | ||
log_message() { | ||
echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | ||
} | ||
|
||
# Function to run a Snakefile with comprehensive retry and recovery logic | ||
run_snakefile() { | ||
local snakefile=$1 | ||
local jobs=$2 | ||
local retry_count=0 | ||
local success=false | ||
|
||
while [ $retry_count -lt $MAX_WORKFLOW_RETRIES ] && [ "$success" = false ]; do | ||
log_message "Attempting to run $snakefile (Attempt $((retry_count + 1)))" | ||
|
||
# Run Snakemake with full verbosity and rerun-incomplete | ||
if snakemake -s "$snakefile" --profile slurm --jobs "$jobs" --rerun-incomplete -p; then | ||
success=true | ||
log_message "$snakefile completed successfully" | ||
else | ||
((retry_count++)) | ||
|
||
# Check for specific failure conditions | ||
if [ -f failed_txt_files.log ]; then | ||
failed_count=$(wc -l < failed_txt_files.log) | ||
log_message "Number of failed files: $failed_count" | ||
|
||
# Optional: Add logic to handle specific failure scenarios | ||
if [ "$failed_count" -gt 0 ]; then | ||
log_message "Examining failed files in FAILED_CONFIG" | ||
ls -l FAILED_CONFIG/ | ||
fi | ||
fi | ||
|
||
if [ $retry_count -lt $MAX_WORKFLOW_RETRIES ]; then | ||
log_message "Workflow failed. Waiting $RETRY_DELAY seconds before retry..." | ||
sleep $RETRY_DELAY | ||
else | ||
log_message "Workflow failed after $MAX_WORKFLOW_RETRIES attempts" | ||
return 1 | ||
fi | ||
fi | ||
done | ||
|
||
# Run Snakefile_txt | ||
echo "Processing text files..." | ||
snakemake -s Snakefile_1 --profile slurm --jobs 1 | ||
|
||
# Check if the previous command was successful | ||
if [ $? -eq 0 ]; then | ||
echo "Text file processing completed successfully." | ||
|
||
# Run Snakefile_apsimx | ||
echo "Processing APSIM files..." | ||
snakemake -s Snakefile_2 --profile slurm --jobs $APSIM_JOBS | ||
|
||
if [ $? -eq 0 ]; then | ||
echo "APSIM file processing completed successfully." | ||
else | ||
echo "Error: APSIM file processing failed." | ||
return 0 | ||
} | ||
|
||
# Main workflow execution | ||
main() { | ||
# Process Snakefile_1 | ||
if ! run_snakefile "Snakefile_1" 1; then | ||
log_message "Error: Text file processing failed" | ||
exit 1 | ||
fi | ||
|
||
# Process Snakefile_2 | ||
if ! run_snakefile "Snakefile_2" "$APSIM_JOBS"; then | ||
log_message "Error: APSIM file processing failed" | ||
exit 1 | ||
fi | ||
else | ||
echo "Error: Text file processing failed." | ||
exit 1 | ||
fi | ||
|
||
echo "All processing completed." | ||
|
||
# Check if FAILED_CONFIG is empty and FAILED_DB has at most one file | ||
if [ -z "$(ls -A FAILED_CONFIG 2>/dev/null)" ] && [ $(ls -1 FAILED_DB 2>/dev/null | wc -l) -le 1 ]; then | ||
echo "FAILED_CONFIG is empty and FAILED_DB has at most one file. Cleaning up files..." | ||
|
||
# Delete files with .processed and .apsimx extensions | ||
find . -maxdepth 1 -type f \( -name "*.processed" -o -name "*.apsimx" -o -name "*.txt" -o -name "*.met" \) -delete | ||
|
||
# Delete txt_files_processed and db_files_sorted files | ||
rm -f txt_files_processed db_files_sorted | ||
|
||
echo "Cleanup completed." | ||
else | ||
echo "FAILED_CONFIG is not empty or FAILED_DB has more than one file. Skipping cleanup." | ||
fi | ||
|
||
log_message "Workflow completed successfully" | ||
} | ||
|
||
# Trap for handling interrupts | ||
trap 'log_message "Workflow interrupted"' INT TERM | ||
|
||
# Execute main workflow | ||
main |