Skip to content

Commit

Permalink
Merge pull request #247 from broadinstitute/no-race
Browse files Browse the repository at this point in the history
Mitigate EmptyDataError race condition
  • Loading branch information
mayasheth authored Jan 31, 2025
2 parents 1c7ae9c + 96c3966 commit 6e4bc92
Showing 1 changed file with 30 additions and 2 deletions.
32 changes: 30 additions & 2 deletions workflow/rules/utils.smk
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
import pandas as pd
import time
import random
from pandas.errors import EmptyDataError

class InvalidConfig(Exception):
    """Exception type for reporting an invalid workflow configuration."""

Expand Down Expand Up @@ -77,9 +82,32 @@ def determine_filtered_prediction_file_format(threshold, config):
other_flags = ''
return FILTERED_PREDICTION_FILE_FORMAT_TEMPLATE.format(threshold=threshold, separator=separator, other_flags=other_flags)

def enable_retry(func, func_args=None, max_attempts=3, delay=0.5, jitter=0.5, retry_on=Exception):
    """
    Call func(**func_args), retrying after a randomized delay whenever it raises.

    To prevent EmptyDataError race condition when using SLURM to launch jobs as processes
    Assuming the EmptyDataError is caused by a file caching or synchronization lag
    Retry with delay
    @Param
    func: Function to retry
    func_args: Dictionary of kwargs for function (None means "no kwargs")
    max_attempts: Maximum number of attempts allowable before raising error (must be >= 1)
    delay: minimum delay (seconds) before retry
    jitter: upper bound (seconds) of the random extra wait added on top of delay
    retry_on: exception class, or tuple of classes, that triggers a retry;
              anything else propagates immediately. Defaults to Exception.
    @Return
    The value returned by the first successful call to func
    @Raise
    ValueError: if max_attempts is smaller than 1
    The exception raised by func on the last allowed attempt
    """
    if max_attempts < 1:
        # Without this guard the loop body would never run and the caller
        # would silently receive None instead of func's result.
        raise ValueError(f"max_attempts must be >= 1, got {max_attempts}")

    # A None default avoids sharing one mutable dict across calls.
    kwargs = {} if func_args is None else func_args

    for attempt in range(1, max_attempts + 1):
        try:
            return func(**kwargs)
        except retry_on:
            if attempt == max_attempts:
                # Out of attempts: surface the original error with its traceback.
                raise
            # Randomized wait so concurrently launched jobs do not retry in lockstep.
            time.sleep(delay + random.uniform(0, jitter))

def load_biosamples_config(config):
biosamples_config = pd.read_csv(
config["biosamplesTable"], sep="\t", na_values=""
biosamples_config = enable_retry(
pd.read_csv,
func_args={'filepath_or_buffer': config["biosamplesTable"], 'sep': "\t"}
).replace([np.nan], [None]).set_index("biosample", drop=False)
biosamples_config["HiC_resolution"] = biosamples_config["HiC_resolution"].replace([None], [0]).astype(int)
_validate_biosamples_config(biosamples_config)
Expand Down

0 comments on commit 6e4bc92

Please sign in to comment.