Deep workflow #48

Open · wants to merge 4 commits into base: main
3 changes: 3 additions & 0 deletions .gitignore
@@ -151,3 +151,6 @@ _html/

# Parsl log files
run_logs/

# Emacs
*~
262 changes: 235 additions & 27 deletions src/kbmod_wf/resource_configs/klone_configuration.py
@@ -18,93 +18,301 @@ def klone_resource_config():
app_cache=True,
checkpoint_mode="task_exit",
checkpoint_files=get_all_checkpoints(
os.path.join("/gscratch/dirac/kbmod/workflow/run_logs", datetime.date.today().isoformat())
os.path.join(os.path.abspath(os.curdir), datetime.date.today().isoformat())
),
run_dir=os.path.join("/gscratch/dirac/kbmod/workflow/run_logs", datetime.date.today().isoformat()),
run_dir=os.path.join(os.path.abspath(os.curdir), datetime.date.today().isoformat()),
retries=1,
executors=[
####################
# Resample resources
####################
HighThroughputExecutor(
label="small_cpu",
label="astro_96gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
partition="compute-bigmem",
account="astro",
min_blocks=0,
max_blocks=4, # Low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
mem_per_node=96, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
HighThroughputExecutor(
label="astro_48gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="compute-bigmem",
account="astro",
min_blocks=0,
max_blocks=4,
max_blocks=4, # Low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=1, # perhaps should be 8???
mem_per_node=256, # In GB
mem_per_node=48, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["compute_bigmem"],
# Command to run before starting worker - i.e. conda activate <special_env>
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
HighThroughputExecutor(
label="large_mem",
label="esci_96gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
partition="gpu-a40",
account="escience",
min_blocks=0,
max_blocks=4, # low block count for shared resources
init_blocks=0,
parallelism=1,
nodes_per_block=1,
mem_per_node=96, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
HighThroughputExecutor(
label="esci_48gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="gpu-a40",
account="escience",
min_blocks=0,
max_blocks=4, # low block count for shared resources
init_blocks=0,
parallelism=1,
nodes_per_block=1,
mem_per_node=48, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
HighThroughputExecutor(
label="ckpt_96gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-all",
account="astro",
min_blocks=0,
max_blocks=2,
max_blocks=50, # scale to the size of the GPU blocks, big number for low memory
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=32,
mem_per_node=512,
mem_per_node=96, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["large_mem"],
# Command to run before starting worker - i.e. conda activate <special_env>
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
HighThroughputExecutor(
label="sharded_reproject",
label="ckpt_48gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
partition="ckpt-all",
account="astro",
min_blocks=0,
max_blocks=2,
max_blocks=50, # scale to the size of the GPU blocks, big number for low memory
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=32,
mem_per_node=128, # ~2-4 GB per core
mem_per_node=48, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
####################
# Search resources
####################
HighThroughputExecutor(
label="esci_96gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="gpu-a40",
account="escience",
min_blocks=0,
max_blocks=4, # low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=96, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),
HighThroughputExecutor(
label="esci_48gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="gpu-a40",
account="escience",
min_blocks=0,
max_blocks=4, # low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=48, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),
HighThroughputExecutor(
label="esci_32gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="gpu-a40",
account="escience",
min_blocks=0,
max_blocks=4, # low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=32, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),
HighThroughputExecutor(
label="ckpt_96gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
account="escience",
min_blocks=0,
max_blocks=50, # 20 for 96, 50 for 48
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=96, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),
HighThroughputExecutor(
label="ckpt_48gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
account="escience",
min_blocks=0,
max_blocks=50, # 20 for 96, 50 for 48
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=48, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),
HighThroughputExecutor(
label="gpu",
label="ckpt_32gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
account="escience",
min_blocks=0,
max_blocks=2,
max_blocks=50, # 20 for 96, 50 for 48
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=512, # In GB
mem_per_node=32, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),

####################
# Analysis resource
####################
HighThroughputExecutor(
label="local_thread",
provider=LocalProvider(
label="astro_4gb_2cpus",
max_workers=1, # Do we mean max_workers_per_node here?
provider=SlurmProvider(
partition="compute-bigmem", # ckpt-all
account="astro", # astro
min_blocks=0,
max_blocks=12, # low block count for shared resource
init_blocks=0,
max_blocks=1,
parallelism=1,
nodes_per_block=1,
mem_per_node=4,
cores_per_node=2,
exclusive=False,
walltime=walltimes["sharded_reproject"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
),
),
HighThroughputExecutor(
label="esci_4gb_2cpus",
max_workers=1, # Do we mean max_workers_per_node here?
provider=SlurmProvider(
partition="gpu-a40", # ckpt-all
account="escience", # astro
min_blocks=0,
max_blocks=12, # low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
mem_per_node=4,
cores_per_node=2,
exclusive=False,
walltime=walltimes["sharded_reproject"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
),
),
HighThroughputExecutor(
label="ckpt_4gb_2cpus",
max_workers=1, # Do we mean max_workers_per_node here?
provider=SlurmProvider(
partition="ckpt-all", # ckpt-all
account="astro", # astro
min_blocks=0,
max_blocks=100, # can leave large at all times
init_blocks=0,
parallelism=1,
nodes_per_block=1,
mem_per_node=4,
cores_per_node=2,
exclusive=False,
walltime=walltimes["sharded_reproject"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
),
),
],
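Note on the path change near the top of this diff: replacing the hard-coded /gscratch/dirac/kbmod/workflow/run_logs prefix with os.path.abspath(os.curdir) means the run directory and checkpoint files are now created under whatever directory the workflow is launched from. A minimal sketch of how the new expressions resolve (the launch directory and date below are illustrative only):

import datetime
import os

# Resolves to the directory the workflow was launched from,
# e.g. /gscratch/dirac/<user>/my_run
base = os.path.abspath(os.curdir)

# Same expression the config now uses for run_dir and checkpoint_files,
# e.g. /gscratch/dirac/<user>/my_run/2024-07-01
run_dir = os.path.join(base, datetime.date.today().isoformat())
print(run_dir)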
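For context on how the finer-grained executor labels added here would be consumed, below is a hedged sketch (not part of this PR) of Parsl apps pinning a resample step and a GPU search step to two of the labels defined above. The import path follows the file's location in this repo, and the step names resample_shard / search_shard are hypothetical; the real workflow wiring lives elsewhere in kbmod_wf.

import parsl
from parsl import python_app

# Assumes the package is importable as kbmod_wf.* (the file lives at
# src/kbmod_wf/resource_configs/klone_configuration.py in this repo).
from kbmod_wf.resource_configs.klone_configuration import klone_resource_config


@python_app(executors=["astro_96gb_8cpus"])
def resample_shard(shard_path):
    # Hypothetical CPU-bound reproject/resample step pinned to a bigmem executor.
    return shard_path


@python_app(executors=["ckpt_48gb_2cpu_1gpu"])
def search_shard(shard_path):
    # Hypothetical GPU search step pinned to a checkpoint-partition GPU executor.
    return shard_path


if __name__ == "__main__":
    parsl.load(klone_resource_config())
    try:
        # Parsl resolves the AppFuture from resample_shard before running search_shard.
        future = search_shard(resample_shard("shard_000.fits"))
        print(future.result())
    finally:
        parsl.dfk().cleanup()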