Deep workflow #48

Open · wants to merge 4 commits into base: main
3 changes: 3 additions & 0 deletions .gitignore
@@ -151,3 +151,6 @@ _html/

# Parsl log files
run_logs/

# Emacs
*~
262 changes: 235 additions & 27 deletions src/kbmod_wf/resource_configs/klone_configuration.py
@@ -18,93 +18,301 @@ def klone_resource_config():
app_cache=True,
checkpoint_mode="task_exit",
checkpoint_files=get_all_checkpoints(
os.path.join("/gscratch/dirac/kbmod/workflow/run_logs", datetime.date.today().isoformat())
os.path.join(os.path.abspath(os.curdir), datetime.date.today().isoformat())
),
run_dir=os.path.join("/gscratch/dirac/kbmod/workflow/run_logs", datetime.date.today().isoformat()),
run_dir=os.path.join(os.path.abspath(os.curdir), datetime.date.today().isoformat()),
retries=1,
executors=[
####################
# Resample resources
####################
HighThroughputExecutor(
label="small_cpu",
label="astro_96gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
partition="compute-bigmem",
account="astro",
min_blocks=0,
max_blocks=4, # Low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
mem_per_node=96, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
HighThroughputExecutor(
label="astro_48gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="compute-bigmem",
account="astro",
min_blocks=0,
max_blocks=4,
max_blocks=4, # Low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=1, # perhaps should be 8???
mem_per_node=256, # In GB
mem_per_node=48, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["compute_bigmem"],
# Command to run before starting worker - i.e. conda activate <special_env>
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
HighThroughputExecutor(
label="large_mem",
label="esci_96gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
partition="gpu-a40",
account="escience",
min_blocks=0,
max_blocks=4, # low block count for shared resources
init_blocks=0,
parallelism=1,
nodes_per_block=1,
mem_per_node=96, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
HighThroughputExecutor(
label="esci_48gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="gpu-a40",
account="escience",
min_blocks=0,
max_blocks=4, # low block count for shared resources
init_blocks=0,
parallelism=1,
nodes_per_block=1,
mem_per_node=48, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
HighThroughputExecutor(
label="ckpt_96gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-all",
account="astro",
min_blocks=0,
max_blocks=2,
max_blocks=50, # scale to the size of the GPU blocks, big number for low memory
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=32,
mem_per_node=512,
mem_per_node=96, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["large_mem"],
# Command to run before starting worker - i.e. conda activate <special_env>
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
HighThroughputExecutor(
label="sharded_reproject",
label="ckpt_48gb_8cpus",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
partition="ckpt-all",
account="astro",
min_blocks=0,
max_blocks=2,
max_blocks=50, # scale to the size of the GPU blocks, big number for low memory
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=32,
mem_per_node=128, # ~2-4 GB per core
mem_per_node=48, # 96 GB for >100, 48 for < 100
cores_per_node=8,
exclusive=False,
walltime=walltimes["sharded_reproject"],
worker_init="",
),
),
####################
# Search resources
####################
HighThroughputExecutor(
label="esci_96gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="gpu-a40",
account="escience",
min_blocks=0,
max_blocks=4, # low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=96, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),
HighThroughputExecutor(
label="esci_48gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="gpu-a40",
account="escience",
min_blocks=0,
max_blocks=4, # low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=48, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),
HighThroughputExecutor(
label="esci_32gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="gpu-a40",
account="escience",
min_blocks=0,
max_blocks=4, # low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=32, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),
HighThroughputExecutor(
label="ckpt_96gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
account="escience",
min_blocks=0,
max_blocks=50, # 20 for 96, 50 for 48
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=96, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),
HighThroughputExecutor(
label="ckpt_48gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
account="escience",
min_blocks=0,
max_blocks=50, # 20 for 96, 50 for 48
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=48, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),
HighThroughputExecutor(
label="gpu",
label="ckpt_32gb_2cpu_1gpu",
max_workers=1,
provider=SlurmProvider(
partition="ckpt-g2",
account="escience",
min_blocks=0,
max_blocks=2,
max_blocks=50, # 20 for 96, 50 for 48
init_blocks=0,
parallelism=1,
nodes_per_block=1,
cores_per_node=2, # perhaps should be 8???
mem_per_node=512, # In GB
mem_per_node=32, # 96 GB for >100, 48 for < 100
exclusive=False,
walltime=walltimes["gpu_max"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
scheduler_options="#SBATCH --gpus=1",
),
),

####################
# Analysis resource
####################
HighThroughputExecutor(
label="local_thread",
provider=LocalProvider(
label="astro_4gb_2cpus",
max_workers=1, # Do we mean max_workers_per_node here?
provider=SlurmProvider(
partition="compute-bigmem", # ckpt-all
account="astro", # astro
min_blocks=0,
max_blocks=12, # low block count for shared resource
init_blocks=0,
max_blocks=1,
parallelism=1,
nodes_per_block=1,
mem_per_node=4,
cores_per_node=2,
exclusive=False,
walltime=walltimes["sharded_reproject"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
),
),
HighThroughputExecutor(
label="esci_4gb_2cpus",
max_workers=1, # Do we mean max_workers_per_node here?
provider=SlurmProvider(
partition="gpu-a40", # ckpt-all
account="escience", # astro
min_blocks=0,
max_blocks=12, # low block count for shared resource
init_blocks=0,
parallelism=1,
nodes_per_block=1,
mem_per_node=4,
cores_per_node=2,
exclusive=False,
walltime=walltimes["sharded_reproject"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
),
),
HighThroughputExecutor(
label="ckpt_4gb_2cpus",
max_workers=1, # Do we mean max_workers_per_node here?
provider=SlurmProvider(
partition="ckpt-all", # ckpt-all
account="astro", # astro
min_blocks=0,
max_blocks=100, # can leave large at all times
init_blocks=0,
parallelism=1,
nodes_per_block=1,
mem_per_node=4,
cores_per_node=2,
exclusive=False,
walltime=walltimes["sharded_reproject"],
# Command to run before starting worker - i.e. conda activate <special_env>
worker_init="",
),
),
],
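Note on the path change near the top of this diff: replacing the hard-coded /gscratch/dirac/kbmod/workflow/run_logs prefix with os.path.abspath(os.curdir) means the run directory and checkpoint files are now created under whatever directory the workflow is launched from. A minimal sketch of how the new expressions resolve (the launch directory and date below are illustrative only):

import datetime
import os

# Resolves to the directory the workflow was launched from,
# e.g. /gscratch/dirac/<user>/my_run
base = os.path.abspath(os.curdir)

# Same expression the config now uses for run_dir and checkpoint_files,
# e.g. /gscratch/dirac/<user>/my_run/2024-07-01
run_dir = os.path.join(base, datetime.date.today().isoformat())
print(run_dir)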
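For context on how the finer-grained executor labels added here would be consumed, below is a hedged sketch (not part of this PR) of Parsl apps pinning a resample step and a GPU search step to two of the labels defined above. The import path follows the file's location in this repo, and the step names resample_shard / search_shard are hypothetical; the real workflow wiring lives elsewhere in kbmod_wf.

import parsl
from parsl import python_app

# Assumes the package is importable as kbmod_wf.* (the file lives at
# src/kbmod_wf/resource_configs/klone_configuration.py in this repo).
from kbmod_wf.resource_configs.klone_configuration import klone_resource_config


@python_app(executors=["astro_96gb_8cpus"])
def resample_shard(shard_path):
    # Hypothetical CPU-bound reproject/resample step pinned to a bigmem executor.
    return shard_path


@python_app(executors=["ckpt_48gb_2cpu_1gpu"])
def search_shard(shard_path):
    # Hypothetical GPU search step pinned to a checkpoint-partition GPU executor.
    return shard_path


if __name__ == "__main__":
    parsl.load(klone_resource_config())
    try:
        # Parsl resolves the AppFuture from resample_shard before running search_shard.
        future = search_shard(resample_shard("shard_000.fits"))
        print(future.result())
    finally:
        parsl.dfk().cleanup()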