Commit 28ce7ff, committed by Christine Simpson on Oct 30, 2024 (parent: 62bb7d3).
Showing 6 changed files with 328 additions and 23 deletions.
Changes to the repository's ignore file:

@@ -1,2 +1,3 @@
 *~
-*.png
+*.png
+polaris_tutorial
Changes to the Globus Compute on Polaris instructions in workflows/globus_compute:
@@ -22,7 +22,7 @@ On Polaris, you will need to create a python virtual environment or a conda envi
 
 For the workshop, you can use the workshop python virtual environment:
 ```bash
-source /eagle/fallwkshp23/workflows/env/bin/activate
+source /grand/alcf_training/workflows_2024/_env/bin/activate
 ```
 
 To create your own environment:
@@ -60,7 +60,7 @@ git clone git@github.com:argonne-lcf/ALCF_Hands_on_HPC_Workshop.git
 cd ALCF_Hands_on_HPC_Workshop/workflows/globus_compute
 
 # If you haven't already, activate the environment
-source /eagle/fallwkshp23/workflows/env/bin/activate
+source /grand/alcf_training/workflows_2024/_env/bin/activate
 ```
 
 Use the sample config [polaris_config.yaml](polaris_config.yaml) provided to configure and start your endpoint. The sample config has similar features to the Parsl config and looks like this:
@@ -70,16 +70,17 @@ engine:
     type: GlobusComputeEngine
 
-    available_accelerators: 4 # Assign one worker per GPU
-    cpu_affinity: block-reverse # Assigns cpus in reverse sequential order
-    prefetch_capacity: 0 # Increase if you have many more tasks than workers
-
-    address:
-        type: address_by_interface
-        ifname: bond0
+    max_workers_per_node: 4
+
+    cpu_affinity: "list:24-31,56-63:16-23,48-55:8-15,40-47:0-7,32-39"
+
+    prefetch_capacity: 0 # Increase if you have many more tasks than workers
+    max_retries_on_system_failure: 2
 
-    strategy:
-        type: SimpleStrategy
+    strategy: simple
+    job_status_kwargs:
+        max_idletime: 300
+        strategy_period: 60
 
     provider:
         type: PBSProProvider
@@ -90,18 +91,18 @@ engine:
             bind_cmd: --cpu-bind
             overrides: --ppn 1
 
-        account: fallwkshp23
-        queue: fallws23single
+        account: alcf_training
+        queue: HandsOnHPC
         cpus_per_node: 64
         select_options: ngpus=4
 
         # e.g., "#PBS -l filesystems=home:grand:eagle\n#PBS -k doe"
-        scheduler_options: "#PBS -l filesystems=home:eagle"
+        scheduler_options: "#PBS -l filesystems=home:eagle:grand"
 
         # Node setup: activate necessary conda environment and such
-        worker_init: "source /eagle/fallwkshp23/workflows/env/bin/activate; module load PrgEnv-nvhpc; cd $HOME/.globus_compute/workshop-endpoint"
+        worker_init: "source /grand/alcf_training/workflows_2024/_env/bin/activate; module load PrgEnv-nvhpc; cd $HOME/.globus_compute/workshop-endpoint"
 
-        walltime: 00:05:00
+        walltime: 00:30:00
         nodes_per_block: 1
         init_blocks: 0
         min_blocks: 0
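Once an endpoint has been created from a config like the one above and started, its status can be checked from Python. The sketch below is not part of the commit; it assumes the Globus Compute SDK is installed in the active environment and uses a placeholder endpoint UUID.

```python
# Hedged sketch (not from the commit): query the status of a running
# Globus Compute endpoint created from the config shown above.
from globus_compute_sdk import Client

endpoint_id = "00000000-0000-0000-0000-000000000000"  # placeholder; use your endpoint's UUID

client = Client()
status = client.get_endpoint_status(endpoint_id)
print(status)  # e.g. a dict whose "status" field reports whether the endpoint is online
```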
@@ -180,7 +181,7 @@ def hello_affinity(run_directory):
     os.chdir(os.path.expandvars(run_directory))
     # This is the command that calls the compiled executable
-    command = f"/eagle/fallwkshp23/workflows/affinity_gpu/hello_affinity"
+    command = f"/grand/alcf_training/workflows_2024/GettingStarted/Examples/Polaris/affinity_gpu/hello_affinity"
     # This runs the application command
     res = subprocess.run(command.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
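For orientation, here is a hedged sketch (not part of the commit) of how a function like hello_affinity is typically submitted through the Globus Compute SDK once the endpoint above is online. The endpoint UUID and the run directory are placeholders, and the function body is a condensed restatement of the one changed in this diff.

```python
# Hedged sketch (not from the commit): submit hello_affinity to the endpoint.
from globus_compute_sdk import Executor

endpoint_id = "00000000-0000-0000-0000-000000000000"  # placeholder; use your endpoint's UUID

def hello_affinity(run_directory):
    # Condensed restatement of the function changed in this commit:
    # run the compiled hello_affinity binary from run_directory.
    import os, subprocess
    os.chdir(os.path.expandvars(run_directory))
    command = "/grand/alcf_training/workflows_2024/GettingStarted/Examples/Polaris/affinity_gpu/hello_affinity"
    res = subprocess.run(command.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return res.stdout.decode()

with Executor(endpoint_id=endpoint_id) as gce:
    future = gce.submit(hello_affinity, "$HOME")  # run directory chosen arbitrarily for the example
    print(future.result())
```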
A new file: a Parsl example that runs the MPI hello_affinity application on Polaris with the MPIExecutor:
@@ -0,0 +1,61 @@
+import parsl
+import os
+from parsl.config import Config
+from parsl import bash_app
+# PBSPro is the right provider for polaris:
+from parsl.providers import PBSProProvider
+# The MPIExecutor is for running MPI applications:
+from parsl.executors import MPIExecutor
+# Use the Simple launcher
+from parsl.launchers import SimpleLauncher
+
+# We will save outputs in the current working directory
+working_directory = os.getcwd()
+
+config = Config(
+    executors=[
+        MPIExecutor(
+            max_workers_per_block=2,  # Assuming 2 nodes per task
+            provider=PBSProProvider(
+                account="alcf_training",
+                worker_init=f"""source /grand/alcf_training/workflows_2024/_env/bin/activate; \
+cd {working_directory}""",
+                walltime="1:00:00",
+                queue="debug-scaling",
+                scheduler_options="#PBS -l filesystems=home:eagle:grand",
+                launcher=SimpleLauncher(),
+                select_options="ngpus=4",
+                nodes_per_block=4,
+                max_blocks=1,
+                cpus_per_node=64,
+            ),
+        ),
+    ]
+)
+
+resource_specification = {
+    'num_nodes': 2,        # Number of nodes required for the application instance
+    'ranks_per_node': 4,   # Number of ranks / application elements to be launched per node
+    'num_ranks': 8,        # Number of ranks in total
+}
+
+@bash_app
+def mpi_hello_affinity(parsl_resource_specification, depth=8, stdout='mpi_hello.stdout', stderr='mpi_hello.stderr'):
+    # PARSL_MPI_PREFIX will resolve to `mpiexec -n 8 -ppn 4 -hosts NODE001,NODE002`
+    APP_DIR = "/grand/alcf_training/workflows_2024/GettingStarted/Examples/Polaris/affinity_gpu"
+    return f"$PARSL_MPI_PREFIX --cpu-bind depth --depth={depth} \
+        {APP_DIR}/set_affinity_gpu_polaris.sh {APP_DIR}/hello_affinity"
+
+with parsl.load(config):
+    tasks = []
+    for i in range(4):
+        tasks.append(mpi_hello_affinity(parsl_resource_specification=resource_specification,
+                                        stdout=f"{working_directory}/mpi_output/hello_{i}.stdout",
+                                        stderr=f"{working_directory}/mpi_output/hello_{i}.stderr"))
+
+    # Wait on futures to return, and print results
+    for i, t in enumerate(tasks):
+        t.result()
+        with open(f"{working_directory}/mpi_output/hello_{i}.stdout", "r") as f:
+            print(f"Stdout of task {i}:")
+            print(f.read())
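As a usage note (not part of the commit): the resource_specification dict is what determines the mpiexec line that $PARSL_MPI_PREFIX expands to. Below is a hedged sketch of a single-node variant, reusing the config and the mpi_hello_affinity app defined in the new script above; the spec values and output paths are illustrative assumptions.

```python
# Hedged sketch (not from the commit): a single-node resource specification for
# the same mpi_hello_affinity app. With num_nodes=1 and ranks_per_node=4,
# $PARSL_MPI_PREFIX would request 4 ranks on a single node; the example
# config's max_workers_per_block=2 still caps how many tasks run concurrently per block.
single_node_spec = {
    'num_nodes': 1,       # one node per application instance
    'ranks_per_node': 4,  # one rank per GPU on a Polaris node
    'num_ranks': 4,       # total ranks for this instance
}

# Assumes the config and mpi_hello_affinity definitions from the script above.
task = mpi_hello_affinity(parsl_resource_specification=single_node_spec,
                          stdout="mpi_output/hello_single.stdout",
                          stderr="mpi_output/hello_single.stderr")
task.result()
```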