Commit 16bbdfd

jychoi-hpc (Jong Choi) authored
Updating HPO for Perlmutter (ORNL#239)
* perlmutter
* perlmutter
* add perlmutter
* update on perlmutter
* add perlmutter hpo
* black
* add negative loss for deephyper

---------

Co-authored-by: Jong Choi <[email protected]>
Co-authored-by: Jong Choi <[email protected]>
Co-authored-by: Jong Choi <[email protected]>
1 parent 1402ead commit 16bbdfd

File tree

6 files changed: +295 additions, -11 deletions

examples/multidataset_hpo/gfm.py

Lines changed: 13 additions & 0 deletions

@@ -60,6 +60,12 @@ def main():
     parser.add_argument(
         "--multi_model_list", help="multidataset list", default="OC2020"
     )
+    parser.add_argument(
+        "--num_samples",
+        type=int,
+        help="set num samples per process for weak-scaling test",
+        default=None,
+    )
 
     group = parser.add_mutually_exclusive_group()
     group.add_argument(

@@ -304,6 +310,13 @@ def main():
     ## Set local set
     for dataset in [trainset, valset, testset]:
         rx = list(nsplit(range(len(dataset)), local_comm_size))[local_comm_rank]
+        if args.num_samples is not None:
+            if args.num_samples > len(rx):
+                log(
+                    f"WARN: requested samples are larger than what is available. Use only {len(rx)}."
+                )
+            rx = rx[: args.num_samples]
+
         dataset.setkeys(common_variable_names)
         dataset.setsubset(rx[0], rx[-1] + 1, preload=True)
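For context on the new flag: each rank first receives an equal contiguous slice of the dataset via nsplit, and --num_samples then truncates that slice, which fixes the per-process workload for weak-scaling tests. A minimal runnable sketch of that behavior; the nsplit helper here is a stand-in for the one gfm.py imports, and all sizes are invented:

# Illustrative sketch of the --num_samples cap; nsplit here is a stand-in
# for the helper gfm.py imports, and the dataset size is invented.
import numpy as np


def nsplit(seq, n):
    # Split a sequence into n roughly equal contiguous chunks.
    return [list(chunk) for chunk in np.array_split(list(seq), n)]


local_comm_size, local_comm_rank = 4, 1
num_samples = 3200  # per-process cap, as passed by the HPO driver below

rx = nsplit(range(100_000), local_comm_size)[local_comm_rank]
if num_samples is not None:
    if num_samples > len(rx):
        print(f"WARN: requested samples are larger than what is available. Use only {len(rx)}.")
    rx = rx[:num_samples]

print(len(rx))  # -> 3200: every rank trains on the same number of samples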

examples/multidataset_hpo/gfm_deephyper_multi_perlmutter.py

Lines changed: 177 additions & 0 deletions
import os, sys

import torch

torch.backends.cudnn.enabled = False

# deprecated in torch_geometric 2.0
try:
    from torch_geometric.loader import DataLoader
except:
    from torch_geometric.data import DataLoader

import pandas as pd
import subprocess
import re

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = None

# Retrieve constants
NNODES = int(os.environ["NNODES"])
NTOTGPUS = int(os.environ["NTOTGPUS"])
NNODES_PER_TRIAL = int(os.environ["NNODES_PER_TRIAL"])
NGPUS_PER_TRIAL = int(os.environ["NGPUS_PER_TRIAL"])
NUM_CONCURRENT_TRIALS = int(os.environ["NUM_CONCURRENT_TRIALS"])
NTOT_DEEPHYPER_RANKS = int(os.environ["NTOT_DEEPHYPER_RANKS"])
OMP_NUM_THREADS = int(os.environ["OMP_NUM_THREADS"])
DEEPHYPER_LOG_DIR = os.environ["DEEPHYPER_LOG_DIR"]
DEEPHYPER_DB_HOST = os.environ["DEEPHYPER_DB_HOST"]
SLURM_JOB_ID = os.environ["SLURM_JOB_ID"]


def _parse_results(stdout):
    pattern = r"Val Loss: ([-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?)"
    matches = re.findall(pattern, stdout.decode())
    if matches:
        return matches[-1][0]
    else:
        return "F"


def run(trial, dequed=None):
    f = open(f"output-{SLURM_JOB_ID}-{trial.id}.txt", "w")
    python_exe = sys.executable
    python_script = os.path.join(os.path.dirname(__file__), "gfm.py")

    # TODO: Launch a subprocess with `srun` to train neural networks
    params = trial.parameters
    log_name = "gfm" + "_" + str(trial.id)
    master_addr = f"HYDRAGNN_MASTER_ADDR={dequed[0]}"
    nodelist = ",".join(dequed)

    # time srun -u -n32 -c2 --ntasks-per-node=8 --gpus-per-node=8 --gpu-bind=closest
    prefix = " ".join(
        [
            f"srun",
            f"-N {NNODES_PER_TRIAL} -n {NGPUS_PER_TRIAL}",
            f"--ntasks-per-node=4 --gpus-per-node=4",
            f"--cpus-per-task {OMP_NUM_THREADS} --threads-per-core 1 --cpu-bind threads",
            f"--gpus-per-task=1",
            f"--export=ALL,{master_addr},HYDRAGNN_MAX_NUM_BATCH=100,HYDRAGNN_USE_VARIABLE_GRAPH_SIZE=1,HYDRAGNN_AGGR_BACKEND=mpi",
            f"--nodelist={nodelist}",
            f"--output {DEEPHYPER_LOG_DIR}/output_{SLURM_JOB_ID}_{trial.id}.txt",
            f"--error {DEEPHYPER_LOG_DIR}/error_{SLURM_JOB_ID}_{trial.id}.txt",
        ]
    )

    command = " ".join(
        [
            prefix,
            python_exe,
            "-u",
            python_script,
            f"--model_type={trial.parameters['model_type']}",
            f"--hidden_dim={trial.parameters['hidden_dim']}",
            f"--num_conv_layers={trial.parameters['num_conv_layers']}",
            f"--num_headlayers={trial.parameters['num_headlayers']}",
            f"--dim_headlayers={trial.parameters['dim_headlayers']}",
            f"--multi",
            f"--ddstore",
            # f'--multi_model_list="ANI1x,MPTrj,OC2020-2M,OC2022,qm7x"',
            ## debugging
            f'--multi_model_list="ANI1x,MPTrj,qm7x"',
            f"--num_samples=3200",
            f"--num_epoch=5",
            f"--log={log_name}",
        ]
    )
    print("Command = ", command, flush=True, file=f)

    result = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
    output = "F"
    try:
        pattern = r"Val Loss: ([-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?)"
        fout = open(f"{DEEPHYPER_LOG_DIR}/error_{SLURM_JOB_ID}_{trial.id}.txt", "r")
        while True:
            line = fout.readline()
            matches = re.findall(pattern, line)
            if matches:
                output = -float(matches[-1][0])
            if not line:
                break
        fout.close()

    except Exception as excp:
        print(excp, flush=True, file=f)
        output = "F"

    print("Output:", output, flush=True, file=f)
    objective = output
    print(objective, flush=True, file=f)
    metadata = {"some_info": "some_value"}
    f.close()

    return {"objective": objective, "metadata": metadata}


if __name__ == "__main__":

    log_name = f"gfm-{SLURM_JOB_ID}"

    # Choose the sampler (e.g., TPESampler or RandomSampler)
    from deephyper.evaluator import Evaluator, ProcessPoolEvaluator, queued
    from deephyper.problem import HpProblem
    from deephyper.search.hps import CBO
    from hydragnn.utils.deephyper import read_node_list

    # define the variable you want to optimize
    problem = HpProblem()

    # Define the search space for hyperparameters
    problem.add_hyperparameter((2, 6), "num_conv_layers")  # discrete parameter
    problem.add_hyperparameter((100, 2000), "hidden_dim")  # discrete parameter
    problem.add_hyperparameter((1, 3), "num_headlayers")  # discrete parameter
    problem.add_hyperparameter((100, 1000), "dim_headlayers")  # discrete parameter
    problem.add_hyperparameter(
        ["EGNN", "SchNet", "PNA"], "model_type"
    )  # categorical parameter

    # Create the node queue
    queue, _ = read_node_list()
    print("The queue:", queue, len(queue))
    print("NNODES_PER_TRIAL", NNODES_PER_TRIAL)
    print("NUM_CONCURRENT_TRIALS", NUM_CONCURRENT_TRIALS)
    print("NGPUS_PER_TRIAL", NGPUS_PER_TRIAL)
    print("NTOTGPUS", NTOTGPUS)
    print(NTOTGPUS, NGPUS_PER_TRIAL, NTOTGPUS // NGPUS_PER_TRIAL, len(queue))

    # define the evaluator to distribute the computation
    evaluator = queued(ProcessPoolEvaluator)(
        run,
        num_workers=NUM_CONCURRENT_TRIALS,
        queue=queue,
        queue_pop_per_task=NNODES_PER_TRIAL,  # Remove the hard-coded value later
    )

    # Define the search method and scalarization
    # search = CBO(problem, parallel_evaluator, random_state=42, log_dir=log_name)
    search = CBO(
        problem,
        evaluator,
        acq_func="UCB",
        multi_point_strategy="cl_min",  # Constant liar strategy
        random_state=42,
        # Location where to store the results
        log_dir=log_name,
        # Number of threads used to update surrogate model of BO
        n_jobs=OMP_NUM_THREADS,
    )

    timeout = None
    results = search.search(max_evals=100, timeout=timeout)
    print(results)

    sys.exit(0)
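This driver implements the "negative loss for deephyper" item from the commit message: DeepHyper's CBO maximizes the returned objective, so run() scans the trial's error log for the last "Val Loss: ..." line and returns its negation (or the string "F" when no loss was parsed, marking a failed trial). A self-contained sketch of that parsing step, using the same regex but an invented log excerpt:

# Sketch of the objective extraction; the log text here is invented.
import re

# Same pattern the driver uses to pull the validation loss out of the logs.
pattern = r"Val Loss: ([-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?)"

fake_log = "epoch 4 ... Val Loss: 1.25e-02 ...\nepoch 5 ... Val Loss: 9.87e-03 ..."
matches = re.findall(pattern, fake_log)

# Keep the most recent match and flip the sign: CBO maximizes the objective,
# so maximizing -val_loss is equivalent to minimizing the validation loss.
objective = -float(matches[-1][0]) if matches else "F"
print(objective)  # -> -0.00987
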
Lines changed: 73 additions & 0 deletions

#!/bin/bash
#SBATCH -A m4452_g
#SBATCH -J HydraGNN
#SBATCH -C gpu
#SBATCH -q regular
#SBATCH -t 0:30:00
#SBATCH -N 64
##SBATCH --ntasks-per-node=4
##SBATCH --gpus-per-task=1
##SBATCH -c 32

set -x

export MIOPEN_DISABLE_CACHE=1
#export HSA_DISABLE_CACHE=1

#export ROCM_HOME=/opt/rocm-5.4.2
# export TRANSFORMERS_OFFLINE=1
# export HF_DATASETS_OFFLINE=1
# export NCCL_DEBUG=INFO
# export settings
#export TORCH_EXTENSIONS_DIR=$PWD/deepspeed
# export HF_HOME=$PWD/hfdata

# setup hostfile
HOSTS=.hosts-job$SLURM_JOB_ID
HOSTFILE=hostfile.txt
srun hostname > $HOSTS
sed 's/$/ slots=4/' $HOSTS > $HOSTFILE

# setup env file
#echo "PATH=$PATH" > .deepspeed_env
#echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> .deepspeed_env
#echo "CPATH=$CPATH" >> .deepspeed_env
#echo "TORCH_EXTENSIONS_DIR=$PWD/deepspeed" >> .deepspeed_env
#echo "HF_HOME=$PWD/hfdata" >> .deepspeed_env
#echo "ROCM_HOME=/opt/rocm-5.4.0" >> .deepspeed_env

# Configuration
export NNODES=$SLURM_JOB_NUM_NODES    # e.g., 100 total nodes
export NNODES_PER_TRIAL=64
export NUM_CONCURRENT_TRIALS=$(( $NNODES / $NNODES_PER_TRIAL ))
export NTOTGPUS=$(( $NNODES * 4 ))    # e.g., 800 total GPUs
export NGPUS_PER_TRIAL=$(( 4 * $NNODES_PER_TRIAL ))    # e.g., 32 GPUs per training
export NTOT_DEEPHYPER_RANKS=$(( $NTOTGPUS / $NGPUS_PER_TRIAL ))    # e.g., 25 total DH ranks
export OMP_NUM_THREADS=4    # e.g., 8 threads per rank
[ $NTOTGPUS -ne $(($NGPUS_PER_TRIAL*$NUM_CONCURRENT_TRIALS)) ] && echo "ERROR!!"

#export CUDA_DEVICE_MAX_CONNECTIONS=1
#export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# DeepHyper variables
export DEEPHYPER_LOG_DIR="deephyper-experiment"-$SLURM_JOB_ID
mkdir -p $DEEPHYPER_LOG_DIR
export DEEPHYPER_DB_HOST=$HOST
# Start Redis server (shared memory between search processes)
# TODO: install Redis and set the `redis.conf` path here
#export REDIS_CONF=...
#pushd $DEEPHYPER_LOG_DIR
#redis-server $REDIS_CONF &
#popd

# Safe sleep to let everything start
sleep 5

echo "Doing something"

# Launch DeepHyper (1 rank per node, NTOT_DEEPHYPER_RANKS <= NNODES here)
# meaning NGPUS_PER_TRAINING >= 8
#$NTOT_DEEPHYPER_RANKS
#srun -n1 python qm9_deephyper_multi.py
export HYDRAGNN_SYSTEM=perlmutter
python gfm_deephyper_multi_perlmutter.py
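Note that the inline "e.g." comments (100 nodes, 800 GPUs, 8 threads) are leftovers from a larger configuration; with the values actually set here the arithmetic works out to a single 256-GPU trial. A quick Python check of the derived quantities, assuming 4 GPUs per Perlmutter GPU node as the script does:

# Sanity check of the derived job configuration for this batch script
# (values mirror the exports above; Perlmutter GPU nodes have 4 GPUs each).
NNODES = 64                                           # SBATCH -N 64
NNODES_PER_TRIAL = 64
NUM_CONCURRENT_TRIALS = NNODES // NNODES_PER_TRIAL    # -> 1
NTOTGPUS = NNODES * 4                                 # -> 256
NGPUS_PER_TRIAL = 4 * NNODES_PER_TRIAL                # -> 256
NTOT_DEEPHYPER_RANKS = NTOTGPUS // NGPUS_PER_TRIAL    # -> 1

# Same consistency check the script performs with its [ ... ] test.
assert NTOTGPUS == NGPUS_PER_TRIAL * NUM_CONCURRENT_TRIALS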

examples/open_catalyst_2020/train.py

Lines changed: 4 additions & 0 deletions

@@ -138,6 +138,7 @@ def get(self, idx):
     parser.add_argument("--shmem", action="store_true", help="shmem")
     parser.add_argument("--log", help="log name")
     parser.add_argument("--batch_size", type=int, help="batch_size", default=None)
+    parser.add_argument("--num_epoch", type=int, help="num_epoch", default=None)
     parser.add_argument("--everyone", action="store_true", help="gptimer")
     parser.add_argument("--modelname", help="model name")

@@ -181,6 +182,9 @@ def get(self, idx):
     if args.batch_size is not None:
         config["NeuralNetwork"]["Training"]["batch_size"] = args.batch_size
 
+    if args.num_epoch is not None:
+        config["NeuralNetwork"]["Training"]["num_epoch"] = args.num_epoch
+
     ##################################################################################################################
     # Always initialize for multi-rank training.
     comm_size, rank = hydragnn.utils.setup_ddp()
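The new --num_epoch flag follows the same override pattern as the existing --batch_size flag: a CLI value, when provided, replaces the corresponding entry in the JSON-derived config, which is what lets the HPO driver run short 5-epoch trials without editing config files. A minimal sketch of the pattern, with an invented config dict:

# Sketch of the CLI-over-config override pattern the diff adds.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--num_epoch", type=int, help="num_epoch", default=None)
args = parser.parse_args(["--num_epoch", "5"])

config = {"NeuralNetwork": {"Training": {"num_epoch": 100, "batch_size": 32}}}
if args.num_epoch is not None:
    config["NeuralNetwork"]["Training"]["num_epoch"] = args.num_epoch

print(config["NeuralNetwork"]["Training"]["num_epoch"])  # -> 5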

hydragnn/train/train_validate_test.py

Lines changed: 3 additions & 0 deletions

@@ -154,6 +154,9 @@ def train_validate_test(
         if epoch == 0:
             tr.reset()
 
+        if int(os.getenv("HYDRAGNN_VALTEST", "1")) == 0:
+            continue
+
         val_loss, val_taskserr = validate(
             val_loader, model, verbosity, reduce_ranks=True
         )
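Because the guard reads HYDRAGNN_VALTEST from the environment, validation and test can be disabled per job without code changes, presumably for pure-throughput timing runs; the HPO driver above must leave it enabled, since it needs a "Val Loss" line in the logs. A tiny sketch of the toggle:

# Sketch of the HYDRAGNN_VALTEST toggle added to the epoch loop.
import os

os.environ["HYDRAGNN_VALTEST"] = "0"  # e.g., set via `export HYDRAGNN_VALTEST=0`

for epoch in range(3):
    # ... training step would run here ...
    if int(os.getenv("HYDRAGNN_VALTEST", "1")) == 0:
        continue  # skip validate/test entirely for this epoch
    print(f"epoch {epoch}: validate() and test() would run")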

hydragnn/utils/deephyper.py

Lines changed: 25 additions & 11 deletions

@@ -13,17 +13,31 @@ def master_from_host(host):
 def read_node_list():
     node_list = os.environ["SLURM_NODELIST"]
     nodes = []
-    node_subsets = node_list[9:-1].split(",")
-    for subset in node_subsets:
-        if "-" in subset:
-            start, end = subset.split("-")
-            start, end = int(start), int(end)
-            for i in range(start, end + 1):
-                leading_zeros = "".join(["0"] * (5 - len(str(i))))
-                nodes.append(f"frontier{leading_zeros}{i}")
-        else:
-            nodes.append(f"frontier{subset}")
-    nodes_string = ",".join(nodes)
+    system = os.getenv("HYDRAGNN_SYSTEM", "frontier")
+    if system == "frontier":
+        node_subsets = node_list[9:-1].split(",")
+        for subset in node_subsets:
+            if "-" in subset:
+                start, end = subset.split("-")
+                start, end = int(start), int(end)
+                for i in range(start, end + 1):
+                    leading_zeros = "".join(["0"] * (5 - len(str(i))))
+                    nodes.append(f"frontier{leading_zeros}{i}")
+            else:
+                nodes.append(f"frontier{subset}")
+        nodes_string = ",".join(nodes)
+    elif system == "perlmutter":
+        node_subsets = node_list[4:-1].split(",")
+        for subset in node_subsets:
+            if "-" in subset:
+                start, end = subset.split("-")
+                start, end = int(start), int(end)
+                for i in range(start, end + 1):
+                    leading_zeros = "".join(["0"] * (6 - len(str(i))))
+                    nodes.append(f"nid{leading_zeros}{i}")
+            else:
+                nodes.append(f"nid{subset}")
+        nodes_string = ",".join(nodes)
     return nodes, nodes_string
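The slice offsets encode each machine's hostlist prefix: [9:-1] strips "frontier[" plus the closing bracket, while [4:-1] strips "nid[" on Perlmutter, whose node IDs are zero-padded to six digits instead of five. A sketch of the expansion on an invented Perlmutter nodelist, with str.zfill standing in for the leading_zeros construction:

# Illustrative only: expand a compact SLURM_NODELIST the way read_node_list does.
node_list = "nid[001234,001236-001238]"  # invented example value

nodes = []
for subset in node_list[4:-1].split(","):  # strip the leading "nid[" and trailing "]"
    if "-" in subset:
        start, end = map(int, subset.split("-"))
        for i in range(start, end + 1):
            nodes.append(f"nid{str(i).zfill(6)}")  # Perlmutter pads to 6 digits
    else:
        nodes.append(f"nid{subset}")

print(nodes)
# -> ['nid001234', 'nid001236', 'nid001237', 'nid001238']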
