import os, sys

import torch

# Disable cuDNN so PyTorch falls back to its native kernels (kept from the
# original setup).
torch.backends.cudnn.enabled = False

# DataLoader moved to torch_geometric.loader in torch_geometric 2.0;
# fall back to the old import path on older releases.
try:
    from torch_geometric.loader import DataLoader
except ImportError:
    from torch_geometric.data import DataLoader

import pandas as pd
import subprocess
import re

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.width = None
# Run configuration, expected to be exported by the launching job script
NNODES = int(os.environ["NNODES"])
NTOTGPUS = int(os.environ["NTOTGPUS"])
NNODES_PER_TRIAL = int(os.environ["NNODES_PER_TRIAL"])
NGPUS_PER_TRIAL = int(os.environ["NGPUS_PER_TRIAL"])
NUM_CONCURRENT_TRIALS = int(os.environ["NUM_CONCURRENT_TRIALS"])
NTOT_DEEPHYPER_RANKS = int(os.environ["NTOT_DEEPHYPER_RANKS"])
OMP_NUM_THREADS = int(os.environ["OMP_NUM_THREADS"])
DEEPHYPER_LOG_DIR = os.environ["DEEPHYPER_LOG_DIR"]
DEEPHYPER_DB_HOST = os.environ["DEEPHYPER_DB_HOST"]
SLURM_JOB_ID = os.environ["SLURM_JOB_ID"]


def _parse_results(stdout):
    """Return the last 'Val Loss' value found in a captured stdout byte
    string, or "F" (DeepHyper's marker for a failed objective)."""
    pattern = r"Val Loss: ([-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?)"
    matches = re.findall(pattern, stdout.decode())
    if matches:
        return matches[-1][0]
    return "F"


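# DeepHyper's `queued` evaluator wrapper (used below) pops
# `queue_pop_per_task` node names from the shared queue for each trial and
# passes them to `run` as `dequed`; the first dequed node serves as the
# master address for the distributed training run.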
def run(trial, dequed=None):
    """Train one HydraGNN configuration on this trial's nodes and return the
    negated validation loss as the DeepHyper objective (or "F" on failure)."""
    f = open(f"output-{SLURM_JOB_ID}-{trial.id}.txt", "w")
    python_exe = sys.executable
    python_script = os.path.join(os.path.dirname(__file__), "gfm.py")

    # Launch the training script with `srun` on the nodes dequed for this trial
    params = trial.parameters
    log_name = "gfm" + "_" + str(trial.id)
    master_addr = f"HYDRAGNN_MASTER_ADDR={dequed[0]}"
    nodelist = ",".join(dequed)
    # Reference invocation:
    #   time srun -u -n32 -c2 --ntasks-per-node=8 --gpus-per-node=8 --gpu-bind=closest
    prefix = " ".join(
        [
            "srun",
            f"-N {NNODES_PER_TRIAL} -n {NGPUS_PER_TRIAL}",
            "--ntasks-per-node=4 --gpus-per-node=4",
            f"--cpus-per-task {OMP_NUM_THREADS} --threads-per-core 1 --cpu-bind threads",
            "--gpus-per-task=1",
            f"--export=ALL,{master_addr},HYDRAGNN_MAX_NUM_BATCH=100,HYDRAGNN_USE_VARIABLE_GRAPH_SIZE=1,HYDRAGNN_AGGR_BACKEND=mpi",
            f"--nodelist={nodelist}",
            f"--output {DEEPHYPER_LOG_DIR}/output_{SLURM_JOB_ID}_{trial.id}.txt",
            f"--error {DEEPHYPER_LOG_DIR}/error_{SLURM_JOB_ID}_{trial.id}.txt",
        ]
    )
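    # `--export=ALL,...` forwards the full environment to the training
    # processes plus the HydraGNN-specific variables set above; stdout and
    # stderr land in per-trial files under DEEPHYPER_LOG_DIR.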

    command = " ".join(
        [
            prefix,
            python_exe,
            "-u",
            python_script,
            f"--model_type={params['model_type']}",
            f"--hidden_dim={params['hidden_dim']}",
            f"--num_conv_layers={params['num_conv_layers']}",
            f"--num_headlayers={params['num_headlayers']}",
            f"--dim_headlayers={params['dim_headlayers']}",
            "--multi",
            "--ddstore",
            # Full dataset list:
            # f'--multi_model_list="ANI1x,MPTrj,OC2020-2M,OC2022,qm7x"',
            # Reduced list for debugging:
            f'--multi_model_list="ANI1x,MPTrj,qm7x"',
            "--num_samples=3200",
            "--num_epoch=5",
            f"--log={log_name}",
        ]
    )
    print("Command = ", command, flush=True, file=f)

    output = "F"  # DeepHyper's marker for a failed objective
    try:
        # check_output raises CalledProcessError on a non-zero exit code,
        # so a crashed training run is reported to DeepHyper as "F".
        subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
        # HydraGNN reports "Val Loss: ..." on stderr; keep the last occurrence.
        pattern = r"Val Loss: ([-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?)"
        with open(
            f"{DEEPHYPER_LOG_DIR}/error_{SLURM_JOB_ID}_{trial.id}.txt", "r"
        ) as fout:
            for line in fout:
                matches = re.findall(pattern, line)
                if matches:
                    # Negate: DeepHyper maximizes the objective, so the
                    # smallest validation loss wins.
                    output = -float(matches[-1][0])

    except Exception as excp:
        print(excp, flush=True, file=f)
        output = "F"

    print("Output:", output, flush=True, file=f)
    objective = output
    print(objective, flush=True, file=f)
    metadata = {"some_info": "some_value"}
    f.close()

    return {"objective": objective, "metadata": metadata}


if __name__ == "__main__":

    log_name = f"gfm-{SLURM_JOB_ID}"

    # DeepHyper components: evaluator, search space, and Bayesian optimizer
    from deephyper.evaluator import ProcessPoolEvaluator, queued
    from deephyper.problem import HpProblem
    from deephyper.search.hps import CBO
    from hydragnn.utils.deephyper import read_node_list

    # Define the optimization problem
    problem = HpProblem()

    # Define the search space for hyperparameters
    problem.add_hyperparameter((2, 6), "num_conv_layers")  # discrete parameter
    problem.add_hyperparameter((100, 2000), "hidden_dim")  # discrete parameter
    problem.add_hyperparameter((1, 3), "num_headlayers")  # discrete parameter
    problem.add_hyperparameter((100, 1000), "dim_headlayers")  # discrete parameter
    problem.add_hyperparameter(
        ["EGNN", "SchNet", "PNA"], "model_type"
    )  # categorical parameter
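    # In DeepHyper's HpProblem, an integer tuple (low, high) defines an
    # inclusive discrete range and a list defines a categorical choice.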

    # Create the node queue shared by the concurrent trials
    queue, _ = read_node_list()
    print("The queue:", queue, len(queue))
    print("NNODES_PER_TRIAL", NNODES_PER_TRIAL)
    print("NUM_CONCURRENT_TRIALS", NUM_CONCURRENT_TRIALS)
    print("NGPUS_PER_TRIAL", NGPUS_PER_TRIAL)
    print("NTOTGPUS", NTOTGPUS)
    print(NTOTGPUS, NGPUS_PER_TRIAL, NTOTGPUS // NGPUS_PER_TRIAL, len(queue))
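    # Sanity check: NTOTGPUS // NGPUS_PER_TRIAL bounds how many trials can run
    # at once, and the queue must hold at least
    # NUM_CONCURRENT_TRIALS * NNODES_PER_TRIAL nodes to keep all workers busy.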

    # Define the evaluator that distributes trials over the node queue
    evaluator = queued(ProcessPoolEvaluator)(
        run,
        num_workers=NUM_CONCURRENT_TRIALS,
        queue=queue,
        queue_pop_per_task=NNODES_PER_TRIAL,  # nodes dequed per trial
    )

    # Configure the Bayesian optimization search
    # Minimal alternative: CBO(problem, evaluator, random_state=42, log_dir=log_name)
    search = CBO(
        problem,
        evaluator,
        acq_func="UCB",
        multi_point_strategy="cl_min",  # constant-liar strategy
        random_state=42,
        # Location where to store the results
        log_dir=log_name,
        # Number of threads used to update surrogate model of BO
        n_jobs=OMP_NUM_THREADS,
    )
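    # "cl_min" (constant liar) lets CBO propose new points while earlier
    # trials are still running by temporarily imputing their objectives with a
    # constant value (here, the running minimum).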

    # Run the search; `results` is returned as a pandas DataFrame, printed in
    # full thanks to the display options set at the top of the script.
    timeout = None
    results = search.search(max_evals=100, timeout=timeout)
    print(results)

    sys.exit(0)