Ran isort and black. Remove docstring test.
leojklarner committed Dec 4, 2023
1 parent d5d2ee2 commit 2b14ee2
Showing 6 changed files with 87 additions and 66 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yaml
@@ -26,7 +26,7 @@ jobs:
strategy:
fail-fast: true
matrix:
python-version: ["3.8", "3.9", "3.10"]
python-version: ["3.9", "3.10", "3.11"]

steps:
- uses: actions/checkout@v4
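The matrix change above drops Python 3.8 and adds 3.11 to the CI test matrix. A bump like this is typically mirrored by an interpreter floor in the package metadata; the setup.py hunk later in this commit only reorders imports, so the guard sketched here is an assumption rather than something this diff shows:

import sys

# Hypothetical runtime guard matching the new CI matrix (3.9 to 3.11).
# The actual python_requires value is not visible in this diff.
if sys.version_info < (3, 9):
    raise RuntimeError("This package requires Python 3.9 or newer.")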
19 changes: 0 additions & 19 deletions .github/workflows/code-style.yaml
@@ -22,25 +22,6 @@ jobs:
- name: Run black
run: black --check .

interrogate:
name: "Ensure docstring coverage"
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4

- name: Install Python
uses: actions/setup-python@v4
with:
python-version: "3.10"

- name: Install code style checking tools
run: python -m pip install interrogate

- name: Run interrogate
run: interrogate .


isort:
name: "Ensure imports are correctly sorted"
runs-on: ubuntu-latest
5 changes: 2 additions & 3 deletions benchmarks/benchmark_models.py
@@ -2,14 +2,13 @@
GP Model definitions to be used in the benchmarks
"""

from gauche.kernels.fingerprint_kernels.tanimoto_kernel import (
TanimotoKernel,
)
from gpytorch.distributions import MultivariateNormal
from gpytorch.kernels import LinearKernel, ScaleKernel
from gpytorch.means import ConstantMean
from gpytorch.models import ExactGP

from gauche.kernels.fingerprint_kernels.tanimoto_kernel import TanimotoKernel


class TanimotoGP(ExactGP):
def __init__(self, train_x, train_y, likelihood):
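The reshuffle above is isort's standard three-section layout: standard-library imports first, then third-party packages, then first-party code (gauche, presumably treated as first-party by isort's project detection or configuration), with a blank line between sections and names sorted within each. A minimal sketch of the convention, assuming the repository's usual dependencies are installed:

# Standard library
import logging
import random

# Third-party packages
import torch
from gpytorch.means import ConstantMean

# First-party code goes in its own trailing section
from gauche.dataloader import MolPropLoader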
5 changes: 3 additions & 2 deletions benchmarks/run_benchmark.py
@@ -11,8 +11,6 @@
import torch
from benchmark_models import ScalarProductGP, TanimotoGP
from botorch import fit_gpytorch_model
from gauche.dataloader import MolPropLoader
from gauche.dataloader.data_utils import transform_data
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.mlls import ExactMarginalLogLikelihood
from gpytorch_metrics import (
@@ -23,6 +21,9 @@
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

from gauche.dataloader import MolPropLoader
from gauche.dataloader.data_utils import transform_data

# Remove Graphein warnings
logging.getLogger("graphein").setLevel("ERROR")

119 changes: 79 additions & 40 deletions notebooks/pretrained_kernel.py
@@ -1,35 +1,44 @@
import sys, time, warnings, numpy as np, torch, random
sys.path.append('..')
from matplotlib import pyplot as plt
from rdkit.Chem import MolFromSmiles
from sklearn.model_selection import train_test_split
import random
import sys
import time
import warnings

import numpy as np
import torch
from botorch import fit_gpytorch_model
from botorch.acquisition import ExpectedImprovement
from botorch.exceptions import BadInitialCandidatesWarning
from botorch.models.gp_regression import SingleTaskGP

from gpytorch.distributions import MultivariateNormal
from gpytorch.kernels import ScaleKernel, RBFKernel
from gpytorch.kernels import RBFKernel, ScaleKernel
from gpytorch.likelihoods import GaussianLikelihood
from gpytorch.means import ConstantMean
from gpytorch.mlls import ExactMarginalLogLikelihood
from matplotlib import pyplot as plt
from rdkit.Chem import MolFromSmiles
from sklearn.model_selection import train_test_split

from gauche.dataloader import MolPropLoader
from gauche.kernels.gnn_kernels.pretrained_kernel import GNN, mol_to_pyg

sys.path.append("..")





def set_seed(seed):
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
torch.backends.cudnn.deterministic=True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.cuda.manual_seed_all(seed)


# We define our custom GP surrogate model using the RBF kernel
class GP_PretrainedKernel(SingleTaskGP):

def __init__(self, train_X, train_Y):
super().__init__(train_X, train_Y, GaussianLikelihood())
self.mean_module = ConstantMean()
@@ -41,8 +50,10 @@ def forward(self, x):
covar_x = self.covar_module(x)
return MultivariateNormal(mean_x, covar_x)


"""We define helper functions for the Bayesian optimisation loop. In particular the acquisition function optimisation procedure is framed so as to take the maximum over a discrete set of heldout molecules."""


def initialize_model(train_x, train_obj, state_dict=None):
"""
Initialise model and loss function.
@@ -65,7 +76,9 @@ def initialize_model(train_x, train_obj, state_dict=None):
return mll, model


def optimize_acqf_and_get_observation(acq_func, heldout_inputs, heldout_outputs):
def optimize_acqf_and_get_observation(
acq_func, heldout_inputs, heldout_outputs
):
"""
Optimizes the acquisition function, and returns a new candidate and an observation.
@@ -79,7 +92,9 @@ def optimize_acqf_and_get_observation(acq_func, heldout_inputs, heldout_outputs)
# Loop over the discrete set of points to evaluate the acquisition function at.
acq_vals = []
for i in range(len(heldout_outputs)):
acq_vals.append(acq_func(heldout_inputs[i].unsqueeze(-2))) # use unsqueeze to append batch dimension
acq_vals.append(
acq_func(heldout_inputs[i].unsqueeze(-2))
) # use unsqueeze to append batch dimension

# observe new values
acq_vals = torch.tensor(acq_vals)
@@ -88,8 +103,12 @@ def optimize_acqf_and_get_observation(acq_func, heldout_inputs, heldout_outputs)
new_obj = heldout_outputs[best_idx].unsqueeze(-1) # add output dimension

# Delete the selected input and value from the heldout set.
heldout_inputs = torch.cat((heldout_inputs[:best_idx], heldout_inputs[best_idx+1:]), axis=0)
heldout_outputs = torch.cat((heldout_outputs[:best_idx], heldout_outputs[best_idx+1:]), axis=0)
heldout_inputs = torch.cat(
(heldout_inputs[:best_idx], heldout_inputs[best_idx + 1 :]), axis=0
)
heldout_outputs = torch.cat(
(heldout_outputs[:best_idx], heldout_outputs[best_idx + 1 :]), axis=0
)

return new_x, new_obj, heldout_inputs, heldout_outputs

@@ -113,11 +132,16 @@ def update_random_observations(best_random, heldout_inputs, heldout_outputs):
best_random.append(max(best_random[-1], next_random_best))

# Delete the selected input and value from the heldout set.
heldout_inputs = torch.cat((heldout_inputs[:index], heldout_inputs[index+1:]), axis=0)
heldout_outputs = torch.cat((heldout_outputs[:index], heldout_outputs[index+1:]), axis=0)
heldout_inputs = torch.cat(
(heldout_inputs[:index], heldout_inputs[index + 1 :]), axis=0
)
heldout_outputs = torch.cat(
(heldout_outputs[:index], heldout_outputs[index + 1 :]), axis=0
)

return best_random, heldout_inputs, heldout_outputs


"""Run the Bayesian optimisation loop, comparing the analytic (sequential) expected improvement acquisition funciton with a random policy."""

# Bayesian optimisation experiment parameters, number of random trials, split size, batch size
@@ -129,46 +153,48 @@ def update_random_observations(best_random, heldout_inputs, heldout_outputs):
verbose = False
set_seed(12)

warnings.filterwarnings('ignore', category=BadInitialCandidatesWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings("ignore", category=BadInitialCandidatesWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)

best_observed_all_ei, best_random_all = [], []

# Load the Photoswitch dataset
loader = MolPropLoader()
loader.load_benchmark("Photoswitch", "../data/property_prediction/Photoswitch.csv")
loader.load_benchmark(
"Photoswitch", "../data/property_prediction/Photoswitch.csv"
)

# We use the fragprints representations (a concatenation of Morgan fingerprints and RDKit fragment features)
y = loader.labels

# get PyTorch Geometric featurisation of molecules
graphs = [
mol_to_pyg(MolFromSmiles(smiles)) for smiles in loader.features
]
graphs = [mol_to_pyg(MolFromSmiles(smiles)) for smiles in loader.features]

# load pretrained model
with torch.no_grad():
model = GNN(gnn_type='gin')
model.load_pretrained('contextpred', device=torch.device("cpu"))
model = GNN(gnn_type="gin")
model.load_pretrained("contextpred", device=torch.device("cpu"))

X = torch.stack(
[
model(
x=graphs[i].x,
edge_index=graphs[i].edge_index,
edge_attr=graphs[i].edge_attr,
).mean(0) for i in range(len(graphs))
).mean(0)
for i in range(len(graphs))
]
)

# average over multiple random trials (each trial splits the initial training set for the GP in a random manner)
for trial in range(1, N_TRIALS + 1):

print(f"\nTrial {trial:>2} of {N_TRIALS} ", end="")
best_observed_ei, best_random = [], []

# Generate initial training data and initialize model
train_x_ei, heldout_x_ei, train_y_ei, heldout_y_ei = train_test_split(X, y, test_size=holdout_set_size, random_state=trial)
train_x_ei, heldout_x_ei, train_y_ei, heldout_y_ei = train_test_split(
X, y, test_size=holdout_set_size, random_state=trial
)
best_observed_value_ei = torch.tensor(np.max(train_y_ei))

# Convert numpy arrays to PyTorch tensors and flatten the label vectors
@@ -186,27 +212,37 @@ def update_random_observations(best_random, heldout_inputs, heldout_outputs):

# run N_ITERS rounds of BayesOpt after the initial random batch
for iteration in range(1, N_ITERS + 1):

t0 = time.time()

# fit the model
fit_gpytorch_model(mll_ei)

# Use analytic acquisition function for batch size of 1.
EI = ExpectedImprovement(model=model_ei, best_f=(train_y_ei.to(train_y_ei)).max())
EI = ExpectedImprovement(
model=model_ei, best_f=(train_y_ei.to(train_y_ei)).max()
)

new_x_ei, new_obj_ei, heldout_x_ei, heldout_y_ei = optimize_acqf_and_get_observation(EI,
heldout_x_ei,
heldout_y_ei)
(
new_x_ei,
new_obj_ei,
heldout_x_ei,
heldout_y_ei,
) = optimize_acqf_and_get_observation(EI, heldout_x_ei, heldout_y_ei)

# update training points
train_x_ei = torch.cat([train_x_ei, new_x_ei])
train_y_ei = torch.cat([train_y_ei, new_obj_ei])

# update random search progress
best_random, heldout_x_random, heldout_y_random = update_random_observations(best_random,
heldout_inputs=heldout_x_random,
heldout_outputs=heldout_y_random)
(
best_random,
heldout_x_random,
heldout_y_random,
) = update_random_observations(
best_random,
heldout_inputs=heldout_x_random,
heldout_outputs=heldout_y_random,
)
best_value_ei = torch.max(new_obj_ei, best_observed_ei[-1])
best_observed_ei.append(best_value_ei)

Expand All @@ -224,18 +260,21 @@ def update_random_observations(best_random, heldout_inputs, heldout_outputs):
print(
f"\nBatch {iteration:>2}: best_value (random, qEI) = "
f"({max(best_random):>4.2f}, {best_value_ei:>4.2f}), "
f"time = {t1 - t0:>4.2f}.", end=""
f"time = {t1 - t0:>4.2f}.",
end="",
)
else:
print(".", end="")

best_observed_all_ei.append(best_observed_ei)
best_random_all.append(best_random)


# Define a confidence interval function for plotting.
def ci(y):
return 1.96 * y.std(axis=0) / np.sqrt(N_TRIALS)


iters = np.arange(N_ITERS + 1)

y_ei = []
@@ -264,14 +303,14 @@ def ci(y):
lower_ei = y_ei_mean - y_ei_std
upper_ei = y_ei_mean + y_ei_std

plt.plot(iters, y_rnd_mean, label='Random')
plt.plot(iters, y_rnd_mean, label="Random")
plt.fill_between(iters, lower_rnd, upper_rnd, alpha=0.2)
plt.plot(iters, y_ei_mean, label='EI')
plt.plot(iters, y_ei_mean, label="EI")
plt.fill_between(iters, lower_ei, upper_ei, alpha=0.2)
plt.xlabel('Number of Iterations')
plt.ylabel('Best Objective Value')
plt.xlabel("Number of Iterations")
plt.ylabel("Best Objective Value")
plt.legend(loc="lower right")
plt.xticks(list(np.arange(1, 21)))
plt.show()

"""EI outperforms random search in terms of selecting molecules with high E isomer pi-pi* transition wavelength! It should be noted that the true objective for photoswitch optimisation would consider all transition wavelengths as well as the thermal half-life and this will hopefully be included in a future notebook!"""
"""EI outperforms random search in terms of selecting molecules with high E isomer pi-pi* transition wavelength! It should be noted that the true objective for photoswitch optimisation would consider all transition wavelengths as well as the thermal half-life and this will hopefully be included in a future notebook!"""
3 changes: 2 additions & 1 deletion setup.py
@@ -2,11 +2,12 @@
package setup
"""

import codecs
import io
import os
import re
import codecs
from typing import Dict, List

from setuptools import find_packages, setup

HERE = os.path.abspath(os.path.dirname(__file__))
