
Commit 0d574b4

piecewise linear policy, refactor classification-based policies
1 parent 48f9263 commit 0d574b4

File tree: 5 files changed (+263, -79 lines)


hopes/fun_utils.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+import numpy as np
+
+
+def piecewise_linear(x, left_cp, right_cp, slope, y0, y1) -> np.ndarray:
+    r"""Define a piecewise linear function with 3 segments, such as:
+
+    y0 --- \ left_cp
+            \ slope
+             \ right_cp
+              \ --- y1
+
+    Note: the slope is not necessarily negative; the second segment can be increasing or decreasing.
+
+    :param x: the input variable.
+    :param left_cp: the left change point.
+    :param right_cp: the right change point.
+    :param slope: the slope of the linear segment.
+    :param y0: the base value of the left segment.
+    :param y1: the base value of the right segment.
+    """
+    # define the conditions for each segment
+    conditions = [x < left_cp, (x >= left_cp) & (x <= right_cp), x > right_cp]
+    # first segment is flat until left_cp
+    # second segment is linear between left_cp and right_cp
+    # third segment is flat after right_cp
+    funcs = [
+        lambda _: y0,
+        lambda v: slope * (v - left_cp) + y0,
+        lambda _: y1,
+    ]
+    return np.piecewise(x, conditions, funcs)
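For reference, a quick sketch of how piecewise_linear behaves on a small array. The numbers are illustrative only, not taken from the repository's tests:

import numpy as np

from hopes.fun_utils import piecewise_linear

# hypothetical outdoor-air-reset shape: flat at 45 below x=10, flat at 30 above x=20,
# linearly decreasing with slope -1.5 in between
x = np.array([0.0, 10.0, 15.0, 20.0, 30.0])
y = piecewise_linear(x, left_cp=10.0, right_cp=20.0, slope=-1.5, y0=45.0, y1=30.0)
# y == [45.0, 45.0, 37.5, 30.0, 30.0]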

hopes/policy/policies.py

Lines changed: 157 additions & 13 deletions
@@ -2,9 +2,12 @@

 import numpy as np
 import requests
+import torch
+from scipy import optimize
 from sklearn.linear_model import LogisticRegression

 from hopes.dev_utils import override
+from hopes.fun_utils import piecewise_linear


 class Policy(ABC):
@@ -31,6 +34,16 @@ def compute_action_probs(self, obs: np.ndarray) -> np.ndarray:
         action_probs = np.exp(log_likelihoods)
         return action_probs

+    def select_action(self, obs: np.ndarray) -> np.ndarray:
+        """Select actions under the policy for given observations.
+
+        :param obs: the observation(s) for which to select an action, shape (batch_size,
+            obs_dim).
+        :return: the selected action(s).
+        """
+        action_probs = self.compute_action_probs(obs)
+        return np.array([np.random.choice(len(probs), p=probs) for probs in action_probs])
+

 class RandomPolicy(Policy):
     """A random policy that selects actions uniformly at random."""
@@ -46,32 +59,163 @@ def log_likelihoods(self, obs: np.ndarray) -> np.ndarray:
         return np.log(action_probs)


-class RegressionBasedPolicy(Policy):
-    """A policy that uses a regression model to predict the log-likelihoods of actions given
-    observations."""
+class ClassificationBasedPolicy(Policy):
+    """A policy that uses a classification model to predict the log-likelihoods of actions given
+    observations.
+
+    In the absence of an actual control policy, this can be used to train a policy on a dataset
+    of (obs, act) pairs that would have been collected offline.
+    """

     def __init__(
-        self, obs: np.ndarray, act: np.ndarray, regression_model: str = "logistic"
+        self,
+        obs: np.ndarray,
+        act: np.ndarray,
+        classification_model: str = "logistic",
+        model_params: dict | None = None,
     ) -> None:
         """
-        :param obs: the observations for training the regression model, shape: (batch_size, obs_dim).
-        :param act: the actions for training the regression model, shape: (batch_size,).
-        :param regression_model: the type of regression model to use. For now, only logistic is supported.
+        :param obs: the observations for training the classification model, shape: (batch_size, obs_dim).
+        :param act: the actions for training the classification model, shape: (batch_size,).
+        :param classification_model: the type of classification model to use. For now, only logistic and mlp are supported.
+        :param model_params: optional parameters for the classification model.
         """
-        assert regression_model in ["logistic"], "Only logistic regression is supported for now."
+        supported_models = ["logistic", "mlp"]
+        assert (
+            classification_model in supported_models
+        ), f"Only {supported_models} supported for now."
         assert obs.ndim == 2, "Observations must have shape (batch_size, obs_dim)."
         assert obs.shape[0] == act.shape[0], "Number of observations and actions must match."

-        self.model_x = obs
-        self.model_y = act
-        self.model = LogisticRegression()
+        self.model_obs = obs
+        self.model_act = act
+        self.num_actions = len(np.unique(act))
+        self.classification_model = classification_model
+        self.model_params = model_params or {}
+
+        if self.classification_model == "logistic":
+            self.model = LogisticRegression()
+
+        elif self.classification_model == "mlp":
+            hidden_size = self.model_params.get("hidden_size", 64)
+            activation = self.model_params.get("activation", "relu")
+            act_cls = torch.nn.ReLU if activation == "relu" else torch.nn.Tanh
+            self.model = torch.nn.Sequential(
+                torch.nn.Linear(self.model_obs.shape[1], hidden_size),
+                act_cls(),
+                torch.nn.Linear(hidden_size, hidden_size),
+                act_cls(),
+                torch.nn.Linear(hidden_size, self.num_actions),
+            )

     def fit(self):
-        self.model.fit(self.model_x, self.model_y)
+        if self.classification_model == "mlp":
+            criterion = torch.nn.CrossEntropyLoss()
+            optimizer = torch.optim.Adam(
+                self.model.parameters(), lr=self.model_params.get("lr", 0.01)
+            )
+
+            for epoch in range(self.model_params.get("num_epochs", 1000)):
+                optimizer.zero_grad()
+                output = self.model(torch.tensor(self.model_obs, dtype=torch.float32))
+                loss = criterion(
+                    output, torch.tensor(self.model_act, dtype=torch.float32).view(-1).long()
+                )
+                loss.backward()
+                optimizer.step()
+                # print(f"Epoch {epoch}, Loss: {loss.item()}")
+
+        else:
+            self.model.fit(self.model_obs, self.model_act)
+
+    @override(Policy)
+    def log_likelihoods(self, obs: np.ndarray) -> np.ndarray:
+        if self.classification_model == "mlp":
+            with torch.no_grad():
+                output = self.model(torch.Tensor(obs))
+                return torch.log_softmax(output, dim=1).numpy()
+        else:
+            return self.model.predict_log_proba(obs)
+
+
+class PiecewiseLinearPolicy(Policy):
+    """A piecewise linear policy that selects actions based on a set of linear segments defined by
+    thresholds and slopes.
+
+    This can be used to estimate a probability distribution over actions drawn from a BMS
+    reset rule, for instance an outdoor air reset that is a function of outdoor air
+    temperature and is bounded by a minimum and maximum on both axes. It can also be
+    helpful to model a simple schedule, where the action is a function of time.
+    """
+
+    def __init__(
+        self,
+        obs: np.ndarray,
+        act: np.ndarray,
+        actions_bins: list[float | int] | None = None,
+    ):
+        """
+        :param obs: the observations for training the piecewise linear model, shape: (batch_size, obs_dim).
+        :param act: the actions for training the piecewise linear model, shape: (batch_size,).
+        :param actions_bins: the bins for discretizing the action space. If not provided, we assume the action space
+            is already discretized.
+        """
+        assert (
+            len(obs.shape) == 1 or obs.shape[1] == 1
+        ), "Piecewise linear policy only supports 1D observations."
+        assert obs.shape[0] == act.shape[0], "Number of observations and actions must match."
+
+        self.model_obs = obs.squeeze() if obs.ndim == 2 else obs
+        self.model_act = act.squeeze() if act.ndim == 2 else act
+        self.model_params = None
+
+        # discretize the action space
+        self.actions_bins = actions_bins if actions_bins else np.unique(self.model_act)
+        self.num_actions = len(self.actions_bins)
+
+    def fit(self):
+        # estimate bounds for the change points and segment levels from the input data
+        left_cp_bound_percentile = 30
+        right_cp_bound_percentile = 70
+        left_cp, right_cp = np.percentile(
+            self.model_act, (left_cp_bound_percentile, right_cp_bound_percentile)
+        )
+        left_cp_min = left_cp
+        left_cp_max = right_cp
+        right_cp_min = left_cp
+        right_cp_max = right_cp
+        y0_min = np.min(self.model_act)
+        y0_max = np.max(self.model_act)
+        y1_min = np.min(self.model_act)
+        y1_max = np.max(self.model_act)
+        slope_min = -np.inf
+        slope_max = np.inf
+
+        output = optimize.curve_fit(
+            piecewise_linear,
+            self.model_obs,
+            self.model_act,
+            bounds=(
+                [left_cp_min, right_cp_min, slope_min, y0_min, y1_min],
+                [left_cp_max, right_cp_max, slope_max, y0_max, y1_max],
+            ),
+        )
+        self.model_params, error = output  # noqa
+        # print(f"Model params: {self.model_params}")
+        # print(f"Error: {error}")

     @override(Policy)
     def log_likelihoods(self, obs: np.ndarray) -> np.ndarray:
-        return self.model.predict_log_proba(obs)
+        raw_actions = piecewise_linear(obs, *self.model_params)
+        # bin each raw action to the nearest value in the discretized action space
+        actions = [min(self.actions_bins, key=lambda x: abs(x - ra)) for ra in raw_actions]
+        # return the log-likelihoods
+        return np.array(
+            [
+                [np.log(1.0) if a == action else np.log(1e-6) for a in self.actions_bins]
+                for action in actions
+            ]
+        )


 class HttpPolicy(Policy):
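A hedged usage sketch of the refactored ClassificationBasedPolicy and the new PiecewiseLinearPolicy on synthetic data; the constructor arguments come from the diff above, while the data, bins, and hyperparameters are made up for illustration:

import numpy as np

from hopes.policy.policies import ClassificationBasedPolicy, PiecewiseLinearPolicy

rng = np.random.default_rng(0)

# behaviour-cloning style: fit a classifier on offline (obs, act) pairs
obs = rng.normal(size=(500, 4))
act = (obs[:, 0] > 0).astype(int)  # two discrete actions, derived from the first feature
clf_policy = ClassificationBasedPolicy(
    obs=obs,
    act=act,
    classification_model="mlp",
    model_params={"hidden_size": 32, "num_epochs": 200},
)
clf_policy.fit()
probs = clf_policy.compute_action_probs(obs[:5])  # shape (5, 2)

# piecewise linear policy, e.g. a setpoint that is a function of outdoor air temperature
oat = rng.uniform(0.0, 30.0, size=1000)                    # 1D observations
setpoint = np.clip(45.0 - 1.5 * (oat - 10.0), 30.0, 45.0)  # synthetic reset rule
pwl_policy = PiecewiseLinearPolicy(
    obs=oat,
    act=setpoint,
    actions_bins=list(np.arange(30.0, 46.0, 1.0)),
)
pwl_policy.fit()
log_probs = pwl_policy.log_likelihoods(oat[:5])  # shape (5, len(actions_bins))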

hopes/rew/rewards.py

Lines changed: 19 additions & 24 deletions
@@ -43,22 +43,20 @@ def __init__(
         obs: np.ndarray,
         act: np.ndarray,
         rew: np.ndarray,
-        reward_model: str = "linear",
+        regression_model: str = "linear",
         model_params: dict | None = None,
     ) -> None:
         """
         :param obs: the observations for training the reward model, shape: (batch_size, obs_dim).
         :param act: the actions for training the reward model, shape: (batch_size,).
         :param rew: the rewards for training the reward model, shape: (batch_size,).
-        :param reward_model: the type of reward model to use. For now, only linear, polynomial and mlp are supported.
+        :param regression_model: the type of reward model to use. For now, only linear, polynomial and mlp are supported.
         :param model_params: optional parameters for the reward model.
         """
-        if model_params is None:
-            model_params = {}
         supported_reward_models = ["linear", "polynomial", "mlp"]

         assert (
-            reward_model in supported_reward_models
+            regression_model in supported_reward_models
         ), f"Only {supported_reward_models} supported for now."
         assert obs.ndim == 2, "Observations must have shape (batch_size, obs_dim)."
         assert (
@@ -68,18 +66,18 @@ def __init__(
         self.obs = obs
         self.act = act.reshape(-1, 1) if act.ndim == 1 else act
         self.rew = rew.reshape(-1, 1) if rew.ndim == 1 else rew
-        self.model_params = model_params
-        self.reward_model = reward_model
+        self.model_params = model_params or {}
+        self.regression_model = regression_model
         self.poly_features = None

         # both linear and polynomial models are implemented using sklearn LinearRegression
         # for polynomial model, we use PolynomialFeatures to generate polynomial features then fit the linear model
-        if self.reward_model == "linear" or self.reward_model == "polynomial":
+        if self.regression_model == "linear" or self.regression_model == "polynomial":
             self.model = LinearRegression()

         # mlp model is implemented using torch. We use a simple feedforward neural network and MSE loss.
         # configuration is basic for now, but can be extended in the future
-        elif self.reward_model == "mlp":
+        elif self.regression_model == "mlp":
             hidden_size = model_params.get("hidden_size", 64)
             activation = model_params.get("activation", "relu")
             act_cls = torch.nn.ReLU if activation == "relu" else torch.nn.Tanh
@@ -93,8 +91,10 @@ def fit(self) -> None:
         """Fit the reward model to the training data."""
         model_in = np.concatenate((self.obs, self.act), axis=1)

-        if self.reward_model == "mlp":
-            optimizer = torch.optim.Adam(self.model.parameters())
+        if self.regression_model == "mlp":
+            optimizer = torch.optim.Adam(
+                self.model.parameters(), lr=self.model_params.get("lr", 0.01)
+            )
             criterion = torch.nn.MSELoss()
             for _ in range(self.model_params.get("num_epochs", 1000)):
                 optimizer.zero_grad()
@@ -103,12 +103,12 @@ def fit(self) -> None:
                 loss.backward()
                 optimizer.step()

-        elif self.reward_model == "polynomial":
+        elif self.regression_model == "polynomial":
             self.poly_features = PolynomialFeatures(degree=self.model_params.get("degree", 2))
             self.model.fit(self.poly_features.fit_transform(model_in), self.rew)

-        elif isinstance(self.model, LinearRegression):
-            self.model.fit(np.concatenate((self.obs, self.act), axis=1), self.rew)
+        elif self.regression_model == "linear":
+            self.model.fit(model_in, self.rew)

@@ -121,17 +121,12 @@ def estimate(self, obs: np.ndarray, act: np.ndarray) -> np.ndarray:
         if act.ndim == 1:
             act = act.reshape(-1, 1)

-        if isinstance(self.model, torch.nn.Module):
+        inputs = np.concatenate((obs, act), axis=1)
+
+        if self.regression_model == "mlp":
             with torch.no_grad():
-                return (
-                    self.model(
-                        torch.tensor(np.concatenate((obs, act), axis=1), dtype=torch.float32)
-                    )
-                    .numpy()
-                    .flatten()
-                )
+                return self.model(torch.tensor(inputs, dtype=torch.float32)).numpy().flatten()
         else:
-            inputs = np.concatenate((obs, act), axis=1)
-            if self.reward_model == "polynomial":
+            if self.regression_model == "polynomial":
                 inputs = self.poly_features.transform(inputs)
             return np.squeeze(self.model.predict(inputs))