@@ -2,9 +2,12 @@

import numpy as np
import requests
+ import torch
+ from scipy import optimize
from sklearn.linear_model import LogisticRegression

from hopes.dev_utils import override
+ from hopes.fun_utils import piecewise_linear


class Policy(ABC):
@@ -31,6 +34,16 @@ def compute_action_probs(self, obs: np.ndarray) -> np.ndarray:
        action_probs = np.exp(log_likelihoods)
        return action_probs

+     def select_action(self, obs: np.ndarray) -> np.ndarray:
+         """Select actions under the policy for given observations.
+
+         :param obs: the observation(s) for which to select an action, shape (batch_size, obs_dim).
+         :return: the selected action(s).
+         """
+         action_probs = self.compute_action_probs(obs)
+         return np.array([np.random.choice(len(probs), p=probs) for probs in action_probs])
+
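For context, a minimal sketch of how the new select_action helper would be driven once a concrete policy has been fitted (the training arrays train_obs and train_act are hypothetical placeholders, not part of this diff):

    policy = ClassificationBasedPolicy(obs=train_obs, act=train_act)  # defined later in this diff
    policy.fit()
    batch_obs = np.random.rand(4, train_obs.shape[1])  # batch of 4 observations
    actions = policy.select_action(batch_obs)          # one sampled action index per row, shape (4,)

Since np.random.choice(len(probs), p=probs) samples from each row's distribution, repeated calls on the same observation can return different actions.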

class RandomPolicy(Policy):
    """A random policy that selects actions uniformly at random."""
@@ -46,32 +59,163 @@ def log_likelihoods(self, obs: np.ndarray) -> np.ndarray:
        return np.log(action_probs)


- class RegressionBasedPolicy(Policy):
-     """A policy that uses a regression model to predict the log-likelihoods of actions given
-     observations."""
+ class ClassificationBasedPolicy(Policy):
+     """A policy that uses a classification model to predict the log-likelihoods of actions given
+     observations.
+
+     In the absence of an actual control policy, this can be used to train a policy on a dataset
+     of (obs, act) pairs that would have been collected offline.
+     """

    def __init__(
-         self, obs: np.ndarray, act: np.ndarray, regression_model: str = "logistic"
+         self,
+         obs: np.ndarray,
+         act: np.ndarray,
+         classification_model: str = "logistic",
+         model_params: dict | None = None,
    ) -> None:
        """
-         :param obs: the observations for training the regression model, shape: (batch_size, obs_dim).
-         :param act: the actions for training the regression model, shape: (batch_size,).
-         :param regression_model: the type of regression model to use. For now, only logistic is supported.
+         :param obs: the observations for training the classification model, shape: (batch_size, obs_dim).
+         :param act: the actions for training the classification model, shape: (batch_size,).
+         :param classification_model: the type of classification model to use. For now, only logistic and mlp are supported.
+         :param model_params: optional parameters for the classification model.
        """
-         assert regression_model in ["logistic"], "Only logistic regression is supported for now."
+         supported_models = ["logistic", "mlp"]
+         assert (
+             classification_model in supported_models
+         ), f"Only {supported_models} supported for now."
        assert obs.ndim == 2, "Observations must have shape (batch_size, obs_dim)."
        assert obs.shape[0] == act.shape[0], "Number of observations and actions must match."

-         self.model_x = obs
-         self.model_y = act
-         self.model = LogisticRegression()
+         self.model_obs = obs
+         self.model_act = act
+         self.num_actions = len(np.unique(act))
+         self.classification_model = classification_model
+         self.model_params = model_params or {}
+
+         if self.classification_model == "logistic":
+             self.model = LogisticRegression()
+
+         elif self.classification_model == "mlp":
+             hidden_size = self.model_params.get("hidden_size", 64)
+             activation = self.model_params.get("activation", "relu")
+             act_cls = torch.nn.ReLU if activation == "relu" else torch.nn.Tanh
+             self.model = torch.nn.Sequential(
+                 torch.nn.Linear(self.model_obs.shape[1], hidden_size),
+                 act_cls(),
+                 torch.nn.Linear(hidden_size, hidden_size),
+                 act_cls(),
+                 torch.nn.Linear(hidden_size, self.num_actions),
+             )

    def fit(self):
-         self.model.fit(self.model_x, self.model_y)
+         if self.classification_model == "mlp":
+             criterion = torch.nn.CrossEntropyLoss()
+             optimizer = torch.optim.Adam(
+                 self.model.parameters(), lr=self.model_params.get("lr", 0.01)
+             )
+
+             for epoch in range(self.model_params.get("num_epochs", 1000)):
+                 optimizer.zero_grad()
+                 output = self.model(torch.tensor(self.model_obs, dtype=torch.float32))
+                 loss = criterion(
+                     output, torch.tensor(self.model_act, dtype=torch.float32).view(-1).long()
+                 )
+                 loss.backward()
+                 optimizer.step()
+                 # print(f"Epoch {epoch}, Loss: {loss.item()}")
+
+         else:
+             self.model.fit(self.model_obs, self.model_act)
+
+     @override(Policy)
+     def log_likelihoods(self, obs: np.ndarray) -> np.ndarray:
+         if self.classification_model == "mlp":
+             with torch.no_grad():
+                 output = self.model(torch.Tensor(obs))
+                 return torch.log_softmax(output, dim=1).numpy()
+         else:
+             return self.model.predict_log_proba(obs)
+
+
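As a usage sketch of the new ClassificationBasedPolicy (the dataset here is randomly generated purely for illustration), training the MLP variant and querying per-action log-likelihoods:

    # hypothetical dataset: 100 (obs, act) pairs, 3 features, 2 discrete actions
    obs = np.random.rand(100, 3)
    act = np.random.randint(0, 2, size=100)
    policy = ClassificationBasedPolicy(
        obs=obs,
        act=act,
        classification_model="mlp",
        model_params={"hidden_size": 32, "lr": 0.01, "num_epochs": 200},
    )
    policy.fit()
    log_probs = policy.log_likelihoods(obs[:5])  # shape (5, 2); each row exponentiates to probabilities summing to 1

Note that the model_params keys ("hidden_size", "activation", "lr", "num_epochs") are exactly the ones read by __init__ and fit above; any other key is silently ignored.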
+ class PiecewiseLinearPolicy(Policy):
+     """A piecewise linear policy that selects actions based on a set of linear segments defined by
+     thresholds and slopes.
+
+     This can be used to estimate a probability distribution over actions drawn from a BMS
+     reset rule, for instance an outdoor air reset that is a function of outdoor air
+     temperature and is bounded by a minimum and maximum on both axes. It can also be
+     helpful for modeling a simple schedule, where the action is a function of time.
+     """
+
+     def __init__(
+         self,
+         obs: np.ndarray,
+         act: np.ndarray,
+         actions_bins: list[float | int] | None = None,
+     ):
+         """
+         :param obs: the observations for training the piecewise linear model, shape: (batch_size, obs_dim).
+         :param act: the actions for training the piecewise linear model, shape: (batch_size,).
+         :param actions_bins: the bins for discretizing the action space. If not provided, we assume
+             the action space is already discretized.
+         """
+         assert (
+             len(obs.shape) == 1 or obs.shape[1] == 1
+         ), "Piecewise linear policy only supports 1D observations."
+         assert obs.shape[0] == act.shape[0], "Number of observations and actions must match."
+
+         self.model_obs = obs.squeeze() if obs.ndim == 2 else obs
+         self.model_act = act.squeeze() if act.ndim == 2 else act
+         self.model_params = None
+
+         # discretize the action space
+         self.actions_bins = actions_bins if actions_bins else np.unique(self.model_act)
+         self.num_actions = len(self.actions_bins)
+
+     def fit(self):
+         # estimate bounds from input data
+         left_cp_bound_percentile = 30
+         right_cp_bound_percentile = 70
+         left_cp, right_cp = np.percentile(
+             self.model_act, (left_cp_bound_percentile, right_cp_bound_percentile)
+         )
+         left_cp_min = left_cp
+         left_cp_max = right_cp
+         right_cp_min = left_cp
+         right_cp_max = right_cp
+         y0_min = np.min(self.model_act)
+         y0_max = np.max(self.model_act)
+         y1_min = np.min(self.model_act)
+         y1_max = np.max(self.model_act)
+         slope_min = -np.inf
+         slope_max = np.inf
+
+         output = optimize.curve_fit(
+             piecewise_linear,
+             self.model_obs,
+             self.model_act,
+             bounds=(
+                 [left_cp_min, right_cp_min, slope_min, y0_min, y1_min],
+                 [left_cp_max, right_cp_max, slope_max, y0_max, y1_max],
+             ),
+         )
+         self.model_params, error = output  # noqa
+         # print(f"Model params: {self.model_params}")
+         # print(f"Error: {error}")

    @override(Policy)
    def log_likelihoods(self, obs: np.ndarray) -> np.ndarray:
-         return self.model.predict_log_proba(obs)
+         raw_actions = piecewise_linear(obs, *self.model_params)
+         # bin each raw action to the nearest action in the discretized action space
+         actions = [min(self.actions_bins, key=lambda x: abs(x - ra)) for ra in raw_actions]
+         # return the log-likelihoods
+         return np.array(
+             [
+                 [np.log(1.0) if a == action else np.log(1e-6) for a in self.actions_bins]
+                 for action in actions
+             ]
+         )
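The fit above delegates to hopes.fun_utils.piecewise_linear, which this diff imports but never shows. Judging from the five-element bounds passed to curve_fit, it takes two change points, a slope, and two plateau levels; a plausible sketch under that assumption (not necessarily the actual implementation) is:

    # assumed form, inferred from the curve_fit bounds: flat at y0 left of left_cp,
    # linear with the given slope between the change points, flat at y1 to the right
    def piecewise_linear(x, left_cp, right_cp, slope, y0, y1):
        x = np.asarray(x, dtype=float)
        return np.piecewise(
            x,
            [x < left_cp, (x >= left_cp) & (x <= right_cp), x > right_cp],
            [y0, lambda v: y0 + slope * (v - left_cp), y1],
        )

That shape would match the outdoor-air-reset rule described in the class docstring: a bounded ramp between two saturation regions.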


class HttpPolicy(Policy):