
Commit 4836052

reward models
1 parent 5bf1d4d commit 4836052

File tree

5 files changed: +619 -2 lines changed

hopes/rew/__init__.py

Whitespace-only changes.

hopes/rew/rewards.py

Lines changed: 130 additions & 0 deletions
@@ -0,0 +1,130 @@
from abc import ABC, abstractmethod

import numpy as np
import torch
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures


class RewardModel(ABC):
    @abstractmethod
    def estimate(self, obs: np.ndarray, act: np.ndarray) -> np.ndarray:
        """Estimate the rewards for a given set of observations and actions.

        :param obs: the observations for which to estimate the rewards, shape: (batch_size, obs_dim).
        :param act: the actions for which to estimate the rewards, shape: (batch_size,).
        :return: the estimated rewards.
        """
        raise NotImplementedError


class RewardFunctionModel(RewardModel):
    """A reward model that uses a given reward function to estimate rewards."""

    def __init__(self, reward_function: callable) -> None:
        """
        :param reward_function: a function that takes in observations and actions and returns rewards.
        """
        self.reward_function = reward_function

    def estimate(self, obs: np.ndarray, act: np.ndarray) -> np.ndarray:
        if obs.ndim == 1:
            return self.reward_function(obs, act)
        else:
            return np.array([self.reward_function(o, a) for o, a in zip(obs, act)])


class RegressionBasedRewardModel(RewardModel):
    def __init__(
        self,
        obs: np.ndarray,
        act: np.ndarray,
        rew: np.ndarray,
        reward_model: str = "linear",
        model_params: dict = {},
    ) -> None:
        """
        :param obs: the observations for training the reward model, shape: (batch_size, obs_dim).
        :param act: the actions for training the reward model, shape: (batch_size,).
        :param rew: the rewards for training the reward model, shape: (batch_size,).
        :param reward_model: the type of reward model to use. For now, only linear, polynomial and mlp are supported.
        :param model_params: optional parameters for the reward model.
        """
        supported_reward_models = ["linear", "polynomial", "mlp"]
        assert (
            reward_model in supported_reward_models
        ), f"Only {supported_reward_models} supported for now."
        assert obs.ndim == 2, "Observations must have shape (batch_size, obs_dim)."
        assert (
            obs.shape[0] == act.shape[0] == rew.shape[0]
        ), "The number of observations, actions, and rewards must be the same."

        self.obs = obs
        if act.ndim == 1:
            act = act.reshape(-1, 1)
        self.act = act
        if rew.ndim == 1:
            rew = rew.reshape(-1, 1)
        self.rew = rew
        self.model_params = model_params
        self.reward_model = reward_model
        self.poly_features = None

        if self.reward_model == "linear" or self.reward_model == "polynomial":
            self.model = LinearRegression()
        elif self.reward_model == "mlp":
            hidden_size = model_params.get("hidden_size", 64)
            activation = model_params.get("activation", "relu")
            act_cls = torch.nn.ReLU if activation == "relu" else torch.nn.Tanh
            self.model = torch.nn.Sequential(
                torch.nn.Linear(obs.shape[1] + act.shape[1], hidden_size),
                act_cls(),
                torch.nn.Linear(hidden_size, 1),
            )

    def fit(self) -> None:
        model_in = np.concatenate((self.obs, self.act), axis=1)

        if self.reward_model == "mlp":
            optimizer = torch.optim.Adam(self.model.parameters())
            criterion = torch.nn.MSELoss()
            for e in range(self.model_params.get("num_epochs", 1000)):
                optimizer.zero_grad()
                pred_rew = self.model(torch.tensor(model_in, dtype=torch.float32))
                loss = criterion(pred_rew, torch.tensor(self.rew, dtype=torch.float32))
                loss.backward()
                optimizer.step()

        elif self.reward_model == "polynomial":
            self.poly_features = PolynomialFeatures(degree=self.model_params.get("degree", 2))
            self.model.fit(self.poly_features.fit_transform(model_in), self.rew)

        elif isinstance(self.model, LinearRegression):
            self.model.fit(np.concatenate((self.obs, self.act), axis=1), self.rew)

    def estimate(self, obs: np.ndarray, act: np.ndarray) -> np.ndarray:
        """Estimate the rewards for a given set of observations and actions.

        :param obs: the observations for which to estimate the rewards, shape: (batch_size, obs_dim).
        :param act: the actions for which to estimate the rewards, shape: (batch_size,).
        :return: the estimated rewards, shape: (batch_size,).
        """
        if act.ndim == 1:
            act = act.reshape(-1, 1)

        if isinstance(self.model, torch.nn.Module):
            with torch.no_grad():
                return (
                    self.model(
                        torch.tensor(np.concatenate((obs, act), axis=1), dtype=torch.float32)
                    )
                    .numpy()
                    .flatten()
                )
        else:
            inputs = np.concatenate((obs, act), axis=1)
            if self.reward_model == "polynomial":
                inputs = self.poly_features.transform(inputs)
            return np.squeeze(self.model.predict(inputs))
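
Usage note (not part of the commit): a minimal sketch of how RewardFunctionModel might be called, assuming the module is importable as hopes.rew.rewards per the file tree above. The quadratic reward function and the random batch are made-up illustrations.

import numpy as np

from hopes.rew.rewards import RewardFunctionModel

def my_reward(obs: np.ndarray, act: np.ndarray) -> float:
    # hypothetical reward: penalize distance from the origin plus an action cost
    return -float(np.sum(obs ** 2)) - float(act)

model = RewardFunctionModel(reward_function=my_reward)
obs = np.random.rand(10, 3)              # (batch_size, obs_dim)
act = np.random.randint(0, 2, size=10)   # (batch_size,)
est = model.estimate(obs, act)           # shape (10,), one reward per obs/act pair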

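A similar sketch for RegressionBasedRewardModel, here with the polynomial variant; the synthetic logged data and the degree-2 setting are assumptions for illustration, not values taken from the commit.

import numpy as np

from hopes.rew.rewards import RegressionBasedRewardModel

rng = np.random.default_rng(0)
obs = rng.random((500, 4))               # logged observations, (batch_size, obs_dim)
act = rng.integers(0, 3, size=500)       # logged discrete actions, (batch_size,)
rew = obs.sum(axis=1) + 0.5 * act        # synthetic ground-truth rewards

model = RegressionBasedRewardModel(
    obs=obs,
    act=act,
    rew=rew,
    reward_model="polynomial",
    model_params={"degree": 2},
)
model.fit()

new_obs = rng.random((5, 4))
new_act = rng.integers(0, 3, size=5)
est = model.estimate(new_obs, new_act)   # shape (5,)
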
0 commit comments
