Commit a8d53f7

Python3 + Hyper-parameters update

1 parent c717e75 · commit a8d53f7

7 files changed: +405 -413 lines

DDPG.py

Lines changed: 43 additions & 62 deletions
@@ -1,9 +1,9 @@
+import copy
 import numpy as np
 import torch
 import torch.nn as nn
-from torch.autograd import Variable
 import torch.nn.functional as F
-import utils
+

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

@@ -23,11 +23,10 @@ def __init__(self, state_dim, action_dim, max_action):
         self.max_action = max_action


-    def forward(self, x):
-        x = F.relu(self.l1(x))
-        x = F.relu(self.l2(x))
-        x = self.max_action * torch.tanh(self.l3(x))
-        return x
+    def forward(self, state):
+        a = F.relu(self.l1(state))
+        a = F.relu(self.l2(a))
+        return self.max_action * torch.tanh(self.l3(a))


 class Critic(nn.Module):
@@ -39,79 +38,61 @@ def __init__(self, state_dim, action_dim):
         self.l3 = nn.Linear(300, 1)


-    def forward(self, x, u):
-        x = F.relu(self.l1(x))
-        x = F.relu(self.l2(torch.cat([x, u], 1)))
-        x = self.l3(x)
-        return x
+    def forward(self, state, action):
+        q = F.relu(self.l1(state))
+        q = F.relu(self.l2(torch.cat([q, action], 1)))
+        return self.l3(q)


 class DDPG(object):
-    def __init__(self, state_dim, action_dim, max_action):
+    def __init__(self, state_dim, action_dim, max_action, discount=0.99, tau=0.001):
         self.actor = Actor(state_dim, action_dim, max_action).to(device)
-        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
-        self.actor_target.load_state_dict(self.actor.state_dict())
+        self.actor_target = copy.deepcopy(self.actor)
         self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4)

         self.critic = Critic(state_dim, action_dim).to(device)
-        self.critic_target = Critic(state_dim, action_dim).to(device)
-        self.critic_target.load_state_dict(self.critic.state_dict())
-        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), weight_decay=1e-2)
+        self.critic_target = copy.deepcopy(self.critic)
+        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), weight_decay=1e-2)
+
+        self.discount = discount
+        self.tau = tau


     def select_action(self, state):
         state = torch.FloatTensor(state.reshape(1, -1)).to(device)
         return self.actor(state).cpu().data.numpy().flatten()


-    def train(self, replay_buffer, iterations, batch_size=64, discount=0.99, tau=0.001):
-
-        for it in range(iterations):
-
-            # Sample replay buffer
-            x, y, u, r, d = replay_buffer.sample(batch_size)
-            state = torch.FloatTensor(x).to(device)
-            action = torch.FloatTensor(u).to(device)
-            next_state = torch.FloatTensor(y).to(device)
-            done = torch.FloatTensor(1 - d).to(device)
-            reward = torch.FloatTensor(r).to(device)
+    def train(self, replay_buffer, batch_size=64):
+        # Sample replay buffer
+        state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

-            # Compute the target Q value
-            target_Q = self.critic_target(next_state, self.actor_target(next_state))
-            target_Q = reward + (done * discount * target_Q).detach()
+        # Compute the target Q value
+        target_Q = self.critic_target(next_state, self.actor_target(next_state))
+        target_Q = reward + (not_done * self.discount * target_Q).detach()

-            # Get current Q estimate
-            current_Q = self.critic(state, action)
+        # Get current Q estimate
+        current_Q = self.critic(state, action)

-            # Compute critic loss
-            critic_loss = F.mse_loss(current_Q, target_Q)
+        # Compute critic loss
+        critic_loss = F.mse_loss(current_Q, target_Q)

-            # Optimize the critic
-            self.critic_optimizer.zero_grad()
-            critic_loss.backward()
-            self.critic_optimizer.step()
+        # Optimize the critic
+        self.critic_optimizer.zero_grad()
+        critic_loss.backward()
+        self.critic_optimizer.step()

-            # Compute actor loss
-            actor_loss = -self.critic(state, self.actor(state)).mean()
-
-            # Optimize the actor
-            self.actor_optimizer.zero_grad()
-            actor_loss.backward()
-            self.actor_optimizer.step()
-
-            # Update the frozen target models
-            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
-                target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
-
-            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
-                target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
-
-
-    def save(self, filename, directory):
-        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
-        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))
+        # Compute actor loss
+        actor_loss = -self.critic(state, self.actor(state)).mean()
+
+        # Optimize the actor
+        self.actor_optimizer.zero_grad()
+        actor_loss.backward()
+        self.actor_optimizer.step()

+        # Update the frozen target models
+        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
+            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

-    def load(self, filename, directory):
-        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
-        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
+        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
+            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
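Note that the refactored train() no longer unpacks raw NumPy arrays and builds tensors itself; it assumes replay_buffer.sample(batch_size) already returns batched device tensors in the order (state, action, next_state, reward, not_done). The buffer implementation is not shown in this section, so the sketch below only illustrates that assumed interface, not the repository's actual utils code. Storing not_done (1 - done) directly is what lets train() form the target as reward + not_done * discount * target_Q with no conversion.

```python
# Illustration only: a buffer exposing the sample() interface the new train() assumes.
import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class ReplayBuffer(object):
    def __init__(self, state_dim, action_dim, max_size=int(1e6)):
        self.max_size = max_size
        self.ptr = 0
        self.size = 0
        self.state = np.zeros((max_size, state_dim))
        self.action = np.zeros((max_size, action_dim))
        self.next_state = np.zeros((max_size, state_dim))
        self.reward = np.zeros((max_size, 1))
        self.not_done = np.zeros((max_size, 1))

    def add(self, state, action, next_state, reward, done):
        # Store 1 - done so train() can mask the bootstrap term directly
        self.state[self.ptr] = state
        self.action[self.ptr] = action
        self.next_state[self.ptr] = next_state
        self.reward[self.ptr] = reward
        self.not_done[self.ptr] = 1.0 - done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        # Uniformly sample stored transitions and return them as device tensors
        ind = np.random.randint(0, self.size, size=batch_size)
        return (
            torch.FloatTensor(self.state[ind]).to(device),
            torch.FloatTensor(self.action[ind]).to(device),
            torch.FloatTensor(self.next_state[ind]).to(device),
            torch.FloatTensor(self.reward[ind]).to(device),
            torch.FloatTensor(self.not_done[ind]).to(device),
        )
```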

OurDDPG.py

Lines changed: 42 additions & 60 deletions
@@ -1,10 +1,11 @@
+import copy
 import numpy as np
 import torch
 import torch.nn as nn
-from torch.autograd import Variable
 import torch.nn.functional as F
 import utils

+
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

 # Re-tuned version of Deep Deterministic Policy Gradients (DDPG)
@@ -22,11 +23,10 @@ def __init__(self, state_dim, action_dim, max_action):
         self.max_action = max_action


-    def forward(self, x):
-        x = F.relu(self.l1(x))
-        x = F.relu(self.l2(x))
-        x = self.max_action * torch.tanh(self.l3(x))
-        return x
+    def forward(self, state):
+        a = F.relu(self.l1(state))
+        a = F.relu(self.l2(a))
+        return self.max_action * torch.tanh(self.l3(a))


 class Critic(nn.Module):
@@ -38,79 +38,61 @@ def __init__(self, state_dim, action_dim):
         self.l3 = nn.Linear(300, 1)


-    def forward(self, x, u):
-        x = F.relu(self.l1(torch.cat([x, u], 1)))
-        x = F.relu(self.l2(x))
-        x = self.l3(x)
-        return x
+    def forward(self, state, action):
+        q = F.relu(self.l1(torch.cat([state, action], 1)))
+        q = F.relu(self.l2(q))
+        return self.l3(q)


 class DDPG(object):
-    def __init__(self, state_dim, action_dim, max_action):
+    def __init__(self, state_dim, action_dim, max_action, discount=0.99, tau=0.005):
         self.actor = Actor(state_dim, action_dim, max_action).to(device)
-        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
-        self.actor_target.load_state_dict(self.actor.state_dict())
+        self.actor_target = copy.deepcopy(self.actor)
         self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

         self.critic = Critic(state_dim, action_dim).to(device)
-        self.critic_target = Critic(state_dim, action_dim).to(device)
-        self.critic_target.load_state_dict(self.critic.state_dict())
+        self.critic_target = copy.deepcopy(self.critic)
         self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

+        self.discount = discount
+        self.tau = tau
+

     def select_action(self, state):
         state = torch.FloatTensor(state.reshape(1, -1)).to(device)
         return self.actor(state).cpu().data.numpy().flatten()


-    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005):
-
-        for it in range(iterations):
-
-            # Sample replay buffer
-            x, y, u, r, d = replay_buffer.sample(batch_size)
-            state = torch.FloatTensor(x).to(device)
-            action = torch.FloatTensor(u).to(device)
-            next_state = torch.FloatTensor(y).to(device)
-            done = torch.FloatTensor(1 - d).to(device)
-            reward = torch.FloatTensor(r).to(device)
+    def train(self, replay_buffer, batch_size=100):
+        # Sample replay buffer
+        state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

-            # Compute the target Q value
-            target_Q = self.critic_target(next_state, self.actor_target(next_state))
-            target_Q = reward + (done * discount * target_Q).detach()
+        # Compute the target Q value
+        target_Q = self.critic_target(next_state, self.actor_target(next_state))
+        target_Q = reward + (not_done * self.discount * target_Q).detach()

-            # Get current Q estimate
-            current_Q = self.critic(state, action)
+        # Get current Q estimate
+        current_Q = self.critic(state, action)

-            # Compute critic loss
-            critic_loss = F.mse_loss(current_Q, target_Q)
+        # Compute critic loss
+        critic_loss = F.mse_loss(current_Q, target_Q)

-            # Optimize the critic
-            self.critic_optimizer.zero_grad()
-            critic_loss.backward()
-            self.critic_optimizer.step()
+        # Optimize the critic
+        self.critic_optimizer.zero_grad()
+        critic_loss.backward()
+        self.critic_optimizer.step()

-            # Compute actor loss
-            actor_loss = -self.critic(state, self.actor(state)).mean()
-
-            # Optimize the actor
-            self.actor_optimizer.zero_grad()
-            actor_loss.backward()
-            self.actor_optimizer.step()
-
-            # Update the frozen target models
-            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
-                target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
-
-            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
-                target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)
-
-
-    def save(self, filename, directory):
-        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
-        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))
+        # Compute actor loss
+        actor_loss = -self.critic(state, self.actor(state)).mean()
+
+        # Optimize the actor
+        self.actor_optimizer.zero_grad()
+        actor_loss.backward()
+        self.actor_optimizer.step()

+        # Update the frozen target models
+        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
+            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

-    def load(self, filename, directory):
-        self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
-        self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
+        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
+            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
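The same refactor changes how the agent is driven: discount and tau are now constructor arguments, and train() performs exactly one gradient update per call instead of looping over iterations, so it is meant to be called once per environment step. The loop below is a hedged sketch of that calling pattern only; the environment name comes from the README, while details such as the exploration-noise scale, the utils.ReplayBuffer signature, and the omission of warm-up and evaluation logic are simplifications for illustration, not taken from this commit.

```python
# Hedged sketch of the new calling pattern (not the repository's main.py):
# hyper-parameters live in the constructor, train() runs one update per call.
import gym
import numpy as np

import OurDDPG
import utils  # assumed to provide the ReplayBuffer whose sample() train() consumes

env = gym.make("HalfCheetah-v2")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

policy = OurDDPG.DDPG(state_dim, action_dim, max_action, discount=0.99, tau=0.005)
replay_buffer = utils.ReplayBuffer(state_dim, action_dim)  # constructor signature assumed

state, done = env.reset(), False
for t in range(int(1e6)):
    # Act with Gaussian exploration noise (scale chosen for illustration only)
    action = (
        policy.select_action(np.array(state))
        + np.random.normal(0, 0.1 * max_action, size=action_dim)
    ).clip(-max_action, max_action)

    next_state, reward, done, _ = env.step(action)
    replay_buffer.add(state, action, next_state, reward, float(done))
    state = next_state

    policy.train(replay_buffer, batch_size=100)  # one gradient step per env step

    if done:
        state, done = env.reset(), False
```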

README.md

Lines changed: 18 additions & 4 deletions
@@ -3,23 +3,37 @@
 PyTorch implementation of Twin Delayed Deep Deterministic Policy Gradients (TD3). If you use our code or data please cite the [paper](https://arxiv.org/abs/1802.09477).

 Method is tested on [MuJoCo](http://www.mujoco.org/) continuous control tasks in [OpenAI gym](https://github.com/openai/gym).
-Networks are trained using [PyTorch 0.4](https://github.com/pytorch/pytorch) and Python 2.7.
+Networks are trained using [PyTorch 1.2](https://github.com/pytorch/pytorch) and Python 3.7.

 ### Usage
-The paper results can be reproduced exactly by running:
+The paper results can be reproduced by running:
 ```
 ./experiments.sh
 ```
 Experiments on single environments can be run by calling:
 ```
-python2 main.py --env HalfCheetah-v1
+python main.py --env HalfCheetah-v2
 ```

-Hyper-parameters can be modified with different arguments to main.py. We include an implementation of DDPG (DDPG.py) for easy comparison of hyper-parameters with TD3, this is not the implementation of "Our DDPG" as used in the paper (see OurDDPG.py).
+Hyper-parameters can be modified with different arguments to main.py. We include an implementation of DDPG (DDPG.py), which is not used in the paper, for easy comparison of hyper-parameters with TD3. This is not the implementation of "Our DDPG" as used in the paper (see OurDDPG.py).

 Algorithms which TD3 compares against (PPO, TRPO, ACKTR, DDPG) can be found at the [OpenAI baselines repository](https://github.com/openai/baselines).

 ### Results
+The code is no longer exactly representative of the code used in the paper; minor adjustments have been made to hyper-parameters, etc., to improve performance. The learning curves are still the original results found in the paper.
+
 Learning curves found in the paper are found under /learning_curves. Each learning curve is formatted as a NumPy array of 201 evaluations (shape (201,)), where each evaluation corresponds to the average total reward from running the policy for 10 episodes with no exploration. The first evaluation is the randomly initialized policy network (unused in the paper). Evaluations are performed every 5000 time steps, over a total of 1 million time steps.

 Numerical results can be found in the paper, or from the learning curves. Video of the learned agent can be found [here](https://youtu.be/x33Vw-6vzso).
+
+### Bibtex
+
+```
+@inproceedings{fujimoto2018addressing,
+  title={Addressing Function Approximation Error in Actor-Critic Methods},
+  author={Fujimoto, Scott and Hoof, Herke and Meger, David},
+  booktitle={International Conference on Machine Learning},
+  pages={1582--1591},
+  year={2018}
+}
+```
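Since the learning curves are shipped as plain NumPy arrays, they can be checked directly against the description in the README (201 evaluations, one every 5000 time steps). The file name in this sketch is hypothetical; substitute whatever actually appears under /learning_curves.

```python
# Sketch: inspect one learning curve (file name is hypothetical).
import numpy as np

curve = np.load("learning_curves/TD3_HalfCheetah-v1_0.npy")
assert curve.shape == (201,)      # initial policy + one evaluation every 5000 steps
steps = np.arange(201) * 5000     # x-axis in environment time steps
print("final evaluation (avg return over 10 episodes):", curve[-1])
```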
