Cleanup; cap replay buffer at a default max size of 1e6 entries

sfujim · sfujim · commit c717e75c9ab7 · 2018-11-30T10:57:24.000-05:00
diff --git a/TD3.py b/TD3.py
@@ -76,7 +76,7 @@ def __init__(self, state_dim, action_dim, max_action):
 		self.critic = Critic(state_dim, action_dim).to(device)
 		self.critic_target = Critic(state_dim, action_dim).to(device)
 		self.critic_target.load_state_dict(self.critic.state_dict())
-		self.critic_optimizer = torch.optim.Adam(self.critic.parameters())		
+		self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
 
 		self.max_action = max_action
 
@@ -102,7 +102,6 @@ def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.
 			noise = torch.FloatTensor(u).data.normal_(0, policy_noise).to(device)
 			noise = noise.clamp(-noise_clip, noise_clip)
 			next_action = (self.actor_target(next_state) + noise).clamp(-self.max_action, self.max_action)
-			next_action = next_action.clamp(-self.max_action, self.max_action)
 
 			# Compute the target Q value
 			target_Q1, target_Q2 = self.critic_target(next_state, next_action)
diff --git a/utils.py b/utils.py
@@ -3,16 +3,21 @@
 # Code based on: 
 # https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py
 
-# Simple replay buffer
+# Expects tuples of (state, next_state, action, reward, done)
 class ReplayBuffer(object):
-	def __init__(self):
+	def __init__(self, max_size=1e6):
 		self.storage = []
+		self.max_size = max_size
+		self.ptr = 0
 
-	# Expects tuples of (state, next_state, action, reward, done)
 	def add(self, data):
-		self.storage.append(data)
+		if len(self.storage) == self.max_size:
+			self.storage[int(self.ptr)] = data
+			self.ptr = (self.ptr + 1) % self.max_size
+		else:
+			self.storage.append(data)
 
-	def sample(self, batch_size=100):
+	def sample(self, batch_size):
 		ind = np.random.randint(0, len(self.storage), size=batch_size)
 		x, y, u, r, d = [], [], [], [], []