#!/usr/bin/env python
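"""Train a likelihood-ratio ("REINFORCE" style) policy gradient agent on the
discrete-action bullet cartpole environment defined in bullet_cartpole.py."""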
import argparse
import bullet_cartpole
import collections
import datetime
import gym
import json
import numpy as np
import signal
import sys
import tensorflow as tf
from tensorflow.python.ops import init_ops
import time
import util
np.set_printoptions(precision=5, threshold=10000, suppress=True, linewidth=10000)
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--num-eval', type=int, default=0,
                    help="if >0 just run this many episodes with no training")
parser.add_argument('--max-num-actions', type=int, default=0,
                    help="train for (at least) this number of actions (always finish current episode);"
                         " ignored if <= 0")
parser.add_argument('--max-run-time', type=int, default=0,
                    help="train for (at least) this number of seconds (always finish current episode);"
                         " ignored if <= 0")
parser.add_argument('--ckpt-dir', type=str, default=None, help="if set, save ckpts to this dir")
parser.add_argument('--ckpt-freq', type=int, default=3600, help="freq (sec) to save ckpts")
parser.add_argument('--hidden-layers', type=str, default="100,50", help="hidden layer sizes")
parser.add_argument('--learning-rate', type=float, default=0.0001, help="learning rate")
parser.add_argument('--num-train-batches', type=int, default=10,
                    help="number of training batches to run")
parser.add_argument('--rollouts-per-batch', type=int, default=10,
                    help="number of rollouts to run for each training batch")
parser.add_argument('--eval-action-noise', action='store_true', help="whether to use noise during eval")
util.add_opts(parser)
bullet_cartpole.add_opts(parser)
opts = parser.parse_args()
sys.stderr.write("%s\n" % opts)
assert not opts.use_raw_pixels, "TODO: add convnet from ddpg here"
# TODO: if we import slim _before_ building cartpole env we can't start bullet with GL gui o_O
env = bullet_cartpole.BulletCartpole(opts=opts, discrete_actions=True)
import base_network
import tensorflow.contrib.slim as slim

VERBOSE_DEBUG = False

def toggle_verbose_debug(signal, frame):
  global VERBOSE_DEBUG
  VERBOSE_DEBUG = not VERBOSE_DEBUG
signal.signal(signal.SIGUSR1, toggle_verbose_debug)

DUMP_WEIGHTS = False

def set_dump_weights(signal, frame):
  global DUMP_WEIGHTS
  DUMP_WEIGHTS = True
signal.signal(signal.SIGUSR2, set_dump_weights)
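
# the handlers above let you poke a running training process from another shell
# (assuming a unix-y setup): `kill -USR1 <pid>` toggles verbose debug output and
# `kill -USR2 <pid>` requests a one-off weight dump after the next training batch.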

class LikelihoodRatioPolicyGradientAgent(base_network.Network):

  def __init__(self, env):
    self.env = env
    num_actions = self.env.action_space.n

    # we have three placeholders we'll use...
    # observations; used either during rollout to sample some actions, or
    # during training when combined with actions and advantages.
    shape_with_batch = [None] + list(self.env.observation_space.shape)
    self.observations = tf.placeholder(shape=shape_with_batch,
                                       dtype=tf.float32)
    # the actions we took during rollout
    self.actions = tf.placeholder(tf.int32, name='actions')
    # the advantages we got from taking 'actions' in 'observations'
    self.advantages = tf.placeholder(tf.float32, name='advantages')

    # our model is a very simple MLP
    with tf.variable_scope("model"):
      # stack of hidden layers on flattened input; (batch,2,2,7) -> (batch,28)
      flat_input_state = slim.flatten(self.observations, scope='flat')
      final_hidden = self.hidden_layers_starting_at(flat_input_state,
                                                    opts.hidden_layers)
      logits = slim.fully_connected(inputs=final_hidden,
                                    num_outputs=num_actions,
                                    activation_fn=None)

      # in the eval case just pick the argmax action
      self.action_argmax = tf.argmax(logits, 1)

      # for rollouts we need an op that samples actions from this
      # model to give a stochastic action.
      sample_action = tf.multinomial(logits, num_samples=1)
      self.sampled_action_op = tf.reshape(sample_action, shape=[])

      # we are trying to maximise the product of two components...
      # 1) the log_p of "good" actions.
      # 2) the advantage term based on the rewards from actions.

      # first we need the log_p values for each observation for the actions we
      # specifically took by sampling... we first run a log_softmax over the
      # action logits to get log probabilities.
      log_softmax = tf.nn.log_softmax(logits)
      self.debug_softmax = tf.exp(log_softmax)

      # we then use a mask to select only the elements of the softmax that
      # correspond to the actions we actually took. we could also do this by
      # complex indexing and a gather, but i always think this is more natural.
      # the "cost" of dealing with the mostly-zero one-hot, as opposed to doing
      # a gather on sparse indexes, isn't a big deal when the number of
      # observations is >> the number of actions.
      action_mask = tf.one_hot(indices=self.actions, depth=num_actions)
      action_log_prob = tf.reduce_sum(log_softmax * action_mask,
                                      reduction_indices=1)
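      # e.g. with 2 actions and made-up numbers: for actions=[1, 0] and
      #   log_softmax=[[-2.3, -0.1], [-0.7, -0.7]]
      # the mask is [[0, 1], [1, 0]], so action_log_prob=[-0.1, -0.7].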

      # the (element-wise) product of these action log_p's with the total reward
      # of the episode represents the quantity we want to maximise. we standardise
      # the advantage values, so roughly half are +ve and half are -ve, as a
      # variance control.
      action_mul_advantages = tf.mul(action_log_prob,
                                     util.standardise(self.advantages))
      self.loss = -tf.reduce_sum(action_mul_advantages)  # recall: we are maximising.
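
      # in other words, over a batch of rollouts we minimise
      #   loss = -sum_t log pi(a_t | s_t) * A_t
      # where A_t is the (batch-standardised) total reward of the episode that
      # step t belongs to; i.e. the likelihood-ratio / REINFORCE policy gradient
      # with a whole-episode return used as the advantage for every step.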

    with tf.variable_scope("optimiser"):
      # dynamically create optimiser based on opts
      optimiser = util.construct_optimiser(opts)
      # calc gradients
      gradients = optimiser.compute_gradients(self.loss)
      # potentially clip and wrap with debugging tf.Print
      gradients = util.clip_and_debug_gradients(gradients, opts)
      # apply
      self.train_op = optimiser.apply_gradients(gradients)

  def sample_action_given(self, observation, doing_eval=False):
    """sample one action given observation"""
    if doing_eval:
      sao, sm = tf.get_default_session().run([self.sampled_action_op, self.debug_softmax],
                                             feed_dict={self.observations: [observation]})
      print "EVAL sm ", sm, "action", sao
      return sao

    # epsilon greedy "noise" will do for this simple case..
    if np.random.random() < 0.1:
      return self.env.action_space.sample()

    # sample from logits
    return tf.get_default_session().run(self.sampled_action_op,
                                        feed_dict={self.observations: [observation]})

  def rollout(self, doing_eval=False):
    """run one episode collecting observations, actions and rewards"""
    observations, actions, rewards = [], [], []
    observation = self.env.reset()
    done = False
    while not done:
      observations.append(observation)
      action = self.sample_action_given(observation, doing_eval)
      assert action != 5, "FAIL! (multinomial logits sampling bug?)"
      observation, reward, done, _ = self.env.step(action)
      actions.append(action)
      rewards.append(reward)
    if VERBOSE_DEBUG:
      print "rollout: actions=%s" % (actions)
    return observations, actions, rewards

  def train(self, observations, actions, advantages):
    """take one training step given observations, actions and subsequent advantages"""
    if VERBOSE_DEBUG:
      print "TRAIN"
      print "observations", np.stack(observations)
      print "actions", actions
      print "advantages", advantages
    _, loss = tf.get_default_session().run([self.train_op, self.loss],
                                           feed_dict={self.observations: observations,
                                                      self.actions: actions,
                                                      self.advantages: advantages})
    return float(loss)

  def post_var_init_setup(self):
    pass

  def run_training(self, max_num_actions, max_run_time, rollouts_per_batch,
                   saver_util):
    # log start time, in case we are limiting by time...
    start_time = time.time()

    # run for some max number of actions
    num_actions_taken = 0
    n = 0
    while True:
      total_rewards = []
      losses = []

      # perform a number of rollouts
      batch_observations, batch_actions, batch_advantages = [], [], []
      for _ in xrange(rollouts_per_batch):
        observations, actions, rewards = self.rollout()
        batch_observations += observations
        batch_actions += actions
        # train with advantages, not per observation/action rewards.
        # _every_ observation/action in this rollout gets assigned
        # the _total_ reward of the episode. (crazy that this works!)
        batch_advantages += [sum(rewards)] * len(rewards)
        # keep total rewards just for debugging / stats
        total_rewards.append(sum(rewards))

      if min(total_rewards) == max(total_rewards):
        # converged?
        sys.stderr.write("converged? standardisation of advantages will barf here...\n")
        loss = 0
      else:
        loss = self.train(batch_observations, batch_actions, batch_advantages)
      losses.append(loss)

      # dump some stats and progress info
      stats = collections.OrderedDict()
      stats["time"] = time.time()
      stats["n"] = n
      stats["mean_losses"] = float(np.mean(losses))
      stats["total_reward"] = np.sum(total_rewards)
      stats["episode_len"] = len(rewards)
      print "STATS %s\t%s" % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                              json.dumps(stats))
      sys.stdout.flush()
      n += 1

      # save if required
      if saver_util is not None:
        saver_util.save_if_required()

      # emit occasional eval
      if VERBOSE_DEBUG or n % 10 == 0:
        self.run_eval(1)

      # dump weights once if requested
      global DUMP_WEIGHTS
      if DUMP_WEIGHTS:
        self.debug_dump_network_weights()
        DUMP_WEIGHTS = False

      # exit when finished
      num_actions_taken += len(rewards)
      if max_num_actions > 0 and num_actions_taken > max_num_actions:
        break
      if max_run_time > 0 and time.time() > start_time + max_run_time:
        break

  def run_eval(self, num_episodes, add_noise=False):
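    # note: add_noise is accepted (and passed opts.eval_action_noise from main)
    # but isn't used by this agent; eval rollouts just sample from the policy
    # and print the softmax for each step.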
    for _ in xrange(num_episodes):
      _, _, rewards = self.rollout(doing_eval=True)
      print sum(rewards)

  def debug_dump_network_weights(self):
    fn = "/tmp/weights.%s" % time.time()
    with open(fn, "w") as f:
      f.write("DUMP time %s\n" % time.time())
      for var in tf.all_variables():
        f.write("VAR %s %s\n" % (var.name, var.get_shape()))
        f.write("%s\n" % var.eval())
    print "weights written to", fn

def main():
  config = tf.ConfigProto()
  # config.gpu_options.allow_growth = True
  # config.log_device_placement = True
  with tf.Session(config=config) as sess:
    agent = LikelihoodRatioPolicyGradientAgent(env)

    # setup saver util and either load latest ckpt or init variables
    saver_util = None
    if opts.ckpt_dir is not None:
      saver_util = util.SaverUtil(sess, opts.ckpt_dir, opts.ckpt_freq)
    else:
      sess.run(tf.initialize_all_variables())
    for v in tf.all_variables():
      print >>sys.stderr, v.name, util.shape_and_product_of(v)

    # now that we've either init'd from scratch, or loaded up a checkpoint,
    # we can do any required post init work.
    agent.post_var_init_setup()

    # run either eval or training
    if opts.num_eval > 0:
      agent.run_eval(opts.num_eval, opts.eval_action_noise)
    else:
      agent.run_training(opts.max_num_actions, opts.max_run_time,
                         opts.rollouts_per_batch,
                         saver_util)
      if saver_util is not None:
        saver_util.force_save()

    env.reset()  # just to flush logging, clumsy :/

if __name__ == "__main__":
  main()
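
# a possible invocation (an untested sketch; util.add_opts and bullet_cartpole.add_opts
# register further flags not listed in this file, and /tmp/lrpg_ckpts is just an
# example path):
#   python lrpg_cartpole.py --rollouts-per-batch=20 --max-run-time=600 --ckpt-dir=/tmp/lrpg_ckpts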