Description
I wrote a gym environment similar to gym's CartPole, except that I want to move the cart from one location to another while still keeping the pole balanced. However, I get an error when I train the new environment with the TRPO algorithm. The environment looks fine when I simulate it by hand, and the error message also seems to point to max_path_length not being assigned somehow.
Here is my environment definition:
import logging
import math
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
logger = logging.getLogger(__name__)
class CartPoleMovingEnv(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 50
    }

    def __init__(self):
        self.gravity = 9.8
        self.masscart = 1.0
        self.masspole = 0.1
        self.total_mass = (self.masspole + self.masscart)
        self.length = 0.5  # actually half the pole's length
        self.polemass_length = (self.masspole * self.length)
        self.force_mag = 10.0
        self.tau = 0.02  # seconds between state updates
        self._destination = 2

        # Angle at which to fail the episode
        self.theta_threshold_radians = 30 * 2 * math.pi / 360
        self.x_threshold = 4

        # Angle limit set to 2 * theta_threshold_radians so failing observation is still within bounds
        high = np.array([
            5,
            np.finfo(np.float32).max,
            self.theta_threshold_radians * 2,
            np.finfo(np.float32).max])

        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(-high, high)

        self._seed()
        self.viewer = None
        self.state = None
        self.steps_beyond_done = None

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]
    def _step(self, action):
        assert self.action_space.contains(action), "%r (%s) invalid" % (action, type(action))
        state = self.state
        x, x_dot, theta, theta_dot = state
        force = self.force_mag if action == 1 else -self.force_mag
        costheta = math.cos(theta)
        sintheta = math.sin(theta)
        temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / (self.length * (4.0/3.0 - self.masspole * costheta * costheta / self.total_mass))
        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass

        d_old = np.abs(x - self._destination)
        x = x + self.tau * x_dot
        x_dot = x_dot + self.tau * xacc
        theta = theta + self.tau * theta_dot
        theta_dot = theta_dot + self.tau * thetaacc
        self.state = (x, x_dot, theta, theta_dot)

        done = x < -self.x_threshold \
            or x > self.x_threshold \
            or theta < -self.theta_threshold_radians \
            or theta > self.theta_threshold_radians
        done = bool(done)

        if not done:
            # destination is at x = self._destination (i.e. x = 2)
            d = np.abs(x - self._destination)  # distance to destination
            # The constant -1 rewards reaching the destination quickly. Only when d is small
            # do theta and theta_dot matter; otherwise only reward moving towards the destination.
            reward = -1 + np.exp(-d) * theta + np.exp(-d) * theta_dot - (d - d_old)
        elif self.steps_beyond_done is None:
            # Pole just fell!
            self.steps_beyond_done = 0
            reward = 1.0
        else:
            if self.steps_beyond_done == 0:
                logger.warning("You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.")
            self.steps_beyond_done += 1
            reward = 0.0

        return np.array(self.state), reward, done, {}
    def _reset(self):
        # The cart starts at x = -2 (the opposite side from the destination); velocity,
        # angle and angular velocity are set as before, to small random numbers.
        self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
        self.state[0] = -2
        self.steps_beyond_done = None
        return np.array(self.state)
    def _render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return

        screen_width = 600
        screen_height = 400

        world_width = 5
        scale = screen_width / world_width
        carty = 100  # TOP OF CART
        polewidth = 10.0
        polelen = scale * 1.0
        cartwidth = 50.0
        cartheight = 30.0

        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(screen_width, screen_height)
            l, r, t, b = -cartwidth/2, cartwidth/2, cartheight/2, -cartheight/2
            axleoffset = cartheight/4.0
            cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
            self.carttrans = rendering.Transform()
            cart.add_attr(self.carttrans)
            self.viewer.add_geom(cart)
            l, r, t, b = -polewidth/2, polewidth/2, polelen - polewidth/2, -polewidth/2
            pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
            pole.set_color(.8, .6, .4)
            self.poletrans = rendering.Transform(translation=(0, axleoffset))
            pole.add_attr(self.poletrans)
            pole.add_attr(self.carttrans)
            self.viewer.add_geom(pole)
            self.axle = rendering.make_circle(polewidth/2)
            self.axle.add_attr(self.poletrans)
            self.axle.add_attr(self.carttrans)
            self.axle.set_color(.5, .5, .8)
            self.viewer.add_geom(self.axle)
            self.track = rendering.Line((0, carty), (screen_width, carty))
            self.track.set_color(0, 0, 0)
            self.viewer.add_geom(self.track)

        if self.state is None:
            return None

        x = self.state
        cartx = x[0] * scale + screen_width / 2.0  # MIDDLE OF CART
        self.carttrans.set_translation(cartx, carty)
        self.poletrans.set_rotation(-x[2])

        return self.viewer.render(return_rgb_array=(mode == 'rgb_array'))
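By "simulating it" I mean running a throwaway loop like the one below (a rough sketch; it assumes CartPoleMovingEnv is importable from the file above) and watching the rendering, which looks sensible to me:

# Rough manual test of the environment: random actions, just to eyeball the dynamics.
env = CartPoleMovingEnv()
obs = env.reset()
for t in range(500):
    action = env.action_space.sample()          # random left/right push
    obs, reward, done, info = env.step(action)
    env.render()
    if done:                                    # pole fell or cart left the track
        obs = env.reset()
env.render(close=True)                          # close the viewer window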
And here is the code I use to train it:
env = normalize(GymEnv("CartPoleMoving-v0", force_reset=True))

policy = CategoricalMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 8 hidden units.
    hidden_sizes=(8, 8)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=env.horizon,
    n_itr=50,
    discount=0.99,
    step_size=0.01,
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    # plot=True,
)

algo.train()
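Note that max_path_length is taken straight from env.horizon. A quick check in the same session, sketched here, suggests the horizon itself is the problem:

env = normalize(GymEnv("CartPoleMoving-v0", force_reset=True))
print(env.horizon)   # appears to print None for my environment (consistent with the traceback below)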
Running algo.train() then produces this error:
In [12]: rl_prac.run_task()
2017-03-27 17:47:46.192399 EDT | Warning: skipping Gym environment monitoring since snapshot_dir not configured.
[2017-03-27 17:47:46,192] Making new env: CartPoleMoving-v0
2017-03-27 17:47:46.196242 EDT | observation space: Box(4,)
2017-03-27 17:47:46.196631 EDT | action space: Discrete(2)
2017-03-27 17:47:46.264618 EDT | Populating workers...
2017-03-27 17:47:46.264905 EDT | Populated
0% 100%
[ ]---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-12-f8411f69ae72> in <module>()
----> 1 rl_prac.run_task()
/home/weiliu/projects/rl_learning/rl_prac.py in run_task(*_)
70 # plot=True,
71 )
---> 72 algo.train()
73
74 def run():
/home/weiliu/packages/rllab/rllab/algos/batch_polopt.py in train(self)
120 for itr in range(self.current_itr, self.n_itr):
121 with logger.prefix('itr #%d | ' % itr):
--> 122 paths = self.sampler.obtain_samples(itr)
123 samples_data = self.sampler.process_samples(itr, paths)
124 self.log_diagnostics(paths)
/home/weiliu/packages/rllab/rllab/algos/batch_polopt.py in obtain_samples(self, itr)
27 max_samples=self.algo.batch_size,
28 max_path_length=self.algo.max_path_length,
---> 29 scope=self.algo.scope,
30 )
31 if self.algo.whole_paths:
/home/weiliu/packages/rllab/rllab/sampler/parallel_sampler.py in sample_paths(policy_params, max_samples, max_path_length, env_params, scope)
123 threshold=max_samples,
124 args=(max_path_length, scope),
--> 125 show_prog_bar=True
126 )
127
/home/weiliu/packages/rllab/rllab/sampler/stateful_pool.py in run_collect(self, collect_once, threshold, args, show_prog_bar)
148 pbar = ProgBarCounter(threshold)
149 while count < threshold:
--> 150 result, inc = collect_once(self.G, *args)
151 results.append(result)
152 count += inc
/home/weiliu/packages/rllab/rllab/sampler/parallel_sampler.py in _worker_collect_one_path(G, max_path_length, scope)
92 def _worker_collect_one_path(G, max_path_length, scope=None):
93 G = _get_scoped_G(G, scope)
---> 94 path = rollout(G.env, G.policy, max_path_length)
95 return path, len(path["rewards"])
96
/home/weiliu/packages/rllab/rllab/sampler/utils.py in rollout(env, agent, max_path_length, animated, speedup, always_return_paths)
16 if animated:
17 env.render()
---> 18 while path_length < max_path_length:
19 a, agent_info = agent.get_action(o)
20 next_o, r, d, env_info = env.step(a)
TypeError: unorderable types: int() < NoneType()
It looks like max_path_length is not initialized correctly for my new env. After a careful look, I found that it is related to this line in rllab's GymEnv wrapper:

self._horizon = env.spec.tags['wrapper_config.TimeLimit.max_episode_steps']

It seems this tag is never set for the environment that I created.
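For reference, my registration of the environment looks roughly like this (the module path here is simplified/hypothetical); notably, it does not pass any kind of time limit:

from gym.envs.registration import register

register(
    id='CartPoleMoving-v0',
    entry_point='cartpole_moving:CartPoleMovingEnv',  # hypothetical module path, simplified for this post
    # There is no max_episode_steps / timestep_limit argument here -- I suspect that is
    # why the 'wrapper_config.TimeLimit.max_episode_steps' tag never gets set.
)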
How do I fix this for my environment? And what do the 'tags' mean for the environments that ship with gym?