horizon not defined for my environment #103

@weiliu620

I wrote a gym environment similar to gym's CartPole, except that I want to move the cart from one location to another while still keeping the pole balanced. However, I get an error when I train on this new environment with the TRPO algorithm. The environment looks fine when I simulate it by hand, and the error message seems to point to max_path_length somehow not being assigned.

Here is my environment definition:

import logging
import math
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np

logger = logging.getLogger(__name__)

class CartPoleMovingEnv(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second' : 50
    }

    def __init__(self):
        self.gravity = 9.8
        self.masscart = 1.0
        self.masspole = 0.1
        self.total_mass = (self.masspole + self.masscart)
        self.length = 0.5 # actually half the pole's length
        self.polemass_length = (self.masspole * self.length)
        self.force_mag = 10.0
        self.tau = 0.02  # seconds between state updates
        self._destination = 2

        # Angle at which to fail the episode
        self.theta_threshold_radians = 30 * 2 * math.pi / 360
        self.x_threshold = 4

        # Angle limit set to 2 * theta_threshold_radians so failing observation is still within bounds
        high = np.array([
            5,
            np.finfo(np.float32).max,
            self.theta_threshold_radians * 2,
            np.finfo(np.float32).max])

        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(-high, high)

        self._seed()
        self.viewer = None
        self.state = None

        self.steps_beyond_done = None

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _step(self, action):
        assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action))
        state = self.state
        x, x_dot, theta, theta_dot = state
        force = self.force_mag if action==1 else -self.force_mag
        costheta = math.cos(theta)
        sintheta = math.sin(theta)
        temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta* temp) / (self.length * (4.0/3.0 - self.masspole * costheta * costheta / self.total_mass))
        xacc  = temp - self.polemass_length * thetaacc * costheta / self.total_mass
        d_old = np.abs(x - self._destination)
        x  = x + self.tau * x_dot
        x_dot = x_dot + self.tau * xacc
        theta = theta + self.tau * theta_dot
        theta_dot = theta_dot + self.tau * thetaacc
        self.state = (x,x_dot,theta,theta_dot)
        done =  x < -self.x_threshold \
                or x > self.x_threshold \
                or theta < -self.theta_threshold_radians \
                or theta > self.theta_threshold_radians
        done = bool(done)

        if not done:
            # destination is at x = self._destination (2).
            d = np.abs(x - self._destination) # distance to destination
            # The constant -1 rewards reaching the destination quickly. Only when d is small do we care about theta and theta_dot; otherwise we only reward moving towards the destination.
            reward = -1 + np.exp(-d)*(theta) + np.exp(-d)*theta_dot - (d - d_old)
        elif self.steps_beyond_done is None:
            # Pole just fell!
            self.steps_beyond_done = 0
            reward = 1.0
        else:
            if self.steps_beyond_done == 0:
                logger.warning("You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.")
            self.steps_beyond_done += 1
            reward = 0.0

        return np.array(self.state), reward, done, {}

    def _reset(self):
        # The cart position is set to one end of the track; velocity, angle and angular velocity are set as before, to small random numbers.
        self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
        self.state[0] = -2
        self.steps_beyond_done = None
        return np.array(self.state)

    def _render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return

        screen_width = 600
        screen_height = 400

        world_width = 5
        scale = screen_width/world_width
        carty = 100 # TOP OF CART
        polewidth = 10.0
        polelen = scale * 1.0
        cartwidth = 50.0
        cartheight = 30.0

        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(screen_width, screen_height)
            l,r,t,b = -cartwidth/2, cartwidth/2, cartheight/2, -cartheight/2
            axleoffset =cartheight/4.0
            cart = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)])
            self.carttrans = rendering.Transform()
            cart.add_attr(self.carttrans)
            self.viewer.add_geom(cart)
            l,r,t,b = -polewidth/2,polewidth/2,polelen-polewidth/2,-polewidth/2
            pole = rendering.FilledPolygon([(l,b), (l,t), (r,t), (r,b)])
            pole.set_color(.8,.6,.4)
            self.poletrans = rendering.Transform(translation=(0, axleoffset))
            pole.add_attr(self.poletrans)
            pole.add_attr(self.carttrans)
            self.viewer.add_geom(pole)
            self.axle = rendering.make_circle(polewidth/2)
            self.axle.add_attr(self.poletrans)
            self.axle.add_attr(self.carttrans)
            self.axle.set_color(.5,.5,.8)
            self.viewer.add_geom(self.axle)
            self.track = rendering.Line((0,carty), (screen_width,carty))
            self.track.set_color(0,0,0)
            self.viewer.add_geom(self.track)

        if self.state is None: return None

        x = self.state
        cartx = x[0]*scale+screen_width/2.0 # MIDDLE OF CART
        self.carttrans.set_translation(cartx, carty)
        self.poletrans.set_rotation(-x[2])

        return self.viewer.render(return_rgb_array = mode=='rgb_array')

And here is the code I use to train it:

    env = normalize(GymEnv("CartPoleMoving-v0", force_reset = True))

    policy = CategoricalMLPPolicy(
        env_spec=env.spec,
        # The neural network policy has two hidden layers, each with 8 hidden units.
        hidden_sizes=(8, 8)
    )

    baseline = LinearFeatureBaseline(env_spec=env.spec)

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=env.horizon,
        n_itr=50,
        discount=0.99,
        step_size=0.01,
        # Uncomment both lines (this and the plot parameter below) to enable plotting
        # plot=True,
    )
    algo.train()

Then I get the following error:

In [12]: rl_prac.run_task()
2017-03-27 17:47:46.192399 EDT | Warning: skipping Gym environment monitoring since snapshot_dir not configured.
[2017-03-27 17:47:46,192] Making new env: CartPoleMoving-v0
2017-03-27 17:47:46.196242 EDT | observation space: Box(4,)
2017-03-27 17:47:46.196631 EDT | action space: Discrete(2)
2017-03-27 17:47:46.264618 EDT | Populating workers...
2017-03-27 17:47:46.264905 EDT | Populated
0%                          100%
[                              ]---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-12-f8411f69ae72> in <module>()
----> 1 rl_prac.run_task()

/home/weiliu/projects/rl_learning/rl_prac.py in run_task(*_)
     70         # plot=True,
     71     )
---> 72     algo.train()
     73 
     74 def run():

/home/weiliu/packages/rllab/rllab/algos/batch_polopt.py in train(self)
    120         for itr in range(self.current_itr, self.n_itr):
    121             with logger.prefix('itr #%d | ' % itr):
--> 122                 paths = self.sampler.obtain_samples(itr)
    123                 samples_data = self.sampler.process_samples(itr, paths)
    124                 self.log_diagnostics(paths)

/home/weiliu/packages/rllab/rllab/algos/batch_polopt.py in obtain_samples(self, itr)
     27             max_samples=self.algo.batch_size,
     28             max_path_length=self.algo.max_path_length,
---> 29             scope=self.algo.scope,
     30         )
     31         if self.algo.whole_paths:

/home/weiliu/packages/rllab/rllab/sampler/parallel_sampler.py in sample_paths(policy_params, max_samples, max_path_length, env_params, scope)
    123         threshold=max_samples,
    124         args=(max_path_length, scope),
--> 125         show_prog_bar=True
    126     )
    127 

/home/weiliu/packages/rllab/rllab/sampler/stateful_pool.py in run_collect(self, collect_once, threshold, args, show_prog_bar)
    148                 pbar = ProgBarCounter(threshold)
    149             while count < threshold:
--> 150                 result, inc = collect_once(self.G, *args)
    151                 results.append(result)
    152                 count += inc

/home/weiliu/packages/rllab/rllab/sampler/parallel_sampler.py in _worker_collect_one_path(G, max_path_length, scope)
     92 def _worker_collect_one_path(G, max_path_length, scope=None):
     93     G = _get_scoped_G(G, scope)
---> 94     path = rollout(G.env, G.policy, max_path_length)
     95     return path, len(path["rewards"])
     96 

/home/weiliu/packages/rllab/rllab/sampler/utils.py in rollout(env, agent, max_path_length, animated, speedup, always_return_paths)
     16     if animated:
     17         env.render()
---> 18     while path_length < max_path_length:
     19         a, agent_info = agent.get_action(o)
     20         next_o, r, d, env_info = env.step(a)

TypeError: unorderable types: int() < NoneType()

It looks like max_path_length is not initialized correctly for my new env. After a closer look, I found that it is related to this line in rllab's GymEnv:
self._horizon = env.spec.tags['wrapper_config.TimeLimit.max_episode_steps']
It seems this tag is not set for the environment that I created.
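
For comparison, gym's built-in CartPole gets this value at registration time. Below is a minimal sketch of how I think I could register my environment so that the tag is populated; the entry_point module path is illustrative (not my real package layout), and 200 is an arbitrary episode length:

from gym.envs.registration import register

register(
    id='CartPoleMoving-v0',
    # Illustrative path; this should point at wherever CartPoleMovingEnv actually lives.
    entry_point='my_envs.cartpole_moving:CartPoleMovingEnv',
    # Set the tag that rllab's GymEnv reads as the horizon. I believe newer gym
    # versions also accept max_episode_steps=200 here and fill in the tag
    # themselves, but I have not verified that.
    tags={'wrapper_config.TimeLimit.max_episode_steps': 200},
)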

How do I fix this for my environment? And what do the 'tags' mean for the environments from gym?
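
In the meantime, one workaround I am considering is to bypass env.horizon entirely and pass a fixed integer to TRPO (a sketch of my training code with only that one change; 200 is an arbitrary horizon, not something rllab prescribes):

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=200,  # fixed horizon instead of env.horizon, which is None for my env
    n_itr=50,
    discount=0.99,
    step_size=0.01,
)
algo.train()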
