I wrote a gym environment similar to gym's CartPole, except that I want to move the cart from one location to another while still keeping the pole balanced. However, my new environment raises an error when I train it with the TRPO algorithm. The environment looks fine when I simulate it, and the error message seems to point to max_path_length not being assigned somehow.
Here is my environment definition:
import logging
import math

import gym
from gym import spaces
from gym.utils import seeding
import numpy as np

logger = logging.getLogger(__name__)


class CartPoleMovingEnv(gym.Env):
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 50
    }

    def __init__(self):
        self.gravity = 9.8
        self.masscart = 1.0
        self.masspole = 0.1
        self.total_mass = (self.masspole + self.masscart)
        self.length = 0.5  # actually half the pole's length
        self.polemass_length = (self.masspole * self.length)
        self.force_mag = 10.0
        self.tau = 0.02  # seconds between state updates
        self._destination = 2

        # Angle at which to fail the episode
        self.theta_threshold_radians = 30 * 2 * math.pi / 360
        self.x_threshold = 4

        # Angle limit set to 2 * theta_threshold_radians so failing observation is still within bounds
        high = np.array([
            5,
            np.finfo(np.float32).max,
            self.theta_threshold_radians * 2,
            np.finfo(np.float32).max])

        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Box(-high, high)

        self._seed()
        self.viewer = None
        self.state = None
        self.steps_beyond_done = None

    def _seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _step(self, action):
        assert self.action_space.contains(action), "%r (%s) invalid" % (action, type(action))
        state = self.state
        x, x_dot, theta, theta_dot = state
        force = self.force_mag if action == 1 else -self.force_mag
        costheta = math.cos(theta)
        sintheta = math.sin(theta)
        temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / (self.length * (4.0/3.0 - self.masspole * costheta * costheta / self.total_mass))
        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass
        d_old = np.abs(x - self._destination)
        x = x + self.tau * x_dot
        x_dot = x_dot + self.tau * xacc
        theta = theta + self.tau * theta_dot
        theta_dot = theta_dot + self.tau * thetaacc
        self.state = (x, x_dot, theta, theta_dot)
        done = x < -self.x_threshold \
            or x > self.x_threshold \
            or theta < -self.theta_threshold_radians \
            or theta > self.theta_threshold_radians
        done = bool(done)

        if not done:
            # destination is at x = self._destination (2).
            d = np.abs(x - self._destination)  # distance to destination
            # Constant -1 to reward fast moving. Only when d is small do we care about theta and
            # theta_dot; otherwise only reward moving towards the destination.
            reward = -1 + np.exp(-d) * theta + np.exp(-d) * theta_dot - (d - d_old)
        elif self.steps_beyond_done is None:
            # Pole just fell!
            self.steps_beyond_done = 0
            reward = 1.0
        else:
            if self.steps_beyond_done == 0:
                logger.warning("You are calling 'step()' even though this environment has already returned done = True. You should always call 'reset()' once you receive 'done = True' -- any further steps are undefined behavior.")
            self.steps_beyond_done += 1
            reward = 0.0

        return np.array(self.state), reward, done, {}

    def _reset(self):
        # Cart position is set to one end of the track; velocity, angle and angular velocity are
        # set as before, to small random numbers.
        self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,))
        self.state[0] = -2
        self.steps_beyond_done = None
        return np.array(self.state)

    def _render(self, mode='human', close=False):
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return

        screen_width = 600
        screen_height = 400
        world_width = 5
        scale = screen_width / world_width
        carty = 100  # TOP OF CART
        polewidth = 10.0
        polelen = scale * 1.0
        cartwidth = 50.0
        cartheight = 30.0

        if self.viewer is None:
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(screen_width, screen_height)
            l, r, t, b = -cartwidth/2, cartwidth/2, cartheight/2, -cartheight/2
            axleoffset = cartheight/4.0
            cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
            self.carttrans = rendering.Transform()
            cart.add_attr(self.carttrans)
            self.viewer.add_geom(cart)
            l, r, t, b = -polewidth/2, polewidth/2, polelen - polewidth/2, -polewidth/2
            pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
            pole.set_color(.8, .6, .4)
            self.poletrans = rendering.Transform(translation=(0, axleoffset))
            pole.add_attr(self.poletrans)
            pole.add_attr(self.carttrans)
            self.viewer.add_geom(pole)
            self.axle = rendering.make_circle(polewidth/2)
            self.axle.add_attr(self.poletrans)
            self.axle.add_attr(self.carttrans)
            self.axle.set_color(.5, .5, .8)
            self.viewer.add_geom(self.axle)
            self.track = rendering.Line((0, carty), (screen_width, carty))
            self.track.set_color(0, 0, 0)
            self.viewer.add_geom(self.track)

        if self.state is None:
            return None

        x = self.state
        cartx = x[0] * scale + screen_width / 2.0  # MIDDLE OF CART
        self.carttrans.set_translation(cartx, carty)
        self.poletrans.set_rotation(-x[2])
        return self.viewer.render(return_rgb_array=(mode == 'rgb_array'))
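To check the dynamics I just simulate it by hand; a minimal smoke-test loop along these lines runs without errors (assuming a gym version where gym.Env dispatches reset()/step() to the _reset()/_step() methods above):

env = CartPoleMovingEnv()
obs = env.reset()
for _ in range(200):
    # Take random actions just to confirm step() returns sensible values.
    obs, reward, done, info = env.step(env.action_space.sample())
    if done:
        obs = env.reset()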
And here is the code I use to train it:
# rllab imports (module paths as in the rllab examples)
from rllab.algos.trpo import TRPO
from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
from rllab.envs.gym_env import GymEnv
from rllab.envs.normalized_env import normalize
from rllab.policies.categorical_mlp_policy import CategoricalMLPPolicy

env = normalize(GymEnv("CartPoleMoving-v0", force_reset=True))

policy = CategoricalMLPPolicy(
    env_spec=env.spec,
    # The neural network policy should have two hidden layers, each with 8 hidden units.
    hidden_sizes=(8, 8)
)

baseline = LinearFeatureBaseline(env_spec=env.spec)

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=env.horizon,
    n_itr=50,
    discount=0.99,
    step_size=0.01,
    # Uncomment both lines (this and the plot parameter below) to enable plotting
    # plot=True,
)
algo.train()
It looks like, for my new env, max_path_length is not initialized correctly. After a careful look, I found that it is related to this line: self._horizon = env.spec.tags['wrapper_config.TimeLimit.max_episode_steps']. It seems this tag is not set for the environment that I created.
How do I fix this for my environment? And what do the 'tags' mean for the environments that come with gym?
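Is something like the following what is expected, i.e. registering the environment with an explicit episode-step limit so that the tag gets filled in? (Just a sketch; the entry_point module path is a placeholder for wherever CartPoleMovingEnv actually lives.)

from gym.envs.registration import register

register(
    id='CartPoleMoving-v0',
    entry_point='my_envs.cartpole_moving:CartPoleMovingEnv',  # placeholder module path
    max_episode_steps=200,  # should end up in spec.tags['wrapper_config.TimeLimit.max_episode_steps']
)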
Silly me, I just needed to set max_path_length = 200 in the definition of TRPO, which quickly fixed it. At least I got my env running, though I still don't know why the horizon is not set correctly.
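For reference, the workaround is simply passing the path length explicitly instead of relying on env.horizon, roughly:

algo = TRPO(
    env=env,
    policy=policy,
    baseline=baseline,
    batch_size=4000,
    max_path_length=200,  # set explicitly instead of max_path_length=env.horizon
    n_itr=50,
    discount=0.99,
    step_size=0.01,
)
algo.train()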
Hi @weiliu620
Could you please explain to me - is the env.spec.tags['wrapper_config.TimeLimit.max_episode_steps'] property the one that is responsible for the max episode length? And is the default 200?