
Add PPO evaluation
Closes: #75. Disable gravity to match thesis results. Fix bug where
RNG seed was ignored.
wil3 committed Jun 17, 2020
1 parent 677995c commit 6b7913f
Showing 15 changed files with 686 additions and 190 deletions.
64 changes: 60 additions & 4 deletions examples/README.md
@@ -41,15 +41,71 @@ quadcopter model can be found in `gymfc_nf/twins/nf1`. To use this model with gymfc
provide the path to `model.sdf`.

# Controller examples

Two controller examples are provided: a PID controller (`pid_example.py`) and a neuro-controller trained via PPO (`ppo_example.py`).
## PID Controller

The PID controller has been tuned using the Ziegler-Nichols method. This tune
has poor performance; the method is known to cause significant overshoot and should only be used as a baseline or to validate the functionality of the environment. An interesting use of GymFC would be to apply optimization algorithms, for example genetic algorithms, to compute the PID gains.

To run the example execute,

```
python3 pid_example.py
```
This command will display a graph showing the step response of the controller.
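
As a rough illustration of the gain-optimization idea mentioned above, the sketch below runs a simple random-search loop over the PID gains. The `evaluate_gains` helper is hypothetical: you would implement it to roll out a `PidPolicy` with the candidate gains in the GymFC environment and return the mean absolute tracking error.

```
import numpy as np

def evaluate_gains(gains):
    """Hypothetical helper: run an episode with a PidPolicy configured with
    these gains and return the mean absolute rate-tracking error."""
    raise NotImplementedError

def random_search(n_iters=50, sigma=0.1, seed=0):
    rng = np.random.RandomState(seed)
    # One (Kp, Ki, Kd) triple per axis (roll, pitch, yaw); the starting point
    # is a placeholder, not the tune used by pid_example.py.
    best = np.array([[2.0, 10.0, 0.01]] * 3)
    best_cost = evaluate_gains(best)
    for _ in range(n_iters):
        candidate = best * (1.0 + sigma * rng.randn(*best.shape))
        cost = evaluate_gains(candidate)
        if cost < best_cost:
            best, best_cost = candidate, cost
    return best, best_cost
```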

## PPO Neuro Controller
The neuro-controller is synthesized via PPO to generate an optimal controller
based on the provided reward function. Neuro-controllers are significantly more
complex than traditional linear controllers and thus running them is more
involved.

To train a neuro-controller first execute the trainer,
```
python3 ppo_baselines_train.py
```
This will train a neuro-controller and by default save TensorFlow checkpoints
of the neural network every 100,000 timesteps to
`../../models/<model_name>/checkpoints`.

While the model is training we want to monitor its progress and evaluate each
checkpoint. In a separate window (tmux or equivalent is suggested), execute the monitor
and evaluator,
```
python3 tf_checkpoint_evaluate.py ../../models/<model_name>/checkpoints --num-trials=3
```
This script will monitor the checkpoints directory; when a new checkpoint is
saved it will evaluate the neural network against `--num-trials` random
setpoints and save the results to `../../models/<model_name>/evaluations`.


We can then plot some metrics of each checkpoint using,
```
python3 plot_flight_metrics.py ../../models/<model_name>/evaluations
```
and look at specific evaluation trials using,
```
python3 plot_flight.py ../../models/<model_name>/evaluations/<checkpoint name>/trial-X.csv
```

After about 2,000,000 time steps you should start to see the model converge.
Using `plot_flight_metrics.py` and other analysis, select the best checkpoint
to use in Neuroflight.

## FAQ

1. Why is my model not converging?

RL is notorious for being difficult to reproduce. Train multiple agents using different seeds and take the best one. Improvements to the reward function can help increase stability and reproducibility.

2. Why does my model have such high oscillations?

When selecting an agent, **low MAE is not everything!** By training to minimize
the error, the agent tries to constantly correct itself like an over-tuned
PID controller. The most challenging aspect of this research has been minimizing output oscillations. This has been discussed repeatedly in the referenced literature if you would like to learn more. Look for agents with minimal
changes to their output and try them in the real world to verify oscillations are
not visible. There is huge room for improvement here.
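
One way to screen for oscillation-prone agents is to measure how much the control outputs change between consecutive steps of an evaluation trial. A minimal sketch, assuming the CSV column layout used by `plot_flight.py` in this commit (control outputs in columns 17-20):

```
import numpy as np

def output_smoothness(trial_csv):
    """Mean absolute step-to-step change of the four control outputs;
    lower values indicate a smoother, less oscillatory agent."""
    data = np.loadtxt(trial_csv, delimiter=",")
    u = data[:, 17:21]  # same slice plot_flight.py uses for the outputs
    return np.abs(np.diff(u, axis=0)).mean()
```

Ranking the trials of each checkpoint by this metric alongside MAE gives a better picture than MAE alone.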

# Research challenges

18 changes: 11 additions & 7 deletions examples/gymfc_nf/envs/base.py
@@ -45,6 +45,10 @@ def __init__(self, max_sim_time = 30, state_fn = None):
# has been created the user can update this function.
self.sample_noise = lambda _: 0

# A callback made at the end of each step. It takes a single
# parameter containing the class reference
self.step_callback = None

def set_aircraft_model(self, model):
"""Set the aircraft's model.sdf
@@ -69,13 +73,6 @@ def step(self, action):
"""
self.action = action.copy()

# XXX seed must be called to initialize the RNG which means
# this can't be set in the constructor
# Set it here if the user didn't call reset first.
if not self.np_random:
seed = int(time.time()* 1e6)
self.seed(seed)

# Translate the agents output to the aircraft control signals. In this
# case our control signal is represented as a percentage. This
# function also needs to exist in the flight control firmware.
@@ -102,6 +99,9 @@

self.last_measured_error = self.measured_error.copy()
self.last_y = self.y.copy()
self.step_counter += 1
if self.step_callback:
self.step_callback(self, state, reward, done)
return state, reward, done, {}

def action_to_control_signal(self, action, action_low, action_high,
@@ -137,6 +137,10 @@ def _init(self):
self.imu_angular_velocity_rpy = np.zeros(3)
#self.imu_orientation_quat = np.array([0, 0, 0, 1])

# Keep track of the number of steps so we can determine how many steps
# occur in an episode.
self.step_counter = 0

def reset(self):
self._init()
self.obs = super().reset()
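
The new `step_callback` hook and `step_counter` make it possible to instrument an environment without subclassing it. A minimal sketch, assuming `env` is an already constructed environment derived from this base class:

```
def attach_episode_logger(env):
    """Attach a step callback to an already constructed GymFC environment."""
    def on_step(env, state, reward, done):
        # Invoked at the end of each step() with (env, state, reward, done).
        if done:
            print("episode finished after {} steps".format(env.step_counter))
    env.step_callback = on_step
```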
1 change: 1 addition & 0 deletions examples/gymfc_nf/envs/rewards.py
@@ -72,6 +72,7 @@ def compute_reward(self):
# penalty if the agent does nothing, i.e., refusing to 'play'
self.doing_nothing_penalty(),
]
self.ind_rewards = rewards

return np.sum(rewards)

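
Because `compute_reward` now stores the individual reward terms in `ind_rewards`, a step callback (see the sketch above) can log each component separately, which helps when re-balancing the reward function. A minimal sketch, assuming `env` is an instance of this reward environment:

```
def attach_reward_logger(env):
    """Print the individual reward components at every step."""
    def on_step(env, state, reward, done):
        # ind_rewards holds the per-term values summed into the scalar reward.
        print(env.sim_time, [float(r) for r in env.ind_rewards])
    env.step_callback = on_step
```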
31 changes: 16 additions & 15 deletions examples/gymfc_nf/envs/step.py
@@ -9,18 +9,19 @@

class StepEnv(RewardEnv):
def __init__(self, pulse_width = 1, max_rate = 100, state_fn = None,
max_sim_time = 1 ):
Create a reinforcement learning environment that generates step input
setpoints. Technically this is a multi-axis singlet input, the
terminology in this package needs to be updated to reflect flight test
maneuvers.

This environment was created to teach an agent how to respond to
worst-case inputs, that is, step inputs in which there is a request for
immediate change in the target angular velocity.

Start at zero deg/s to establish an initial condition and teach the
agent to idle. Sample random input and hold for pulse_width, then
return to zero deg/s to allow system to settle.
Args:
pulse_width: Number of seconds the step is held at the target
@@ -41,10 +42,11 @@ def __init__(self, pulse_width = 1, max_rate = 100, state_fn = None,
self.angular_rate_sp = np.zeros(3)
self.next_pulse_time = 0.512


def update_setpoint(self):
if self.sim_time > self.next_pulse_time:
if (self.angular_rate_sp == np.zeros(3)).all():
self.angular_rate_sp = self.generated_input
self.next_pulse_time += self.pulse_width
else:
self.angular_rate_sp = np.zeros(3)
@@ -56,14 +58,13 @@ def reset(self):
self.outputs = []
self.angular_rate_sp = np.zeros(3)
self.next_pulse_time = 0.512
# Define the singlet input in the beginning so it can be overriden
# externally if needed for testing.
self.generated_input = self.sample_target()
return super().reset()

def sample_target(self):
"""Sample a random angular velocity setpoint """
if not self.np_random:
seed = int(time.time() * 1e6)
self.seed(seed)

return self.np_random.normal(0, self.max_rate, size=3)
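
Because the singlet input is now sampled once in `reset()` and stored in `generated_input`, a test can pin the setpoint to a known value instead of a random sample. A minimal sketch, assuming `env` is a `StepEnv` instance:

```
import numpy as np

def reset_with_fixed_setpoint(env, setpoint=(100.0, 0.0, 0.0)):
    """Reset a StepEnv and override the sampled singlet input with a
    deterministic one (roll, pitch, yaw rates in deg/s)."""
    ob = env.reset()
    env.generated_input = np.array(setpoint)
    return ob
```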


3 changes: 2 additions & 1 deletion examples/gymfc_nf/policies/__init__.py
@@ -1,3 +1,4 @@

from gymfc_nf.policies.pidpolicy import PidPolicy
from gymfc_nf.policies.baselinespolicy import PpoBaselinesPolicy
__all__ = ['PidPolicy', 'PpoBaselinesPolicy']
14 changes: 14 additions & 0 deletions examples/gymfc_nf/policies/baselinespolicy.py
@@ -0,0 +1,14 @@
import numpy as np
import tensorflow as tf
from .policy import Policy
class PpoBaselinesPolicy(Policy):
def __init__(self, sess):
graph = tf.get_default_graph()
self.x = graph.get_tensor_by_name('pi/ob:0')
self.y = graph.get_tensor_by_name('pi/pol/final/BiasAdd:0')
self.sess = sess

def action(self, state, sim_time=0, desired=np.zeros(3), actual=np.zeros(3) ):

y_out = self.sess.run(self.y, feed_dict={self.x:[state] })
return y_out[0]
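
`PpoBaselinesPolicy` expects a TF1 session whose graph contains the `pi/ob:0` and `pi/pol/final/BiasAdd:0` tensors created by the OpenAI Baselines PPO policy. A minimal sketch of restoring a checkpoint and querying an action; the path and state size are placeholders:

```
import numpy as np
import tensorflow as tf

from gymfc_nf.policies import PpoBaselinesPolicy

checkpoint_dir = "../../models/<model_name>/checkpoints"  # placeholder
ckpt = tf.train.latest_checkpoint(checkpoint_dir)

with tf.Session() as sess:
    # Rebuild the trained graph and restore the weights.
    saver = tf.train.import_meta_graph(ckpt + ".meta", clear_devices=True)
    saver.restore(sess, ckpt)
    policy = PpoBaselinesPolicy(sess)

    state = np.zeros(6)  # placeholder; must match the env's state_fn output
    print(policy.action(state))
```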
26 changes: 26 additions & 0 deletions examples/gymfc_nf/utils/log.py
@@ -0,0 +1,26 @@

def make_header(ob_size):
"""Make the log header.
This needs to be done dynamically because the observations used as input
to the NN may differ.
"""
entries = []
entries.append("t")
for i in range(ob_size):
entries.append("ob{}".format(i))
for i in range(4):
entries.append("ac{}".format(i))
entries.append("p") # roll rate
entries.append("q") # pitch rate
entries.append("r") # yaw rate
entries.append("p-sp") # roll rate setpoint
entries.append("q-sp") # pitch rate setpoint
entries.append("r-sp") # yaw rate setpoint
for i in range(4):
entries.append("y{}".format(i))
for i in range(4):
entries.append("w{}".format(i)) # ESC rpms
entries.append("reward")

return ",".join(entries)
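
A minimal sketch of pairing `make_header` with a per-step log row; the values are placeholders and the column order must match the header (the 11:14 and 17:21 slices read by `plot_flight.py` imply six observation entries):

```
import numpy as np

from gymfc_nf.utils.log import make_header

ob_size = 6  # placeholder; six matches the column layout plot_flight.py reads
with open("trial-0.csv", "w") as f:
    f.write(make_header(ob_size) + "\n")
    # One row per step: t, ob*, ac*, p, q, r, p/q/r-sp, y*, w*, reward.
    row = np.concatenate(([0.001], np.zeros(ob_size), np.zeros(4),
                          np.zeros(3), np.zeros(3), np.zeros(4),
                          np.zeros(4), [0.0]))
    f.write(",".join(map(str, row)) + "\n")
```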
63 changes: 63 additions & 0 deletions examples/gymfc_nf/utils/monitor.py
@@ -0,0 +1,63 @@
import tensorflow as tf
import os.path
import time


class CheckpointMonitor:
"""Helper class to monitor the Tensorflow checkpoints and call a callback
when a new checkpoint has been created."""

def __init__(self, checkpoint_dir, callback):
"""
Args:
checkpoint_dir: Directory to monitor where new checkpoint
directories will be created
callback: A callback for when a new checkpoint is created.
"""
self.checkpoint_dir = checkpoint_dir
self.callback = callback
# Track which checkpoints have already been called.
self.processed = []

self.watching = True

def _check_new_checkpoint(self):
"""Update the queue with newly found checkpoints.
When a checkpoint directory is created a 'checkpoint' file is created
containing a list of all the checkpoints. We can monitor this file to
determine when new checkpoints have been created.
"""
# TODO (wfk) check if there is a way to get a callback when a file has
# changed.

ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir)
for path in ckpt.all_model_checkpoint_paths:
checkpoint_filename = os.path.split(path)[-1]
if tf.train.checkpoint_exists(path):
# Make sure there is a checkpoint meta file before allowing it
# to be processed
meta_file = path + ".meta"
if os.path.isfile(meta_file):
if (checkpoint_filename not in self.processed):
self.callback(path)
self.processed.append(checkpoint_filename)
else:
print ("Meta file {} doesn't exist.".format(meta_file))

def start(self):

# Sit and wait until the checkpoint directory is created, otherwise we
# can't monitor it. If it never gets created this could be an indicator
# something is wrong with the trainer.
c=0
while not os.path.isdir(self.checkpoint_dir):
print("[WARN {}] Directory {} doesn't exist yet, waiting until "
"created...".format(c, self.checkpoint_dir))
time.sleep(30)
c+=1

while self.watching:
self._check_new_checkpoint()
time.sleep(10)
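
A minimal sketch of wiring `CheckpointMonitor` to a callback; the callback receives the path of each newly written checkpoint, which an evaluator could restore and fly against random setpoints:

```
from gymfc_nf.utils.monitor import CheckpointMonitor

def evaluate_checkpoint(checkpoint_path):
    # Placeholder: restore the checkpoint, run the evaluation trials, and
    # write the results under the evaluations directory.
    print("new checkpoint ready: {}".format(checkpoint_path))

monitor = CheckpointMonitor("../../models/<model_name>/checkpoints",
                            evaluate_checkpoint)
monitor.start()  # blocks, polling for new checkpoints every 10 seconds
```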

33 changes: 33 additions & 0 deletions examples/plot_flight.py
@@ -0,0 +1,33 @@
import argparse
import numpy as np
import matplotlib.pyplot as plt

from gymfc.tools.plot import *

if __name__ == '__main__':
parser = argparse.ArgumentParser("Plot recorded flight data.")
parser.add_argument("log_file", help="Log file.")
parser.add_argument("--title", help="Title for the plot.",
default="Aircraft Response")
args = parser.parse_args()

fdata = np.loadtxt(args.log_file, delimiter=",")

# Plot the response
f, ax = plt.subplots(5, sharex=True, sharey=False)
plt.suptitle(args.title)
plt.setp([a.get_xticklabels() for a in f.axes[:-1]], visible=False)
t = fdata[:, 0]
pqr = fdata[:, 11:14]
pqr_sp = fdata[:, 14:17]
plot_rates(ax[:3], t, pqr_sp, pqr)

us = fdata[:, 17:21]
plot_u(ax[3], t, us)

rpms = fdata[:, 21:25]
plot_motor_rpms(ax[4], t, rpms)

ax[-1].set_xlabel("Time (s)")
plt.show()


