From 1be76e536f40b434bb7dd8849843ecd547f458de Mon Sep 17 00:00:00 2001 From: Wil Koch Date: Sun, 14 Jun 2020 19:02:22 -0400 Subject: [PATCH] Add PPO evaluation Closes: #75. Disable gravity to match thesis results. Fix bug where RNG seed was ignored. --- examples/README.md | 64 ++++- examples/gymfc_nf/envs/base.py | 18 +- examples/gymfc_nf/envs/rewards.py | 1 + examples/gymfc_nf/envs/step.py | 31 ++- examples/gymfc_nf/policies/__init__.py | 3 +- examples/gymfc_nf/policies/baselinespolicy.py | 14 + examples/gymfc_nf/utils/log.py | 26 ++ examples/gymfc_nf/utils/monitor.py | 63 +++++ examples/plot_flight.py | 33 +++ examples/plot_flight_metrics.py | 63 +++++ examples/ppo_baselines_train.py | 256 ++++++++++++++++++ examples/ppo_example.py | 127 --------- examples/tf_checkpoint_evaluate.py | 119 ++++++++ .../envs/assets/gazebo/worlds/attitude.world | 2 + gymfc/tools/plot.py | 56 ++-- 15 files changed, 686 insertions(+), 190 deletions(-) create mode 100644 examples/gymfc_nf/policies/baselinespolicy.py create mode 100644 examples/gymfc_nf/utils/log.py create mode 100644 examples/gymfc_nf/utils/monitor.py create mode 100644 examples/plot_flight.py create mode 100644 examples/plot_flight_metrics.py create mode 100644 examples/ppo_baselines_train.py delete mode 100644 examples/ppo_example.py create mode 100644 examples/tf_checkpoint_evaluate.py diff --git a/examples/README.md b/examples/README.md index 85a1234..9a99407 100644 --- a/examples/README.md +++ b/examples/README.md @@ -41,15 +41,71 @@ quadcopter model can be found in `gymfc_nf/twins/nf1`. To use this model with gy provide the path to `model.sdf`. # Controller examples +Two controller examples are provided, a PID controller (`pid_example.py`) and a neuro-controller trained via PPO. -Two controller examples are provided, a PID controller (`pid_example.py`) and a neuro-controller trained via PPO (`ppo_example.py`). +## PID Controller The PID controller has been tuned using the Ziegler-Nichols method. This tune -has poor performance, this method is known to cause significant overshoot and should only be used as a baseline or to validate functionality of the environment. +has poor performance, this method is known to cause significant overshoot and should only be used as a baseline or to validate functionality of the environment. An interesting use of GymFC would be to use optimization algorithms to compute the PID gains for example genetic algorithms. +To run the example execute, + +``` +python3 pid_example.py +``` +This comman will display a graph showing the step response of the controller. + +## PPO Neuro Controller The neuro-controller is synthesized via PPO to generate an optimal controller -based on the provided reward function. During training a checkpoint directory -is created containing snapshots of the neuro-controller. Typically you would want to monitor the checkpoints directory and evaluate new checkpoints on the fly so you can track training performance. +based on the provided reward function. Neuro-controllers are significantly more +complex than traditional linear controllers and thus running them is more +involved. + +To train a neuro-controller first execute the trainer, +``` +python3 ppo_baselines_train.py +``` +This will train a neuro-controller and by default save Tensorflow checkpoints +of the neural network every 100,000 timesteps to +`../../models//checkpoints`. + +While we are training we would like to monitor its progress and evaluate each +checkpoint. 
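+
+Before leaving the trainer running for hours it is worth a quick smoke test:
+the trainer's docstring recommends temporarily passing small values for the
+timestep and checkpoint-frequency flags (the values below are only
+illustrative),
+```
+python3 ppo_baselines_train.py --timesteps=20000 --ckpt-freq=5000
+```
+Once everything looks healthy, start the full run and monitor it as follows.
+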
In a separate window (suggest using tmux or equivalent), execute the monitor +and evaluator, +``` +python3 tf_checkpoint_evaluate.py ../../models//checkpoints --num-trials=3 +``` +This script will monitor the checkpoints directory and when a new checkpoint is +saved it will evaluate the neural network against num-trials number of random +setpoints and save the results to `../../models//evaluations` + + +We can then plot some metrics of each checkpoint using, +``` +python plot_flight_metrics.py ../../models//evaluations +``` +and look at specific evaluation trials using, +``` +python3 plot_flight.py ../../models//evaluations//trial-X.csv +``` + +After about 2,000,000 time steps you should start to see the model converge. +Using `plot_flight_metrics.py` and other analysis, select the best checkpoint +to use in Neuroflight. + +## FAQ + +1. Why is my model is not converging? + +RL is notorious for being difficult to reproduce. Train multiple agents using different seeds and take the best one. Improvements to the reward function can help increase stability and reproducibility. + +2. Why does my model have such high oscillations? + +When selecting an agent **low MAE is not everything!** By training to minimize +the error the agent is trying to constantly correct it self like an over tuned +PID controller. The most challenging aspect of this research has been minimizing output oscillations. This has been discussion repeatedly in the reference literature if you like to learn more. Look for agents with minimal +changes to their output and try it in the real world to verify oscillations are +not visible. There is huge room for improvement here. # Research challenges diff --git a/examples/gymfc_nf/envs/base.py b/examples/gymfc_nf/envs/base.py index 0916069..ebe2a8f 100644 --- a/examples/gymfc_nf/envs/base.py +++ b/examples/gymfc_nf/envs/base.py @@ -45,6 +45,10 @@ def __init__(self, max_sim_time = 30, state_fn = None): # has been created the user can update this function. self.sample_noise = lambda _: 0 + # A callback made at the end of each step. It takes a single + # parameter containing the class reference + self.step_callback = None + def set_aircraft_model(self, model): """Set the aircraft's model.sdf @@ -69,13 +73,6 @@ def step(self, action): """ self.action = action.copy() - # XXX seed must be called to initialize the RNG which means - # this can't be set in the constructor - # Set it here if the user didn't call reset first. - if not self.np_random: - seed = int(time.time()* 1e6) - self.seed(seed) - # Translate the agents output to the aircraft control signals. In this # case our control signal is represented as a percentage. This # function also needs to exist in the flight control firmware. @@ -102,6 +99,9 @@ def step(self, action): self.last_measured_error = self.measured_error.copy() self.last_y = self.y.copy() + self.step_counter += 1 + if self.step_callback: + self.step_callback(self, state, reward, done) return state, reward, done, {} def action_to_control_signal(self, action, action_low, action_high, @@ -137,6 +137,10 @@ def _init(self): self.imu_angular_velocity_rpy = np.zeros(3) #self.imu_orientation_quat = np.array([0, 0, 0, 1]) + # Keep track of the number of steps so we can determine how many steps + # occur in an episode. 
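+        # The counter is incremented once per step() call and read by training
+        # callbacks (e.g. StepCallback in ppo_baselines_train.py) when an
+        # episode finishes.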
+ self.step_counter = 0 + def reset(self): self._init() self.obs = super().reset() diff --git a/examples/gymfc_nf/envs/rewards.py b/examples/gymfc_nf/envs/rewards.py index d4645eb..365b07f 100644 --- a/examples/gymfc_nf/envs/rewards.py +++ b/examples/gymfc_nf/envs/rewards.py @@ -72,6 +72,7 @@ def compute_reward(self): # penalty if the agent does nothing, i.e., refusing to 'play' self.doing_nothing_penalty(), ] + self.ind_rewards = rewards return np.sum(rewards) diff --git a/examples/gymfc_nf/envs/step.py b/examples/gymfc_nf/envs/step.py index 95ed565..f1ca04c 100644 --- a/examples/gymfc_nf/envs/step.py +++ b/examples/gymfc_nf/envs/step.py @@ -9,18 +9,19 @@ class StepEnv(RewardEnv): def __init__(self, pulse_width = 1, max_rate = 100, state_fn = None, - max_sim_time = 1 ): + max_sim_time = 1 ): """Create a reinforcement learning environment that generates step input - setpoints. - - This environment was created to teach an agent how to respond to - worst-case inputs, that is, step inputs in which there is a request for - immediate change in the target angular velocity. + setpoints. Technically this is a multi-axis singlet input, the + terminology in this package needs to be updated to reflect flight test + maneuvers. - Start at zero deg/s to - establish an initial condition and teach the agent to idle. Sample - random input and hold for pulse_width, then return to zero deg/s to - allow system to settle. + This environment was created to teach an agent how to respond to + worst-case inputs, that is, step inputs in which there is a request for + immediate change in the target angular velocity. + + Start at zero deg/s to establish an initial condition and teach the + agent to idle. Sample random input and hold for pulse_width, then + return to zero deg/s to allow system to settle. Args: pulse_width: Number of seconds the step is held at the target @@ -41,10 +42,11 @@ def __init__(self, pulse_width = 1, max_rate = 100, state_fn = None, self.angular_rate_sp = np.zeros(3) self.next_pulse_time = 0.512 + def update_setpoint(self): if self.sim_time > self.next_pulse_time: if (self.angular_rate_sp == np.zeros(3)).all(): - self.angular_rate_sp = self.sample_target() + self.angular_rate_sp = self.generated_input self.next_pulse_time += self.pulse_width else: self.angular_rate_sp = np.zeros(3) @@ -56,14 +58,13 @@ def reset(self): self.outputs = [] self.angular_rate_sp = np.zeros(3) self.next_pulse_time = 0.512 + # Define the singlet input in the beginning so it can be overriden + # externally if needed for testing. 
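+        # (tf_checkpoint_evaluate.py does exactly that: it assigns
+        # env.generated_input right after reset() so every checkpoint is
+        # evaluated against the same setpoints.)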
+ self.generated_input = self.sample_target() return super().reset() def sample_target(self): """Sample a random angular velocity setpoint """ - if not self.np_random: - seed = int(time.time() * 1e6) - self.seed(seed) - return self.np_random.normal(0, self.max_rate, size=3) diff --git a/examples/gymfc_nf/policies/__init__.py b/examples/gymfc_nf/policies/__init__.py index 38a545a..342ff2a 100644 --- a/examples/gymfc_nf/policies/__init__.py +++ b/examples/gymfc_nf/policies/__init__.py @@ -1,3 +1,4 @@ from gymfc_nf.policies.pidpolicy import PidPolicy -__all__ = ['PidPolicy'] +from gymfc_nf.policies.baselinespolicy import PpoBaselinesPolicy +__all__ = ['PidPolicy', 'PpoBaselinesPolicy'] diff --git a/examples/gymfc_nf/policies/baselinespolicy.py b/examples/gymfc_nf/policies/baselinespolicy.py new file mode 100644 index 0000000..3084f80 --- /dev/null +++ b/examples/gymfc_nf/policies/baselinespolicy.py @@ -0,0 +1,14 @@ +import numpy as np +import tensorflow as tf +from .policy import Policy +class PpoBaselinesPolicy(Policy): + def __init__(self, sess): + graph = tf.get_default_graph() + self.x = graph.get_tensor_by_name('pi/ob:0') + self.y = graph.get_tensor_by_name('pi/pol/final/BiasAdd:0') + self.sess = sess + + def action(self, state, sim_time=0, desired=np.zeros(3), actual=np.zeros(3) ): + + y_out = self.sess.run(self.y, feed_dict={self.x:[state] }) + return y_out[0] diff --git a/examples/gymfc_nf/utils/log.py b/examples/gymfc_nf/utils/log.py new file mode 100644 index 0000000..1c896bd --- /dev/null +++ b/examples/gymfc_nf/utils/log.py @@ -0,0 +1,26 @@ + +def make_header(ob_size): + """Make the log header. + + This needs to be done dynamically because the observations used as input + to the NN may differ. + """ + entries = [] + entries.append("t") + for i in range(ob_size): + entries.append("ob{}".format(i)) + for i in range(4): + entries.append("ac{}".format(i)) + entries.append("p") # roll rate + entries.append("q") # pitch rate + entries.append("r") # yaw rate + entries.append("p-sp") # roll rate setpoint + entries.append("q-sp") # pitch rate setpoint + entries.append("r-sp") # yaw rate setpoint + for i in range(4): + entries.append("y{}".format(i)) + for i in range(4): + entries.append("w{}".format(i)) # ESC rpms + entries.append("reward") + + return ",".join(entries) diff --git a/examples/gymfc_nf/utils/monitor.py b/examples/gymfc_nf/utils/monitor.py new file mode 100644 index 0000000..64feaa9 --- /dev/null +++ b/examples/gymfc_nf/utils/monitor.py @@ -0,0 +1,63 @@ +import tensorflow as tf +import os.path +import time + + +class CheckpointMonitor: + """Helper class to monitor the Tensorflow checkpoints and call a callback + when a new checkpoint has been created.""" + + def __init__(self, checkpoint_dir, callback): + """ + Args: + checkpoint_dir: Directory to monitor where new checkpoint + directories will be created + callback: A callback for when a new checkpoint is created. + """ + self.checkpoint_dir = checkpoint_dir + self.callback = callback + # Track which checkpoints have already been called. + self.processed = [] + + self.watching = True + + def _check_new_checkpoint(self): + """Update the queue with newly found checkpoints. + + When a checkpoint directory is created a 'checkpoint' file is created + containing a list of all the checkpoints. We can monitor this file to + determine when new checkpoints have been created. + """ + # TODO (wfk) check if there is a way to get a callback when a file has + # changed. 
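+        # Until then, poll tf.train.get_checkpoint_state(), which parses that
+        # 'checkpoint' state file on each pass.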
+ + ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir) + for path in ckpt.all_model_checkpoint_paths: + checkpoint_filename = os.path.split(path)[-1] + if tf.train.checkpoint_exists(path): + # Make sure there is a checkpoint meta file before allowing it + # to be processed + meta_file = path + ".meta" + if os.path.isfile(meta_file): + if (checkpoint_filename not in self.processed): + self.callback(path) + self.processed.append(checkpoint_filename) + else: + print ("Meta file {} doesn't exist.".format(meta_file)) + + def start(self): + + # Sit and wait until the checkpoint directory is created, otherwise we + # can't monitor it. If it never gets created this could be an indicator + # something is wrong with the trainer. + c=0 + while not os.path.isdir(self.checkpoint_dir): + print("[WARN {}] Directory {} doesn't exist yet, waiting until " + "created...".format(c, self.checkpoint_dir)) + time.sleep(30) + c+=1 + + while self.watching: + self._check_new_checkpoint() + time.sleep(10) + diff --git a/examples/plot_flight.py b/examples/plot_flight.py new file mode 100644 index 0000000..80b4947 --- /dev/null +++ b/examples/plot_flight.py @@ -0,0 +1,33 @@ +import argparse +import numpy as np +import matplotlib.pyplot as plt + +from gymfc.tools.plot import * + +if __name__ == '__main__': + parser = argparse.ArgumentParser("Plot recorded flight data.") + parser.add_argument("log_file", help="Log file.") + parser.add_argument("--title", help="Title for the plot.", + default="Aircraft Response") + args = parser.parse_args() + + fdata = np.loadtxt(args.log_file, delimiter=",") + + # Plot the response + f, ax = plt.subplots(5, sharex=True, sharey=False) + plt.suptitle(args.title) + plt.setp([a.get_xticklabels() for a in f.axes[:-1]], visible=False) + t = fdata[:, 0] + pqr = fdata[:, 11:14] + pqr_sp = fdata[:, 14:17] + plot_rates(ax[:3], t, pqr_sp, pqr) + + us = fdata[:, 17:21] + plot_u(ax[3], t, us) + + rpms = fdata[:, 21:25] + plot_motor_rpms(ax[4], t, rpms) + + ax[-1].set_xlabel("Time (s)") + plt.show() + diff --git a/examples/plot_flight_metrics.py b/examples/plot_flight_metrics.py new file mode 100644 index 0000000..8651241 --- /dev/null +++ b/examples/plot_flight_metrics.py @@ -0,0 +1,63 @@ +import argparse +import glob +import os.path +import numpy as np +import matplotlib.pyplot as plt +def get_immediate_subdirectories(a_dir): + return [name for name in os.listdir(a_dir) + if os.path.isdir(os.path.join(a_dir, name))] + +def compute_checkpoint_evaluation_metrics(ckpt): + es = [] + rs = [] + dac = [] + us = [] + for trial in glob.glob(ckpt + "/*.csv"): + fdata = np.loadtxt(trial, delimiter=",") + pqr = fdata[:, 11:14] + pqr_sp = fdata[:, 14:17] + e = np.abs(pqr - pqr_sp) + es.append(e) + rs.append(fdata[:, -1]) + ac = fdata[:, 7:11] + dac.append(np.abs(np.diff(ac, axis=0))) + + u = fdata[:, 17:21] + us.append(u) + + + return [np.mean(es), np.mean(dac), np.mean(us), np.mean(rs)] + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Monitor and evaluate Tensorflow checkpoints.") + parser.add_argument('eval_dir', + help="Directory where evaluation logs are saved.") + args = parser.parse_args() + + ckpts = glob.glob(args.eval_dir + "/*/") + + evaluation_metrics = [] + num_ckpts = 0 + ckpts.sort(key=os.path.getmtime) + for ckpt in ckpts: + metrics = compute_checkpoint_evaluation_metrics(ckpt) + evaluation_metrics.append(metrics) + num_ckpts += 1 + + evaluation_metrics = np.array(evaluation_metrics) + + print ("Num=", num_ckpts) + x = range(num_ckpts) + + labels = ["MAE", "Ave. u", "Ave. 
delta action", "Ave. reward"] + f, ax = plt.subplots(len(labels), sharex=True, sharey=False) + for i in range(evaluation_metrics.shape[1]): + ax[i].plot(x, evaluation_metrics[:, i]) + ax[i].set_ylabel(labels[i]) + ax[i].grid() + + ax[1].hlines(0.12, 0, num_ckpts, label="idle") + ax[1].legend() + + ax[-1].set_xlabel("Checkpoint") + plt.show() diff --git a/examples/ppo_baselines_train.py b/examples/ppo_baselines_train.py new file mode 100644 index 0000000..8d7eadc --- /dev/null +++ b/examples/ppo_baselines_train.py @@ -0,0 +1,256 @@ +"""Train a neuro-flight controller attitude control using the PPO RL +algorithm. + +It is recommended before training for an extended period of time to set the +ckpt_freq and timestep arguments to small numbers and test everything is +functioning as expected. + +The Tensorflow input tensor is named 'pi/ob:0', and the output tensor is named +'pi/pol/final/BiasAdd:0'. This can be used to extract the subgraph and evaluate +the graph in pb format which is later needed by Neuroflight. An example of this +in Python is in gymfc_nf.policies.PpoBaselinesPolicy + +""" +from gymfc_nf.envs import * +import os.path +import time +import datetime +import subprocess +import numpy as np +np.seterr('ignore') +import gym +import argparse +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' +import tensorflow as tf + +tf.logging.set_verbosity(tf.logging.ERROR) +def get_commit_hash(): + out = subprocess.run(["git", "describe", "--always"], stdout=subprocess.PIPE, encoding="utf-8") + commit = out.stdout.strip() + return commit + +def get_training_name(): + now = datetime.datetime.now() + timestamp = now.strftime('%Y%m%d-%H%M%S') + return "baselines_" + get_commit_hash() + "_" + timestamp + + +def train(env, num_timesteps, seed, ckpt_dir=None, + render=False, ckpt_freq=0, restore_dir=None, optim_stepsize=3e-4, + schedule="linear", gamma=0.99, optim_epochs=10, optim_batchsize=64, + horizon=2048): + + from baselines.common.fc_learning_utils import FlightLog + from mpi4py import MPI + from baselines import logger + from baselines.ppo1.mlp_policy import MlpPolicy + from baselines.common import set_global_seeds + from baselines.ppo1 import pposgd_simple + import baselines.common.tf_util as U + sess = U.single_threaded_session() + sess.__enter__() + + rank = MPI.COMM_WORLD.Get_rank() + if rank == 0: + logger.configure() + else: + logger.configure(format_strs=[]) + logger.set_level(logger.DISABLED) + workerseed = seed + 1000000 * rank + def policy_fn(name, ob_space, ac_space): + return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, + hid_size=32, num_hid_layers=2) + if render: + env.render() + env.seed(workerseed) + set_global_seeds(workerseed) + pposgd_simple.learn(env, policy_fn, + max_timesteps=num_timesteps, + timesteps_per_actorbatch=horizon, + clip_param=0.2, entcoeff=0.0, + optim_epochs=optim_epochs, + optim_stepsize=optim_stepsize, + optim_batchsize=optim_batchsize, + gamma=0.99, lam=0.95, schedule=schedule, + flight_log = None, + ckpt_dir = ckpt_dir, + restore_dir = restore_dir, + save_timestep_period= ckpt_freq + ) + env.close() + + +class StepCallback: + + def __init__(self, total_timesteps, log_freq=1): + """ + Args: + total_timesteps: Total timesteps for training + log_freq: Number of episodes until an update log message is printed + """ + self.timesteps = total_timesteps + self.steps_taken = 0 + self.es = [] + self.sps = [] + self.ep = 1 + self.rewards = [] + self.log_freq = log_freq + self.log_header = ["Ep", + "Done", + "Steps", + + "r", + "-ydelta", + "+ymin", + "+/-e", + 
"-ahigh", + "-nothing", + + "score", + + "pMAE", + "qMAE", + "rMAE"] + + header_format = ["{:<5}", + "{:<7}", + "{:<15}", + + "{:<15}", + "{:<15}", + "{:<15}", + "{:<15}", + "{:<15}", + "{:<15}", + + "{:<10}", + + "{:<7}", + "{:<7}", + "{:<7}"] + self.header_format = "".join(header_format) + + log_format_entries = ["{:<5}", + "{:<7.0%}", + "{:<15}", + + "{:<15.0f}", + "{:<15.0f}", + "{:<15.0f}", + "{:<15.0f}", + "{:<15.0f}", + "{:<15.0f}", + + "{:<10.2f}", + + "{:<7.0f}", + "{:<7.0f}", + "{:<7.0f}"] + + self.log_format = "".join(log_format_entries) + + def callback(self, local, state, reward, done): + + self.es.append(local.true_error) + self.sps.append(local.angular_rate_sp) + + assert local.ind_rewards[0] <= 0 # oscillation penalty + assert local.ind_rewards[1] >= 0 # min output reward + assert local.ind_rewards[3] <= 0 # over saturation penalty + assert local.ind_rewards[4] <= 0 # do nothing penalty + + self.rewards.append(local.ind_rewards) + + if done: + if self.ep == 1: + print(self.header_format.format(*self.log_header)) + # XXX (wfk) Try this new score, we need something normalized to handle the + # random setpoints. Scale by the setpoint, larger setpoints incur + # more error. +1 prevents divide by zero + mae = np.mean(np.abs(self.es)) + mae_pqr = np.mean(np.abs(self.es), axis=0) + e_score = mae / (1 + np.mean(np.abs(self.sps))) + self.steps_taken += local.step_counter + + if self.ep % self.log_freq == 0: + ave_ind_rewards = np.mean(self.rewards, axis=0) + ind_rewards = "" + for r in ave_ind_rewards: + ind_rewards += "{:<15.2f} ".format(r) + + log_data = [ + self.ep, + self.steps_taken/self.timesteps, + self.steps_taken, + + np.mean(self.rewards), + ave_ind_rewards[0], + ave_ind_rewards[1], + ave_ind_rewards[2], + ave_ind_rewards[3], + ave_ind_rewards[4], + + e_score, + mae_pqr[0], + mae_pqr[1], + mae_pqr[2] + ] + print (self.log_format.format(*log_data)) + + self.ep += 1 + self.es = [] + self.sps = [] + self.rewards = [] + +if __name__ == '__main__': + parser = argparse.ArgumentParser("Synthesize a neuro-flight controller.") + parser.add_argument('--model_dir', default="../../models", + help="Directory where models are saved to.") + parser.add_argument('--twin', default="./gymfc_nf/twins/nf1/model.sdf", + help="File path of the aircraft digitial twin/model SDF.") + parser.add_argument('--seed', type=int, default=np.random.randint(0, 1e6), + help="Seed for RNG.") + parser.add_argument('--gym-id', default="gymfc_nf-step-v1") + parser.add_argument('--timesteps', type=int, default=10e6) + parser.add_argument('--ckpt-freq', type=int, default=100e3) + args = parser.parse_args() + + training_dir = os.path.join(args.model_dir, get_training_name()) + + seed = args.seed + ckpt_dir = os.path.abspath(os.path.join(training_dir, "checkpoints")) + print ("Saving checkpoints to {}.".format(ckpt_dir)) + render = False + # How many timesteps until a checkpoint is saved + ckpt_freq = args.ckpt_freq + + # RL hyperparameters + timesteps = args.timesteps + schedule = "linear" + step_size = 1e-4 + horizon = 512 + batchsize = 64 + epochs = 5 + gamma = 0.99 + + env_id = args.gym_id + env = gym.make(env_id) + def sample_noise(inst): + # Experiementally derived for MatekF7 FC, see Chapter 5 of "Flight + # Controller Synthesis Via Deep Reinforcement Learning" for methodology. 
+ r_noise = inst.np_random.normal(-0.25465, 1.3373) + p_noise = inst.np_random.normal(0.241961, 0.9990) + y_noise = inst.np_random.normal(0.07906, 1.45168) + return np.array([r_noise, p_noise, y_noise]) + + env.sample_noise = sample_noise + env.set_aircraft_model(args.twin) + + cb = StepCallback(timesteps) + env.step_callback = cb.callback + + train(env, timesteps, seed, ckpt_dir = ckpt_dir, render = render, + ckpt_freq = ckpt_freq, schedule = schedule, optim_stepsize = step_size, + horizon = horizon, optim_batchsize = batchsize, optim_epochs = epochs, + gamma = gamma) + diff --git a/examples/ppo_example.py b/examples/ppo_example.py deleted file mode 100644 index 056b246..0000000 --- a/examples/ppo_example.py +++ /dev/null @@ -1,127 +0,0 @@ -"""Train a neuro-flight controller attitude control using the PPO RL -algorithm. - -It is recommended before training for an extended period of time to set the ckpt_freq -and timestep arguments to small numbers and test everything is functioning as -expected. - -The Tensorflow input tensor is named 'pi/ob:0', and the output tensor is named -'pi/pol/final/BiasAdd:0'. This can be used to extract the subgraph and evaluate -the graph in pb format. -""" -from gymfc_nf.envs import * -import os.path -import time -import datetime -import subprocess -import numpy as np -import gym -import argparse -import tensorflow as tf - -tf.logging.set_verbosity(tf.logging.ERROR) -def get_commit_hash(): - out = subprocess.run(["git", "describe", "--always"], stdout=subprocess.PIPE, encoding="utf-8") - commit = out.stdout.strip() - return commit - -def get_training_name(): - now = datetime.datetime.now() - timestamp = now.strftime('%y%m%d-%H%M%S') - return "baselines_" + get_commit_hash() + "_" + timestamp - - -def train(env, num_timesteps, seed, flight_log_dir=None, ckpt_dir=None, - render=False, ckpt_freq=0, restore_dir=None, optim_stepsize=3e-4, - schedule="linear", gamma=0.99, optim_epochs=10, optim_batchsize=64, - horizon=2048): - - from baselines.common.fc_learning_utils import FlightLog - from mpi4py import MPI - from baselines import logger - from baselines.ppo1.mlp_policy import MlpPolicy - from baselines.common import set_global_seeds - from baselines.ppo1 import pposgd_simple - import baselines.common.tf_util as U - sess = U.single_threaded_session() - sess.__enter__() - - rank = MPI.COMM_WORLD.Get_rank() - if rank == 0: - logger.configure() - else: - logger.configure(format_strs=[]) - logger.set_level(logger.DISABLED) - workerseed = seed + 1000000 * rank - def policy_fn(name, ob_space, ac_space): - return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, - hid_size=32, num_hid_layers=2) - flight_log = None - #if flight_log_dir: - # flight_log = FlightLog(flight_log_dir) - if render: - env.render() - env.seed(workerseed) - set_global_seeds(workerseed) - pposgd_simple.learn(env, policy_fn, - max_timesteps=num_timesteps, - timesteps_per_actorbatch=horizon, - clip_param=0.2, entcoeff=0.0, - optim_epochs=optim_epochs, optim_stepsize=optim_stepsize, optim_batchsize=optim_batchsize, - gamma=0.99, lam=0.95, schedule=schedule, - flight_log = flight_log, - ckpt_dir = ckpt_dir, - restore_dir = restore_dir, - save_timestep_period= ckpt_freq - ) - env.close() - - - -if __name__ == '__main__': - parser = argparse.ArgumentParser("Synthesize a neuro-flight controller.") - parser.add_argument('--checkpoint_dir', default="../../checkpoints") - parser.add_argument('--twin', default="./gymfc_nf/twins/nf1/model.sdf", - help="File path of the aircraft digitial twin/model 
SDF.") - parser.add_argument('--gym-id', default="gymfc_nf-step-v1") - parser.add_argument('--timesteps', type=int, default=10e6) - parser.add_argument('--ckpt-freq', type=int, default=100e3) - args = parser.parse_args() - - training_dir = os.path.join(args.checkpoint_dir, get_training_name()) - print ("Storing results to ", training_dir) - - seed = np.random.randint(0, 1e6) - ckpt_dir = os.path.join(training_dir, "checkpoints") - render = False - # How many timesteps until a checkpoint is saved - ckpt_freq = args.ckpt_freq - - # RL hyperparameters - timesteps = args.timesteps - schedule = "linear" - step_size = 1e-4 - horizon = 512 - batchsize = 64 - epochs = 5 - gamma = 0.99 - - env_id = args.gym_id - env = gym.make(env_id) - def sample_noise(inst): - # Experiementally derived for MatekF7 FC, see Chapter 5 of "Flight - # Controller Synthesis Via Deep Reinforcement Learning" for methodology. - r_noise = inst.np_random.normal(-0.25465, 1.3373) - p_noise = inst.np_random.normal(0.241961, 0.9990) - y_noise = inst.np_random.normal(0.07906, 1.45168) - return np.array([r_noise, p_noise, y_noise]) - - env.sample_noise = sample_noise - env.set_aircraft_model(args.twin) - - train(env, timesteps, seed, flight_log_dir=None, ckpt_dir = ckpt_dir, - render = render, - ckpt_freq = ckpt_freq, schedule = schedule, optim_stepsize = step_size, - horizon = horizon, optim_batchsize = batchsize, optim_epochs = epochs, - gamma = gamma) - diff --git a/examples/tf_checkpoint_evaluate.py b/examples/tf_checkpoint_evaluate.py new file mode 100644 index 0000000..d1158f7 --- /dev/null +++ b/examples/tf_checkpoint_evaluate.py @@ -0,0 +1,119 @@ +import argparse +from pathlib import Path +import os.path +import numpy as np +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' +import tensorflow as tf +tf.logging.set_verbosity(tf.logging.ERROR) +import gym +from gymfc_nf.envs import * +from gymfc_nf.utils.monitor import CheckpointMonitor +from gymfc_nf.utils.log import make_header +from gymfc_nf.policies import PpoBaselinesPolicy + +def generate_inputs(num_trials, max_rate, seed): + inputs = [] + np.random.seed(seed) + for i in range(num_trials): + inputs.append(np.random.normal(0, max_rate, size=3)) + return inputs + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Monitor and evaluate Tensorflow checkpoints.") + parser.add_argument('ckpt_dir', help="Directory where checkpoints are saved") + parser.add_argument('--twin', default="./gymfc_nf/twins/nf1/model.sdf", + help="File path of the aircraft digitial twin/model SDF.") + parser.add_argument('--eval-dir', + help="Directory where evaluation logs are saved, if different than default.") + parser.add_argument('--gym-id', default="gymfc_nf-step-v1") + parser.add_argument('--num-trials', type=int, default=1) + # Provide a seed so the same setpoint will be created. 
Useful for debugging + parser.add_argument('--seed', help='RNG seed', type=int, default=-1) + # TODO (wfk) Support different policies from different trainers e.g., + # from Stable baselines and Tensorforce + + args = parser.parse_args() + + seed = np.random.randint(0, 1e6) if args.seed < 0 else args.seed + # Redefine all args, maybe later convert to a class + gym_id = args.gym_id + ckpt_dir = args.ckpt_dir + model_dir = Path(ckpt_dir).parent + eval_dir = args.eval_dir if args.eval_dir else os.path.join(model_dir, + "evaluations") + num_trials = args.num_trials + + env = gym.make(gym_id) + env.seed(seed) + env.set_aircraft_model(args.twin) + + # For evaluation we compute all of the singlet inputs upfront so we get a + # more accurate comparison. + # XXX (wfk) This will not work for other environments with mulitple + # setpoints! + inputs = generate_inputs(num_trials, env.max_rate, seed) + def callback(checkpoint_path): + + checkpoint_filename = os.path.split(checkpoint_path)[-1] + ckpt_eval_dir = os.path.join(eval_dir, checkpoint_filename) + Path(ckpt_eval_dir).mkdir(parents=True, exist_ok=True) + + print ("Evaluating checkpoint {}".format(checkpoint_path)) + with tf.Session() as sess: + saver = tf.train.import_meta_graph(checkpoint_path + '.meta', + clear_devices=True) + saver.restore(sess, checkpoint_path) + pi = PpoBaselinesPolicy(sess) + + es = [] + rs = [] + log_header = "" + for i in range(num_trials): + + pi.reset() + ob = env.reset() + # Override the random generatd input in the environment + # must do this after the reset becuase this is normally where + # this gets computed. + env.generated_input = inputs[i] + + if len(log_header) == 0: + log_header = make_header(len(ob)) + + log_file = os.path.join(ckpt_eval_dir, "trial-{}.csv".format(i)) + print("\t", log_file) + + sim_time = 0 + actual = np.zeros(3) + + logs = [] + while True: + ac = pi.action(ob, env.sim_time, env.angular_rate_sp, + env.imu_angular_velocity_rpy) + ob, reward, done, _ = env.step(ac) + + # TODO (wfk) Should we standardize this log format? We could + # use NASA's SIDPAC channel format. + log = ([env.sim_time] + + ob.tolist() + # The observations are the NN input + ac.tolist() + # The actions are the NN output + env.imu_angular_velocity_rpy.tolist() + # Angular velocites + env.angular_rate_sp.tolist() + # + env.y.tolist() + # Y is the output sent to the ESC + env.esc_motor_angular_velocity.tolist() + + [reward])# The reward that would have been given for the action, can be helpful for debugging + + e = env.imu_angular_velocity_rpy - env.angular_rate_sp + es.append(e) + rs.append(reward) + logs.append(log) + + if done: + break + np.savetxt(log_file, logs, delimiter=",", header=log_header) + + print("\tMAE={:.4f} Ave. 
Reward={:.4f}".format(np.mean(np.abs(es)), np.mean(rs))) + + + monitor = CheckpointMonitor(args.ckpt_dir, callback) + monitor.start() diff --git a/gymfc/envs/assets/gazebo/worlds/attitude.world b/gymfc/envs/assets/gazebo/worlds/attitude.world index f387da2..c6ce090 100644 --- a/gymfc/envs/assets/gazebo/worlds/attitude.world +++ b/gymfc/envs/assets/gazebo/worlds/attitude.world @@ -2,6 +2,8 @@ + + 0 0 0 diff --git a/gymfc/tools/plot.py b/gymfc/tools/plot.py index 6285bc4..aa14f64 100644 --- a/gymfc/tools/plot.py +++ b/gymfc/tools/plot.py @@ -30,7 +30,7 @@ def plot_rates(ax, t, pqr_desired, pqr_actual): ax[i].set_ylabel(axis_label[i] + unit_rotation) -def plot_motor_rpms(ax, t, rpms): +def plot_motor_rpms(ax, t, rpms, alpha=1.0): """ Plot the motor rpms values Args: @@ -40,42 +40,26 @@ def plot_motor_rpms(ax, t, rpms): rpms: 2D Numpy array where the length is equal to len(t) and the width corresponds to the number of actuators. """ - motorcolor = [['b','r','#ff8000','#00ff00'], ['m']*4] ax.set_ylabel("RPM") - for i in range(len(rpms)): - alpha = 1 - if len(self.alphas) > 0: - alpha = self.alphas[i] - m0 = rpms[i][:,0] - m1 = rpms[i][:,1] - m2 = rpms[i][:,2] - m3 = rpms[i][:,3] - - ax.plot(t, m0, label="{} M1".format(self.labels[i]), linestyle=':', color=colors[i], alpha=alpha)#, color=motorcolor[i][0]) - ax.plot(t, m1, label="{} M2".format(self.labels[i]), linestyle="-", color=colors[i], alpha=alpha)#, color=motorcolor[i][1],) - ax.plot(t, m2, label="{} M3".format(self.labels[i]), linestyle="-.", color=colors[i], alpha=alpha)#, color=motorcolor[i][2],) - ax.plot(t, m3, label="{} M4".format(self.labels[i]), linestyle='--', color=colors[i], alpha=alpha)#color=motorcolor[i][3], - ax.legend( loc='upper right', ncol=4) - - -def plot_u(ax): + lines = ["-", "-.", ":", "--"] + for i in range(4): + ax.plot(t, rpms[:,i], label="M{}".format(i+1), linestyle=lines[i], + alpha=alpha) + + ax.legend(loc='upper right', ncol=4) + ax.grid(True) + + +def plot_u(ax, t, u, alpha=1.0): """ Plot motor control signals """ - ax.set_ylabel("Motor Control Signal (\%)") - - - for i in range(len(motors)): - alpha = 1 - if len(self.alphas) > 0: - alpha = self.alphas[i] - m0 = motors[i][:,0]/1000.0 - m1 = motors[i][:,1]/1000.0 - m2 = motors[i][:,2]/1000.0 - m3 = motors[i][:,3]/1000.0 - - ax.plot(t, m0, label="{} M1".format(self.labels[i]), linestyle=':', color=colors[i], alpha=alpha)#, color=motorcolor[i][0]) - ax.plot(t, m1, label="{} M2".format(self.labels[i]), linestyle="-", color=colors[i], alpha=alpha)#, color=motorcolor[i][1],) - ax.plot(t, m2, label="{} M3".format(self.labels[i]), linestyle="-.", color=colors[i], alpha=alpha)#, color=motorcolor[i][2],) - ax.plot(t, m3, label="{} M4".format(self.labels[i]), linestyle='--', color=colors[i], alpha=alpha)#color=motorcolor[i][3], - ax.legend( loc='upper right', ncol=4) + ax.set_ylabel("u (\%)") + lines = ["-", "-.", ":", "--"] + for i in range(4): + ax.plot(t, u[:,i], label="M{}".format(i+1), linestyle=lines[i], + alpha=alpha) + + ax.legend(loc='upper right', ncol=4) + ax.grid(True) +
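
A note on the reworked plotting helpers: `plot_u` and `plot_motor_rpms` now take
the data to plot plus an `alpha` keyword, which makes it straightforward to
overlay two evaluation trials on the same axes. A minimal sketch, assuming two
trial CSVs in the column layout written by `tf_checkpoint_evaluate.py` (file
names are illustrative):

```python
import numpy as np
import matplotlib.pyplot as plt

from gymfc.tools.plot import plot_rates, plot_u, plot_motor_rpms

# Column layout matches plot_flight.py: col 0 is time, cols 11:14 are p/q/r,
# 14:17 the setpoints, 17:21 the control outputs, 21:25 the ESC rpms.
old = np.loadtxt("trial-old.csv", delimiter=",")
new = np.loadtxt("trial-new.csv", delimiter=",")

f, ax = plt.subplots(5, sharex=True)
plot_rates(ax[:3], new[:, 0], new[:, 14:17], new[:, 11:14])
for data, alpha in [(old, 0.4), (new, 1.0)]:
    plot_u(ax[3], data[:, 0], data[:, 17:21], alpha=alpha)
    plot_motor_rpms(ax[4], data[:, 0], data[:, 21:25], alpha=alpha)
ax[-1].set_xlabel("Time (s)")
plt.show()
```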