From 27ca8288a28169ec59252fc4370014c691449385 Mon Sep 17 00:00:00 2001 From: mishari <44849486+maliesa96@users.noreply.github.com> Date: Wed, 14 Oct 2020 01:30:11 -0700 Subject: [PATCH 01/23] Add MAML doc (#2093) --- docs/index.md | 1 + docs/user/algo_maml.md | 86 ++++++++++++++++++++++++++++++++++++++++ docs/user/references.bib | 12 +++++- 3 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 docs/user/algo_maml.md diff --git a/docs/index.md b/docs/index.md index a817ab5649..505f3cf8a2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -56,6 +56,7 @@ and how to implement new MDPs and new algorithms. user/algo_pearl user/algo_rl2 user/algo_ppo + user/algo_maml user/algo_mtppo user/algo_vpg user/algo_td3 diff --git a/docs/user/algo_maml.md b/docs/user/algo_maml.md new file mode 100644 index 0000000000..788325f946 --- /dev/null +++ b/docs/user/algo_maml.md @@ -0,0 +1,86 @@ +# MAML + +```eval_rst ++-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Paper** | Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks :cite:`finn2017modelagnostic` | ++-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Framework(s)** | .. figure:: ./images/pytorch.png | +| | :scale: 10% | +| | :class: no-scaled-link | +| | | +| | PyTorch | ++-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| **API Reference** | `garage.torch.algos.MAML <../_autoapi/garage/torch/algos/index.html#garage.torch.algos.maml>`_ | ++-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Code** | `garage/torch/algos/maml.py `_ | ++-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Examples** | :ref:`maml_ppo_half_cheetah_dir`, :ref:`maml_trpo_half_cheetah_dir`, :ref:`maml_trpo_metaworld_ml1_push`, :ref:`maml_trpo_metaworld_ml10`. :ref:`maml_trpo_metaworld_ml45` | ++-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +``` + +MAML is a meta-learning algorithm that trains the parameters of a policy such that they generalize well to unseen tasks. In essence, this technique produces models that are good few shot learners and easy to fine-tune. + +## Default Parameters + +```python +meta_batch_size=40, +inner_lr=0.1, +outer_lr=1e-3, +num_grad_updates=1, +meta_evaluator=None, +evaluate_every_n_epochs=1 +``` + +## Examples + +### maml_ppo_half_cheetah_dir + +```eval_rst +.. figure:: ./images/pytorch.png + :scale: 10% +.. literalinclude:: ../../examples/torch/maml_ppo_half_cheetah_dir.py +``` + +### maml_trpo_half_cheetah_dir + +```eval_rst +.. figure:: ./images/pytorch.png + :scale: 10% +.. literalinclude:: ../../examples/torch/maml_trpo_half_cheetah_dir.py +``` + +### maml_trpo_metaworld_ml1_push + +```eval_rst +.. figure:: ./images/pytorch.png + :scale: 10% +.. 
literalinclude:: ../../examples/torch/maml_trpo_metaworld_ml1_push.py +``` + +### maml_trpo_metaworld_ml10 + +```eval_rst +.. figure:: ./images/pytorch.png + :scale: 10% +.. literalinclude:: ../../examples/torch/maml_trpo_metaworld_ml10.py +``` + +### maml_trpo_metaworld_ml45 + +```eval_rst +.. figure:: ./images/pytorch.png + :scale: 10% +.. literalinclude:: ../../examples/torch/maml_trpo_metaworld_ml45.py +``` + +## References + +```eval_rst +.. bibliography:: references.bib + :style: unsrt + :filter: docname in docnames +``` + +---- + +*This page was authored by Mishari Aliesa ([@maliesa96](https://github.com/maliesa96)).* diff --git a/docs/user/references.bib b/docs/user/references.bib index eea7a77918..740c4f71fa 100644 --- a/docs/user/references.bib +++ b/docs/user/references.bib @@ -97,7 +97,8 @@ @inproceedings{peters2007reward year={2007}, volume={}, number={}, - pages={262-267},} + pages={262-267} +} @article{2009koberpolicy, title = {Policy Search for Motor Primitives in Robotics}, @@ -114,3 +115,12 @@ @article{2009koberpolicy year = {2009}, month_numeric = {6} } + +@misc{finn2017modelagnostic, + title={Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks}, + author={Chelsea Finn and Pieter Abbeel and Sergey Levine}, + year={2017}, + eprint={1703.03400}, + archivePrefix={arXiv}, + primaryClass={cs.LG} +} From 3b320484e6cad24013c57a9e96e078070f300963 Mon Sep 17 00:00:00 2001 From: Gitanshu Sardana Date: Wed, 14 Oct 2020 11:57:55 -0700 Subject: [PATCH 02/23] Make snapshot_config optional in benchmarks (#2085) PR #2072 added an argument to functions decorated with @benchmark to pass snapshot_config to auto benchmarks but that broke other benchmarks that didn't need that argument. This commit cleans it up by passing snapshot_config directly to iterate_experiments, since trying to pass it from within decorator function makes for a confusing API --- .../src/garage_benchmarks/benchmark_auto.py | 26 +++++++++---------- benchmarks/src/garage_benchmarks/helper.py | 8 +++--- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/benchmarks/src/garage_benchmarks/benchmark_auto.py b/benchmarks/src/garage_benchmarks/benchmark_auto.py index e42f025be2..577f2ae911 100644 --- a/benchmarks/src/garage_benchmarks/benchmark_auto.py +++ b/benchmarks/src/garage_benchmarks/benchmark_auto.py @@ -15,26 +15,26 @@ @benchmark(plot=False, auto=True) -def auto_ddpg_benchmarks(snapshot_config): +def auto_ddpg_benchmarks(): """Run experiments for DDPG benchmarking.""" iterate_experiments(ddpg_garage_tf, MuJoCo1M_ENV_SET, - snapshot_config=snapshot_config) + snapshot_config={'snapshot_mode': 'none'}) @benchmark(plot=False, auto=True) -def auto_ppo_benchmarks(snapshot_config): +def auto_ppo_benchmarks(): """Run experiments for PPO benchmarking.""" iterate_experiments(ppo_garage_pytorch, MuJoCo1M_ENV_SET, - snapshot_config=snapshot_config) + snapshot_config={'snapshot_mode': 'none'}) iterate_experiments(ppo_garage_tf, MuJoCo1M_ENV_SET, - snapshot_config=snapshot_config) + snapshot_config={'snapshot_mode': 'none'}) @benchmark(plot=False, auto=True) -def auto_td3_benchmarks(snapshot_config): +def auto_td3_benchmarks(): """Run experiments for TD3 benchmarking.""" td3_env_ids = [ env_id for env_id in MuJoCo1M_ENV_SET if env_id != 'Reacher-v2' @@ -42,26 +42,26 @@ def auto_td3_benchmarks(snapshot_config): iterate_experiments(td3_garage_tf, td3_env_ids, - snapshot_config=snapshot_config) + snapshot_config={'snapshot_mode': 'none'}) @benchmark(plot=False, auto=True) -def 
auto_trpo_benchmarks(snapshot_config): +def auto_trpo_benchmarks(): """Run experiments for TRPO benchmarking.""" iterate_experiments(trpo_garage_pytorch, MuJoCo1M_ENV_SET, - snapshot_config=snapshot_config) + snapshot_config={'snapshot_mode': 'none'}) iterate_experiments(trpo_garage_tf, MuJoCo1M_ENV_SET, - snapshot_config=snapshot_config) + snapshot_config={'snapshot_mode': 'none'}) @benchmark(plot=False, auto=True) -def auto_vpg_benchmarks(snapshot_config): +def auto_vpg_benchmarks(): """Run experiments for VPG benchmarking.""" iterate_experiments(vpg_garage_pytorch, MuJoCo1M_ENV_SET, - snapshot_config=snapshot_config) + snapshot_config={'snapshot_mode': 'none'}) iterate_experiments(vpg_garage_tf, MuJoCo1M_ENV_SET, - snapshot_config=snapshot_config) + snapshot_config={'snapshot_mode': 'none'}) diff --git a/benchmarks/src/garage_benchmarks/helper.py b/benchmarks/src/garage_benchmarks/helper.py index 301873585a..2dec455e4a 100644 --- a/benchmarks/src/garage_benchmarks/helper.py +++ b/benchmarks/src/garage_benchmarks/helper.py @@ -80,15 +80,12 @@ def wrapper_func(): count += 1 _log_dir = _log_dir + '_' + str(count) - snapshot_config = {} - if auto: _auto = auto auto_dir = os.path.join(_log_dir, 'auto') os.makedirs(auto_dir) - snapshot_config['snapshot_mode'] = 'none' - exec_func(snapshot_config) + exec_func() if plot: plot_dir = os.path.join(_log_dir, 'plot') @@ -148,7 +145,8 @@ def iterate_experiments(func, tf.compat.v1.reset_default_graph() ctxt = dict(log_dir=sub_log_dir) - ctxt.update(snapshot_config) + if snapshot_config: + ctxt.update(snapshot_config) func(ctxt, env_id=env_id, seed=seed) if _plot is not None or _auto: From b4b7aa177c76bf74885a673f3a4b6b424a63fc22 Mon Sep 17 00:00:00 2001 From: "Nicole (Shin Ying) Ng" Date: Wed, 14 Oct 2020 17:54:07 -0700 Subject: [PATCH 03/23] Add doc for MTSAC algorithm (#2041) * Add mtsac docs * Add mtsac doc * Update author name --- docs/index.md | 1 + docs/user/algo_mtsac.md | 63 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 docs/user/algo_mtsac.md diff --git a/docs/index.md b/docs/index.md index 505f3cf8a2..d001eade5e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -53,6 +53,7 @@ and how to implement new MDPs and new algorithms. user/algo_trpo user/algo_mttrpo user/algo_sac + user/algo_mtsac user/algo_pearl user/algo_rl2 user/algo_ppo diff --git a/docs/user/algo_mtsac.md b/docs/user/algo_mtsac.md new file mode 100644 index 0000000000..2f30a1267f --- /dev/null +++ b/docs/user/algo_mtsac.md @@ -0,0 +1,63 @@ +# Multi-Task Soft Actor-Critic + +```eval_rst +.. list-table:: + :header-rows: 0 + :stub-columns: 1 + :widths: auto + + * - **Action Space** + - Continuous + * - **Framework(s)** + - .. figure:: ./images/pytorch.png + :scale: 10% + + PyTorch + * - **API Reference** + - `garage.torch.algos.MTSAC <../_autoapi/garage/torch/algos/index.html#garage.torch.algos.MTSAC>`_ + * - **Code** + - `garage/torch/algos/mtsac.py `_ + * - **Examples** + - :ref:`mtsac_metaworld_ml1_pick_place`, :ref:`mtsac_metaworld_mt10`, :ref:`mtsac_metaworld_mt50` +``` + +The Multi-Task Soft Actor-Critic (MTSAC) algorithm is the same as the [Soft Actor Critic (SAC)](algo_sac) algorithm, except for a small change called "disentangled alphas". Alpha is the entropy coefficient that is used to control exploration of the agent/policy. Disentangling alphas refers to having a separate alpha coefficients for every task learned by the policy. The alphas are accessed by using a one-hot encoding of an id that is assigned to each task. 
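+
+As a rough sketch (not the garage implementation; every name below is hypothetical), selecting a per-task entropy coefficient with a one-hot task id might look like this:
+
+```python
+import torch
+
+num_tasks = 10
+# One learnable log-alpha per task ("disentangled" alphas).
+log_alphas = torch.zeros(num_tasks, requires_grad=True)
+# Suppose the one-hot task id marks task 3.
+task_one_hot = torch.nn.functional.one_hot(torch.tensor(3), num_tasks).float()
+# Entropy coefficient used in the SAC losses for transitions from task 3.
+alpha = (task_one_hot * log_alphas.exp()).sum()
+```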
+ + +## Default Parameters + +```python +initial_log_entropy=0., +discount=0.99, +buffer_batch_size=64, +min_buffer_size=int(1e4), +target_update_tau=5e-3, +policy_lr=3e-4, +qf_lr=3e-4, +reward_scale=1.0, +optimizer=torch.optim.Adam, +steps_per_epoch=1, +num_evaluation_episodes=5, +use_deterministic_evaluation=True, +``` + +## Examples + +### mtsac_metaworld_ml1_pick_place +```eval_rst +.. literalinclude:: ../../examples/torch/mtsac_metaworld_ml1_pick_place.py +``` + +### mtsac_metaworld_mt10 +```eval_rst +.. literalinclude:: ../../examples/torch/mtsac_metaworld_mt10.py +``` + +### mtsac_metaworld_mt50 +```eval_rst +.. literalinclude:: ../../examples/torch/mtsac_metaworld_mt10.py +``` + +---- + +*This page was authored by Nicole Shin Ying Ng ([@nicolengsy](https://github.com/nicolengsy)).* From 6da4d5206bbac00eadeb156b766fe6f96bbda5ba Mon Sep 17 00:00:00 2001 From: Avnish Narayan <38871737+avnishn@users.noreply.github.com> Date: Thu, 15 Oct 2020 13:02:39 -0700 Subject: [PATCH 04/23] Fix #2131 (#2132) There's something specifically wrong with an image used in rendering pendulum-v0, causing visualize to fail. I swapped it out with CartPole-v1 for the purposes of the tests and that was a fix. --- tests/garage/envs/test_normalized_gym.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/garage/envs/test_normalized_gym.py b/tests/garage/envs/test_normalized_gym.py index b899336ca3..b513029dcd 100644 --- a/tests/garage/envs/test_normalized_gym.py +++ b/tests/garage/envs/test_normalized_gym.py @@ -4,7 +4,7 @@ class TestNormalizedGym: def setup_method(self): - self.env = normalize(GymEnv('Pendulum-v0'), + self.env = normalize(GymEnv('CartPole-v1'), normalize_reward=True, normalize_obs=True, flatten_obs=True) From ade1ba147dbead58c061e02ed8243669372b58b0 Mon Sep 17 00:00:00 2001 From: "K.R. Zentner" <41180126+krzentner@users.noreply.github.com> Date: Fri, 16 Oct 2020 11:34:18 -0700 Subject: [PATCH 05/23] Allow setting x-axis in wrap_experiment (#2128) --- src/garage/experiment/experiment.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/garage/experiment/experiment.py b/src/garage/experiment/experiment.py index 04836682f4..f697db6f82 100644 --- a/src/garage/experiment/experiment.py +++ b/src/garage/experiment/experiment.py @@ -161,6 +161,7 @@ def my_experiment(ctxt, seed, lr=0.5): the function definition. use_existing_dir (bool): If true, (re)use the directory for this experiment, even if it already contains data. + x_axis (str): Key to use for x axis of plots. 
@@ -170,7 +171,7 @@ def my_experiment(ctxt, seed, lr=0.5): def __init__(self, *, function, log_dir, name, prefix, snapshot_mode, snapshot_gap, archive_launch_repo, name_parameters, - use_existing_dir): + use_existing_dir, x_axis): self.function = function self.log_dir = log_dir self.name = name @@ -180,6 +181,7 @@ def __init__(self, *, function, log_dir, name, prefix, snapshot_mode, self.archive_launch_repo = archive_launch_repo self.name_parameters = name_parameters self.use_existing_dir = use_existing_dir + self.x_axis = x_axis if self.function is not None: self._update_wrap_params() @@ -263,6 +265,7 @@ def _get_options(self, *args): snapshot_gap=self.snapshot_gap, snapshot_mode=self.snapshot_mode, use_existing_dir=self.use_existing_dir, + x_axis=self.x_axis, signature=self.__signature__) if args: if len(args) == 1 and isinstance(args[0], dict): @@ -321,7 +324,7 @@ def _make_context(cls, options, **kwargs): logger.add_output(dowel.TextOutput(text_log_file)) logger.add_output(dowel.CsvOutput(tabular_log_file)) logger.add_output( - dowel.TensorBoardOutput(log_dir, x_axis='TotalEnvSteps')) + dowel.TensorBoardOutput(log_dir, x_axis=options['x_axis'])) logger.add_output(dowel.StdOutput()) logger.push_prefix('[{}] '.format(name)) @@ -377,7 +380,8 @@ def wrap_experiment(function=None, snapshot_gap=1, archive_launch_repo=True, name_parameters=None, - use_existing_dir=False): + use_existing_dir=False, + x_axis='TotalEnvSteps'): """Decorate a function to turn it into an ExperimentTemplate. When invoked, the wrapped function will receive an ExperimentContext, which @@ -424,6 +428,7 @@ def my_experiment(ctxt, seed, lr=0.5): the function definition. use_existing_dir (bool): If true, (re)use the directory for this experiment, even if it already contains data. + x_axis (str): Key to use for x axis of plots. Returns: callable: The wrapped function. @@ -437,7 +442,8 @@ def my_experiment(ctxt, seed, lr=0.5): snapshot_gap=snapshot_gap, archive_launch_repo=archive_launch_repo, name_parameters=name_parameters, - use_existing_dir=use_existing_dir) + use_existing_dir=use_existing_dir, + x_axis=x_axis) def dump_json(filename, data): From b91fcf1e3a703cb04c2c5c1de4aeef5e4e3fd026 Mon Sep 17 00:00:00 2001 From: "K.R. 
Zentner" <41180126+krzentner@users.noreply.github.com> Date: Fri, 16 Oct 2020 12:47:02 -0700 Subject: [PATCH 06/23] Record total_env_steps in samplers (#2125) --- src/garage/sampler/local_sampler.py | 9 +++++++-- src/garage/sampler/multiprocessing_sampler.py | 9 +++++++-- src/garage/sampler/ray_sampler.py | 9 +++++++-- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/garage/sampler/local_sampler.py b/src/garage/sampler/local_sampler.py index 47fcb0baa3..11b88fd287 100644 --- a/src/garage/sampler/local_sampler.py +++ b/src/garage/sampler/local_sampler.py @@ -40,6 +40,7 @@ def __init__(self, worker_factory, agents, envs): for worker, agent, env in zip(self._workers, self._agents, self._envs): worker.update_agent(agent) worker.update_env(env) + self.total_env_steps = 0 @classmethod def from_worker_factory(cls, worker_factory, agents, envs): @@ -117,7 +118,9 @@ def obtain_samples(self, itr, num_samples, agent_update, env_update=None): completed_samples += len(batch.actions) batches.append(batch) if completed_samples >= num_samples: - return EpisodeBatch.concatenate(*batches) + samples = EpisodeBatch.concatenate(*batches) + self.total_env_steps += sum(samples.lengths) + return samples def obtain_exact_episodes(self, n_eps_per_worker, @@ -149,7 +152,9 @@ def obtain_exact_episodes(self, for _ in range(n_eps_per_worker): batch = worker.rollout() batches.append(batch) - return EpisodeBatch.concatenate(*batches) + samples = EpisodeBatch.concatenate(*batches) + self.total_env_steps += sum(samples.lengths) + return samples def shutdown_worker(self): """Shutdown the workers.""" diff --git a/src/garage/sampler/multiprocessing_sampler.py b/src/garage/sampler/multiprocessing_sampler.py index 86aaf445b3..13a2b4a916 100644 --- a/src/garage/sampler/multiprocessing_sampler.py +++ b/src/garage/sampler/multiprocessing_sampler.py @@ -61,6 +61,7 @@ def __init__(self, worker_factory, agents, envs): self._agent_version = 0 for w in self._workers: w.start() + self.total_env_steps = 0 @classmethod def from_worker_factory(cls, worker_factory, agents, envs): @@ -182,7 +183,9 @@ def obtain_samples(self, itr, num_samples, agent_update, env_update=None): except queue.Full: pass - return EpisodeBatch.concatenate(*batches) + samples = EpisodeBatch.concatenate(*batches) + self.total_env_steps += sum(samples.lengths) + return samples def obtain_exact_episodes(self, n_eps_per_worker, @@ -254,7 +257,9 @@ def obtain_exact_episodes(self, ordered_episodes = list( itertools.chain( *[episodes[i] for i in range(self._factory.n_workers)])) - return EpisodeBatch.concatenate(*ordered_episodes) + samples = EpisodeBatch.concatenate(*ordered_episodes) + self.total_env_steps += sum(samples.lengths) + return samples def shutdown_worker(self): """Shutdown the workers.""" diff --git a/src/garage/sampler/ray_sampler.py b/src/garage/sampler/ray_sampler.py index 506866ddf3..f943971957 100644 --- a/src/garage/sampler/ray_sampler.py +++ b/src/garage/sampler/ray_sampler.py @@ -40,6 +40,7 @@ def __init__(self, worker_factory, agents, envs): self._all_workers = defaultdict(None) self._workers_started = False self.start_worker() + self.total_env_steps = 0 @classmethod def from_worker_factory(cls, worker_factory, agents, envs): @@ -170,7 +171,9 @@ def obtain_samples(self, itr, num_samples, agent_update, env_update=None): batches.append(episode_batch) pbar.update(num_returned_samples) - return EpisodeBatch.concatenate(*batches) + samples = EpisodeBatch.concatenate(*batches) + self.total_env_steps += sum(samples.lengths) + return samples 
def obtain_exact_episodes(self, n_eps_per_worker, @@ -247,7 +250,9 @@ def obtain_exact_episodes(self, itertools.chain( *[episodes[i] for i in range(self._worker_factory.n_workers)])) - return EpisodeBatch.concatenate(*ordered_episodes) + samples = EpisodeBatch.concatenate(*ordered_episodes) + self.total_env_steps += sum(samples.lengths) + return samples def shutdown_worker(self): """Shuts down the worker.""" From 06e5aba9b06635e0772acf442f845672acc03694 Mon Sep 17 00:00:00 2001 From: Gitanshu Sardana Date: Fri, 16 Oct 2020 15:09:36 -0700 Subject: [PATCH 07/23] Rerun failed tests automatically once on CI (#2094) --- .github/workflows/ci.yml | 10 +++++----- Makefile | 10 +++++----- setup.py | 1 + 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4fe37df4e0..dff391b9e8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -129,7 +129,7 @@ jobs: "${DOCKER_TAG}" \ /bin/bash -c \ '[ ! -f ${MJKEY_PATH} ] || mv ${MJKEY_PATH} ${MJKEY_PATH}.bak && - pytest --cov=garage --cov-report=xml -m \ + pytest --cov=garage --cov-report=xml --reruns 1 -m \ "not nightly and not huge and not flaky and not large and not mujoco and not mujoco_long" --durations=20 && for i in {1..5}; do bash <(curl -s https://codecov.io/bash --retry 5) -Z && break @@ -171,7 +171,7 @@ jobs: "${DOCKER_TAG}" \ /bin/bash -c \ '[ ! -f ${MJKEY_PATH} ] || mv ${MJKEY_PATH} ${MJKEY_PATH}.bak && - pytest --cov=garage --cov-report=xml -m "large and not flaky" --durations=20 && + pytest --cov=garage --cov-report=xml --reruns 1 -m "large and not flaky" --durations=20 && for i in {1..5}; do bash <(curl -s https://codecov.io/bash --retry 5) -Z && break if [ $i == 5 ]; then @@ -211,7 +211,7 @@ jobs: --memory-swap 6500m \ "${DOCKER_TAG}" \ /bin/bash -c \ - 'pytest --cov=garage --cov-report=xml -m "mujoco and not flaky" --durations=20 && + 'pytest --cov=garage --cov-report=xml --reruns 1 -m "mujoco and not flaky" --durations=20 && for i in {1..5}; do bash <(curl -s https://codecov.io/bash --retry 5) -Z && break if [ $i == 5 ]; then @@ -251,7 +251,7 @@ jobs: --memory-swap 6500m \ "${DOCKER_TAG}" \ /bin/bash -c \ - 'pytest --cov=garage --cov-report=xml -m "mujoco_long and not flaky" --durations=20 && + 'pytest --cov=garage --cov-report=xml --reruns 1 -m "mujoco_long and not flaky" --durations=20 && for i in {1..5}; do bash <(curl -s https://codecov.io/bash --retry 5) -Z && break if [ $i == 5 ]; then @@ -290,7 +290,7 @@ jobs: $ci_env\ --memory 6500m \ --memory-swap 6500m \ - "${DOCKER_TAG}" pytest -v -m nightly + "${DOCKER_TAG}" pytest -v --reruns 1 -m nightly verify_envs_conda: diff --git a/Makefile b/Makefile index a979a22ca5..5a90f38c8a 100644 --- a/Makefile +++ b/Makefile @@ -49,7 +49,7 @@ ci-job-precommit: assert-docker ci-job-normal: assert-docker [ ! -f $(MJKEY_PATH) ] || mv $(MJKEY_PATH) $(MJKEY_PATH).bak - pytest --cov=garage --cov-report=xml -m \ + pytest --cov=garage --cov-report=xml --reruns 1 -m \ 'not nightly and not huge and not flaky and not large and not mujoco and not mujoco_long' --durations=20 for i in {1..5}; do \ bash <(curl -s https://codecov.io/bash --retry 5) -Z && break \ @@ -59,7 +59,7 @@ ci-job-normal: assert-docker ci-job-large: assert-docker [ ! 
-f $(MJKEY_PATH) ] || mv $(MJKEY_PATH) $(MJKEY_PATH).bak - pytest --cov=garage --cov-report=xml -m 'large and not flaky' --durations=20 + pytest --cov=garage --cov-report=xml --reruns 1 -m 'large and not flaky' --durations=20 for i in {1..5}; do \ bash <(curl -s https://codecov.io/bash --retry 5) -Z && break \ || echo 'Retrying...' && sleep 30 && continue; \ @@ -67,7 +67,7 @@ ci-job-large: assert-docker done ci-job-mujoco: assert-docker - pytest --cov=garage --cov-report=xml -m 'mujoco and not flaky' --durations=20 + pytest --cov=garage --cov-report=xml --reruns 1 -m 'mujoco and not flaky' --durations=20 for i in {1..5}; do \ bash <(curl -s https://codecov.io/bash --retry 5) -Z && break \ || echo 'Retrying...' && sleep 30 && continue; \ @@ -75,7 +75,7 @@ ci-job-mujoco: assert-docker done ci-job-mujoco-long: assert-docker - pytest --cov=garage --cov-report=xml -m 'mujoco_long and not flaky' --durations=20 + pytest --cov=garage --cov-report=xml --reruns 1 -m 'mujoco_long and not flaky' --durations=20 for i in {1..5}; do \ bash <(curl -s https://codecov.io/bash --retry 5) -Z && break \ || echo 'Retrying...' && sleep 30 && continue; \ @@ -83,7 +83,7 @@ ci-job-mujoco-long: assert-docker done ci-job-nightly: assert-docker - pytest -m nightly + pytest --reruns 1 -m nightly ci-job-verify-envs: assert-docker ci-job-verify-envs-pipenv ci-job-verify-envs-conda diff --git a/setup.py b/setup.py index 21cd579a4a..1f3cb68719 100644 --- a/setup.py +++ b/setup.py @@ -66,6 +66,7 @@ 'pylint>=2.5.3', 'pytest>=4.5.0', # Required for strict-markers 'pytest-cov', + 'pytest-rerunfailures', 'pytest-timeout', 'pytest-xdist', 'recommonmark', From 52047e85b047e06f2c6550cb03e357eebbb3bc7c Mon Sep 17 00:00:00 2001 From: Gitanshu Sardana Date: Fri, 16 Oct 2020 20:03:45 -0700 Subject: [PATCH 08/23] Unpin cloudpickle (#2133) closes #1882 tensorflow/probability 0.11.1 is out which allows compatibility with cloudpickle >= 1.3, so we don't need to pin it anymore --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1f3cb68719..2c502ed37a 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ # Please keep alphabetized 'akro', 'click>=2.0', - 'cloudpickle==1.3', + 'cloudpickle', 'cma==2.7.0', 'dowel==0.0.3', 'numpy>=1.14.5', From be20a5ed29017dc079d9b46ee7dbd2a2fe89e90e Mon Sep 17 00:00:00 2001 From: "Nicole (Shin Ying) Ng" Date: Mon, 19 Oct 2020 17:42:19 -0700 Subject: [PATCH 09/23] Check observation shape conforms to observation_space (#2089) * Add GymEnv check for obs shape * Check obs space contains observation --- src/garage/envs/gym_env.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/garage/envs/gym_env.py b/src/garage/envs/gym_env.py index 5474c4f4b7..472c196268 100644 --- a/src/garage/envs/gym_env.py +++ b/src/garage/envs/gym_env.py @@ -5,6 +5,7 @@ import akro import gym +import numpy as np from garage import Environment, EnvSpec, EnvStep, StepType @@ -242,6 +243,14 @@ def step(self, action): if step_type in (StepType.TERMINAL, StepType.TIMEOUT): self._step_cnt = None + if not self.spec.observation_space.contains(observation): + # Discrete actions can be either in the space normally, or one-hot + # encoded. 
+ if self.spec.observation_space.flat_dim != np.prod( + observation.shape): + raise RuntimeError('GymEnv observation shape does not ' + 'conform to its observation_space') + return EnvStep(env_spec=self.spec, action=action, reward=reward, From 83ba45c55f7eec7b8ac6d59066f9dad1e3e05e8c Mon Sep 17 00:00:00 2001 From: Hayden Shively <17186559+haydenshively@users.noreply.github.com> Date: Tue, 20 Oct 2020 12:02:02 -0500 Subject: [PATCH 10/23] Remove include_dashboard arg so that doctests pass (#2140) --- docs/requirements.txt | 12 ++++-------- src/garage/sampler/ray_sampler.py | 4 +--- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 2be56ef948..3e7b8fab54 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,6 @@ +sphinx-autoapi +sphinxcontrib-bibtex + akro click cloudpickle @@ -5,6 +8,7 @@ cma==2.7.0 dm_env dowel==0.0.3 gym[atari, box2d, classic_control]==0.17.2 +matplotlib psutil pyprind python-dateutil @@ -13,11 +17,3 @@ scipy setproctitle tensorflow tensorflow-probability - -# dev dependencies -matplotlib -recommonmark -sphinx -sphinx-autoapi>=1.4.0 -sphinx_rtd_theme -sphinxcontrib-bibtex diff --git a/src/garage/sampler/ray_sampler.py b/src/garage/sampler/ray_sampler.py index f943971957..170897521a 100644 --- a/src/garage/sampler/ray_sampler.py +++ b/src/garage/sampler/ray_sampler.py @@ -30,9 +30,7 @@ class RaySampler(Sampler): def __init__(self, worker_factory, agents, envs): # pylint: disable=super-init-not-called if not ray.is_initialized(): - ray.init(log_to_driver=False, - ignore_reinit_error=True, - include_dashboard=False) + ray.init(log_to_driver=False, ignore_reinit_error=True) self._sampler_worker = ray.remote(SamplerWorker) self._worker_factory = worker_factory self._agents = agents From 18f4a9b0f6b0e248c8d289147d13a8be6675c410 Mon Sep 17 00:00:00 2001 From: "K.R. Zentner" <41180126+krzentner@users.noreply.github.com> Date: Tue, 20 Oct 2020 11:04:53 -0700 Subject: [PATCH 11/23] Make plotting plot the learner in BC (#2127) --- src/garage/torch/algos/bc.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/garage/torch/algos/bc.py b/src/garage/torch/algos/bc.py index d070e21f3f..55ee264667 100644 --- a/src/garage/torch/algos/bc.py +++ b/src/garage/torch/algos/bc.py @@ -76,13 +76,17 @@ def __init__( self._batch_size = batch_size self._name = name + # For plotting + self.policy = self.learner + # Public fields for sampling. 
self._env_spec = env_spec + self.exploration_policy = None self.policy = None self.max_episode_length = env_spec.max_episode_length self.sampler_cls = None if isinstance(self._source, Policy): - self.policy = self._source + self.exploration_policy = self._source self.sampler_cls = RaySampler self._source = source else: From 3a0f8d18624b326de0400bff84ddc0f8c60c46e9 Mon Sep 17 00:00:00 2001 From: Ruofu Wang <31981600+yeukfu@users.noreply.github.com> Date: Tue, 20 Oct 2020 12:12:15 -0700 Subject: [PATCH 12/23] Refactor reps to use episode batch (#2123) --- src/garage/tf/algos/reps.py | 123 ++++++++++++++---------------------- 1 file changed, 47 insertions(+), 76 deletions(-) diff --git a/src/garage/tf/algos/reps.py b/src/garage/tf/algos/reps.py index 55a38f70e6..4c99b36b0f 100644 --- a/src/garage/tf/algos/reps.py +++ b/src/garage/tf/algos/reps.py @@ -7,18 +7,13 @@ import scipy.optimize import tensorflow as tf -from garage import (_Default, - EpisodeBatch, - log_performance, - make_optimizer, - StepType) +from garage import _Default, log_performance, make_optimizer from garage.np.algos import RLAlgorithm from garage.sampler import RaySampler from garage.tf import (compile_function, flatten_inputs, graph_inputs, - new_tensor, - paths_to_tensors) + new_tensor) from garage.tf.optimizers import LBFGSOptimizer # yapf: disable @@ -148,66 +143,38 @@ def train(self, trainer): last_return = None for _ in trainer.step_epochs(): - trainer.step_path = trainer.obtain_samples(trainer.step_itr) + trainer.step_path = trainer.obtain_episodes(trainer.step_itr) last_return = self._train_once(trainer.step_itr, trainer.step_path) trainer.step_itr += 1 return last_return - def _train_once(self, itr, paths): + def _train_once(self, itr, episodes): """Perform one step of policy optimization given one batch of samples. Args: itr (int): Iteration number. - paths (list[dict]): A list of collected paths. + episodes (EpisodeBatch): Batch of episodes. Returns: numpy.float64: Average return. """ - # -- Stage: Calculate baseline - paths = [ - dict( - observations=path['observations'], - actions=( - self._env_spec.action_space.flatten_n( # noqa: E126 - path['actions'])), - rewards=path['rewards'], - env_infos=path['env_infos'], - agent_infos=path['agent_infos'], - dones=np.array([ - step_type == StepType.TERMINAL - for step_type in path['step_types'] - ])) for path in paths - ] - - if hasattr(self._baseline, 'predict_n'): - baseline_predictions = self._baseline.predict_n(paths) - else: - baseline_predictions = [ - self._baseline.predict(path) for path in paths - ] - - # -- Stage: Pre-process samples based on collected paths - samples_data = paths_to_tensors(paths, self.max_episode_length, - baseline_predictions, self._discount, - self._gae_lambda) - # -- Stage: Run and calculate performance of the algorithm undiscounted_returns = log_performance( itr, - EpisodeBatch.from_list(self._env_spec, paths), + episodes, discount=self._discount) self._episode_reward_mean.extend(undiscounted_returns) tabular.record('Extras/EpisodeRewardMean', np.mean(self._episode_reward_mean)) - samples_data['average_return'] = np.mean(undiscounted_returns) + average_return = np.mean(undiscounted_returns) logger.log('Optimizing policy...') - self._optimize_policy(samples_data) + self._optimize_policy(episodes) - return samples_data['average_return'] + return average_return def __getstate__(self): """Parameters to save in snapshot. 
@@ -238,12 +205,11 @@ def __setstate__(self, state): self._name_scope = tf.name_scope(self._name) self._init_opt() - def _optimize_policy(self, samples_data): + def _optimize_policy(self, episodes): """Optimize the policy using the samples. Args: - samples_data (dict): Processed sample data. - See garage.tf.paths_to_tensors() for details. + episodes (EpisodeBatch): Batch of episodes. """ # Initial BFGS parameter values. @@ -255,8 +221,8 @@ def _optimize_policy(self, samples_data): # Optimize dual eta_before = self._param_eta logger.log('Computing dual before') - self._feat_diff = self._features(samples_data) - dual_opt_input_values = self._dual_opt_input_values(samples_data) + self._feat_diff = self._features(episodes) + dual_opt_input_values = self._dual_opt_input_values(episodes) dual_before = self._f_dual(*dual_opt_input_values) logger.log('Optimizing dual') @@ -272,7 +238,7 @@ def eval_dual(x): """ self._param_eta = x[0] self._param_v = x[1:] - dual_opt_input_values = self._dual_opt_input_values(samples_data) + dual_opt_input_values = self._dual_opt_input_values(episodes) return self._f_dual(*dual_opt_input_values) def eval_dual_grad(x): @@ -287,7 +253,7 @@ def eval_dual_grad(x): """ self._param_eta = x[0] self._param_v = x[1:] - dual_opt_input_values = self._dual_opt_input_values(samples_data) + dual_opt_input_values = self._dual_opt_input_values(episodes) grad = self._f_dual_grad(*dual_opt_input_values) eta_grad = np.float(grad[0]) v_grad = grad[1] @@ -301,11 +267,11 @@ def eval_dual_grad(x): logger.log('Computing dual after') self._param_eta, self._param_v = params_ast[0], params_ast[1:] - dual_opt_input_values = self._dual_opt_input_values(samples_data) + dual_opt_input_values = self._dual_opt_input_values(episodes) dual_after = self._f_dual(*dual_opt_input_values) # Optimize policy - policy_opt_input_values = self._policy_opt_input_values(samples_data) + policy_opt_input_values = self._policy_opt_input_values(episodes) logger.log('Computing policy loss before') loss_before = self._optimizer.loss(policy_opt_input_values) logger.log('Computing policy KL before') @@ -488,26 +454,25 @@ def _build_policy_loss(self, i): return loss - def _dual_opt_input_values(self, samples_data): + def _dual_opt_input_values(self, episodes): """Update dual func optimize input values based on samples data. Args: - samples_data (dict): Processed sample data. - See garage.tf.paths_to_tensors() for details. + episodes (EpisodeBatch): Batch of episodes. Returns: list(np.ndarray): Flatten dual function optimization input values. """ + agent_infos = episodes.padded_agent_infos policy_state_info_list = [ - samples_data['agent_infos'][k] - for k in self.policy.state_info_keys - ] # yapf: disable + agent_infos[k] for k in self.policy.state_info_keys + ] # pylint: disable=unexpected-keyword-arg dual_opt_input_values = self._dual_opt_inputs._replace( - reward_var=samples_data['rewards'], - valid_var=samples_data['valids'], + reward_var=episodes.padded_rewards, + valid_var=episodes.valids, feat_diff=self._feat_diff, param_eta=self._param_eta, param_v=self._param_v, @@ -516,28 +481,33 @@ def _dual_opt_input_values(self, samples_data): return flatten_inputs(dual_opt_input_values) - def _policy_opt_input_values(self, samples_data): + def _policy_opt_input_values(self, episodes): """Update policy optimize input values based on samples data. Args: - samples_data (dict): Processed sample data. - See garage.tf.paths_to_tensors() for details. + episodes (EpisodeBatch): Batch of episodes. 
Returns: list(np.ndarray): Flatten policy optimization input values. """ + agent_infos = episodes.padded_agent_infos policy_state_info_list = [ - samples_data['agent_infos'][k] - for k in self.policy.state_info_keys - ] # yapf: disable + agent_infos[k] for k in self.policy.state_info_keys + ] + + actions = [ + self._env_spec.action_space.flatten_n(act) + for act in episodes.actions_list + ] + padded_actions = episodes.pad_to_last(np.concatenate(actions)) # pylint: disable=unexpected-keyword-arg policy_opt_input_values = self._policy_opt_inputs._replace( - obs_var=samples_data['observations'], - action_var=samples_data['actions'], - reward_var=samples_data['rewards'], - valid_var=samples_data['valids'], + obs_var=episodes.padded_observations, + action_var=padded_actions, + reward_var=episodes.padded_rewards, + valid_var=episodes.valids, feat_diff=self._feat_diff, param_eta=self._param_eta, param_v=self._param_v, @@ -546,24 +516,24 @@ def _policy_opt_input_values(self, samples_data): return flatten_inputs(policy_opt_input_values) - def _features(self, samples_data): + def _features(self, episodes): """Get valid view features based on samples data. Args: - samples_data (dict): Processed sample data. - See garage.tf.paths_to_tensors() for details. + episodes (EpisodeBatch): Batch of episodes. Returns: numpy.ndarray: Features for training. """ - paths = samples_data['paths'] + start = 0 feat_diff = [] - for path in paths: - o = np.clip(path['observations'], + for length in episodes.lengths: + stop = start + length + o = np.clip(episodes.observations[start:stop], self._env_spec.observation_space.low, self._env_spec.observation_space.high) - lr = len(path['rewards']) + lr = length al = np.arange(lr).reshape(-1, 1) / self.max_episode_length feats = np.concatenate( [o, o**2, al, al**2, al**3, @@ -571,5 +541,6 @@ def _features(self, samples_data): # pylint: disable=unsubscriptable-object feats = np.vstack([feats, np.zeros(feats.shape[1])]) feat_diff.append(feats[1:] - feats[:-1]) + start = stop return np.vstack(feat_diff) From a8417bd402d147996ab5e34da1c698e8e141c253 Mon Sep 17 00:00:00 2001 From: "K.R. 
Zentner" <41180126+krzentner@users.noreply.github.com> Date: Wed, 21 Oct 2020 16:49:27 -0700 Subject: [PATCH 13/23] Make mujoco test run a few minutes faster (#2143) --- examples/torch/maml_trpo_metaworld_ml1_push.py | 4 +++- tests/garage/torch/algos/test_pearl.py | 1 + tests/integration_tests/test_examples.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/torch/maml_trpo_metaworld_ml1_push.py b/examples/torch/maml_trpo_metaworld_ml1_push.py index c059dbb882..93de39f2d3 100755 --- a/examples/torch/maml_trpo_metaworld_ml1_push.py +++ b/examples/torch/maml_trpo_metaworld_ml1_push.py @@ -61,7 +61,9 @@ def maml_trpo_metaworld_ml1_push(ctxt, seed, epochs, rollouts_per_task, hidden_nonlinearity=torch.tanh, output_nonlinearity=None) - meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler) + meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler, + n_test_tasks=1, + n_exploration_eps=rollouts_per_task) trainer = Trainer(ctxt) algo = MAMLTRPO(env=env, diff --git a/tests/garage/torch/algos/test_pearl.py b/tests/garage/torch/algos/test_pearl.py index 81343befa1..8265150c7e 100644 --- a/tests/garage/torch/algos/test_pearl.py +++ b/tests/garage/torch/algos/test_pearl.py @@ -36,6 +36,7 @@ class TestPEARL: """Test class for PEARL.""" + @pytest.mark.skip @pytest.mark.large def test_pearl_ml1_push(self): """Test PEARL with ML1 Push environment.""" diff --git a/tests/integration_tests/test_examples.py b/tests/integration_tests/test_examples.py index 114da72c00..c58eea6458 100644 --- a/tests/integration_tests/test_examples.py +++ b/tests/integration_tests/test_examples.py @@ -309,7 +309,7 @@ def test_maml_trpo_metaworld_ml1_push(): """Test maml_trpo_ml1_push.py.""" assert subprocess.run([ EXAMPLES_ROOT_DIR / 'torch/maml_trpo_metaworld_ml1_push.py', - '--epochs', '1', '--meta_batch_size', '1' + '--epochs', '1', '--meta_batch_size', '1', '--rollouts_per_task', '1' ], check=False).returncode == 0 From f8a2d53061af21ba759680b3adc59581af3b866d Mon Sep 17 00:00:00 2001 From: mishari <44849486+maliesa96@users.noreply.github.com> Date: Wed, 21 Oct 2020 17:58:44 -0700 Subject: [PATCH 14/23] Add torch DQN (#2076) This also adds several smaller features: - torch/examples/watch_atari.py: use a trained agent to play atari. - Error handling in the snapshotter for invalid arguments. - torch/examples/dqn_atari.py: train on atari environments. 
--- .../src/garage_benchmarks/parameters.py | 4 + examples/sim_policy.py | 4 +- examples/torch/dqn_atari.py | 194 ++++++++++++ examples/torch/dqn_cartpole.py | 68 +++++ examples/torch/watch_atari.py | 67 ++++ src/garage/_functions.py | 10 +- src/garage/envs/wrappers/fire_reset.py | 26 +- src/garage/envs/wrappers/max_and_skip.py | 26 +- src/garage/envs/wrappers/stack_frames.py | 56 +++- src/garage/experiment/snapshotter.py | 24 +- src/garage/plotter/plotter.py | 12 +- src/garage/tf/plotter/plotter.py | 9 +- src/garage/torch/algos/__init__.py | 5 +- src/garage/torch/algos/dqn.py | 289 ++++++++++++++++++ .../policies/discrete_qf_argmax_policy.py | 3 +- .../q_functions/discrete_cnn_q_function.py | 27 +- src/garage/trainer.py | 2 +- tests/garage/envs/wrappers/test_fire_reset.py | 7 +- .../envs/wrappers/test_stack_frames_env.py | 22 ++ tests/garage/experiment/test_snapshotter.py | 36 ++- tests/garage/torch/algos/test_dqn.py | 134 ++++++++ .../test_discrete_qf_argmax_policy.py | 8 +- .../test_discrete_cnn_q_function.py | 2 - tests/integration_tests/test_examples.py | 21 ++ 24 files changed, 990 insertions(+), 66 deletions(-) create mode 100755 examples/torch/dqn_atari.py create mode 100755 examples/torch/dqn_cartpole.py create mode 100755 examples/torch/watch_atari.py create mode 100644 src/garage/torch/algos/dqn.py create mode 100644 tests/garage/torch/algos/test_dqn.py diff --git a/benchmarks/src/garage_benchmarks/parameters.py b/benchmarks/src/garage_benchmarks/parameters.py index cae5c58305..10ec5cccbb 100644 --- a/benchmarks/src/garage_benchmarks/parameters.py +++ b/benchmarks/src/garage_benchmarks/parameters.py @@ -9,6 +9,10 @@ task['env_id'] for task in benchmarks.get_benchmark('Mujoco1M')['tasks'] ] +Atari10M_ENV_SET = [ + task['env_id'] for task in benchmarks.get_benchmark('Atari10M')['tasks'] +] + PIXEL_ENV_SET = ['CubeCrash-v0', 'MemorizeDigits-v0'] STATE_ENV_SET = [ diff --git a/examples/sim_policy.py b/examples/sim_policy.py index fa06f7da13..4d5a1228be 100755 --- a/examples/sim_policy.py +++ b/examples/sim_policy.py @@ -53,7 +53,6 @@ def query_yes_no(question, default='yes'): type=int, default=1000, help='Max length of episode') - parser.add_argument('--speedup', type=float, default=1, help='Speedup') args = parser.parse_args() # If the snapshot file use tensorflow, do: @@ -68,7 +67,6 @@ def query_yes_no(question, default='yes'): path = rollout(env, policy, max_episode_length=args.max_episode_length, - animated=True, - speedup=args.speedup) + animated=True) if not query_yes_no('Continue simulation?'): break diff --git a/examples/torch/dqn_atari.py b/examples/torch/dqn_atari.py new file mode 100755 index 0000000000..e6d11fe11a --- /dev/null +++ b/examples/torch/dqn_atari.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +"""An example to train a task with DQN algorithm. + +Here it creates a gym environment CartPole, and trains a DQN with 50k steps. 
+""" +import click +import gym +import numpy as np +import psutil +import torch + +from garage import wrap_experiment +from garage.envs import GymEnv +from garage.envs.wrappers.clip_reward import ClipReward +from garage.envs.wrappers.episodic_life import EpisodicLife +from garage.envs.wrappers.fire_reset import FireReset +from garage.envs.wrappers.grayscale import Grayscale +from garage.envs.wrappers.max_and_skip import MaxAndSkip +from garage.envs.wrappers.noop import Noop +from garage.envs.wrappers.resize import Resize +from garage.envs.wrappers.stack_frames import StackFrames +from garage.experiment.deterministic import set_seed +from garage.np.exploration_policies import EpsilonGreedyPolicy +from garage.replay_buffer import PathBuffer +from garage.sampler import FragmentWorker, LocalSampler +from garage.torch import set_gpu_mode +from garage.torch.algos import DQN +from garage.torch.policies import DiscreteQFArgmaxPolicy +from garage.torch.q_functions import DiscreteCNNQFunction +from garage.trainer import Trainer + +hyperparams = dict(n_epochs=500, + steps_per_epoch=20, + sampler_batch_size=500, + lr=1e-4, + discount=0.99, + min_buffer_size=int(1e4), + n_train_steps=125, + target_update_freq=2, + buffer_batch_size=32, + max_epsilon=1.0, + min_epsilon=0.01, + decay_ratio=0.1, + buffer_size=int(1e4), + hidden_sizes=(512, ), + hidden_channels=(32, 64, 64), + kernel_sizes=(8, 4, 3), + strides=(4, 2, 1), + clip_gradient=10) + + +@click.command() +@click.argument('env', type=str) +@click.option('--seed', default=24) +@click.option('--n', type=int, default=psutil.cpu_count(logical=False)) +@click.option('--buffer_size', type=int, default=None) +@click.option('--max_episode_length', type=int, default=None) +def main(env=None, + seed=24, + n=psutil.cpu_count(logical=False), + buffer_size=None, + max_episode_length=None): + """Wrapper to setup the logging directory. + + Args: + env (str): Name of the atari environment, can either be the prefix + or the full name. For example, this can either be 'Pong' or + 'PongNoFrameskip-v4'. If the former is used, the env used will be + `env` + 'NoFrameskip-v4'. + seed (int): Seed to use for the RNG. + n (int): Number of workers to use. Defaults to the number of CPU cores + available. + buffer_size (int): size of the replay buffer in transitions. If None, + defaults to hyperparams['buffer_size']. This is used by the + integration tests. + max_episode_length (int): Max length of an episode. If None, defaults + to the timelimit specific to the environment. Used by integration + tests. + """ + if '-v' not in env: + env += 'NoFrameskip-v4' + logdir = 'data/local/experiment/' + env + + if buffer_size is not None: + hyperparams['buffer_size'] = buffer_size + + dqn_atari(dict(log_dir=logdir), + env=env, + seed=seed, + n_workers=n, + max_episode_length=max_episode_length, + **hyperparams) + + +# pylint: disable=unused-argument +@wrap_experiment(snapshot_mode='gap_overwrite', snapshot_gap=30) +def dqn_atari(ctxt=None, + env=None, + seed=24, + n_workers=psutil.cpu_count(logical=False), + max_episode_length=None, + **kwargs): + """Train DQN with PongNoFrameskip-v4 environment. + + Args: + ctxt (garage.experiment.ExperimentContext): The experiment + configuration used by Trainer to create the snapshotter. + env (str): Name of the atari environment, eg. 'PongNoFrameskip-v4'. + seed (int): Used to seed the random number generator to produce + determinism. + n_workers (int): Number of workers to use. Defaults to the number of + CPU cores available. 
+ max_episode_length (int): Max length of an episode. If None, defaults + to the timelimit specific to the environment. Used by integration + tests. + kwargs (dict): hyperparameters to be saved to variant.json. + + """ + assert n_workers > 0 + assert env is not None + env = gym.make(env) + env = Noop(env, noop_max=30) + env = MaxAndSkip(env, skip=4) + env = EpisodicLife(env) + if 'FIRE' in env.unwrapped.get_action_meanings(): + env = FireReset(env) + env = Grayscale(env) + env = Resize(env, 84, 84) + env = ClipReward(env) + env = StackFrames(env, 4, axis=0) + env = GymEnv(env, max_episode_length=max_episode_length) + set_seed(seed) + trainer = Trainer(ctxt) + + env.spec.observation_space = env.observation_space + env.spec.action_space = env.action_space + + n_epochs = hyperparams['n_epochs'] + steps_per_epoch = hyperparams['steps_per_epoch'] + sampler_batch_size = hyperparams['sampler_batch_size'] + num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size + replay_buffer = PathBuffer( + capacity_in_transitions=hyperparams['buffer_size']) + + qf = DiscreteCNNQFunction( + env_spec=env.spec, + hidden_channels=hyperparams['hidden_channels'], + kernel_sizes=hyperparams['kernel_sizes'], + strides=hyperparams['strides'], + hidden_w_init=( + lambda x: torch.nn.init.orthogonal_(x, gain=np.sqrt(2))), + hidden_sizes=hyperparams['hidden_sizes'], + is_image=True) + + policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf) + exploration_policy = EpsilonGreedyPolicy( + env_spec=env.spec, + policy=policy, + total_timesteps=num_timesteps, + max_epsilon=hyperparams['max_epsilon'], + min_epsilon=hyperparams['min_epsilon'], + decay_ratio=hyperparams['decay_ratio']) + + algo = DQN(env_spec=env.spec, + policy=policy, + qf=qf, + exploration_policy=exploration_policy, + replay_buffer=replay_buffer, + steps_per_epoch=steps_per_epoch, + qf_lr=hyperparams['lr'], + clip_gradient=hyperparams['clip_gradient'], + discount=hyperparams['discount'], + min_buffer_size=hyperparams['min_buffer_size'], + n_train_steps=hyperparams['n_train_steps'], + target_update_freq=hyperparams['target_update_freq'], + buffer_batch_size=hyperparams['buffer_batch_size']) + + set_gpu_mode(False) + torch.set_num_threads(1) + if torch.cuda.is_available(): + set_gpu_mode(True) + algo.to() + + trainer.setup(algo, + env, + sampler_cls=LocalSampler, + worker_class=FragmentWorker, + n_workers=n_workers) + + trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size) + env.close() + + +main() diff --git a/examples/torch/dqn_cartpole.py b/examples/torch/dqn_cartpole.py new file mode 100755 index 0000000000..0a54e537b2 --- /dev/null +++ b/examples/torch/dqn_cartpole.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +"""An example to train a task with DQN algorithm. + +Here it creates a gym environment CartPole, and trains a DQN with 50k steps. +""" +import click + +from garage import wrap_experiment +from garage.envs import GymEnv +from garage.experiment.deterministic import set_seed +from garage.np.exploration_policies import EpsilonGreedyPolicy +from garage.replay_buffer import PathBuffer +from garage.sampler import LocalSampler +from garage.torch.algos import DQN +from garage.torch.policies import DiscreteQFArgmaxPolicy +from garage.torch.q_functions import DiscreteMLPQFunction +from garage.trainer import Trainer + + +@click.command() +@click.option('--seed', default=24) +@wrap_experiment(snapshot_mode='none') +def dqn_cartpole(ctxt=None, seed=24): + """Train DQN with CartPole-v0 environment. 
+ + Args: + ctxt (garage.experiment.ExperimentContext): The experiment + configuration used by LocalRunner to create the snapshotter. + seed (int): Used to seed the random number generator to produce + determinism. + """ + set_seed(seed) + runner = Trainer(ctxt) + + n_epochs = 100 + steps_per_epoch = 10 + sampler_batch_size = 512 + num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size + env = GymEnv('CartPole-v0') + replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) + qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5)) + policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf) + exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec, + policy=policy, + total_timesteps=num_timesteps, + max_epsilon=1.0, + min_epsilon=0.01, + decay_ratio=0.4) + algo = DQN(env_spec=env.spec, + policy=policy, + qf=qf, + exploration_policy=exploration_policy, + replay_buffer=replay_buffer, + steps_per_epoch=steps_per_epoch, + qf_lr=5e-5, + discount=0.9, + min_buffer_size=int(1e4), + n_train_steps=500, + target_update_freq=30, + buffer_batch_size=64) + + runner.setup(algo, env, sampler_cls=LocalSampler) + runner.train(n_epochs=n_epochs, batch_size=sampler_batch_size) + + env.close() + + +dqn_cartpole() diff --git a/examples/torch/watch_atari.py b/examples/torch/watch_atari.py new file mode 100755 index 0000000000..b780d332c8 --- /dev/null +++ b/examples/torch/watch_atari.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +"""Utility to watch a trained agent play an Atari game.""" + +import click +import gym +import numpy as np + +from garage import rollout +from garage.envs import GymEnv +from garage.envs.wrappers.clip_reward import ClipReward +from garage.envs.wrappers.episodic_life import EpisodicLife +from garage.envs.wrappers.fire_reset import FireReset +from garage.envs.wrappers.grayscale import Grayscale +from garage.envs.wrappers.max_and_skip import MaxAndSkip +from garage.envs.wrappers.noop import Noop +from garage.envs.wrappers.resize import Resize +from garage.envs.wrappers.stack_frames import StackFrames +from garage.experiment import Snapshotter + + +# pylint: disable=no-value-for-parameter, protected-access +@click.command() +@click.argument('saved_dir', type=str) +@click.option('--env', type=str, default=None) +@click.option('--num_episodes', type=int, default=10) +def watch_atari(saved_dir, env=None, num_episodes=10): + """Watch a trained agent play an atari game. + + Args: + saved_dir (str): Directory containing the pickle file. + env (str): Environment to run episodes on. If None, the pickled + environment is used. + num_episodes (int): Number of episodes to play. Note that when using + the EpisodicLife wrapper, an episode is considered done when a + life is lost. Defaults to 10. 
+ """ + snapshotter = Snapshotter() + data = snapshotter.load(saved_dir) + if env is not None: + env = gym.make(env) + env = Noop(env, noop_max=30) + env = MaxAndSkip(env, skip=4) + env = EpisodicLife(env) + if 'FIRE' in env.unwrapped.get_action_meanings(): + env = FireReset(env) + env = Grayscale(env) + env = Resize(env, 84, 84) + env = ClipReward(env) + env = StackFrames(env, 4, axis=0) + env = GymEnv(env) + else: + env = data['env'] + + exploration_policy = data['algo'].exploration_policy + exploration_policy.policy._qf.to('cpu') + ep_rewards = np.asarray([]) + for _ in range(num_episodes): + episode_data = rollout(env, + exploration_policy.policy, + animated=True, + pause_per_frame=0.02) + ep_rewards = np.append(ep_rewards, np.sum(episode_data['rewards'])) + + print('Average Reward {}'.format(np.mean(ep_rewards))) + + +watch_atari() diff --git a/src/garage/_functions.py b/src/garage/_functions.py index a08effe367..19eae59767 100644 --- a/src/garage/_functions.py +++ b/src/garage/_functions.py @@ -1,5 +1,6 @@ """Functions exposed directly in the garage namespace.""" from collections import defaultdict +import time from dowel import tabular import numpy as np @@ -68,7 +69,7 @@ def rollout(env, *, max_episode_length=np.inf, animated=False, - speedup=1, + pause_per_frame=None, deterministic=False): """Sample a single episode of the agent in the environment. @@ -78,8 +79,8 @@ def rollout(env, max_episode_length (int): If the episode reaches this many timesteps, it is truncated. animated (bool): If true, render the environment after each step. - speedup (float): Factor by which to decrease the wait time between - rendered steps. Only relevant, if animated == true. + pause_per_frame (float): Time to sleep between steps. Only relevant if + animated == true. deterministic (bool): If true, use the mean action returned by the stochastic policy instead of sampling from the returned action distribution. @@ -104,7 +105,6 @@ def rollout(env, * dones(np.array): Array of termination signals. """ - del speedup env_steps = [] agent_infos = [] observations = [] @@ -114,6 +114,8 @@ def rollout(env, if animated: env.visualize() while episode_length < (max_episode_length or np.inf): + if pause_per_frame is not None: + time.sleep(pause_per_frame) a, agent_info = agent.get_action(last_obs) if deterministic and 'mean' in agent_info: a = agent_info['mean'] diff --git a/src/garage/envs/wrappers/fire_reset.py b/src/garage/envs/wrappers/fire_reset.py index ab7f151d1c..cadf3d6cf5 100644 --- a/src/garage/envs/wrappers/fire_reset.py +++ b/src/garage/envs/wrappers/fire_reset.py @@ -19,13 +19,33 @@ def __init__(self, env): 'Only use fire reset wrapper for suitable environment!') def step(self, action): - """gym.Env step function.""" + """gym.Env step function. + + Args: + action (int): index of the action to take. + + Returns: + np.ndarray: Observation conforming to observation_space + float: Reward for this step + bool: Termination signal + dict: Extra information from the environment. + """ return self.env.step(action) def reset(self, **kwargs): - """gym.Env reset function.""" + """gym.Env reset function. + + Args: + kwargs (dict): extra arguments passed to gym.Env.reset() + + Returns: + np.ndarray: next observation. 
+ """ self.env.reset(**kwargs) obs, _, done, _ = self.env.step(1) if done: - obs = self.env.reset(**kwargs) + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset(**kwargs) return obs diff --git a/src/garage/envs/wrappers/max_and_skip.py b/src/garage/envs/wrappers/max_and_skip.py index 9545825ce0..51253fd617 100644 --- a/src/garage/envs/wrappers/max_and_skip.py +++ b/src/garage/envs/wrappers/max_and_skip.py @@ -14,8 +14,8 @@ class MaxAndSkip(gym.Wrapper): render their sprites every other game frame. Args: - env: The environment to be wrapped. - skip: The environment only returns `skip`-th frame. + env (gym.Env): The environment to be wrapped. + skip (int): The environment only returns `skip`-th frame. """ @@ -26,13 +26,22 @@ def __init__(self, env, skip=4): self._skip = skip def step(self, action): - """ - gym.Env step. + """Repeat action, sum reward, and max over last two observations. + + Args: + action (int): action to take in the atari environment. + + Returns: + np.ndarray: observation of shape :math:`(O*,)` representating + the max values over the last two oservations. + float: Reward for this step + bool: Termination signal + dict: Extra information from the environment. - Repeat action, sum reward, and max over last two observations. """ total_reward = 0.0 done = None + for i in range(self._skip): obs, reward, done, info = self.env.step(action) if i == self._skip - 2: @@ -45,6 +54,11 @@ def step(self, action): max_frame = self._obs_buffer.max(axis=0) return max_frame, total_reward, done, info + # pylint: disable=arguments-differ def reset(self): - """gym.Env reset.""" + """gym.Env reset. + + Returns: + np.ndarray: observaion of shape :math:`(O*,)`. + """ return self.env.reset() diff --git a/src/garage/envs/wrappers/stack_frames.py b/src/garage/envs/wrappers/stack_frames.py index 01edb7d639..5ed17ae0ee 100644 --- a/src/garage/envs/wrappers/stack_frames.py +++ b/src/garage/envs/wrappers/stack_frames.py @@ -13,16 +13,23 @@ class StackFrames(gym.Wrapper): Only works with gym.spaces.Box environment with 2D single channel frames. Args: - env: gym.Env to wrap. - n_frames: number of frames to stack. + env (gym.Env): gym.Env to wrap. + n_frames (int): number of frames to stack. + axis (int): Axis to stack frames on. This should be 2 for tensorflow + and 0 for pytorch. Raises: - ValueError: If observation space shape is not 2 or - environment is not gym.spaces.Box. + ValueError: If observation space shape is not 2 dimnesional, + if the environment is not gym.spaces.Box, or if the specified axis + is not 0 or 2. 
+ """ - def __init__(self, env, n_frames): + def __init__(self, env, n_frames, axis=2): + if axis not in (0, 2): + raise ValueError('Frame stacking axis should be 0 for pytorch or ' + '2 for tensorflow.') if not isinstance(env.observation_space, gym.spaces.Box): raise ValueError('Stack frames only works with gym.spaces.Box ' 'environment.') @@ -34,9 +41,13 @@ def __init__(self, env, n_frames): super().__init__(env) self._n_frames = n_frames + self._axis = axis self._frames = deque(maxlen=n_frames) new_obs_space_shape = env.observation_space.shape + (n_frames, ) + if axis == 0: + new_obs_space_shape = (n_frames, ) + env.observation_space.shape + _low = env.observation_space.low.flatten()[0] _high = env.observation_space.high.flatten()[0] self._observation_space = gym.spaces.Box( @@ -47,7 +58,7 @@ def __init__(self, env, n_frames): @property def observation_space(self): - """gym.Env observation space.""" + """gym.spaces.Box: gym.Env observation space.""" return self._observation_space @observation_space.setter @@ -55,19 +66,44 @@ def observation_space(self, observation_space): self._observation_space = observation_space def _stack_frames(self): - return np.stack(self._frames, axis=2) + """Stacks and returns the last n_frames. + + Returns: + np.ndarray: stacked observation with shape either + :math:`(N, n_frames, O*)` or :math:(N, O*, n_frames), + depending on the axis specified. + """ + return np.stack(self._frames, axis=self._axis) + # pylint: disable=arguments-differ def reset(self): - """gym.Env reset function.""" + """gym.Env reset function. + + Returns: + np.ndarray: Observation conforming to observation_space + float: Reward for this step + bool: Termination signal + dict: Extra information from the environment. + """ observation = self.env.reset() self._frames.clear() - for i in range(self._n_frames): + for _ in range(self._n_frames): self._frames.append(observation) return self._stack_frames() def step(self, action): - """gym.Env step function.""" + """gym.Env step function. + + Args: + action (int): index of the action to take. + + Returns: + np.ndarray: Observation conforming to observation_space + float: Reward for this step + bool: Termination signal + dict: Extra information from the environment. + """ new_observation, reward, done, info = self.env.step(action) self._frames.append(new_observation) diff --git a/src/garage/experiment/snapshotter.py b/src/garage/experiment/snapshotter.py index 4eef38c881..549569fbf7 100644 --- a/src/garage/experiment/snapshotter.py +++ b/src/garage/experiment/snapshotter.py @@ -21,7 +21,10 @@ class Snapshotter: snapshot_mode (str): Mode to save the snapshot. Can be either "all" (all iterations will be saved), "last" (only the last iteration will be saved), "gap" (every snapshot_gap iterations are saved), - or "none" (do not save snapshots). + "gap_and_last" (save the last iteration as 'params.pkl' and save + every snapshot_gap iteration separately), "gap_overwrite" (same as + gap but overwrites the last saved snapshot), or "none" (do not + save snapshots). snapshot_gap (int): Gap between snapshot iterations. Wait this number of iterations before taking another snapshot. @@ -36,6 +39,16 @@ def __init__(self, self._snapshot_mode = snapshot_mode self._snapshot_gap = snapshot_gap + if snapshot_mode == 'gap_overwrite' and snapshot_gap <= 1: + raise ValueError('snapshot_gap must be > 1 when using ' + 'snapshot_mode="gap_overwrite". 
Use ' + 'snapshot_mode="last" to snapshot after ' + 'every iteration.') + if snapshot_mode == 'last' and snapshot_gap != 1: + raise ValueError('snapshot_gap should be set to 1 if using ' + 'snapshot_mode="last". Did you mean to' + ' use snapshot_mode="gap"?') + pathlib.Path(snapshot_dir).mkdir(parents=True, exist_ok=True) @property @@ -53,7 +66,8 @@ def snapshot_mode(self): """Return the type of snapshot. Returns: - str: The type of snapshot. Can be "all", "last" or "gap" + str: The type of snapshot. Can be "all", "last", "gap", + "gap_overwrite", "gap_and_last", or "none". """ return self._snapshot_mode @@ -76,13 +90,17 @@ def save_itr_params(self, itr, params): params (obj): Content of snapshot to be saved. Raises: - ValueError: If snapshot_mode is not one of "all", "last" or "gap". + ValueError: If snapshot_mode is not one of "all", "last", "gap", + "gap_overwrite", "gap_and_last", or "none". """ file_name = None if self._snapshot_mode == 'all': file_name = os.path.join(self._snapshot_dir, 'itr_%d.pkl' % itr) + elif self._snapshot_mode == 'gap_overwrite': + if itr % self._snapshot_gap == 0: + file_name = os.path.join(self._snapshot_dir, 'params.pkl') elif self._snapshot_mode == 'last': # override previous params file_name = os.path.join(self._snapshot_dir, 'params.pkl') diff --git a/src/garage/plotter/plotter.py b/src/garage/plotter/plotter.py index fc2d05e30b..fe82d59144 100644 --- a/src/garage/plotter/plotter.py +++ b/src/garage/plotter/plotter.py @@ -72,15 +72,13 @@ def _worker_start(self): rollout(env, policy, max_episode_length=max_length, - animated=True, - speedup=5) + animated=True) else: if max_length: rollout(env, policy, max_episode_length=max_length, - animated=True, - speedup=5) + animated=True) except KeyboardInterrupt: pass @@ -139,11 +137,7 @@ def init_plot(self, env, policy): # Needed in order to draw glfw window on the main thread if 'Darwin' in platform.platform(): - rollout(env, - policy, - max_episode_length=np.inf, - animated=True, - speedup=5) + rollout(env, policy, max_episode_length=np.inf, animated=True) self._queue.put(Message(op=Op.UPDATE, args=(env, policy), kwargs=None)) diff --git a/src/garage/tf/plotter/plotter.py b/src/garage/tf/plotter/plotter.py index a5d5b87ee8..7083e1ca36 100644 --- a/src/garage/tf/plotter/plotter.py +++ b/src/garage/tf/plotter/plotter.py @@ -67,8 +67,7 @@ def __init__(self, self.rollout(self._env, self._policy, max_episode_length=np.inf, - animated=True, - speedup=5) + animated=True) def _start_worker(self): max_length = None @@ -106,16 +105,14 @@ def _start_worker(self): self.rollout(self._env, self._policy, max_episode_length=max_length, - animated=True, - speedup=5) + animated=True) self.queue.task_done() else: if max_length: self.rollout(self._env, self._policy, max_episode_length=max_length, - animated=True, - speedup=5) + animated=True) except KeyboardInterrupt: pass diff --git a/src/garage/torch/algos/__init__.py b/src/garage/torch/algos/__init__.py index d440ef9f46..c0c95ecab2 100644 --- a/src/garage/torch/algos/__init__.py +++ b/src/garage/torch/algos/__init__.py @@ -5,6 +5,7 @@ from garage.torch.algos.ddpg import DDPG # VPG has to be imported first because it is depended by PPO and TRPO. 
# PPO, TRPO, and VPG need to be imported before their MAML variants +from garage.torch.algos.dqn import DQN from garage.torch.algos.vpg import VPG from garage.torch.algos.maml_vpg import MAMLVPG from garage.torch.algos.ppo import PPO @@ -17,6 +18,6 @@ from garage.torch.algos.pearl import PEARL __all__ = [ - 'BC', 'DDPG', 'VPG', 'PPO', 'TRPO', 'MAMLPPO', 'MAMLTRPO', 'MAMLVPG', - 'MTSAC', 'PEARL', 'SAC' + 'BC', 'DDPG', 'DQN', 'VPG', 'PPO', 'TRPO', 'MAMLPPO', 'MAMLTRPO', + 'MAMLVPG', 'MTSAC', 'PEARL', 'SAC' ] diff --git a/src/garage/torch/algos/dqn.py b/src/garage/torch/algos/dqn.py new file mode 100644 index 0000000000..f6c5f06716 --- /dev/null +++ b/src/garage/torch/algos/dqn.py @@ -0,0 +1,289 @@ +"""This modules creates a DDPG model in PyTorch.""" +import collections +import copy + +from dowel import logger, tabular +import numpy as np +import torch +import torch.nn.functional as F + +from garage import _Default, log_performance, make_optimizer +from garage._functions import obtain_evaluation_episodes +from garage.np.algos import RLAlgorithm +from garage.sampler import FragmentWorker +from garage.torch import global_device, np_to_torch + + +class DQN(RLAlgorithm): + """DQN algorithm. See https://arxiv.org/pdf/1312.5602.pdf. + + DQN, also known as the Deep Q Network algorithm, is an off-policy algorithm + that learns action-value estimates for each state, action pair. The + policy then simply acts by taking the action that yields the highest Q(s,a) + value for a given state s. + + Args: + env_spec (EnvSpec): Environment specification. + policy (garage.torch.policies.Policy): Policy. For DQN, this is a + policy that performs the action that yields the highest Q value. + qf (nn.Module): Q-value network. + replay_buffer (ReplayBuffer): Replay buffer. + steps_per_epoch (int): Number of train_once calls per epoch. + n_train_steps (int): Training steps. + eval_env (Environment): Evaluation environment. If None, a copy of the + main environment is used for evaluation. + max_episode_length_eval (int or None): Maximum length of episodes used + for off-policy evaluation. If `None`, defaults to + `env_spec.max_episode_length`. + buffer_batch_size (int): Batch size of replay buffer. + min_buffer_size (int): The minimum buffer size for replay buffer. + exploration_policy (ExplorationPolicy): Exploration strategy, typically + epsilon-greedy. + num_eval_episodes (int): Nunber of evaluation episodes. Defaults to 10. + deterministic_eval (bool): Whether to evaluate the policy + deterministically (without exploration noise). False by default. + target_update_freq (int): Number of optimization steps between each + update to the target Q network. + discount(float): Discount factor for the cumulative return. + qf_optimizer (Union[type, tuple[type, dict]]): Type of optimizer + for training Q-value network. This can be an optimizer type such + as `torch.optim.Adam` or a tuple of type and dictionary, where + dictionary contains arguments to initialize the optimizer + e.g. `(torch.optim.Adam, {'lr' : 1e-3})`. + qf_lr (float): Learning rate for Q-value network parameters. + clip_rewards (float): Clip reward to be in [-clip_rewards, + clip_rewards]. If None, rewards are not clipped. + clip_gradient (float): Clip gradient norm to `clip_gradient`. If None, + gradient are not clipped. Defaults to 10. + reward_scale (float): Reward scale. 
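+
+    Example:
+        A minimal sketch of wiring DQN into a trainer; the surrounding
+        objects mirror ``tests/garage/torch/algos/test_dqn.py``, and
+        ``trainer`` is assumed to be a ``Trainer`` constructed elsewhere::
+
+            env = GymEnv('CartPole-v0')
+            replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
+            qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))
+            policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
+            exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec,
+                                                     policy=policy,
+                                                     total_timesteps=10000,
+                                                     max_epsilon=1.0,
+                                                     min_epsilon=0.01,
+                                                     decay_ratio=0.4)
+            algo = DQN(env.spec, policy, qf, replay_buffer,
+                       exploration_policy=exploration_policy,
+                       steps_per_epoch=10,
+                       n_train_steps=500)
+            trainer.setup(algo, env, sampler_cls=LocalSampler)
+            trainer.train(n_epochs=10, batch_size=512)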
+ """ + worker_cls = FragmentWorker + + def __init__( + self, + env_spec, + policy, + qf, + replay_buffer, + exploration_policy=None, + eval_env=None, + qf_optimizer=torch.optim.Adam, + *, # Everything after this is numbers. + steps_per_epoch=20, + n_train_steps=50, + max_episode_length_eval=None, + deterministic_eval=False, + buffer_batch_size=64, + min_buffer_size=int(1e4), + num_eval_episodes=10, + discount=0.99, + qf_lr=_Default(1e-3), + clip_rewards=None, + clip_gradient=10, + target_update_freq=5, + reward_scale=1.): + self._clip_reward = clip_rewards + self._clip_grad = clip_gradient + + self._steps_per_epoch = steps_per_epoch + self._target_update_freq = target_update_freq + self._episode_qf_losses = [] + self._epoch_ys = [] + self._epoch_qs = [] + + self._policy = policy + self._qf = qf + self._n_train_steps = n_train_steps + + self._min_buffer_size = min_buffer_size + self._qf = qf + self._steps_per_epoch = steps_per_epoch + self._n_train_steps = n_train_steps + self._buffer_batch_size = buffer_batch_size + self._discount = discount + self._reward_scale = reward_scale + self.max_episode_length = env_spec.max_episode_length + self._max_episode_length_eval = (max_episode_length_eval + or self.max_episode_length) + self._episode_reward_mean = collections.deque(maxlen=100) + self._num_eval_episodes = num_eval_episodes + self._deterministic_eval = deterministic_eval + + self.env_spec = env_spec + self.replay_buffer = replay_buffer + self.policy = policy + self.exploration_policy = exploration_policy + + self._target_qf = copy.deepcopy(self._qf) + self._qf_optimizer = make_optimizer(qf_optimizer, + module=self._qf, + lr=qf_lr) + self._eval_env = eval_env + + def train(self, trainer): + """Obtain samplers and start actual training for each epoch. + + Args: + trainer (Trainer): Experiment trainer. + + Returns: + float: The average return in last epoch cycle. + + """ + if not self._eval_env: + self._eval_env = trainer.get_env_copy() + last_returns = [float('nan')] + + if self._min_buffer_size > self.replay_buffer.n_transitions_stored: + num_warmup_steps = (self._min_buffer_size - + self.replay_buffer.n_transitions_stored) + self.replay_buffer.add_episode_batch( + trainer.obtain_episodes(0, num_warmup_steps)) + + trainer.enable_logging = True + + for _ in trainer.step_epochs(): + if (self.replay_buffer.n_transitions_stored >= + self._min_buffer_size): + logger.log('Evaluating policy') + + params_before = self.exploration_policy.get_param_values() + eval_eps = obtain_evaluation_episodes( + (self.exploration_policy + if not self._deterministic_eval else self.policy), + self._eval_env, + num_eps=self._num_eval_episodes, + max_episode_length=self._max_episode_length_eval) + self.exploration_policy.set_param_values(params_before) + + last_returns = log_performance(trainer.step_itr, + eval_eps, + discount=self._discount) + self._episode_reward_mean.extend(last_returns) + tabular.record('Evaluation/100EpRewardMean', + np.mean(self._episode_reward_mean)) + + for _ in range(self._steps_per_epoch): + trainer.step_path = trainer.obtain_episodes(trainer.step_itr) + if hasattr(self.exploration_policy, 'update'): + self.exploration_policy.update(trainer.step_path) + + self._train_once(trainer.step_itr, trainer.step_path) + trainer.step_itr += 1 + + return np.mean(last_returns) + + def _train_once(self, itr, episodes): + """Perform one iteration of training. + + Args: + itr (int): Iteration number. + episodes (EpisodeBatch): Batch of episodes. 
+ + """ + self.replay_buffer.add_episode_batch(episodes) + + epoch = itr / self._steps_per_epoch + + for _ in range(self._n_train_steps): + if (self.replay_buffer.n_transitions_stored >= + self._min_buffer_size): + timesteps = self.replay_buffer.sample_timesteps( + self._buffer_batch_size) + qf_loss, y, q = tuple(v.cpu().numpy() + for v in self._optimize_qf(timesteps)) + + self._episode_qf_losses.append(qf_loss) + self._epoch_ys.append(y) + self._epoch_qs.append(q) + + if itr % self._steps_per_epoch == 0: + self._log_eval_results(epoch) + + if itr % self._target_update_freq == 0: + self._target_qf = copy.deepcopy(self._qf) + + def _log_eval_results(self, epoch): + """Log evaluation results after an epoch. + + Args: + epoch (int): Current epoch. + """ + logger.log('Training finished') + + if self.replay_buffer.n_transitions_stored >= self._min_buffer_size: + tabular.record('Epoch', epoch) + tabular.record('QFunction/AverageQFunctionLoss', + np.mean(self._episode_qf_losses)) + tabular.record('QFunction/AverageQ', np.mean(self._epoch_qs)) + tabular.record('QFunction/MaxQ', np.max(self._epoch_qs)) + tabular.record('QFunction/AverageAbsQ', + np.mean(np.abs(self._epoch_qs))) + tabular.record('QFunction/AverageY', np.mean(self._epoch_ys)) + tabular.record('QFunction/MaxY', np.max(self._epoch_ys)) + tabular.record('QFunction/AverageAbsY', + np.mean(np.abs(self._epoch_ys))) + + def _optimize_qf(self, timesteps): + """Perform algorithm optimizing. + + Args: + timesteps (TimeStepBatch): Processed batch data. + + Returns: + qval_loss: Loss of Q-value predicted by the Q-network. + ys: y_s. + qval: Q-value predicted by the Q-network. + + """ + observations = np_to_torch(timesteps.observations) + rewards = np_to_torch(timesteps.rewards).reshape(-1, 1) + rewards *= self._reward_scale + actions = np_to_torch(timesteps.actions) + next_observations = np_to_torch(timesteps.next_observations) + terminals = np_to_torch(timesteps.terminals).reshape(-1, 1) + + next_inputs = next_observations + inputs = observations + with torch.no_grad(): + # discrete, outputs Qs for all possible actions + target_qvals = self._target_qf(next_inputs) + best_qvals, _ = torch.max(target_qvals, 1) + best_qvals = best_qvals.unsqueeze(1) + + rewards_clipped = rewards + if self._clip_reward is not None: + rewards_clipped = torch.clamp(rewards, -1 * self._clip_reward, + self._clip_reward) + y_target = (rewards_clipped + + (1.0 - terminals) * self._discount * best_qvals) + y_target = y_target.squeeze(1) + + # optimize qf + qvals = self._qf(inputs) + selected_qs = torch.sum(qvals * actions, axis=1) + qval_loss = F.smooth_l1_loss(selected_qs, y_target) + + self._qf_optimizer.zero_grad() + qval_loss.backward() + + # optionally clip the gradients + if self._clip_grad is not None: + torch.nn.utils.clip_grad_norm_(self.policy.parameters(), + self._clip_grad) + self._qf_optimizer.step() + + return (qval_loss.detach(), y_target, selected_qs.detach()) + + def to(self, device=None): + """Put all the networks within the model on device. + + Args: + device (str): ID of GPU or CPU. 
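+                If None, the device returned by ``global_device()`` is
+                used.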
+ + """ + if device is None: + device = global_device() + logger.log('Using device: ' + str(device)) + self._qf = self._qf.to(device) + self._target_qf = self._target_qf.to(device) diff --git a/src/garage/torch/policies/discrete_qf_argmax_policy.py b/src/garage/torch/policies/discrete_qf_argmax_policy.py index 4ed39c53b4..9cf8c2e625 100644 --- a/src/garage/torch/policies/discrete_qf_argmax_policy.py +++ b/src/garage/torch/policies/discrete_qf_argmax_policy.py @@ -5,6 +5,7 @@ import numpy as np import torch +from garage.torch import np_to_torch from garage.torch.policies.policy import Policy @@ -65,4 +66,4 @@ def get_actions(self, observations): dict: Empty since this policy does not produce a distribution. """ with torch.no_grad(): - return self(torch.Tensor(observations)).numpy(), dict() + return self(np_to_torch(observations)).cpu().numpy(), dict() diff --git a/src/garage/torch/q_functions/discrete_cnn_q_function.py b/src/garage/torch/q_functions/discrete_cnn_q_function.py index 6b27d77a98..4550ef52da 100644 --- a/src/garage/torch/q_functions/discrete_cnn_q_function.py +++ b/src/garage/torch/q_functions/discrete_cnn_q_function.py @@ -27,7 +27,6 @@ class DiscreteCNNQFunction(DiscreteCNNModule): For example, (3, 32) means there are two convolutional layers. The filter for the first conv layer outputs 3 channels and the second one outputs 32 channels. - minibatch_size (int): Size of the optimization minibatch. hidden_sizes (list[int]): Output dimension of dense layer(s) for the MLP for mean. For example, (32, 32) means the MLP consists of two hidden layers, each with 32 hidden units. @@ -71,10 +70,9 @@ def __init__(self, kernel_sizes, hidden_channels, strides, - minibatch_size, hidden_sizes=(32, 32), - cnn_hidden_nonlinearity=torch.relu, - mlp_hidden_nonlinearity=torch.relu, + cnn_hidden_nonlinearity=torch.nn.ReLU, + mlp_hidden_nonlinearity=torch.nn.ReLU, hidden_w_init=nn.init.xavier_uniform_, hidden_b_init=nn.init.zeros_, paddings=0, @@ -88,7 +86,8 @@ def __init__(self, layer_normalization=False, is_image=True): - input_shape = (minibatch_size, ) + env_spec.observation_space.shape + self._env_spec = env_spec + input_shape = (1, ) + env_spec.observation_space.shape output_dim = env_spec.action_space.flat_dim super().__init__(input_shape=input_shape, output_dim=output_dim, @@ -110,3 +109,21 @@ def __init__(self, output_b_init=output_b_init, layer_normalization=layer_normalization, is_image=is_image) + + # pylint: disable=arguments-differ + def forward(self, observations): + """Return Q-value(s). + + Args: + observations (np.ndarray): observations of shape :math: `(N, O*)`. 
+ + Returns: + torch.Tensor: Output value + """ + if observations.shape != self._env_spec.observation_space.shape: + # avoid using observation_space.unflatten_n + # to support tensors on GPUs + obs_shape = ((len(observations), ) + + self._env_spec.observation_space.shape) + observations = observations.reshape(obs_shape) + return super().forward(observations) diff --git a/src/garage/trainer.py b/src/garage/trainer.py index 16f0a3ec8d..e7b0daf94c 100644 --- a/src/garage/trainer.py +++ b/src/garage/trainer.py @@ -516,6 +516,7 @@ def train(self, start_epoch=0) self._plot = plot + self._start_worker() average_return = self._algo.train(self) self._shutdown_worker() @@ -543,7 +544,6 @@ def step_epochs(self): trainer.step_itr += 1 """ - self._start_worker() self._start_time = time.time() self.step_itr = self._stats.total_itr self.step_episode = None diff --git a/tests/garage/envs/wrappers/test_fire_reset.py b/tests/garage/envs/wrappers/test_fire_reset.py index dde1f5597e..efbe2b2d80 100644 --- a/tests/garage/envs/wrappers/test_fire_reset.py +++ b/tests/garage/envs/wrappers/test_fire_reset.py @@ -8,15 +8,16 @@ class TestFireReset: def test_fire_reset(self): - env = DummyDiscretePixelEnv() + env = DummyDiscretePixelEnv(random=False) env_wrap = FireReset(env) obs = env.reset() obs_wrap = env_wrap.reset() assert np.array_equal(obs, np.ones(env.observation_space.shape)) assert np.array_equal(obs_wrap, np.full(env.observation_space.shape, - 2)) + 3)) env_wrap.step(2) obs_wrap = env_wrap.reset() # env will call reset again, after fire - assert np.array_equal(obs_wrap, np.ones(env.observation_space.shape)) + assert np.array_equal(obs_wrap, np.full(env.observation_space.shape, + 3)) diff --git a/tests/garage/envs/wrappers/test_stack_frames_env.py b/tests/garage/envs/wrappers/test_stack_frames_env.py index f5e4745623..e60e905391 100644 --- a/tests/garage/envs/wrappers/test_stack_frames_env.py +++ b/tests/garage/envs/wrappers/test_stack_frames_env.py @@ -34,6 +34,7 @@ def test_stack_frames_invalid_environment_shape(self): StackFrames(self.env, n_frames=4) def test_stack_frames_output_observation_space(self): + print(self.env_s.observation_space.shape) assert self.env_s.observation_space.shape == (self.width, self.height, self.n_frames) @@ -56,3 +57,24 @@ def test_stack_frames_for_step(self): obs_stack, _, _, _ = self.env_s.step(1) np.testing.assert_array_equal(obs_stack, frame_stack) + + def test_stack_frames_axis(self): + env = StackFrames(DummyDiscrete2DEnv(random=False), + n_frames=self.n_frames, + axis=0) + env.reset() + obs, _, _, _ = env.step(1) + assert obs.shape[0] == self.n_frames + + env = StackFrames(DummyDiscrete2DEnv(random=False), + n_frames=self.n_frames, + axis=2) + env.reset() + obs, _, _, _ = env.step(1) + assert obs.shape[2] == self.n_frames + + def test_invalid_axis_raises_error(self): + with pytest.raises(ValueError): + StackFrames(DummyDiscrete2DEnv(random=False), + n_frames=self.n_frames, + axis=5) diff --git a/tests/garage/experiment/test_snapshotter.py b/tests/garage/experiment/test_snapshotter.py index 4081395a3e..358d5887d1 100644 --- a/tests/garage/experiment/test_snapshotter.py +++ b/tests/garage/experiment/test_snapshotter.py @@ -20,6 +20,7 @@ class TestSnapshotter: + def setup_method(self): self.temp_dir = tempfile.TemporaryDirectory() @@ -28,11 +29,11 @@ def teardown_method(self): @pytest.mark.parametrize('mode, files', [*configurations]) def test_snapshotter(self, mode, files): - snapshotter = Snapshotter(self.temp_dir.name, mode, 2) + snapshotter = 
Snapshotter(self.temp_dir.name, mode, 1) assert snapshotter.snapshot_dir == self.temp_dir.name assert snapshotter.snapshot_mode == mode - assert snapshotter.snapshot_gap == 2 + assert snapshotter.snapshot_gap == 1 snapshot_data = [{'testparam': 1}, {'testparam': 4}] snapshotter.save_itr_params(1, snapshot_data[0]) @@ -45,8 +46,35 @@ def test_snapshotter(self, mode, files): data = pickle.load(pkl_file) assert data == snapshot_data[num] + def test_gap_overwrite(self): + snapshotter = Snapshotter(self.temp_dir.name, 'gap_overwrite', 2) + assert snapshotter.snapshot_dir == self.temp_dir.name + assert snapshotter.snapshot_mode == 'gap_overwrite' + assert snapshotter.snapshot_gap == 2 + + snapshot_data = [{'testparam': 1}, {'testparam': 4}] + snapshotter.save_itr_params(1, snapshot_data[0]) + snapshotter.save_itr_params(2, snapshot_data[1]) + + filename = osp.join(self.temp_dir.name, 'params.pkl') + assert osp.exists(filename) + with open(filename, 'rb') as pkl_file: + data = pickle.load(pkl_file) + assert data == snapshot_data[1] + def test_invalid_snapshot_mode(self): with pytest.raises(ValueError): - snapshotter = Snapshotter( - snapshot_dir=self.temp_dir.name, snapshot_mode='invalid') + snapshotter = Snapshotter(snapshot_dir=self.temp_dir.name, + snapshot_mode='invalid') snapshotter.save_itr_params(2, {'testparam': 'invalid'}) + + def test_conflicting_params(self): + with pytest.raises(ValueError): + Snapshotter(snapshot_dir=self.temp_dir.name, + snapshot_mode='last', + snapshot_gap=2) + + with pytest.raises(ValueError): + Snapshotter(snapshot_dir=self.temp_dir.name, + snapshot_mode='gap_overwrite', + snapshot_gap=1) diff --git a/tests/garage/torch/algos/test_dqn.py b/tests/garage/torch/algos/test_dqn.py new file mode 100644 index 0000000000..ea3c157d7a --- /dev/null +++ b/tests/garage/torch/algos/test_dqn.py @@ -0,0 +1,134 @@ +"""Test DQN performance on cartpole.""" +import copy +import tempfile +from unittest.mock import MagicMock + +import pytest +import torch +from torch.nn import functional as F # NOQA + +from garage.envs import GymEnv +from garage.experiment import SnapshotConfig +from garage.experiment.deterministic import set_seed +from garage.np.exploration_policies import EpsilonGreedyPolicy +from garage.replay_buffer import PathBuffer +from garage.sampler import LocalSampler +from garage.torch import np_to_torch +from garage.torch.algos import DQN +from garage.torch.policies import DiscreteQFArgmaxPolicy +from garage.torch.q_functions import DiscreteMLPQFunction +from garage.trainer import Trainer + +from tests.fixtures import snapshot_config + + +@pytest.fixture +def setup(): + set_seed(24) + n_epochs = 11 + steps_per_epoch = 10 + sampler_batch_size = 512 + num_timesteps = 100 * steps_per_epoch * sampler_batch_size + + env = GymEnv('CartPole-v0') + + replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) + + qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5)) + + policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf) + exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec, + policy=policy, + total_timesteps=num_timesteps, + max_epsilon=1.0, + min_epsilon=0.01, + decay_ratio=0.4) + algo = DQN(env_spec=env.spec, + policy=policy, + qf=qf, + exploration_policy=exploration_policy, + replay_buffer=replay_buffer, + steps_per_epoch=steps_per_epoch, + qf_lr=5e-5, + discount=0.9, + min_buffer_size=int(1e4), + n_train_steps=500, + target_update_freq=30, + buffer_batch_size=64) + + return algo, env, replay_buffer, n_epochs, sampler_batch_size + + 
+@pytest.mark.large +def test_dqn_cartpole(setup): + tempdir = tempfile.TemporaryDirectory() + config = SnapshotConfig(snapshot_dir=tempdir.name, + snapshot_mode='last', + snapshot_gap=1) + + trainer = Trainer(config) + algo, env, _, n_epochs, batch_size = setup + trainer.setup(algo, env, sampler_cls=LocalSampler) + last_avg_return = trainer.train(n_epochs=n_epochs, batch_size=batch_size) + assert last_avg_return > 10 + env.close() + + # test resume from snapshot + trainer.restore(tempdir.name) + trainer.resume(n_epochs=1, batch_size=batch_size) + + +def test_dqn_loss(setup): + algo, env, buff, _, batch_size = setup + + trainer = Trainer(snapshot_config) + trainer.setup(algo, env, sampler_cls=LocalSampler) + + paths = trainer.obtain_episodes(0, batch_size=batch_size) + buff.add_episode_batch(paths) + timesteps = buff.sample_timesteps(algo._buffer_batch_size) + timesteps_copy = copy.deepcopy(timesteps) + + observations = np_to_torch(timesteps.observations) + rewards = np_to_torch(timesteps.rewards).reshape(-1, 1) + actions = np_to_torch(timesteps.actions) + next_observations = np_to_torch(timesteps.next_observations) + terminals = np_to_torch(timesteps.terminals).reshape(-1, 1) + + next_inputs = next_observations + inputs = observations + with torch.no_grad(): + target_qvals = algo._target_qf(next_inputs) + best_qvals, _ = torch.max(target_qvals, 1) + best_qvals = best_qvals.unsqueeze(1) + + rewards_clipped = rewards + y_target = (rewards_clipped + + (1.0 - terminals) * algo._discount * best_qvals) + y_target = y_target.squeeze(1) + + # optimize qf + qvals = algo._qf(inputs) + selected_qs = torch.sum(qvals * actions, axis=1) + qval_loss = F.smooth_l1_loss(selected_qs, y_target) + + algo_loss, algo_targets, algo_selected_qs = algo._optimize_qf( + timesteps_copy) + env.close() + + assert (qval_loss.detach() == algo_loss).all() + assert (y_target == algo_targets).all() + assert (selected_qs == algo_selected_qs).all() + + +def test_to_device(setup): + algo, _, _, _, _ = setup + algo._qf.to = MagicMock(name='to') + algo._target_qf.to = MagicMock(name='to') + + algo._qf.to.return_value = algo._qf + algo._target_qf.to.return_value = algo._target_qf + + algo.to('cpu') + algo._qf.to.assert_called_once_with('cpu') + algo._target_qf.to.assert_called_once_with('cpu') diff --git a/tests/garage/torch/policies/test_discrete_qf_argmax_policy.py b/tests/garage/torch/policies/test_discrete_qf_argmax_policy.py index 1f40314b8b..78d72bcfeb 100644 --- a/tests/garage/torch/policies/test_discrete_qf_argmax_policy.py +++ b/tests/garage/torch/policies/test_discrete_qf_argmax_policy.py @@ -36,7 +36,7 @@ def test_get_action(): hidden_sizes=(2, 2)) qvals = qf(obs.unsqueeze(0)) policy = DiscreteQFArgmaxPolicy(qf, env_spec) - action, _ = policy.get_action(obs) + action, _ = policy.get_action(obs.numpy()) assert action == torch.argmax(qvals, dim=1).numpy() assert action.shape == () @@ -51,7 +51,7 @@ def test_get_actions(batch_size): hidden_sizes=(2, 2)) qvals = qf(obs) policy = DiscreteQFArgmaxPolicy(qf, env_spec) - actions, _ = policy.get_actions(obs) + actions, _ = policy.get_actions(obs.numpy()) assert (actions == torch.argmax(qvals, dim=1).numpy()).all() assert actions.shape == (batch_size, ) @@ -66,9 +66,9 @@ def test_is_pickleable(batch_size): hidden_sizes=(2, 2)) policy = DiscreteQFArgmaxPolicy(qf, env_spec) - output1 = policy.get_actions(obs)[0] + output1 = policy.get_actions(obs.numpy())[0] p = pickle.dumps(policy) policy_pickled = pickle.loads(p) - output2 = policy_pickled.get_actions(obs)[0] + output2 = 
policy_pickled.get_actions(obs.numpy())[0] assert np.array_equal(output1, output2) diff --git a/tests/garage/torch/q_functions/test_discrete_cnn_q_function.py b/tests/garage/torch/q_functions/test_discrete_cnn_q_function.py index a02ad1dc0b..667aae734d 100644 --- a/tests/garage/torch/q_functions/test_discrete_cnn_q_function.py +++ b/tests/garage/torch/q_functions/test_discrete_cnn_q_function.py @@ -27,7 +27,6 @@ def test_forward(batch_size, hidden_channels, kernel_sizes, strides): qf = DiscreteCNNQFunction(env_spec=env_spec, kernel_sizes=kernel_sizes, strides=strides, - minibatch_size=batch_size, mlp_hidden_nonlinearity=None, cnn_hidden_nonlinearity=None, hidden_channels=hidden_channels, @@ -57,7 +56,6 @@ def test_is_pickleable(batch_size, hidden_channels, kernel_sizes, strides): qf = DiscreteCNNQFunction(env_spec=env_spec, kernel_sizes=kernel_sizes, strides=strides, - minibatch_size=batch_size, mlp_hidden_nonlinearity=None, cnn_hidden_nonlinearity=None, hidden_channels=hidden_channels, diff --git a/tests/integration_tests/test_examples.py b/tests/integration_tests/test_examples.py index c58eea6458..0d36d411a0 100644 --- a/tests/integration_tests/test_examples.py +++ b/tests/integration_tests/test_examples.py @@ -8,6 +8,7 @@ EXAMPLES_ROOT_DIR = pathlib.Path('examples/') NON_ALGO_EXAMPLES = [ EXAMPLES_ROOT_DIR / 'torch/resume_training.py', + EXAMPLES_ROOT_DIR / 'torch/watch_atari.py', EXAMPLES_ROOT_DIR / 'tf/resume_training.py', EXAMPLES_ROOT_DIR / 'sim_policy.py', EXAMPLES_ROOT_DIR / 'step_env.py', @@ -20,6 +21,8 @@ EXAMPLES_ROOT_DIR / 'tf/dqn_pong.py', EXAMPLES_ROOT_DIR / 'tf/her_ddpg_fetchreach.py', EXAMPLES_ROOT_DIR / 'tf/trpo_cubecrash.py', + EXAMPLES_ROOT_DIR / 'torch/dqn_cartpole.py', + EXAMPLES_ROOT_DIR / 'torch/dqn_atari.py', EXAMPLES_ROOT_DIR / 'torch/maml_ppo_half_cheetah_dir.py', EXAMPLES_ROOT_DIR / 'torch/maml_trpo_half_cheetah_dir.py', EXAMPLES_ROOT_DIR / 'torch/maml_vpg_half_cheetah_dir.py', @@ -99,6 +102,24 @@ def test_dqn_pong(): env=env).returncode == 0 +@pytest.mark.no_cover +@pytest.mark.timeout(200) +def test_dqn_atari(): + """Test torch/dqn_atari.py with reduced replay buffer size. + + This is to reduced memory consumption. 
+ + """ + env = os.environ.copy() + env['GARAGE_EXAMPLE_TEST_N_EPOCHS'] = '1' + assert subprocess.run([ + EXAMPLES_ROOT_DIR / 'torch/dqn_atari.py', 'Pong', '--buffer_size', '1', + '--max_episode_length', '1' + ], + check=False, + env=env).returncode == 0 + + @pytest.mark.no_cover @pytest.mark.timeout(30) def test_ppo_memorize_digits(): From c0fd41d73da7e7a71d6054e87370be35ca708e67 Mon Sep 17 00:00:00 2001 From: Ruofu Wang <31981600+yeukfu@users.noreply.github.com> Date: Wed, 21 Oct 2020 18:53:42 -0700 Subject: [PATCH 15/23] Refactor te_npo to use episode batch (#2137) * Refactor te_npo to use episode batch * Change variable name --- .../linear_multi_feature_baseline.py | 4 +- src/garage/tf/algos/te_npo.py | 280 +++++++++--------- 2 files changed, 138 insertions(+), 146 deletions(-) diff --git a/src/garage/np/baselines/linear_multi_feature_baseline.py b/src/garage/np/baselines/linear_multi_feature_baseline.py index 3ab92bc7a9..bfb88575b3 100644 --- a/src/garage/np/baselines/linear_multi_feature_baseline.py +++ b/src/garage/np/baselines/linear_multi_feature_baseline.py @@ -21,7 +21,7 @@ def __init__(self, reg_coeff=1e-5, name='LinearMultiFeatureBaseline'): super().__init__(env_spec, reg_coeff, name) - features = features or ['observation'] + features = features or ['observations'] self._feature_names = features def _features(self, path): @@ -38,7 +38,7 @@ def _features(self, path): np.clip(path[feature_name], -10, 10) for feature_name in self._feature_names ] - n = len(path['rewards']) + n = len(path['observations']) return np.concatenate(sum([[f, f**2] for f in features], []) + [np.ones((n, 1))], axis=1) diff --git a/src/garage/tf/algos/te_npo.py b/src/garage/tf/algos/te_npo.py index e9db7b223c..f4d526a952 100644 --- a/src/garage/tf/algos/te_npo.py +++ b/src/garage/tf/algos/te_npo.py @@ -7,9 +7,12 @@ import scipy.stats import tensorflow as tf -from garage import EpisodeBatch, InOutSpec, log_performance +from garage import InOutSpec, log_performance from garage.experiment import deterministic -from garage.np import explained_variance_1d, rrse, sliding_window +from garage.np import (discount_cumsum, + explained_variance_1d, + rrse, + sliding_window) from garage.np.algos import RLAlgorithm from garage.sampler import LocalSampler from garage.tf import (center_advs, @@ -19,10 +22,7 @@ discounted_returns, flatten_inputs, graph_inputs, - pad_tensor, pad_tensor_dict, - pad_tensor_n, - paths_to_tensors, positive_advs, stack_tensor_dict_list) from garage.tf.embeddings import StochasticEncoder @@ -210,118 +210,107 @@ def train(self, trainer): last_return = None for _ in trainer.step_epochs(): - trainer.step_path = trainer.obtain_samples(trainer.step_itr) + trainer.step_path = trainer.obtain_episodes(trainer.step_itr) last_return = self._train_once(trainer.step_itr, trainer.step_path) trainer.step_itr += 1 return last_return - def _train_once(self, itr, paths): + def _train_once(self, itr, episodes): """Perform one step of policy optimization given one batch of samples. Args: itr (int): Iteration number. - paths (list[dict]): A list of collected paths. + episodes (EpisodeBatch): Batch of episodes. Returns: numpy.float64: Average return. 
""" undiscounted_returns = log_performance(itr, - EpisodeBatch.from_list( - self._env_spec, paths), + episodes, discount=self._discount) - samples_data = self._paths_to_tensors(paths) + # Calculate baseline predictions + baselines = [] + start = 0 + for length in episodes.lengths: + stop = start + length + baseline = self._baseline.predict( + dict(observations=episodes.observations[start:stop], + tasks=episodes.env_infos['task_onehot'][start:stop], + latents=episodes.agent_infos['latent'][start:stop])) + baselines.append(baseline) + start = stop + baselines = episodes.pad_to_last(np.concatenate(baselines)) - samples_data['average_return'] = np.mean(undiscounted_returns) + # Process trajectories + embed_eps, embed_ep_infos = self._process_episodes(episodes) + + average_return = np.mean(undiscounted_returns) logger.log('Optimizing policy...') - self._optimize_policy(itr, samples_data) + self._optimize_policy(itr, episodes, baselines, embed_eps, + embed_ep_infos) - return samples_data['average_return'] + return average_return - def _optimize_policy(self, itr, samples_data): + def _optimize_policy(self, itr, episodes, baselines, embed_eps, + embed_ep_infos): """Optimize policy. Args: itr (int): Iteration number. - samples_data (dict): Processed sample data. - See process_samples() for details. + episodes (EpisodeBatch): Batch of episodes. + baselines (np.ndarray): Baseline predictions. + embed_eps (np.ndarray): Embedding episodes. + embed_ep_infos (dict): Embedding distribution information. """ del itr - policy_opt_input_values = self._policy_opt_input_values(samples_data) + policy_opt_input_values = self._policy_opt_input_values( + episodes, baselines, embed_eps) inference_opt_input_values = self._inference_opt_input_values( - samples_data) + episodes, embed_eps, embed_ep_infos) self._train_policy_and_encoder_networks(policy_opt_input_values) self._train_inference_network(inference_opt_input_values) - paths = samples_data['paths'] - self._evaluate(policy_opt_input_values, samples_data) + # paths = samples_data['paths'] + fit_paths = self._evaluate(policy_opt_input_values, episodes, + baselines, embed_ep_infos) self._visualize_distribution() logger.log('Fitting baseline...') - self._baseline.fit(paths) + self._baseline.fit(fit_paths) self._old_policy.parameters = self.policy.parameters self._old_policy.encoder.model.parameters = ( self.policy.encoder.model.parameters) self._old_inference.model.parameters = self._inference.model.parameters - def _paths_to_tensors(self, paths): + def _process_episodes(self, episodes): # pylint: disable=too-many-statements """Return processed sample data based on the collected paths. Args: - paths (list[dict]): A list of collected paths. + episodes (EpisodeBatch): Batch of episodes. Returns: - dict: Processed sample data, with key - * observations: (numpy.ndarray) - * tasks: (numpy.ndarray) - * actions: (numpy.ndarray) - * trjectories: (numpy.ndarray) - * rewards: (numpy.ndarray) - * baselines: (numpy.ndarray) - * returns: (numpy.ndarray) - * valids: (numpy.ndarray) - * agent_infos: (dict) - * letent_infos: (dict) - * env_infos: (dict) - * trjectory_infos: (dict) - * paths: (list[dict]) + np.ndarray: Embedding episodes. + dict: Embedding distribution information. + * mean (list[numpy.ndarray]): Means of the distribution. + * log_std (list[numpy.ndarray]): Log standard deviations of the + distribution. """ max_episode_length = self.max_episode_length - def _extract_latent_infos(infos): - """Extract and pack latent infos from dict. 
- - Args: - infos (dict): A dict that contains latent infos with key - prefixed by 'latent_'. - - Returns: - dict: A dict of latent infos. - - """ - latent_infos = dict() - for k, v in infos.items(): - if k.startswith('latent_'): - latent_infos[k[7:]] = v - return latent_infos - - for path in paths: - path['actions'] = (self._env_spec.action_space.flatten_n( - path['actions'])) - path['tasks'] = self.policy.task_space.flatten_n( - path['env_infos']['task_onehot']) - path['latents'] = path['agent_infos']['latent'] - path['latent_infos'] = _extract_latent_infos(path['agent_infos']) + trajectories = [] + trajectory_infos = [] + for obs in episodes.padded_observations: # - Calculate a forward-looking sliding window. # - If step_space has shape (n, d), then trajs will have shape # (n, window, d) @@ -331,45 +320,21 @@ def _extract_latent_infos(infos): # - Only observation is used for a single step. # Alternatively, stacked [observation, action] can be used for # in harder tasks. - obs = pad_tensor(path['observations'], max_episode_length) obs_flat = self._env_spec.observation_space.flatten_n(obs) steps = obs_flat window = self._inference.spec.input_space.shape[0] traj = sliding_window(steps, window, smear=True) traj_flat = self._inference.spec.input_space.flatten_n(traj) - path['trajectories'] = traj_flat + trajectories.append(traj_flat) _, traj_info = self._inference.get_latents(traj_flat) - path['trajectory_infos'] = traj_info - - all_path_baselines = [self._baseline.predict(path) for path in paths] - - tasks = [path['tasks'] for path in paths] - tasks = pad_tensor_n(tasks, max_episode_length) - - trajectories = np.stack([path['trajectories'] for path in paths]) - - latents = [path['latents'] for path in paths] - latents = pad_tensor_n(latents, max_episode_length) + trajectory_infos.append(traj_info) - latent_infos = [path['latent_infos'] for path in paths] - latent_infos = stack_tensor_dict_list( - [pad_tensor_dict(p, max_episode_length) for p in latent_infos]) - - trajectory_infos = [path['trajectory_infos'] for path in paths] + trajectories = np.stack(trajectories) trajectory_infos = stack_tensor_dict_list( [pad_tensor_dict(p, max_episode_length) for p in trajectory_infos]) - samples_data = paths_to_tensors(paths, max_episode_length, - all_path_baselines, self._discount, - self._gae_lambda) - samples_data['tasks'] = tasks - samples_data['latents'] = latents - samples_data['latent_infos'] = latent_infos - samples_data['trajectories'] = trajectories - samples_data['trajectory_infos'] = trajectory_infos - - return samples_data + return trajectories, trajectory_infos def _build_inputs(self): """Build input variables. @@ -741,129 +706,156 @@ def _build_inference_loss(self, i): return infer_loss, infer_kl - def _policy_opt_input_values(self, samples_data): + def _policy_opt_input_values(self, episodes, baselines, embed_eps): """Map episode samples to the policy optimizer inputs. Args: - samples_data (dict): Processed sample data. - See process_samples() for details. + episodes (EpisodeBatch): Batch of episodes. + baselines (np.ndarray): Baseline predictions. + embed_eps (np.ndarray): Embedding episodes. Returns: list(np.ndarray): Flatten policy optimization input values. 
""" + actions = [ + self._env_spec.action_space.flatten_n(act) + for act in episodes.actions_list + ] + actions = episodes.pad_to_last(np.concatenate(actions)) + tasks = episodes.pad_to_last(episodes.env_infos['task_onehot']) + latents = episodes.pad_to_last(episodes.agent_infos['latent']) + + agent_infos = episodes.padded_agent_infos policy_state_info_list = [ - samples_data['agent_infos'][k] for k in self.policy.state_info_keys + agent_infos[k] for k in self.policy.state_info_keys ] embed_state_info_list = [ - samples_data['latent_infos'][k] + agent_infos['latent_' + k] for k in self.policy.encoder.state_info_keys ] # pylint: disable=unexpected-keyword-arg policy_opt_input_values = self._policy_opt_inputs._replace( - obs_var=samples_data['observations'], - action_var=samples_data['actions'], - reward_var=samples_data['rewards'], - baseline_var=samples_data['baselines'], - trajectory_var=samples_data['trajectories'], - task_var=samples_data['tasks'], - latent_var=samples_data['latents'], - valid_var=samples_data['valids'], + obs_var=episodes.padded_observations, + action_var=actions, + reward_var=episodes.padded_rewards, + baseline_var=baselines, + trajectory_var=embed_eps, + task_var=tasks, + latent_var=latents, + valid_var=episodes.valids, policy_state_info_vars_list=policy_state_info_list, embed_state_info_vars_list=embed_state_info_list, ) return flatten_inputs(policy_opt_input_values) - def _inference_opt_input_values(self, samples_data): + def _inference_opt_input_values(self, episodes, embed_eps, embed_ep_infos): """Map episode samples to the inference optimizer inputs. Args: - samples_data (dict): Processed sample data. - See process_samples() for details. + episodes (EpisodeBatch): Batch of episodes. + embed_eps (np.ndarray): Embedding episodes. + embed_ep_infos (dict): Embedding distribution information. Returns: list(np.ndarray): Flatten inference optimization input values. """ + latents = episodes.pad_to_last(episodes.agent_infos['latent']) + infer_state_info_list = [ - samples_data['trajectory_infos'][k] - for k in self._inference.state_info_keys + embed_ep_infos[k] for k in self._inference.state_info_keys ] # pylint: disable=unexpected-keyword-arg inference_opt_input_values = self._inference_opt_inputs._replace( - latent_var=samples_data['latents'], - trajectory_var=samples_data['trajectories'], - valid_var=samples_data['valids'], + latent_var=latents, + trajectory_var=embed_eps, + valid_var=episodes.valids, infer_state_info_vars_list=infer_state_info_list, ) return flatten_inputs(inference_opt_input_values) - def _evaluate(self, policy_opt_input_values, samples_data): + def _evaluate(self, policy_opt_input_values, episodes, baselines, + embed_ep_infos): """Evaluate rewards and everything else. Args: policy_opt_input_values (list[np.ndarray]): Flattened policy optimization input values. - samples_data (dict): Processed sample data. - See process_samples() for details. + episodes (EpisodeBatch): Batch of episodes. + baselines (np.ndarray): Baseline predictions. + embed_ep_infos (dict): Embedding distribution information. Returns: - dict: Processed sample data. + dict: Paths for fitting the baseline. 
""" # pylint: disable=too-many-statements + fit_paths = [] + valids = episodes.valids + observations = episodes.padded_observations + tasks = episodes.pad_to_last(episodes.env_infos['task_onehot']) + latents = episodes.pad_to_last(episodes.agent_infos['latent']) + baselines_list = [] + for baseline, valid in zip(baselines, valids): + baselines_list.append(baseline[valid.astype(np.bool)]) + # Augment reward from baselines rewards_tensor = self._f_rewards(*policy_opt_input_values) returns_tensor = self._f_returns(*policy_opt_input_values) returns_tensor = np.squeeze(returns_tensor, -1) - paths = samples_data['paths'] - valids = samples_data['valids'] - baselines = [path['baselines'] for path in paths] - env_rewards = [path['rewards'] for path in paths] - env_rewards = concat_tensor_list(env_rewards.copy()) - env_returns = [path['returns'] for path in paths] - env_returns = concat_tensor_list(env_returns.copy()) - env_average_discounted_return = (np.mean( - [path['returns'][0] for path in paths])) - - # Recompute parts of samples_data + env_rewards = episodes.rewards + env_returns = [ + discount_cumsum(rwd, self._discount) + for rwd in episodes.padded_rewards + ] + env_average_discounted_return = np.mean( + [ret[0] for ret in env_returns]) + + # Recompute returns and prepare paths for fitting the baseline aug_rewards = [] aug_returns = [] - for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids, - paths): - path['rewards'] = rew[val.astype(np.bool)] - path['returns'] = ret[val.astype(np.bool)] - aug_rewards.append(path['rewards']) - aug_returns.append(path['returns']) + for rew, ret, val, task, latent, obs in zip(rewards_tensor, + returns_tensor, valids, + tasks, latents, + observations): + returns = ret[val.astype(np.bool)] + task = task[val.astype(np.bool)] + latent = latent[val.astype(np.bool)] + obs = obs[val.astype(np.bool)] + + aug_rewards.append(rew[val.astype(np.bool)]) + aug_returns.append(returns) + fit_paths.append( + dict(observations=obs, + tasks=task, + latents=latent, + returns=returns)) aug_rewards = concat_tensor_list(aug_rewards) aug_returns = concat_tensor_list(aug_returns) - samples_data['rewards'] = aug_rewards - samples_data['returns'] = aug_returns # Calculate effect of the entropy terms d_rewards = np.mean(aug_rewards - env_rewards) tabular.record('{}/EntRewards'.format(self.policy.name), d_rewards) aug_average_discounted_return = (np.mean( - [path['returns'][0] for path in paths])) + [ret[0] for ret in returns_tensor])) d_returns = np.mean(aug_average_discounted_return - env_average_discounted_return) tabular.record('{}/EntReturns'.format(self.policy.name), d_returns) # Calculate explained variance - ev = explained_variance_1d(np.concatenate(baselines), aug_returns) + ev = explained_variance_1d(np.concatenate(baselines_list), aug_returns) tabular.record('{}/ExplainedVariance'.format(self._baseline.name), ev) - inference_rmse = (samples_data['trajectory_infos']['mean'] - - samples_data['latents'])**2. + inference_rmse = (embed_ep_infos['mean'] - latents)**2. 
inference_rmse = np.sqrt(inference_rmse.mean()) tabular.record('Inference/RMSE', inference_rmse) - inference_rrse = rrse(samples_data['latents'], - samples_data['trajectory_infos']['mean']) + inference_rrse = rrse(latents, embed_ep_infos['mean']) tabular.record('Inference/RRSE', inference_rrse) embed_ent = self._f_encoder_entropy(*policy_opt_input_values) @@ -874,13 +866,13 @@ def _evaluate(self, policy_opt_input_values, samples_data): tabular.record('Inference/CrossEntropy', infer_ce) pol_ent = self._f_policy_entropy(*policy_opt_input_values) - pol_ent = np.sum(pol_ent) / np.sum(samples_data['valids']) + pol_ent = np.sum(pol_ent) / np.sum(episodes.lengths) tabular.record('{}/Entropy'.format(self.policy.name), pol_ent) task_ents = self._f_task_entropies(*policy_opt_input_values) - tasks = samples_data['tasks'][:, 0, :] + tasks = tasks[:, 0, :] _, task_indices = np.nonzero(tasks) - path_lengths = np.sum(samples_data['valids'], axis=1) + path_lengths = np.sum(valids, axis=1) for t in range(self.policy.task_space.flat_dim): lengths = path_lengths[task_indices == t] completed = lengths < self.max_episode_length @@ -891,7 +883,7 @@ def _evaluate(self, policy_opt_input_values, samples_data): pct_completed) tabular.record('Tasks/Entropy/t={}'.format(t), task_ents[t]) - return samples_data + return fit_paths def _visualize_distribution(self): """Visualize encoder distribution.""" From 1593a943153c6fbbd2bfb8ffd3b15b24617120d7 Mon Sep 17 00:00:00 2001 From: "K.R. Zentner" <41180126+krzentner@users.noreply.github.com> Date: Wed, 21 Oct 2020 21:32:36 -0700 Subject: [PATCH 16/23] Fix flake of large test run under Travis (#2145) --- Makefile | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5a90f38c8a..35fef5157e 100644 --- a/Makefile +++ b/Makefile @@ -57,9 +57,20 @@ ci-job-normal: assert-docker exit 1; \ done +# Need to be able to access $!, a special bash variable +define LARGE_TEST + pytest --cov=garage --cov-report=xml --reruns 1 -m 'large and not flaky' --durations=20 & + PYTEST_PID=$$! + while ps -p $$PYTEST_PID > /dev/null ; do + echo 'Still running' + sleep 60 + done +endef +export LARGE_TEST + ci-job-large: assert-docker [ ! -f $(MJKEY_PATH) ] || mv $(MJKEY_PATH) $(MJKEY_PATH).bak - pytest --cov=garage --cov-report=xml --reruns 1 -m 'large and not flaky' --durations=20 + bash -c "$$LARGE_TEST" for i in {1..5}; do \ bash <(curl -s https://codecov.io/bash --retry 5) -Z && break \ || echo 'Retrying...' 
&& sleep 30 && continue; \ From e32cd06f0194c5906aec0b7312c3894e44708783 Mon Sep 17 00:00:00 2001 From: "Nicole (Shin Ying) Ng" Date: Thu, 22 Oct 2020 01:27:22 -0700 Subject: [PATCH 17/23] Check in GymEnv for env_info consistency (#2083) * Add GymEnv check for env_info consistency * Update GymEnv reset() for env_info * Fix pre-commit * Fix pylin issues * Add test for env_info inconsistency * Fix pre-commit * Fix pre-commit * Fix pre-commit * Move check below timelimit check --- src/garage/envs/gym_env.py | 11 +++++++++++ tests/garage/envs/test_gym_env.py | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/src/garage/envs/gym_env.py b/src/garage/envs/gym_env.py index 472c196268..321fe0ecaa 100644 --- a/src/garage/envs/gym_env.py +++ b/src/garage/envs/gym_env.py @@ -155,6 +155,9 @@ def __init__(self, env, is_image=False, max_episode_length=None): self._spec = EnvSpec(action_space=self.action_space, observation_space=self.observation_space, max_episode_length=self._max_episode_length) + # stores env_info keys & value types to ensure subsequent env_infos + # are consistent + self._env_info = None @property def action_space(self): @@ -191,6 +194,7 @@ def reset(self): """ first_obs = self._env.reset() self._step_cnt = 0 + self._env_info = None return first_obs, dict() @@ -206,6 +210,8 @@ def step(self, action): Raises: RuntimeError: if `step()` is called after the environment has been constructed and `reset()` has not been called. + RuntimeError: if underlying environment outputs inconsistent + env_info keys. """ if self._step_cnt is None: @@ -243,6 +249,11 @@ def step(self, action): if step_type in (StepType.TERMINAL, StepType.TIMEOUT): self._step_cnt = None + # check that env_infos are consistent + if not self._env_info: + self._env_info = {k: type(info[k]) for k in info} + elif self._env_info.keys() != info.keys(): + raise RuntimeError('GymEnv outputs inconsistent env_info keys.') if not self.spec.observation_space.contains(observation): # Discrete actions can be either in the space normally, or one-hot # encoded. 
diff --git a/tests/garage/envs/test_gym_env.py b/tests/garage/envs/test_gym_env.py index 089bbb840e..1831d0eb5f 100644 --- a/tests/garage/envs/test_gym_env.py +++ b/tests/garage/envs/test_gym_env.py @@ -142,6 +142,17 @@ def test_done_resets_step_cnt(): assert env._step_cnt is None +def test_inconsistent_env_infos(): + env = GymEnv('MountainCar-v0') + env.reset() + env._env_info = {'k1': 'v1', 'k2': 'v2'} + with pytest.raises(RuntimeError, + match='GymEnv outputs inconsistent env_info keys.'): + env.step(env.action_space.sample()) + # check that order of keys don't matter for equality + assert env._env_info.keys() == {'k2': 'v2', 'k1': 'v1'}.keys() + + def test_is_pickleable(): env = GymEnv('MountainCar-v0', max_episode_length=50) h = pickle.dumps(env) From 9147627a81c979f421d88d8260a1794e425d361c Mon Sep 17 00:00:00 2001 From: Iris Liu Date: Thu, 22 Oct 2020 12:14:33 -0700 Subject: [PATCH 18/23] TD3 implementation in pytorch (#1890) * TD3 Torch (examples, benchmark, test) * Change to Trainer * Update examples --- .../src/garage_benchmarks/benchmark_algos.py | 6 +- .../src/garage_benchmarks/benchmark_auto.py | 3 +- .../experiments/algos/__init__.py | 6 +- .../experiments/algos/td3_garage_pytorch.py | 112 +++++ .../experiments/algos/td3_garage_tf.py | 12 +- examples/torch/mtsac_metaworld_mt10.py | 4 +- examples/torch/mtsac_metaworld_mt50.py | 4 +- examples/torch/td3_halfcheetah.py | 87 ++++ examples/torch/td3_pendulum.py | 89 ++++ setup.cfg | 2 +- src/garage/np/policies/__init__.py | 7 +- .../np/policies/uniform_random_policy.py | 61 +++ src/garage/tf/algos/ddpg.py | 4 +- src/garage/tf/algos/npo.py | 8 +- src/garage/tf/algos/rl2.py | 8 +- src/garage/tf/algos/td3.py | 7 +- src/garage/torch/__init__.py | 27 +- src/garage/torch/_functions.py | 32 +- src/garage/torch/algos/__init__.py | 3 +- src/garage/torch/algos/td3.py | 399 ++++++++++++++++++ .../policies/deterministic_mlp_policy.py | 5 +- .../np/policies/test_uniform_random_policy.py | 13 + tests/garage/test_dtypes.py | 6 +- tests/garage/torch/algos/test_td3.py | 108 +++++ tests/integration_tests/test_examples.py | 2 + 25 files changed, 946 insertions(+), 69 deletions(-) create mode 100644 benchmarks/src/garage_benchmarks/experiments/algos/td3_garage_pytorch.py create mode 100644 examples/torch/td3_halfcheetah.py create mode 100644 examples/torch/td3_pendulum.py create mode 100644 src/garage/np/policies/uniform_random_policy.py create mode 100644 src/garage/torch/algos/td3.py create mode 100644 tests/garage/np/policies/test_uniform_random_policy.py create mode 100644 tests/garage/torch/algos/test_td3.py diff --git a/benchmarks/src/garage_benchmarks/benchmark_algos.py b/benchmarks/src/garage_benchmarks/benchmark_algos.py index d2f4f57add..3f5b7837ff 100644 --- a/benchmarks/src/garage_benchmarks/benchmark_algos.py +++ b/benchmarks/src/garage_benchmarks/benchmark_algos.py @@ -1,9 +1,9 @@ """Benchmarking for algorithms.""" # yapf: disable -from garage_benchmarks.experiments.algos import (ddpg_garage_tf, - her_garage_tf, +from garage_benchmarks.experiments.algos import (ddpg_garage_tf, her_garage_tf, ppo_garage_pytorch, ppo_garage_tf, + td3_garage_pytorch, td3_garage_tf, trpo_garage_pytorch, trpo_garage_tf, @@ -40,7 +40,7 @@ def td3_benchmarks(): td3_env_ids = [ env_id for env_id in MuJoCo1M_ENV_SET if env_id != 'Reacher-v2' ] - + iterate_experiments(td3_garage_pytorch, td3_env_ids) iterate_experiments(td3_garage_tf, td3_env_ids) diff --git a/benchmarks/src/garage_benchmarks/benchmark_auto.py b/benchmarks/src/garage_benchmarks/benchmark_auto.py 
index 577f2ae911..5753dd9940 100644 --- a/benchmarks/src/garage_benchmarks/benchmark_auto.py +++ b/benchmarks/src/garage_benchmarks/benchmark_auto.py @@ -2,8 +2,7 @@ # yapf: disable from garage_benchmarks.experiments.algos import (ddpg_garage_tf, ppo_garage_pytorch, - ppo_garage_tf, - td3_garage_tf, + ppo_garage_tf, td3_garage_tf, trpo_garage_pytorch, trpo_garage_tf, vpg_garage_pytorch, diff --git a/benchmarks/src/garage_benchmarks/experiments/algos/__init__.py b/benchmarks/src/garage_benchmarks/experiments/algos/__init__.py index 64ac239498..5f91892581 100644 --- a/benchmarks/src/garage_benchmarks/experiments/algos/__init__.py +++ b/benchmarks/src/garage_benchmarks/experiments/algos/__init__.py @@ -4,6 +4,8 @@ from garage_benchmarks.experiments.algos.ppo_garage_pytorch import ( ppo_garage_pytorch) from garage_benchmarks.experiments.algos.ppo_garage_tf import ppo_garage_tf +from garage_benchmarks.experiments.algos.td3_garage_pytorch import ( + td3_garage_pytorch) from garage_benchmarks.experiments.algos.td3_garage_tf import td3_garage_tf from garage_benchmarks.experiments.algos.trpo_garage_pytorch import ( trpo_garage_pytorch) @@ -14,6 +16,6 @@ __all__ = [ 'ddpg_garage_tf', 'her_garage_tf', 'ppo_garage_pytorch', 'ppo_garage_tf', - 'td3_garage_tf', 'trpo_garage_pytorch', 'trpo_garage_tf', - 'vpg_garage_pytorch', 'vpg_garage_tf' + 'td3_garage_pytorch', 'td3_garage_tf', 'trpo_garage_pytorch', + 'trpo_garage_tf', 'vpg_garage_pytorch', 'vpg_garage_tf' ] diff --git a/benchmarks/src/garage_benchmarks/experiments/algos/td3_garage_pytorch.py b/benchmarks/src/garage_benchmarks/experiments/algos/td3_garage_pytorch.py new file mode 100644 index 0000000000..9227f5c117 --- /dev/null +++ b/benchmarks/src/garage_benchmarks/experiments/algos/td3_garage_pytorch.py @@ -0,0 +1,112 @@ +"""A regression test for automatic benchmarking garage-Pytorch-TD3.""" +import torch +from torch.nn import functional as F + +from garage import wrap_experiment +from garage.envs import GymEnv, normalize +from garage.experiment import deterministic +from garage.np.exploration_policies import AddGaussianNoise +from garage.np.policies import UniformRandomPolicy +from garage.replay_buffer import PathBuffer +from garage.torch import prefer_gpu +from garage.torch.algos import TD3 +from garage.torch.policies import DeterministicMLPPolicy +from garage.torch.q_functions import ContinuousMLPQFunction +from garage.trainer import TFTrainer + +hyper_parameters = { + 'policy_lr': 1e-3, + 'qf_lr': 1e-3, + 'policy_hidden_sizes': [256, 256], + 'qf_hidden_sizes': [256, 256], + 'n_epochs': 250, + 'steps_per_epoch': 40, + 'batch_size': 100, + 'start_steps': 1000, + 'update_after': 1000, + 'grad_steps_per_env_step': 50, + 'discount': 0.99, + 'target_update_tau': 0.005, + 'replay_buffer_size': int(1e6), + 'sigma': 0.1, + 'policy_noise': 0.2, + 'policy_noise_clip': 0.5, + 'buffer_batch_size': 100, + 'min_buffer_size': int(1e4), +} + + +@wrap_experiment(snapshot_mode='last') +def td3_garage_pytorch(ctxt, env_id, seed): + """Create garage TensorFlow TD3 model and training. + + Args: + ctxt (garage.experiment.ExperimentContext): The experiment + configuration used by Localtrainer to create the + snapshotter. + env_id (str): Environment id of the task. + seed (int): Random positive integer for the trial. 
+ + """ + deterministic.set_seed(seed) + + with TFTrainer(ctxt) as trainer: + num_timesteps = hyper_parameters['n_epochs'] * hyper_parameters[ + 'steps_per_epoch'] * hyper_parameters['batch_size'] + env = normalize(GymEnv(env_id)) + + policy = DeterministicMLPPolicy( + env_spec=env.spec, + hidden_sizes=hyper_parameters['policy_hidden_sizes'], + hidden_nonlinearity=F.relu, + output_nonlinearity=torch.tanh) + + exploration_policy = AddGaussianNoise( + env.spec, + policy, + total_timesteps=num_timesteps, + max_sigma=hyper_parameters['sigma'], + min_sigma=hyper_parameters['sigma']) + + uniform_random_policy = UniformRandomPolicy(env.spec) + + qf1 = ContinuousMLPQFunction( + env_spec=env.spec, + hidden_sizes=hyper_parameters['qf_hidden_sizes'], + hidden_nonlinearity=F.relu) + + qf2 = ContinuousMLPQFunction( + env_spec=env.spec, + hidden_sizes=hyper_parameters['qf_hidden_sizes'], + hidden_nonlinearity=F.relu) + + replay_buffer = PathBuffer( + capacity_in_transitions=hyper_parameters['replay_buffer_size']) + + td3 = TD3(env_spec=env.spec, + policy=policy, + qf1=qf1, + qf2=qf2, + exploration_policy=exploration_policy, + uniform_random_policy=uniform_random_policy, + replay_buffer=replay_buffer, + steps_per_epoch=hyper_parameters['steps_per_epoch'], + policy_lr=hyper_parameters['policy_lr'], + qf_lr=hyper_parameters['qf_lr'], + target_update_tau=hyper_parameters['target_update_tau'], + discount=hyper_parameters['discount'], + grad_steps_per_env_step=hyper_parameters[ + 'grad_steps_per_env_step'], + start_steps=hyper_parameters['start_steps'], + min_buffer_size=hyper_parameters['min_buffer_size'], + buffer_batch_size=hyper_parameters['buffer_batch_size'], + policy_optimizer=torch.optim.Adam, + qf_optimizer=torch.optim.Adam, + policy_noise_clip=hyper_parameters['policy_noise_clip'], + policy_noise=hyper_parameters['policy_noise']) + + prefer_gpu() + td3.to() + trainer.setup(td3, env) + trainer.train(n_epochs=hyper_parameters['n_epochs'], + batch_size=hyper_parameters['batch_size']) diff --git a/benchmarks/src/garage_benchmarks/experiments/algos/td3_garage_tf.py b/benchmarks/src/garage_benchmarks/experiments/algos/td3_garage_tf.py index 26565cd189..6bca74d2b3 100644 --- a/benchmarks/src/garage_benchmarks/experiments/algos/td3_garage_tf.py +++ b/benchmarks/src/garage_benchmarks/experiments/algos/td3_garage_tf.py @@ -14,12 +14,12 @@ hyper_parameters = { 'policy_lr': 1e-3, 'qf_lr': 1e-3, - 'policy_hidden_sizes': [400, 300], - 'qf_hidden_sizes': [400, 300], - 'n_epochs': 8, - 'steps_per_epoch': 20, - 'n_exploration_steps': 250, - 'n_train_steps': 1, + 'policy_hidden_sizes': [256, 256], + 'qf_hidden_sizes': [256, 256], + 'n_epochs': 250, + 'steps_per_epoch': 40, + 'n_exploration_steps': 100, + 'n_train_steps': 50, 'discount': 0.99, 'tau': 0.005, 'replay_buffer_size': int(1e6), diff --git a/examples/torch/mtsac_metaworld_mt10.py b/examples/torch/mtsac_metaworld_mt10.py index 76dc8ec888..57210bad53 100755 --- a/examples/torch/mtsac_metaworld_mt10.py +++ b/examples/torch/mtsac_metaworld_mt10.py @@ -43,8 +43,8 @@ def mtsac_metaworld_mt10(ctxt=None, *, seed, _gpu, n_tasks, timesteps): """ deterministic.set_seed(seed) trainer = Trainer(ctxt) - mt10 = metaworld.MT10() - mt10_test = metaworld.MT10() + mt10 = metaworld.MT10() # pylint: disable=no-member + mt10_test = metaworld.MT10() # pylint: disable=no-member # pylint: disable=missing-return-doc, missing-return-type-doc def wrap(env, _): diff --git a/examples/torch/mtsac_metaworld_mt50.py b/examples/torch/mtsac_metaworld_mt50.py index eb0febd8d1..193ccc64dd 
100755 --- a/examples/torch/mtsac_metaworld_mt50.py +++ b/examples/torch/mtsac_metaworld_mt50.py @@ -51,8 +51,8 @@ def mtsac_metaworld_mt50(ctxt=None, """ deterministic.set_seed(seed) trainer = Trainer(ctxt) - mt50 = metaworld.MT50() - mt50_test = metaworld.MT50() + mt50 = metaworld.MT50() # pylint: disable=no-member + mt50_test = metaworld.MT50() # pylint: disable=no-member train_task_sampler = MetaWorldTaskSampler( mt50, 'train', diff --git a/examples/torch/td3_halfcheetah.py b/examples/torch/td3_halfcheetah.py new file mode 100644 index 0000000000..19dc5fb7a3 --- /dev/null +++ b/examples/torch/td3_halfcheetah.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +"""An example to train TD3 algorithm on InvertedDoublePendulum PyTorch.""" +import torch +from torch.nn import functional as F + +# from garage.np.exploration_policies import AddGaussianNoise +from garage import wrap_experiment +from garage.envs import GymEnv, normalize +from garage.experiment.deterministic import set_seed +from garage.np.exploration_policies import AddGaussianNoise +from garage.np.policies import UniformRandomPolicy +from garage.replay_buffer import PathBuffer +from garage.torch.algos import TD3 +from garage.torch.policies import DeterministicMLPPolicy +from garage.torch.q_functions import ContinuousMLPQFunction +from garage.trainer import Trainer + + +@wrap_experiment(snapshot_mode='none') +def td3_half_cheetah(ctxt=None, seed=1): + """Train TD3 with InvertedDoublePendulum-v2 environment. + + Args: + ctxt (garage.experiment.ExperimentContext): The experiment + configuration used by LocalRunner to create the snapshotter. + seed (int): Used to seed the random number generator to produce + determinism. + """ + set_seed(seed) + + n_epochs = 500 + steps_per_epoch = 20 + sampler_batch_size = 250 + num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size + + trainer = Trainer(ctxt) + env = normalize(GymEnv('HalfCheetah-v2')) + + policy = DeterministicMLPPolicy(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu, + output_nonlinearity=torch.tanh) + + exploration_policy = AddGaussianNoise(env.spec, + policy, + total_timesteps=num_timesteps, + max_sigma=0.1, + min_sigma=0.1) + + uniform_random_policy = UniformRandomPolicy(env.spec) + + qf1 = ContinuousMLPQFunction(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu) + + qf2 = ContinuousMLPQFunction(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu) + + replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) + + td3 = TD3(env_spec=env.spec, + policy=policy, + qf1=qf1, + qf2=qf2, + replay_buffer=replay_buffer, + policy_optimizer=torch.optim.Adam, + qf_optimizer=torch.optim.Adam, + exploration_policy=exploration_policy, + uniform_random_policy=uniform_random_policy, + target_update_tau=0.005, + discount=0.99, + policy_noise_clip=0.5, + policy_noise=0.2, + policy_lr=1e-3, + qf_lr=1e-3, + steps_per_epoch=40, + start_steps=1000, + grad_steps_per_env_step=50, + min_buffer_size=1000, + buffer_batch_size=100) + + trainer.setup(algo=td3, env=env) + trainer.train(n_epochs=750, batch_size=100) + + +td3_half_cheetah(seed=0) diff --git a/examples/torch/td3_pendulum.py b/examples/torch/td3_pendulum.py new file mode 100644 index 0000000000..950b6798d7 --- /dev/null +++ b/examples/torch/td3_pendulum.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +"""An example to train TD3 algorithm on InvertedDoublePendulum PyTorch.""" +import torch +from torch.nn import functional as F + +from garage import wrap_experiment +from 
garage.envs import GymEnv, normalize +from garage.experiment.deterministic import set_seed +from garage.np.exploration_policies import AddGaussianNoise +from garage.np.policies import UniformRandomPolicy +from garage.replay_buffer import PathBuffer +from garage.torch import prefer_gpu +from garage.torch.algos import TD3 +from garage.torch.policies import DeterministicMLPPolicy +from garage.torch.q_functions import ContinuousMLPQFunction +from garage.trainer import Trainer + + +@wrap_experiment(snapshot_mode='none') +def td3_pendulum(ctxt=None, seed=1): + """Train TD3 with InvertedDoublePendulum-v2 environment. + + Args: + ctxt (garage.experiment.ExperimentContext): The experiment + configuration used by LocalRunner to create the snapshotter. + seed (int): Used to seed the random number generator to produce + determinism. + + """ + set_seed(seed) + n_epochs = 750 + steps_per_epoch = 40 + sampler_batch_size = 100 + num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size + + trainer = Trainer(ctxt) + env = normalize(GymEnv('InvertedDoublePendulum-v2')) + + policy = DeterministicMLPPolicy(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu, + output_nonlinearity=torch.tanh) + + exploration_policy = AddGaussianNoise(env.spec, + policy, + total_timesteps=num_timesteps, + max_sigma=0.1, + min_sigma=0.1) + + uniform_random_policy = UniformRandomPolicy(env.spec) + + qf1 = ContinuousMLPQFunction(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu) + + qf2 = ContinuousMLPQFunction(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu) + + replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) + + td3 = TD3(env_spec=env.spec, + policy=policy, + qf1=qf1, + qf2=qf2, + replay_buffer=replay_buffer, + policy_optimizer=torch.optim.Adam, + qf_optimizer=torch.optim.Adam, + exploration_policy=exploration_policy, + uniform_random_policy=uniform_random_policy, + target_update_tau=0.005, + discount=0.99, + policy_noise_clip=0.5, + policy_noise=0.2, + policy_lr=1e-3, + qf_lr=1e-3, + steps_per_epoch=steps_per_epoch, + start_steps=1000, + grad_steps_per_env_step=1, + min_buffer_size=int(1e4), + buffer_batch_size=100) + + prefer_gpu() + td3.to() + trainer.setup(algo=td3, env=env) + trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size) + + +td3_pendulum() diff --git a/setup.cfg b/setup.cfg index f13a70eac5..899b02abb0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -26,7 +26,7 @@ use_parentheses = True force_sort_within_sections = True force_alphabetical_sort_within_sections = True lexicographical = True -multi_line_output = 1 +multi_line_output = 0 sections=FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,TESTS,LOCALFOLDER known_first_party = garage known_tests = tests, garage_benchmarks diff --git a/src/garage/np/policies/__init__.py b/src/garage/np/policies/__init__.py index 22cfb88bde..242fd64372 100644 --- a/src/garage/np/policies/__init__.py +++ b/src/garage/np/policies/__init__.py @@ -3,9 +3,6 @@ from garage.np.policies.fixed_policy import FixedPolicy from garage.np.policies.policy import Policy from garage.np.policies.scripted_policy import ScriptedPolicy +from garage.np.policies.uniform_random_policy import UniformRandomPolicy -__all__ = [ - 'FixedPolicy', - 'Policy', - 'ScriptedPolicy', -] +__all__ = ['FixedPolicy', 'Policy', 'ScriptedPolicy', 'UniformRandomPolicy'] diff --git a/src/garage/np/policies/uniform_random_policy.py b/src/garage/np/policies/uniform_random_policy.py new file mode 100644 index 0000000000..f363387fc4 --- /dev/null 
+++ b/src/garage/np/policies/uniform_random_policy.py @@ -0,0 +1,61 @@ +"""Uniform random exploration strategy.""" +import gym + +from garage.np.policies.policy import Policy + + +class UniformRandomPolicy(Policy): + """Action taken is uniformly random. + + Args: + env_spec (EnvSpec): Environment spec to explore. + + """ + + def __init__( + self, + env_spec, + ): + assert isinstance(env_spec.action_space, gym.spaces.Box) + assert len(env_spec.action_space.shape) == 1 + self._env_spec = env_spec + self._action_space = env_spec.action_space + self._iteration = 0 + + def reset(self, do_resets=None): + """Reset the state of the exploration. + + Args: + do_resets (List[bool] or numpy.ndarray or None): Which + vectorization states to reset. + + """ + self._iteration += 1 + super().reset(do_resets) + + def get_action(self, observation): + """Get action from this policy for the input observation. + + Args: + observation(numpy.ndarray): Observation from the environment. + + Returns: + np.ndarray: Actions with noise. + List[dict]: Arbitrary policy state information (agent_info). + + """ + return self._env_spec.action_space.sample(), dict() + + def get_actions(self, observations): + """Get actions from this policy for the input observation. + + Args: + observations(list): Observations from the environment. + + Returns: + np.ndarray: Actions with noise. + List[dict]: Arbitrary policy state information (agent_info). + + """ + return [self._env_spec.action_space.sample() + for obs in observations], dict() diff --git a/src/garage/tf/algos/ddpg.py b/src/garage/tf/algos/ddpg.py index 5ab7aa8e3f..0da0ee42ba 100644 --- a/src/garage/tf/algos/ddpg.py +++ b/src/garage/tf/algos/ddpg.py @@ -4,9 +4,7 @@ import numpy as np import tensorflow as tf -from garage import (_Default, - log_performance, - make_optimizer, +from garage import (_Default, log_performance, make_optimizer, obtain_evaluation_episodes) from garage.np.algos import RLAlgorithm from garage.sampler import FragmentWorker, LocalSampler diff --git a/src/garage/tf/algos/npo.py b/src/garage/tf/algos/npo.py index a95d356f61..0812452fd4 100644 --- a/src/garage/tf/algos/npo.py +++ b/src/garage/tf/algos/npo.py @@ -11,12 +11,8 @@ from garage.np import explained_variance_1d from garage.np.algos import RLAlgorithm from garage.sampler import RaySampler -from garage.tf import (center_advs, - compile_function, - compute_advantages, - discounted_returns, - flatten_inputs, - graph_inputs, +from garage.tf import (center_advs, compile_function, compute_advantages, + discounted_returns, flatten_inputs, graph_inputs, positive_advs) from garage.tf.optimizers import LBFGSOptimizer diff --git a/src/garage/tf/algos/rl2.py b/src/garage/tf/algos/rl2.py index 14225abd1a..d67a93b584 100644 --- a/src/garage/tf/algos/rl2.py +++ b/src/garage/tf/algos/rl2.py @@ -10,12 +10,8 @@ from dowel import logger import numpy as np -from garage import (EnvSpec, - EnvStep, - EpisodeBatch, - log_multitask_performance, - StepType, - Wrapper) +from garage import (EnvSpec, EnvStep, EpisodeBatch, log_multitask_performance, + StepType, Wrapper) from garage.np import concat_tensor_dict_list, discount_cumsum from garage.np.algos import MetaRLAlgorithm from garage.sampler import DefaultWorker diff --git a/src/garage/tf/algos/td3.py b/src/garage/tf/algos/td3.py index 496e9c3037..95fef35f06 100644 --- a/src/garage/tf/algos/td3.py +++ b/src/garage/tf/algos/td3.py @@ -9,9 +9,7 @@ import numpy as np import tensorflow as tf -from garage import (_Default, - log_performance, - make_optimizer, +from garage import 
(_Default, log_performance, make_optimizer, obtain_evaluation_episodes) from garage.np.algos import RLAlgorithm from garage.sampler import FragmentWorker, LocalSampler @@ -140,11 +138,8 @@ def __init__( self._discount = discount self._reward_scale = reward_scale self.max_episode_length = env_spec.max_episode_length - self._max_episode_length_eval = env_spec.max_episode_length - if max_episode_length_eval is not None: self._max_episode_length_eval = max_episode_length_eval - self._eval_env = None self._env_spec = env_spec diff --git a/src/garage/torch/__init__.py b/src/garage/torch/__init__.py index 81f34499ca..9b47be1771 100644 --- a/src/garage/torch/__init__.py +++ b/src/garage/torch/__init__.py @@ -1,24 +1,19 @@ """PyTorch-backed modules and algorithms.""" # yapf: disable -from garage.torch._functions import (compute_advantages, - dict_np_to_torch, - filter_valids, - flatten_batch, - flatten_to_single_vector, - global_device, - NonLinearity, - np_to_torch, - pad_to_last, - product_of_gaussians, - set_gpu_mode, - torch_to_np, - TransposeImage, +from garage.torch._functions import (compute_advantages, dict_np_to_torch, + filter_valids, flatten_batch, + flatten_to_single_vector, global_device, + NonLinearity, np_to_torch, pad_to_last, + prefer_gpu, product_of_gaussians, + set_gpu_mode, soft_update_model, + torch_to_np, TransposeImage, update_module_params) # yapf: enable __all__ = [ 'compute_advantages', 'dict_np_to_torch', 'filter_valids', 'flatten_batch', - 'global_device', 'np_to_torch', 'pad_to_last', 'product_of_gaussians', - 'set_gpu_mode', 'torch_to_np', 'update_module_params', 'NonLinearity', - 'flatten_to_single_vector', 'TransposeImage' + 'global_device', 'np_to_torch', 'pad_to_last', 'prefer_gpu', + 'product_of_gaussians', 'set_gpu_mode', 'soft_update_model', 'torch_to_np', + 'update_module_params', 'NonLinearity', 'flatten_to_single_vector', + 'TransposeImage' ] diff --git a/src/garage/torch/_functions.py b/src/garage/torch/_functions.py index 0673744046..d1406a8c0c 100644 --- a/src/garage/torch/_functions.py +++ b/src/garage/torch/_functions.py @@ -176,7 +176,7 @@ def torch_to_np(tensors): `garage.torch._functions.to_numpy`. """ - value_out = tuple(v.numpy() for v in tensors) + value_out = tuple(v.cpu().numpy() for v in tensors) return value_out @@ -244,6 +244,28 @@ def update(m, name, param): update(module, name, new_param) +# pylint: disable=missing-param-doc, missing-type-doc +def soft_update_model(target_model, source_model, tau): + """Update model parameter of target and source model. + + # noqa: D417 + Args: + target_model + (garage.torch.Policy/garage.torch.QFunction): + Target model to update. + source_model + (garage.torch.Policy/QFunction): + Source network to update. + tau (float): Interpolation parameter for doing the + soft target update. + + """ + for target_param, param in zip(target_model.parameters(), + source_model.parameters()): + target_param.data.copy_(target_param.data * (1.0 - tau) + + param.data * tau) + + def set_gpu_mode(mode, gpu_id=0): """Set GPU mode and device ID. @@ -261,6 +283,14 @@ def set_gpu_mode(mode, gpu_id=0): _DEVICE = torch.device(('cuda:' + str(_GPU_ID)) if _USE_GPU else 'cpu') +def prefer_gpu(): + """Prefer to use GPU(s) if GPU(s) is detected.""" + if torch.cuda.is_available(): + set_gpu_mode(True) + else: + set_gpu_mode(False) + + def global_device(): """Returns the global device that torch.Tensors should be placed on. 
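The `_functions.py` hunk above adds two small helpers, `prefer_gpu()` and `soft_update_model()`, whose argument order is easy to get wrong when reading the TD3 code later in this patch. A minimal usage sketch (not part of the patch), with `nn.Linear` standing in for real garage Q-functions purely for brevity:

```python
# Illustrative sketch of the new garage.torch helpers added above.
import copy

import torch.nn as nn

from garage.torch import global_device, prefer_gpu, soft_update_model

qf = nn.Linear(4, 1)            # stand-in for an online critic network
target_qf = copy.deepcopy(qf)   # target network starts as an exact copy

prefer_gpu()                    # enables GPU mode only if CUDA is available
qf.to(global_device())
target_qf.to(global_device())

# Polyak-averaged target update: target <- (1 - tau) * target + tau * online.
# Note the argument order: the target model comes first, then the source.
soft_update_model(target_qf, qf, tau=0.005)
```

This is exactly the update the PyTorch TD3 below performs in `_update_network_parameters()` after each delayed actor step.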
diff --git a/src/garage/torch/algos/__init__.py b/src/garage/torch/algos/__init__.py index c0c95ecab2..b1c342441f 100644 --- a/src/garage/torch/algos/__init__.py +++ b/src/garage/torch/algos/__init__.py @@ -10,6 +10,7 @@ from garage.torch.algos.maml_vpg import MAMLVPG from garage.torch.algos.ppo import PPO from garage.torch.algos.maml_ppo import MAMLPPO +from garage.torch.algos.td3 import TD3 from garage.torch.algos.trpo import TRPO from garage.torch.algos.maml_trpo import MAMLTRPO # SAC needs to be imported before MTSAC @@ -18,6 +19,6 @@ from garage.torch.algos.pearl import PEARL __all__ = [ - 'BC', 'DDPG', 'DQN', 'VPG', 'PPO', 'TRPO', 'MAMLPPO', 'MAMLTRPO', + 'BC', 'DDPG', 'DQN', 'VPG', 'PPO', 'TD3', 'TRPO', 'MAMLPPO', 'MAMLTRPO', 'MAMLVPG', 'MTSAC', 'PEARL', 'SAC' ] diff --git a/src/garage/torch/algos/td3.py b/src/garage/torch/algos/td3.py new file mode 100644 index 0000000000..d1bdc7550e --- /dev/null +++ b/src/garage/torch/algos/td3.py @@ -0,0 +1,399 @@ +"""TD3 model in Pytorch.""" +import copy + +from dowel import logger, tabular +import numpy as np +import torch +import torch.nn.functional as F + +from garage import (_Default, log_performance, make_optimizer, + obtain_evaluation_episodes) +from garage.np.algos import RLAlgorithm +from garage.sampler import FragmentWorker, LocalSampler +from garage.torch import (dict_np_to_torch, global_device, soft_update_model, + torch_to_np) + + +class TD3(RLAlgorithm): + """Implementation of TD3. + + Based on https://arxiv.org/pdf/1802.09477.pdf. + + Args: + env_spec (EnvSpec): Environment specification. + policy (garage.torch.policies.Policy): Policy (actor network). + qf1 (garage.torch.q_functions.QFunction): Q function (critic network). + qf2 (garage.torch.q_functions.QFunction): Q function (critic network). + replay_buffer (ReplayBuffer): Replay buffer. + replay_buffer_size (int): Size of the replay buffer + exploration_policy (garage.np.exploration_policies.ExplorationPolicy): + Exploration strategy. + uniform_random_policy + (garage.np.exploration_policies.ExplorationPolicy): + Uniform random exploration strategy. + target_update_tau (float): Interpolation parameter for doing the + soft target update. + discount (float): Discount factor (gamma) for the cumulative return. + reward_scaling (float): Reward scaling. + update_actor_interval (int): Policy (Actor network) update interval. + max_action (float): Maximum action magnitude. + buffer_batch_size (int): Size of replay buffer. + min_buffer_size (int): The minimum buffer size for replay buffer. + policy_noise (float): Policy (actor) noise. + policy_noise_clip (float): Noise clip. + exploration_noise (float): Exploration noise. + clip_return (float): Clip return to be in [-clip_return, + clip_return]. + policy_lr (float): Learning rate for training policy network. + qf_lr (float): Learning rate for training Q network. + policy_optimizer (Union[type, tuple[type, dict]]): Type of optimizer + for training policy network. This can be an optimizer type such as + `torch.optim.Adam` or a tuple of type and dictionary, where + dictionary contains arguments to initialize the optimizer + e.g. `(torch.optim.Adam, {'lr' : 1e-3})`. + qf_optimizer (Union[type, tuple[type, dict]]): Type of optimizer + for training Q-value network. This can be an optimizer type such + as `torch.optim.Adam` or a tuple of type and dictionary, where + dictionary contains arguments to initialize the optimizer + e.g. `(torch.optim.Adam, {'lr' : 1e-3})`. + steps_per_epoch (int): Number of train_once calls per epoch. 
+ grad_steps_per_env_step (int): Number of gradient steps taken per + environment step sampled. + max_episode_length_eval (int or None): Maximum length of episodes used + for off-policy evaluation. If None, defaults to + `env_spec.max_episode_length`. + num_evaluation_episodes (int): The number of evaluation + trajectories used for computing eval stats at the end of every + epoch. + start_steps (int): The number of steps for warming up before + selecting actions according to policy. + update_after (int): The number of steps to perform before policy + is updated. + use_deterministic_evaluation (bool): True if the trained policy + should be evaluated deterministically. + + """ + + def __init__( + self, + env_spec, + policy, + qf1, + qf2, + replay_buffer, + *, # Everything after this is numbers. + max_episode_length_eval=None, + grad_steps_per_env_step, + exploration_policy, + uniform_random_policy=None, + max_action=None, + target_update_tau=0.005, + discount=0.99, + reward_scaling=1., + update_actor_interval=2, + buffer_batch_size=64, + replay_buffer_size=1e6, + min_buffer_size=1e4, + exploration_noise=0.1, + policy_noise=0.2, + policy_noise_clip=0.5, + clip_return=np.inf, + policy_lr=_Default(1e-4), + qf_lr=_Default(1e-3), + policy_optimizer=torch.optim.Adam, + qf_optimizer=torch.optim.Adam, + num_evaluation_episodes=10, + steps_per_epoch=20, + start_steps=10000, + update_after=1000, + use_deterministic_evaluation=False): + + self._env_spec = env_spec + action_bound = self._env_spec.action_space.high[0] + self._max_action = action_bound if max_action is None else max_action + self._action_dim = self._env_spec.action_space.shape[0] + self._tau = target_update_tau + self._discount = discount + self._reward_scaling = reward_scaling + self._exploration_noise = exploration_noise + self._policy_noise = policy_noise + self._policy_noise_clip = policy_noise_clip + self._clip_return = clip_return + self._replay_buffer_size = replay_buffer_size + self._min_buffer_size = min_buffer_size + self._buffer_batch_size = buffer_batch_size + self._grad_steps_per_env_step = grad_steps_per_env_step + self._update_actor_interval = update_actor_interval + self._steps_per_epoch = steps_per_epoch + self._start_steps = start_steps + self._update_after = update_after + self._num_evaluation_episodes = num_evaluation_episodes + self.max_episode_length = env_spec.max_episode_length + self._max_episode_length_eval = env_spec.max_episode_length + + if max_episode_length_eval is not None: + self._max_episode_length_eval = max_episode_length_eval + self._use_deterministic_evaluation = use_deterministic_evaluation + + self._episode_policy_losses = [] + self._episode_qf_losses = [] + self._epoch_ys = [] + self._epoch_qs = [] + self._eval_env = None + self.exploration_policy = exploration_policy + self._uniform_random_policy = uniform_random_policy + self.worker_cls = FragmentWorker + self.sampler_cls = LocalSampler + + self._replay_buffer = replay_buffer + self.policy = policy + self._qf_1 = qf1 + self._qf_2 = qf2 + self._target_policy = copy.deepcopy(self.policy) + self._target_qf_1 = copy.deepcopy(self._qf_1) + self._target_qf_2 = copy.deepcopy(self._qf_2) + + self._policy_optimizer = make_optimizer(policy_optimizer, + module=self.policy, + lr=policy_lr) + self._qf_optimizer_1 = make_optimizer(qf_optimizer, + module=self._qf_1, + lr=qf_lr) + self._qf_optimizer_2 = make_optimizer(qf_optimizer, + module=self._qf_2, + lr=qf_lr) + self._actor_loss = torch.zeros(1) + + def _get_action(self, action, noise_scale): + """Select 
action based on policy. + + Action can be added with noise. + + Args: + action (float): Action. + noise_scale (float): Noise scale added to action. + + Return: + float: Action selected by the policy. + """ + action += noise_scale * np.random.randn(self._action_dim) + # pylint: disable=invalid-unary-operand-type + return np.clip(action, -self._max_action, self._max_action) + + def train(self, trainer): + """Obtain samplers and start actual training for each epoch. + + Args: + trainer (Trainer): Experiment trainer, which provides services + such as snapshotting and sampler control. + + """ + if not self._eval_env: + self._eval_env = trainer.get_env_copy() + trainer.enable_logging = False + for _ in trainer.step_epochs(): + for cycle in range(self._steps_per_epoch): + # Obtain trasnsition batch and store it in replay buffer. + # Get action randomly from environment within warm-up steps. + # Afterwards, get action from policy. + if self._uniform_random_policy and \ + trainer.step_itr < self._start_steps: + trainer.step_path = trainer.obtain_episodes( + trainer.step_itr, + agent_update=self._uniform_random_policy) + else: + trainer.step_path = trainer.obtain_episodes( + trainer.step_itr, agent_update=self.exploration_policy) + self._replay_buffer.add_episode_batch(trainer.step_path) + + # Update after warm-up steps. + if trainer.total_env_steps >= self._update_after: + self._train_once(trainer.step_itr) + + # Evaluate and log the results. + if (cycle == 0 and self._replay_buffer.n_transitions_stored >= + self._min_buffer_size): + trainer.enable_logging = True + eval_eps = self._evaluate_policy() + log_performance(trainer.step_path, + eval_eps, + discount=self._discount, + prefix='Training') + log_performance(trainer.step_itr, + eval_eps, + discount=self._discount, + prefix='Evaluation') + trainer.step_itr += 1 + + def _train_once(self, itr): + """Perform one iteration of training. + + Args: + itr (int): Iteration number. + + """ + for grad_step_timer in range(self._grad_steps_per_env_step): + if (self._replay_buffer.n_transitions_stored >= + self._min_buffer_size): + # Sample from buffer + samples = self._replay_buffer.sample_transitions( + self._buffer_batch_size) + samples = dict_np_to_torch(samples) + + # Optimize + qf_loss, y, q, policy_loss = torch_to_np( + self._optimize_policy(samples, grad_step_timer)) + + self._episode_policy_losses.append(policy_loss) + self._episode_qf_losses.append(qf_loss) + self._epoch_ys.append(y) + self._epoch_qs.append(q) + + if itr % self._steps_per_epoch == 0: + logger.log('Training finished') + epoch = itr // self._steps_per_epoch + + if (self._replay_buffer.n_transitions_stored >= + self._min_buffer_size): + tabular.record('Epoch', epoch) + self._log_statistics() + + # pylint: disable=invalid-unary-operand-type + def _optimize_policy(self, samples_data, grad_step_timer): + """Perform algorithm optimization. + + Args: + samples_data (dict): Processed batch data. + grad_step_timer (int): Iteration number of the gradient time + taken in the env. + + Returns: + float: Loss predicted by the q networks + (critic networks). + float: Q value (min) predicted by one of the + target q networks. + float: Q value (min) predicted by one of the + current q networks. + float: Loss predicted by the policy + (action network). 
+ + """ + rewards = samples_data['rewards'].to(global_device()).reshape(-1, 1) + terminals = samples_data['terminals'].to(global_device()).reshape( + -1, 1) + actions = samples_data['actions'].to(global_device()) + observations = samples_data['observations'].to(global_device()) + next_observations = samples_data['next_observations'].to( + global_device()) + + next_inputs = next_observations + inputs = observations + with torch.no_grad(): + # Select action according to policy and add clipped noise + noise = (torch.randn_like(actions) * self._policy_noise).clamp( + -self._policy_noise_clip, self._policy_noise_clip) + next_actions = (self._target_policy(next_inputs) + noise).clamp( + -self._max_action, self._max_action) + + # Compute the target Q value + target_Q1 = self._target_qf_1(next_inputs, next_actions) + target_Q2 = self._target_qf_2(next_inputs, next_actions) + target_q = torch.min(target_Q1, target_Q2) + target_Q = rewards * self._reward_scaling + ( + 1. - terminals) * self._discount * target_q + + # Get current Q values + current_Q1 = self._qf_1(inputs, actions) + current_Q2 = self._qf_2(inputs, actions) + current_Q = torch.min(current_Q1, current_Q2) + + # Compute critic loss + critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss( + current_Q2, target_Q) + + # Optimize critic + self._qf_optimizer_1.zero_grad() + self._qf_optimizer_2.zero_grad() + critic_loss.backward() + self._qf_optimizer_1.step() + self._qf_optimizer_2.step() + + # Deplay policy updates + if grad_step_timer % self._update_actor_interval == 0: + # Compute actor loss + actions = self.policy(inputs) + self._actor_loss = -self._qf_1(inputs, actions).mean() + + # Optimize actor + self._policy_optimizer.zero_grad() + self._actor_loss.backward() + self._policy_optimizer.step() + + # update target networks + self._update_network_parameters() + + return (critic_loss.detach(), target_Q, current_Q.detach(), + self._actor_loss.detach()) + + def _evaluate_policy(self): + """Evaluate the performance of the policy via deterministic rollouts. + + Statistics such as (average) discounted return and success rate are + recorded. + + Returns: + TrajectoryBatch: Evaluation trajectories, representing the best + current performance of the algorithm. + + """ + return obtain_evaluation_episodes( + self.exploration_policy, + self._eval_env, + self._max_episode_length_eval, + num_eps=self._num_evaluation_episodes, + deterministic=self._use_deterministic_evaluation) + + def _update_network_parameters(self): + """Update parameters in actor network and critic networks.""" + soft_update_model(self._target_qf_1, self._qf_1, self._tau) + soft_update_model(self._target_qf_2, self._qf_2, self._tau) + soft_update_model(self._target_policy, self.policy, self._tau) + + def _log_statistics(self): + """Output training statistics to dowel such as losses and returns.""" + tabular.record('Policy/AveragePolicyLoss', + np.mean(self._episode_policy_losses)) + tabular.record('QFunction/AverageQFunctionLoss', + np.mean(self._episode_qf_losses)) + tabular.record('QFunction/AverageQ', np.mean(self._epoch_qs)) + tabular.record('QFunction/MaxQ', np.max(self._epoch_qs)) + tabular.record('QFunction/AverageAbsQ', + np.mean(np.abs(self._epoch_qs))) + tabular.record('QFunction/AverageY', np.mean(self._epoch_ys)) + tabular.record('QFunction/MaxY', np.max(self._epoch_ys)) + tabular.record('QFunction/AverageAbsY', + np.mean(np.abs(self._epoch_ys))) + + @property + def networks(self): + """Return all the networks within the model. 
+ + Returns: + list: A list of networks. + + """ + return [ + self.policy, self._qf_1, self._qf_2, self._target_policy, + self._target_qf_1, self._target_qf_2 + ] + + def to(self, device=None): + """Put all the networks within the model on device. + + Args: + device (str): ID of GPU or CPU. + + """ + device = device or global_device() + for net in self.networks: + net.to(device) diff --git a/src/garage/torch/policies/deterministic_mlp_policy.py b/src/garage/torch/policies/deterministic_mlp_policy.py index ea94ff1eb2..3200204e75 100644 --- a/src/garage/torch/policies/deterministic_mlp_policy.py +++ b/src/garage/torch/policies/deterministic_mlp_policy.py @@ -7,6 +7,7 @@ import numpy as np import torch +from garage.torch import global_device from garage.torch.modules import MLPModule from garage.torch.policies.policy import Policy @@ -101,5 +102,5 @@ def get_actions(self, observations): observations = self._env_spec.observation_space.unflatten_n( observations) with torch.no_grad(): - x = self(torch.Tensor(observations)) - return x.numpy(), dict() + x = self(torch.Tensor(observations).to(global_device())) + return x.cpu().numpy(), dict() diff --git a/tests/garage/np/policies/test_uniform_random_policy.py b/tests/garage/np/policies/test_uniform_random_policy.py new file mode 100644 index 0000000000..7ed3661bc6 --- /dev/null +++ b/tests/garage/np/policies/test_uniform_random_policy.py @@ -0,0 +1,13 @@ +import numpy as np +import pytest + +from garage.envs import GymEnv, normalize +from garage.np.policies import UniformRandomPolicy + + +@pytest.mark.mujoco +def test_get_actions(): + env = normalize(GymEnv('InvertedDoublePendulum-v2')) + policy = UniformRandomPolicy(env.spec) + assert policy.get_actions(np.array([0]).reshape(1, 1))[0] + assert policy.get_action(np.array([0]))[0] diff --git a/tests/garage/test_dtypes.py b/tests/garage/test_dtypes.py index 2e6d393ed7..d228323de3 100644 --- a/tests/garage/test_dtypes.py +++ b/tests/garage/test_dtypes.py @@ -4,11 +4,7 @@ import pytest # yapf: disable -from garage import (EnvSpec, - EnvStep, - EpisodeBatch, - StepType, - TimeStep, +from garage import (EnvSpec, EnvStep, EpisodeBatch, StepType, TimeStep, TimeStepBatch) # yapf: enable diff --git a/tests/garage/torch/algos/test_td3.py b/tests/garage/torch/algos/test_td3.py new file mode 100644 index 0000000000..7dba03f3fa --- /dev/null +++ b/tests/garage/torch/algos/test_td3.py @@ -0,0 +1,108 @@ +"""Test TD3 on InvertedDoublePendulum-v2.""" +import pickle + +import pytest +from torch.nn import functional as F + +from garage.envs import GymEnv, normalize +from garage.experiment import deterministic +from garage.np.exploration_policies import AddGaussianNoise +from garage.replay_buffer import PathBuffer +from garage.sampler import LocalSampler +from garage.torch import prefer_gpu +from garage.torch.algos import TD3 +from garage.torch.policies import DeterministicMLPPolicy +from garage.torch.q_functions import ContinuousMLPQFunction +from garage.trainer import Trainer + +from tests.fixtures import snapshot_config, TfGraphTestCase + + +class TestTD3(TfGraphTestCase): + """Test class for TD3.""" + + @pytest.mark.mujoco + def test_td3_inverted_double_pendulum(self): + deterministic.set_seed(0) + n_epochs = 10 + steps_per_epoch = 20 + sampler_batch_size = 100 + num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size + trainer = Trainer(snapshot_config=snapshot_config) + env = normalize( + GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)) + policy = DeterministicMLPPolicy(env_spec=env.spec, + 
hidden_sizes=[64, 64], + hidden_nonlinearity=F.relu, + output_nonlinearity=None) + exploration_policy = AddGaussianNoise(env.spec, + policy, + total_timesteps=num_timesteps, + max_sigma=0.1, + min_sigma=0.1) + qf1 = ContinuousMLPQFunction(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu) + qf2 = ContinuousMLPQFunction(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu) + replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) + td3 = TD3(env_spec=env.spec, + policy=policy, + qf1=qf1, + qf2=qf2, + replay_buffer=replay_buffer, + exploration_policy=exploration_policy, + steps_per_epoch=steps_per_epoch, + grad_steps_per_env_step=1, + num_evaluation_episodes=1, + discount=0.99) + + prefer_gpu() + td3.to() + trainer.setup(td3, env, sampler_cls=LocalSampler) + trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size) + + @pytest.mark.mujoco + def test_pickling(self): + """Test pickle and unpickle.""" + + deterministic.set_seed(0) + n_epochs = 10 + steps_per_epoch = 20 + sampler_batch_size = 100 + num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size + env = normalize( + GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)) + policy = DeterministicMLPPolicy(env_spec=env.spec, + hidden_sizes=[64, 64], + hidden_nonlinearity=F.relu, + output_nonlinearity=None) + exploration_policy = AddGaussianNoise(env.spec, + policy, + total_timesteps=num_timesteps, + max_sigma=0.1, + min_sigma=0.1) + qf1 = ContinuousMLPQFunction(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu) + qf2 = ContinuousMLPQFunction(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu) + replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) + td3 = TD3(env_spec=env.spec, + policy=policy, + qf1=qf1, + qf2=qf2, + replay_buffer=replay_buffer, + exploration_policy=exploration_policy, + steps_per_epoch=steps_per_epoch, + grad_steps_per_env_step=1, + num_evaluation_episodes=1, + discount=0.99) + prefer_gpu() + td3.to() + + pickled = pickle.dumps(td3) + unpickled = pickle.loads(pickled) + assert unpickled diff --git a/tests/integration_tests/test_examples.py b/tests/integration_tests/test_examples.py index 0d36d411a0..f5401bfb22 100644 --- a/tests/integration_tests/test_examples.py +++ b/tests/integration_tests/test_examples.py @@ -45,6 +45,8 @@ EXAMPLES_ROOT_DIR / 'torch/mttrpo_metaworld_mt1_push.py', EXAMPLES_ROOT_DIR / 'torch/mttrpo_metaworld_mt10.py', EXAMPLES_ROOT_DIR / 'torch/mttrpo_metaworld_mt50.py', + EXAMPLES_ROOT_DIR / 'torch/td3_halfcheetah.py', + EXAMPLES_ROOT_DIR / 'torch/td3_pendulum.py', EXAMPLES_ROOT_DIR / 'tf/te_ppo_point.py', EXAMPLES_ROOT_DIR / 'tf/te_ppo_metaworld_mt1_push.py', EXAMPLES_ROOT_DIR / 'tf/te_ppo_metaworld_mt10.py', From fa3304008124ad77221d8b15f01f9a561f012acd Mon Sep 17 00:00:00 2001 From: mishari <44849486+maliesa96@users.noreply.github.com> Date: Thu, 22 Oct 2020 13:08:11 -0700 Subject: [PATCH 19/23] Add Torch TD3 and DQN to README (#2150) * Add Torch TD3 and DQN to README * Apply suggestions from code review * Update README.md Co-authored-by: Ryan Julian --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5150ff3b78..6152b0cb0e 100644 --- a/README.md +++ b/README.md @@ -56,13 +56,13 @@ The table below summarizes the algorithms available in garage. | CMA-ES | numpy | | REINFORCE (a.k.a. 
VPG) | PyTorch, TensorFlow | | DDPG | PyTorch, TensorFlow | -| DQN | TensorFlow | +| DQN | PyTorch, TensorFlow | | DDQN | TensorFlow | | ERWR | TensorFlow | | NPO | TensorFlow | | PPO | PyTorch, TensorFlow | | REPS | TensorFlow | -| TD3 | TensorFlow | +| TD3 | PyTorch, TensorFlow | | TNPG | TensorFlow | | TRPO | PyTorch, TensorFlow | | MAML | PyTorch | From d843e5b67460c9216fb5f5b2a8982ffcb82531f4 Mon Sep 17 00:00:00 2001 From: Karthikeyan Singaravelan Date: Fri, 23 Oct 2020 01:49:37 +0530 Subject: [PATCH 20/23] Fix warning regarding ABC import from collections (#2146) --- tests/garage/envs/dm_control/test_dm_control_env.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/garage/envs/dm_control/test_dm_control_env.py b/tests/garage/envs/dm_control/test_dm_control_env.py index 9ecf066211..3b31c6d61e 100644 --- a/tests/garage/envs/dm_control/test_dm_control_env.py +++ b/tests/garage/envs/dm_control/test_dm_control_env.py @@ -1,4 +1,4 @@ -import collections +import collections.abc from copy import copy import pickle @@ -79,7 +79,7 @@ def test_does_not_modify_actions(self): a_copy = copy(a) env.reset() env.step(a) - if isinstance(a, collections.Iterable): + if isinstance(a, collections.abc.Iterable): assert a.all() == a_copy.all() else: assert a == a_copy @@ -94,7 +94,7 @@ def test_all_does_not_modify_actions(self, domain_name, task_name): a_copy = copy(a) env.reset() env.step(a) - if isinstance(a, collections.Iterable): + if isinstance(a, collections.abc.Iterable): assert a.all() == a_copy.all() else: assert a == a_copy From 792771dc7ecc1b695a56f8e54a5c249f0d7c914e Mon Sep 17 00:00:00 2001 From: Ruofu Wang <31981600+yeukfu@users.noreply.github.com> Date: Mon, 26 Oct 2020 21:11:54 -0700 Subject: [PATCH 21/23] Add docs for algos/CEM (#2141) * Add cem doc fix ppo doc title * Chmod numpy.png --- docs/index.md | 1 + docs/user/algo_cem.md | 48 +++++++++++++++++++++++++++++++++++++ docs/user/algo_ppo.md | 4 ++-- docs/user/images/numpy.png | Bin 0 -> 4653 bytes 4 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 docs/user/algo_cem.md create mode 100644 docs/user/images/numpy.png diff --git a/docs/index.md b/docs/index.md index d001eade5e..4cd7acd51f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -62,6 +62,7 @@ and how to implement new MDPs and new algorithms. user/algo_vpg user/algo_td3 user/algo_ddpg + user/algo_cem .. toctree:: :maxdepth: 2 diff --git a/docs/user/algo_cem.md b/docs/user/algo_cem.md new file mode 100644 index 0000000000..4fd38e0f8c --- /dev/null +++ b/docs/user/algo_cem.md @@ -0,0 +1,48 @@ +# Cross Entropy Method + +```eval_rst ++-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Paper** | The cross-entropy method: A unified approach to Monte Carlo simulation, randomized optimization and machine learning :cite:`rubinstein2004cross` | ++-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Framework(s)** | .. 
figure:: ./images/numpy.png | +| | :scale: 40% | +| | :class: no-scaled-link | ++-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ +| **API Reference** | `garage.np.algos.CEM <../_autoapi/garage/np/algos/index.html#garage.np.algos.CEM>`_ | ++-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Code** | `garage/np/algos/cem.py `_ | ++-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ +``` + +Cross Entropy Method (CEM) works by iteratively optimizing a gaussian +distribution of policy. + +In each epoch, CEM does the following: + +1. Sample n_samples policies from a gaussian distribution of mean cur_mean and +std cur_std. + +2. Collect episodes for each policy. + +3. Update cur_mean and cur_std by doing Maximum Likelihood Estimation over the +n_best top policies in terms of return. + +## Examples + +### NumPy + +```eval_rst +.. literalinclude:: ../../examples/np/cem_cartpole.py +``` + +## References + +```eval_rst +.. bibliography:: references.bib + :style: unsrt + :filter: docname in docnames +``` + +---- + +*This page was authored by Ruofu Wang ([@yeukfu](https://github.com/yeukfu)).* diff --git a/docs/user/algo_ppo.md b/docs/user/algo_ppo.md index abb4a9ea9f..2ee714846f 100644 --- a/docs/user/algo_ppo.md +++ b/docs/user/algo_ppo.md @@ -35,13 +35,13 @@ regularization adds the mean entropy to the surrogate objective. See Garage has implementations of PPO with PyTorch and TensorFlow. -## PyTorch +### PyTorch ```eval_rst .. literalinclude:: ../../examples/torch/ppo_pendulum.py ``` -## TensorFlow +### TensorFlow ```eval_rst .. 
literalinclude:: ../../examples/tf/ppo_pendulum.py diff --git a/docs/user/images/numpy.png b/docs/user/images/numpy.png new file mode 100644 index 0000000000000000000000000000000000000000..4214edd5588f3a1f8d2baae20b38c27cf04e301d GIT binary patch literal 4653 zcmai2byU>P*Z%AR%aYRFAR)DYNQm^(At4|or63C|9THzkx={p|mK2r}LApyorKMb2 z(p|a)etggGulJAleb1bk=giEx=gc#6=iYms*vES6WW-Fw0059_YN#6C;^1vJheB@0 zs}siQTflcz(p3V0+GLWywgk6x4totlT>yB+3jm>!0B~`;6uJojJ}>~-wgCXyOaP#F z&u%u5yWN1;X{)OOH~+T$w$gL}pj6dVRWkOU+sg^|o|w8Cz~mv*8bGp?KHCred8lHe z{8?AaNA|e%Ogf*4=KWGn_CR#RgapBuycl~S*cE^$4Myh4+R9p)E2e=bWHukbXJ4j= zPs`XAF1a7ZmMTx&$6TIYDUk%6@#?16lkElPOn7mehG5&8PTR4+51TNTSCAA99;#SX zuGIeY_k`ZXv*sWo=1%(nJVHJWmo(&0 z0|}(?yykq(wI%W9y3!Iq$r_*Tgr*k+r~vJc_`Mqkm*0?T&?CMwVI`FxONJJ*SPb1@5zF0Xc(B`~o37S~&<`$Ik$f1ZSw>Dl ztH!q~3@`BXD<7gaauUHxE3E;K=NLb<=6mLwn=@*vL5m#r8YYY-cYaaWs-}H;Y$gZk zB(-4{Fka7fo1?1urHN=&m=qf&SS1IVW7fED*@n<+#K2zQN#70Amgg=Zi2nBT#;zxl zNC{0!PAI5LzY7Z}q4&3^6(3uFq@wQ+^}ddbQ#B!Qr&qPSxQr zXvw;UYdix$j_0pdO(OT%ZBD;E)kv92OY2n0tlCl&KTx(8w3Mg(gL{K+A~4qB8)b_n zX)b|{Y25J!xTf6sgw|75WBiYu3`?c|;rr4ESTdvit6$(>+R9loMyL1cLLeJ>42$$t zIjY3?!7zgkh|~V^^p!TV8)|9htBu!Zhx76)5wMHS*SdXnnn@WJ9>c0<#46(!V1X*9 zToveaKsG$Ee%!Ij>HaQr!GcD9w4Kgm4Kjaq&PS)JBDf&x}t-I*%~Nz`$Mnh73msB#UxJDPQ1F~4gm zpkEu(wr$p;c*(OjLYo*RsyFd&lh)Zx3cWO(nJ%$!&&wI!O{nTGm;2bIDxCeJZymGWhed)JJtrSpE7mm?=4T9&J-jq3;nW3k>5*+I{7etK$BBWPBWd|535pk)!UFu=O zrmK74tadZ)3RFxoAenM*%Io*M)7|_0M$^fW)tn%6G6FnJN9^@4z_fnzp9&# zvac(E>?1HJ{vpE`f$IaE91<$dM%Ob+?bm)1l%jq!2{-m7L5cMWu)a=~`74XW#dV7S zQv>xjwEOWarp%r(NHA1Cji!Hh>4&ZTg74vYCLw!Ef5cl9Bfk0UfH@+d|ImmJw5`hsvgGd#j1{|d-w9(MppNS4y4vqu zyV)eLZW*tIi~bHu3sLC#X!`PSa<+XL&wvB?(ppMgG@VPoX~J*SEgj^ZFz=QW5HV&x z^9EUd-1Pn+Xy$Z+;+bTn>sgS@)Q2`Lu%yow=!`;9Jz?`KC>1`1XUxHnwWH8SC)(2+ z==TDF#uX@Ymb5_6~>E1UZGN)p0URTFvC`?L1P* zN=26Au0~dSOZ+pUvP{p^lX9&rm(gM&bvtk8=5X9R`(Ui5NS>y8CJAlv;7Fq?xsh?L z@3spDUajCN|6?LvcwLu1y>K}BC+o%Fp3q-Jk>mr?N1lka<7m&5YBzTl3uE4Qt{s-8 ztF;4h6dymXpcD1!LLjiGf^9P-KOF32(_?QcDB3ov9&}J|G-M!e#L~^dg@>IC;3Wir z{Ea_1JtcDf(qdcN2loWJ$UQa@GoUAokTcypGF3n7DNyN;@TqNxa!tyx4f&_#b$(>_ zebdt;@90WXJWVcRkO#qdpBWf^kj!FbV7ct$Omug|?n~=CiZM%|kFSeM#(cC3%R-(@ zwUhsRm)y8S;xK|TWZ)3!#^*W;D;R0zIyhM9X+nU`K#B&FsI0B3s<5%G{esU6fxTw5}T4?^pke& zK=&c(TM9Q$-d-L}jrX15RukT}#nYz{N>^P~mN;nRJ<`{giWHakD_?pTp|Ay{)PJOT z55yoPxFY?OY;73&2sA_%v-tdXOB&NfqTe7n{UL>oqO=g}29pPgUn`M%h|F^)yXH7( zwJHxe*?A*Lk)u-QJ}Rfb#x0($Mb{?4#yHFiNwd z3Pt>GHU%el9ccJ|Sd`BqOLJF^n@86>U@u7|Ss&}G#QRbD+HY4_dd4{(@0n6@-H$tR zdWYRT1qGj&}|jk`dOA7r94nMGDMNSFAwt7FCSWK$U=Nh8u)HG z=A@ymE3>ULF6*yxzsGzFrP8~XC9d+CXwF;KqK9r72s{5#e zhjTrWD`MK%G_4@_BnQm0@gy;AH8lo;5znU3)0^K51!y0N)ZDs4+^eOeeV63MnLuHP zZ7xAKI&`Dc(XZ^{5->hC#SUCtrR+25yg|2ZtaP?JM+{Q{ccZMhS#;4hdkC@j7LQ>4# zUG6qTfHS;DrdO>99U*?C17^14-^?xl6#UKM&pkfspdD2l`+l7_QADgDqjW{%d&7`(1pHKcw(PmcdPiNI?(~0!)f=rO#Ns}jb@t3w! 
z`#}Y@P_hgAqn+Q+YUoE0T=HzuF^!uP0>eh=3R77N=sG8yQcP9(bp8*f`F@I91jvOz zKyu%=h}Uf)%hlj$p>Uhl={e)SO^6=-a>18mzBgW&)Sj+`{BZqXI{)cBC7nEtf5pYUOIx72S_-E1n zHgPc*wW7-4Ozsq-iK%sbj;DJ1Y9vKosBf&?=juYe3C5AjwJ6c}|9ao&X;U|=)hkSb zqh?=tT|U$kPqYM|#28K9lsO^1*^+KYTTJfK<)Bb4mNB*%^KS?RvmBWC*b zPcoZPJwfN*ex%3Z_MQq_1S6#2v+K|~6s|Wq*~OY?v;^4go#b%v4ZxBGHk2pINkdY@ z;1RkUWJyFykJmEf&c`T&IIVn+wvz^5IbL68cw*QUzZh_zKU|EarxZk%2RI7ZMj~fU z#_YcSB}yHiw6a0J?onFN8ps%5Xjs+{^$?(Z?jr)eiH)%UKR_YO=Lg3csbSQLw!gc5 zp`CQE{bw)UcQ!{e--h28ZrLrSGhtLvaZ&a`l2zXE7GCZLurzp@DD5E z7$?IY;QZ17siarg8wLB~*%Xi`{XakE+(z&jc4VN{p}kEAPnh1&PbooJmP!}bMcW`R zk1k45&)eeWJ6@2w>TaNIWebkcQK>oHP`e5FMC#7sQB24u21ZT_Dge?Z&3l{wK^$(e z3HlQPNjwC~NIaMA*kc3Iivc*ypQQOFY}A?-A||0CPVqy~5~60}Z-)9?zF1GN#a6l{ zUtn|qn&f4PAL+0WjWH>j`{}}6jUCHDYHh>GM9w}4hPbapezo9zCNllYdGIOt%uPXKm9@b1 zKuSmOOp&~;XeRu~So*|`D|d$d*|e+Fjb@xm+So1OPp9#}6@QKmPeEVE3q_lGjuSbo ze=Q`Ziev4hZ{s8dj_=tAwV`p}J!9_jP*BqzKv3Tjv5KMxB>i?f6@n6{jB71WuC^YmsWsmjI zt-^BG8*b+9VC(%{*532^Edb&$ahNbnN?2UNSWHq@{DG{5m>>)$3xlcswU+)L2Chhl amoEbTcLS+{jaz*O05suxs#VH1;r{`_@u0W> literal 0 HcmV?d00001 From a63349a091df2f169c8f1530546ecbf0ae1e6a8d Mon Sep 17 00:00:00 2001 From: Ruofu Wang <31981600+yeukfu@users.noreply.github.com> Date: Tue, 27 Oct 2020 14:32:15 -0700 Subject: [PATCH 22/23] Add docs for logging and plotting (#2147) * Add docs for logging and plotting * Fix grammer error * Fix isort --- docs/index.md | 1 + docs/user/logging_plotting.md | 82 +++++++++++++++++++++++++++++++++++ src/garage/__init__.py | 17 +++----- 3 files changed, 90 insertions(+), 10 deletions(-) create mode 100644 docs/user/logging_plotting.md diff --git a/docs/index.md b/docs/index.md index 4cd7acd51f..cbbeb3ddc9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -72,6 +72,7 @@ and how to implement new MDPs and new algorithms. user/environment_libraries user/concept_experiment user/sampling + user/logging_plotting .. toctree:: :maxdepth: 2 diff --git a/docs/user/logging_plotting.md b/docs/user/logging_plotting.md new file mode 100644 index 0000000000..1d0c7d2f17 --- /dev/null +++ b/docs/user/logging_plotting.md @@ -0,0 +1,82 @@ +# Logging and plotting + +## Logging + +garage supports convenient and useful logging. garage uses [dowel](https://github.com/rlworkgroup/dowel) +for logging. The `logger` supports many outputs, including + +- Std output +- Text output +- Csv output +- TensorBoard output + +In garage's experiment, the `logger` will output to all of these. + +Here is an example of logging in garage. + +```py +from garage import wrap_experiment +from dowel import logger, tabular + +@wrap_experiment +def log_experiment(ctxt=None): + for i in range(100): + # Log str directly + logger.log('Logging messages:') + # Log scalar values with the key 'AverageReturn' + tabular.record('AverageReturn', i) + + # The Trainer will do these steps for you, if you log things in + # the algorithms. + logger.log(tabular) + logger.dump_all() + +log_experiment() +``` + +Running the example will generate outputs like: + +```sh +2020-10-21 14:06:04 | [log_experiment] Logging to [CUR_DIR]/data/local/experiment/log_experiment +2020-10-21 14:06:04 | [log_experiment] Logging messages: +------------- - +AverageReturn 0 +------------- - +2020-10-21 14:06:04 | [log_experiment] Logging messages: +------------- - +AverageReturn 1 +------------- - +2020-10-21 14:06:04 | [log_experiment] Logging messages: +------------- - +AverageReturn 2 +------------- - +``` + +To look at outputs with TensorBoard, you can refer to this [page](monitor_experiments_with_tensorboard). 
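Outside of `wrap_experiment`, outputs can also be attached to the `logger` by hand. A minimal sketch, assuming dowel's standard output classes and with illustrative file names (this snippet is an editorial example, not part of the committed doc):

```py
import dowel
from dowel import logger, tabular

# Send log output to stdout, a CSV file, and a TensorBoard event directory.
logger.add_output(dowel.StdOutput())
logger.add_output(dowel.CsvOutput('progress.csv'))
logger.add_output(dowel.TensorBoardOutput('tb'))

tabular.record('AverageReturn', 42)
logger.log(tabular)
logger.dump_all()

logger.remove_all()  # close the outputs when finished
```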
+ +To set a customized log directory, just pass a `log_dir` argument to the +experiment. + +```py +@wrap_experiment(log_dir='my_custom_log_fir') +``` + +## Plotting + +In garage, as long as the environment implement the `visualize()` method, is +it easy to plot a policy running in the environment when training. + +To visualize an experiment, just set the `plot` argument to `True` in the +[`train`](../_autoapi/garage/index.html#garage.Trainer.train) method of +`Trainer`. For example, in [example/tf/trpo_cartpole.py](https://github.com/rlworkgroup/garage/blob/master/examples/tf/trpo_cartpole.py), +change the train line into: + +```py +trainer.train(n_epochs=100, batch_size=4000, plot=True) +``` + +If you want to pause in every epoch, just set `pause_for_plot` to `True`. + +---- + +*This page was authored by Ruofu Wang ([@yeukfu](https://github.com/yeukfu)).* diff --git a/src/garage/__init__.py b/src/garage/__init__.py index 4f1ce5ac5b..5481bd61dc 100644 --- a/src/garage/__init__.py +++ b/src/garage/__init__.py @@ -1,18 +1,13 @@ """Garage Base.""" # yapf: disable -from garage._dtypes import (EpisodeBatch, - InOutSpec, - StepType, - TimeStep, +from garage._dtypes import (EpisodeBatch, InOutSpec, StepType, TimeStep, TimeStepBatch) from garage._environment import Environment, EnvSpec, EnvStep, Wrapper -from garage._functions import (_Default, - log_multitask_performance, - log_performance, - make_optimizer, - obtain_evaluation_episodes, - rollout) +from garage._functions import (_Default, log_multitask_performance, + log_performance, make_optimizer, + obtain_evaluation_episodes, rollout) from garage.experiment.experiment import wrap_experiment +from garage.trainer import TFTrainer, Trainer # yapf: enable @@ -33,4 +28,6 @@ 'Wrapper', 'rollout', 'obtain_evaluation_episodes', + 'Trainer', + 'TFTrainer', ] From 4312678397f78ea52cd9f47e1d53583bd146b8e9 Mon Sep 17 00:00:00 2001 From: Ruofu Wang <31981600+yeukfu@users.noreply.github.com> Date: Wed, 28 Oct 2020 11:57:56 -0700 Subject: [PATCH 23/23] Refactor RL2 to use EpisodeBatch (#2138) * Refactor RL2 to use EpisodeBatch * Fix isort --- .../torch/maml_trpo_metaworld_ml1_push.py | 3 +- src/garage/tf/algos/reps.py | 4 +- src/garage/tf/algos/rl2.py | 90 ++++++++++--------- src/garage/tf/algos/te_npo.py | 16 +--- src/garage/torch/algos/bc.py | 8 +- 5 files changed, 55 insertions(+), 66 deletions(-) diff --git a/examples/torch/maml_trpo_metaworld_ml1_push.py b/examples/torch/maml_trpo_metaworld_ml1_push.py index 93de39f2d3..840dd5746f 100755 --- a/examples/torch/maml_trpo_metaworld_ml1_push.py +++ b/examples/torch/maml_trpo_metaworld_ml1_push.py @@ -8,8 +8,7 @@ from garage import wrap_experiment from garage.envs import MetaWorldSetTaskEnv -from garage.experiment import (MetaEvaluator, - MetaWorldTaskSampler, +from garage.experiment import (MetaEvaluator, MetaWorldTaskSampler, SetTaskSampler) from garage.experiment.deterministic import set_seed from garage.torch.algos import MAMLTRPO diff --git a/src/garage/tf/algos/reps.py b/src/garage/tf/algos/reps.py index 4c99b36b0f..dd639be030 100644 --- a/src/garage/tf/algos/reps.py +++ b/src/garage/tf/algos/reps.py @@ -10,9 +10,7 @@ from garage import _Default, log_performance, make_optimizer from garage.np.algos import RLAlgorithm from garage.sampler import RaySampler -from garage.tf import (compile_function, - flatten_inputs, - graph_inputs, +from garage.tf import (compile_function, flatten_inputs, graph_inputs, new_tensor) from garage.tf.optimizers import LBFGSOptimizer diff --git a/src/garage/tf/algos/rl2.py 
b/src/garage/tf/algos/rl2.py index d67a93b584..d07026eef1 100644 --- a/src/garage/tf/algos/rl2.py +++ b/src/garage/tf/algos/rl2.py @@ -12,7 +12,6 @@ from garage import (EnvSpec, EnvStep, EpisodeBatch, log_multitask_performance, StepType, Wrapper) -from garage.np import concat_tensor_dict_list, discount_cumsum from garage.np.algos import MetaRLAlgorithm from garage.sampler import DefaultWorker from garage.tf.algos._rl2npo import RL2NPO @@ -339,7 +338,7 @@ def train(self, trainer): if trainer.step_itr % self._n_epochs_per_eval == 0: if self._meta_evaluator is not None: self._meta_evaluator.evaluate(self) - trainer.step_episode = trainer.obtain_samples( + trainer.step_episode = trainer.obtain_episodes( trainer.step_itr, env_update=self._task_sampler.sample(self._meta_batch_size)) last_return = self.train_once(trainer.step_itr, @@ -348,18 +347,18 @@ def train(self, trainer): return last_return - def train_once(self, itr, paths): + def train_once(self, itr, episodes): """Perform one step of policy optimization given one batch of samples. Args: itr (int): Iteration number. - paths (list[dict]): A list of collected paths. + episodes (EpisodeBatch): Batch of episodes. Returns: numpy.float64: Average return. """ - episodes, average_return = self._process_samples(itr, paths) + episodes, average_return = self._process_samples(itr, episodes) logger.log('Optimizing policy...') self._inner_algo.optimize_policy(episodes) return average_return @@ -400,16 +399,17 @@ def adapt_policy(self, exploration_policy, exploration_episodes): return RL2AdaptedPolicy(exploration_policy._policy) # pylint: disable=protected-access - def _process_samples(self, itr, paths): + def _process_samples(self, itr, episodes): # pylint: disable=too-many-statements """Return processed sample data based on the collected paths. Args: itr (int): Iteration number. - paths (OrderedDict[dict]): A list of collected paths for each - task. In RL^2, there are n environments/tasks and paths in - each of them will be concatenated at some point and fed to - the policy. + episodes (EpisodeBatch): Original collected episode batch for each + task. For each episode, episode.agent_infos['batch_idx'] + indicates which task this episode belongs to. In RL^2, there + are n environments/tasks and paths in each of them will be + concatenated at some point and fed to the policy. 
Returns: EpisodeBatch: Processed batch of episodes for feeding the inner @@ -423,13 +423,12 @@ def _process_samples(self, itr, paths): concatenated_paths = [] paths_by_task = collections.defaultdict(list) - for path in paths: - path['returns'] = discount_cumsum(path['rewards'], self._discount) - path['lengths'] = [len(path['rewards'])] - if 'batch_idx' in path: - paths_by_task[path['batch_idx']].append(path) - elif 'batch_idx' in path['agent_infos']: - paths_by_task[path['agent_infos']['batch_idx'][0]].append(path) + for episode in episodes.split(): + if hasattr(episode, 'batch_idx'): + paths_by_task[episode.batch_idx[0]].append(episode) + elif 'batch_idx' in episode.agent_infos: + paths_by_task[episode.agent_infos['batch_idx'][0]].append( + episode) else: raise ValueError( 'Batch idx is required for RL2 but not found, ' @@ -437,10 +436,12 @@ def _process_samples(self, itr, paths): 'for sampling') # all path in paths_by_task[i] are sampled from task[i] - for _paths in paths_by_task.values(): - concatenated_path = self._concatenate_paths(_paths) + for episode_list in paths_by_task.values(): + concatenated_path = self._concatenate_paths(episode_list) concatenated_paths.append(concatenated_path) + concatenated_episodes = EpisodeBatch.concatenate(*concatenated_paths) + name_map = None if hasattr(self._task_sampler, '_envs') and hasattr( self._task_sampler._envs[0]._env, 'all_task_names'): @@ -450,17 +451,13 @@ def _process_samples(self, itr, paths): name_map = dict(enumerate(names)) undiscounted_returns = log_multitask_performance( - itr, - EpisodeBatch.from_list(self._env_spec, paths), - self._inner_algo._discount, - name_map=name_map) + itr, episodes, self._inner_algo._discount, name_map=name_map) average_return = np.mean(undiscounted_returns) - episodes = EpisodeBatch.from_list(self._env_spec, concatenated_paths) - return episodes, average_return + return concatenated_episodes, average_return - def _concatenate_paths(self, paths): + def _concatenate_paths(self, episode_list): """Concatenate paths. The input paths are from different episodes but same task/environment. @@ -468,8 +465,8 @@ def _concatenate_paths(self, paths): path and fed to the policy. Args: - paths (dict): Input paths. All paths are from different episodes, - but the same task/environment. + episode_list (list[EpisodeBatch]): Input paths. All paths are from + different episodes, but the same task/environment. Returns: dict: Concatenated paths from the same task/environment. 
Shape of @@ -479,23 +476,30 @@ def _concatenate_paths(self, paths): values of shape :math:`[max_episode_length, S^*]` """ - observations = np.concatenate([path['observations'] for path in paths]) + env_infos = { + k: np.concatenate([b.env_infos[k] for b in episode_list]) + for k in episode_list[0].env_infos.keys() + } + agent_infos = { + k: np.concatenate([b.agent_infos[k] for b in episode_list]) + for k in episode_list[0].agent_infos.keys() + } actions = np.concatenate([ - self._env_spec.action_space.flatten_n(path['actions']) - for path in paths + self._env_spec.action_space.flatten_n(ep.actions) + for ep in episode_list ]) - valids = np.concatenate( - [np.ones_like(path['rewards']) for path in paths]) - baselines = np.concatenate( - [np.zeros_like(path['rewards']) for path in paths]) - - concatenated_path = concat_tensor_dict_list(paths) - concatenated_path['observations'] = observations - concatenated_path['actions'] = actions - concatenated_path['valids'] = valids - concatenated_path['baselines'] = baselines - - return concatenated_path + + return EpisodeBatch( + env_spec=episode_list[0].env_spec, + observations=np.concatenate( + [ep.observations for ep in episode_list]), + last_observations=episode_list[-1].last_observations, + actions=actions, + rewards=np.concatenate([ep.rewards for ep in episode_list]), + env_infos=env_infos, + agent_infos=agent_infos, + step_types=np.concatenate([ep.step_types for ep in episode_list]), + lengths=np.asarray([sum([ep.lengths[0] for ep in episode_list])])) @property def policy(self): diff --git a/src/garage/tf/algos/te_npo.py b/src/garage/tf/algos/te_npo.py index f4d526a952..962aaf2823 100644 --- a/src/garage/tf/algos/te_npo.py +++ b/src/garage/tf/algos/te_npo.py @@ -9,21 +9,13 @@ from garage import InOutSpec, log_performance from garage.experiment import deterministic -from garage.np import (discount_cumsum, - explained_variance_1d, - rrse, +from garage.np import (discount_cumsum, explained_variance_1d, rrse, sliding_window) from garage.np.algos import RLAlgorithm from garage.sampler import LocalSampler -from garage.tf import (center_advs, - compile_function, - compute_advantages, - concat_tensor_list, - discounted_returns, - flatten_inputs, - graph_inputs, - pad_tensor_dict, - positive_advs, +from garage.tf import (center_advs, compile_function, compute_advantages, + concat_tensor_list, discounted_returns, flatten_inputs, + graph_inputs, pad_tensor_dict, positive_advs, stack_tensor_dict_list) from garage.tf.embeddings import StochasticEncoder from garage.tf.optimizers import LBFGSOptimizer diff --git a/src/garage/torch/algos/bc.py b/src/garage/torch/algos/bc.py index 55ee264667..e65bc142c4 100644 --- a/src/garage/torch/algos/bc.py +++ b/src/garage/torch/algos/bc.py @@ -6,12 +6,8 @@ import numpy as np import torch -from garage import (_Default, - EpisodeBatch, - log_performance, - make_optimizer, - obtain_evaluation_episodes, - TimeStepBatch) +from garage import (_Default, EpisodeBatch, log_performance, make_optimizer, + obtain_evaluation_episodes, TimeStepBatch) from garage.np.algos.rl_algorithm import RLAlgorithm from garage.np.policies import Policy from garage.sampler import RaySampler
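The RL^2 refactor above replaces dict-based paths with `EpisodeBatch` objects keyed by a per-episode `batch_idx` written into `agent_infos` by the RL^2 sampler worker. A standalone sketch of the grouping step (the helper name is invented for illustration; garage itself does this inline in `_process_samples`):

```python
# Sketch only: restates the grouping logic from the refactored _process_samples.
import collections


def group_episodes_by_task(episodes):
    """Group an EpisodeBatch by the RL^2 task index each episode carries.

    `episodes.agent_infos['batch_idx']` marks which of the meta-batch
    environments produced each episode.
    """
    by_task = collections.defaultdict(list)
    for episode in episodes.split():  # yields one single-episode batch at a time
        by_task[episode.agent_infos['batch_idx'][0]].append(episode)
    return by_task
```

Each per-task list is then flattened into one long "trial" by `_concatenate_paths`, which sums the lengths into a single entry; that is why it builds the `EpisodeBatch` by hand instead of simply calling `EpisodeBatch.concatenate` within a task.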