From 27ca8288a28169ec59252fc4370014c691449385 Mon Sep 17 00:00:00 2001 From: mishari <44849486+maliesa96@users.noreply.github.com> Date: Wed, 14 Oct 2020 01:30:11 -0700 Subject: [PATCH 01/23] Add MAML doc (#2093) --- docs/index.md | 1 + docs/user/algo_maml.md | 86 ++++++++++++++++++++++++++++++++++++++++ docs/user/references.bib | 12 +++++- 3 files changed, 98 insertions(+), 1 deletion(-) create mode 100644 docs/user/algo_maml.md diff --git a/docs/index.md b/docs/index.md index a817ab5649..505f3cf8a2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -56,6 +56,7 @@ and how to implement new MDPs and new algorithms. user/algo_pearl user/algo_rl2 user/algo_ppo + user/algo_maml user/algo_mtppo user/algo_vpg user/algo_td3 diff --git a/docs/user/algo_maml.md b/docs/user/algo_maml.md new file mode 100644 index 0000000000..788325f946 --- /dev/null +++ b/docs/user/algo_maml.md @@ -0,0 +1,86 @@ +# MAML + +```eval_rst ++-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Paper** | Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks :cite:`finn2017modelagnostic` | ++-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Framework(s)** | .. figure:: ./images/pytorch.png | +| | :scale: 10% | +| | :class: no-scaled-link | +| | | +| | PyTorch | ++-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| **API Reference** | `garage.torch.algos.MAML <../_autoapi/garage/torch/algos/index.html#garage.torch.algos.maml>`_ | ++-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Code** | `garage/torch/algos/maml.py `_ | ++-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Examples** | :ref:`maml_ppo_half_cheetah_dir`, :ref:`maml_trpo_half_cheetah_dir`, :ref:`maml_trpo_metaworld_ml1_push`, :ref:`maml_trpo_metaworld_ml10`. :ref:`maml_trpo_metaworld_ml45` | ++-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +``` + +MAML is a meta-learning algorithm that trains the parameters of a policy such that they generalize well to unseen tasks. In essence, this technique produces models that are good few shot learners and easy to fine-tune. + +## Default Parameters + +```python +meta_batch_size=40, +inner_lr=0.1, +outer_lr=1e-3, +num_grad_updates=1, +meta_evaluator=None, +evaluate_every_n_epochs=1 +``` + +## Examples + +### maml_ppo_half_cheetah_dir + +```eval_rst +.. figure:: ./images/pytorch.png + :scale: 10% +.. literalinclude:: ../../examples/torch/maml_ppo_half_cheetah_dir.py +``` + +### maml_trpo_half_cheetah_dir + +```eval_rst +.. figure:: ./images/pytorch.png + :scale: 10% +.. literalinclude:: ../../examples/torch/maml_trpo_half_cheetah_dir.py +``` + +### maml_trpo_metaworld_ml1_push + +```eval_rst +.. figure:: ./images/pytorch.png + :scale: 10% +.. 
literalinclude:: ../../examples/torch/maml_trpo_metaworld_ml1_push.py +``` + +### maml_trpo_metaworld_ml10 + +```eval_rst +.. figure:: ./images/pytorch.png + :scale: 10% +.. literalinclude:: ../../examples/torch/maml_trpo_metaworld_ml10.py +``` + +### maml_trpo_metaworld_ml45 + +```eval_rst +.. figure:: ./images/pytorch.png + :scale: 10% +.. literalinclude:: ../../examples/torch/maml_trpo_metaworld_ml45.py +``` + +## References + +```eval_rst +.. bibliography:: references.bib + :style: unsrt + :filter: docname in docnames +``` + +---- + +*This page was authored by Mishari Aliesa ([@maliesa96](https://github.com/maliesa96)).* diff --git a/docs/user/references.bib b/docs/user/references.bib index eea7a77918..740c4f71fa 100644 --- a/docs/user/references.bib +++ b/docs/user/references.bib @@ -97,7 +97,8 @@ @inproceedings{peters2007reward year={2007}, volume={}, number={}, - pages={262-267},} + pages={262-267} +} @article{2009koberpolicy, title = {Policy Search for Motor Primitives in Robotics}, @@ -114,3 +115,12 @@ @article{2009koberpolicy year = {2009}, month_numeric = {6} } + +@misc{finn2017modelagnostic, + title={Model-Agnostic Meta-Learning for Fast Adaptation of Deep Networks}, + author={Chelsea Finn and Pieter Abbeel and Sergey Levine}, + year={2017}, + eprint={1703.03400}, + archivePrefix={arXiv}, + primaryClass={cs.LG} +} From 3b320484e6cad24013c57a9e96e078070f300963 Mon Sep 17 00:00:00 2001 From: Gitanshu Sardana Date: Wed, 14 Oct 2020 11:57:55 -0700 Subject: [PATCH 02/23] Make snapshot_config optional in benchmarks (#2085) PR #2072 added an argument to functions decorated with @benchmark to pass snapshot_config to auto benchmarks but that broke other benchmarks that didn't need that argument. This commit cleans it up by passing snapshot_config directly to iterate_experiments, since trying to pass it from within decorator function makes for a confusing API --- .../src/garage_benchmarks/benchmark_auto.py | 26 +++++++++---------- benchmarks/src/garage_benchmarks/helper.py | 8 +++--- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/benchmarks/src/garage_benchmarks/benchmark_auto.py b/benchmarks/src/garage_benchmarks/benchmark_auto.py index e42f025be2..577f2ae911 100644 --- a/benchmarks/src/garage_benchmarks/benchmark_auto.py +++ b/benchmarks/src/garage_benchmarks/benchmark_auto.py @@ -15,26 +15,26 @@ @benchmark(plot=False, auto=True) -def auto_ddpg_benchmarks(snapshot_config): +def auto_ddpg_benchmarks(): """Run experiments for DDPG benchmarking.""" iterate_experiments(ddpg_garage_tf, MuJoCo1M_ENV_SET, - snapshot_config=snapshot_config) + snapshot_config={'snapshot_mode': 'none'}) @benchmark(plot=False, auto=True) -def auto_ppo_benchmarks(snapshot_config): +def auto_ppo_benchmarks(): """Run experiments for PPO benchmarking.""" iterate_experiments(ppo_garage_pytorch, MuJoCo1M_ENV_SET, - snapshot_config=snapshot_config) + snapshot_config={'snapshot_mode': 'none'}) iterate_experiments(ppo_garage_tf, MuJoCo1M_ENV_SET, - snapshot_config=snapshot_config) + snapshot_config={'snapshot_mode': 'none'}) @benchmark(plot=False, auto=True) -def auto_td3_benchmarks(snapshot_config): +def auto_td3_benchmarks(): """Run experiments for TD3 benchmarking.""" td3_env_ids = [ env_id for env_id in MuJoCo1M_ENV_SET if env_id != 'Reacher-v2' @@ -42,26 +42,26 @@ def auto_td3_benchmarks(snapshot_config): iterate_experiments(td3_garage_tf, td3_env_ids, - snapshot_config=snapshot_config) + snapshot_config={'snapshot_mode': 'none'}) @benchmark(plot=False, auto=True) -def 
auto_trpo_benchmarks(snapshot_config): +def auto_trpo_benchmarks(): """Run experiments for TRPO benchmarking.""" iterate_experiments(trpo_garage_pytorch, MuJoCo1M_ENV_SET, - snapshot_config=snapshot_config) + snapshot_config={'snapshot_mode': 'none'}) iterate_experiments(trpo_garage_tf, MuJoCo1M_ENV_SET, - snapshot_config=snapshot_config) + snapshot_config={'snapshot_mode': 'none'}) @benchmark(plot=False, auto=True) -def auto_vpg_benchmarks(snapshot_config): +def auto_vpg_benchmarks(): """Run experiments for VPG benchmarking.""" iterate_experiments(vpg_garage_pytorch, MuJoCo1M_ENV_SET, - snapshot_config=snapshot_config) + snapshot_config={'snapshot_mode': 'none'}) iterate_experiments(vpg_garage_tf, MuJoCo1M_ENV_SET, - snapshot_config=snapshot_config) + snapshot_config={'snapshot_mode': 'none'}) diff --git a/benchmarks/src/garage_benchmarks/helper.py b/benchmarks/src/garage_benchmarks/helper.py index 301873585a..2dec455e4a 100644 --- a/benchmarks/src/garage_benchmarks/helper.py +++ b/benchmarks/src/garage_benchmarks/helper.py @@ -80,15 +80,12 @@ def wrapper_func(): count += 1 _log_dir = _log_dir + '_' + str(count) - snapshot_config = {} - if auto: _auto = auto auto_dir = os.path.join(_log_dir, 'auto') os.makedirs(auto_dir) - snapshot_config['snapshot_mode'] = 'none' - exec_func(snapshot_config) + exec_func() if plot: plot_dir = os.path.join(_log_dir, 'plot') @@ -148,7 +145,8 @@ def iterate_experiments(func, tf.compat.v1.reset_default_graph() ctxt = dict(log_dir=sub_log_dir) - ctxt.update(snapshot_config) + if snapshot_config: + ctxt.update(snapshot_config) func(ctxt, env_id=env_id, seed=seed) if _plot is not None or _auto: From b4b7aa177c76bf74885a673f3a4b6b424a63fc22 Mon Sep 17 00:00:00 2001 From: "Nicole (Shin Ying) Ng" Date: Wed, 14 Oct 2020 17:54:07 -0700 Subject: [PATCH 03/23] Add doc for MTSAC algorithm (#2041) * Add mtsac docs * Add mtsac doc * Update author name --- docs/index.md | 1 + docs/user/algo_mtsac.md | 63 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 docs/user/algo_mtsac.md diff --git a/docs/index.md b/docs/index.md index 505f3cf8a2..d001eade5e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -53,6 +53,7 @@ and how to implement new MDPs and new algorithms. user/algo_trpo user/algo_mttrpo user/algo_sac + user/algo_mtsac user/algo_pearl user/algo_rl2 user/algo_ppo diff --git a/docs/user/algo_mtsac.md b/docs/user/algo_mtsac.md new file mode 100644 index 0000000000..2f30a1267f --- /dev/null +++ b/docs/user/algo_mtsac.md @@ -0,0 +1,63 @@ +# Multi-Task Soft Actor-Critic + +```eval_rst +.. list-table:: + :header-rows: 0 + :stub-columns: 1 + :widths: auto + + * - **Action Space** + - Continuous + * - **Framework(s)** + - .. figure:: ./images/pytorch.png + :scale: 10% + + PyTorch + * - **API Reference** + - `garage.torch.algos.MTSAC <../_autoapi/garage/torch/algos/index.html#garage.torch.algos.MTSAC>`_ + * - **Code** + - `garage/torch/algos/mtsac.py `_ + * - **Examples** + - :ref:`mtsac_metaworld_ml1_pick_place`, :ref:`mtsac_metaworld_mt10`, :ref:`mtsac_metaworld_mt50` +``` + +The Multi-Task Soft Actor-Critic (MTSAC) algorithm is the same as the [Soft Actor Critic (SAC)](algo_sac) algorithm, except for a small change called "disentangled alphas". Alpha is the entropy coefficient that is used to control exploration of the agent/policy. Disentangling alphas refers to having a separate alpha coefficients for every task learned by the policy. The alphas are accessed by using a one-hot encoding of an id that is assigned to each task. 
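+
+As a rough sketch (not the garage implementation; every name below is hypothetical), selecting a per-task entropy coefficient with a one-hot task id might look like this:
+
+```python
+import torch
+
+num_tasks = 10
+# One learnable log-alpha per task ("disentangled" alphas).
+log_alphas = torch.zeros(num_tasks, requires_grad=True)
+# Suppose the one-hot task id marks task 3.
+task_one_hot = torch.nn.functional.one_hot(torch.tensor(3), num_tasks).float()
+# Entropy coefficient used in the SAC losses for transitions from task 3.
+alpha = (task_one_hot * log_alphas.exp()).sum()
+```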
+ + +## Default Parameters + +```python +initial_log_entropy=0., +discount=0.99, +buffer_batch_size=64, +min_buffer_size=int(1e4), +target_update_tau=5e-3, +policy_lr=3e-4, +qf_lr=3e-4, +reward_scale=1.0, +optimizer=torch.optim.Adam, +steps_per_epoch=1, +num_evaluation_episodes=5, +use_deterministic_evaluation=True, +``` + +## Examples + +### mtsac_metaworld_ml1_pick_place +```eval_rst +.. literalinclude:: ../../examples/torch/mtsac_metaworld_ml1_pick_place.py +``` + +### mtsac_metaworld_mt10 +```eval_rst +.. literalinclude:: ../../examples/torch/mtsac_metaworld_mt10.py +``` + +### mtsac_metaworld_mt50 +```eval_rst +.. literalinclude:: ../../examples/torch/mtsac_metaworld_mt10.py +``` + +---- + +*This page was authored by Nicole Shin Ying Ng ([@nicolengsy](https://github.com/nicolengsy)).* From 6da4d5206bbac00eadeb156b766fe6f96bbda5ba Mon Sep 17 00:00:00 2001 From: Avnish Narayan <38871737+avnishn@users.noreply.github.com> Date: Thu, 15 Oct 2020 13:02:39 -0700 Subject: [PATCH 04/23] Fix #2131 (#2132) There's something specifically wrong with an image used in rendering pendulum-v0, causing visualize to fail. I swapped it out with CartPole-v1 for the purposes of the tests and that was a fix. --- tests/garage/envs/test_normalized_gym.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/garage/envs/test_normalized_gym.py b/tests/garage/envs/test_normalized_gym.py index b899336ca3..b513029dcd 100644 --- a/tests/garage/envs/test_normalized_gym.py +++ b/tests/garage/envs/test_normalized_gym.py @@ -4,7 +4,7 @@ class TestNormalizedGym: def setup_method(self): - self.env = normalize(GymEnv('Pendulum-v0'), + self.env = normalize(GymEnv('CartPole-v1'), normalize_reward=True, normalize_obs=True, flatten_obs=True) From ade1ba147dbead58c061e02ed8243669372b58b0 Mon Sep 17 00:00:00 2001 From: "K.R. Zentner" <41180126+krzentner@users.noreply.github.com> Date: Fri, 16 Oct 2020 11:34:18 -0700 Subject: [PATCH 05/23] Allow setting x-axis in wrap_experiment (#2128) --- src/garage/experiment/experiment.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/garage/experiment/experiment.py b/src/garage/experiment/experiment.py index 04836682f4..f697db6f82 100644 --- a/src/garage/experiment/experiment.py +++ b/src/garage/experiment/experiment.py @@ -161,6 +161,7 @@ def my_experiment(ctxt, seed, lr=0.5): the function definition. use_existing_dir (bool): If true, (re)use the directory for this experiment, even if it already contains data. + x_axis (str): Key to use for x axis of plots. 
@@ -170,7 +171,7 @@ def my_experiment(ctxt, seed, lr=0.5): def __init__(self, *, function, log_dir, name, prefix, snapshot_mode, snapshot_gap, archive_launch_repo, name_parameters, - use_existing_dir): + use_existing_dir, x_axis): self.function = function self.log_dir = log_dir self.name = name @@ -180,6 +181,7 @@ def __init__(self, *, function, log_dir, name, prefix, snapshot_mode, self.archive_launch_repo = archive_launch_repo self.name_parameters = name_parameters self.use_existing_dir = use_existing_dir + self.x_axis = x_axis if self.function is not None: self._update_wrap_params() @@ -263,6 +265,7 @@ def _get_options(self, *args): snapshot_gap=self.snapshot_gap, snapshot_mode=self.snapshot_mode, use_existing_dir=self.use_existing_dir, + x_axis=self.x_axis, signature=self.__signature__) if args: if len(args) == 1 and isinstance(args[0], dict): @@ -321,7 +324,7 @@ def _make_context(cls, options, **kwargs): logger.add_output(dowel.TextOutput(text_log_file)) logger.add_output(dowel.CsvOutput(tabular_log_file)) logger.add_output( - dowel.TensorBoardOutput(log_dir, x_axis='TotalEnvSteps')) + dowel.TensorBoardOutput(log_dir, x_axis=options['x_axis'])) logger.add_output(dowel.StdOutput()) logger.push_prefix('[{}] '.format(name)) @@ -377,7 +380,8 @@ def wrap_experiment(function=None, snapshot_gap=1, archive_launch_repo=True, name_parameters=None, - use_existing_dir=False): + use_existing_dir=False, + x_axis='TotalEnvSteps'): """Decorate a function to turn it into an ExperimentTemplate. When invoked, the wrapped function will receive an ExperimentContext, which @@ -424,6 +428,7 @@ def my_experiment(ctxt, seed, lr=0.5): the function definition. use_existing_dir (bool): If true, (re)use the directory for this experiment, even if it already contains data. + x_axis (str): Key to use for x axis of plots. Returns: callable: The wrapped function. @@ -437,7 +442,8 @@ def my_experiment(ctxt, seed, lr=0.5): snapshot_gap=snapshot_gap, archive_launch_repo=archive_launch_repo, name_parameters=name_parameters, - use_existing_dir=use_existing_dir) + use_existing_dir=use_existing_dir, + x_axis=x_axis) def dump_json(filename, data): From b91fcf1e3a703cb04c2c5c1de4aeef5e4e3fd026 Mon Sep 17 00:00:00 2001 From: "K.R. 
Zentner" <41180126+krzentner@users.noreply.github.com> Date: Fri, 16 Oct 2020 12:47:02 -0700 Subject: [PATCH 06/23] Record total_env_steps in samplers (#2125) --- src/garage/sampler/local_sampler.py | 9 +++++++-- src/garage/sampler/multiprocessing_sampler.py | 9 +++++++-- src/garage/sampler/ray_sampler.py | 9 +++++++-- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/garage/sampler/local_sampler.py b/src/garage/sampler/local_sampler.py index 47fcb0baa3..11b88fd287 100644 --- a/src/garage/sampler/local_sampler.py +++ b/src/garage/sampler/local_sampler.py @@ -40,6 +40,7 @@ def __init__(self, worker_factory, agents, envs): for worker, agent, env in zip(self._workers, self._agents, self._envs): worker.update_agent(agent) worker.update_env(env) + self.total_env_steps = 0 @classmethod def from_worker_factory(cls, worker_factory, agents, envs): @@ -117,7 +118,9 @@ def obtain_samples(self, itr, num_samples, agent_update, env_update=None): completed_samples += len(batch.actions) batches.append(batch) if completed_samples >= num_samples: - return EpisodeBatch.concatenate(*batches) + samples = EpisodeBatch.concatenate(*batches) + self.total_env_steps += sum(samples.lengths) + return samples def obtain_exact_episodes(self, n_eps_per_worker, @@ -149,7 +152,9 @@ def obtain_exact_episodes(self, for _ in range(n_eps_per_worker): batch = worker.rollout() batches.append(batch) - return EpisodeBatch.concatenate(*batches) + samples = EpisodeBatch.concatenate(*batches) + self.total_env_steps += sum(samples.lengths) + return samples def shutdown_worker(self): """Shutdown the workers.""" diff --git a/src/garage/sampler/multiprocessing_sampler.py b/src/garage/sampler/multiprocessing_sampler.py index 86aaf445b3..13a2b4a916 100644 --- a/src/garage/sampler/multiprocessing_sampler.py +++ b/src/garage/sampler/multiprocessing_sampler.py @@ -61,6 +61,7 @@ def __init__(self, worker_factory, agents, envs): self._agent_version = 0 for w in self._workers: w.start() + self.total_env_steps = 0 @classmethod def from_worker_factory(cls, worker_factory, agents, envs): @@ -182,7 +183,9 @@ def obtain_samples(self, itr, num_samples, agent_update, env_update=None): except queue.Full: pass - return EpisodeBatch.concatenate(*batches) + samples = EpisodeBatch.concatenate(*batches) + self.total_env_steps += sum(samples.lengths) + return samples def obtain_exact_episodes(self, n_eps_per_worker, @@ -254,7 +257,9 @@ def obtain_exact_episodes(self, ordered_episodes = list( itertools.chain( *[episodes[i] for i in range(self._factory.n_workers)])) - return EpisodeBatch.concatenate(*ordered_episodes) + samples = EpisodeBatch.concatenate(*ordered_episodes) + self.total_env_steps += sum(samples.lengths) + return samples def shutdown_worker(self): """Shutdown the workers.""" diff --git a/src/garage/sampler/ray_sampler.py b/src/garage/sampler/ray_sampler.py index 506866ddf3..f943971957 100644 --- a/src/garage/sampler/ray_sampler.py +++ b/src/garage/sampler/ray_sampler.py @@ -40,6 +40,7 @@ def __init__(self, worker_factory, agents, envs): self._all_workers = defaultdict(None) self._workers_started = False self.start_worker() + self.total_env_steps = 0 @classmethod def from_worker_factory(cls, worker_factory, agents, envs): @@ -170,7 +171,9 @@ def obtain_samples(self, itr, num_samples, agent_update, env_update=None): batches.append(episode_batch) pbar.update(num_returned_samples) - return EpisodeBatch.concatenate(*batches) + samples = EpisodeBatch.concatenate(*batches) + self.total_env_steps += sum(samples.lengths) + return samples 
def obtain_exact_episodes(self, n_eps_per_worker, @@ -247,7 +250,9 @@ def obtain_exact_episodes(self, itertools.chain( *[episodes[i] for i in range(self._worker_factory.n_workers)])) - return EpisodeBatch.concatenate(*ordered_episodes) + samples = EpisodeBatch.concatenate(*ordered_episodes) + self.total_env_steps += sum(samples.lengths) + return samples def shutdown_worker(self): """Shuts down the worker.""" From 06e5aba9b06635e0772acf442f845672acc03694 Mon Sep 17 00:00:00 2001 From: Gitanshu Sardana Date: Fri, 16 Oct 2020 15:09:36 -0700 Subject: [PATCH 07/23] Rerun failed tests automatically once on CI (#2094) --- .github/workflows/ci.yml | 10 +++++----- Makefile | 10 +++++----- setup.py | 1 + 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4fe37df4e0..dff391b9e8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -129,7 +129,7 @@ jobs: "${DOCKER_TAG}" \ /bin/bash -c \ '[ ! -f ${MJKEY_PATH} ] || mv ${MJKEY_PATH} ${MJKEY_PATH}.bak && - pytest --cov=garage --cov-report=xml -m \ + pytest --cov=garage --cov-report=xml --reruns 1 -m \ "not nightly and not huge and not flaky and not large and not mujoco and not mujoco_long" --durations=20 && for i in {1..5}; do bash <(curl -s https://codecov.io/bash --retry 5) -Z && break @@ -171,7 +171,7 @@ jobs: "${DOCKER_TAG}" \ /bin/bash -c \ '[ ! -f ${MJKEY_PATH} ] || mv ${MJKEY_PATH} ${MJKEY_PATH}.bak && - pytest --cov=garage --cov-report=xml -m "large and not flaky" --durations=20 && + pytest --cov=garage --cov-report=xml --reruns 1 -m "large and not flaky" --durations=20 && for i in {1..5}; do bash <(curl -s https://codecov.io/bash --retry 5) -Z && break if [ $i == 5 ]; then @@ -211,7 +211,7 @@ jobs: --memory-swap 6500m \ "${DOCKER_TAG}" \ /bin/bash -c \ - 'pytest --cov=garage --cov-report=xml -m "mujoco and not flaky" --durations=20 && + 'pytest --cov=garage --cov-report=xml --reruns 1 -m "mujoco and not flaky" --durations=20 && for i in {1..5}; do bash <(curl -s https://codecov.io/bash --retry 5) -Z && break if [ $i == 5 ]; then @@ -251,7 +251,7 @@ jobs: --memory-swap 6500m \ "${DOCKER_TAG}" \ /bin/bash -c \ - 'pytest --cov=garage --cov-report=xml -m "mujoco_long and not flaky" --durations=20 && + 'pytest --cov=garage --cov-report=xml --reruns 1 -m "mujoco_long and not flaky" --durations=20 && for i in {1..5}; do bash <(curl -s https://codecov.io/bash --retry 5) -Z && break if [ $i == 5 ]; then @@ -290,7 +290,7 @@ jobs: $ci_env\ --memory 6500m \ --memory-swap 6500m \ - "${DOCKER_TAG}" pytest -v -m nightly + "${DOCKER_TAG}" pytest -v --reruns 1 -m nightly verify_envs_conda: diff --git a/Makefile b/Makefile index a979a22ca5..5a90f38c8a 100644 --- a/Makefile +++ b/Makefile @@ -49,7 +49,7 @@ ci-job-precommit: assert-docker ci-job-normal: assert-docker [ ! -f $(MJKEY_PATH) ] || mv $(MJKEY_PATH) $(MJKEY_PATH).bak - pytest --cov=garage --cov-report=xml -m \ + pytest --cov=garage --cov-report=xml --reruns 1 -m \ 'not nightly and not huge and not flaky and not large and not mujoco and not mujoco_long' --durations=20 for i in {1..5}; do \ bash <(curl -s https://codecov.io/bash --retry 5) -Z && break \ @@ -59,7 +59,7 @@ ci-job-normal: assert-docker ci-job-large: assert-docker [ ! 
-f $(MJKEY_PATH) ] || mv $(MJKEY_PATH) $(MJKEY_PATH).bak - pytest --cov=garage --cov-report=xml -m 'large and not flaky' --durations=20 + pytest --cov=garage --cov-report=xml --reruns 1 -m 'large and not flaky' --durations=20 for i in {1..5}; do \ bash <(curl -s https://codecov.io/bash --retry 5) -Z && break \ || echo 'Retrying...' && sleep 30 && continue; \ @@ -67,7 +67,7 @@ ci-job-large: assert-docker done ci-job-mujoco: assert-docker - pytest --cov=garage --cov-report=xml -m 'mujoco and not flaky' --durations=20 + pytest --cov=garage --cov-report=xml --reruns 1 -m 'mujoco and not flaky' --durations=20 for i in {1..5}; do \ bash <(curl -s https://codecov.io/bash --retry 5) -Z && break \ || echo 'Retrying...' && sleep 30 && continue; \ @@ -75,7 +75,7 @@ ci-job-mujoco: assert-docker done ci-job-mujoco-long: assert-docker - pytest --cov=garage --cov-report=xml -m 'mujoco_long and not flaky' --durations=20 + pytest --cov=garage --cov-report=xml --reruns 1 -m 'mujoco_long and not flaky' --durations=20 for i in {1..5}; do \ bash <(curl -s https://codecov.io/bash --retry 5) -Z && break \ || echo 'Retrying...' && sleep 30 && continue; \ @@ -83,7 +83,7 @@ ci-job-mujoco-long: assert-docker done ci-job-nightly: assert-docker - pytest -m nightly + pytest --reruns 1 -m nightly ci-job-verify-envs: assert-docker ci-job-verify-envs-pipenv ci-job-verify-envs-conda diff --git a/setup.py b/setup.py index 21cd579a4a..1f3cb68719 100644 --- a/setup.py +++ b/setup.py @@ -66,6 +66,7 @@ 'pylint>=2.5.3', 'pytest>=4.5.0', # Required for strict-markers 'pytest-cov', + 'pytest-rerunfailures', 'pytest-timeout', 'pytest-xdist', 'recommonmark', From 52047e85b047e06f2c6550cb03e357eebbb3bc7c Mon Sep 17 00:00:00 2001 From: Gitanshu Sardana Date: Fri, 16 Oct 2020 20:03:45 -0700 Subject: [PATCH 08/23] Unpin cloudpickle (#2133) closes #1882 tensorflow/probability 0.11.1 is out which allows compatibility with cloudpickle >= 1.3, so we don't need to pin it anymore --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1f3cb68719..2c502ed37a 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ # Please keep alphabetized 'akro', 'click>=2.0', - 'cloudpickle==1.3', + 'cloudpickle', 'cma==2.7.0', 'dowel==0.0.3', 'numpy>=1.14.5', From be20a5ed29017dc079d9b46ee7dbd2a2fe89e90e Mon Sep 17 00:00:00 2001 From: "Nicole (Shin Ying) Ng" Date: Mon, 19 Oct 2020 17:42:19 -0700 Subject: [PATCH 09/23] Check observation shape conforms to observation_space (#2089) * Add GymEnv check for obs shape * Check obs space contains observation --- src/garage/envs/gym_env.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/garage/envs/gym_env.py b/src/garage/envs/gym_env.py index 5474c4f4b7..472c196268 100644 --- a/src/garage/envs/gym_env.py +++ b/src/garage/envs/gym_env.py @@ -5,6 +5,7 @@ import akro import gym +import numpy as np from garage import Environment, EnvSpec, EnvStep, StepType @@ -242,6 +243,14 @@ def step(self, action): if step_type in (StepType.TERMINAL, StepType.TIMEOUT): self._step_cnt = None + if not self.spec.observation_space.contains(observation): + # Discrete actions can be either in the space normally, or one-hot + # encoded. 
+ if self.spec.observation_space.flat_dim != np.prod( + observation.shape): + raise RuntimeError('GymEnv observation shape does not ' + 'conform to its observation_space') + return EnvStep(env_spec=self.spec, action=action, reward=reward, From 83ba45c55f7eec7b8ac6d59066f9dad1e3e05e8c Mon Sep 17 00:00:00 2001 From: Hayden Shively <17186559+haydenshively@users.noreply.github.com> Date: Tue, 20 Oct 2020 12:02:02 -0500 Subject: [PATCH 10/23] Remove include_dashboard arg so that doctests pass (#2140) --- docs/requirements.txt | 12 ++++-------- src/garage/sampler/ray_sampler.py | 4 +--- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 2be56ef948..3e7b8fab54 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,6 @@ +sphinx-autoapi +sphinxcontrib-bibtex + akro click cloudpickle @@ -5,6 +8,7 @@ cma==2.7.0 dm_env dowel==0.0.3 gym[atari, box2d, classic_control]==0.17.2 +matplotlib psutil pyprind python-dateutil @@ -13,11 +17,3 @@ scipy setproctitle tensorflow tensorflow-probability - -# dev dependencies -matplotlib -recommonmark -sphinx -sphinx-autoapi>=1.4.0 -sphinx_rtd_theme -sphinxcontrib-bibtex diff --git a/src/garage/sampler/ray_sampler.py b/src/garage/sampler/ray_sampler.py index f943971957..170897521a 100644 --- a/src/garage/sampler/ray_sampler.py +++ b/src/garage/sampler/ray_sampler.py @@ -30,9 +30,7 @@ class RaySampler(Sampler): def __init__(self, worker_factory, agents, envs): # pylint: disable=super-init-not-called if not ray.is_initialized(): - ray.init(log_to_driver=False, - ignore_reinit_error=True, - include_dashboard=False) + ray.init(log_to_driver=False, ignore_reinit_error=True) self._sampler_worker = ray.remote(SamplerWorker) self._worker_factory = worker_factory self._agents = agents From 18f4a9b0f6b0e248c8d289147d13a8be6675c410 Mon Sep 17 00:00:00 2001 From: "K.R. Zentner" <41180126+krzentner@users.noreply.github.com> Date: Tue, 20 Oct 2020 11:04:53 -0700 Subject: [PATCH 11/23] Make plotting plot the learner in BC (#2127) --- src/garage/torch/algos/bc.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/garage/torch/algos/bc.py b/src/garage/torch/algos/bc.py index d070e21f3f..55ee264667 100644 --- a/src/garage/torch/algos/bc.py +++ b/src/garage/torch/algos/bc.py @@ -76,13 +76,17 @@ def __init__( self._batch_size = batch_size self._name = name + # For plotting + self.policy = self.learner + # Public fields for sampling. 
self._env_spec = env_spec + self.exploration_policy = None self.policy = None self.max_episode_length = env_spec.max_episode_length self.sampler_cls = None if isinstance(self._source, Policy): - self.policy = self._source + self.exploration_policy = self._source self.sampler_cls = RaySampler self._source = source else: From 3a0f8d18624b326de0400bff84ddc0f8c60c46e9 Mon Sep 17 00:00:00 2001 From: Ruofu Wang <31981600+yeukfu@users.noreply.github.com> Date: Tue, 20 Oct 2020 12:12:15 -0700 Subject: [PATCH 12/23] Refactor reps to use episode batch (#2123) --- src/garage/tf/algos/reps.py | 123 ++++++++++++++---------------------- 1 file changed, 47 insertions(+), 76 deletions(-) diff --git a/src/garage/tf/algos/reps.py b/src/garage/tf/algos/reps.py index 55a38f70e6..4c99b36b0f 100644 --- a/src/garage/tf/algos/reps.py +++ b/src/garage/tf/algos/reps.py @@ -7,18 +7,13 @@ import scipy.optimize import tensorflow as tf -from garage import (_Default, - EpisodeBatch, - log_performance, - make_optimizer, - StepType) +from garage import _Default, log_performance, make_optimizer from garage.np.algos import RLAlgorithm from garage.sampler import RaySampler from garage.tf import (compile_function, flatten_inputs, graph_inputs, - new_tensor, - paths_to_tensors) + new_tensor) from garage.tf.optimizers import LBFGSOptimizer # yapf: disable @@ -148,66 +143,38 @@ def train(self, trainer): last_return = None for _ in trainer.step_epochs(): - trainer.step_path = trainer.obtain_samples(trainer.step_itr) + trainer.step_path = trainer.obtain_episodes(trainer.step_itr) last_return = self._train_once(trainer.step_itr, trainer.step_path) trainer.step_itr += 1 return last_return - def _train_once(self, itr, paths): + def _train_once(self, itr, episodes): """Perform one step of policy optimization given one batch of samples. Args: itr (int): Iteration number. - paths (list[dict]): A list of collected paths. + episodes (EpisodeBatch): Batch of episodes. Returns: numpy.float64: Average return. """ - # -- Stage: Calculate baseline - paths = [ - dict( - observations=path['observations'], - actions=( - self._env_spec.action_space.flatten_n( # noqa: E126 - path['actions'])), - rewards=path['rewards'], - env_infos=path['env_infos'], - agent_infos=path['agent_infos'], - dones=np.array([ - step_type == StepType.TERMINAL - for step_type in path['step_types'] - ])) for path in paths - ] - - if hasattr(self._baseline, 'predict_n'): - baseline_predictions = self._baseline.predict_n(paths) - else: - baseline_predictions = [ - self._baseline.predict(path) for path in paths - ] - - # -- Stage: Pre-process samples based on collected paths - samples_data = paths_to_tensors(paths, self.max_episode_length, - baseline_predictions, self._discount, - self._gae_lambda) - # -- Stage: Run and calculate performance of the algorithm undiscounted_returns = log_performance( itr, - EpisodeBatch.from_list(self._env_spec, paths), + episodes, discount=self._discount) self._episode_reward_mean.extend(undiscounted_returns) tabular.record('Extras/EpisodeRewardMean', np.mean(self._episode_reward_mean)) - samples_data['average_return'] = np.mean(undiscounted_returns) + average_return = np.mean(undiscounted_returns) logger.log('Optimizing policy...') - self._optimize_policy(samples_data) + self._optimize_policy(episodes) - return samples_data['average_return'] + return average_return def __getstate__(self): """Parameters to save in snapshot. 
@@ -238,12 +205,11 @@ def __setstate__(self, state): self._name_scope = tf.name_scope(self._name) self._init_opt() - def _optimize_policy(self, samples_data): + def _optimize_policy(self, episodes): """Optimize the policy using the samples. Args: - samples_data (dict): Processed sample data. - See garage.tf.paths_to_tensors() for details. + episodes (EpisodeBatch): Batch of episodes. """ # Initial BFGS parameter values. @@ -255,8 +221,8 @@ def _optimize_policy(self, samples_data): # Optimize dual eta_before = self._param_eta logger.log('Computing dual before') - self._feat_diff = self._features(samples_data) - dual_opt_input_values = self._dual_opt_input_values(samples_data) + self._feat_diff = self._features(episodes) + dual_opt_input_values = self._dual_opt_input_values(episodes) dual_before = self._f_dual(*dual_opt_input_values) logger.log('Optimizing dual') @@ -272,7 +238,7 @@ def eval_dual(x): """ self._param_eta = x[0] self._param_v = x[1:] - dual_opt_input_values = self._dual_opt_input_values(samples_data) + dual_opt_input_values = self._dual_opt_input_values(episodes) return self._f_dual(*dual_opt_input_values) def eval_dual_grad(x): @@ -287,7 +253,7 @@ def eval_dual_grad(x): """ self._param_eta = x[0] self._param_v = x[1:] - dual_opt_input_values = self._dual_opt_input_values(samples_data) + dual_opt_input_values = self._dual_opt_input_values(episodes) grad = self._f_dual_grad(*dual_opt_input_values) eta_grad = np.float(grad[0]) v_grad = grad[1] @@ -301,11 +267,11 @@ def eval_dual_grad(x): logger.log('Computing dual after') self._param_eta, self._param_v = params_ast[0], params_ast[1:] - dual_opt_input_values = self._dual_opt_input_values(samples_data) + dual_opt_input_values = self._dual_opt_input_values(episodes) dual_after = self._f_dual(*dual_opt_input_values) # Optimize policy - policy_opt_input_values = self._policy_opt_input_values(samples_data) + policy_opt_input_values = self._policy_opt_input_values(episodes) logger.log('Computing policy loss before') loss_before = self._optimizer.loss(policy_opt_input_values) logger.log('Computing policy KL before') @@ -488,26 +454,25 @@ def _build_policy_loss(self, i): return loss - def _dual_opt_input_values(self, samples_data): + def _dual_opt_input_values(self, episodes): """Update dual func optimize input values based on samples data. Args: - samples_data (dict): Processed sample data. - See garage.tf.paths_to_tensors() for details. + episodes (EpisodeBatch): Batch of episodes. Returns: list(np.ndarray): Flatten dual function optimization input values. """ + agent_infos = episodes.padded_agent_infos policy_state_info_list = [ - samples_data['agent_infos'][k] - for k in self.policy.state_info_keys - ] # yapf: disable + agent_infos[k] for k in self.policy.state_info_keys + ] # pylint: disable=unexpected-keyword-arg dual_opt_input_values = self._dual_opt_inputs._replace( - reward_var=samples_data['rewards'], - valid_var=samples_data['valids'], + reward_var=episodes.padded_rewards, + valid_var=episodes.valids, feat_diff=self._feat_diff, param_eta=self._param_eta, param_v=self._param_v, @@ -516,28 +481,33 @@ def _dual_opt_input_values(self, samples_data): return flatten_inputs(dual_opt_input_values) - def _policy_opt_input_values(self, samples_data): + def _policy_opt_input_values(self, episodes): """Update policy optimize input values based on samples data. Args: - samples_data (dict): Processed sample data. - See garage.tf.paths_to_tensors() for details. + episodes (EpisodeBatch): Batch of episodes. 
Returns: list(np.ndarray): Flatten policy optimization input values. """ + agent_infos = episodes.padded_agent_infos policy_state_info_list = [ - samples_data['agent_infos'][k] - for k in self.policy.state_info_keys - ] # yapf: disable + agent_infos[k] for k in self.policy.state_info_keys + ] + + actions = [ + self._env_spec.action_space.flatten_n(act) + for act in episodes.actions_list + ] + padded_actions = episodes.pad_to_last(np.concatenate(actions)) # pylint: disable=unexpected-keyword-arg policy_opt_input_values = self._policy_opt_inputs._replace( - obs_var=samples_data['observations'], - action_var=samples_data['actions'], - reward_var=samples_data['rewards'], - valid_var=samples_data['valids'], + obs_var=episodes.padded_observations, + action_var=padded_actions, + reward_var=episodes.padded_rewards, + valid_var=episodes.valids, feat_diff=self._feat_diff, param_eta=self._param_eta, param_v=self._param_v, @@ -546,24 +516,24 @@ def _policy_opt_input_values(self, samples_data): return flatten_inputs(policy_opt_input_values) - def _features(self, samples_data): + def _features(self, episodes): """Get valid view features based on samples data. Args: - samples_data (dict): Processed sample data. - See garage.tf.paths_to_tensors() for details. + episodes (EpisodeBatch): Batch of episodes. Returns: numpy.ndarray: Features for training. """ - paths = samples_data['paths'] + start = 0 feat_diff = [] - for path in paths: - o = np.clip(path['observations'], + for length in episodes.lengths: + stop = start + length + o = np.clip(episodes.observations[start:stop], self._env_spec.observation_space.low, self._env_spec.observation_space.high) - lr = len(path['rewards']) + lr = length al = np.arange(lr).reshape(-1, 1) / self.max_episode_length feats = np.concatenate( [o, o**2, al, al**2, al**3, @@ -571,5 +541,6 @@ def _features(self, samples_data): # pylint: disable=unsubscriptable-object feats = np.vstack([feats, np.zeros(feats.shape[1])]) feat_diff.append(feats[1:] - feats[:-1]) + start = stop return np.vstack(feat_diff) From a8417bd402d147996ab5e34da1c698e8e141c253 Mon Sep 17 00:00:00 2001 From: "K.R. 
Zentner" <41180126+krzentner@users.noreply.github.com> Date: Wed, 21 Oct 2020 16:49:27 -0700 Subject: [PATCH 13/23] Make mujoco test run a few minutes faster (#2143) --- examples/torch/maml_trpo_metaworld_ml1_push.py | 4 +++- tests/garage/torch/algos/test_pearl.py | 1 + tests/integration_tests/test_examples.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/torch/maml_trpo_metaworld_ml1_push.py b/examples/torch/maml_trpo_metaworld_ml1_push.py index c059dbb882..93de39f2d3 100755 --- a/examples/torch/maml_trpo_metaworld_ml1_push.py +++ b/examples/torch/maml_trpo_metaworld_ml1_push.py @@ -61,7 +61,9 @@ def maml_trpo_metaworld_ml1_push(ctxt, seed, epochs, rollouts_per_task, hidden_nonlinearity=torch.tanh, output_nonlinearity=None) - meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler) + meta_evaluator = MetaEvaluator(test_task_sampler=test_sampler, + n_test_tasks=1, + n_exploration_eps=rollouts_per_task) trainer = Trainer(ctxt) algo = MAMLTRPO(env=env, diff --git a/tests/garage/torch/algos/test_pearl.py b/tests/garage/torch/algos/test_pearl.py index 81343befa1..8265150c7e 100644 --- a/tests/garage/torch/algos/test_pearl.py +++ b/tests/garage/torch/algos/test_pearl.py @@ -36,6 +36,7 @@ class TestPEARL: """Test class for PEARL.""" + @pytest.mark.skip @pytest.mark.large def test_pearl_ml1_push(self): """Test PEARL with ML1 Push environment.""" diff --git a/tests/integration_tests/test_examples.py b/tests/integration_tests/test_examples.py index 114da72c00..c58eea6458 100644 --- a/tests/integration_tests/test_examples.py +++ b/tests/integration_tests/test_examples.py @@ -309,7 +309,7 @@ def test_maml_trpo_metaworld_ml1_push(): """Test maml_trpo_ml1_push.py.""" assert subprocess.run([ EXAMPLES_ROOT_DIR / 'torch/maml_trpo_metaworld_ml1_push.py', - '--epochs', '1', '--meta_batch_size', '1' + '--epochs', '1', '--meta_batch_size', '1', '--rollouts_per_task', '1' ], check=False).returncode == 0 From f8a2d53061af21ba759680b3adc59581af3b866d Mon Sep 17 00:00:00 2001 From: mishari <44849486+maliesa96@users.noreply.github.com> Date: Wed, 21 Oct 2020 17:58:44 -0700 Subject: [PATCH 14/23] Add torch DQN (#2076) This also adds several smaller features: - torch/examples/watch_atari.py: use a trained agent to play atari. - Error handling in the snapshotter for invalid arguments. - torch/examples/dqn_atari.py: train on atari environments. 
--- .../src/garage_benchmarks/parameters.py | 4 + examples/sim_policy.py | 4 +- examples/torch/dqn_atari.py | 194 ++++++++++++ examples/torch/dqn_cartpole.py | 68 +++++ examples/torch/watch_atari.py | 67 ++++ src/garage/_functions.py | 10 +- src/garage/envs/wrappers/fire_reset.py | 26 +- src/garage/envs/wrappers/max_and_skip.py | 26 +- src/garage/envs/wrappers/stack_frames.py | 56 +++- src/garage/experiment/snapshotter.py | 24 +- src/garage/plotter/plotter.py | 12 +- src/garage/tf/plotter/plotter.py | 9 +- src/garage/torch/algos/__init__.py | 5 +- src/garage/torch/algos/dqn.py | 289 ++++++++++++++++++ .../policies/discrete_qf_argmax_policy.py | 3 +- .../q_functions/discrete_cnn_q_function.py | 27 +- src/garage/trainer.py | 2 +- tests/garage/envs/wrappers/test_fire_reset.py | 7 +- .../envs/wrappers/test_stack_frames_env.py | 22 ++ tests/garage/experiment/test_snapshotter.py | 36 ++- tests/garage/torch/algos/test_dqn.py | 134 ++++++++ .../test_discrete_qf_argmax_policy.py | 8 +- .../test_discrete_cnn_q_function.py | 2 - tests/integration_tests/test_examples.py | 21 ++ 24 files changed, 990 insertions(+), 66 deletions(-) create mode 100755 examples/torch/dqn_atari.py create mode 100755 examples/torch/dqn_cartpole.py create mode 100755 examples/torch/watch_atari.py create mode 100644 src/garage/torch/algos/dqn.py create mode 100644 tests/garage/torch/algos/test_dqn.py diff --git a/benchmarks/src/garage_benchmarks/parameters.py b/benchmarks/src/garage_benchmarks/parameters.py index cae5c58305..10ec5cccbb 100644 --- a/benchmarks/src/garage_benchmarks/parameters.py +++ b/benchmarks/src/garage_benchmarks/parameters.py @@ -9,6 +9,10 @@ task['env_id'] for task in benchmarks.get_benchmark('Mujoco1M')['tasks'] ] +Atari10M_ENV_SET = [ + task['env_id'] for task in benchmarks.get_benchmark('Atari10M')['tasks'] +] + PIXEL_ENV_SET = ['CubeCrash-v0', 'MemorizeDigits-v0'] STATE_ENV_SET = [ diff --git a/examples/sim_policy.py b/examples/sim_policy.py index fa06f7da13..4d5a1228be 100755 --- a/examples/sim_policy.py +++ b/examples/sim_policy.py @@ -53,7 +53,6 @@ def query_yes_no(question, default='yes'): type=int, default=1000, help='Max length of episode') - parser.add_argument('--speedup', type=float, default=1, help='Speedup') args = parser.parse_args() # If the snapshot file use tensorflow, do: @@ -68,7 +67,6 @@ def query_yes_no(question, default='yes'): path = rollout(env, policy, max_episode_length=args.max_episode_length, - animated=True, - speedup=args.speedup) + animated=True) if not query_yes_no('Continue simulation?'): break diff --git a/examples/torch/dqn_atari.py b/examples/torch/dqn_atari.py new file mode 100755 index 0000000000..e6d11fe11a --- /dev/null +++ b/examples/torch/dqn_atari.py @@ -0,0 +1,194 @@ +#!/usr/bin/env python3 +"""An example to train a task with DQN algorithm. + +Here it creates a gym environment CartPole, and trains a DQN with 50k steps. 
+""" +import click +import gym +import numpy as np +import psutil +import torch + +from garage import wrap_experiment +from garage.envs import GymEnv +from garage.envs.wrappers.clip_reward import ClipReward +from garage.envs.wrappers.episodic_life import EpisodicLife +from garage.envs.wrappers.fire_reset import FireReset +from garage.envs.wrappers.grayscale import Grayscale +from garage.envs.wrappers.max_and_skip import MaxAndSkip +from garage.envs.wrappers.noop import Noop +from garage.envs.wrappers.resize import Resize +from garage.envs.wrappers.stack_frames import StackFrames +from garage.experiment.deterministic import set_seed +from garage.np.exploration_policies import EpsilonGreedyPolicy +from garage.replay_buffer import PathBuffer +from garage.sampler import FragmentWorker, LocalSampler +from garage.torch import set_gpu_mode +from garage.torch.algos import DQN +from garage.torch.policies import DiscreteQFArgmaxPolicy +from garage.torch.q_functions import DiscreteCNNQFunction +from garage.trainer import Trainer + +hyperparams = dict(n_epochs=500, + steps_per_epoch=20, + sampler_batch_size=500, + lr=1e-4, + discount=0.99, + min_buffer_size=int(1e4), + n_train_steps=125, + target_update_freq=2, + buffer_batch_size=32, + max_epsilon=1.0, + min_epsilon=0.01, + decay_ratio=0.1, + buffer_size=int(1e4), + hidden_sizes=(512, ), + hidden_channels=(32, 64, 64), + kernel_sizes=(8, 4, 3), + strides=(4, 2, 1), + clip_gradient=10) + + +@click.command() +@click.argument('env', type=str) +@click.option('--seed', default=24) +@click.option('--n', type=int, default=psutil.cpu_count(logical=False)) +@click.option('--buffer_size', type=int, default=None) +@click.option('--max_episode_length', type=int, default=None) +def main(env=None, + seed=24, + n=psutil.cpu_count(logical=False), + buffer_size=None, + max_episode_length=None): + """Wrapper to setup the logging directory. + + Args: + env (str): Name of the atari environment, can either be the prefix + or the full name. For example, this can either be 'Pong' or + 'PongNoFrameskip-v4'. If the former is used, the env used will be + `env` + 'NoFrameskip-v4'. + seed (int): Seed to use for the RNG. + n (int): Number of workers to use. Defaults to the number of CPU cores + available. + buffer_size (int): size of the replay buffer in transitions. If None, + defaults to hyperparams['buffer_size']. This is used by the + integration tests. + max_episode_length (int): Max length of an episode. If None, defaults + to the timelimit specific to the environment. Used by integration + tests. + """ + if '-v' not in env: + env += 'NoFrameskip-v4' + logdir = 'data/local/experiment/' + env + + if buffer_size is not None: + hyperparams['buffer_size'] = buffer_size + + dqn_atari(dict(log_dir=logdir), + env=env, + seed=seed, + n_workers=n, + max_episode_length=max_episode_length, + **hyperparams) + + +# pylint: disable=unused-argument +@wrap_experiment(snapshot_mode='gap_overwrite', snapshot_gap=30) +def dqn_atari(ctxt=None, + env=None, + seed=24, + n_workers=psutil.cpu_count(logical=False), + max_episode_length=None, + **kwargs): + """Train DQN with PongNoFrameskip-v4 environment. + + Args: + ctxt (garage.experiment.ExperimentContext): The experiment + configuration used by Trainer to create the snapshotter. + env (str): Name of the atari environment, eg. 'PongNoFrameskip-v4'. + seed (int): Used to seed the random number generator to produce + determinism. + n_workers (int): Number of workers to use. Defaults to the number of + CPU cores available. 
+ max_episode_length (int): Max length of an episode. If None, defaults + to the timelimit specific to the environment. Used by integration + tests. + kwargs (dict): hyperparameters to be saved to variant.json. + + """ + assert n_workers > 0 + assert env is not None + env = gym.make(env) + env = Noop(env, noop_max=30) + env = MaxAndSkip(env, skip=4) + env = EpisodicLife(env) + if 'FIRE' in env.unwrapped.get_action_meanings(): + env = FireReset(env) + env = Grayscale(env) + env = Resize(env, 84, 84) + env = ClipReward(env) + env = StackFrames(env, 4, axis=0) + env = GymEnv(env, max_episode_length=max_episode_length) + set_seed(seed) + trainer = Trainer(ctxt) + + env.spec.observation_space = env.observation_space + env.spec.action_space = env.action_space + + n_epochs = hyperparams['n_epochs'] + steps_per_epoch = hyperparams['steps_per_epoch'] + sampler_batch_size = hyperparams['sampler_batch_size'] + num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size + replay_buffer = PathBuffer( + capacity_in_transitions=hyperparams['buffer_size']) + + qf = DiscreteCNNQFunction( + env_spec=env.spec, + hidden_channels=hyperparams['hidden_channels'], + kernel_sizes=hyperparams['kernel_sizes'], + strides=hyperparams['strides'], + hidden_w_init=( + lambda x: torch.nn.init.orthogonal_(x, gain=np.sqrt(2))), + hidden_sizes=hyperparams['hidden_sizes'], + is_image=True) + + policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf) + exploration_policy = EpsilonGreedyPolicy( + env_spec=env.spec, + policy=policy, + total_timesteps=num_timesteps, + max_epsilon=hyperparams['max_epsilon'], + min_epsilon=hyperparams['min_epsilon'], + decay_ratio=hyperparams['decay_ratio']) + + algo = DQN(env_spec=env.spec, + policy=policy, + qf=qf, + exploration_policy=exploration_policy, + replay_buffer=replay_buffer, + steps_per_epoch=steps_per_epoch, + qf_lr=hyperparams['lr'], + clip_gradient=hyperparams['clip_gradient'], + discount=hyperparams['discount'], + min_buffer_size=hyperparams['min_buffer_size'], + n_train_steps=hyperparams['n_train_steps'], + target_update_freq=hyperparams['target_update_freq'], + buffer_batch_size=hyperparams['buffer_batch_size']) + + set_gpu_mode(False) + torch.set_num_threads(1) + if torch.cuda.is_available(): + set_gpu_mode(True) + algo.to() + + trainer.setup(algo, + env, + sampler_cls=LocalSampler, + worker_class=FragmentWorker, + n_workers=n_workers) + + trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size) + env.close() + + +main() diff --git a/examples/torch/dqn_cartpole.py b/examples/torch/dqn_cartpole.py new file mode 100755 index 0000000000..0a54e537b2 --- /dev/null +++ b/examples/torch/dqn_cartpole.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 +"""An example to train a task with DQN algorithm. + +Here it creates a gym environment CartPole, and trains a DQN with 50k steps. +""" +import click + +from garage import wrap_experiment +from garage.envs import GymEnv +from garage.experiment.deterministic import set_seed +from garage.np.exploration_policies import EpsilonGreedyPolicy +from garage.replay_buffer import PathBuffer +from garage.sampler import LocalSampler +from garage.torch.algos import DQN +from garage.torch.policies import DiscreteQFArgmaxPolicy +from garage.torch.q_functions import DiscreteMLPQFunction +from garage.trainer import Trainer + + +@click.command() +@click.option('--seed', default=24) +@wrap_experiment(snapshot_mode='none') +def dqn_cartpole(ctxt=None, seed=24): + """Train DQN with CartPole-v0 environment. 
+ + Args: + ctxt (garage.experiment.ExperimentContext): The experiment + configuration used by LocalRunner to create the snapshotter. + seed (int): Used to seed the random number generator to produce + determinism. + """ + set_seed(seed) + runner = Trainer(ctxt) + + n_epochs = 100 + steps_per_epoch = 10 + sampler_batch_size = 512 + num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size + env = GymEnv('CartPole-v0') + replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) + qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5)) + policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf) + exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec, + policy=policy, + total_timesteps=num_timesteps, + max_epsilon=1.0, + min_epsilon=0.01, + decay_ratio=0.4) + algo = DQN(env_spec=env.spec, + policy=policy, + qf=qf, + exploration_policy=exploration_policy, + replay_buffer=replay_buffer, + steps_per_epoch=steps_per_epoch, + qf_lr=5e-5, + discount=0.9, + min_buffer_size=int(1e4), + n_train_steps=500, + target_update_freq=30, + buffer_batch_size=64) + + runner.setup(algo, env, sampler_cls=LocalSampler) + runner.train(n_epochs=n_epochs, batch_size=sampler_batch_size) + + env.close() + + +dqn_cartpole() diff --git a/examples/torch/watch_atari.py b/examples/torch/watch_atari.py new file mode 100755 index 0000000000..b780d332c8 --- /dev/null +++ b/examples/torch/watch_atari.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +"""Utility to watch a trained agent play an Atari game.""" + +import click +import gym +import numpy as np + +from garage import rollout +from garage.envs import GymEnv +from garage.envs.wrappers.clip_reward import ClipReward +from garage.envs.wrappers.episodic_life import EpisodicLife +from garage.envs.wrappers.fire_reset import FireReset +from garage.envs.wrappers.grayscale import Grayscale +from garage.envs.wrappers.max_and_skip import MaxAndSkip +from garage.envs.wrappers.noop import Noop +from garage.envs.wrappers.resize import Resize +from garage.envs.wrappers.stack_frames import StackFrames +from garage.experiment import Snapshotter + + +# pylint: disable=no-value-for-parameter, protected-access +@click.command() +@click.argument('saved_dir', type=str) +@click.option('--env', type=str, default=None) +@click.option('--num_episodes', type=int, default=10) +def watch_atari(saved_dir, env=None, num_episodes=10): + """Watch a trained agent play an atari game. + + Args: + saved_dir (str): Directory containing the pickle file. + env (str): Environment to run episodes on. If None, the pickled + environment is used. + num_episodes (int): Number of episodes to play. Note that when using + the EpisodicLife wrapper, an episode is considered done when a + life is lost. Defaults to 10. 
+ """ + snapshotter = Snapshotter() + data = snapshotter.load(saved_dir) + if env is not None: + env = gym.make(env) + env = Noop(env, noop_max=30) + env = MaxAndSkip(env, skip=4) + env = EpisodicLife(env) + if 'FIRE' in env.unwrapped.get_action_meanings(): + env = FireReset(env) + env = Grayscale(env) + env = Resize(env, 84, 84) + env = ClipReward(env) + env = StackFrames(env, 4, axis=0) + env = GymEnv(env) + else: + env = data['env'] + + exploration_policy = data['algo'].exploration_policy + exploration_policy.policy._qf.to('cpu') + ep_rewards = np.asarray([]) + for _ in range(num_episodes): + episode_data = rollout(env, + exploration_policy.policy, + animated=True, + pause_per_frame=0.02) + ep_rewards = np.append(ep_rewards, np.sum(episode_data['rewards'])) + + print('Average Reward {}'.format(np.mean(ep_rewards))) + + +watch_atari() diff --git a/src/garage/_functions.py b/src/garage/_functions.py index a08effe367..19eae59767 100644 --- a/src/garage/_functions.py +++ b/src/garage/_functions.py @@ -1,5 +1,6 @@ """Functions exposed directly in the garage namespace.""" from collections import defaultdict +import time from dowel import tabular import numpy as np @@ -68,7 +69,7 @@ def rollout(env, *, max_episode_length=np.inf, animated=False, - speedup=1, + pause_per_frame=None, deterministic=False): """Sample a single episode of the agent in the environment. @@ -78,8 +79,8 @@ def rollout(env, max_episode_length (int): If the episode reaches this many timesteps, it is truncated. animated (bool): If true, render the environment after each step. - speedup (float): Factor by which to decrease the wait time between - rendered steps. Only relevant, if animated == true. + pause_per_frame (float): Time to sleep between steps. Only relevant if + animated == true. deterministic (bool): If true, use the mean action returned by the stochastic policy instead of sampling from the returned action distribution. @@ -104,7 +105,6 @@ def rollout(env, * dones(np.array): Array of termination signals. """ - del speedup env_steps = [] agent_infos = [] observations = [] @@ -114,6 +114,8 @@ def rollout(env, if animated: env.visualize() while episode_length < (max_episode_length or np.inf): + if pause_per_frame is not None: + time.sleep(pause_per_frame) a, agent_info = agent.get_action(last_obs) if deterministic and 'mean' in agent_info: a = agent_info['mean'] diff --git a/src/garage/envs/wrappers/fire_reset.py b/src/garage/envs/wrappers/fire_reset.py index ab7f151d1c..cadf3d6cf5 100644 --- a/src/garage/envs/wrappers/fire_reset.py +++ b/src/garage/envs/wrappers/fire_reset.py @@ -19,13 +19,33 @@ def __init__(self, env): 'Only use fire reset wrapper for suitable environment!') def step(self, action): - """gym.Env step function.""" + """gym.Env step function. + + Args: + action (int): index of the action to take. + + Returns: + np.ndarray: Observation conforming to observation_space + float: Reward for this step + bool: Termination signal + dict: Extra information from the environment. + """ return self.env.step(action) def reset(self, **kwargs): - """gym.Env reset function.""" + """gym.Env reset function. + + Args: + kwargs (dict): extra arguments passed to gym.Env.reset() + + Returns: + np.ndarray: next observation. 
+ """ self.env.reset(**kwargs) obs, _, done, _ = self.env.step(1) if done: - obs = self.env.reset(**kwargs) + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset(**kwargs) return obs diff --git a/src/garage/envs/wrappers/max_and_skip.py b/src/garage/envs/wrappers/max_and_skip.py index 9545825ce0..51253fd617 100644 --- a/src/garage/envs/wrappers/max_and_skip.py +++ b/src/garage/envs/wrappers/max_and_skip.py @@ -14,8 +14,8 @@ class MaxAndSkip(gym.Wrapper): render their sprites every other game frame. Args: - env: The environment to be wrapped. - skip: The environment only returns `skip`-th frame. + env (gym.Env): The environment to be wrapped. + skip (int): The environment only returns `skip`-th frame. """ @@ -26,13 +26,22 @@ def __init__(self, env, skip=4): self._skip = skip def step(self, action): - """ - gym.Env step. + """Repeat action, sum reward, and max over last two observations. + + Args: + action (int): action to take in the atari environment. + + Returns: + np.ndarray: observation of shape :math:`(O*,)` representating + the max values over the last two oservations. + float: Reward for this step + bool: Termination signal + dict: Extra information from the environment. - Repeat action, sum reward, and max over last two observations. """ total_reward = 0.0 done = None + for i in range(self._skip): obs, reward, done, info = self.env.step(action) if i == self._skip - 2: @@ -45,6 +54,11 @@ def step(self, action): max_frame = self._obs_buffer.max(axis=0) return max_frame, total_reward, done, info + # pylint: disable=arguments-differ def reset(self): - """gym.Env reset.""" + """gym.Env reset. + + Returns: + np.ndarray: observaion of shape :math:`(O*,)`. + """ return self.env.reset() diff --git a/src/garage/envs/wrappers/stack_frames.py b/src/garage/envs/wrappers/stack_frames.py index 01edb7d639..5ed17ae0ee 100644 --- a/src/garage/envs/wrappers/stack_frames.py +++ b/src/garage/envs/wrappers/stack_frames.py @@ -13,16 +13,23 @@ class StackFrames(gym.Wrapper): Only works with gym.spaces.Box environment with 2D single channel frames. Args: - env: gym.Env to wrap. - n_frames: number of frames to stack. + env (gym.Env): gym.Env to wrap. + n_frames (int): number of frames to stack. + axis (int): Axis to stack frames on. This should be 2 for tensorflow + and 0 for pytorch. Raises: - ValueError: If observation space shape is not 2 or - environment is not gym.spaces.Box. + ValueError: If observation space shape is not 2 dimnesional, + if the environment is not gym.spaces.Box, or if the specified axis + is not 0 or 2. 
+ """ - def __init__(self, env, n_frames): + def __init__(self, env, n_frames, axis=2): + if axis not in (0, 2): + raise ValueError('Frame stacking axis should be 0 for pytorch or ' + '2 for tensorflow.') if not isinstance(env.observation_space, gym.spaces.Box): raise ValueError('Stack frames only works with gym.spaces.Box ' 'environment.') @@ -34,9 +41,13 @@ def __init__(self, env, n_frames): super().__init__(env) self._n_frames = n_frames + self._axis = axis self._frames = deque(maxlen=n_frames) new_obs_space_shape = env.observation_space.shape + (n_frames, ) + if axis == 0: + new_obs_space_shape = (n_frames, ) + env.observation_space.shape + _low = env.observation_space.low.flatten()[0] _high = env.observation_space.high.flatten()[0] self._observation_space = gym.spaces.Box( @@ -47,7 +58,7 @@ def __init__(self, env, n_frames): @property def observation_space(self): - """gym.Env observation space.""" + """gym.spaces.Box: gym.Env observation space.""" return self._observation_space @observation_space.setter @@ -55,19 +66,44 @@ def observation_space(self, observation_space): self._observation_space = observation_space def _stack_frames(self): - return np.stack(self._frames, axis=2) + """Stacks and returns the last n_frames. + + Returns: + np.ndarray: stacked observation with shape either + :math:`(N, n_frames, O*)` or :math:(N, O*, n_frames), + depending on the axis specified. + """ + return np.stack(self._frames, axis=self._axis) + # pylint: disable=arguments-differ def reset(self): - """gym.Env reset function.""" + """gym.Env reset function. + + Returns: + np.ndarray: Observation conforming to observation_space + float: Reward for this step + bool: Termination signal + dict: Extra information from the environment. + """ observation = self.env.reset() self._frames.clear() - for i in range(self._n_frames): + for _ in range(self._n_frames): self._frames.append(observation) return self._stack_frames() def step(self, action): - """gym.Env step function.""" + """gym.Env step function. + + Args: + action (int): index of the action to take. + + Returns: + np.ndarray: Observation conforming to observation_space + float: Reward for this step + bool: Termination signal + dict: Extra information from the environment. + """ new_observation, reward, done, info = self.env.step(action) self._frames.append(new_observation) diff --git a/src/garage/experiment/snapshotter.py b/src/garage/experiment/snapshotter.py index 4eef38c881..549569fbf7 100644 --- a/src/garage/experiment/snapshotter.py +++ b/src/garage/experiment/snapshotter.py @@ -21,7 +21,10 @@ class Snapshotter: snapshot_mode (str): Mode to save the snapshot. Can be either "all" (all iterations will be saved), "last" (only the last iteration will be saved), "gap" (every snapshot_gap iterations are saved), - or "none" (do not save snapshots). + "gap_and_last" (save the last iteration as 'params.pkl' and save + every snapshot_gap iteration separately), "gap_overwrite" (same as + gap but overwrites the last saved snapshot), or "none" (do not + save snapshots). snapshot_gap (int): Gap between snapshot iterations. Wait this number of iterations before taking another snapshot. @@ -36,6 +39,16 @@ def __init__(self, self._snapshot_mode = snapshot_mode self._snapshot_gap = snapshot_gap + if snapshot_mode == 'gap_overwrite' and snapshot_gap <= 1: + raise ValueError('snapshot_gap must be > 1 when using ' + 'snapshot_mode="gap_overwrite". 
Use ' + 'snapshot_mode="last" to snapshot after ' + 'every iteration.') + if snapshot_mode == 'last' and snapshot_gap != 1: + raise ValueError('snapshot_gap should be set to 1 if using ' + 'snapshot_mode="last". Did you mean to' + ' use snapshot_mode="gap"?') + pathlib.Path(snapshot_dir).mkdir(parents=True, exist_ok=True) @property @@ -53,7 +66,8 @@ def snapshot_mode(self): """Return the type of snapshot. Returns: - str: The type of snapshot. Can be "all", "last" or "gap" + str: The type of snapshot. Can be "all", "last", "gap", + "gap_overwrite", "gap_and_last", or "none". """ return self._snapshot_mode @@ -76,13 +90,17 @@ def save_itr_params(self, itr, params): params (obj): Content of snapshot to be saved. Raises: - ValueError: If snapshot_mode is not one of "all", "last" or "gap". + ValueError: If snapshot_mode is not one of "all", "last", "gap", + "gap_overwrite", "gap_and_last", or "none". """ file_name = None if self._snapshot_mode == 'all': file_name = os.path.join(self._snapshot_dir, 'itr_%d.pkl' % itr) + elif self._snapshot_mode == 'gap_overwrite': + if itr % self._snapshot_gap == 0: + file_name = os.path.join(self._snapshot_dir, 'params.pkl') elif self._snapshot_mode == 'last': # override previous params file_name = os.path.join(self._snapshot_dir, 'params.pkl') diff --git a/src/garage/plotter/plotter.py b/src/garage/plotter/plotter.py index fc2d05e30b..fe82d59144 100644 --- a/src/garage/plotter/plotter.py +++ b/src/garage/plotter/plotter.py @@ -72,15 +72,13 @@ def _worker_start(self): rollout(env, policy, max_episode_length=max_length, - animated=True, - speedup=5) + animated=True) else: if max_length: rollout(env, policy, max_episode_length=max_length, - animated=True, - speedup=5) + animated=True) except KeyboardInterrupt: pass @@ -139,11 +137,7 @@ def init_plot(self, env, policy): # Needed in order to draw glfw window on the main thread if 'Darwin' in platform.platform(): - rollout(env, - policy, - max_episode_length=np.inf, - animated=True, - speedup=5) + rollout(env, policy, max_episode_length=np.inf, animated=True) self._queue.put(Message(op=Op.UPDATE, args=(env, policy), kwargs=None)) diff --git a/src/garage/tf/plotter/plotter.py b/src/garage/tf/plotter/plotter.py index a5d5b87ee8..7083e1ca36 100644 --- a/src/garage/tf/plotter/plotter.py +++ b/src/garage/tf/plotter/plotter.py @@ -67,8 +67,7 @@ def __init__(self, self.rollout(self._env, self._policy, max_episode_length=np.inf, - animated=True, - speedup=5) + animated=True) def _start_worker(self): max_length = None @@ -106,16 +105,14 @@ def _start_worker(self): self.rollout(self._env, self._policy, max_episode_length=max_length, - animated=True, - speedup=5) + animated=True) self.queue.task_done() else: if max_length: self.rollout(self._env, self._policy, max_episode_length=max_length, - animated=True, - speedup=5) + animated=True) except KeyboardInterrupt: pass diff --git a/src/garage/torch/algos/__init__.py b/src/garage/torch/algos/__init__.py index d440ef9f46..c0c95ecab2 100644 --- a/src/garage/torch/algos/__init__.py +++ b/src/garage/torch/algos/__init__.py @@ -5,6 +5,7 @@ from garage.torch.algos.ddpg import DDPG # VPG has to be imported first because it is depended by PPO and TRPO. 
# PPO, TRPO, and VPG need to be imported before their MAML variants +from garage.torch.algos.dqn import DQN from garage.torch.algos.vpg import VPG from garage.torch.algos.maml_vpg import MAMLVPG from garage.torch.algos.ppo import PPO @@ -17,6 +18,6 @@ from garage.torch.algos.pearl import PEARL __all__ = [ - 'BC', 'DDPG', 'VPG', 'PPO', 'TRPO', 'MAMLPPO', 'MAMLTRPO', 'MAMLVPG', - 'MTSAC', 'PEARL', 'SAC' + 'BC', 'DDPG', 'DQN', 'VPG', 'PPO', 'TRPO', 'MAMLPPO', 'MAMLTRPO', + 'MAMLVPG', 'MTSAC', 'PEARL', 'SAC' ] diff --git a/src/garage/torch/algos/dqn.py b/src/garage/torch/algos/dqn.py new file mode 100644 index 0000000000..f6c5f06716 --- /dev/null +++ b/src/garage/torch/algos/dqn.py @@ -0,0 +1,289 @@ +"""This modules creates a DDPG model in PyTorch.""" +import collections +import copy + +from dowel import logger, tabular +import numpy as np +import torch +import torch.nn.functional as F + +from garage import _Default, log_performance, make_optimizer +from garage._functions import obtain_evaluation_episodes +from garage.np.algos import RLAlgorithm +from garage.sampler import FragmentWorker +from garage.torch import global_device, np_to_torch + + +class DQN(RLAlgorithm): + """DQN algorithm. See https://arxiv.org/pdf/1312.5602.pdf. + + DQN, also known as the Deep Q Network algorithm, is an off-policy algorithm + that learns action-value estimates for each state, action pair. The + policy then simply acts by taking the action that yields the highest Q(s,a) + value for a given state s. + + Args: + env_spec (EnvSpec): Environment specification. + policy (garage.torch.policies.Policy): Policy. For DQN, this is a + policy that performs the action that yields the highest Q value. + qf (nn.Module): Q-value network. + replay_buffer (ReplayBuffer): Replay buffer. + steps_per_epoch (int): Number of train_once calls per epoch. + n_train_steps (int): Training steps. + eval_env (Environment): Evaluation environment. If None, a copy of the + main environment is used for evaluation. + max_episode_length_eval (int or None): Maximum length of episodes used + for off-policy evaluation. If `None`, defaults to + `env_spec.max_episode_length`. + buffer_batch_size (int): Batch size of replay buffer. + min_buffer_size (int): The minimum buffer size for replay buffer. + exploration_policy (ExplorationPolicy): Exploration strategy, typically + epsilon-greedy. + num_eval_episodes (int): Nunber of evaluation episodes. Defaults to 10. + deterministic_eval (bool): Whether to evaluate the policy + deterministically (without exploration noise). False by default. + target_update_freq (int): Number of optimization steps between each + update to the target Q network. + discount(float): Discount factor for the cumulative return. + qf_optimizer (Union[type, tuple[type, dict]]): Type of optimizer + for training Q-value network. This can be an optimizer type such + as `torch.optim.Adam` or a tuple of type and dictionary, where + dictionary contains arguments to initialize the optimizer + e.g. `(torch.optim.Adam, {'lr' : 1e-3})`. + qf_lr (float): Learning rate for Q-value network parameters. + clip_rewards (float): Clip reward to be in [-clip_rewards, + clip_rewards]. If None, rewards are not clipped. + clip_gradient (float): Clip gradient norm to `clip_gradient`. If None, + gradient are not clipped. Defaults to 10. + reward_scale (float): Reward scale. 
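+
+    Example:
+        A minimal sketch of wiring DQN into a trainer; the surrounding
+        objects mirror ``tests/garage/torch/algos/test_dqn.py``, and
+        ``trainer`` is assumed to be a ``Trainer`` constructed elsewhere::
+
+            env = GymEnv('CartPole-v0')
+            replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))
+            qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5))
+            policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf)
+            exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec,
+                                                     policy=policy,
+                                                     total_timesteps=10000,
+                                                     max_epsilon=1.0,
+                                                     min_epsilon=0.01,
+                                                     decay_ratio=0.4)
+            algo = DQN(env.spec, policy, qf, replay_buffer,
+                       exploration_policy=exploration_policy,
+                       steps_per_epoch=10,
+                       n_train_steps=500)
+            trainer.setup(algo, env, sampler_cls=LocalSampler)
+            trainer.train(n_epochs=10, batch_size=512)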
+ """ + worker_cls = FragmentWorker + + def __init__( + self, + env_spec, + policy, + qf, + replay_buffer, + exploration_policy=None, + eval_env=None, + qf_optimizer=torch.optim.Adam, + *, # Everything after this is numbers. + steps_per_epoch=20, + n_train_steps=50, + max_episode_length_eval=None, + deterministic_eval=False, + buffer_batch_size=64, + min_buffer_size=int(1e4), + num_eval_episodes=10, + discount=0.99, + qf_lr=_Default(1e-3), + clip_rewards=None, + clip_gradient=10, + target_update_freq=5, + reward_scale=1.): + self._clip_reward = clip_rewards + self._clip_grad = clip_gradient + + self._steps_per_epoch = steps_per_epoch + self._target_update_freq = target_update_freq + self._episode_qf_losses = [] + self._epoch_ys = [] + self._epoch_qs = [] + + self._policy = policy + self._qf = qf + self._n_train_steps = n_train_steps + + self._min_buffer_size = min_buffer_size + self._qf = qf + self._steps_per_epoch = steps_per_epoch + self._n_train_steps = n_train_steps + self._buffer_batch_size = buffer_batch_size + self._discount = discount + self._reward_scale = reward_scale + self.max_episode_length = env_spec.max_episode_length + self._max_episode_length_eval = (max_episode_length_eval + or self.max_episode_length) + self._episode_reward_mean = collections.deque(maxlen=100) + self._num_eval_episodes = num_eval_episodes + self._deterministic_eval = deterministic_eval + + self.env_spec = env_spec + self.replay_buffer = replay_buffer + self.policy = policy + self.exploration_policy = exploration_policy + + self._target_qf = copy.deepcopy(self._qf) + self._qf_optimizer = make_optimizer(qf_optimizer, + module=self._qf, + lr=qf_lr) + self._eval_env = eval_env + + def train(self, trainer): + """Obtain samplers and start actual training for each epoch. + + Args: + trainer (Trainer): Experiment trainer. + + Returns: + float: The average return in last epoch cycle. + + """ + if not self._eval_env: + self._eval_env = trainer.get_env_copy() + last_returns = [float('nan')] + + if self._min_buffer_size > self.replay_buffer.n_transitions_stored: + num_warmup_steps = (self._min_buffer_size - + self.replay_buffer.n_transitions_stored) + self.replay_buffer.add_episode_batch( + trainer.obtain_episodes(0, num_warmup_steps)) + + trainer.enable_logging = True + + for _ in trainer.step_epochs(): + if (self.replay_buffer.n_transitions_stored >= + self._min_buffer_size): + logger.log('Evaluating policy') + + params_before = self.exploration_policy.get_param_values() + eval_eps = obtain_evaluation_episodes( + (self.exploration_policy + if not self._deterministic_eval else self.policy), + self._eval_env, + num_eps=self._num_eval_episodes, + max_episode_length=self._max_episode_length_eval) + self.exploration_policy.set_param_values(params_before) + + last_returns = log_performance(trainer.step_itr, + eval_eps, + discount=self._discount) + self._episode_reward_mean.extend(last_returns) + tabular.record('Evaluation/100EpRewardMean', + np.mean(self._episode_reward_mean)) + + for _ in range(self._steps_per_epoch): + trainer.step_path = trainer.obtain_episodes(trainer.step_itr) + if hasattr(self.exploration_policy, 'update'): + self.exploration_policy.update(trainer.step_path) + + self._train_once(trainer.step_itr, trainer.step_path) + trainer.step_itr += 1 + + return np.mean(last_returns) + + def _train_once(self, itr, episodes): + """Perform one iteration of training. + + Args: + itr (int): Iteration number. + episodes (EpisodeBatch): Batch of episodes. 
+ + """ + self.replay_buffer.add_episode_batch(episodes) + + epoch = itr / self._steps_per_epoch + + for _ in range(self._n_train_steps): + if (self.replay_buffer.n_transitions_stored >= + self._min_buffer_size): + timesteps = self.replay_buffer.sample_timesteps( + self._buffer_batch_size) + qf_loss, y, q = tuple(v.cpu().numpy() + for v in self._optimize_qf(timesteps)) + + self._episode_qf_losses.append(qf_loss) + self._epoch_ys.append(y) + self._epoch_qs.append(q) + + if itr % self._steps_per_epoch == 0: + self._log_eval_results(epoch) + + if itr % self._target_update_freq == 0: + self._target_qf = copy.deepcopy(self._qf) + + def _log_eval_results(self, epoch): + """Log evaluation results after an epoch. + + Args: + epoch (int): Current epoch. + """ + logger.log('Training finished') + + if self.replay_buffer.n_transitions_stored >= self._min_buffer_size: + tabular.record('Epoch', epoch) + tabular.record('QFunction/AverageQFunctionLoss', + np.mean(self._episode_qf_losses)) + tabular.record('QFunction/AverageQ', np.mean(self._epoch_qs)) + tabular.record('QFunction/MaxQ', np.max(self._epoch_qs)) + tabular.record('QFunction/AverageAbsQ', + np.mean(np.abs(self._epoch_qs))) + tabular.record('QFunction/AverageY', np.mean(self._epoch_ys)) + tabular.record('QFunction/MaxY', np.max(self._epoch_ys)) + tabular.record('QFunction/AverageAbsY', + np.mean(np.abs(self._epoch_ys))) + + def _optimize_qf(self, timesteps): + """Perform algorithm optimizing. + + Args: + timesteps (TimeStepBatch): Processed batch data. + + Returns: + qval_loss: Loss of Q-value predicted by the Q-network. + ys: y_s. + qval: Q-value predicted by the Q-network. + + """ + observations = np_to_torch(timesteps.observations) + rewards = np_to_torch(timesteps.rewards).reshape(-1, 1) + rewards *= self._reward_scale + actions = np_to_torch(timesteps.actions) + next_observations = np_to_torch(timesteps.next_observations) + terminals = np_to_torch(timesteps.terminals).reshape(-1, 1) + + next_inputs = next_observations + inputs = observations + with torch.no_grad(): + # discrete, outputs Qs for all possible actions + target_qvals = self._target_qf(next_inputs) + best_qvals, _ = torch.max(target_qvals, 1) + best_qvals = best_qvals.unsqueeze(1) + + rewards_clipped = rewards + if self._clip_reward is not None: + rewards_clipped = torch.clamp(rewards, -1 * self._clip_reward, + self._clip_reward) + y_target = (rewards_clipped + + (1.0 - terminals) * self._discount * best_qvals) + y_target = y_target.squeeze(1) + + # optimize qf + qvals = self._qf(inputs) + selected_qs = torch.sum(qvals * actions, axis=1) + qval_loss = F.smooth_l1_loss(selected_qs, y_target) + + self._qf_optimizer.zero_grad() + qval_loss.backward() + + # optionally clip the gradients + if self._clip_grad is not None: + torch.nn.utils.clip_grad_norm_(self.policy.parameters(), + self._clip_grad) + self._qf_optimizer.step() + + return (qval_loss.detach(), y_target, selected_qs.detach()) + + def to(self, device=None): + """Put all the networks within the model on device. + + Args: + device (str): ID of GPU or CPU. 
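+                If None, the device returned by ``global_device()`` is
+                used.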
+ + """ + if device is None: + device = global_device() + logger.log('Using device: ' + str(device)) + self._qf = self._qf.to(device) + self._target_qf = self._target_qf.to(device) diff --git a/src/garage/torch/policies/discrete_qf_argmax_policy.py b/src/garage/torch/policies/discrete_qf_argmax_policy.py index 4ed39c53b4..9cf8c2e625 100644 --- a/src/garage/torch/policies/discrete_qf_argmax_policy.py +++ b/src/garage/torch/policies/discrete_qf_argmax_policy.py @@ -5,6 +5,7 @@ import numpy as np import torch +from garage.torch import np_to_torch from garage.torch.policies.policy import Policy @@ -65,4 +66,4 @@ def get_actions(self, observations): dict: Empty since this policy does not produce a distribution. """ with torch.no_grad(): - return self(torch.Tensor(observations)).numpy(), dict() + return self(np_to_torch(observations)).cpu().numpy(), dict() diff --git a/src/garage/torch/q_functions/discrete_cnn_q_function.py b/src/garage/torch/q_functions/discrete_cnn_q_function.py index 6b27d77a98..4550ef52da 100644 --- a/src/garage/torch/q_functions/discrete_cnn_q_function.py +++ b/src/garage/torch/q_functions/discrete_cnn_q_function.py @@ -27,7 +27,6 @@ class DiscreteCNNQFunction(DiscreteCNNModule): For example, (3, 32) means there are two convolutional layers. The filter for the first conv layer outputs 3 channels and the second one outputs 32 channels. - minibatch_size (int): Size of the optimization minibatch. hidden_sizes (list[int]): Output dimension of dense layer(s) for the MLP for mean. For example, (32, 32) means the MLP consists of two hidden layers, each with 32 hidden units. @@ -71,10 +70,9 @@ def __init__(self, kernel_sizes, hidden_channels, strides, - minibatch_size, hidden_sizes=(32, 32), - cnn_hidden_nonlinearity=torch.relu, - mlp_hidden_nonlinearity=torch.relu, + cnn_hidden_nonlinearity=torch.nn.ReLU, + mlp_hidden_nonlinearity=torch.nn.ReLU, hidden_w_init=nn.init.xavier_uniform_, hidden_b_init=nn.init.zeros_, paddings=0, @@ -88,7 +86,8 @@ def __init__(self, layer_normalization=False, is_image=True): - input_shape = (minibatch_size, ) + env_spec.observation_space.shape + self._env_spec = env_spec + input_shape = (1, ) + env_spec.observation_space.shape output_dim = env_spec.action_space.flat_dim super().__init__(input_shape=input_shape, output_dim=output_dim, @@ -110,3 +109,21 @@ def __init__(self, output_b_init=output_b_init, layer_normalization=layer_normalization, is_image=is_image) + + # pylint: disable=arguments-differ + def forward(self, observations): + """Return Q-value(s). + + Args: + observations (np.ndarray): observations of shape :math: `(N, O*)`. 
+ + Returns: + torch.Tensor: Output value + """ + if observations.shape != self._env_spec.observation_space.shape: + # avoid using observation_space.unflatten_n + # to support tensors on GPUs + obs_shape = ((len(observations), ) + + self._env_spec.observation_space.shape) + observations = observations.reshape(obs_shape) + return super().forward(observations) diff --git a/src/garage/trainer.py b/src/garage/trainer.py index 16f0a3ec8d..e7b0daf94c 100644 --- a/src/garage/trainer.py +++ b/src/garage/trainer.py @@ -516,6 +516,7 @@ def train(self, start_epoch=0) self._plot = plot + self._start_worker() average_return = self._algo.train(self) self._shutdown_worker() @@ -543,7 +544,6 @@ def step_epochs(self): trainer.step_itr += 1 """ - self._start_worker() self._start_time = time.time() self.step_itr = self._stats.total_itr self.step_episode = None diff --git a/tests/garage/envs/wrappers/test_fire_reset.py b/tests/garage/envs/wrappers/test_fire_reset.py index dde1f5597e..efbe2b2d80 100644 --- a/tests/garage/envs/wrappers/test_fire_reset.py +++ b/tests/garage/envs/wrappers/test_fire_reset.py @@ -8,15 +8,16 @@ class TestFireReset: def test_fire_reset(self): - env = DummyDiscretePixelEnv() + env = DummyDiscretePixelEnv(random=False) env_wrap = FireReset(env) obs = env.reset() obs_wrap = env_wrap.reset() assert np.array_equal(obs, np.ones(env.observation_space.shape)) assert np.array_equal(obs_wrap, np.full(env.observation_space.shape, - 2)) + 3)) env_wrap.step(2) obs_wrap = env_wrap.reset() # env will call reset again, after fire - assert np.array_equal(obs_wrap, np.ones(env.observation_space.shape)) + assert np.array_equal(obs_wrap, np.full(env.observation_space.shape, + 3)) diff --git a/tests/garage/envs/wrappers/test_stack_frames_env.py b/tests/garage/envs/wrappers/test_stack_frames_env.py index f5e4745623..e60e905391 100644 --- a/tests/garage/envs/wrappers/test_stack_frames_env.py +++ b/tests/garage/envs/wrappers/test_stack_frames_env.py @@ -34,6 +34,7 @@ def test_stack_frames_invalid_environment_shape(self): StackFrames(self.env, n_frames=4) def test_stack_frames_output_observation_space(self): + print(self.env_s.observation_space.shape) assert self.env_s.observation_space.shape == (self.width, self.height, self.n_frames) @@ -56,3 +57,24 @@ def test_stack_frames_for_step(self): obs_stack, _, _, _ = self.env_s.step(1) np.testing.assert_array_equal(obs_stack, frame_stack) + + def test_stack_frames_axis(self): + env = StackFrames(DummyDiscrete2DEnv(random=False), + n_frames=self.n_frames, + axis=0) + env.reset() + obs, _, _, _ = env.step(1) + assert obs.shape[0] == self.n_frames + + env = StackFrames(DummyDiscrete2DEnv(random=False), + n_frames=self.n_frames, + axis=2) + env.reset() + obs, _, _, _ = env.step(1) + assert obs.shape[2] == self.n_frames + + def test_invalid_axis_raises_error(self): + with pytest.raises(ValueError): + StackFrames(DummyDiscrete2DEnv(random=False), + n_frames=self.n_frames, + axis=5) diff --git a/tests/garage/experiment/test_snapshotter.py b/tests/garage/experiment/test_snapshotter.py index 4081395a3e..358d5887d1 100644 --- a/tests/garage/experiment/test_snapshotter.py +++ b/tests/garage/experiment/test_snapshotter.py @@ -20,6 +20,7 @@ class TestSnapshotter: + def setup_method(self): self.temp_dir = tempfile.TemporaryDirectory() @@ -28,11 +29,11 @@ def teardown_method(self): @pytest.mark.parametrize('mode, files', [*configurations]) def test_snapshotter(self, mode, files): - snapshotter = Snapshotter(self.temp_dir.name, mode, 2) + snapshotter = 
Snapshotter(self.temp_dir.name, mode, 1) assert snapshotter.snapshot_dir == self.temp_dir.name assert snapshotter.snapshot_mode == mode - assert snapshotter.snapshot_gap == 2 + assert snapshotter.snapshot_gap == 1 snapshot_data = [{'testparam': 1}, {'testparam': 4}] snapshotter.save_itr_params(1, snapshot_data[0]) @@ -45,8 +46,35 @@ def test_snapshotter(self, mode, files): data = pickle.load(pkl_file) assert data == snapshot_data[num] + def test_gap_overwrite(self): + snapshotter = Snapshotter(self.temp_dir.name, 'gap_overwrite', 2) + assert snapshotter.snapshot_dir == self.temp_dir.name + assert snapshotter.snapshot_mode == 'gap_overwrite' + assert snapshotter.snapshot_gap == 2 + + snapshot_data = [{'testparam': 1}, {'testparam': 4}] + snapshotter.save_itr_params(1, snapshot_data[0]) + snapshotter.save_itr_params(2, snapshot_data[1]) + + filename = osp.join(self.temp_dir.name, 'params.pkl') + assert osp.exists(filename) + with open(filename, 'rb') as pkl_file: + data = pickle.load(pkl_file) + assert data == snapshot_data[1] + def test_invalid_snapshot_mode(self): with pytest.raises(ValueError): - snapshotter = Snapshotter( - snapshot_dir=self.temp_dir.name, snapshot_mode='invalid') + snapshotter = Snapshotter(snapshot_dir=self.temp_dir.name, + snapshot_mode='invalid') snapshotter.save_itr_params(2, {'testparam': 'invalid'}) + + def test_conflicting_params(self): + with pytest.raises(ValueError): + Snapshotter(snapshot_dir=self.temp_dir.name, + snapshot_mode='last', + snapshot_gap=2) + + with pytest.raises(ValueError): + Snapshotter(snapshot_dir=self.temp_dir.name, + snapshot_mode='gap_overwrite', + snapshot_gap=1) diff --git a/tests/garage/torch/algos/test_dqn.py b/tests/garage/torch/algos/test_dqn.py new file mode 100644 index 0000000000..ea3c157d7a --- /dev/null +++ b/tests/garage/torch/algos/test_dqn.py @@ -0,0 +1,134 @@ +"""Test DQN performance on cartpole.""" +import copy +import tempfile +from unittest.mock import MagicMock + +import pytest +import torch +from torch.nn import functional as F # NOQA + +from garage.envs import GymEnv +from garage.experiment import SnapshotConfig +from garage.experiment.deterministic import set_seed +from garage.np.exploration_policies import EpsilonGreedyPolicy +from garage.replay_buffer import PathBuffer +from garage.sampler import LocalSampler +from garage.torch import np_to_torch +from garage.torch.algos import DQN +from garage.torch.policies import DiscreteQFArgmaxPolicy +from garage.torch.q_functions import DiscreteMLPQFunction +from garage.trainer import Trainer + +from tests.fixtures import snapshot_config + + +@pytest.fixture +def setup(): + set_seed(24) + n_epochs = 11 + steps_per_epoch = 10 + sampler_batch_size = 512 + num_timesteps = 100 * steps_per_epoch * sampler_batch_size + + env = GymEnv('CartPole-v0') + + replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) + + qf = DiscreteMLPQFunction(env_spec=env.spec, hidden_sizes=(8, 5)) + + policy = DiscreteQFArgmaxPolicy(env_spec=env.spec, qf=qf) + exploration_policy = EpsilonGreedyPolicy(env_spec=env.spec, + policy=policy, + total_timesteps=num_timesteps, + max_epsilon=1.0, + min_epsilon=0.01, + decay_ratio=0.4) + algo = DQN(env_spec=env.spec, + policy=policy, + qf=qf, + exploration_policy=exploration_policy, + replay_buffer=replay_buffer, + steps_per_epoch=steps_per_epoch, + qf_lr=5e-5, + discount=0.9, + min_buffer_size=int(1e4), + n_train_steps=500, + target_update_freq=30, + buffer_batch_size=64) + + return algo, env, replay_buffer, n_epochs, sampler_batch_size + + 
+@pytest.mark.large +def test_dqn_cartpole(setup): + tempdir = tempfile.TemporaryDirectory() + config = SnapshotConfig(snapshot_dir=tempdir.name, + snapshot_mode='last', + snapshot_gap=1) + + trainer = Trainer(config) + algo, env, _, n_epochs, batch_size = setup + trainer.setup(algo, env, sampler_cls=LocalSampler) + last_avg_return = trainer.train(n_epochs=n_epochs, batch_size=batch_size) + assert last_avg_return > 10 + env.close() + + # test resume from snapshot + trainer.restore(tempdir.name) + trainer.resume(n_epochs=1, batch_size=batch_size) + + +def test_dqn_loss(setup): + algo, env, buff, _, batch_size = setup + + trainer = Trainer(snapshot_config) + trainer.setup(algo, env, sampler_cls=LocalSampler) + + paths = trainer.obtain_episodes(0, batch_size=batch_size) + buff.add_episode_batch(paths) + timesteps = buff.sample_timesteps(algo._buffer_batch_size) + timesteps_copy = copy.deepcopy(timesteps) + + observations = np_to_torch(timesteps.observations) + rewards = np_to_torch(timesteps.rewards).reshape(-1, 1) + actions = np_to_torch(timesteps.actions) + next_observations = np_to_torch(timesteps.next_observations) + terminals = np_to_torch(timesteps.terminals).reshape(-1, 1) + + next_inputs = next_observations + inputs = observations + with torch.no_grad(): + target_qvals = algo._target_qf(next_inputs) + best_qvals, _ = torch.max(target_qvals, 1) + best_qvals = best_qvals.unsqueeze(1) + + rewards_clipped = rewards + y_target = (rewards_clipped + + (1.0 - terminals) * algo._discount * best_qvals) + y_target = y_target.squeeze(1) + + # optimize qf + qvals = algo._qf(inputs) + selected_qs = torch.sum(qvals * actions, axis=1) + qval_loss = F.smooth_l1_loss(selected_qs, y_target) + + algo_loss, algo_targets, algo_selected_qs = algo._optimize_qf( + timesteps_copy) + env.close() + + assert (qval_loss.detach() == algo_loss).all() + assert (y_target == algo_targets).all() + assert (selected_qs == algo_selected_qs).all() + + +def test_to_device(setup): + algo, _, _, _, _ = setup + algo._qf.to = MagicMock(name='to') + algo._target_qf.to = MagicMock(name='to') + + algo._qf.to.return_value = algo._qf + algo._target_qf.to.return_value = algo._target_qf + + algo.to('cpu') + algo._qf.to.assert_called_once_with('cpu') + algo._target_qf.to.assert_called_once_with('cpu') diff --git a/tests/garage/torch/policies/test_discrete_qf_argmax_policy.py b/tests/garage/torch/policies/test_discrete_qf_argmax_policy.py index 1f40314b8b..78d72bcfeb 100644 --- a/tests/garage/torch/policies/test_discrete_qf_argmax_policy.py +++ b/tests/garage/torch/policies/test_discrete_qf_argmax_policy.py @@ -36,7 +36,7 @@ def test_get_action(): hidden_sizes=(2, 2)) qvals = qf(obs.unsqueeze(0)) policy = DiscreteQFArgmaxPolicy(qf, env_spec) - action, _ = policy.get_action(obs) + action, _ = policy.get_action(obs.numpy()) assert action == torch.argmax(qvals, dim=1).numpy() assert action.shape == () @@ -51,7 +51,7 @@ def test_get_actions(batch_size): hidden_sizes=(2, 2)) qvals = qf(obs) policy = DiscreteQFArgmaxPolicy(qf, env_spec) - actions, _ = policy.get_actions(obs) + actions, _ = policy.get_actions(obs.numpy()) assert (actions == torch.argmax(qvals, dim=1).numpy()).all() assert actions.shape == (batch_size, ) @@ -66,9 +66,9 @@ def test_is_pickleable(batch_size): hidden_sizes=(2, 2)) policy = DiscreteQFArgmaxPolicy(qf, env_spec) - output1 = policy.get_actions(obs)[0] + output1 = policy.get_actions(obs.numpy())[0] p = pickle.dumps(policy) policy_pickled = pickle.loads(p) - output2 = policy_pickled.get_actions(obs)[0] + output2 = 
policy_pickled.get_actions(obs.numpy())[0] assert np.array_equal(output1, output2) diff --git a/tests/garage/torch/q_functions/test_discrete_cnn_q_function.py b/tests/garage/torch/q_functions/test_discrete_cnn_q_function.py index a02ad1dc0b..667aae734d 100644 --- a/tests/garage/torch/q_functions/test_discrete_cnn_q_function.py +++ b/tests/garage/torch/q_functions/test_discrete_cnn_q_function.py @@ -27,7 +27,6 @@ def test_forward(batch_size, hidden_channels, kernel_sizes, strides): qf = DiscreteCNNQFunction(env_spec=env_spec, kernel_sizes=kernel_sizes, strides=strides, - minibatch_size=batch_size, mlp_hidden_nonlinearity=None, cnn_hidden_nonlinearity=None, hidden_channels=hidden_channels, @@ -57,7 +56,6 @@ def test_is_pickleable(batch_size, hidden_channels, kernel_sizes, strides): qf = DiscreteCNNQFunction(env_spec=env_spec, kernel_sizes=kernel_sizes, strides=strides, - minibatch_size=batch_size, mlp_hidden_nonlinearity=None, cnn_hidden_nonlinearity=None, hidden_channels=hidden_channels, diff --git a/tests/integration_tests/test_examples.py b/tests/integration_tests/test_examples.py index c58eea6458..0d36d411a0 100644 --- a/tests/integration_tests/test_examples.py +++ b/tests/integration_tests/test_examples.py @@ -8,6 +8,7 @@ EXAMPLES_ROOT_DIR = pathlib.Path('examples/') NON_ALGO_EXAMPLES = [ EXAMPLES_ROOT_DIR / 'torch/resume_training.py', + EXAMPLES_ROOT_DIR / 'torch/watch_atari.py', EXAMPLES_ROOT_DIR / 'tf/resume_training.py', EXAMPLES_ROOT_DIR / 'sim_policy.py', EXAMPLES_ROOT_DIR / 'step_env.py', @@ -20,6 +21,8 @@ EXAMPLES_ROOT_DIR / 'tf/dqn_pong.py', EXAMPLES_ROOT_DIR / 'tf/her_ddpg_fetchreach.py', EXAMPLES_ROOT_DIR / 'tf/trpo_cubecrash.py', + EXAMPLES_ROOT_DIR / 'torch/dqn_cartpole.py', + EXAMPLES_ROOT_DIR / 'torch/dqn_atari.py', EXAMPLES_ROOT_DIR / 'torch/maml_ppo_half_cheetah_dir.py', EXAMPLES_ROOT_DIR / 'torch/maml_trpo_half_cheetah_dir.py', EXAMPLES_ROOT_DIR / 'torch/maml_vpg_half_cheetah_dir.py', @@ -99,6 +102,24 @@ def test_dqn_pong(): env=env).returncode == 0 +@pytest.mark.no_cover +@pytest.mark.timeout(200) +def test_dqn_atari(): + """Test torch/dqn_atari.py with reduced replay buffer size. + + This is to reduced memory consumption. 
+ + """ + env = os.environ.copy() + env['GARAGE_EXAMPLE_TEST_N_EPOCHS'] = '1' + assert subprocess.run([ + EXAMPLES_ROOT_DIR / 'torch/dqn_atari.py', 'Pong', '--buffer_size', '1', + '--max_episode_length', '1' + ], + check=False, + env=env).returncode == 0 + + @pytest.mark.no_cover @pytest.mark.timeout(30) def test_ppo_memorize_digits(): From c0fd41d73da7e7a71d6054e87370be35ca708e67 Mon Sep 17 00:00:00 2001 From: Ruofu Wang <31981600+yeukfu@users.noreply.github.com> Date: Wed, 21 Oct 2020 18:53:42 -0700 Subject: [PATCH 15/23] Refactor te_npo to use episode batch (#2137) * Refactor te_npo to use episode batch * Change variable name --- .../linear_multi_feature_baseline.py | 4 +- src/garage/tf/algos/te_npo.py | 280 +++++++++--------- 2 files changed, 138 insertions(+), 146 deletions(-) diff --git a/src/garage/np/baselines/linear_multi_feature_baseline.py b/src/garage/np/baselines/linear_multi_feature_baseline.py index 3ab92bc7a9..bfb88575b3 100644 --- a/src/garage/np/baselines/linear_multi_feature_baseline.py +++ b/src/garage/np/baselines/linear_multi_feature_baseline.py @@ -21,7 +21,7 @@ def __init__(self, reg_coeff=1e-5, name='LinearMultiFeatureBaseline'): super().__init__(env_spec, reg_coeff, name) - features = features or ['observation'] + features = features or ['observations'] self._feature_names = features def _features(self, path): @@ -38,7 +38,7 @@ def _features(self, path): np.clip(path[feature_name], -10, 10) for feature_name in self._feature_names ] - n = len(path['rewards']) + n = len(path['observations']) return np.concatenate(sum([[f, f**2] for f in features], []) + [np.ones((n, 1))], axis=1) diff --git a/src/garage/tf/algos/te_npo.py b/src/garage/tf/algos/te_npo.py index e9db7b223c..f4d526a952 100644 --- a/src/garage/tf/algos/te_npo.py +++ b/src/garage/tf/algos/te_npo.py @@ -7,9 +7,12 @@ import scipy.stats import tensorflow as tf -from garage import EpisodeBatch, InOutSpec, log_performance +from garage import InOutSpec, log_performance from garage.experiment import deterministic -from garage.np import explained_variance_1d, rrse, sliding_window +from garage.np import (discount_cumsum, + explained_variance_1d, + rrse, + sliding_window) from garage.np.algos import RLAlgorithm from garage.sampler import LocalSampler from garage.tf import (center_advs, @@ -19,10 +22,7 @@ discounted_returns, flatten_inputs, graph_inputs, - pad_tensor, pad_tensor_dict, - pad_tensor_n, - paths_to_tensors, positive_advs, stack_tensor_dict_list) from garage.tf.embeddings import StochasticEncoder @@ -210,118 +210,107 @@ def train(self, trainer): last_return = None for _ in trainer.step_epochs(): - trainer.step_path = trainer.obtain_samples(trainer.step_itr) + trainer.step_path = trainer.obtain_episodes(trainer.step_itr) last_return = self._train_once(trainer.step_itr, trainer.step_path) trainer.step_itr += 1 return last_return - def _train_once(self, itr, paths): + def _train_once(self, itr, episodes): """Perform one step of policy optimization given one batch of samples. Args: itr (int): Iteration number. - paths (list[dict]): A list of collected paths. + episodes (EpisodeBatch): Batch of episodes. Returns: numpy.float64: Average return. 
""" undiscounted_returns = log_performance(itr, - EpisodeBatch.from_list( - self._env_spec, paths), + episodes, discount=self._discount) - samples_data = self._paths_to_tensors(paths) + # Calculate baseline predictions + baselines = [] + start = 0 + for length in episodes.lengths: + stop = start + length + baseline = self._baseline.predict( + dict(observations=episodes.observations[start:stop], + tasks=episodes.env_infos['task_onehot'][start:stop], + latents=episodes.agent_infos['latent'][start:stop])) + baselines.append(baseline) + start = stop + baselines = episodes.pad_to_last(np.concatenate(baselines)) - samples_data['average_return'] = np.mean(undiscounted_returns) + # Process trajectories + embed_eps, embed_ep_infos = self._process_episodes(episodes) + + average_return = np.mean(undiscounted_returns) logger.log('Optimizing policy...') - self._optimize_policy(itr, samples_data) + self._optimize_policy(itr, episodes, baselines, embed_eps, + embed_ep_infos) - return samples_data['average_return'] + return average_return - def _optimize_policy(self, itr, samples_data): + def _optimize_policy(self, itr, episodes, baselines, embed_eps, + embed_ep_infos): """Optimize policy. Args: itr (int): Iteration number. - samples_data (dict): Processed sample data. - See process_samples() for details. + episodes (EpisodeBatch): Batch of episodes. + baselines (np.ndarray): Baseline predictions. + embed_eps (np.ndarray): Embedding episodes. + embed_ep_infos (dict): Embedding distribution information. """ del itr - policy_opt_input_values = self._policy_opt_input_values(samples_data) + policy_opt_input_values = self._policy_opt_input_values( + episodes, baselines, embed_eps) inference_opt_input_values = self._inference_opt_input_values( - samples_data) + episodes, embed_eps, embed_ep_infos) self._train_policy_and_encoder_networks(policy_opt_input_values) self._train_inference_network(inference_opt_input_values) - paths = samples_data['paths'] - self._evaluate(policy_opt_input_values, samples_data) + # paths = samples_data['paths'] + fit_paths = self._evaluate(policy_opt_input_values, episodes, + baselines, embed_ep_infos) self._visualize_distribution() logger.log('Fitting baseline...') - self._baseline.fit(paths) + self._baseline.fit(fit_paths) self._old_policy.parameters = self.policy.parameters self._old_policy.encoder.model.parameters = ( self.policy.encoder.model.parameters) self._old_inference.model.parameters = self._inference.model.parameters - def _paths_to_tensors(self, paths): + def _process_episodes(self, episodes): # pylint: disable=too-many-statements """Return processed sample data based on the collected paths. Args: - paths (list[dict]): A list of collected paths. + episodes (EpisodeBatch): Batch of episodes. Returns: - dict: Processed sample data, with key - * observations: (numpy.ndarray) - * tasks: (numpy.ndarray) - * actions: (numpy.ndarray) - * trjectories: (numpy.ndarray) - * rewards: (numpy.ndarray) - * baselines: (numpy.ndarray) - * returns: (numpy.ndarray) - * valids: (numpy.ndarray) - * agent_infos: (dict) - * letent_infos: (dict) - * env_infos: (dict) - * trjectory_infos: (dict) - * paths: (list[dict]) + np.ndarray: Embedding episodes. + dict: Embedding distribution information. + * mean (list[numpy.ndarray]): Means of the distribution. + * log_std (list[numpy.ndarray]): Log standard deviations of the + distribution. """ max_episode_length = self.max_episode_length - def _extract_latent_infos(infos): - """Extract and pack latent infos from dict. 
- - Args: - infos (dict): A dict that contains latent infos with key - prefixed by 'latent_'. - - Returns: - dict: A dict of latent infos. - - """ - latent_infos = dict() - for k, v in infos.items(): - if k.startswith('latent_'): - latent_infos[k[7:]] = v - return latent_infos - - for path in paths: - path['actions'] = (self._env_spec.action_space.flatten_n( - path['actions'])) - path['tasks'] = self.policy.task_space.flatten_n( - path['env_infos']['task_onehot']) - path['latents'] = path['agent_infos']['latent'] - path['latent_infos'] = _extract_latent_infos(path['agent_infos']) + trajectories = [] + trajectory_infos = [] + for obs in episodes.padded_observations: # - Calculate a forward-looking sliding window. # - If step_space has shape (n, d), then trajs will have shape # (n, window, d) @@ -331,45 +320,21 @@ def _extract_latent_infos(infos): # - Only observation is used for a single step. # Alternatively, stacked [observation, action] can be used for # in harder tasks. - obs = pad_tensor(path['observations'], max_episode_length) obs_flat = self._env_spec.observation_space.flatten_n(obs) steps = obs_flat window = self._inference.spec.input_space.shape[0] traj = sliding_window(steps, window, smear=True) traj_flat = self._inference.spec.input_space.flatten_n(traj) - path['trajectories'] = traj_flat + trajectories.append(traj_flat) _, traj_info = self._inference.get_latents(traj_flat) - path['trajectory_infos'] = traj_info - - all_path_baselines = [self._baseline.predict(path) for path in paths] - - tasks = [path['tasks'] for path in paths] - tasks = pad_tensor_n(tasks, max_episode_length) - - trajectories = np.stack([path['trajectories'] for path in paths]) - - latents = [path['latents'] for path in paths] - latents = pad_tensor_n(latents, max_episode_length) + trajectory_infos.append(traj_info) - latent_infos = [path['latent_infos'] for path in paths] - latent_infos = stack_tensor_dict_list( - [pad_tensor_dict(p, max_episode_length) for p in latent_infos]) - - trajectory_infos = [path['trajectory_infos'] for path in paths] + trajectories = np.stack(trajectories) trajectory_infos = stack_tensor_dict_list( [pad_tensor_dict(p, max_episode_length) for p in trajectory_infos]) - samples_data = paths_to_tensors(paths, max_episode_length, - all_path_baselines, self._discount, - self._gae_lambda) - samples_data['tasks'] = tasks - samples_data['latents'] = latents - samples_data['latent_infos'] = latent_infos - samples_data['trajectories'] = trajectories - samples_data['trajectory_infos'] = trajectory_infos - - return samples_data + return trajectories, trajectory_infos def _build_inputs(self): """Build input variables. @@ -741,129 +706,156 @@ def _build_inference_loss(self, i): return infer_loss, infer_kl - def _policy_opt_input_values(self, samples_data): + def _policy_opt_input_values(self, episodes, baselines, embed_eps): """Map episode samples to the policy optimizer inputs. Args: - samples_data (dict): Processed sample data. - See process_samples() for details. + episodes (EpisodeBatch): Batch of episodes. + baselines (np.ndarray): Baseline predictions. + embed_eps (np.ndarray): Embedding episodes. Returns: list(np.ndarray): Flatten policy optimization input values. 
""" + actions = [ + self._env_spec.action_space.flatten_n(act) + for act in episodes.actions_list + ] + actions = episodes.pad_to_last(np.concatenate(actions)) + tasks = episodes.pad_to_last(episodes.env_infos['task_onehot']) + latents = episodes.pad_to_last(episodes.agent_infos['latent']) + + agent_infos = episodes.padded_agent_infos policy_state_info_list = [ - samples_data['agent_infos'][k] for k in self.policy.state_info_keys + agent_infos[k] for k in self.policy.state_info_keys ] embed_state_info_list = [ - samples_data['latent_infos'][k] + agent_infos['latent_' + k] for k in self.policy.encoder.state_info_keys ] # pylint: disable=unexpected-keyword-arg policy_opt_input_values = self._policy_opt_inputs._replace( - obs_var=samples_data['observations'], - action_var=samples_data['actions'], - reward_var=samples_data['rewards'], - baseline_var=samples_data['baselines'], - trajectory_var=samples_data['trajectories'], - task_var=samples_data['tasks'], - latent_var=samples_data['latents'], - valid_var=samples_data['valids'], + obs_var=episodes.padded_observations, + action_var=actions, + reward_var=episodes.padded_rewards, + baseline_var=baselines, + trajectory_var=embed_eps, + task_var=tasks, + latent_var=latents, + valid_var=episodes.valids, policy_state_info_vars_list=policy_state_info_list, embed_state_info_vars_list=embed_state_info_list, ) return flatten_inputs(policy_opt_input_values) - def _inference_opt_input_values(self, samples_data): + def _inference_opt_input_values(self, episodes, embed_eps, embed_ep_infos): """Map episode samples to the inference optimizer inputs. Args: - samples_data (dict): Processed sample data. - See process_samples() for details. + episodes (EpisodeBatch): Batch of episodes. + embed_eps (np.ndarray): Embedding episodes. + embed_ep_infos (dict): Embedding distribution information. Returns: list(np.ndarray): Flatten inference optimization input values. """ + latents = episodes.pad_to_last(episodes.agent_infos['latent']) + infer_state_info_list = [ - samples_data['trajectory_infos'][k] - for k in self._inference.state_info_keys + embed_ep_infos[k] for k in self._inference.state_info_keys ] # pylint: disable=unexpected-keyword-arg inference_opt_input_values = self._inference_opt_inputs._replace( - latent_var=samples_data['latents'], - trajectory_var=samples_data['trajectories'], - valid_var=samples_data['valids'], + latent_var=latents, + trajectory_var=embed_eps, + valid_var=episodes.valids, infer_state_info_vars_list=infer_state_info_list, ) return flatten_inputs(inference_opt_input_values) - def _evaluate(self, policy_opt_input_values, samples_data): + def _evaluate(self, policy_opt_input_values, episodes, baselines, + embed_ep_infos): """Evaluate rewards and everything else. Args: policy_opt_input_values (list[np.ndarray]): Flattened policy optimization input values. - samples_data (dict): Processed sample data. - See process_samples() for details. + episodes (EpisodeBatch): Batch of episodes. + baselines (np.ndarray): Baseline predictions. + embed_ep_infos (dict): Embedding distribution information. Returns: - dict: Processed sample data. + dict: Paths for fitting the baseline. 
""" # pylint: disable=too-many-statements + fit_paths = [] + valids = episodes.valids + observations = episodes.padded_observations + tasks = episodes.pad_to_last(episodes.env_infos['task_onehot']) + latents = episodes.pad_to_last(episodes.agent_infos['latent']) + baselines_list = [] + for baseline, valid in zip(baselines, valids): + baselines_list.append(baseline[valid.astype(np.bool)]) + # Augment reward from baselines rewards_tensor = self._f_rewards(*policy_opt_input_values) returns_tensor = self._f_returns(*policy_opt_input_values) returns_tensor = np.squeeze(returns_tensor, -1) - paths = samples_data['paths'] - valids = samples_data['valids'] - baselines = [path['baselines'] for path in paths] - env_rewards = [path['rewards'] for path in paths] - env_rewards = concat_tensor_list(env_rewards.copy()) - env_returns = [path['returns'] for path in paths] - env_returns = concat_tensor_list(env_returns.copy()) - env_average_discounted_return = (np.mean( - [path['returns'][0] for path in paths])) - - # Recompute parts of samples_data + env_rewards = episodes.rewards + env_returns = [ + discount_cumsum(rwd, self._discount) + for rwd in episodes.padded_rewards + ] + env_average_discounted_return = np.mean( + [ret[0] for ret in env_returns]) + + # Recompute returns and prepare paths for fitting the baseline aug_rewards = [] aug_returns = [] - for rew, ret, val, path in zip(rewards_tensor, returns_tensor, valids, - paths): - path['rewards'] = rew[val.astype(np.bool)] - path['returns'] = ret[val.astype(np.bool)] - aug_rewards.append(path['rewards']) - aug_returns.append(path['returns']) + for rew, ret, val, task, latent, obs in zip(rewards_tensor, + returns_tensor, valids, + tasks, latents, + observations): + returns = ret[val.astype(np.bool)] + task = task[val.astype(np.bool)] + latent = latent[val.astype(np.bool)] + obs = obs[val.astype(np.bool)] + + aug_rewards.append(rew[val.astype(np.bool)]) + aug_returns.append(returns) + fit_paths.append( + dict(observations=obs, + tasks=task, + latents=latent, + returns=returns)) aug_rewards = concat_tensor_list(aug_rewards) aug_returns = concat_tensor_list(aug_returns) - samples_data['rewards'] = aug_rewards - samples_data['returns'] = aug_returns # Calculate effect of the entropy terms d_rewards = np.mean(aug_rewards - env_rewards) tabular.record('{}/EntRewards'.format(self.policy.name), d_rewards) aug_average_discounted_return = (np.mean( - [path['returns'][0] for path in paths])) + [ret[0] for ret in returns_tensor])) d_returns = np.mean(aug_average_discounted_return - env_average_discounted_return) tabular.record('{}/EntReturns'.format(self.policy.name), d_returns) # Calculate explained variance - ev = explained_variance_1d(np.concatenate(baselines), aug_returns) + ev = explained_variance_1d(np.concatenate(baselines_list), aug_returns) tabular.record('{}/ExplainedVariance'.format(self._baseline.name), ev) - inference_rmse = (samples_data['trajectory_infos']['mean'] - - samples_data['latents'])**2. + inference_rmse = (embed_ep_infos['mean'] - latents)**2. 
inference_rmse = np.sqrt(inference_rmse.mean()) tabular.record('Inference/RMSE', inference_rmse) - inference_rrse = rrse(samples_data['latents'], - samples_data['trajectory_infos']['mean']) + inference_rrse = rrse(latents, embed_ep_infos['mean']) tabular.record('Inference/RRSE', inference_rrse) embed_ent = self._f_encoder_entropy(*policy_opt_input_values) @@ -874,13 +866,13 @@ def _evaluate(self, policy_opt_input_values, samples_data): tabular.record('Inference/CrossEntropy', infer_ce) pol_ent = self._f_policy_entropy(*policy_opt_input_values) - pol_ent = np.sum(pol_ent) / np.sum(samples_data['valids']) + pol_ent = np.sum(pol_ent) / np.sum(episodes.lengths) tabular.record('{}/Entropy'.format(self.policy.name), pol_ent) task_ents = self._f_task_entropies(*policy_opt_input_values) - tasks = samples_data['tasks'][:, 0, :] + tasks = tasks[:, 0, :] _, task_indices = np.nonzero(tasks) - path_lengths = np.sum(samples_data['valids'], axis=1) + path_lengths = np.sum(valids, axis=1) for t in range(self.policy.task_space.flat_dim): lengths = path_lengths[task_indices == t] completed = lengths < self.max_episode_length @@ -891,7 +883,7 @@ def _evaluate(self, policy_opt_input_values, samples_data): pct_completed) tabular.record('Tasks/Entropy/t={}'.format(t), task_ents[t]) - return samples_data + return fit_paths def _visualize_distribution(self): """Visualize encoder distribution.""" From 1593a943153c6fbbd2bfb8ffd3b15b24617120d7 Mon Sep 17 00:00:00 2001 From: "K.R. Zentner" <41180126+krzentner@users.noreply.github.com> Date: Wed, 21 Oct 2020 21:32:36 -0700 Subject: [PATCH 16/23] Fix flake of large test run under Travis (#2145) --- Makefile | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5a90f38c8a..35fef5157e 100644 --- a/Makefile +++ b/Makefile @@ -57,9 +57,20 @@ ci-job-normal: assert-docker exit 1; \ done +# Need to be able to access $!, a special bash variable +define LARGE_TEST + pytest --cov=garage --cov-report=xml --reruns 1 -m 'large and not flaky' --durations=20 & + PYTEST_PID=$$! + while ps -p $$PYTEST_PID > /dev/null ; do + echo 'Still running' + sleep 60 + done +endef +export LARGE_TEST + ci-job-large: assert-docker [ ! -f $(MJKEY_PATH) ] || mv $(MJKEY_PATH) $(MJKEY_PATH).bak - pytest --cov=garage --cov-report=xml --reruns 1 -m 'large and not flaky' --durations=20 + bash -c "$$LARGE_TEST" for i in {1..5}; do \ bash <(curl -s https://codecov.io/bash --retry 5) -Z && break \ || echo 'Retrying...' 
&& sleep 30 && continue; \ From e32cd06f0194c5906aec0b7312c3894e44708783 Mon Sep 17 00:00:00 2001 From: "Nicole (Shin Ying) Ng" Date: Thu, 22 Oct 2020 01:27:22 -0700 Subject: [PATCH 17/23] Check in GymEnv for env_info consistency (#2083) * Add GymEnv check for env_info consistency * Update GymEnv reset() for env_info * Fix pre-commit * Fix pylin issues * Add test for env_info inconsistency * Fix pre-commit * Fix pre-commit * Fix pre-commit * Move check below timelimit check --- src/garage/envs/gym_env.py | 11 +++++++++++ tests/garage/envs/test_gym_env.py | 11 +++++++++++ 2 files changed, 22 insertions(+) diff --git a/src/garage/envs/gym_env.py b/src/garage/envs/gym_env.py index 472c196268..321fe0ecaa 100644 --- a/src/garage/envs/gym_env.py +++ b/src/garage/envs/gym_env.py @@ -155,6 +155,9 @@ def __init__(self, env, is_image=False, max_episode_length=None): self._spec = EnvSpec(action_space=self.action_space, observation_space=self.observation_space, max_episode_length=self._max_episode_length) + # stores env_info keys & value types to ensure subsequent env_infos + # are consistent + self._env_info = None @property def action_space(self): @@ -191,6 +194,7 @@ def reset(self): """ first_obs = self._env.reset() self._step_cnt = 0 + self._env_info = None return first_obs, dict() @@ -206,6 +210,8 @@ def step(self, action): Raises: RuntimeError: if `step()` is called after the environment has been constructed and `reset()` has not been called. + RuntimeError: if underlying environment outputs inconsistent + env_info keys. """ if self._step_cnt is None: @@ -243,6 +249,11 @@ def step(self, action): if step_type in (StepType.TERMINAL, StepType.TIMEOUT): self._step_cnt = None + # check that env_infos are consistent + if not self._env_info: + self._env_info = {k: type(info[k]) for k in info} + elif self._env_info.keys() != info.keys(): + raise RuntimeError('GymEnv outputs inconsistent env_info keys.') if not self.spec.observation_space.contains(observation): # Discrete actions can be either in the space normally, or one-hot # encoded. 
diff --git a/tests/garage/envs/test_gym_env.py b/tests/garage/envs/test_gym_env.py index 089bbb840e..1831d0eb5f 100644 --- a/tests/garage/envs/test_gym_env.py +++ b/tests/garage/envs/test_gym_env.py @@ -142,6 +142,17 @@ def test_done_resets_step_cnt(): assert env._step_cnt is None +def test_inconsistent_env_infos(): + env = GymEnv('MountainCar-v0') + env.reset() + env._env_info = {'k1': 'v1', 'k2': 'v2'} + with pytest.raises(RuntimeError, + match='GymEnv outputs inconsistent env_info keys.'): + env.step(env.action_space.sample()) + # check that order of keys don't matter for equality + assert env._env_info.keys() == {'k2': 'v2', 'k1': 'v1'}.keys() + + def test_is_pickleable(): env = GymEnv('MountainCar-v0', max_episode_length=50) h = pickle.dumps(env) From 9147627a81c979f421d88d8260a1794e425d361c Mon Sep 17 00:00:00 2001 From: Iris Liu Date: Thu, 22 Oct 2020 12:14:33 -0700 Subject: [PATCH 18/23] TD3 implementation in pytorch (#1890) * TD3 Torch (examples, benchmark, test) * Change to Trainer * Update examples --- .../src/garage_benchmarks/benchmark_algos.py | 6 +- .../src/garage_benchmarks/benchmark_auto.py | 3 +- .../experiments/algos/__init__.py | 6 +- .../experiments/algos/td3_garage_pytorch.py | 112 +++++ .../experiments/algos/td3_garage_tf.py | 12 +- examples/torch/mtsac_metaworld_mt10.py | 4 +- examples/torch/mtsac_metaworld_mt50.py | 4 +- examples/torch/td3_halfcheetah.py | 87 ++++ examples/torch/td3_pendulum.py | 89 ++++ setup.cfg | 2 +- src/garage/np/policies/__init__.py | 7 +- .../np/policies/uniform_random_policy.py | 61 +++ src/garage/tf/algos/ddpg.py | 4 +- src/garage/tf/algos/npo.py | 8 +- src/garage/tf/algos/rl2.py | 8 +- src/garage/tf/algos/td3.py | 7 +- src/garage/torch/__init__.py | 27 +- src/garage/torch/_functions.py | 32 +- src/garage/torch/algos/__init__.py | 3 +- src/garage/torch/algos/td3.py | 399 ++++++++++++++++++ .../policies/deterministic_mlp_policy.py | 5 +- .../np/policies/test_uniform_random_policy.py | 13 + tests/garage/test_dtypes.py | 6 +- tests/garage/torch/algos/test_td3.py | 108 +++++ tests/integration_tests/test_examples.py | 2 + 25 files changed, 946 insertions(+), 69 deletions(-) create mode 100644 benchmarks/src/garage_benchmarks/experiments/algos/td3_garage_pytorch.py create mode 100644 examples/torch/td3_halfcheetah.py create mode 100644 examples/torch/td3_pendulum.py create mode 100644 src/garage/np/policies/uniform_random_policy.py create mode 100644 src/garage/torch/algos/td3.py create mode 100644 tests/garage/np/policies/test_uniform_random_policy.py create mode 100644 tests/garage/torch/algos/test_td3.py diff --git a/benchmarks/src/garage_benchmarks/benchmark_algos.py b/benchmarks/src/garage_benchmarks/benchmark_algos.py index d2f4f57add..3f5b7837ff 100644 --- a/benchmarks/src/garage_benchmarks/benchmark_algos.py +++ b/benchmarks/src/garage_benchmarks/benchmark_algos.py @@ -1,9 +1,9 @@ """Benchmarking for algorithms.""" # yapf: disable -from garage_benchmarks.experiments.algos import (ddpg_garage_tf, - her_garage_tf, +from garage_benchmarks.experiments.algos import (ddpg_garage_tf, her_garage_tf, ppo_garage_pytorch, ppo_garage_tf, + td3_garage_pytorch, td3_garage_tf, trpo_garage_pytorch, trpo_garage_tf, @@ -40,7 +40,7 @@ def td3_benchmarks(): td3_env_ids = [ env_id for env_id in MuJoCo1M_ENV_SET if env_id != 'Reacher-v2' ] - + iterate_experiments(td3_garage_pytorch, td3_env_ids) iterate_experiments(td3_garage_tf, td3_env_ids) diff --git a/benchmarks/src/garage_benchmarks/benchmark_auto.py b/benchmarks/src/garage_benchmarks/benchmark_auto.py 
index 577f2ae911..5753dd9940 100644 --- a/benchmarks/src/garage_benchmarks/benchmark_auto.py +++ b/benchmarks/src/garage_benchmarks/benchmark_auto.py @@ -2,8 +2,7 @@ # yapf: disable from garage_benchmarks.experiments.algos import (ddpg_garage_tf, ppo_garage_pytorch, - ppo_garage_tf, - td3_garage_tf, + ppo_garage_tf, td3_garage_tf, trpo_garage_pytorch, trpo_garage_tf, vpg_garage_pytorch, diff --git a/benchmarks/src/garage_benchmarks/experiments/algos/__init__.py b/benchmarks/src/garage_benchmarks/experiments/algos/__init__.py index 64ac239498..5f91892581 100644 --- a/benchmarks/src/garage_benchmarks/experiments/algos/__init__.py +++ b/benchmarks/src/garage_benchmarks/experiments/algos/__init__.py @@ -4,6 +4,8 @@ from garage_benchmarks.experiments.algos.ppo_garage_pytorch import ( ppo_garage_pytorch) from garage_benchmarks.experiments.algos.ppo_garage_tf import ppo_garage_tf +from garage_benchmarks.experiments.algos.td3_garage_pytorch import ( + td3_garage_pytorch) from garage_benchmarks.experiments.algos.td3_garage_tf import td3_garage_tf from garage_benchmarks.experiments.algos.trpo_garage_pytorch import ( trpo_garage_pytorch) @@ -14,6 +16,6 @@ __all__ = [ 'ddpg_garage_tf', 'her_garage_tf', 'ppo_garage_pytorch', 'ppo_garage_tf', - 'td3_garage_tf', 'trpo_garage_pytorch', 'trpo_garage_tf', - 'vpg_garage_pytorch', 'vpg_garage_tf' + 'td3_garage_pytorch', 'td3_garage_tf', 'trpo_garage_pytorch', + 'trpo_garage_tf', 'vpg_garage_pytorch', 'vpg_garage_tf' ] diff --git a/benchmarks/src/garage_benchmarks/experiments/algos/td3_garage_pytorch.py b/benchmarks/src/garage_benchmarks/experiments/algos/td3_garage_pytorch.py new file mode 100644 index 0000000000..9227f5c117 --- /dev/null +++ b/benchmarks/src/garage_benchmarks/experiments/algos/td3_garage_pytorch.py @@ -0,0 +1,112 @@ +"""A regression test for automatic benchmarking garage-Pytorch-TD3.""" +import torch +from torch.nn import functional as F + +from garage import wrap_experiment +from garage.envs import GymEnv, normalize +from garage.experiment import deterministic +from garage.np.exploration_policies import AddGaussianNoise +from garage.np.policies import UniformRandomPolicy +from garage.replay_buffer import PathBuffer +from garage.torch import prefer_gpu +from garage.torch.algos import TD3 +from garage.torch.policies import DeterministicMLPPolicy +from garage.torch.q_functions import ContinuousMLPQFunction +from garage.trainer import TFTrainer + +hyper_parameters = { + 'policy_lr': 1e-3, + 'qf_lr': 1e-3, + 'policy_hidden_sizes': [256, 256], + 'qf_hidden_sizes': [256, 256], + 'n_epochs': 250, + 'steps_per_epoch': 40, + 'batch_size': 100, + 'start_steps': 1000, + 'update_after': 1000, + 'grad_steps_per_env_step': 50, + 'discount': 0.99, + 'target_update_tau': 0.005, + 'replay_buffer_size': int(1e6), + 'sigma': 0.1, + 'policy_noise': 0.2, + 'policy_noise_clip': 0.5, + 'buffer_batch_size': 100, + 'min_buffer_size': int(1e4), +} + + +@wrap_experiment(snapshot_mode='last') +def td3_garage_pytorch(ctxt, env_id, seed): + """Create garage TensorFlow TD3 model and training. + + Args: + ctxt (garage.experiment.ExperimentContext): The experiment + configuration used by Localtrainer to create the + snapshotter. + env_id (str): Environment id of the task. + seed (int): Random positive integer for the trial. 
+ + """ + deterministic.set_seed(seed) + + with TFTrainer(ctxt) as trainer: + num_timesteps = hyper_parameters['n_epochs'] * hyper_parameters[ + 'steps_per_epoch'] * hyper_parameters['batch_size'] + env = normalize(GymEnv(env_id)) + + policy = DeterministicMLPPolicy( + env_spec=env.spec, + hidden_sizes=hyper_parameters['policy_hidden_sizes'], + hidden_nonlinearity=F.relu, + output_nonlinearity=torch.tanh) + + exploration_policy = AddGaussianNoise( + env.spec, + policy, + total_timesteps=num_timesteps, + max_sigma=hyper_parameters['sigma'], + min_sigma=hyper_parameters['sigma']) + + uniform_random_policy = UniformRandomPolicy(env.spec) + + qf1 = ContinuousMLPQFunction( + env_spec=env.spec, + hidden_sizes=hyper_parameters['qf_hidden_sizes'], + hidden_nonlinearity=F.relu) + + qf2 = ContinuousMLPQFunction( + env_spec=env.spec, + hidden_sizes=hyper_parameters['qf_hidden_sizes'], + hidden_nonlinearity=F.relu) + + replay_buffer = PathBuffer( + capacity_in_transitions=hyper_parameters['replay_buffer_size']) + + td3 = TD3(env_spec=env.spec, + policy=policy, + qf1=qf1, + qf2=qf2, + exploration_policy=exploration_policy, + uniform_random_policy=uniform_random_policy, + replay_buffer=replay_buffer, + steps_per_epoch=hyper_parameters['steps_per_epoch'], + policy_lr=hyper_parameters['policy_lr'], + qf_lr=hyper_parameters['qf_lr'], + target_update_tau=hyper_parameters['target_update_tau'], + discount=hyper_parameters['discount'], + grad_steps_per_env_step=hyper_parameters[ + 'grad_steps_per_env_step'], + start_steps=hyper_parameters['start_steps'], + min_buffer_size=hyper_parameters['min_buffer_size'], + buffer_batch_size=hyper_parameters['buffer_batch_size'], + policy_optimizer=torch.optim.Adam, + qf_optimizer=torch.optim.Adam, + policy_noise_clip=hyper_parameters['policy_noise_clip'], + policy_noise=hyper_parameters['policy_noise']) + + prefer_gpu() + td3.to() + trainer.setup(td3, env) + trainer.train(n_epochs=hyper_parameters['n_epochs'], + batch_size=hyper_parameters['batch_size']) diff --git a/benchmarks/src/garage_benchmarks/experiments/algos/td3_garage_tf.py b/benchmarks/src/garage_benchmarks/experiments/algos/td3_garage_tf.py index 26565cd189..6bca74d2b3 100644 --- a/benchmarks/src/garage_benchmarks/experiments/algos/td3_garage_tf.py +++ b/benchmarks/src/garage_benchmarks/experiments/algos/td3_garage_tf.py @@ -14,12 +14,12 @@ hyper_parameters = { 'policy_lr': 1e-3, 'qf_lr': 1e-3, - 'policy_hidden_sizes': [400, 300], - 'qf_hidden_sizes': [400, 300], - 'n_epochs': 8, - 'steps_per_epoch': 20, - 'n_exploration_steps': 250, - 'n_train_steps': 1, + 'policy_hidden_sizes': [256, 256], + 'qf_hidden_sizes': [256, 256], + 'n_epochs': 250, + 'steps_per_epoch': 40, + 'n_exploration_steps': 100, + 'n_train_steps': 50, 'discount': 0.99, 'tau': 0.005, 'replay_buffer_size': int(1e6), diff --git a/examples/torch/mtsac_metaworld_mt10.py b/examples/torch/mtsac_metaworld_mt10.py index 76dc8ec888..57210bad53 100755 --- a/examples/torch/mtsac_metaworld_mt10.py +++ b/examples/torch/mtsac_metaworld_mt10.py @@ -43,8 +43,8 @@ def mtsac_metaworld_mt10(ctxt=None, *, seed, _gpu, n_tasks, timesteps): """ deterministic.set_seed(seed) trainer = Trainer(ctxt) - mt10 = metaworld.MT10() - mt10_test = metaworld.MT10() + mt10 = metaworld.MT10() # pylint: disable=no-member + mt10_test = metaworld.MT10() # pylint: disable=no-member # pylint: disable=missing-return-doc, missing-return-type-doc def wrap(env, _): diff --git a/examples/torch/mtsac_metaworld_mt50.py b/examples/torch/mtsac_metaworld_mt50.py index eb0febd8d1..193ccc64dd 
100755 --- a/examples/torch/mtsac_metaworld_mt50.py +++ b/examples/torch/mtsac_metaworld_mt50.py @@ -51,8 +51,8 @@ def mtsac_metaworld_mt50(ctxt=None, """ deterministic.set_seed(seed) trainer = Trainer(ctxt) - mt50 = metaworld.MT50() - mt50_test = metaworld.MT50() + mt50 = metaworld.MT50() # pylint: disable=no-member + mt50_test = metaworld.MT50() # pylint: disable=no-member train_task_sampler = MetaWorldTaskSampler( mt50, 'train', diff --git a/examples/torch/td3_halfcheetah.py b/examples/torch/td3_halfcheetah.py new file mode 100644 index 0000000000..19dc5fb7a3 --- /dev/null +++ b/examples/torch/td3_halfcheetah.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +"""An example to train TD3 algorithm on InvertedDoublePendulum PyTorch.""" +import torch +from torch.nn import functional as F + +# from garage.np.exploration_policies import AddGaussianNoise +from garage import wrap_experiment +from garage.envs import GymEnv, normalize +from garage.experiment.deterministic import set_seed +from garage.np.exploration_policies import AddGaussianNoise +from garage.np.policies import UniformRandomPolicy +from garage.replay_buffer import PathBuffer +from garage.torch.algos import TD3 +from garage.torch.policies import DeterministicMLPPolicy +from garage.torch.q_functions import ContinuousMLPQFunction +from garage.trainer import Trainer + + +@wrap_experiment(snapshot_mode='none') +def td3_half_cheetah(ctxt=None, seed=1): + """Train TD3 with InvertedDoublePendulum-v2 environment. + + Args: + ctxt (garage.experiment.ExperimentContext): The experiment + configuration used by LocalRunner to create the snapshotter. + seed (int): Used to seed the random number generator to produce + determinism. + """ + set_seed(seed) + + n_epochs = 500 + steps_per_epoch = 20 + sampler_batch_size = 250 + num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size + + trainer = Trainer(ctxt) + env = normalize(GymEnv('HalfCheetah-v2')) + + policy = DeterministicMLPPolicy(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu, + output_nonlinearity=torch.tanh) + + exploration_policy = AddGaussianNoise(env.spec, + policy, + total_timesteps=num_timesteps, + max_sigma=0.1, + min_sigma=0.1) + + uniform_random_policy = UniformRandomPolicy(env.spec) + + qf1 = ContinuousMLPQFunction(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu) + + qf2 = ContinuousMLPQFunction(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu) + + replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) + + td3 = TD3(env_spec=env.spec, + policy=policy, + qf1=qf1, + qf2=qf2, + replay_buffer=replay_buffer, + policy_optimizer=torch.optim.Adam, + qf_optimizer=torch.optim.Adam, + exploration_policy=exploration_policy, + uniform_random_policy=uniform_random_policy, + target_update_tau=0.005, + discount=0.99, + policy_noise_clip=0.5, + policy_noise=0.2, + policy_lr=1e-3, + qf_lr=1e-3, + steps_per_epoch=40, + start_steps=1000, + grad_steps_per_env_step=50, + min_buffer_size=1000, + buffer_batch_size=100) + + trainer.setup(algo=td3, env=env) + trainer.train(n_epochs=750, batch_size=100) + + +td3_half_cheetah(seed=0) diff --git a/examples/torch/td3_pendulum.py b/examples/torch/td3_pendulum.py new file mode 100644 index 0000000000..950b6798d7 --- /dev/null +++ b/examples/torch/td3_pendulum.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +"""An example to train TD3 algorithm on InvertedDoublePendulum PyTorch.""" +import torch +from torch.nn import functional as F + +from garage import wrap_experiment +from 
garage.envs import GymEnv, normalize +from garage.experiment.deterministic import set_seed +from garage.np.exploration_policies import AddGaussianNoise +from garage.np.policies import UniformRandomPolicy +from garage.replay_buffer import PathBuffer +from garage.torch import prefer_gpu +from garage.torch.algos import TD3 +from garage.torch.policies import DeterministicMLPPolicy +from garage.torch.q_functions import ContinuousMLPQFunction +from garage.trainer import Trainer + + +@wrap_experiment(snapshot_mode='none') +def td3_pendulum(ctxt=None, seed=1): + """Train TD3 with InvertedDoublePendulum-v2 environment. + + Args: + ctxt (garage.experiment.ExperimentContext): The experiment + configuration used by LocalRunner to create the snapshotter. + seed (int): Used to seed the random number generator to produce + determinism. + + """ + set_seed(seed) + n_epochs = 750 + steps_per_epoch = 40 + sampler_batch_size = 100 + num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size + + trainer = Trainer(ctxt) + env = normalize(GymEnv('InvertedDoublePendulum-v2')) + + policy = DeterministicMLPPolicy(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu, + output_nonlinearity=torch.tanh) + + exploration_policy = AddGaussianNoise(env.spec, + policy, + total_timesteps=num_timesteps, + max_sigma=0.1, + min_sigma=0.1) + + uniform_random_policy = UniformRandomPolicy(env.spec) + + qf1 = ContinuousMLPQFunction(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu) + + qf2 = ContinuousMLPQFunction(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu) + + replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) + + td3 = TD3(env_spec=env.spec, + policy=policy, + qf1=qf1, + qf2=qf2, + replay_buffer=replay_buffer, + policy_optimizer=torch.optim.Adam, + qf_optimizer=torch.optim.Adam, + exploration_policy=exploration_policy, + uniform_random_policy=uniform_random_policy, + target_update_tau=0.005, + discount=0.99, + policy_noise_clip=0.5, + policy_noise=0.2, + policy_lr=1e-3, + qf_lr=1e-3, + steps_per_epoch=steps_per_epoch, + start_steps=1000, + grad_steps_per_env_step=1, + min_buffer_size=int(1e4), + buffer_batch_size=100) + + prefer_gpu() + td3.to() + trainer.setup(algo=td3, env=env) + trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size) + + +td3_pendulum() diff --git a/setup.cfg b/setup.cfg index f13a70eac5..899b02abb0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -26,7 +26,7 @@ use_parentheses = True force_sort_within_sections = True force_alphabetical_sort_within_sections = True lexicographical = True -multi_line_output = 1 +multi_line_output = 0 sections=FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,TESTS,LOCALFOLDER known_first_party = garage known_tests = tests, garage_benchmarks diff --git a/src/garage/np/policies/__init__.py b/src/garage/np/policies/__init__.py index 22cfb88bde..242fd64372 100644 --- a/src/garage/np/policies/__init__.py +++ b/src/garage/np/policies/__init__.py @@ -3,9 +3,6 @@ from garage.np.policies.fixed_policy import FixedPolicy from garage.np.policies.policy import Policy from garage.np.policies.scripted_policy import ScriptedPolicy +from garage.np.policies.uniform_random_policy import UniformRandomPolicy -__all__ = [ - 'FixedPolicy', - 'Policy', - 'ScriptedPolicy', -] +__all__ = ['FixedPolicy', 'Policy', 'ScriptedPolicy', 'UniformRandomPolicy'] diff --git a/src/garage/np/policies/uniform_random_policy.py b/src/garage/np/policies/uniform_random_policy.py new file mode 100644 index 0000000000..f363387fc4 --- /dev/null 
+++ b/src/garage/np/policies/uniform_random_policy.py @@ -0,0 +1,61 @@ +"""Uniform random exploration strategy.""" +import gym + +from garage.np.policies.policy import Policy + + +class UniformRandomPolicy(Policy): + """Action taken is uniformly random. + + Args: + env_spec (EnvSpec): Environment spec to explore. + + """ + + def __init__( + self, + env_spec, + ): + assert isinstance(env_spec.action_space, gym.spaces.Box) + assert len(env_spec.action_space.shape) == 1 + self._env_spec = env_spec + self._action_space = env_spec.action_space + self._iteration = 0 + + def reset(self, do_resets=None): + """Reset the state of the exploration. + + Args: + do_resets (List[bool] or numpy.ndarray or None): Which + vectorization states to reset. + + """ + self._iteration += 1 + super().reset(do_resets) + + def get_action(self, observation): + """Get action from this policy for the input observation. + + Args: + observation(numpy.ndarray): Observation from the environment. + + Returns: + np.ndarray: Actions with noise. + List[dict]: Arbitrary policy state information (agent_info). + + """ + return self._env_spec.action_space.sample(), dict() + + def get_actions(self, observations): + """Get actions from this policy for the input observation. + + Args: + observations(list): Observations from the environment. + + Returns: + np.ndarray: Actions with noise. + List[dict]: Arbitrary policy state information (agent_info). + + """ + return [self._env_spec.action_space.sample() + for obs in observations], dict() diff --git a/src/garage/tf/algos/ddpg.py b/src/garage/tf/algos/ddpg.py index 5ab7aa8e3f..0da0ee42ba 100644 --- a/src/garage/tf/algos/ddpg.py +++ b/src/garage/tf/algos/ddpg.py @@ -4,9 +4,7 @@ import numpy as np import tensorflow as tf -from garage import (_Default, - log_performance, - make_optimizer, +from garage import (_Default, log_performance, make_optimizer, obtain_evaluation_episodes) from garage.np.algos import RLAlgorithm from garage.sampler import FragmentWorker, LocalSampler diff --git a/src/garage/tf/algos/npo.py b/src/garage/tf/algos/npo.py index a95d356f61..0812452fd4 100644 --- a/src/garage/tf/algos/npo.py +++ b/src/garage/tf/algos/npo.py @@ -11,12 +11,8 @@ from garage.np import explained_variance_1d from garage.np.algos import RLAlgorithm from garage.sampler import RaySampler -from garage.tf import (center_advs, - compile_function, - compute_advantages, - discounted_returns, - flatten_inputs, - graph_inputs, +from garage.tf import (center_advs, compile_function, compute_advantages, + discounted_returns, flatten_inputs, graph_inputs, positive_advs) from garage.tf.optimizers import LBFGSOptimizer diff --git a/src/garage/tf/algos/rl2.py b/src/garage/tf/algos/rl2.py index 14225abd1a..d67a93b584 100644 --- a/src/garage/tf/algos/rl2.py +++ b/src/garage/tf/algos/rl2.py @@ -10,12 +10,8 @@ from dowel import logger import numpy as np -from garage import (EnvSpec, - EnvStep, - EpisodeBatch, - log_multitask_performance, - StepType, - Wrapper) +from garage import (EnvSpec, EnvStep, EpisodeBatch, log_multitask_performance, + StepType, Wrapper) from garage.np import concat_tensor_dict_list, discount_cumsum from garage.np.algos import MetaRLAlgorithm from garage.sampler import DefaultWorker diff --git a/src/garage/tf/algos/td3.py b/src/garage/tf/algos/td3.py index 496e9c3037..95fef35f06 100644 --- a/src/garage/tf/algos/td3.py +++ b/src/garage/tf/algos/td3.py @@ -9,9 +9,7 @@ import numpy as np import tensorflow as tf -from garage import (_Default, - log_performance, - make_optimizer, +from garage import 
(_Default, log_performance, make_optimizer, obtain_evaluation_episodes) from garage.np.algos import RLAlgorithm from garage.sampler import FragmentWorker, LocalSampler @@ -140,11 +138,8 @@ def __init__( self._discount = discount self._reward_scale = reward_scale self.max_episode_length = env_spec.max_episode_length - self._max_episode_length_eval = env_spec.max_episode_length - if max_episode_length_eval is not None: self._max_episode_length_eval = max_episode_length_eval - self._eval_env = None self._env_spec = env_spec diff --git a/src/garage/torch/__init__.py b/src/garage/torch/__init__.py index 81f34499ca..9b47be1771 100644 --- a/src/garage/torch/__init__.py +++ b/src/garage/torch/__init__.py @@ -1,24 +1,19 @@ """PyTorch-backed modules and algorithms.""" # yapf: disable -from garage.torch._functions import (compute_advantages, - dict_np_to_torch, - filter_valids, - flatten_batch, - flatten_to_single_vector, - global_device, - NonLinearity, - np_to_torch, - pad_to_last, - product_of_gaussians, - set_gpu_mode, - torch_to_np, - TransposeImage, +from garage.torch._functions import (compute_advantages, dict_np_to_torch, + filter_valids, flatten_batch, + flatten_to_single_vector, global_device, + NonLinearity, np_to_torch, pad_to_last, + prefer_gpu, product_of_gaussians, + set_gpu_mode, soft_update_model, + torch_to_np, TransposeImage, update_module_params) # yapf: enable __all__ = [ 'compute_advantages', 'dict_np_to_torch', 'filter_valids', 'flatten_batch', - 'global_device', 'np_to_torch', 'pad_to_last', 'product_of_gaussians', - 'set_gpu_mode', 'torch_to_np', 'update_module_params', 'NonLinearity', - 'flatten_to_single_vector', 'TransposeImage' + 'global_device', 'np_to_torch', 'pad_to_last', 'prefer_gpu', + 'product_of_gaussians', 'set_gpu_mode', 'soft_update_model', 'torch_to_np', + 'update_module_params', 'NonLinearity', 'flatten_to_single_vector', + 'TransposeImage' ] diff --git a/src/garage/torch/_functions.py b/src/garage/torch/_functions.py index 0673744046..d1406a8c0c 100644 --- a/src/garage/torch/_functions.py +++ b/src/garage/torch/_functions.py @@ -176,7 +176,7 @@ def torch_to_np(tensors): `garage.torch._functions.to_numpy`. """ - value_out = tuple(v.numpy() for v in tensors) + value_out = tuple(v.cpu().numpy() for v in tensors) return value_out @@ -244,6 +244,28 @@ def update(m, name, param): update(module, name, new_param) +# pylint: disable=missing-param-doc, missing-type-doc +def soft_update_model(target_model, source_model, tau): + """Update model parameter of target and source model. + + # noqa: D417 + Args: + target_model + (garage.torch.Policy/garage.torch.QFunction): + Target model to update. + source_model + (garage.torch.Policy/QFunction): + Source network to update. + tau (float): Interpolation parameter for doing the + soft target update. + + """ + for target_param, param in zip(target_model.parameters(), + source_model.parameters()): + target_param.data.copy_(target_param.data * (1.0 - tau) + + param.data * tau) + + def set_gpu_mode(mode, gpu_id=0): """Set GPU mode and device ID. @@ -261,6 +283,14 @@ def set_gpu_mode(mode, gpu_id=0): _DEVICE = torch.device(('cuda:' + str(_GPU_ID)) if _USE_GPU else 'cpu') +def prefer_gpu(): + """Prefer to use GPU(s) if GPU(s) is detected.""" + if torch.cuda.is_available(): + set_gpu_mode(True) + else: + set_gpu_mode(False) + + def global_device(): """Returns the global device that torch.Tensors should be placed on. 
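The `_functions.py` hunk above adds two small helpers, `prefer_gpu()` and `soft_update_model()`, whose argument order is easy to get wrong when reading the TD3 code later in this patch. A minimal usage sketch (not part of the patch), with `nn.Linear` standing in for real garage Q-functions purely for brevity:

```python
# Illustrative sketch of the new garage.torch helpers added above.
import copy

import torch.nn as nn

from garage.torch import global_device, prefer_gpu, soft_update_model

qf = nn.Linear(4, 1)            # stand-in for an online critic network
target_qf = copy.deepcopy(qf)   # target network starts as an exact copy

prefer_gpu()                    # enables GPU mode only if CUDA is available
qf.to(global_device())
target_qf.to(global_device())

# Polyak-averaged target update: target <- (1 - tau) * target + tau * online.
# Note the argument order: the target model comes first, then the source.
soft_update_model(target_qf, qf, tau=0.005)
```

This is exactly the update the PyTorch TD3 below performs in `_update_network_parameters()` after each delayed actor step.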
diff --git a/src/garage/torch/algos/__init__.py b/src/garage/torch/algos/__init__.py index c0c95ecab2..b1c342441f 100644 --- a/src/garage/torch/algos/__init__.py +++ b/src/garage/torch/algos/__init__.py @@ -10,6 +10,7 @@ from garage.torch.algos.maml_vpg import MAMLVPG from garage.torch.algos.ppo import PPO from garage.torch.algos.maml_ppo import MAMLPPO +from garage.torch.algos.td3 import TD3 from garage.torch.algos.trpo import TRPO from garage.torch.algos.maml_trpo import MAMLTRPO # SAC needs to be imported before MTSAC @@ -18,6 +19,6 @@ from garage.torch.algos.pearl import PEARL __all__ = [ - 'BC', 'DDPG', 'DQN', 'VPG', 'PPO', 'TRPO', 'MAMLPPO', 'MAMLTRPO', + 'BC', 'DDPG', 'DQN', 'VPG', 'PPO', 'TD3', 'TRPO', 'MAMLPPO', 'MAMLTRPO', 'MAMLVPG', 'MTSAC', 'PEARL', 'SAC' ] diff --git a/src/garage/torch/algos/td3.py b/src/garage/torch/algos/td3.py new file mode 100644 index 0000000000..d1bdc7550e --- /dev/null +++ b/src/garage/torch/algos/td3.py @@ -0,0 +1,399 @@ +"""TD3 model in Pytorch.""" +import copy + +from dowel import logger, tabular +import numpy as np +import torch +import torch.nn.functional as F + +from garage import (_Default, log_performance, make_optimizer, + obtain_evaluation_episodes) +from garage.np.algos import RLAlgorithm +from garage.sampler import FragmentWorker, LocalSampler +from garage.torch import (dict_np_to_torch, global_device, soft_update_model, + torch_to_np) + + +class TD3(RLAlgorithm): + """Implementation of TD3. + + Based on https://arxiv.org/pdf/1802.09477.pdf. + + Args: + env_spec (EnvSpec): Environment specification. + policy (garage.torch.policies.Policy): Policy (actor network). + qf1 (garage.torch.q_functions.QFunction): Q function (critic network). + qf2 (garage.torch.q_functions.QFunction): Q function (critic network). + replay_buffer (ReplayBuffer): Replay buffer. + replay_buffer_size (int): Size of the replay buffer + exploration_policy (garage.np.exploration_policies.ExplorationPolicy): + Exploration strategy. + uniform_random_policy + (garage.np.exploration_policies.ExplorationPolicy): + Uniform random exploration strategy. + target_update_tau (float): Interpolation parameter for doing the + soft target update. + discount (float): Discount factor (gamma) for the cumulative return. + reward_scaling (float): Reward scaling. + update_actor_interval (int): Policy (Actor network) update interval. + max_action (float): Maximum action magnitude. + buffer_batch_size (int): Size of replay buffer. + min_buffer_size (int): The minimum buffer size for replay buffer. + policy_noise (float): Policy (actor) noise. + policy_noise_clip (float): Noise clip. + exploration_noise (float): Exploration noise. + clip_return (float): Clip return to be in [-clip_return, + clip_return]. + policy_lr (float): Learning rate for training policy network. + qf_lr (float): Learning rate for training Q network. + policy_optimizer (Union[type, tuple[type, dict]]): Type of optimizer + for training policy network. This can be an optimizer type such as + `torch.optim.Adam` or a tuple of type and dictionary, where + dictionary contains arguments to initialize the optimizer + e.g. `(torch.optim.Adam, {'lr' : 1e-3})`. + qf_optimizer (Union[type, tuple[type, dict]]): Type of optimizer + for training Q-value network. This can be an optimizer type such + as `torch.optim.Adam` or a tuple of type and dictionary, where + dictionary contains arguments to initialize the optimizer + e.g. `(torch.optim.Adam, {'lr' : 1e-3})`. + steps_per_epoch (int): Number of train_once calls per epoch. 
+ grad_steps_per_env_step (int): Number of gradient steps taken per + environment step sampled. + max_episode_length_eval (int or None): Maximum length of episodes used + for off-policy evaluation. If None, defaults to + `env_spec.max_episode_length`. + num_evaluation_episodes (int): The number of evaluation + trajectories used for computing eval stats at the end of every + epoch. + start_steps (int): The number of steps for warming up before + selecting actions according to policy. + update_after (int): The number of steps to perform before policy + is updated. + use_deterministic_evaluation (bool): True if the trained policy + should be evaluated deterministically. + + """ + + def __init__( + self, + env_spec, + policy, + qf1, + qf2, + replay_buffer, + *, # Everything after this is numbers. + max_episode_length_eval=None, + grad_steps_per_env_step, + exploration_policy, + uniform_random_policy=None, + max_action=None, + target_update_tau=0.005, + discount=0.99, + reward_scaling=1., + update_actor_interval=2, + buffer_batch_size=64, + replay_buffer_size=1e6, + min_buffer_size=1e4, + exploration_noise=0.1, + policy_noise=0.2, + policy_noise_clip=0.5, + clip_return=np.inf, + policy_lr=_Default(1e-4), + qf_lr=_Default(1e-3), + policy_optimizer=torch.optim.Adam, + qf_optimizer=torch.optim.Adam, + num_evaluation_episodes=10, + steps_per_epoch=20, + start_steps=10000, + update_after=1000, + use_deterministic_evaluation=False): + + self._env_spec = env_spec + action_bound = self._env_spec.action_space.high[0] + self._max_action = action_bound if max_action is None else max_action + self._action_dim = self._env_spec.action_space.shape[0] + self._tau = target_update_tau + self._discount = discount + self._reward_scaling = reward_scaling + self._exploration_noise = exploration_noise + self._policy_noise = policy_noise + self._policy_noise_clip = policy_noise_clip + self._clip_return = clip_return + self._replay_buffer_size = replay_buffer_size + self._min_buffer_size = min_buffer_size + self._buffer_batch_size = buffer_batch_size + self._grad_steps_per_env_step = grad_steps_per_env_step + self._update_actor_interval = update_actor_interval + self._steps_per_epoch = steps_per_epoch + self._start_steps = start_steps + self._update_after = update_after + self._num_evaluation_episodes = num_evaluation_episodes + self.max_episode_length = env_spec.max_episode_length + self._max_episode_length_eval = env_spec.max_episode_length + + if max_episode_length_eval is not None: + self._max_episode_length_eval = max_episode_length_eval + self._use_deterministic_evaluation = use_deterministic_evaluation + + self._episode_policy_losses = [] + self._episode_qf_losses = [] + self._epoch_ys = [] + self._epoch_qs = [] + self._eval_env = None + self.exploration_policy = exploration_policy + self._uniform_random_policy = uniform_random_policy + self.worker_cls = FragmentWorker + self.sampler_cls = LocalSampler + + self._replay_buffer = replay_buffer + self.policy = policy + self._qf_1 = qf1 + self._qf_2 = qf2 + self._target_policy = copy.deepcopy(self.policy) + self._target_qf_1 = copy.deepcopy(self._qf_1) + self._target_qf_2 = copy.deepcopy(self._qf_2) + + self._policy_optimizer = make_optimizer(policy_optimizer, + module=self.policy, + lr=policy_lr) + self._qf_optimizer_1 = make_optimizer(qf_optimizer, + module=self._qf_1, + lr=qf_lr) + self._qf_optimizer_2 = make_optimizer(qf_optimizer, + module=self._qf_2, + lr=qf_lr) + self._actor_loss = torch.zeros(1) + + def _get_action(self, action, noise_scale): + """Select 
action based on policy. + + Action can be added with noise. + + Args: + action (float): Action. + noise_scale (float): Noise scale added to action. + + Return: + float: Action selected by the policy. + """ + action += noise_scale * np.random.randn(self._action_dim) + # pylint: disable=invalid-unary-operand-type + return np.clip(action, -self._max_action, self._max_action) + + def train(self, trainer): + """Obtain samplers and start actual training for each epoch. + + Args: + trainer (Trainer): Experiment trainer, which provides services + such as snapshotting and sampler control. + + """ + if not self._eval_env: + self._eval_env = trainer.get_env_copy() + trainer.enable_logging = False + for _ in trainer.step_epochs(): + for cycle in range(self._steps_per_epoch): + # Obtain trasnsition batch and store it in replay buffer. + # Get action randomly from environment within warm-up steps. + # Afterwards, get action from policy. + if self._uniform_random_policy and \ + trainer.step_itr < self._start_steps: + trainer.step_path = trainer.obtain_episodes( + trainer.step_itr, + agent_update=self._uniform_random_policy) + else: + trainer.step_path = trainer.obtain_episodes( + trainer.step_itr, agent_update=self.exploration_policy) + self._replay_buffer.add_episode_batch(trainer.step_path) + + # Update after warm-up steps. + if trainer.total_env_steps >= self._update_after: + self._train_once(trainer.step_itr) + + # Evaluate and log the results. + if (cycle == 0 and self._replay_buffer.n_transitions_stored >= + self._min_buffer_size): + trainer.enable_logging = True + eval_eps = self._evaluate_policy() + log_performance(trainer.step_path, + eval_eps, + discount=self._discount, + prefix='Training') + log_performance(trainer.step_itr, + eval_eps, + discount=self._discount, + prefix='Evaluation') + trainer.step_itr += 1 + + def _train_once(self, itr): + """Perform one iteration of training. + + Args: + itr (int): Iteration number. + + """ + for grad_step_timer in range(self._grad_steps_per_env_step): + if (self._replay_buffer.n_transitions_stored >= + self._min_buffer_size): + # Sample from buffer + samples = self._replay_buffer.sample_transitions( + self._buffer_batch_size) + samples = dict_np_to_torch(samples) + + # Optimize + qf_loss, y, q, policy_loss = torch_to_np( + self._optimize_policy(samples, grad_step_timer)) + + self._episode_policy_losses.append(policy_loss) + self._episode_qf_losses.append(qf_loss) + self._epoch_ys.append(y) + self._epoch_qs.append(q) + + if itr % self._steps_per_epoch == 0: + logger.log('Training finished') + epoch = itr // self._steps_per_epoch + + if (self._replay_buffer.n_transitions_stored >= + self._min_buffer_size): + tabular.record('Epoch', epoch) + self._log_statistics() + + # pylint: disable=invalid-unary-operand-type + def _optimize_policy(self, samples_data, grad_step_timer): + """Perform algorithm optimization. + + Args: + samples_data (dict): Processed batch data. + grad_step_timer (int): Iteration number of the gradient time + taken in the env. + + Returns: + float: Loss predicted by the q networks + (critic networks). + float: Q value (min) predicted by one of the + target q networks. + float: Q value (min) predicted by one of the + current q networks. + float: Loss predicted by the policy + (action network). 
+ + """ + rewards = samples_data['rewards'].to(global_device()).reshape(-1, 1) + terminals = samples_data['terminals'].to(global_device()).reshape( + -1, 1) + actions = samples_data['actions'].to(global_device()) + observations = samples_data['observations'].to(global_device()) + next_observations = samples_data['next_observations'].to( + global_device()) + + next_inputs = next_observations + inputs = observations + with torch.no_grad(): + # Select action according to policy and add clipped noise + noise = (torch.randn_like(actions) * self._policy_noise).clamp( + -self._policy_noise_clip, self._policy_noise_clip) + next_actions = (self._target_policy(next_inputs) + noise).clamp( + -self._max_action, self._max_action) + + # Compute the target Q value + target_Q1 = self._target_qf_1(next_inputs, next_actions) + target_Q2 = self._target_qf_2(next_inputs, next_actions) + target_q = torch.min(target_Q1, target_Q2) + target_Q = rewards * self._reward_scaling + ( + 1. - terminals) * self._discount * target_q + + # Get current Q values + current_Q1 = self._qf_1(inputs, actions) + current_Q2 = self._qf_2(inputs, actions) + current_Q = torch.min(current_Q1, current_Q2) + + # Compute critic loss + critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss( + current_Q2, target_Q) + + # Optimize critic + self._qf_optimizer_1.zero_grad() + self._qf_optimizer_2.zero_grad() + critic_loss.backward() + self._qf_optimizer_1.step() + self._qf_optimizer_2.step() + + # Deplay policy updates + if grad_step_timer % self._update_actor_interval == 0: + # Compute actor loss + actions = self.policy(inputs) + self._actor_loss = -self._qf_1(inputs, actions).mean() + + # Optimize actor + self._policy_optimizer.zero_grad() + self._actor_loss.backward() + self._policy_optimizer.step() + + # update target networks + self._update_network_parameters() + + return (critic_loss.detach(), target_Q, current_Q.detach(), + self._actor_loss.detach()) + + def _evaluate_policy(self): + """Evaluate the performance of the policy via deterministic rollouts. + + Statistics such as (average) discounted return and success rate are + recorded. + + Returns: + TrajectoryBatch: Evaluation trajectories, representing the best + current performance of the algorithm. + + """ + return obtain_evaluation_episodes( + self.exploration_policy, + self._eval_env, + self._max_episode_length_eval, + num_eps=self._num_evaluation_episodes, + deterministic=self._use_deterministic_evaluation) + + def _update_network_parameters(self): + """Update parameters in actor network and critic networks.""" + soft_update_model(self._target_qf_1, self._qf_1, self._tau) + soft_update_model(self._target_qf_2, self._qf_2, self._tau) + soft_update_model(self._target_policy, self.policy, self._tau) + + def _log_statistics(self): + """Output training statistics to dowel such as losses and returns.""" + tabular.record('Policy/AveragePolicyLoss', + np.mean(self._episode_policy_losses)) + tabular.record('QFunction/AverageQFunctionLoss', + np.mean(self._episode_qf_losses)) + tabular.record('QFunction/AverageQ', np.mean(self._epoch_qs)) + tabular.record('QFunction/MaxQ', np.max(self._epoch_qs)) + tabular.record('QFunction/AverageAbsQ', + np.mean(np.abs(self._epoch_qs))) + tabular.record('QFunction/AverageY', np.mean(self._epoch_ys)) + tabular.record('QFunction/MaxY', np.max(self._epoch_ys)) + tabular.record('QFunction/AverageAbsY', + np.mean(np.abs(self._epoch_ys))) + + @property + def networks(self): + """Return all the networks within the model. 
+ + Returns: + list: A list of networks. + + """ + return [ + self.policy, self._qf_1, self._qf_2, self._target_policy, + self._target_qf_1, self._target_qf_2 + ] + + def to(self, device=None): + """Put all the networks within the model on device. + + Args: + device (str): ID of GPU or CPU. + + """ + device = device or global_device() + for net in self.networks: + net.to(device) diff --git a/src/garage/torch/policies/deterministic_mlp_policy.py b/src/garage/torch/policies/deterministic_mlp_policy.py index ea94ff1eb2..3200204e75 100644 --- a/src/garage/torch/policies/deterministic_mlp_policy.py +++ b/src/garage/torch/policies/deterministic_mlp_policy.py @@ -7,6 +7,7 @@ import numpy as np import torch +from garage.torch import global_device from garage.torch.modules import MLPModule from garage.torch.policies.policy import Policy @@ -101,5 +102,5 @@ def get_actions(self, observations): observations = self._env_spec.observation_space.unflatten_n( observations) with torch.no_grad(): - x = self(torch.Tensor(observations)) - return x.numpy(), dict() + x = self(torch.Tensor(observations).to(global_device())) + return x.cpu().numpy(), dict() diff --git a/tests/garage/np/policies/test_uniform_random_policy.py b/tests/garage/np/policies/test_uniform_random_policy.py new file mode 100644 index 0000000000..7ed3661bc6 --- /dev/null +++ b/tests/garage/np/policies/test_uniform_random_policy.py @@ -0,0 +1,13 @@ +import numpy as np +import pytest + +from garage.envs import GymEnv, normalize +from garage.np.policies import UniformRandomPolicy + + +@pytest.mark.mujoco +def test_get_actions(): + env = normalize(GymEnv('InvertedDoublePendulum-v2')) + policy = UniformRandomPolicy(env.spec) + assert policy.get_actions(np.array([0]).reshape(1, 1))[0] + assert policy.get_action(np.array([0]))[0] diff --git a/tests/garage/test_dtypes.py b/tests/garage/test_dtypes.py index 2e6d393ed7..d228323de3 100644 --- a/tests/garage/test_dtypes.py +++ b/tests/garage/test_dtypes.py @@ -4,11 +4,7 @@ import pytest # yapf: disable -from garage import (EnvSpec, - EnvStep, - EpisodeBatch, - StepType, - TimeStep, +from garage import (EnvSpec, EnvStep, EpisodeBatch, StepType, TimeStep, TimeStepBatch) # yapf: enable diff --git a/tests/garage/torch/algos/test_td3.py b/tests/garage/torch/algos/test_td3.py new file mode 100644 index 0000000000..7dba03f3fa --- /dev/null +++ b/tests/garage/torch/algos/test_td3.py @@ -0,0 +1,108 @@ +"""Test TD3 on InvertedDoublePendulum-v2.""" +import pickle + +import pytest +from torch.nn import functional as F + +from garage.envs import GymEnv, normalize +from garage.experiment import deterministic +from garage.np.exploration_policies import AddGaussianNoise +from garage.replay_buffer import PathBuffer +from garage.sampler import LocalSampler +from garage.torch import prefer_gpu +from garage.torch.algos import TD3 +from garage.torch.policies import DeterministicMLPPolicy +from garage.torch.q_functions import ContinuousMLPQFunction +from garage.trainer import Trainer + +from tests.fixtures import snapshot_config, TfGraphTestCase + + +class TestTD3(TfGraphTestCase): + """Test class for TD3.""" + + @pytest.mark.mujoco + def test_td3_inverted_double_pendulum(self): + deterministic.set_seed(0) + n_epochs = 10 + steps_per_epoch = 20 + sampler_batch_size = 100 + num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size + trainer = Trainer(snapshot_config=snapshot_config) + env = normalize( + GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)) + policy = DeterministicMLPPolicy(env_spec=env.spec, + 
hidden_sizes=[64, 64], + hidden_nonlinearity=F.relu, + output_nonlinearity=None) + exploration_policy = AddGaussianNoise(env.spec, + policy, + total_timesteps=num_timesteps, + max_sigma=0.1, + min_sigma=0.1) + qf1 = ContinuousMLPQFunction(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu) + qf2 = ContinuousMLPQFunction(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu) + replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) + td3 = TD3(env_spec=env.spec, + policy=policy, + qf1=qf1, + qf2=qf2, + replay_buffer=replay_buffer, + exploration_policy=exploration_policy, + steps_per_epoch=steps_per_epoch, + grad_steps_per_env_step=1, + num_evaluation_episodes=1, + discount=0.99) + + prefer_gpu() + td3.to() + trainer.setup(td3, env, sampler_cls=LocalSampler) + trainer.train(n_epochs=n_epochs, batch_size=sampler_batch_size) + + @pytest.mark.mujoco + def test_pickling(self): + """Test pickle and unpickle.""" + + deterministic.set_seed(0) + n_epochs = 10 + steps_per_epoch = 20 + sampler_batch_size = 100 + num_timesteps = n_epochs * steps_per_epoch * sampler_batch_size + env = normalize( + GymEnv('InvertedDoublePendulum-v2', max_episode_length=100)) + policy = DeterministicMLPPolicy(env_spec=env.spec, + hidden_sizes=[64, 64], + hidden_nonlinearity=F.relu, + output_nonlinearity=None) + exploration_policy = AddGaussianNoise(env.spec, + policy, + total_timesteps=num_timesteps, + max_sigma=0.1, + min_sigma=0.1) + qf1 = ContinuousMLPQFunction(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu) + qf2 = ContinuousMLPQFunction(env_spec=env.spec, + hidden_sizes=[256, 256], + hidden_nonlinearity=F.relu) + replay_buffer = PathBuffer(capacity_in_transitions=int(1e6)) + td3 = TD3(env_spec=env.spec, + policy=policy, + qf1=qf1, + qf2=qf2, + replay_buffer=replay_buffer, + exploration_policy=exploration_policy, + steps_per_epoch=steps_per_epoch, + grad_steps_per_env_step=1, + num_evaluation_episodes=1, + discount=0.99) + prefer_gpu() + td3.to() + + pickled = pickle.dumps(td3) + unpickled = pickle.loads(pickled) + assert unpickled diff --git a/tests/integration_tests/test_examples.py b/tests/integration_tests/test_examples.py index 0d36d411a0..f5401bfb22 100644 --- a/tests/integration_tests/test_examples.py +++ b/tests/integration_tests/test_examples.py @@ -45,6 +45,8 @@ EXAMPLES_ROOT_DIR / 'torch/mttrpo_metaworld_mt1_push.py', EXAMPLES_ROOT_DIR / 'torch/mttrpo_metaworld_mt10.py', EXAMPLES_ROOT_DIR / 'torch/mttrpo_metaworld_mt50.py', + EXAMPLES_ROOT_DIR / 'torch/td3_halfcheetah.py', + EXAMPLES_ROOT_DIR / 'torch/td3_pendulum.py', EXAMPLES_ROOT_DIR / 'tf/te_ppo_point.py', EXAMPLES_ROOT_DIR / 'tf/te_ppo_metaworld_mt1_push.py', EXAMPLES_ROOT_DIR / 'tf/te_ppo_metaworld_mt10.py', From fa3304008124ad77221d8b15f01f9a561f012acd Mon Sep 17 00:00:00 2001 From: mishari <44849486+maliesa96@users.noreply.github.com> Date: Thu, 22 Oct 2020 13:08:11 -0700 Subject: [PATCH 19/23] Add Torch TD3 and DQN to README (#2150) * Add Torch TD3 and DQN to README * Apply suggestions from code review * Update README.md Co-authored-by: Ryan Julian --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5150ff3b78..6152b0cb0e 100644 --- a/README.md +++ b/README.md @@ -56,13 +56,13 @@ The table below summarizes the algorithms available in garage. | CMA-ES | numpy | | REINFORCE (a.k.a. 
VPG) | PyTorch, TensorFlow | | DDPG | PyTorch, TensorFlow | -| DQN | TensorFlow | +| DQN | PyTorch, TensorFlow | | DDQN | TensorFlow | | ERWR | TensorFlow | | NPO | TensorFlow | | PPO | PyTorch, TensorFlow | | REPS | TensorFlow | -| TD3 | TensorFlow | +| TD3 | PyTorch, TensorFlow | | TNPG | TensorFlow | | TRPO | PyTorch, TensorFlow | | MAML | PyTorch | From d843e5b67460c9216fb5f5b2a8982ffcb82531f4 Mon Sep 17 00:00:00 2001 From: Karthikeyan Singaravelan Date: Fri, 23 Oct 2020 01:49:37 +0530 Subject: [PATCH 20/23] Fix warning regarding ABC import from collections (#2146) --- tests/garage/envs/dm_control/test_dm_control_env.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/garage/envs/dm_control/test_dm_control_env.py b/tests/garage/envs/dm_control/test_dm_control_env.py index 9ecf066211..3b31c6d61e 100644 --- a/tests/garage/envs/dm_control/test_dm_control_env.py +++ b/tests/garage/envs/dm_control/test_dm_control_env.py @@ -1,4 +1,4 @@ -import collections +import collections.abc from copy import copy import pickle @@ -79,7 +79,7 @@ def test_does_not_modify_actions(self): a_copy = copy(a) env.reset() env.step(a) - if isinstance(a, collections.Iterable): + if isinstance(a, collections.abc.Iterable): assert a.all() == a_copy.all() else: assert a == a_copy @@ -94,7 +94,7 @@ def test_all_does_not_modify_actions(self, domain_name, task_name): a_copy = copy(a) env.reset() env.step(a) - if isinstance(a, collections.Iterable): + if isinstance(a, collections.abc.Iterable): assert a.all() == a_copy.all() else: assert a == a_copy From 792771dc7ecc1b695a56f8e54a5c249f0d7c914e Mon Sep 17 00:00:00 2001 From: Ruofu Wang <31981600+yeukfu@users.noreply.github.com> Date: Mon, 26 Oct 2020 21:11:54 -0700 Subject: [PATCH 21/23] Add docs for algos/CEM (#2141) * Add cem doc fix ppo doc title * Chmod numpy.png --- docs/index.md | 1 + docs/user/algo_cem.md | 48 +++++++++++++++++++++++++++++++++++++ docs/user/algo_ppo.md | 4 ++-- docs/user/images/numpy.png | Bin 0 -> 4653 bytes 4 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 docs/user/algo_cem.md create mode 100644 docs/user/images/numpy.png diff --git a/docs/index.md b/docs/index.md index d001eade5e..4cd7acd51f 100644 --- a/docs/index.md +++ b/docs/index.md @@ -62,6 +62,7 @@ and how to implement new MDPs and new algorithms. user/algo_vpg user/algo_td3 user/algo_ddpg + user/algo_cem .. toctree:: :maxdepth: 2 diff --git a/docs/user/algo_cem.md b/docs/user/algo_cem.md new file mode 100644 index 0000000000..4fd38e0f8c --- /dev/null +++ b/docs/user/algo_cem.md @@ -0,0 +1,48 @@ +# Cross Entropy Method + +```eval_rst ++-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Paper** | The cross-entropy method: A unified approach to Monte Carlo simulation, randomized optimization and machine learning :cite:`rubinstein2004cross` | ++-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Framework(s)** | .. 
figure:: ./images/numpy.png | +| | :scale: 40% | +| | :class: no-scaled-link | ++-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ +| **API Reference** | `garage.np.algos.CEM <../_autoapi/garage/np/algos/index.html#garage.np.algos.CEM>`_ | ++-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ +| **Code** | `garage/np/algos/cem.py `_ | ++-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+ +``` + +Cross Entropy Method (CEM) works by iteratively optimizing a gaussian +distribution of policy. + +In each epoch, CEM does the following: + +1. Sample n_samples policies from a gaussian distribution of mean cur_mean and +std cur_std. + +2. Collect episodes for each policy. + +3. Update cur_mean and cur_std by doing Maximum Likelihood Estimation over the +n_best top policies in terms of return. + +## Examples + +### NumPy + +```eval_rst +.. literalinclude:: ../../examples/np/cem_cartpole.py +``` + +## References + +```eval_rst +.. bibliography:: references.bib + :style: unsrt + :filter: docname in docnames +``` + +---- + +*This page was authored by Ruofu Wang ([@yeukfu](https://github.com/yeukfu)).* diff --git a/docs/user/algo_ppo.md b/docs/user/algo_ppo.md index abb4a9ea9f..2ee714846f 100644 --- a/docs/user/algo_ppo.md +++ b/docs/user/algo_ppo.md @@ -35,13 +35,13 @@ regularization adds the mean entropy to the surrogate objective. See Garage has implementations of PPO with PyTorch and TensorFlow. -## PyTorch +### PyTorch ```eval_rst .. literalinclude:: ../../examples/torch/ppo_pendulum.py ``` -## TensorFlow +### TensorFlow ```eval_rst .. 
literalinclude:: ../../examples/tf/ppo_pendulum.py diff --git a/docs/user/images/numpy.png b/docs/user/images/numpy.png new file mode 100644 index 0000000000000000000000000000000000000000..4214edd5588f3a1f8d2baae20b38c27cf04e301d GIT binary patch literal 4653 zcmai2byU>P*Z%AR%aYRFAR)DYNQm^(At4|or63C|9THzkx={p|mK2r}LApyorKMb2 z(p|a)etggGulJAleb1bk=giEx=gc#6=iYms*vES6WW-Fw0059_YN#6C;^1vJheB@0 zs}siQTflcz(p3V0+GLWywgk6x4totlT>yB+3jm>!0B~`;6uJojJ}>~-wgCXyOaP#F z&u%u5yWN1;X{)OOH~+T$w$gL}pj6dVRWkOU+sg^|o|w8Cz~mv*8bGp?KHCred8lHe z{8?AaNA|e%Ogf*4=KWGn_CR#RgapBuycl~S*cE^$4Myh4+R9p)E2e=bWHukbXJ4j= zPs`XAF1a7ZmMTx&$6TIYDUk%6@#?16lkElPOn7mehG5&8PTR4+51TNTSCAA99;#SX zuGIeY_k`ZXv*sWo=1%(nJVHJWmo(&0 z0|}(?yykq(wI%W9y3!Iq$r_*Tgr*k+r~vJc_`Mqkm*0?T&?CMwVI`FxONJJ*SPb1@5zF0Xc(B`~o37S~&<`$Ik$f1ZSw>Dl ztH!q~3@`BXD<7gaauUHxE3E;K=NLb<=6mLwn=@*vL5m#r8YYY-cYaaWs-}H;Y$gZk zB(-4{Fka7fo1?1urHN=&m=qf&SS1IVW7fED*@n<+#K2zQN#70Amgg=Zi2nBT#;zxl zNC{0!PAI5LzY7Z}q4&3^6(3uFq@wQ+^}ddbQ#B!Qr&qPSxQr zXvw;UYdix$j_0pdO(OT%ZBD;E)kv92OY2n0tlCl&KTx(8w3Mg(gL{K+A~4qB8)b_n zX)b|{Y25J!xTf6sgw|75WBiYu3`?c|;rr4ESTdvit6$(>+R9loMyL1cLLeJ>42$$t zIjY3?!7zgkh|~V^^p!TV8)|9htBu!Zhx76)5wMHS*SdXnnn@WJ9>c0<#46(!V1X*9 zToveaKsG$Ee%!Ij>HaQr!GcD9w4Kgm4Kjaq&PS)JBDf&x}t-I*%~Nz`$Mnh73msB#UxJDPQ1F~4gm zpkEu(wr$p;c*(OjLYo*RsyFd&lh)Zx3cWO(nJ%$!&&wI!O{nTGm;2bIDxCeJZymGWhed)JJtrSpE7mm?=4T9&J-jq3;nW3k>5*+I{7etK$BBWPBWd|535pk)!UFu=O zrmK74tadZ)3RFxoAenM*%Io*M)7|_0M$^fW)tn%6G6FnJN9^@4z_fnzp9&# zvac(E>?1HJ{vpE`f$IaE91<$dM%Ob+?bm)1l%jq!2{-m7L5cMWu)a=~`74XW#dV7S zQv>xjwEOWarp%r(NHA1Cji!Hh>4&ZTg74vYCLw!Ef5cl9Bfk0UfH@+d|ImmJw5`hsvgGd#j1{|d-w9(MppNS4y4vqu zyV)eLZW*tIi~bHu3sLC#X!`PSa<+XL&wvB?(ppMgG@VPoX~J*SEgj^ZFz=QW5HV&x z^9EUd-1Pn+Xy$Z+;+bTn>sgS@)Q2`Lu%yow=!`;9Jz?`KC>1`1XUxHnwWH8SC)(2+ z==TDF#uX@Ymb5_6~>E1UZGN)p0URTFvC`?L1P* zN=26Au0~dSOZ+pUvP{p^lX9&rm(gM&bvtk8=5X9R`(Ui5NS>y8CJAlv;7Fq?xsh?L z@3spDUajCN|6?LvcwLu1y>K}BC+o%Fp3q-Jk>mr?N1lka<7m&5YBzTl3uE4Qt{s-8 ztF;4h6dymXpcD1!LLjiGf^9P-KOF32(_?QcDB3ov9&}J|G-M!e#L~^dg@>IC;3Wir z{Ea_1JtcDf(qdcN2loWJ$UQa@GoUAokTcypGF3n7DNyN;@TqNxa!tyx4f&_#b$(>_ zebdt;@90WXJWVcRkO#qdpBWf^kj!FbV7ct$Omug|?n~=CiZM%|kFSeM#(cC3%R-(@ zwUhsRm)y8S;xK|TWZ)3!#^*W;D;R0zIyhM9X+nU`K#B&FsI0B3s<5%G{esU6fxTw5}T4?^pke& zK=&c(TM9Q$-d-L}jrX15RukT}#nYz{N>^P~mN;nRJ<`{giWHakD_?pTp|Ay{)PJOT z55yoPxFY?OY;73&2sA_%v-tdXOB&NfqTe7n{UL>oqO=g}29pPgUn`M%h|F^)yXH7( zwJHxe*?A*Lk)u-QJ}Rfb#x0($Mb{?4#yHFiNwd z3Pt>GHU%el9ccJ|Sd`BqOLJF^n@86>U@u7|Ss&}G#QRbD+HY4_dd4{(@0n6@-H$tR zdWYRT1qGj&}|jk`dOA7r94nMGDMNSFAwt7FCSWK$U=Nh8u)HG z=A@ymE3>ULF6*yxzsGzFrP8~XC9d+CXwF;KqK9r72s{5#e zhjTrWD`MK%G_4@_BnQm0@gy;AH8lo;5znU3)0^K51!y0N)ZDs4+^eOeeV63MnLuHP zZ7xAKI&`Dc(XZ^{5->hC#SUCtrR+25yg|2ZtaP?JM+{Q{ccZMhS#;4hdkC@j7LQ>4# zUG6qTfHS;DrdO>99U*?C17^14-^?xl6#UKM&pkfspdD2l`+l7_QADgDqjW{%d&7`(1pHKcw(PmcdPiNI?(~0!)f=rO#Ns}jb@t3w! 
z`#}Y@P_hgAqn+Q+YUoE0T=HzuF^!uP0>eh=3R77N=sG8yQcP9(bp8*f`F@I91jvOz zKyu%=h}Uf)%hlj$p>Uhl={e)SO^6=-a>18mzBgW&)Sj+`{BZqXI{)cBC7nEtf5pYUOIx72S_-E1n zHgPc*wW7-4Ozsq-iK%sbj;DJ1Y9vKosBf&?=juYe3C5AjwJ6c}|9ao&X;U|=)hkSb zqh?=tT|U$kPqYM|#28K9lsO^1*^+KYTTJfK<)Bb4mNB*%^KS?RvmBWC*b zPcoZPJwfN*ex%3Z_MQq_1S6#2v+K|~6s|Wq*~OY?v;^4go#b%v4ZxBGHk2pINkdY@ z;1RkUWJyFykJmEf&c`T&IIVn+wvz^5IbL68cw*QUzZh_zKU|EarxZk%2RI7ZMj~fU z#_YcSB}yHiw6a0J?onFN8ps%5Xjs+{^$?(Z?jr)eiH)%UKR_YO=Lg3csbSQLw!gc5 zp`CQE{bw)UcQ!{e--h28ZrLrSGhtLvaZ&a`l2zXE7GCZLurzp@DD5E z7$?IY;QZ17siarg8wLB~*%Xi`{XakE+(z&jc4VN{p}kEAPnh1&PbooJmP!}bMcW`R zk1k45&)eeWJ6@2w>TaNIWebkcQK>oHP`e5FMC#7sQB24u21ZT_Dge?Z&3l{wK^$(e z3HlQPNjwC~NIaMA*kc3Iivc*ypQQOFY}A?-A||0CPVqy~5~60}Z-)9?zF1GN#a6l{ zUtn|qn&f4PAL+0WjWH>j`{}}6jUCHDYHh>GM9w}4hPbapezo9zCNllYdGIOt%uPXKm9@b1 zKuSmOOp&~;XeRu~So*|`D|d$d*|e+Fjb@xm+So1OPp9#}6@QKmPeEVE3q_lGjuSbo ze=Q`Ziev4hZ{s8dj_=tAwV`p}J!9_jP*BqzKv3Tjv5KMxB>i?f6@n6{jB71WuC^YmsWsmjI zt-^BG8*b+9VC(%{*532^Edb&$ahNbnN?2UNSWHq@{DG{5m>>)$3xlcswU+)L2Chhl amoEbTcLS+{jaz*O05suxs#VH1;r{`_@u0W> literal 0 HcmV?d00001 From a63349a091df2f169c8f1530546ecbf0ae1e6a8d Mon Sep 17 00:00:00 2001 From: Ruofu Wang <31981600+yeukfu@users.noreply.github.com> Date: Tue, 27 Oct 2020 14:32:15 -0700 Subject: [PATCH 22/23] Add docs for logging and plotting (#2147) * Add docs for logging and plotting * Fix grammer error * Fix isort --- docs/index.md | 1 + docs/user/logging_plotting.md | 82 +++++++++++++++++++++++++++++++++++ src/garage/__init__.py | 17 +++----- 3 files changed, 90 insertions(+), 10 deletions(-) create mode 100644 docs/user/logging_plotting.md diff --git a/docs/index.md b/docs/index.md index 4cd7acd51f..cbbeb3ddc9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -72,6 +72,7 @@ and how to implement new MDPs and new algorithms. user/environment_libraries user/concept_experiment user/sampling + user/logging_plotting .. toctree:: :maxdepth: 2 diff --git a/docs/user/logging_plotting.md b/docs/user/logging_plotting.md new file mode 100644 index 0000000000..1d0c7d2f17 --- /dev/null +++ b/docs/user/logging_plotting.md @@ -0,0 +1,82 @@ +# Logging and plotting + +## Logging + +garage supports convenient and useful logging. garage uses [dowel](https://github.com/rlworkgroup/dowel) +for logging. The `logger` supports many outputs, including + +- Std output +- Text output +- Csv output +- TensorBoard output + +In garage's experiment, the `logger` will output to all of these. + +Here is an example of logging in garage. + +```py +from garage import wrap_experiment +from dowel import logger, tabular + +@wrap_experiment +def log_experiment(ctxt=None): + for i in range(100): + # Log str directly + logger.log('Logging messages:') + # Log scalar values with the key 'AverageReturn' + tabular.record('AverageReturn', i) + + # The Trainer will do these steps for you, if you log things in + # the algorithms. + logger.log(tabular) + logger.dump_all() + +log_experiment() +``` + +Running the example will generate outputs like: + +```sh +2020-10-21 14:06:04 | [log_experiment] Logging to [CUR_DIR]/data/local/experiment/log_experiment +2020-10-21 14:06:04 | [log_experiment] Logging messages: +------------- - +AverageReturn 0 +------------- - +2020-10-21 14:06:04 | [log_experiment] Logging messages: +------------- - +AverageReturn 1 +------------- - +2020-10-21 14:06:04 | [log_experiment] Logging messages: +------------- - +AverageReturn 2 +------------- - +``` + +To look at outputs with TensorBoard, you can refer to this [page](monitor_experiments_with_tensorboard). 
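Outside of `wrap_experiment`, outputs can also be attached to the `logger` by hand. A minimal sketch, assuming dowel's standard output classes and with illustrative file names (this snippet is an editorial example, not part of the committed doc):

```py
import dowel
from dowel import logger, tabular

# Send log output to stdout, a CSV file, and a TensorBoard event directory.
logger.add_output(dowel.StdOutput())
logger.add_output(dowel.CsvOutput('progress.csv'))
logger.add_output(dowel.TensorBoardOutput('tb'))

tabular.record('AverageReturn', 42)
logger.log(tabular)
logger.dump_all()

logger.remove_all()  # close the outputs when finished
```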
+ +To set a customized log directory, just pass a `log_dir` argument to the +experiment. + +```py +@wrap_experiment(log_dir='my_custom_log_fir') +``` + +## Plotting + +In garage, as long as the environment implement the `visualize()` method, is +it easy to plot a policy running in the environment when training. + +To visualize an experiment, just set the `plot` argument to `True` in the +[`train`](../_autoapi/garage/index.html#garage.Trainer.train) method of +`Trainer`. For example, in [example/tf/trpo_cartpole.py](https://github.com/rlworkgroup/garage/blob/master/examples/tf/trpo_cartpole.py), +change the train line into: + +```py +trainer.train(n_epochs=100, batch_size=4000, plot=True) +``` + +If you want to pause in every epoch, just set `pause_for_plot` to `True`. + +---- + +*This page was authored by Ruofu Wang ([@yeukfu](https://github.com/yeukfu)).* diff --git a/src/garage/__init__.py b/src/garage/__init__.py index 4f1ce5ac5b..5481bd61dc 100644 --- a/src/garage/__init__.py +++ b/src/garage/__init__.py @@ -1,18 +1,13 @@ """Garage Base.""" # yapf: disable -from garage._dtypes import (EpisodeBatch, - InOutSpec, - StepType, - TimeStep, +from garage._dtypes import (EpisodeBatch, InOutSpec, StepType, TimeStep, TimeStepBatch) from garage._environment import Environment, EnvSpec, EnvStep, Wrapper -from garage._functions import (_Default, - log_multitask_performance, - log_performance, - make_optimizer, - obtain_evaluation_episodes, - rollout) +from garage._functions import (_Default, log_multitask_performance, + log_performance, make_optimizer, + obtain_evaluation_episodes, rollout) from garage.experiment.experiment import wrap_experiment +from garage.trainer import TFTrainer, Trainer # yapf: enable @@ -33,4 +28,6 @@ 'Wrapper', 'rollout', 'obtain_evaluation_episodes', + 'Trainer', + 'TFTrainer', ] From 4312678397f78ea52cd9f47e1d53583bd146b8e9 Mon Sep 17 00:00:00 2001 From: Ruofu Wang <31981600+yeukfu@users.noreply.github.com> Date: Wed, 28 Oct 2020 11:57:56 -0700 Subject: [PATCH 23/23] Refactor RL2 to use EpisodeBatch (#2138) * Refactor RL2 to use EpisodeBatch * Fix isort --- .../torch/maml_trpo_metaworld_ml1_push.py | 3 +- src/garage/tf/algos/reps.py | 4 +- src/garage/tf/algos/rl2.py | 90 ++++++++++--------- src/garage/tf/algos/te_npo.py | 16 +--- src/garage/torch/algos/bc.py | 8 +- 5 files changed, 55 insertions(+), 66 deletions(-) diff --git a/examples/torch/maml_trpo_metaworld_ml1_push.py b/examples/torch/maml_trpo_metaworld_ml1_push.py index 93de39f2d3..840dd5746f 100755 --- a/examples/torch/maml_trpo_metaworld_ml1_push.py +++ b/examples/torch/maml_trpo_metaworld_ml1_push.py @@ -8,8 +8,7 @@ from garage import wrap_experiment from garage.envs import MetaWorldSetTaskEnv -from garage.experiment import (MetaEvaluator, - MetaWorldTaskSampler, +from garage.experiment import (MetaEvaluator, MetaWorldTaskSampler, SetTaskSampler) from garage.experiment.deterministic import set_seed from garage.torch.algos import MAMLTRPO diff --git a/src/garage/tf/algos/reps.py b/src/garage/tf/algos/reps.py index 4c99b36b0f..dd639be030 100644 --- a/src/garage/tf/algos/reps.py +++ b/src/garage/tf/algos/reps.py @@ -10,9 +10,7 @@ from garage import _Default, log_performance, make_optimizer from garage.np.algos import RLAlgorithm from garage.sampler import RaySampler -from garage.tf import (compile_function, - flatten_inputs, - graph_inputs, +from garage.tf import (compile_function, flatten_inputs, graph_inputs, new_tensor) from garage.tf.optimizers import LBFGSOptimizer diff --git a/src/garage/tf/algos/rl2.py 
b/src/garage/tf/algos/rl2.py index d67a93b584..d07026eef1 100644 --- a/src/garage/tf/algos/rl2.py +++ b/src/garage/tf/algos/rl2.py @@ -12,7 +12,6 @@ from garage import (EnvSpec, EnvStep, EpisodeBatch, log_multitask_performance, StepType, Wrapper) -from garage.np import concat_tensor_dict_list, discount_cumsum from garage.np.algos import MetaRLAlgorithm from garage.sampler import DefaultWorker from garage.tf.algos._rl2npo import RL2NPO @@ -339,7 +338,7 @@ def train(self, trainer): if trainer.step_itr % self._n_epochs_per_eval == 0: if self._meta_evaluator is not None: self._meta_evaluator.evaluate(self) - trainer.step_episode = trainer.obtain_samples( + trainer.step_episode = trainer.obtain_episodes( trainer.step_itr, env_update=self._task_sampler.sample(self._meta_batch_size)) last_return = self.train_once(trainer.step_itr, @@ -348,18 +347,18 @@ def train(self, trainer): return last_return - def train_once(self, itr, paths): + def train_once(self, itr, episodes): """Perform one step of policy optimization given one batch of samples. Args: itr (int): Iteration number. - paths (list[dict]): A list of collected paths. + episodes (EpisodeBatch): Batch of episodes. Returns: numpy.float64: Average return. """ - episodes, average_return = self._process_samples(itr, paths) + episodes, average_return = self._process_samples(itr, episodes) logger.log('Optimizing policy...') self._inner_algo.optimize_policy(episodes) return average_return @@ -400,16 +399,17 @@ def adapt_policy(self, exploration_policy, exploration_episodes): return RL2AdaptedPolicy(exploration_policy._policy) # pylint: disable=protected-access - def _process_samples(self, itr, paths): + def _process_samples(self, itr, episodes): # pylint: disable=too-many-statements """Return processed sample data based on the collected paths. Args: itr (int): Iteration number. - paths (OrderedDict[dict]): A list of collected paths for each - task. In RL^2, there are n environments/tasks and paths in - each of them will be concatenated at some point and fed to - the policy. + episodes (EpisodeBatch): Original collected episode batch for each + task. For each episode, episode.agent_infos['batch_idx'] + indicates which task this episode belongs to. In RL^2, there + are n environments/tasks and paths in each of them will be + concatenated at some point and fed to the policy. 
Returns: EpisodeBatch: Processed batch of episodes for feeding the inner @@ -423,13 +423,12 @@ def _process_samples(self, itr, paths): concatenated_paths = [] paths_by_task = collections.defaultdict(list) - for path in paths: - path['returns'] = discount_cumsum(path['rewards'], self._discount) - path['lengths'] = [len(path['rewards'])] - if 'batch_idx' in path: - paths_by_task[path['batch_idx']].append(path) - elif 'batch_idx' in path['agent_infos']: - paths_by_task[path['agent_infos']['batch_idx'][0]].append(path) + for episode in episodes.split(): + if hasattr(episode, 'batch_idx'): + paths_by_task[episode.batch_idx[0]].append(episode) + elif 'batch_idx' in episode.agent_infos: + paths_by_task[episode.agent_infos['batch_idx'][0]].append( + episode) else: raise ValueError( 'Batch idx is required for RL2 but not found, ' @@ -437,10 +436,12 @@ def _process_samples(self, itr, paths): 'for sampling') # all path in paths_by_task[i] are sampled from task[i] - for _paths in paths_by_task.values(): - concatenated_path = self._concatenate_paths(_paths) + for episode_list in paths_by_task.values(): + concatenated_path = self._concatenate_paths(episode_list) concatenated_paths.append(concatenated_path) + concatenated_episodes = EpisodeBatch.concatenate(*concatenated_paths) + name_map = None if hasattr(self._task_sampler, '_envs') and hasattr( self._task_sampler._envs[0]._env, 'all_task_names'): @@ -450,17 +451,13 @@ def _process_samples(self, itr, paths): name_map = dict(enumerate(names)) undiscounted_returns = log_multitask_performance( - itr, - EpisodeBatch.from_list(self._env_spec, paths), - self._inner_algo._discount, - name_map=name_map) + itr, episodes, self._inner_algo._discount, name_map=name_map) average_return = np.mean(undiscounted_returns) - episodes = EpisodeBatch.from_list(self._env_spec, concatenated_paths) - return episodes, average_return + return concatenated_episodes, average_return - def _concatenate_paths(self, paths): + def _concatenate_paths(self, episode_list): """Concatenate paths. The input paths are from different episodes but same task/environment. @@ -468,8 +465,8 @@ def _concatenate_paths(self, paths): path and fed to the policy. Args: - paths (dict): Input paths. All paths are from different episodes, - but the same task/environment. + episode_list (list[EpisodeBatch]): Input paths. All paths are from + different episodes, but the same task/environment. Returns: dict: Concatenated paths from the same task/environment. 
Shape of @@ -479,23 +476,30 @@ def _concatenate_paths(self, paths): values of shape :math:`[max_episode_length, S^*]` """ - observations = np.concatenate([path['observations'] for path in paths]) + env_infos = { + k: np.concatenate([b.env_infos[k] for b in episode_list]) + for k in episode_list[0].env_infos.keys() + } + agent_infos = { + k: np.concatenate([b.agent_infos[k] for b in episode_list]) + for k in episode_list[0].agent_infos.keys() + } actions = np.concatenate([ - self._env_spec.action_space.flatten_n(path['actions']) - for path in paths + self._env_spec.action_space.flatten_n(ep.actions) + for ep in episode_list ]) - valids = np.concatenate( - [np.ones_like(path['rewards']) for path in paths]) - baselines = np.concatenate( - [np.zeros_like(path['rewards']) for path in paths]) - - concatenated_path = concat_tensor_dict_list(paths) - concatenated_path['observations'] = observations - concatenated_path['actions'] = actions - concatenated_path['valids'] = valids - concatenated_path['baselines'] = baselines - - return concatenated_path + + return EpisodeBatch( + env_spec=episode_list[0].env_spec, + observations=np.concatenate( + [ep.observations for ep in episode_list]), + last_observations=episode_list[-1].last_observations, + actions=actions, + rewards=np.concatenate([ep.rewards for ep in episode_list]), + env_infos=env_infos, + agent_infos=agent_infos, + step_types=np.concatenate([ep.step_types for ep in episode_list]), + lengths=np.asarray([sum([ep.lengths[0] for ep in episode_list])])) @property def policy(self): diff --git a/src/garage/tf/algos/te_npo.py b/src/garage/tf/algos/te_npo.py index f4d526a952..962aaf2823 100644 --- a/src/garage/tf/algos/te_npo.py +++ b/src/garage/tf/algos/te_npo.py @@ -9,21 +9,13 @@ from garage import InOutSpec, log_performance from garage.experiment import deterministic -from garage.np import (discount_cumsum, - explained_variance_1d, - rrse, +from garage.np import (discount_cumsum, explained_variance_1d, rrse, sliding_window) from garage.np.algos import RLAlgorithm from garage.sampler import LocalSampler -from garage.tf import (center_advs, - compile_function, - compute_advantages, - concat_tensor_list, - discounted_returns, - flatten_inputs, - graph_inputs, - pad_tensor_dict, - positive_advs, +from garage.tf import (center_advs, compile_function, compute_advantages, + concat_tensor_list, discounted_returns, flatten_inputs, + graph_inputs, pad_tensor_dict, positive_advs, stack_tensor_dict_list) from garage.tf.embeddings import StochasticEncoder from garage.tf.optimizers import LBFGSOptimizer diff --git a/src/garage/torch/algos/bc.py b/src/garage/torch/algos/bc.py index 55ee264667..e65bc142c4 100644 --- a/src/garage/torch/algos/bc.py +++ b/src/garage/torch/algos/bc.py @@ -6,12 +6,8 @@ import numpy as np import torch -from garage import (_Default, - EpisodeBatch, - log_performance, - make_optimizer, - obtain_evaluation_episodes, - TimeStepBatch) +from garage import (_Default, EpisodeBatch, log_performance, make_optimizer, + obtain_evaluation_episodes, TimeStepBatch) from garage.np.algos.rl_algorithm import RLAlgorithm from garage.np.policies import Policy from garage.sampler import RaySampler
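The RL^2 refactor above replaces dict-based paths with `EpisodeBatch` objects keyed by a per-episode `batch_idx` written into `agent_infos` by the RL^2 sampler worker. A standalone sketch of the grouping step (the helper name is invented for illustration; garage itself does this inline in `_process_samples`):

```python
# Sketch only: restates the grouping logic from the refactored _process_samples.
import collections


def group_episodes_by_task(episodes):
    """Group an EpisodeBatch by the RL^2 task index each episode carries.

    `episodes.agent_infos['batch_idx']` marks which of the meta-batch
    environments produced each episode.
    """
    by_task = collections.defaultdict(list)
    for episode in episodes.split():  # yields one single-episode batch at a time
        by_task[episode.agent_infos['batch_idx'][0]].append(episode)
    return by_task
```

Each per-task list is then flattened into one long "trial" by `_concatenate_paths`, which sums the lengths into a single entry; that is why it builds the `EpisodeBatch` by hand instead of simply calling `EpisodeBatch.concatenate` within a task.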