Commit 089c20f

Convert StochasticPolicy to use PolicyInput type

I also reworked significant parts of CNNModule because there were some bugs and it was somewhat hard to use.

1 parent 89c71d4 commit 089c20f

28 files changed, +543 -543 lines changed

docs/user/implement_algo.md

Lines changed: 4 additions & 1 deletion
@@ -195,6 +195,7 @@ import numpy as np
 
 from garage.samplers import LocalSampler
 from garage.np import discount_cumsum
+from garage.torch import PolicyMode, PolicyInput
 
 class SimpleVPG:
 
@@ -220,7 +221,9 @@ class SimpleVPG:
             returns = torch.Tensor(returns_numpy.copy())
             obs = torch.Tensor(path['observations'])
             actions = torch.Tensor(path['actions'])
-            dist = self.policy(obs)[0]
+            policy_input = PolicyInput(PolicyMode.FULL, obs,
+                                       lengths=[len(path)])
+            dist = self.policy(policy_input)[0]
             log_likelihoods = dist.log_prob(actions)
             loss = (-log_likelihoods * returns).mean()
             loss.backward()

examples/torch/tutorial_vpg.py

Lines changed: 5 additions & 1 deletion
@@ -8,6 +8,7 @@
 from garage.experiment.deterministic import set_seed
 from garage.np import discount_cumsum
 from garage.sampler import LocalSampler
+from garage.torch import PolicyInput, PolicyMode
 from garage.torch.policies import GaussianMLPPolicy
 from garage.trainer import Trainer
 
@@ -62,7 +63,10 @@ def _train_once(self, samples):
             returns = torch.Tensor(returns_numpy.copy())
             obs = torch.Tensor(path['observations'])
             actions = torch.Tensor(path['actions'])
-            dist = self.policy(obs)[0]
+            policy_input = PolicyInput(PolicyMode.FULL,
+                                       obs,
+                                       lengths=[len(path)])
+            dist = self.policy(policy_input)[0]
             log_likelihoods = dist.log_prob(actions)
             loss = (-log_likelihoods * returns).mean()
             loss.backward()
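For reference, the same loss computation can be written as a small standalone helper. This is a hypothetical sketch, not code from the commit: `vpg_loss_for_path` and its arguments are made up for illustration, the episode length is passed as an explicit integer tensor (the form that the new `PolicyInput.__post_init__` validation in `src/garage/torch/_dtypes.py` below checks for), and the `[0]` index assumes the policy returns a `(distribution, info)` pair as in the tutorial above.

```python
import torch

from garage.np import discount_cumsum
from garage.torch import PolicyInput, PolicyMode


def vpg_loss_for_path(policy, path, discount):
    """Per-episode vanilla policy gradient loss using the PolicyInput API."""
    returns_numpy = discount_cumsum(path['rewards'], discount)
    returns = torch.Tensor(returns_numpy.copy())
    obs = torch.Tensor(path['observations'])
    actions = torch.Tensor(path['actions'])

    # FULL mode: obs holds one complete episode, so lengths has one entry.
    policy_input = PolicyInput(PolicyMode.FULL, obs,
                               lengths=torch.tensor([len(obs)]))
    dist = policy(policy_input)[0]

    log_likelihoods = dist.log_prob(actions)
    return (-log_likelihoods * returns).mean()
```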

setup.cfg

Lines changed: 2 additions & 2 deletions
@@ -6,12 +6,12 @@ per-file-ignores =
     # See https://gitlab.com/pycqa/flake8/-/issues/494
     #
     # errors on valid property docstrings
-    src/garage/*:D403
+    src/garage/*:D403,R1720
     # unit tests don't need docstrings
     tests/garage/*:D, F401, F811
     # interferes with idiomatic `from torch.nn import functional as F`
     examples/torch/*:N812
-    src/garage/torch/*:N812,D403
+    src/garage/torch/*:N812,D403,R1720
     tests/garage/torch/*:N812,D
 
     # Docstring style checks

src/garage/torch/__init__.py

Lines changed: 14 additions & 9 deletions
@@ -1,19 +1,24 @@
 """PyTorch-backed modules and algorithms."""
 # yapf: disable
-from garage.torch._functions import (compute_advantages, dict_np_to_torch,
+from garage.torch._dtypes import (PolicyInput, PolicyMode,
+                                  ShuffledOptimizationNotSupported)
+from garage.torch._functions import (as_torch, as_torch_dict,
+                                     compute_advantages, expand_var,
                                      filter_valids, flatten_batch,
                                      flatten_to_single_vector, global_device,
-                                     NonLinearity, np_to_torch, pad_to_last,
-                                     prefer_gpu, product_of_gaussians,
-                                     set_gpu_mode, soft_update_model,
-                                     torch_to_np, TransposeImage,
-                                     update_module_params)
+                                     NonLinearity, output_height_2d,
+                                     output_width_2d, pad_to_last, prefer_gpu,
+                                     product_of_gaussians, set_gpu_mode,
+                                     soft_update_model, torch_to_np,
+                                     TransposeImage, update_module_params)
 
 # yapf: enable
 __all__ = [
-    'compute_advantages', 'dict_np_to_torch', 'filter_valids', 'flatten_batch',
-    'global_device', 'np_to_torch', 'pad_to_last', 'prefer_gpu',
+    'compute_advantages', 'as_torch_dict', 'filter_valids', 'flatten_batch',
+    'global_device', 'as_torch', 'pad_to_last', 'prefer_gpu',
     'product_of_gaussians', 'set_gpu_mode', 'soft_update_model', 'torch_to_np',
     'update_module_params', 'NonLinearity', 'flatten_to_single_vector',
-    'TransposeImage'
+    'TransposeImage', 'PolicyMode', 'PolicyInput',
+    'ShuffledOptimizationNotSupported', 'output_width_2d', 'output_height_2d',
+    'expand_var'
 ]
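For downstream code, the practical effect of this file is which names can be imported from `garage.torch`. A quick sketch based on the updated `__all__`:

```python
from garage.torch import (PolicyInput, PolicyMode,
                          ShuffledOptimizationNotSupported,
                          as_torch, as_torch_dict, expand_var,
                          output_height_2d, output_width_2d)
```

Code that previously imported `np_to_torch` or `dict_np_to_torch` now needs the `as_torch` / `as_torch_dict` names, as the bc.py, ddpg.py, and dqn.py diffs below show.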

src/garage/torch/_dtypes.py

Lines changed: 108 additions & 0 deletions
@@ -0,0 +1,108 @@
+"""Data structures used in garage.torch."""
+from dataclasses import dataclass
+import enum
+
+import torch
+from torch import nn
+
+
+class ShuffledOptimizationNotSupported(ValueError):
+    """Raised by recurrent policies if they're passed a shuffled batch."""
+
+
+class PolicyMode(enum.IntEnum):
+    """Defines what mode a PolicyInput is being used in.
+
+    See :class:`PolicyInput` for detailed documentation.
+
+    """
+    # Policy is being used to run a rollout.
+    # observations contains the last observations, and all_observations
+    # contains partial episodes batched using lengths.
+    ROLLOUT = 0
+    # Policy is being used to do an optimization with timesteps from different
+    # episodes. Recurrent policies must raise
+    # ShuffledOptimizationNotSupported if they encounter this mode.
+    SHUFFLED = 1
+    # Policy is being used to do an optimization on complete episodes.
+    FULL = 2
+
+
+@dataclass
+class PolicyInput:
+    r"""The (differentiable) input to all pytorch policies.
+
+    Args:
+        mode (PolicyMode): The mode this batch is being used in. Determines the
+            shape of observations.
+        observations (torch.Tensor): A torch tensor containing flattened
+            observations in a batch. Stateless policies should always operate
+            on this input. Shape depends on the mode:
+            * If `mode == ROLLOUT`, has shape :math:`(V, O)` (where V is the
+              vectorization level).
+            * If `mode == SHUFFLED`, has shape :math:`(B, O)` (where B is the
+              mini-batch size).
+            * If mode == FULL, has shape :math:`(N \bullet [T], O)` (where N
+              is the number of episodes, and T is the episode lengths).
+        lengths (torch.Tensor or None): Integer tensor containing the lengths
+            of each episode. Only has a value if `mode == FULL`.
+
+    """
+
+    mode: PolicyMode
+    observations: torch.Tensor
+    lengths: torch.Tensor = None
+
+    def __post_init__(self):
+        """Check that lengths is consistent with the rest of the fields.
+
+        Raises:
+            ValueError: If lengths is not consistent with another field.
+
+        """
+        if self.mode == PolicyMode.FULL:
+            if self.lengths is None:
+                raise ValueError(
+                    'lengths is None, but must be a torch.Tensor when '
+                    'mode == PolicyMode.FULL')
+            assert self.lengths is not None
+            if self.lengths.dtype not in (torch.uint8, torch.int8, torch.int16,
+                                          torch.int32, torch.int64):
+                raise ValueError(
+                    f'lengths has dtype {self.lengths.dtype}, but must have '
+                    f'an integer dtype')
+            total_size = sum(self.lengths)
+            if self.observations.shape[0] != total_size:
+                raise ValueError(
+                    f'observations has batch size '
+                    f'{self.observations.shape[0]}, but must have batch '
+                    f'size {total_size} to match lengths')
+            assert self.observations.shape[0] == total_size
+        elif self.lengths is not None:
+            raise ValueError(
+                f'lengths has value {self.lengths}, but must be None '
+                f'when mode == {self.mode}')
+
+    def to_packed_sequence(self):
+        """Turn full observations into a torch.nn.utils.rnn.PackedSequence.
+
+        Raises:
+            ShuffledOptimizationNotSupported: If called when `mode != FULL`
+
+        Returns:
+            torch.nn.utils.rnn.PackedSequence: The sequence of flattened
+                observations.
+
+        """
+        if self.mode != PolicyMode.FULL:
+            raise ShuffledOptimizationNotSupported(
+                f'mode has value {self.mode} but must have mode '
+                f'{PolicyMode.FULL} to use to_packed_sequence')
+        sequence = []
+        start = 0
+        for length in self.lengths:
+            stop = start + length
+            sequence.append(self.observations[start:stop])
+            start = stop
+        pack_sequence = nn.utils.rnn.pack_sequence
+        return pack_sequence(sequence, enforce_sorted=False)
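A minimal usage sketch of the new types, based only on the code above (the shapes and lengths are made up for illustration):

```python
import torch

from garage.torch import PolicyInput, PolicyMode

# Two episodes of lengths 3 and 2 with 4-dimensional observations,
# concatenated along the batch axis, as PolicyMode.FULL expects.
obs = torch.zeros(5, 4)
lengths = torch.tensor([3, 2], dtype=torch.int64)
full_input = PolicyInput(PolicyMode.FULL, obs, lengths=lengths)

# Recurrent policies can consume the episodes as a PackedSequence.
packed = full_input.to_packed_sequence()
print(packed.batch_sizes)  # tensor([2, 2, 1])

# lengths is only valid in FULL mode; __post_init__ rejects it otherwise.
try:
    PolicyInput(PolicyMode.SHUFFLED, obs, lengths=lengths)
except ValueError as err:
    print(err)
```

Stateless policies can keep operating directly on `observations` in any mode; recurrent policies can call `to_packed_sequence()` in FULL mode and raise `ShuffledOptimizationNotSupported` when handed a SHUFFLED batch.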

src/garage/torch/_functions.py

Lines changed: 93 additions & 4 deletions
@@ -11,6 +11,8 @@
 """
 import copy
 import dataclasses
+import math
+import warnings
 
 import akro
 import torch
@@ -134,7 +136,7 @@ def filter_valids(tensor, valids):
     return [tensor[i][:valid] for i, valid in enumerate(valids)]
 
 
-def np_to_torch(array):
+def as_torch(array):
     """Numpy arrays to PyTorch tensors.
 
     Args:
@@ -144,10 +146,10 @@ def np_to_torch(array):
         torch.Tensor: float tensor on the global device.
 
     """
-    return torch.from_numpy(array).float().to(global_device())
+    return torch.as_tensor(array).float().to(global_device())
 
 
-def dict_np_to_torch(array_dict):
+def as_torch_dict(array_dict):
     """Convert a dict whose values are numpy arrays to PyTorch tensors.
 
     Modifies array_dict in place.
@@ -160,7 +162,7 @@ def dict_np_to_torch(array_dict):
 
     """
     for key, value in array_dict.items():
-        array_dict[key] = np_to_torch(value)
+        array_dict[key] = as_torch(value)
     return array_dict
 
 
@@ -401,3 +403,90 @@ def step(self, action):
         env_step = super().step(action)
         obs = env_step.observation.transpose(2, 0, 1)
         return dataclasses.replace(env_step, observation=obs)
+
+
+def output_height_2d(layer, height):
+    """Compute the output height of a torch.nn.Conv2d, assuming NCHW format.
+
+    This requires knowing the input height. Because NCHW format makes this very
+    easy to mix up, this is a separate function from output_width_2d.
+
+    It also works on torch.nn.MaxPool2d.
+
+    This function implements the formula described in the torch.nn.Conv2d
+    documentation:
+    https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
+
+    Args:
+        layer (torch.nn.Conv2d): The layer to compute output size for.
+        height (int): The height of the input image.
+
+    Returns:
+        int: The height of the output image.
+
+    """
+    assert isinstance(layer, (torch.nn.Conv2d, torch.nn.MaxPool2d))
+    return math.floor((height + 2 * layer.padding[0] - layer.dilation[0] *
+                       (layer.kernel_size[0] - 1) - 1) / layer.stride[0] + 1)
+
+
+def output_width_2d(layer, width):
+    """Compute the output width of a torch.nn.Conv2d, assuming NCHW format.
+
+    This requires knowing the input width. Because NCHW format makes this very
+    easy to mix up, this is a separate function from output_height_2d.
+
+    It also works on torch.nn.MaxPool2d.
+
+    This function implements the formula described in the torch.nn.Conv2d
+    documentation:
+    https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
+
+    Args:
+        layer (torch.nn.Conv2d): The layer to compute output size for.
+        width (int): The width of the input image.
+
+    Returns:
+        int: The width of the output image.
+
+    """
+    assert isinstance(layer, (torch.nn.Conv2d, torch.nn.MaxPool2d))
+    return math.floor((width + 2 * layer.padding[1] - layer.dilation[1] *
+                       (layer.kernel_size[1] - 1) - 1) / layer.stride[1] + 1)
+
+
+def expand_var(name, item, n_expected, reference):
+    """Expand a variable to an expected length.
+
+    This is used to handle arguments to primitives that can all be reasonably
+    set to the same value, or multiple different values.
+
+    Args:
+        name (str): Name of variable being expanded.
+        item (any): Element being expanded.
+        n_expected (int): Number of elements expected.
+        reference (str): Source of n_expected.
+
+    Returns:
+        list: List of references to item or item itself.
+
+    Raises:
+        ValueError: If the variable is a sequence but length of the variable
+            is not 1 or n_expected.
+
+    """
+    if n_expected == 1:
+        warnings.warn(
+            f'Providing a {reference} of length 1 prevents {name} from '
+            f'being expanded.')
+    if isinstance(item, (list, tuple)):
+        if len(item) == n_expected:
+            return item
+        elif len(item) == 1:
+            return list(item) * n_expected
+        else:
+            raise ValueError(
+                f'{name} is length {len(item)} but should be length '
+                f'{n_expected} to match {reference}')
+    else:
+        return [item] * n_expected
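Two of the new helpers are easy to show in isolation. The sketch below assumes only the code above; the layer sizes and the `'hidden_channels'` reference string are made-up examples:

```python
from torch import nn

from garage.torch import expand_var, output_height_2d, output_width_2d

# Track the spatial size of a 32x32 input through two conv layers. Conv2d
# stores kernel_size/stride/padding/dilation as tuples, which is what
# output_height_2d and output_width_2d index into.
conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=5, stride=2)
conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2)
h = output_height_2d(conv1, 32)  # floor((32 - 4 - 1) / 2 + 1) == 14
w = output_width_2d(conv1, 32)   # 14
h = output_height_2d(conv2, h)   # 6
w = output_width_2d(conv2, w)    # 6

# expand_var turns a scalar hyperparameter into one value per primitive,
# or validates a sequence that was passed explicitly.
kernel_sizes = expand_var('kernel_sizes', 3, 2, 'hidden_channels')  # [3, 3]
strides = expand_var('strides', (1, 2), 2, 'hidden_channels')       # (1, 2)
```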

src/garage/torch/algos/bc.py

Lines changed: 3 additions & 3 deletions
@@ -11,7 +11,7 @@
 from garage.np.algos.rl_algorithm import RLAlgorithm
 from garage.np.policies import Policy
 from garage.sampler import RaySampler
-from garage.torch import np_to_torch
+from garage.torch import as_torch
 
 # yapf: enable
 
@@ -126,8 +126,8 @@ def _train_once(self, trainer, epoch):
         minibatches = np.array_split(indices, self._minibatches_per_epoch)
         losses = []
         for minibatch in minibatches:
-            observations = np_to_torch(batch.observations[minibatch])
-            actions = np_to_torch(batch.actions[minibatch])
+            observations = as_torch(batch.observations[minibatch])
+            actions = as_torch(batch.actions[minibatch])
             self._optimizer.zero_grad()
             loss = self._compute_loss(observations, actions)
             loss.backward()
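The rename is not purely cosmetic: `as_torch` now goes through `torch.as_tensor` rather than `torch.from_numpy`, so it accepts existing tensors and plain Python sequences as well as numpy arrays, and still returns a float tensor on the global device. A small sketch (the batch contents are made up):

```python
import numpy as np
import torch

from garage.torch import as_torch, as_torch_dict

a = as_torch(np.arange(4))   # tensor([0., 1., 2., 3.])
b = as_torch(torch.ones(3))  # already a tensor; from_numpy would have failed

# as_torch_dict converts every value of a transition dict in place.
batch = as_torch_dict({'observations': np.zeros((2, 4)),
                       'rewards': np.array([1.0, 0.5])})
```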

src/garage/torch/algos/ddpg.py

Lines changed: 2 additions & 2 deletions
@@ -10,7 +10,7 @@
                     obtain_evaluation_episodes)
 from garage.np.algos import RLAlgorithm
 from garage.sampler import FragmentWorker, LocalSampler
-from garage.torch import dict_np_to_torch, torch_to_np
+from garage.torch import as_torch_dict, torch_to_np
 
 # yapf: enable
 
@@ -229,7 +229,7 @@ def optimize_policy(self, samples_data):
             qval: Q-value predicted by the Q-network.
 
         """
-        transitions = dict_np_to_torch(samples_data)
+        transitions = as_torch_dict(samples_data)
 
         observations = transitions['observations']
         rewards = transitions['rewards'].reshape(-1, 1)

src/garage/torch/algos/dqn.py

Lines changed: 6 additions & 6 deletions
@@ -11,7 +11,7 @@
 from garage._functions import obtain_evaluation_episodes
 from garage.np.algos import RLAlgorithm
 from garage.sampler import FragmentWorker
-from garage.torch import global_device, np_to_torch
+from garage.torch import as_torch, global_device
 
 
 class DQN(RLAlgorithm):
@@ -240,12 +240,12 @@ def _optimize_qf(self, timesteps):
             qval: Q-value predicted by the Q-network.
 
         """
-        observations = np_to_torch(timesteps.observations)
-        rewards = np_to_torch(timesteps.rewards).reshape(-1, 1)
+        observations = as_torch(timesteps.observations)
+        rewards = as_torch(timesteps.rewards).reshape(-1, 1)
         rewards *= self._reward_scale
-        actions = np_to_torch(timesteps.actions)
-        next_observations = np_to_torch(timesteps.next_observations)
-        terminals = np_to_torch(timesteps.terminals).reshape(-1, 1)
+        actions = as_torch(timesteps.actions)
+        next_observations = as_torch(timesteps.next_observations)
+        terminals = as_torch(timesteps.terminals).reshape(-1, 1)
 
         next_inputs = next_observations
         inputs = observations
