feat(cbf): support crabs as a representative control barrier function based algorithm (#327)
PKU-Alignment · Apr 30, 2024 · 89717ba · 89717ba
1 parent c6c8aa9
commit 89717ba
Show file tree

Hide file tree

Showing 20 changed files with 3,037 additions and 27 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ ci:
 default_stages: [commit, push, manual]
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
     hooks:
       - id: check-symlinks
       - id: destroyed-symlinks
@@ -29,16 +29,22 @@ repos:
       - id: debug-statements
       - id: double-quote-string-fixer
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: v0.3.5
+    rev: v0.4.2
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
+        exclude: |
+          (?x)(
+            ^omnisafe/algorithms/off_policy/crabs.py$|
+            ^omnisafe/common/control_barrier_function/crabs/|
+            ^omnisafe/envs/classic_control/envs_from_crabs.py$
+          )
   - repo: https://github.com/PyCQA/isort
     rev: 5.13.2
     hooks:
       - id: isort
   - repo: https://github.com/psf/black
-    rev: 24.3.0
+    rev: 24.4.2
     hooks:
       - id: black-jupyter
   - repo: https://github.com/asottile/pyupgrade
@@ -84,7 +90,12 @@ repos:
             ^examples/|
             ^tests/|
             ^setup.py$|
-            ^docs/source/conf.py$
+            ^docs/source/conf.py$|
+            ^omnisafe/envs/classic_control/envs_from_crabs.py$|
+            ^omnisafe/common/control_barrier_function/crabs/models.py$|
+            ^omnisafe/common/control_barrier_function/crabs/optimizers.py$|
+            ^omnisafe/common/control_barrier_function/crabs/utils.py$|
+            ^omnisafe/algorithms/off_policy/crabs.py$
           )
   - repo: https://github.com/pycqa/pydocstyle
     rev: 6.3.0
@@ -97,5 +108,9 @@ repos:
             ^docs/|
             ^examples/|
             ^tests/|
-            ^setup.py$
+            ^setup.py$|
+            ^omnisafe/envs/classic_control/envs_from_crabs.py$|
+            ^omnisafe/common/control_barrier_function/crabs/models.py$|
+            ^omnisafe/common/control_barrier_function/crabs/optimizers.py$|
+            ^omnisafe/common/control_barrier_function/crabs/utils.py$
           )
diff --git a/omnisafe/adapter/crabs_adapter.py b/omnisafe/adapter/crabs_adapter.py
@@ -0,0 +1,167 @@
+# Copyright 2024 OmniSafe Team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""CRABS Adapter for OmniSafe."""
+
+from __future__ import annotations
+
+import torch
+from rich import errors
+from rich.progress import track
+
+from omnisafe.adapter.offpolicy_adapter import OffPolicyAdapter
+from omnisafe.common.buffer import VectorOffPolicyBuffer
+from omnisafe.common.logger import Logger
+from omnisafe.envs.crabs_env import CRABSEnv
+from omnisafe.models.actor_critic.constraint_actor_q_critic import ConstraintActorQCritic
+from omnisafe.utils.config import Config
+
+
+class CRABSAdapter(OffPolicyAdapter):
+    """CRABS Adapter for OmniSafe.
+
+    :class:`CRABSAdapter` is used to adapt the environment to the CRABS algorithm training.
+    On top of :class:`OffPolicyAdapter` it defines its own evaluation loop
+    (:meth:`eval_policy`), a rollout loop that falls back to a plain ``for`` loop when the
+    ``rich`` progress bar raises ``LiveError`` (:meth:`rollout`), and a single-step helper
+    that stores one transition in the replay buffer (:meth:`_rollout_step`).
+
+    Args:
+        env_id (str): The environment id.
+        num_envs (int): The number of environments.
+        seed (int): The random seed.
+        cfgs (Config): The configuration.
+    """
+
+    # Annotations only: nothing is assigned at class level.
+    _current_obs: torch.Tensor
+    _ep_ret: torch.Tensor
+    _ep_cost: torch.Tensor
+    _ep_len: torch.Tensor
+
+    def __init__(  # pylint: disable=too-many-arguments
+        self,
+        env_id: str,
+        num_envs: int,
+        seed: int,
+        cfgs: Config,
+    ) -> None:
+        """Initialize an instance of :class:`CRABSAdapter`.
+
+        Args:
+            env_id (str): The environment id.
+            num_envs (int): The number of environments.
+            seed (int): The random seed.
+            cfgs (Config): The configuration.
+        """
+        super().__init__(env_id, num_envs, seed, cfgs)
+        # Annotation only: narrows the declared type of the inherited ``_env`` attribute.
+        self._env: CRABSEnv
+        # Initialised to zero; it is not updated anywhere in this class.
+        # NOTE(review): presumably a count of exploration episodes -- confirm against callers.
+        self.n_expl_episodes = 0
+        # Episode length limit is read from the spec of the wrapped environment.
+        self._max_ep_len = self._env.env.spec.max_episode_steps  # type: ignore
+        # ``horizon`` is a public alias holding the same value.
+        self.horizon = self._max_ep_len
+
+    def eval_policy(  # pylint: disable=too-many-locals
+        self,
+        episode: int,
+        agent: ConstraintActorQCritic,
+        logger: Logger,
+    ) -> None:
+        """Rollout the evaluation environment and log per-episode return, cost and length.
+
+        Each episode is run on ``self._eval_env`` until it terminates or is truncated, and
+        the totals are stored under ``Metrics/RawPolicyEpRet``, ``Metrics/RawPolicyEpCost``
+        and ``Metrics/RawPolicyEpLen``.
+
+        .. note::
+            Actions are obtained with ``agent.step(obs, deterministic=False)``.
+            NOTE(review): an earlier version of this docstring described the action as
+            deterministic -- confirm which behaviour is intended for evaluation.
+
+        Args:
+            episode (int): Number of episodes.
+            agent (ConstraintActorQCritic): Agent.
+            logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``.
+        """
+        for _ in range(episode):
+            # Totals start as plain Python numbers; the return and cost are then
+            # accumulated from the CPU copies of the per-step values below.
+            ep_ret, ep_cost, ep_len = 0.0, 0.0, 0
+            obs, _ = self._eval_env.reset()
+            obs = obs.to(self._device)
+
+            done = False
+            while not done:
+                act = agent.step(obs, deterministic=False)
+                obs, reward, cost, terminated, truncated, info = self._eval_env.step(act)
+                # Convert every step output to a float32 tensor on the adapter's device.
+                obs, reward, cost, terminated, truncated = (
+                    torch.as_tensor(x, dtype=torch.float32, device=self._device)
+                    for x in (obs, reward, cost, terminated, truncated)
+                )
+                # Prefer the ``original_*`` entries of ``info`` when present, otherwise
+                # fall back to the reward/cost returned by the step itself.
+                ep_ret += info.get('original_reward', reward).cpu()
+                ep_cost += info.get('original_cost', cost).cpu()
+                ep_len += 1
+                # Only index 0 is inspected to decide whether the episode is over.
+                # NOTE(review): assumes a single evaluation environment -- TODO confirm.
+                done = bool(terminated[0].item()) or bool(truncated[0].item())
+
+            logger.store(
+                {
+                    'Metrics/RawPolicyEpRet': ep_ret,
+                    'Metrics/RawPolicyEpCost': ep_cost,
+                    'Metrics/RawPolicyEpLen': ep_len,
+                },
+            )
+
+    def rollout(  # pylint: disable=too-many-locals
+        self,
+        rollout_step: int,
+        agent: ConstraintActorQCritic,
+        buffer: VectorOffPolicyBuffer,
+        logger: Logger,
+        use_rand_action: bool,
+    ) -> None:
+        """Rollout the environment and store the data in the buffer.
+
+        Calls :meth:`_rollout_step` ``rollout_step`` times, wrapped in a ``rich`` progress
+        bar. If the progress bar raises ``rich.errors.LiveError``, the same number of steps
+        is run in a plain loop without a progress bar.
+
+        .. warning::
+            As OmniSafe uses :class:`AutoReset` wrapper, the environment will be reset automatically,
+            so the final observation will be stored in ``info['final_observation']``.
+
+        Args:
+            rollout_step (int): Number of rollout steps.
+            agent (ConstraintActorQCritic): Constraint actor-critic, including actor, reward critic,
+                and cost critic.
+            buffer (VectorOffPolicyBuffer): Vector off-policy buffer.
+            logger (Logger): Logger, to log ``EpRet``, ``EpCost``, ``EpLen``.
+            use_rand_action (bool): Whether to use random action.
+        """
+        try:
+            for _ in track(
+                range(rollout_step),
+                description=f'Processing rollout for epoch: {logger.current_epoch}...',
+            ):
+                self._rollout_step(agent, buffer, logger, use_rand_action)
+        except errors.LiveError:
+            # The fallback runs the full ``rollout_step`` iterations from scratch.
+            # NOTE(review): presumably ``LiveError`` is raised when the progress display
+            # is started (e.g. another live display is already active), i.e. before any
+            # step has executed -- confirm, otherwise steps would be repeated.
+            for _ in range(rollout_step):
+                self._rollout_step(agent, buffer, logger, use_rand_action)
+
+    def _rollout_step(  # pylint: disable=too-many-locals
+        self,
+        agent: ConstraintActorQCritic,
+        buffer: VectorOffPolicyBuffer,
+        logger: Logger,
+        use_rand_action: bool,
+    ) -> None:
+        """Take one environment step and store the resulting transition in the buffer.
+
+        Args:
+            agent (ConstraintActorQCritic): Agent queried for the action when
+                ``use_rand_action`` is ``False``.
+            buffer (VectorOffPolicyBuffer): Buffer that receives the transition.
+            logger (Logger): Logger passed to ``_log_metrics`` when an episode ends.
+            use_rand_action (bool): Whether to sample the action from the environment
+                instead of querying the agent.
+        """
+        if use_rand_action:
+            # Action sampled by the environment itself, moved to the adapter's device.
+            act = torch.as_tensor(self._env.sample_action(), dtype=torch.float32).to(
+                self._device,
+            )
+        else:
+            act = agent.step(self._current_obs, deterministic=False)
+
+        next_obs, reward, cost, terminated, truncated, info = self.step(act)
+
+        self._log_value(reward=reward, cost=cost, info=info)
+        # Work on a copy so that ``next_obs`` itself is left untouched and can become
+        # the current observation at the end of this method.
+        real_next_obs = next_obs.clone()
+        for idx, done in enumerate(torch.logical_or(terminated, truncated)):
+            if done:
+                # For finished entries, store the observation found in
+                # ``info['final_observation']`` (when provided) as the next observation.
+                if 'final_observation' in info:
+                    real_next_obs[idx] = info['final_observation'][idx]
+                self._log_metrics(logger, idx)
+                self._reset_log(idx)
+
+        buffer.store(
+            obs=self._current_obs,
+            act=act,
+            reward=reward,
+            cost=cost,
+            # True only where ``terminated`` is true and ``truncated`` is false.
+            done=torch.logical_and(terminated, torch.logical_xor(terminated, truncated)),
+            next_obs=real_next_obs,
+        )
+
+        self._current_obs = next_obs
diff --git a/omnisafe/algorithms/__init__.py b/omnisafe/algorithms/__init__.py
@@ -25,6 +25,7 @@
 
 # Off-Policy Safe
 from omnisafe.algorithms.off_policy import (
+    CRABS,
     DDPG,
     DDPGPID,
     SAC,

diff --git a/omnisafe/algorithms/algo_wrapper.py b/omnisafe/algorithms/algo_wrapper.py
@@ -130,9 +130,11 @@ def _init_config(self) -> Config:
         # the exp_name format is PPO-{SafetyPointGoal1-v0}
         exp_name = f'{self.algo}-{{{self.env_id}}}'
         cfgs.recurisve_update({'exp_name': exp_name, 'env_id': self.env_id, 'algo': self.algo})
-        cfgs.train_cfgs.recurisve_update(
-            {'epochs': cfgs.train_cfgs.total_steps // cfgs.algo_cfgs.steps_per_epoch},
-        )
+        if hasattr(cfgs.train_cfgs, 'total_steps') and hasattr(cfgs.algo_cfgs, 'steps_per_epoch'):
+            epochs = cfgs.train_cfgs.total_steps // cfgs.algo_cfgs.steps_per_epoch
+            cfgs.train_cfgs.recurisve_update(
+                {'epochs': epochs},
+            )
         return cfgs
 
     def _init_checks(self) -> None:

diff --git a/omnisafe/algorithms/off_policy/__init__.py b/omnisafe/algorithms/off_policy/__init__.py
@@ -14,6 +14,7 @@
 # ==============================================================================
 """Off-policy algorithms."""
 
+from omnisafe.algorithms.off_policy.crabs import CRABS
 from omnisafe.algorithms.off_policy.ddpg import DDPG
 from omnisafe.algorithms.off_policy.ddpg_lag import DDPGLag
 from omnisafe.algorithms.off_policy.ddpg_pid import DDPGPID
@@ -25,4 +26,15 @@
 from omnisafe.algorithms.off_policy.td3_pid import TD3PID
 
 
-__all__ = ['DDPG', 'TD3', 'SAC', 'DDPGLag', 'TD3Lag', 'SACLag', 'DDPGPID', 'TD3PID', 'SACPID']
+__all__ = [
+    'DDPG',
+    'TD3',
+    'SAC',
+    'DDPGLag',
+    'TD3Lag',
+    'SACLag',
+    'DDPGPID',
+    'TD3PID',
+    'SACPID',
+    'CRABS',
+]