optimize spatial data augmentation (#3)

chenxi-wang · web-flow · commit 76d94ae698c3 · 2025-02-16T13:49:05.000+08:00
* optimize data augmentation

* enlarge norm range for translation

* add data visualization for config validation
diff --git a/README.md b/README.md
@@ -8,14 +8,17 @@
 
 ## 🔥 News
 
+- **[Feb 16, 2025]** Optimize spatial data augmentation. Add data visualization for config check. Add tips for setting workspace range and normalization range.
 - **[Dec 26, 2024]** Fix several potential installation issues. Add support for CUDA 12.1.
 - **[May 11, 2024]** Initial release.
 
 ## 🛫 Getting Started
 
 ### 💻 Installation
 
-Please following the [installation guide](assets/docs/INSTALL.md) to install the `rise` conda environments and the dependencies, as well as the real robot environments. Also, remember to adjust the constant parameters in `dataset/constants.py` and `utils/constants.py` according to your own environment.
+Please follow the [installation guide](assets/docs/INSTALL.md) to install the `rise` conda environments and the dependencies, as well as the real robot environments. Also, remember to adjust the constant parameters in `dataset/constants.py` and `utils/constants.py` according to your own environment.
+
+**Make sure that `TRANS_MIN/MAX` and `WORKSPACE_MIN/MAX` are correctly set in the camera coordinates, or you may obtain meaningless output.** We recommend expanding `TRANS_MIN/MAX` by 0.15 - 0.3 meters on both sides of the actual translation range to accommodate spatial data augmentation. You can follow the examples in [command_train.sh](command_train.sh) for data visualization and parameter checking.
 
 ### 📷 Calibration
 
diff --git a/assets/docs/DEPLOY.md b/assets/docs/DEPLOY.md
@@ -4,7 +4,7 @@
    - `IMG_MEAN` and `IMG_STD` are the image normalization constants. Here we use ImageNet normalization coefficients.
    - `TRANS_MIN` and `TRANS_MAX` are the tcp normalization range in the camera coordinate.
    - `MAX_GRIPPER_WIDTH` indicates the gripper width normalization range (in meter).
-   - `WORKSPACE_MIN` and `WORKSPACE_MAX` are the workspace range in the camera coordinate. 
+   - `WORKSPACE_MIN` and `WORKSPACE_MAX` are the workspace range in the camera coordinate and are used for point cloud cropping.
    - `SAFE_WORKSPACE_MIN` and `SAFE_WORKSPACE_MAX` are the safe workspace range in the base coordinate (used for evaluation).
    - `SAFE_EPS` denotes the safe epsilon of the safe workspace range. Therefore, the real range should be [min + eps, max - eps].
    - `GRIPPER_THRESHOLD` denotes the gripper moving threshold (in meter) to avoid gripper action too frequently during evaluation.
diff --git a/command_train.sh b/command_train.sh
@@ -1 +1,5 @@
-torchrun --master_addr 192.168.3.50 --master_port 14522 --nproc_per_node 2 --nnodes 1 --node_rank 0 train.py --data_path data/collect_pens --aug --aug_jitter --num_action 20 --voxel_size 0.005 --obs_feature_dim 512 --hidden_dim 512 --nheads 8 --num_encoder_layers 4 --num_decoder_layers 1 --dim_feedforward 2048 --dropout 0.1 --ckpt_dir logs/collect_pens --batch_size 240 --num_epochs 1000 --save_epochs 50 --num_workers 24 --seed 233 
+# example for policy training
+torchrun --master_addr 192.168.3.50 --master_port 14522 --nproc_per_node 2 --nnodes 1 --node_rank 0 train.py --data_path data/collect_pens --aug --aug_jitter --num_action 20 --voxel_size 0.005 --obs_feature_dim 512 --hidden_dim 512 --nheads 8 --num_encoder_layers 4 --num_decoder_layers 1 --dim_feedforward 2048 --dropout 0.1 --ckpt_dir logs/collect_pens --batch_size 240 --num_epochs 1000 --save_epochs 50 --num_workers 24 --seed 233
+
+# example for data visualization & parameter check
+torchrun --master_addr 192.168.3.50 --master_port 14522 --nproc_per_node 1 --nnodes 1 --node_rank 0 train.py --data_path data/collect_pens --aug --aug_jitter --num_action 20 --voxel_size 0.005 --obs_feature_dim 512 --hidden_dim 512 --nheads 8 --num_encoder_layers 4 --num_decoder_layers 1 --dim_feedforward 2048 --dropout 0.1 --ckpt_dir logs/collect_pens --batch_size 1 --num_epochs 1 --save_epochs 1 --num_workers 1 --seed 233 --vis_data
diff --git a/dataset/realworld.py b/dataset/realworld.py
@@ -36,7 +36,8 @@ def __init__(
         aug_jitter = False,
         aug_jitter_params = [0.4, 0.4, 0.2, 0.1],
         aug_jitter_prob = 0.2,
-        with_cloud = False
+        with_cloud = False,
+        vis = False
     ):
         assert split in ['train', 'val', 'all']
 
@@ -56,6 +57,7 @@ def __init__(
         self.aug_jitter_params = np.array(aug_jitter_params)
         self.aug_jitter_prob = aug_jitter_prob
         self.with_cloud = with_cloud
+        self.vis = vis
         
         self.all_demos = sorted(os.listdir(self.data_path))
         self.num_demos = len(self.all_demos)
@@ -115,9 +117,17 @@ def _augmentation(self, clouds, tcps):
         rotation_angles = np.random.rand(3) * (self.aug_rot_max - self.aug_rot_min) + self.aug_rot_min
         rotation_angles = rotation_angles / 180 * np.pi  # tranform from degree to radius
         aug_mat = rot_trans_mat(translation_offsets, rotation_angles)
-        for cloud in clouds:
-            cloud = apply_mat_to_pcd(cloud, aug_mat)
+        center = clouds[-1][..., :3].mean(axis = 0)
+
+        for i in range(len(clouds)):
+            clouds[i][..., :3] -= center
+            clouds[i] = apply_mat_to_pcd(clouds[i], aug_mat)
+            clouds[i][..., :3] += center
+
+        tcps[..., :3] -= center
         tcps = apply_mat_to_pose(tcps, aug_mat, rotation_rep = "quaternion")
+        tcps[..., :3] += center
+
         return clouds, tcps
 
     def _normalize_tcp(self, tcp_list):
@@ -221,6 +231,27 @@ def __getitem__(self, index):
         # point augmentations
         if self.split == 'train' and self.aug:
             clouds, action_tcps = self._augmentation(clouds, action_tcps)
+
+        # visualization
+        if self.vis:
+            points = clouds[-1][..., :3]
+            print("point range", points.min(axis=0), points.max(axis=0))
+            pcd = o3d.geometry.PointCloud()
+            pcd.points = o3d.utility.Vector3dVector(points)
+            pcd.colors = o3d.utility.Vector3dVector(colors * IMG_STD + IMG_MEAN)
+            traj = []
+            # red box stands for the workspace range
+            bbox3d_1 = o3d.geometry.AxisAlignedBoundingBox(WORKSPACE_MIN, WORKSPACE_MAX)
+            bbox3d_1.color = [1, 0, 0]
+            # green box stands for the translation normalization range
+            bbox3d_2 = o3d.geometry.AxisAlignedBoundingBox(TRANS_MIN, TRANS_MAX)
+            bbox3d_2.color = [0, 1, 0]
+            action_tcps_vis = xyz_rot_transform(action_tcps, from_rep = "quaternion", to_rep = "matrix")
+            for i in range(len(action_tcps_vis)):
+                action = action_tcps_vis[i]
+                frame = o3d.geometry.TriangleMesh.create_coordinate_frame(size=0.03).transform(action)
+                traj.append(frame)
+            o3d.visualization.draw_geometries([pcd.voxel_down_sample(self.voxel_size), bbox3d_1, bbox3d_2, *traj])
         
         # rotation transformation (to 6d)
         action_tcps = xyz_rot_transform(action_tcps, from_rep = "quaternion", to_rep = "rotation_6d")
diff --git a/train.py b/train.py
@@ -39,7 +39,8 @@
     "num_epochs": 1000,
     "save_epochs": 50,
     "num_workers": 24,
-    "seed": 233
+    "seed": 233,
+    "vis_data": False
 })
 
 
@@ -72,7 +73,8 @@ def train(args_override):
         voxel_size = args.voxel_size,
         aug = args.aug,
         aug_jitter = args.aug_jitter, 
-        with_cloud = False
+        with_cloud = False,
+        vis = args.vis_data
     )
     sampler = torch.utils.data.distributed.DistributedSampler(
         dataset, 
@@ -85,7 +87,8 @@ def train(args_override):
         batch_size = args.batch_size // WORLD_SIZE,
         num_workers = args.num_workers,
         collate_fn = collate_fn,
-        sampler = sampler
+        sampler = sampler,
+        drop_last = True
     )
 
     # policy
@@ -203,5 +206,6 @@ def train(args_override):
     parser.add_argument('--save_epochs', action = 'store', type = int, help = 'saving epochs', required = False, default = 50)
     parser.add_argument('--num_workers', action = 'store', type = int, help = 'number of workers', required = False, default = 24)
     parser.add_argument('--seed', action = 'store', type = int, help = 'seed', required = False, default = 233)
+    parser.add_argument('--vis_data', action = 'store_true', help = 'whether to visualize the input data and ground truth actions.')
 
     train(vars(parser.parse_args()))
diff --git a/utils/constants.py b/utils/constants.py
@@ -5,7 +5,7 @@
 IMG_STD = np.array([0.229, 0.224, 0.225])
 
 # tcp normalization and gripper width normalization
-TRANS_MIN, TRANS_MAX = np.array([-0.35, -0.35, 0]), np.array([0.35, 0.35, 0.7]) 
+TRANS_MIN, TRANS_MAX = np.array([-0.5, -0.5, 0]), np.array([0.5, 0.5, 1.0]) 
 MAX_GRIPPER_WIDTH = 0.11 # meter
 
 # workspace in camera coordinate