Commit

Initial commit

MaverickRen committed Feb 10, 2025
1 parent 673b71d commit 0ae333b
Showing 12 changed files with 418 additions and 47 deletions.
18 changes: 9 additions & 9 deletions LDM/configs/calvin_ldm.py
@@ -113,7 +113,7 @@
]
data_root = "./data/calvin/task_ABCD_D/training"
train_dataloader = dict(
-batch_size=1,
+batch_size=4,
num_workers=4,
drop_last=True,
dataset=dict(
@@ -163,13 +163,13 @@


metrics = [
-dict(
-type='FVD',
-prefix='FVD',
-fake_nums=19772,
-inception_path='./work_dirs/init/fvd/i3d_torchscript.pt',
-inception_style='StyleGAN',
-sample_model='ema'),
+# dict(
+#     type='FVD',
+#     prefix='FVD',
+#     fake_nums=19772,
+#     inception_path='./work_dirs/init/fvd/i3d_torchscript.pt',
+#     inception_style='StyleGAN',
+#     sample_model='ema'),
]
# config for val
val_cfg = dict(type='MultiValLoop')
@@ -180,7 +180,7 @@
test_evaluator = dict(type='LAFeatMFMetric',collect_device='cpu', la_num = 729, gt_act_num = 81)

# load from which checkpoint
-load_from = './work_dirs/init/magvit/iter_332800_new.pth' # load_from=None
+load_from = './work_dirs/magvit_init.pth' # load_from=None
# load_from = None
# whether to resume training from the loaded checkpoint
resume = False
4 changes: 2 additions & 2 deletions LDM/configs/go_ldm.py
@@ -185,8 +185,8 @@
test_evaluator = dict(type='LAGoFeatMetric',collect_device='cpu', la_num = 729, gt_act_num = 81, gt_select_frame=[1])

# load from which checkpoint
-load_from = '/opt/tiger/mmagicinit/ldm/work_dirs/go_2frame_sepqformer_likebefore_interval5/iter_64000_new.pth' # load_from=None
-# load_from = None
+# load_from = '/opt/tiger/mmagicinit/ldm/work_dirs/go_2frame_sepqformer_likebefore_interval5/iter_64000_new.pth' # load_from=None
+load_from = './work_dirs/magvit_init.pth'
# whether to resume training from the loaded checkpoint
resume = False

4 changes: 2 additions & 2 deletions LDM/ldm/datasets/calvin_dataset.py
@@ -245,8 +245,8 @@ def __init__(
lang_info_path = osp.join(data_root, 'lang_annotations/auto_lang_ann.npy')
annotations = np.load(lang_info_path, allow_pickle=True).item()
annotations = list(zip(annotations["info"]["indx"], annotations["language"]["ann"])) #((np.int64(1401659), np.int64(1401723)), 'move the door all the way to the right')
-length = len(annotations) // 3
-annotations = annotations[(length*2):]
+# length = len(annotations) // 3
+annotations = annotations[:2]
data_paths = []
langs = []
clip_start_end_ids = []
23 changes: 2 additions & 21 deletions LDM/ldm/evaluation/metrics/latent_action.py
@@ -309,7 +309,7 @@ def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
max_pos = 0.02
max_orn = 0.05
for i, data_sample in enumerate(data_samples):
-indice = data_sample['indice'].squeeze().item()
+indice = data_sample['indice'].squeeze()
encode_feat = data_sample['encode_feat'].squeeze(2, 3) #C, T
state = data_batch['data_samples'][i].states[0]
deltas = torch.stack([s1 - state[0] for s1 in state[1:]])
@@ -370,10 +370,7 @@ def compute_metrics(self, results: list):
from matplotlib.colors import LinearSegmentedColormap
metrics = {}
bin_length = 0.1
-# rel_x_dict = defaultdict(list)
-# rel_y_dict = defaultdict(list)
-# rel_z_dict = defaultdict(list)

torch.save(results, './work_dirs/calvin_ldm_results.pth')

rel_x_list = []
@@ -404,16 +401,6 @@
cmap = LinearSegmentedColormap.from_list("gradient", [start_color, middle_color, end_color], N=20)
colors = cmap(np.linspace(0, 1, 20))

-# act_colors = generate_distinct_colors(len(action_types)+1)
-# fig, ax = plt.subplots(figsize=(6, 2))
-# for i, color in enumerate(act_colors):
-#     rect = patches.Rectangle((i, 0), 1, 1, linewidth=1, edgecolor='none', facecolor=color)
-#     ax.add_patch(rect)
-# ax.set_xlim(0, len(colors))
-# ax.set_ylim(0, 1)
-# ax.axis('off')
-# plt.savefig('./work_dirscolor_blocks.png', dpi=300)

action_ids = self.gen_action_id(action_types, results)
for i, item in enumerate(results):

@@ -469,12 +456,6 @@
self.draw_tsne(t_sne_features, gripper_color[fi], dir='./work_dirs', tag=f'gripper_f{fi+1}')
self.draw_tsne(t_sne_features, act_label_colors, dir='./work_dirs', tag=f'act_f{fi+1}')


-# for ai, A2L_dict in enumerate(A2L_dict_list):
-#     for la in A2L_dict:
-#         A2L_dict[la] = A2L_dict[la] / A2LNums[ai]
-# with open('./work_dirsla_test.json', 'w') as f:
-#     json.dump(A2L_dict_list, f)
return metrics


2 changes: 1 addition & 1 deletion LDM/ldm/models/algorithms/magvit_vq_gan.py
@@ -439,7 +439,7 @@ def forward(self,
outputs = outputs.squeeze(dim=2)
# outputs is BGR
outputs = self.data_preprocessor.destruct(outputs, data_samples)
-import pdb;pdb.set_trace()

gen_sample = DataSample()
gen_sample.indice = indice
gen_sample.fake_img = outputs
16 changes: 11 additions & 5 deletions LDM/ldm/models/losses/vqperceptual.py
@@ -23,6 +23,9 @@



+CKPT_MAP = {
+    "vgg_lpips": "vgg.pth"
+}


def download(url, local_path, chunk_size=1024):
@@ -74,22 +77,25 @@ def __init__(self, use_dropout=True):
self.lins = nn.ModuleList(lins)
root = '../work_dirs/init/lpips'
ckpt = os.path.join(root, 'vgg.pth')

+if not os.path.exists(ckpt):
+    download("https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1", ckpt)
self.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)
for param in self.parameters():
param.requires_grad = False

def load_from_pretrained(self, name="vgg_lpips"):
-ckpt = os.path.join(name, CKPT_MAP[name])
+ckpt = os.path.join('../work_dirs/init/lpips', CKPT_MAP[name])
if not os.path.exists(ckpt):
download("https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1", ckpt)
self.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)


@classmethod
def from_pretrained(cls, name="vgg_lpips"):
model = cls()
-ckpt = os.path.join(name, CKPT_MAP[name])
-ckpt = os.path.join(name, 'vgg.pth')
+ckpt = os.path.join('work_dirs/init/lpips', CKPT_MAP[name])
+# ckpt = os.path.join(name, 'vgg.pth')
model.load_state_dict(torch.load(ckpt, map_location=torch.device("cpu")), strict=False)
return model

@@ -100,14 +106,14 @@ def forward(self, input, target):
feats0, feats1, diffs = {}, {}, {}
lins = [self.cos0, self.cos1, self.cos2, self.cos3, self.cos4]
for kk in range(len(self.channels)):
lins = [self.lins[0], self.lins[1], self.lins[2], self.lins[3], self.lins[4]]
lins = [self.lins[0], self.lins[1], self.lins[2], self.lins[3], self.lins[4]]
for kk in range(len(self.channels)):
feats0[kk], feats1[kk] = normalize_tensor(outs0[kk]), normalize_tensor(outs1[kk])
diffs[kk] = (feats0[kk] - feats1[kk]) ** 2
res = [spatial_average(lins[kk].model(diffs[kk]), keepdim=True) for kk in range(len(self.channels))]
res = [spatial_average(lins[kk].model(diffs[kk]), keepdim=True) for kk in range(len(self.channels))]
val = res[0]
for l in range(1, len(self.channels)):
# for l in range(1, len(self.channels)):
for l in range(1, len(self.channels)):
val += res[l]
return val
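For context on the forward pass above: it relies on normalize_tensor and spatial_average helpers that this hunk does not show. In the reference LPIPS implementation they are usually defined roughly as follows — a sketch for orientation, not necessarily this repository's exact code:

```python
import torch

def normalize_tensor(x, eps=1e-10):
    # Scale each feature vector to unit L2 norm along the channel dimension.
    norm_factor = torch.sqrt(torch.sum(x ** 2, dim=1, keepdim=True))
    return x / (norm_factor + eps)

def spatial_average(x, keepdim=True):
    # Mean over the spatial dimensions (H, W), yielding one value per channel.
    return x.mean([2, 3], keepdim=keepdim)
```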
2 changes: 1 addition & 1 deletion LDM/tools/calvin_ldm_train.sh
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#!/usr/bin/env bash
-CONFIG="./configs/calvin_ldm.py"
+CONFIG="./configs/calvin_ldm_debug.py"
GPUS=1
NNODES=1
NODE_RANK=0
9 changes: 6 additions & 3 deletions README.md
@@ -95,10 +95,13 @@ Download CALVIN dataset follow the official instructions and organize it as foll
```
├── VideoWorld
│ ├── LDM
-│ │ └── data
-│ └── └── calvin
+│ │ │── data
+│ │ │ └── calvin
+│ │ └── work_dirs
+│ │ └── magvit_init.pth
+│ └──
```
-Use the script ./LDM/tools/calvin_ldm_train.sh to initiate LDM training. Upon completion, the latent codes on the training set will be automatically saved to ./LDM/work_dirs/calvin_ldm_results.pth, and the UMAP visualization of the latent codes will also be generated.
+Use the script ./LDM/tools/calvin_ldm_train.sh to initiate LDM training. Training requires loading the [Magvit weights](https://huggingface.co/maverickrzw/VideoWorld_CALVIN/tree/main) we pre-trained on natural image reconstruction as initialization. Upon completion, the latent codes on the training set will be automatically saved to ./LDM/work_dirs/calvin_ldm_results.pth, and the UMAP visualization of the latent codes will also be generated.
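For convenience, the checkpoint can also be fetched programmatically. The sketch below uses huggingface_hub and assumes the file is published as magvit_init.pth in the maverickrzw/VideoWorld_CALVIN repository; check the repository listing and adjust the filename if it differs.

```python
# Sketch: fetch the Magvit initialization weights into LDM/work_dirs/.
# The filename "magvit_init.pth" on the Hub is an assumption; verify it at
# https://huggingface.co/maverickrzw/VideoWorld_CALVIN/tree/main before running.
import os
import shutil

from huggingface_hub import hf_hub_download

os.makedirs("LDM/work_dirs", exist_ok=True)
ckpt_path = hf_hub_download(repo_id="maverickrzw/VideoWorld_CALVIN",
                            filename="magvit_init.pth")
shutil.copy(ckpt_path, "LDM/work_dirs/magvit_init.pth")
```

After the file is in place, run the training commands below.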
```
cd LDM
bash ./tools/calvin_ldm_train.sh
5 changes: 2 additions & 3 deletions VideoWorld/configs/calvin_test.py
@@ -99,12 +99,11 @@
dict(type='ToTensor', keys=['img', *test_to_tensor]),
]

-data_root = "/mnt/bn/panxuran/calvin/task_ABCD_D/training"
-la_data_path = "/mnt/bn/zhongwei-lf-dev/work_dirs/latent_action_frame2/la_test_calvin_results_interval2_dict_v2.pth"
+data_root = "./data/calvin/task_ABCD_D/training"
+la_data_path = "./work_dirs/calvin_ldm_results.pth"
train_dataloader = dict(
batch_size=4,
num_workers=4,
-# use_web=False,
pin_memory=False,
persistent_workers=False,
sampler=dict(type='InfiniteSampler', shuffle=True),