forked from jayleicn/singularity
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_ret.out
748 lines (742 loc) · 112 KB
/
test_ret.out
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
rdzv_endpoint: worker-1:22226
[32m2023-10-19T12:39:00 | loopitr: [0mLogging to: /home/wiss/zhang/Jinhe/singularity/test_model/model_test_ret_model/train.log
[32m2023-10-19T12:39:00 | __main__: [0mconfig:
{'data_root': '/home/wiss/zhang/nfs/Anet_sing', 'anno_root_downstream': '/home/wiss/zhang/Jinhe/singularity/Data/anetqa', 'train_type': 'anet_ret_train_1.json', 'train_file': ['${anno_root_downstream}/${train_type}', '/home/wiss/zhang/nfs/Anet_sing', 'video'], 'test_types': ['temporal_contact_swap'], 'test_file': {'temporal_contact_swap': ['${anno_root_downstream}/anet_ret_temporal_contact_swap.json', '/home/wiss/zhang/nfs/Anet_sing', 'video'], 'temporal_contact_swap_mani': ['${anno_root_downstream}/anet_ret_temporal_contact_swap_mani.json', '/home/wiss/zhang/nfs/Anet_sing', 'video'], 'temporal_action_swap': ['${anno_root_downstream}/anet_ret_temporal_action_swap.json', '/home/wiss/zhang/nfs/Anet_sing', 'video'], 'temporal_action_swap_mani': ['${anno_root_downstream}/anet_ret_temporal_action_swap_mani.json', '/home/wiss/zhang/nfs/Anet_sing', 'video'], 'neighborhood_same_entity': ['${anno_root_downstream}/anet_ret_neighborhood_same_entity.json', '/home/wiss/zhang/nfs/Anet_sing', 'video'], 'neighborhood_same_entity_mani': ['${anno_root_downstream}/anet_ret_neighborhood_same_entity_mani.json', '/home/wiss/zhang/nfs/Anet_sing', 'video'], 'neighborhood_diff_entity': ['${anno_root_downstream}/anet_ret_neighborhood_diff_entity.json', '/home/wiss/zhang/nfs/Anet_sing', 'video'], 'neighborhood_diff_entity_mani': ['${anno_root_downstream}/anet_ret_neighborhood_diff_entity_mani.json', '/home/wiss/zhang/nfs/Anet_sing', 'video'], 'counter_spatial': ['${anno_root_downstream}/anet_ret_counter_spatial.json', '/home/wiss/zhang/nfs/Anet_sing', 'video'], 'counter_spatial_mani': ['${anno_root_downstream}/anet_ret_counter_spatial_mani.json', '/home/wiss/zhang/nfs/Anet_sing', 'video'], 'counter_contact': ['${anno_root_downstream}/anet_ret_counter_contact.json', '/home/wiss/zhang/nfs/Anet_sing', 'video'], 'counter_contact_mani': ['${anno_root_downstream}/anet_ret_counter_contact_mani.json', '/home/wiss/zhang/nfs/Anet_sing', 'video'], 'counter_action': ['${anno_root_downstream}/anet_ret_counter_action.json', '/home/wiss/zhang/nfs/Anet_sing', 'video'], 'counter_action_mani': ['${anno_root_downstream}/anet_ret_counter_action_mani.json', '/home/wiss/zhang/nfs/Anet_sing', 'video'], 'counter_attribute': ['${anno_root_downstream}/anet_ret_counter_attribute.json', '/home/wiss/zhang/nfs/Anet_sing', 'video'], 'counter_attribute_mani': ['${anno_root_downstream}/anet_ret_counter_attribute_mani.json', '/home/wiss/zhang/nfs/Anet_sing', 'video']}, 'stop_key': 'val1/', 'is_paragraph_retrieval': True, 'text_encoder': 'bert-base-uncased', 'bert_config': 'configs/config_bert.json', 'vit_type': 'beit', 'vit_zoo': {'beit': 'microsoft/beit-base-patch16-224-pt22k-ft22k'}, 'vit_name_or_pretrained_path': '${vit_zoo[${vit_type}]}', 'temporal_vision_encoder': {'enable': True, 'num_layers': 2, 'update_pooler_embed': False}, 'add_temporal_embed': True, 'image_res': 224, 'embed_dim': 256, 'video_input': {'num_frames': 1, 'reader': 'decord', 'sample_type': 'rand', 'num_frames_test': 12, 'sample_type_test': 'middle'}, 'max_txt_l': 60, 'batch_size': {'image': 160, 'video': 32}, 'batch_size_test': {'image': 128, 'video': 32}, 'k_test': 128, 'temp': 0.01, 'loss_weight': {'itc': 1.0, 'itm': 1.0}, 'itm_hard_neg': True, 'optimizer': {'opt': 'adamW', 'lr': 1e-05, 'opt_betas': [0.9, 0.999], 'weight_decay': 0.02, 'max_grad_norm': -1, 'different_lr': {'enable': False, 'module_names': [], 'lr': 0.001}}, 'scheduler': {'sched': 'cosine', 'epochs': 30, 'min_lr_multi': 0.1, 'warmup_epochs': 0}, 'output_dir': '/home/wiss/zhang/Jinhe/singularity/test_model/model_test_ret_model', 'resume': False, 'pretrained_path': '/home/wiss/zhang/nfs/anetqa_train_qa_full/ckpt_best.pth', 'evaluate': True, 'eval_frame_ensemble': 'concat', 'eval_x_only': False, 'eval_offload': True, 'device': 'cuda', 'seed': 42, 'log_freq': 100, 'dist_url': 'env://', 'distributed': True, 'fp16': True, 'debug': False, 'num_workers': 24, 'wandb': {'enable': False, 'entity': 'gengyuanzhang', 'project': 'anet_ret'}, 'save_path': '/home/wiss/zhang/nfs/video_prober/singularity/anetqa/', '22226': None, 'rank': 0, 'world_size': 1, 'gpu': 0, 'dist_backend': 'nccl'}
[32m2023-10-19T12:39:00 | __main__: [0mtrain_file: ['${anno_root_downstream}/${train_type}', '/home/wiss/zhang/nfs/Anet_sing', 'video']
[32m2023-10-19T12:39:00 | tasks.pretrain: [0mCreating dataset for ret
[5m[31mWARNING[0m [32m2023-10-19T12:39:00 | py.warnings: [0m/home/wiss/zhang/Jinhe/singularity/utils/distributed.py:18: UserWarning: This DataLoader will create 24 worker processes in total. Our suggested max number of worker in current system is 1, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.
builtin_warn(*args, **kwargs)
[5m[31mWARNING[0m [32m2023-10-19T12:39:00 | py.warnings: [0m/home/wiss/zhang/Jinhe/singularity/utils/distributed.py:18: UserWarning: This DataLoader will create 24 worker processes in total. Our suggested max number of worker in current system is 1, which is smaller than what this DataLoader is going to create. Please be aware that excessive worker creation might get DataLoader running slow or even freeze, lower the worker number to avoid potential slowness/freeze if necessary.
builtin_warn(*args, **kwargs)
[32m2023-10-19T12:39:01 | tasks.shared_utils: [0mCreating model
[32m2023-10-19T12:39:04 | models.model_retrieval_base: [0mLoading vit pre-trained weights from huggingface microsoft/beit-base-patch16-224-pt22k-ft22k.
[5m[31mWARNING[0m [32m2023-10-19T12:39:07 | py.warnings: [0m/home/wiss/zhang/anaconda3/envs/probe-sl/lib/python3.7/site-packages/torch/functional.py:445: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at /opt/conda/conda-bld/pytorch_1639180594101/work/aten/src/ATen/native/TensorShape.cpp:2157.)
return _VF.meshgrid(tensors, **kwargs) # type: ignore[attr-defined]
[5m[31mWARNING[0m [32m2023-10-19T12:39:07 | py.warnings: [0m/home/wiss/zhang/anaconda3/envs/probe-sl/lib/python3.7/site-packages/torch/functional.py:445: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at /opt/conda/conda-bld/pytorch_1639180594101/work/aten/src/ATen/native/TensorShape.cpp:2157.)
return _VF.meshgrid(tensors, **kwargs) # type: ignore[attr-defined]
[32m2023-10-19T12:39:08 | models.model_retrieval_base: [0mInit new model with new image size 224, and load weights.
[32m2023-10-19T12:39:10 | models.model_retrieval_base: [0m_IncompatibleKeys(missing_keys=['encoder.layer.0.attention.attention.relative_position_bias.relative_position_index', 'encoder.layer.1.attention.attention.relative_position_bias.relative_position_index', 'encoder.layer.2.attention.attention.relative_position_bias.relative_position_index', 'encoder.layer.3.attention.attention.relative_position_bias.relative_position_index', 'encoder.layer.4.attention.attention.relative_position_bias.relative_position_index', 'encoder.layer.5.attention.attention.relative_position_bias.relative_position_index', 'encoder.layer.6.attention.attention.relative_position_bias.relative_position_index', 'encoder.layer.7.attention.attention.relative_position_bias.relative_position_index', 'encoder.layer.8.attention.attention.relative_position_bias.relative_position_index', 'encoder.layer.9.attention.attention.relative_position_bias.relative_position_index', 'encoder.layer.10.attention.attention.relative_position_bias.relative_position_index', 'encoder.layer.11.attention.attention.relative_position_bias.relative_position_index'], unexpected_keys=[])
[32m2023-10-19T12:39:10 | models.model_retrieval_base: [0mBuild text_encoder bert-base-uncased
[32m2023-10-19T12:39:13 | models.model_retrieval_base: [0mBuild text_encoder bert-base-uncased, done!
[32m2023-10-19T12:39:13 | models.model_retrieval_base: [0mBuild temporal_vision_encoder (#layer=2), randomly initialised.
[32m2023-10-19T12:39:14 | models.model_retrieval_base: [0mBuild temporal_vision_encoder, done!
[32m2023-10-19T12:39:14 | utils.optimizer: [0moptimizer -- lr=1e-05 wd=0.02 len(p)=190
[32m2023-10-19T12:39:14 | utils.optimizer: [0moptimizer -- lr=1e-05 wd=0 len(p)=300
[32m2023-10-19T12:39:14 | tasks.shared_utils: [0mLoading checkpoint from /home/wiss/zhang/nfs/anetqa_train_qa_full/ckpt_best.pth
state_dict.keys(): odict_keys(['temporal_embeddings', 'vision_encoder.embeddings.cls_token', 'vision_encoder.embeddings.patch_embeddings.projection.weight', 'vision_encoder.embeddings.patch_embeddings.projection.bias', 'vision_encoder.encoder.layer.0.lambda_1', 'vision_encoder.encoder.layer.0.lambda_2', 'vision_encoder.encoder.layer.0.attention.attention.query.weight', 'vision_encoder.encoder.layer.0.attention.attention.query.bias', 'vision_encoder.encoder.layer.0.attention.attention.key.weight', 'vision_encoder.encoder.layer.0.attention.attention.value.weight', 'vision_encoder.encoder.layer.0.attention.attention.value.bias', 'vision_encoder.encoder.layer.0.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.0.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.0.attention.output.dense.weight', 'vision_encoder.encoder.layer.0.attention.output.dense.bias', 'vision_encoder.encoder.layer.0.intermediate.dense.weight', 'vision_encoder.encoder.layer.0.intermediate.dense.bias', 'vision_encoder.encoder.layer.0.output.dense.weight', 'vision_encoder.encoder.layer.0.output.dense.bias', 'vision_encoder.encoder.layer.0.layernorm_before.weight', 'vision_encoder.encoder.layer.0.layernorm_before.bias', 'vision_encoder.encoder.layer.0.layernorm_after.weight', 'vision_encoder.encoder.layer.0.layernorm_after.bias', 'vision_encoder.encoder.layer.1.lambda_1', 'vision_encoder.encoder.layer.1.lambda_2', 'vision_encoder.encoder.layer.1.attention.attention.query.weight', 'vision_encoder.encoder.layer.1.attention.attention.query.bias', 'vision_encoder.encoder.layer.1.attention.attention.key.weight', 'vision_encoder.encoder.layer.1.attention.attention.value.weight', 'vision_encoder.encoder.layer.1.attention.attention.value.bias', 'vision_encoder.encoder.layer.1.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.1.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.1.attention.output.dense.weight', 'vision_encoder.encoder.layer.1.attention.output.dense.bias', 'vision_encoder.encoder.layer.1.intermediate.dense.weight', 'vision_encoder.encoder.layer.1.intermediate.dense.bias', 'vision_encoder.encoder.layer.1.output.dense.weight', 'vision_encoder.encoder.layer.1.output.dense.bias', 'vision_encoder.encoder.layer.1.layernorm_before.weight', 'vision_encoder.encoder.layer.1.layernorm_before.bias', 'vision_encoder.encoder.layer.1.layernorm_after.weight', 'vision_encoder.encoder.layer.1.layernorm_after.bias', 'vision_encoder.encoder.layer.2.lambda_1', 'vision_encoder.encoder.layer.2.lambda_2', 'vision_encoder.encoder.layer.2.attention.attention.query.weight', 'vision_encoder.encoder.layer.2.attention.attention.query.bias', 'vision_encoder.encoder.layer.2.attention.attention.key.weight', 'vision_encoder.encoder.layer.2.attention.attention.value.weight', 'vision_encoder.encoder.layer.2.attention.attention.value.bias', 'vision_encoder.encoder.layer.2.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.2.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.2.attention.output.dense.weight', 'vision_encoder.encoder.layer.2.attention.output.dense.bias', 'vision_encoder.encoder.layer.2.intermediate.dense.weight', 'vision_encoder.encoder.layer.2.intermediate.dense.bias', 'vision_encoder.encoder.layer.2.output.dense.weight', 'vision_encoder.encoder.layer.2.output.dense.bias', 'vision_encoder.encoder.layer.2.layernorm_before.weight', 'vision_encoder.encoder.layer.2.layernorm_before.bias', 'vision_encoder.encoder.layer.2.layernorm_after.weight', 'vision_encoder.encoder.layer.2.layernorm_after.bias', 'vision_encoder.encoder.layer.3.lambda_1', 'vision_encoder.encoder.layer.3.lambda_2', 'vision_encoder.encoder.layer.3.attention.attention.query.weight', 'vision_encoder.encoder.layer.3.attention.attention.query.bias', 'vision_encoder.encoder.layer.3.attention.attention.key.weight', 'vision_encoder.encoder.layer.3.attention.attention.value.weight', 'vision_encoder.encoder.layer.3.attention.attention.value.bias', 'vision_encoder.encoder.layer.3.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.3.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.3.attention.output.dense.weight', 'vision_encoder.encoder.layer.3.attention.output.dense.bias', 'vision_encoder.encoder.layer.3.intermediate.dense.weight', 'vision_encoder.encoder.layer.3.intermediate.dense.bias', 'vision_encoder.encoder.layer.3.output.dense.weight', 'vision_encoder.encoder.layer.3.output.dense.bias', 'vision_encoder.encoder.layer.3.layernorm_before.weight', 'vision_encoder.encoder.layer.3.layernorm_before.bias', 'vision_encoder.encoder.layer.3.layernorm_after.weight', 'vision_encoder.encoder.layer.3.layernorm_after.bias', 'vision_encoder.encoder.layer.4.lambda_1', 'vision_encoder.encoder.layer.4.lambda_2', 'vision_encoder.encoder.layer.4.attention.attention.query.weight', 'vision_encoder.encoder.layer.4.attention.attention.query.bias', 'vision_encoder.encoder.layer.4.attention.attention.key.weight', 'vision_encoder.encoder.layer.4.attention.attention.value.weight', 'vision_encoder.encoder.layer.4.attention.attention.value.bias', 'vision_encoder.encoder.layer.4.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.4.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.4.attention.output.dense.weight', 'vision_encoder.encoder.layer.4.attention.output.dense.bias', 'vision_encoder.encoder.layer.4.intermediate.dense.weight', 'vision_encoder.encoder.layer.4.intermediate.dense.bias', 'vision_encoder.encoder.layer.4.output.dense.weight', 'vision_encoder.encoder.layer.4.output.dense.bias', 'vision_encoder.encoder.layer.4.layernorm_before.weight', 'vision_encoder.encoder.layer.4.layernorm_before.bias', 'vision_encoder.encoder.layer.4.layernorm_after.weight', 'vision_encoder.encoder.layer.4.layernorm_after.bias', 'vision_encoder.encoder.layer.5.lambda_1', 'vision_encoder.encoder.layer.5.lambda_2', 'vision_encoder.encoder.layer.5.attention.attention.query.weight', 'vision_encoder.encoder.layer.5.attention.attention.query.bias', 'vision_encoder.encoder.layer.5.attention.attention.key.weight', 'vision_encoder.encoder.layer.5.attention.attention.value.weight', 'vision_encoder.encoder.layer.5.attention.attention.value.bias', 'vision_encoder.encoder.layer.5.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.5.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.5.attention.output.dense.weight', 'vision_encoder.encoder.layer.5.attention.output.dense.bias', 'vision_encoder.encoder.layer.5.intermediate.dense.weight', 'vision_encoder.encoder.layer.5.intermediate.dense.bias', 'vision_encoder.encoder.layer.5.output.dense.weight', 'vision_encoder.encoder.layer.5.output.dense.bias', 'vision_encoder.encoder.layer.5.layernorm_before.weight', 'vision_encoder.encoder.layer.5.layernorm_before.bias', 'vision_encoder.encoder.layer.5.layernorm_after.weight', 'vision_encoder.encoder.layer.5.layernorm_after.bias', 'vision_encoder.encoder.layer.6.lambda_1', 'vision_encoder.encoder.layer.6.lambda_2', 'vision_encoder.encoder.layer.6.attention.attention.query.weight', 'vision_encoder.encoder.layer.6.attention.attention.query.bias', 'vision_encoder.encoder.layer.6.attention.attention.key.weight', 'vision_encoder.encoder.layer.6.attention.attention.value.weight', 'vision_encoder.encoder.layer.6.attention.attention.value.bias', 'vision_encoder.encoder.layer.6.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.6.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.6.attention.output.dense.weight', 'vision_encoder.encoder.layer.6.attention.output.dense.bias', 'vision_encoder.encoder.layer.6.intermediate.dense.weight', 'vision_encoder.encoder.layer.6.intermediate.dense.bias', 'vision_encoder.encoder.layer.6.output.dense.weight', 'vision_encoder.encoder.layer.6.output.dense.bias', 'vision_encoder.encoder.layer.6.layernorm_before.weight', 'vision_encoder.encoder.layer.6.layernorm_before.bias', 'vision_encoder.encoder.layer.6.layernorm_after.weight', 'vision_encoder.encoder.layer.6.layernorm_after.bias', 'vision_encoder.encoder.layer.7.lambda_1', 'vision_encoder.encoder.layer.7.lambda_2', 'vision_encoder.encoder.layer.7.attention.attention.query.weight', 'vision_encoder.encoder.layer.7.attention.attention.query.bias', 'vision_encoder.encoder.layer.7.attention.attention.key.weight', 'vision_encoder.encoder.layer.7.attention.attention.value.weight', 'vision_encoder.encoder.layer.7.attention.attention.value.bias', 'vision_encoder.encoder.layer.7.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.7.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.7.attention.output.dense.weight', 'vision_encoder.encoder.layer.7.attention.output.dense.bias', 'vision_encoder.encoder.layer.7.intermediate.dense.weight', 'vision_encoder.encoder.layer.7.intermediate.dense.bias', 'vision_encoder.encoder.layer.7.output.dense.weight', 'vision_encoder.encoder.layer.7.output.dense.bias', 'vision_encoder.encoder.layer.7.layernorm_before.weight', 'vision_encoder.encoder.layer.7.layernorm_before.bias', 'vision_encoder.encoder.layer.7.layernorm_after.weight', 'vision_encoder.encoder.layer.7.layernorm_after.bias', 'vision_encoder.encoder.layer.8.lambda_1', 'vision_encoder.encoder.layer.8.lambda_2', 'vision_encoder.encoder.layer.8.attention.attention.query.weight', 'vision_encoder.encoder.layer.8.attention.attention.query.bias', 'vision_encoder.encoder.layer.8.attention.attention.key.weight', 'vision_encoder.encoder.layer.8.attention.attention.value.weight', 'vision_encoder.encoder.layer.8.attention.attention.value.bias', 'vision_encoder.encoder.layer.8.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.8.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.8.attention.output.dense.weight', 'vision_encoder.encoder.layer.8.attention.output.dense.bias', 'vision_encoder.encoder.layer.8.intermediate.dense.weight', 'vision_encoder.encoder.layer.8.intermediate.dense.bias', 'vision_encoder.encoder.layer.8.output.dense.weight', 'vision_encoder.encoder.layer.8.output.dense.bias', 'vision_encoder.encoder.layer.8.layernorm_before.weight', 'vision_encoder.encoder.layer.8.layernorm_before.bias', 'vision_encoder.encoder.layer.8.layernorm_after.weight', 'vision_encoder.encoder.layer.8.layernorm_after.bias', 'vision_encoder.encoder.layer.9.lambda_1', 'vision_encoder.encoder.layer.9.lambda_2', 'vision_encoder.encoder.layer.9.attention.attention.query.weight', 'vision_encoder.encoder.layer.9.attention.attention.query.bias', 'vision_encoder.encoder.layer.9.attention.attention.key.weight', 'vision_encoder.encoder.layer.9.attention.attention.value.weight', 'vision_encoder.encoder.layer.9.attention.attention.value.bias', 'vision_encoder.encoder.layer.9.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.9.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.9.attention.output.dense.weight', 'vision_encoder.encoder.layer.9.attention.output.dense.bias', 'vision_encoder.encoder.layer.9.intermediate.dense.weight', 'vision_encoder.encoder.layer.9.intermediate.dense.bias', 'vision_encoder.encoder.layer.9.output.dense.weight', 'vision_encoder.encoder.layer.9.output.dense.bias', 'vision_encoder.encoder.layer.9.layernorm_before.weight', 'vision_encoder.encoder.layer.9.layernorm_before.bias', 'vision_encoder.encoder.layer.9.layernorm_after.weight', 'vision_encoder.encoder.layer.9.layernorm_after.bias', 'vision_encoder.encoder.layer.10.lambda_1', 'vision_encoder.encoder.layer.10.lambda_2', 'vision_encoder.encoder.layer.10.attention.attention.query.weight', 'vision_encoder.encoder.layer.10.attention.attention.query.bias', 'vision_encoder.encoder.layer.10.attention.attention.key.weight', 'vision_encoder.encoder.layer.10.attention.attention.value.weight', 'vision_encoder.encoder.layer.10.attention.attention.value.bias', 'vision_encoder.encoder.layer.10.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.10.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.10.attention.output.dense.weight', 'vision_encoder.encoder.layer.10.attention.output.dense.bias', 'vision_encoder.encoder.layer.10.intermediate.dense.weight', 'vision_encoder.encoder.layer.10.intermediate.dense.bias', 'vision_encoder.encoder.layer.10.output.dense.weight', 'vision_encoder.encoder.layer.10.output.dense.bias', 'vision_encoder.encoder.layer.10.layernorm_before.weight', 'vision_encoder.encoder.layer.10.layernorm_before.bias', 'vision_encoder.encoder.layer.10.layernorm_after.weight', 'vision_encoder.encoder.layer.10.layernorm_after.bias', 'vision_encoder.encoder.layer.11.lambda_1', 'vision_encoder.encoder.layer.11.lambda_2', 'vision_encoder.encoder.layer.11.attention.attention.query.weight', 'vision_encoder.encoder.layer.11.attention.attention.query.bias', 'vision_encoder.encoder.layer.11.attention.attention.key.weight', 'vision_encoder.encoder.layer.11.attention.attention.value.weight', 'vision_encoder.encoder.layer.11.attention.attention.value.bias', 'vision_encoder.encoder.layer.11.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.11.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.11.attention.output.dense.weight', 'vision_encoder.encoder.layer.11.attention.output.dense.bias', 'vision_encoder.encoder.layer.11.intermediate.dense.weight', 'vision_encoder.encoder.layer.11.intermediate.dense.bias', 'vision_encoder.encoder.layer.11.output.dense.weight', 'vision_encoder.encoder.layer.11.output.dense.bias', 'vision_encoder.encoder.layer.11.layernorm_before.weight', 'vision_encoder.encoder.layer.11.layernorm_before.bias', 'vision_encoder.encoder.layer.11.layernorm_after.weight', 'vision_encoder.encoder.layer.11.layernorm_after.bias', 'vision_layernorm.weight', 'vision_layernorm.bias', 'text_encoder.embeddings.position_ids', 'text_encoder.embeddings.word_embeddings.weight', 'text_encoder.embeddings.position_embeddings.weight', 'text_encoder.embeddings.token_type_embeddings.weight', 'text_encoder.embeddings.LayerNorm.weight', 'text_encoder.embeddings.LayerNorm.bias', 'text_encoder.encoder.layer.0.attention.self.query.weight', 'text_encoder.encoder.layer.0.attention.self.query.bias', 'text_encoder.encoder.layer.0.attention.self.key.weight', 'text_encoder.encoder.layer.0.attention.self.key.bias', 'text_encoder.encoder.layer.0.attention.self.value.weight', 'text_encoder.encoder.layer.0.attention.self.value.bias', 'text_encoder.encoder.layer.0.attention.output.dense.weight', 'text_encoder.encoder.layer.0.attention.output.dense.bias', 'text_encoder.encoder.layer.0.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.0.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.0.intermediate.dense.weight', 'text_encoder.encoder.layer.0.intermediate.dense.bias', 'text_encoder.encoder.layer.0.output.dense.weight', 'text_encoder.encoder.layer.0.output.dense.bias', 'text_encoder.encoder.layer.0.output.LayerNorm.weight', 'text_encoder.encoder.layer.0.output.LayerNorm.bias', 'text_encoder.encoder.layer.1.attention.self.query.weight', 'text_encoder.encoder.layer.1.attention.self.query.bias', 'text_encoder.encoder.layer.1.attention.self.key.weight', 'text_encoder.encoder.layer.1.attention.self.key.bias', 'text_encoder.encoder.layer.1.attention.self.value.weight', 'text_encoder.encoder.layer.1.attention.self.value.bias', 'text_encoder.encoder.layer.1.attention.output.dense.weight', 'text_encoder.encoder.layer.1.attention.output.dense.bias', 'text_encoder.encoder.layer.1.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.1.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.1.intermediate.dense.weight', 'text_encoder.encoder.layer.1.intermediate.dense.bias', 'text_encoder.encoder.layer.1.output.dense.weight', 'text_encoder.encoder.layer.1.output.dense.bias', 'text_encoder.encoder.layer.1.output.LayerNorm.weight', 'text_encoder.encoder.layer.1.output.LayerNorm.bias', 'text_encoder.encoder.layer.2.attention.self.query.weight', 'text_encoder.encoder.layer.2.attention.self.query.bias', 'text_encoder.encoder.layer.2.attention.self.key.weight', 'text_encoder.encoder.layer.2.attention.self.key.bias', 'text_encoder.encoder.layer.2.attention.self.value.weight', 'text_encoder.encoder.layer.2.attention.self.value.bias', 'text_encoder.encoder.layer.2.attention.output.dense.weight', 'text_encoder.encoder.layer.2.attention.output.dense.bias', 'text_encoder.encoder.layer.2.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.2.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.2.intermediate.dense.weight', 'text_encoder.encoder.layer.2.intermediate.dense.bias', 'text_encoder.encoder.layer.2.output.dense.weight', 'text_encoder.encoder.layer.2.output.dense.bias', 'text_encoder.encoder.layer.2.output.LayerNorm.weight', 'text_encoder.encoder.layer.2.output.LayerNorm.bias', 'text_encoder.encoder.layer.3.attention.self.query.weight', 'text_encoder.encoder.layer.3.attention.self.query.bias', 'text_encoder.encoder.layer.3.attention.self.key.weight', 'text_encoder.encoder.layer.3.attention.self.key.bias', 'text_encoder.encoder.layer.3.attention.self.value.weight', 'text_encoder.encoder.layer.3.attention.self.value.bias', 'text_encoder.encoder.layer.3.attention.output.dense.weight', 'text_encoder.encoder.layer.3.attention.output.dense.bias', 'text_encoder.encoder.layer.3.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.3.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.3.intermediate.dense.weight', 'text_encoder.encoder.layer.3.intermediate.dense.bias', 'text_encoder.encoder.layer.3.output.dense.weight', 'text_encoder.encoder.layer.3.output.dense.bias', 'text_encoder.encoder.layer.3.output.LayerNorm.weight', 'text_encoder.encoder.layer.3.output.LayerNorm.bias', 'text_encoder.encoder.layer.4.attention.self.query.weight', 'text_encoder.encoder.layer.4.attention.self.query.bias', 'text_encoder.encoder.layer.4.attention.self.key.weight', 'text_encoder.encoder.layer.4.attention.self.key.bias', 'text_encoder.encoder.layer.4.attention.self.value.weight', 'text_encoder.encoder.layer.4.attention.self.value.bias', 'text_encoder.encoder.layer.4.attention.output.dense.weight', 'text_encoder.encoder.layer.4.attention.output.dense.bias', 'text_encoder.encoder.layer.4.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.4.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.4.intermediate.dense.weight', 'text_encoder.encoder.layer.4.intermediate.dense.bias', 'text_encoder.encoder.layer.4.output.dense.weight', 'text_encoder.encoder.layer.4.output.dense.bias', 'text_encoder.encoder.layer.4.output.LayerNorm.weight', 'text_encoder.encoder.layer.4.output.LayerNorm.bias', 'text_encoder.encoder.layer.5.attention.self.query.weight', 'text_encoder.encoder.layer.5.attention.self.query.bias', 'text_encoder.encoder.layer.5.attention.self.key.weight', 'text_encoder.encoder.layer.5.attention.self.key.bias', 'text_encoder.encoder.layer.5.attention.self.value.weight', 'text_encoder.encoder.layer.5.attention.self.value.bias', 'text_encoder.encoder.layer.5.attention.output.dense.weight', 'text_encoder.encoder.layer.5.attention.output.dense.bias', 'text_encoder.encoder.layer.5.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.5.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.5.intermediate.dense.weight', 'text_encoder.encoder.layer.5.intermediate.dense.bias', 'text_encoder.encoder.layer.5.output.dense.weight', 'text_encoder.encoder.layer.5.output.dense.bias', 'text_encoder.encoder.layer.5.output.LayerNorm.weight', 'text_encoder.encoder.layer.5.output.LayerNorm.bias', 'text_encoder.encoder.layer.6.attention.self.query.weight', 'text_encoder.encoder.layer.6.attention.self.query.bias', 'text_encoder.encoder.layer.6.attention.self.key.weight', 'text_encoder.encoder.layer.6.attention.self.key.bias', 'text_encoder.encoder.layer.6.attention.self.value.weight', 'text_encoder.encoder.layer.6.attention.self.value.bias', 'text_encoder.encoder.layer.6.attention.output.dense.weight', 'text_encoder.encoder.layer.6.attention.output.dense.bias', 'text_encoder.encoder.layer.6.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.6.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.6.intermediate.dense.weight', 'text_encoder.encoder.layer.6.intermediate.dense.bias', 'text_encoder.encoder.layer.6.output.dense.weight', 'text_encoder.encoder.layer.6.output.dense.bias', 'text_encoder.encoder.layer.6.output.LayerNorm.weight', 'text_encoder.encoder.layer.6.output.LayerNorm.bias', 'text_encoder.encoder.layer.7.attention.self.query.weight', 'text_encoder.encoder.layer.7.attention.self.query.bias', 'text_encoder.encoder.layer.7.attention.self.key.weight', 'text_encoder.encoder.layer.7.attention.self.key.bias', 'text_encoder.encoder.layer.7.attention.self.value.weight', 'text_encoder.encoder.layer.7.attention.self.value.bias', 'text_encoder.encoder.layer.7.attention.output.dense.weight', 'text_encoder.encoder.layer.7.attention.output.dense.bias', 'text_encoder.encoder.layer.7.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.7.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.7.intermediate.dense.weight', 'text_encoder.encoder.layer.7.intermediate.dense.bias', 'text_encoder.encoder.layer.7.output.dense.weight', 'text_encoder.encoder.layer.7.output.dense.bias', 'text_encoder.encoder.layer.7.output.LayerNorm.weight', 'text_encoder.encoder.layer.7.output.LayerNorm.bias', 'text_encoder.encoder.layer.8.attention.self.query.weight', 'text_encoder.encoder.layer.8.attention.self.query.bias', 'text_encoder.encoder.layer.8.attention.self.key.weight', 'text_encoder.encoder.layer.8.attention.self.key.bias', 'text_encoder.encoder.layer.8.attention.self.value.weight', 'text_encoder.encoder.layer.8.attention.self.value.bias', 'text_encoder.encoder.layer.8.attention.output.dense.weight', 'text_encoder.encoder.layer.8.attention.output.dense.bias', 'text_encoder.encoder.layer.8.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.8.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.8.intermediate.dense.weight', 'text_encoder.encoder.layer.8.intermediate.dense.bias', 'text_encoder.encoder.layer.8.output.dense.weight', 'text_encoder.encoder.layer.8.output.dense.bias', 'text_encoder.encoder.layer.8.output.LayerNorm.weight', 'text_encoder.encoder.layer.8.output.LayerNorm.bias', 'text_encoder.encoder.layer.9.attention.self.query.weight', 'text_encoder.encoder.layer.9.attention.self.query.bias', 'text_encoder.encoder.layer.9.attention.self.key.weight', 'text_encoder.encoder.layer.9.attention.self.key.bias', 'text_encoder.encoder.layer.9.attention.self.value.weight', 'text_encoder.encoder.layer.9.attention.self.value.bias', 'text_encoder.encoder.layer.9.attention.output.dense.weight', 'text_encoder.encoder.layer.9.attention.output.dense.bias', 'text_encoder.encoder.layer.9.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.9.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.9.crossattention.self.query.weight', 'text_encoder.encoder.layer.9.crossattention.self.query.bias', 'text_encoder.encoder.layer.9.crossattention.self.key.weight', 'text_encoder.encoder.layer.9.crossattention.self.key.bias', 'text_encoder.encoder.layer.9.crossattention.self.value.weight', 'text_encoder.encoder.layer.9.crossattention.self.value.bias', 'text_encoder.encoder.layer.9.crossattention.output.dense.weight', 'text_encoder.encoder.layer.9.crossattention.output.dense.bias', 'text_encoder.encoder.layer.9.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.9.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.9.intermediate.dense.weight', 'text_encoder.encoder.layer.9.intermediate.dense.bias', 'text_encoder.encoder.layer.9.output.dense.weight', 'text_encoder.encoder.layer.9.output.dense.bias', 'text_encoder.encoder.layer.9.output.LayerNorm.weight', 'text_encoder.encoder.layer.9.output.LayerNorm.bias', 'text_encoder.encoder.layer.10.attention.self.query.weight', 'text_encoder.encoder.layer.10.attention.self.query.bias', 'text_encoder.encoder.layer.10.attention.self.key.weight', 'text_encoder.encoder.layer.10.attention.self.key.bias', 'text_encoder.encoder.layer.10.attention.self.value.weight', 'text_encoder.encoder.layer.10.attention.self.value.bias', 'text_encoder.encoder.layer.10.attention.output.dense.weight', 'text_encoder.encoder.layer.10.attention.output.dense.bias', 'text_encoder.encoder.layer.10.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.10.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.10.crossattention.self.query.weight', 'text_encoder.encoder.layer.10.crossattention.self.query.bias', 'text_encoder.encoder.layer.10.crossattention.self.key.weight', 'text_encoder.encoder.layer.10.crossattention.self.key.bias', 'text_encoder.encoder.layer.10.crossattention.self.value.weight', 'text_encoder.encoder.layer.10.crossattention.self.value.bias', 'text_encoder.encoder.layer.10.crossattention.output.dense.weight', 'text_encoder.encoder.layer.10.crossattention.output.dense.bias', 'text_encoder.encoder.layer.10.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.10.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.10.intermediate.dense.weight', 'text_encoder.encoder.layer.10.intermediate.dense.bias', 'text_encoder.encoder.layer.10.output.dense.weight', 'text_encoder.encoder.layer.10.output.dense.bias', 'text_encoder.encoder.layer.10.output.LayerNorm.weight', 'text_encoder.encoder.layer.10.output.LayerNorm.bias', 'text_encoder.encoder.layer.11.attention.self.query.weight', 'text_encoder.encoder.layer.11.attention.self.query.bias', 'text_encoder.encoder.layer.11.attention.self.key.weight', 'text_encoder.encoder.layer.11.attention.self.key.bias', 'text_encoder.encoder.layer.11.attention.self.value.weight', 'text_encoder.encoder.layer.11.attention.self.value.bias', 'text_encoder.encoder.layer.11.attention.output.dense.weight', 'text_encoder.encoder.layer.11.attention.output.dense.bias', 'text_encoder.encoder.layer.11.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.11.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.11.crossattention.self.query.weight', 'text_encoder.encoder.layer.11.crossattention.self.query.bias', 'text_encoder.encoder.layer.11.crossattention.self.key.weight', 'text_encoder.encoder.layer.11.crossattention.self.key.bias', 'text_encoder.encoder.layer.11.crossattention.self.value.weight', 'text_encoder.encoder.layer.11.crossattention.self.value.bias', 'text_encoder.encoder.layer.11.crossattention.output.dense.weight', 'text_encoder.encoder.layer.11.crossattention.output.dense.bias', 'text_encoder.encoder.layer.11.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.11.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.11.intermediate.dense.weight', 'text_encoder.encoder.layer.11.intermediate.dense.bias', 'text_encoder.encoder.layer.11.output.dense.weight', 'text_encoder.encoder.layer.11.output.dense.bias', 'text_encoder.encoder.layer.11.output.LayerNorm.weight', 'text_encoder.encoder.layer.11.output.LayerNorm.bias', 'temporal_vision_encoder.layer.0.attention.self.query.weight', 'temporal_vision_encoder.layer.0.attention.self.query.bias', 'temporal_vision_encoder.layer.0.attention.self.key.weight', 'temporal_vision_encoder.layer.0.attention.self.key.bias', 'temporal_vision_encoder.layer.0.attention.self.value.weight', 'temporal_vision_encoder.layer.0.attention.self.value.bias', 'temporal_vision_encoder.layer.0.attention.output.dense.weight', 'temporal_vision_encoder.layer.0.attention.output.dense.bias', 'temporal_vision_encoder.layer.0.attention.output.LayerNorm.weight', 'temporal_vision_encoder.layer.0.attention.output.LayerNorm.bias', 'temporal_vision_encoder.layer.0.intermediate.dense.weight', 'temporal_vision_encoder.layer.0.intermediate.dense.bias', 'temporal_vision_encoder.layer.0.output.dense.weight', 'temporal_vision_encoder.layer.0.output.dense.bias', 'temporal_vision_encoder.layer.0.output.LayerNorm.weight', 'temporal_vision_encoder.layer.0.output.LayerNorm.bias', 'temporal_vision_encoder.layer.1.attention.self.query.weight', 'temporal_vision_encoder.layer.1.attention.self.query.bias', 'temporal_vision_encoder.layer.1.attention.self.key.weight', 'temporal_vision_encoder.layer.1.attention.self.key.bias', 'temporal_vision_encoder.layer.1.attention.self.value.weight', 'temporal_vision_encoder.layer.1.attention.self.value.bias', 'temporal_vision_encoder.layer.1.attention.output.dense.weight', 'temporal_vision_encoder.layer.1.attention.output.dense.bias', 'temporal_vision_encoder.layer.1.attention.output.LayerNorm.weight', 'temporal_vision_encoder.layer.1.attention.output.LayerNorm.bias', 'temporal_vision_encoder.layer.1.intermediate.dense.weight', 'temporal_vision_encoder.layer.1.intermediate.dense.bias', 'temporal_vision_encoder.layer.1.output.dense.weight', 'temporal_vision_encoder.layer.1.output.dense.bias', 'temporal_vision_encoder.layer.1.output.LayerNorm.weight', 'temporal_vision_encoder.layer.1.output.LayerNorm.bias', 'text_decoder.bert.embeddings.position_ids', 'text_decoder.bert.embeddings.word_embeddings.weight', 'text_decoder.bert.embeddings.position_embeddings.weight', 'text_decoder.bert.embeddings.token_type_embeddings.weight', 'text_decoder.bert.embeddings.LayerNorm.weight', 'text_decoder.bert.embeddings.LayerNorm.bias', 'text_decoder.bert.encoder.layer.0.attention.self.query.weight', 'text_decoder.bert.encoder.layer.0.attention.self.query.bias', 'text_decoder.bert.encoder.layer.0.attention.self.key.weight', 'text_decoder.bert.encoder.layer.0.attention.self.key.bias', 'text_decoder.bert.encoder.layer.0.attention.self.value.weight', 'text_decoder.bert.encoder.layer.0.attention.self.value.bias', 'text_decoder.bert.encoder.layer.0.attention.output.dense.weight', 'text_decoder.bert.encoder.layer.0.attention.output.dense.bias', 'text_decoder.bert.encoder.layer.0.attention.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.0.attention.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.0.crossattention.self.query.weight', 'text_decoder.bert.encoder.layer.0.crossattention.self.query.bias', 'text_decoder.bert.encoder.layer.0.crossattention.self.key.weight', 'text_decoder.bert.encoder.layer.0.crossattention.self.key.bias', 'text_decoder.bert.encoder.layer.0.crossattention.self.value.weight', 'text_decoder.bert.encoder.layer.0.crossattention.self.value.bias', 'text_decoder.bert.encoder.layer.0.crossattention.output.dense.weight', 'text_decoder.bert.encoder.layer.0.crossattention.output.dense.bias', 'text_decoder.bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.0.intermediate.dense.weight', 'text_decoder.bert.encoder.layer.0.intermediate.dense.bias', 'text_decoder.bert.encoder.layer.0.output.dense.weight', 'text_decoder.bert.encoder.layer.0.output.dense.bias', 'text_decoder.bert.encoder.layer.0.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.0.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.1.attention.self.query.weight', 'text_decoder.bert.encoder.layer.1.attention.self.query.bias', 'text_decoder.bert.encoder.layer.1.attention.self.key.weight', 'text_decoder.bert.encoder.layer.1.attention.self.key.bias', 'text_decoder.bert.encoder.layer.1.attention.self.value.weight', 'text_decoder.bert.encoder.layer.1.attention.self.value.bias', 'text_decoder.bert.encoder.layer.1.attention.output.dense.weight', 'text_decoder.bert.encoder.layer.1.attention.output.dense.bias', 'text_decoder.bert.encoder.layer.1.attention.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.1.attention.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.1.crossattention.self.query.weight', 'text_decoder.bert.encoder.layer.1.crossattention.self.query.bias', 'text_decoder.bert.encoder.layer.1.crossattention.self.key.weight', 'text_decoder.bert.encoder.layer.1.crossattention.self.key.bias', 'text_decoder.bert.encoder.layer.1.crossattention.self.value.weight', 'text_decoder.bert.encoder.layer.1.crossattention.self.value.bias', 'text_decoder.bert.encoder.layer.1.crossattention.output.dense.weight', 'text_decoder.bert.encoder.layer.1.crossattention.output.dense.bias', 'text_decoder.bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.1.intermediate.dense.weight', 'text_decoder.bert.encoder.layer.1.intermediate.dense.bias', 'text_decoder.bert.encoder.layer.1.output.dense.weight', 'text_decoder.bert.encoder.layer.1.output.dense.bias', 'text_decoder.bert.encoder.layer.1.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.1.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.2.attention.self.query.weight', 'text_decoder.bert.encoder.layer.2.attention.self.query.bias', 'text_decoder.bert.encoder.layer.2.attention.self.key.weight', 'text_decoder.bert.encoder.layer.2.attention.self.key.bias', 'text_decoder.bert.encoder.layer.2.attention.self.value.weight', 'text_decoder.bert.encoder.layer.2.attention.self.value.bias', 'text_decoder.bert.encoder.layer.2.attention.output.dense.weight', 'text_decoder.bert.encoder.layer.2.attention.output.dense.bias', 'text_decoder.bert.encoder.layer.2.attention.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.2.attention.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.2.crossattention.self.query.weight', 'text_decoder.bert.encoder.layer.2.crossattention.self.query.bias', 'text_decoder.bert.encoder.layer.2.crossattention.self.key.weight', 'text_decoder.bert.encoder.layer.2.crossattention.self.key.bias', 'text_decoder.bert.encoder.layer.2.crossattention.self.value.weight', 'text_decoder.bert.encoder.layer.2.crossattention.self.value.bias', 'text_decoder.bert.encoder.layer.2.crossattention.output.dense.weight', 'text_decoder.bert.encoder.layer.2.crossattention.output.dense.bias', 'text_decoder.bert.encoder.layer.2.crossattention.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.2.crossattention.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.2.intermediate.dense.weight', 'text_decoder.bert.encoder.layer.2.intermediate.dense.bias', 'text_decoder.bert.encoder.layer.2.output.dense.weight', 'text_decoder.bert.encoder.layer.2.output.dense.bias', 'text_decoder.bert.encoder.layer.2.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.2.output.LayerNorm.bias', 'text_decoder.cls.predictions.bias', 'text_decoder.cls.predictions.transform.dense.weight', 'text_decoder.cls.predictions.transform.dense.bias', 'text_decoder.cls.predictions.transform.LayerNorm.weight', 'text_decoder.cls.predictions.transform.LayerNorm.bias', 'text_decoder.cls.predictions.decoder.weight', 'text_decoder.cls.predictions.decoder.bias'])
[32m2023-10-19T12:39:23 | models.utils: [0mLoad temporal_embeddings, lengths: 4-->1
model_without_ddp: Singularity(
(vision_encoder): BeitModel(
(embeddings): BeitEmbeddings(
(patch_embeddings): PatchEmbeddings(
(projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
)
(dropout): Dropout(p=0.0, inplace=False)
)
(encoder): BeitEncoder(
(layer): ModuleList(
(0): BeitLayer(
(attention): BeitAttention(
(attention): BeitSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(relative_position_bias): BeitRelativePositionBias()
)
(output): BeitSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
)
(intermediate): BeitIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BeitOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
(layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(drop_path): Identity()
(layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(1): BeitLayer(
(attention): BeitAttention(
(attention): BeitSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(relative_position_bias): BeitRelativePositionBias()
)
(output): BeitSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
)
(intermediate): BeitIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BeitOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
(layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(drop_path): DropPath(p=0.00909090880304575)
(layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(2): BeitLayer(
(attention): BeitAttention(
(attention): BeitSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(relative_position_bias): BeitRelativePositionBias()
)
(output): BeitSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
)
(intermediate): BeitIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BeitOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
(layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(drop_path): DropPath(p=0.0181818176060915)
(layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(3): BeitLayer(
(attention): BeitAttention(
(attention): BeitSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(relative_position_bias): BeitRelativePositionBias()
)
(output): BeitSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
)
(intermediate): BeitIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BeitOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
(layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(drop_path): DropPath(p=0.027272727340459824)
(layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(4): BeitLayer(
(attention): BeitAttention(
(attention): BeitSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(relative_position_bias): BeitRelativePositionBias()
)
(output): BeitSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
)
(intermediate): BeitIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BeitOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
(layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(drop_path): DropPath(p=0.036363635212183)
(layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(5): BeitLayer(
(attention): BeitAttention(
(attention): BeitSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(relative_position_bias): BeitRelativePositionBias()
)
(output): BeitSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
)
(intermediate): BeitIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BeitOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
(layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(drop_path): DropPath(p=0.045454543083906174)
(layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(6): BeitLayer(
(attention): BeitAttention(
(attention): BeitSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(relative_position_bias): BeitRelativePositionBias()
)
(output): BeitSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
)
(intermediate): BeitIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BeitOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
(layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(drop_path): DropPath(p=0.054545458406209946)
(layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(7): BeitLayer(
(attention): BeitAttention(
(attention): BeitSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(relative_position_bias): BeitRelativePositionBias()
)
(output): BeitSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
)
(intermediate): BeitIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BeitOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
(layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(drop_path): DropPath(p=0.06363636255264282)
(layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(8): BeitLayer(
(attention): BeitAttention(
(attention): BeitSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(relative_position_bias): BeitRelativePositionBias()
)
(output): BeitSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
)
(intermediate): BeitIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BeitOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
(layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(drop_path): DropPath(p=0.0727272778749466)
(layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(9): BeitLayer(
(attention): BeitAttention(
(attention): BeitSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(relative_position_bias): BeitRelativePositionBias()
)
(output): BeitSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
)
(intermediate): BeitIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BeitOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
(layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(drop_path): DropPath(p=0.08181818574666977)
(layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(10): BeitLayer(
(attention): BeitAttention(
(attention): BeitSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(relative_position_bias): BeitRelativePositionBias()
)
(output): BeitSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
)
(intermediate): BeitIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BeitOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
(layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(drop_path): DropPath(p=0.09090909361839294)
(layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
(11): BeitLayer(
(attention): BeitAttention(
(attention): BeitSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=False)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
(relative_position_bias): BeitRelativePositionBias()
)
(output): BeitSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
)
(intermediate): BeitIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BeitOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(dropout): Dropout(p=0.0, inplace=False)
)
(layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(drop_path): DropPath(p=0.10000000149011612)
(layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
)
)
(layernorm): Identity()
(pooler): BeitPooler(
(layernorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)
)
(vision_layernorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(text_encoder): BertModel(
(embeddings): BertEmbeddings(
(word_embeddings): Embedding(30522, 768, padding_idx=0)
(position_embeddings): Embedding(512, 768)
(token_type_embeddings): Embedding(2, 768)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(encoder): BertEncoder(
(layer): ModuleList(
(0): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(1): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(2): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(3): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(4): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(5): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(6): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(7): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(8): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(9): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(crossattention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(10): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(crossattention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(11): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(crossattention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
)
(vision_proj): Linear(in_features=768, out_features=256, bias=True)
(text_proj): Linear(in_features=768, out_features=256, bias=True)
(itm_head): Linear(in_features=768, out_features=2, bias=True)
(temporal_vision_encoder): BertEncoder(
(layer): ModuleList(
(0): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(1): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
)
model_without_ddp.state_dict().keys(): odict_keys(['temp', 'temporal_embeddings', 'vision_encoder.embeddings.cls_token', 'vision_encoder.embeddings.patch_embeddings.projection.weight', 'vision_encoder.embeddings.patch_embeddings.projection.bias', 'vision_encoder.encoder.layer.0.lambda_1', 'vision_encoder.encoder.layer.0.lambda_2', 'vision_encoder.encoder.layer.0.attention.attention.query.weight', 'vision_encoder.encoder.layer.0.attention.attention.query.bias', 'vision_encoder.encoder.layer.0.attention.attention.key.weight', 'vision_encoder.encoder.layer.0.attention.attention.value.weight', 'vision_encoder.encoder.layer.0.attention.attention.value.bias', 'vision_encoder.encoder.layer.0.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.0.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.0.attention.output.dense.weight', 'vision_encoder.encoder.layer.0.attention.output.dense.bias', 'vision_encoder.encoder.layer.0.intermediate.dense.weight', 'vision_encoder.encoder.layer.0.intermediate.dense.bias', 'vision_encoder.encoder.layer.0.output.dense.weight', 'vision_encoder.encoder.layer.0.output.dense.bias', 'vision_encoder.encoder.layer.0.layernorm_before.weight', 'vision_encoder.encoder.layer.0.layernorm_before.bias', 'vision_encoder.encoder.layer.0.layernorm_after.weight', 'vision_encoder.encoder.layer.0.layernorm_after.bias', 'vision_encoder.encoder.layer.1.lambda_1', 'vision_encoder.encoder.layer.1.lambda_2', 'vision_encoder.encoder.layer.1.attention.attention.query.weight', 'vision_encoder.encoder.layer.1.attention.attention.query.bias', 'vision_encoder.encoder.layer.1.attention.attention.key.weight', 'vision_encoder.encoder.layer.1.attention.attention.value.weight', 'vision_encoder.encoder.layer.1.attention.attention.value.bias', 'vision_encoder.encoder.layer.1.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.1.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.1.attention.output.dense.weight', 'vision_encoder.encoder.layer.1.attention.output.dense.bias', 'vision_encoder.encoder.layer.1.intermediate.dense.weight', 'vision_encoder.encoder.layer.1.intermediate.dense.bias', 'vision_encoder.encoder.layer.1.output.dense.weight', 'vision_encoder.encoder.layer.1.output.dense.bias', 'vision_encoder.encoder.layer.1.layernorm_before.weight', 'vision_encoder.encoder.layer.1.layernorm_before.bias', 'vision_encoder.encoder.layer.1.layernorm_after.weight', 'vision_encoder.encoder.layer.1.layernorm_after.bias', 'vision_encoder.encoder.layer.2.lambda_1', 'vision_encoder.encoder.layer.2.lambda_2', 'vision_encoder.encoder.layer.2.attention.attention.query.weight', 'vision_encoder.encoder.layer.2.attention.attention.query.bias', 'vision_encoder.encoder.layer.2.attention.attention.key.weight', 'vision_encoder.encoder.layer.2.attention.attention.value.weight', 'vision_encoder.encoder.layer.2.attention.attention.value.bias', 'vision_encoder.encoder.layer.2.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.2.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.2.attention.output.dense.weight', 'vision_encoder.encoder.layer.2.attention.output.dense.bias', 'vision_encoder.encoder.layer.2.intermediate.dense.weight', 'vision_encoder.encoder.layer.2.intermediate.dense.bias', 'vision_encoder.encoder.layer.2.output.dense.weight', 'vision_encoder.encoder.layer.2.output.dense.bias', 'vision_encoder.encoder.layer.2.layernorm_before.weight', 'vision_encoder.encoder.layer.2.layernorm_before.bias', 'vision_encoder.encoder.layer.2.layernorm_after.weight', 'vision_encoder.encoder.layer.2.layernorm_after.bias', 'vision_encoder.encoder.layer.3.lambda_1', 'vision_encoder.encoder.layer.3.lambda_2', 'vision_encoder.encoder.layer.3.attention.attention.query.weight', 'vision_encoder.encoder.layer.3.attention.attention.query.bias', 'vision_encoder.encoder.layer.3.attention.attention.key.weight', 'vision_encoder.encoder.layer.3.attention.attention.value.weight', 'vision_encoder.encoder.layer.3.attention.attention.value.bias', 'vision_encoder.encoder.layer.3.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.3.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.3.attention.output.dense.weight', 'vision_encoder.encoder.layer.3.attention.output.dense.bias', 'vision_encoder.encoder.layer.3.intermediate.dense.weight', 'vision_encoder.encoder.layer.3.intermediate.dense.bias', 'vision_encoder.encoder.layer.3.output.dense.weight', 'vision_encoder.encoder.layer.3.output.dense.bias', 'vision_encoder.encoder.layer.3.layernorm_before.weight', 'vision_encoder.encoder.layer.3.layernorm_before.bias', 'vision_encoder.encoder.layer.3.layernorm_after.weight', 'vision_encoder.encoder.layer.3.layernorm_after.bias', 'vision_encoder.encoder.layer.4.lambda_1', 'vision_encoder.encoder.layer.4.lambda_2', 'vision_encoder.encoder.layer.4.attention.attention.query.weight', 'vision_encoder.encoder.layer.4.attention.attention.query.bias', 'vision_encoder.encoder.layer.4.attention.attention.key.weight', 'vision_encoder.encoder.layer.4.attention.attention.value.weight', 'vision_encoder.encoder.layer.4.attention.attention.value.bias', 'vision_encoder.encoder.layer.4.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.4.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.4.attention.output.dense.weight', 'vision_encoder.encoder.layer.4.attention.output.dense.bias', 'vision_encoder.encoder.layer.4.intermediate.dense.weight', 'vision_encoder.encoder.layer.4.intermediate.dense.bias', 'vision_encoder.encoder.layer.4.output.dense.weight', 'vision_encoder.encoder.layer.4.output.dense.bias', 'vision_encoder.encoder.layer.4.layernorm_before.weight', 'vision_encoder.encoder.layer.4.layernorm_before.bias', 'vision_encoder.encoder.layer.4.layernorm_after.weight', 'vision_encoder.encoder.layer.4.layernorm_after.bias', 'vision_encoder.encoder.layer.5.lambda_1', 'vision_encoder.encoder.layer.5.lambda_2', 'vision_encoder.encoder.layer.5.attention.attention.query.weight', 'vision_encoder.encoder.layer.5.attention.attention.query.bias', 'vision_encoder.encoder.layer.5.attention.attention.key.weight', 'vision_encoder.encoder.layer.5.attention.attention.value.weight', 'vision_encoder.encoder.layer.5.attention.attention.value.bias', 'vision_encoder.encoder.layer.5.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.5.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.5.attention.output.dense.weight', 'vision_encoder.encoder.layer.5.attention.output.dense.bias', 'vision_encoder.encoder.layer.5.intermediate.dense.weight', 'vision_encoder.encoder.layer.5.intermediate.dense.bias', 'vision_encoder.encoder.layer.5.output.dense.weight', 'vision_encoder.encoder.layer.5.output.dense.bias', 'vision_encoder.encoder.layer.5.layernorm_before.weight', 'vision_encoder.encoder.layer.5.layernorm_before.bias', 'vision_encoder.encoder.layer.5.layernorm_after.weight', 'vision_encoder.encoder.layer.5.layernorm_after.bias', 'vision_encoder.encoder.layer.6.lambda_1', 'vision_encoder.encoder.layer.6.lambda_2', 'vision_encoder.encoder.layer.6.attention.attention.query.weight', 'vision_encoder.encoder.layer.6.attention.attention.query.bias', 'vision_encoder.encoder.layer.6.attention.attention.key.weight', 'vision_encoder.encoder.layer.6.attention.attention.value.weight', 'vision_encoder.encoder.layer.6.attention.attention.value.bias', 'vision_encoder.encoder.layer.6.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.6.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.6.attention.output.dense.weight', 'vision_encoder.encoder.layer.6.attention.output.dense.bias', 'vision_encoder.encoder.layer.6.intermediate.dense.weight', 'vision_encoder.encoder.layer.6.intermediate.dense.bias', 'vision_encoder.encoder.layer.6.output.dense.weight', 'vision_encoder.encoder.layer.6.output.dense.bias', 'vision_encoder.encoder.layer.6.layernorm_before.weight', 'vision_encoder.encoder.layer.6.layernorm_before.bias', 'vision_encoder.encoder.layer.6.layernorm_after.weight', 'vision_encoder.encoder.layer.6.layernorm_after.bias', 'vision_encoder.encoder.layer.7.lambda_1', 'vision_encoder.encoder.layer.7.lambda_2', 'vision_encoder.encoder.layer.7.attention.attention.query.weight', 'vision_encoder.encoder.layer.7.attention.attention.query.bias', 'vision_encoder.encoder.layer.7.attention.attention.key.weight', 'vision_encoder.encoder.layer.7.attention.attention.value.weight', 'vision_encoder.encoder.layer.7.attention.attention.value.bias', 'vision_encoder.encoder.layer.7.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.7.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.7.attention.output.dense.weight', 'vision_encoder.encoder.layer.7.attention.output.dense.bias', 'vision_encoder.encoder.layer.7.intermediate.dense.weight', 'vision_encoder.encoder.layer.7.intermediate.dense.bias', 'vision_encoder.encoder.layer.7.output.dense.weight', 'vision_encoder.encoder.layer.7.output.dense.bias', 'vision_encoder.encoder.layer.7.layernorm_before.weight', 'vision_encoder.encoder.layer.7.layernorm_before.bias', 'vision_encoder.encoder.layer.7.layernorm_after.weight', 'vision_encoder.encoder.layer.7.layernorm_after.bias', 'vision_encoder.encoder.layer.8.lambda_1', 'vision_encoder.encoder.layer.8.lambda_2', 'vision_encoder.encoder.layer.8.attention.attention.query.weight', 'vision_encoder.encoder.layer.8.attention.attention.query.bias', 'vision_encoder.encoder.layer.8.attention.attention.key.weight', 'vision_encoder.encoder.layer.8.attention.attention.value.weight', 'vision_encoder.encoder.layer.8.attention.attention.value.bias', 'vision_encoder.encoder.layer.8.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.8.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.8.attention.output.dense.weight', 'vision_encoder.encoder.layer.8.attention.output.dense.bias', 'vision_encoder.encoder.layer.8.intermediate.dense.weight', 'vision_encoder.encoder.layer.8.intermediate.dense.bias', 'vision_encoder.encoder.layer.8.output.dense.weight', 'vision_encoder.encoder.layer.8.output.dense.bias', 'vision_encoder.encoder.layer.8.layernorm_before.weight', 'vision_encoder.encoder.layer.8.layernorm_before.bias', 'vision_encoder.encoder.layer.8.layernorm_after.weight', 'vision_encoder.encoder.layer.8.layernorm_after.bias', 'vision_encoder.encoder.layer.9.lambda_1', 'vision_encoder.encoder.layer.9.lambda_2', 'vision_encoder.encoder.layer.9.attention.attention.query.weight', 'vision_encoder.encoder.layer.9.attention.attention.query.bias', 'vision_encoder.encoder.layer.9.attention.attention.key.weight', 'vision_encoder.encoder.layer.9.attention.attention.value.weight', 'vision_encoder.encoder.layer.9.attention.attention.value.bias', 'vision_encoder.encoder.layer.9.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.9.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.9.attention.output.dense.weight', 'vision_encoder.encoder.layer.9.attention.output.dense.bias', 'vision_encoder.encoder.layer.9.intermediate.dense.weight', 'vision_encoder.encoder.layer.9.intermediate.dense.bias', 'vision_encoder.encoder.layer.9.output.dense.weight', 'vision_encoder.encoder.layer.9.output.dense.bias', 'vision_encoder.encoder.layer.9.layernorm_before.weight', 'vision_encoder.encoder.layer.9.layernorm_before.bias', 'vision_encoder.encoder.layer.9.layernorm_after.weight', 'vision_encoder.encoder.layer.9.layernorm_after.bias', 'vision_encoder.encoder.layer.10.lambda_1', 'vision_encoder.encoder.layer.10.lambda_2', 'vision_encoder.encoder.layer.10.attention.attention.query.weight', 'vision_encoder.encoder.layer.10.attention.attention.query.bias', 'vision_encoder.encoder.layer.10.attention.attention.key.weight', 'vision_encoder.encoder.layer.10.attention.attention.value.weight', 'vision_encoder.encoder.layer.10.attention.attention.value.bias', 'vision_encoder.encoder.layer.10.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.10.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.10.attention.output.dense.weight', 'vision_encoder.encoder.layer.10.attention.output.dense.bias', 'vision_encoder.encoder.layer.10.intermediate.dense.weight', 'vision_encoder.encoder.layer.10.intermediate.dense.bias', 'vision_encoder.encoder.layer.10.output.dense.weight', 'vision_encoder.encoder.layer.10.output.dense.bias', 'vision_encoder.encoder.layer.10.layernorm_before.weight', 'vision_encoder.encoder.layer.10.layernorm_before.bias', 'vision_encoder.encoder.layer.10.layernorm_after.weight', 'vision_encoder.encoder.layer.10.layernorm_after.bias', 'vision_encoder.encoder.layer.11.lambda_1', 'vision_encoder.encoder.layer.11.lambda_2', 'vision_encoder.encoder.layer.11.attention.attention.query.weight', 'vision_encoder.encoder.layer.11.attention.attention.query.bias', 'vision_encoder.encoder.layer.11.attention.attention.key.weight', 'vision_encoder.encoder.layer.11.attention.attention.value.weight', 'vision_encoder.encoder.layer.11.attention.attention.value.bias', 'vision_encoder.encoder.layer.11.attention.attention.relative_position_bias.relative_position_bias_table', 'vision_encoder.encoder.layer.11.attention.attention.relative_position_bias.relative_position_index', 'vision_encoder.encoder.layer.11.attention.output.dense.weight', 'vision_encoder.encoder.layer.11.attention.output.dense.bias', 'vision_encoder.encoder.layer.11.intermediate.dense.weight', 'vision_encoder.encoder.layer.11.intermediate.dense.bias', 'vision_encoder.encoder.layer.11.output.dense.weight', 'vision_encoder.encoder.layer.11.output.dense.bias', 'vision_encoder.encoder.layer.11.layernorm_before.weight', 'vision_encoder.encoder.layer.11.layernorm_before.bias', 'vision_encoder.encoder.layer.11.layernorm_after.weight', 'vision_encoder.encoder.layer.11.layernorm_after.bias', 'vision_encoder.pooler.layernorm.weight', 'vision_encoder.pooler.layernorm.bias', 'vision_layernorm.weight', 'vision_layernorm.bias', 'text_encoder.embeddings.position_ids', 'text_encoder.embeddings.word_embeddings.weight', 'text_encoder.embeddings.position_embeddings.weight', 'text_encoder.embeddings.token_type_embeddings.weight', 'text_encoder.embeddings.LayerNorm.weight', 'text_encoder.embeddings.LayerNorm.bias', 'text_encoder.encoder.layer.0.attention.self.query.weight', 'text_encoder.encoder.layer.0.attention.self.query.bias', 'text_encoder.encoder.layer.0.attention.self.key.weight', 'text_encoder.encoder.layer.0.attention.self.key.bias', 'text_encoder.encoder.layer.0.attention.self.value.weight', 'text_encoder.encoder.layer.0.attention.self.value.bias', 'text_encoder.encoder.layer.0.attention.output.dense.weight', 'text_encoder.encoder.layer.0.attention.output.dense.bias', 'text_encoder.encoder.layer.0.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.0.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.0.intermediate.dense.weight', 'text_encoder.encoder.layer.0.intermediate.dense.bias', 'text_encoder.encoder.layer.0.output.dense.weight', 'text_encoder.encoder.layer.0.output.dense.bias', 'text_encoder.encoder.layer.0.output.LayerNorm.weight', 'text_encoder.encoder.layer.0.output.LayerNorm.bias', 'text_encoder.encoder.layer.1.attention.self.query.weight', 'text_encoder.encoder.layer.1.attention.self.query.bias', 'text_encoder.encoder.layer.1.attention.self.key.weight', 'text_encoder.encoder.layer.1.attention.self.key.bias', 'text_encoder.encoder.layer.1.attention.self.value.weight', 'text_encoder.encoder.layer.1.attention.self.value.bias', 'text_encoder.encoder.layer.1.attention.output.dense.weight', 'text_encoder.encoder.layer.1.attention.output.dense.bias', 'text_encoder.encoder.layer.1.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.1.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.1.intermediate.dense.weight', 'text_encoder.encoder.layer.1.intermediate.dense.bias', 'text_encoder.encoder.layer.1.output.dense.weight', 'text_encoder.encoder.layer.1.output.dense.bias', 'text_encoder.encoder.layer.1.output.LayerNorm.weight', 'text_encoder.encoder.layer.1.output.LayerNorm.bias', 'text_encoder.encoder.layer.2.attention.self.query.weight', 'text_encoder.encoder.layer.2.attention.self.query.bias', 'text_encoder.encoder.layer.2.attention.self.key.weight', 'text_encoder.encoder.layer.2.attention.self.key.bias', 'text_encoder.encoder.layer.2.attention.self.value.weight', 'text_encoder.encoder.layer.2.attention.self.value.bias', 'text_encoder.encoder.layer.2.attention.output.dense.weight', 'text_encoder.encoder.layer.2.attention.output.dense.bias', 'text_encoder.encoder.layer.2.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.2.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.2.intermediate.dense.weight', 'text_encoder.encoder.layer.2.intermediate.dense.bias', 'text_encoder.encoder.layer.2.output.dense.weight', 'text_encoder.encoder.layer.2.output.dense.bias', 'text_encoder.encoder.layer.2.output.LayerNorm.weight', 'text_encoder.encoder.layer.2.output.LayerNorm.bias', 'text_encoder.encoder.layer.3.attention.self.query.weight', 'text_encoder.encoder.layer.3.attention.self.query.bias', 'text_encoder.encoder.layer.3.attention.self.key.weight', 'text_encoder.encoder.layer.3.attention.self.key.bias', 'text_encoder.encoder.layer.3.attention.self.value.weight', 'text_encoder.encoder.layer.3.attention.self.value.bias', 'text_encoder.encoder.layer.3.attention.output.dense.weight', 'text_encoder.encoder.layer.3.attention.output.dense.bias', 'text_encoder.encoder.layer.3.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.3.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.3.intermediate.dense.weight', 'text_encoder.encoder.layer.3.intermediate.dense.bias', 'text_encoder.encoder.layer.3.output.dense.weight', 'text_encoder.encoder.layer.3.output.dense.bias', 'text_encoder.encoder.layer.3.output.LayerNorm.weight', 'text_encoder.encoder.layer.3.output.LayerNorm.bias', 'text_encoder.encoder.layer.4.attention.self.query.weight', 'text_encoder.encoder.layer.4.attention.self.query.bias', 'text_encoder.encoder.layer.4.attention.self.key.weight', 'text_encoder.encoder.layer.4.attention.self.key.bias', 'text_encoder.encoder.layer.4.attention.self.value.weight', 'text_encoder.encoder.layer.4.attention.self.value.bias', 'text_encoder.encoder.layer.4.attention.output.dense.weight', 'text_encoder.encoder.layer.4.attention.output.dense.bias', 'text_encoder.encoder.layer.4.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.4.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.4.intermediate.dense.weight', 'text_encoder.encoder.layer.4.intermediate.dense.bias', 'text_encoder.encoder.layer.4.output.dense.weight', 'text_encoder.encoder.layer.4.output.dense.bias', 'text_encoder.encoder.layer.4.output.LayerNorm.weight', 'text_encoder.encoder.layer.4.output.LayerNorm.bias', 'text_encoder.encoder.layer.5.attention.self.query.weight', 'text_encoder.encoder.layer.5.attention.self.query.bias', 'text_encoder.encoder.layer.5.attention.self.key.weight', 'text_encoder.encoder.layer.5.attention.self.key.bias', 'text_encoder.encoder.layer.5.attention.self.value.weight', 'text_encoder.encoder.layer.5.attention.self.value.bias', 'text_encoder.encoder.layer.5.attention.output.dense.weight', 'text_encoder.encoder.layer.5.attention.output.dense.bias', 'text_encoder.encoder.layer.5.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.5.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.5.intermediate.dense.weight', 'text_encoder.encoder.layer.5.intermediate.dense.bias', 'text_encoder.encoder.layer.5.output.dense.weight', 'text_encoder.encoder.layer.5.output.dense.bias', 'text_encoder.encoder.layer.5.output.LayerNorm.weight', 'text_encoder.encoder.layer.5.output.LayerNorm.bias', 'text_encoder.encoder.layer.6.attention.self.query.weight', 'text_encoder.encoder.layer.6.attention.self.query.bias', 'text_encoder.encoder.layer.6.attention.self.key.weight', 'text_encoder.encoder.layer.6.attention.self.key.bias', 'text_encoder.encoder.layer.6.attention.self.value.weight', 'text_encoder.encoder.layer.6.attention.self.value.bias', 'text_encoder.encoder.layer.6.attention.output.dense.weight', 'text_encoder.encoder.layer.6.attention.output.dense.bias', 'text_encoder.encoder.layer.6.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.6.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.6.intermediate.dense.weight', 'text_encoder.encoder.layer.6.intermediate.dense.bias', 'text_encoder.encoder.layer.6.output.dense.weight', 'text_encoder.encoder.layer.6.output.dense.bias', 'text_encoder.encoder.layer.6.output.LayerNorm.weight', 'text_encoder.encoder.layer.6.output.LayerNorm.bias', 'text_encoder.encoder.layer.7.attention.self.query.weight', 'text_encoder.encoder.layer.7.attention.self.query.bias', 'text_encoder.encoder.layer.7.attention.self.key.weight', 'text_encoder.encoder.layer.7.attention.self.key.bias', 'text_encoder.encoder.layer.7.attention.self.value.weight', 'text_encoder.encoder.layer.7.attention.self.value.bias', 'text_encoder.encoder.layer.7.attention.output.dense.weight', 'text_encoder.encoder.layer.7.attention.output.dense.bias', 'text_encoder.encoder.layer.7.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.7.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.7.intermediate.dense.weight', 'text_encoder.encoder.layer.7.intermediate.dense.bias', 'text_encoder.encoder.layer.7.output.dense.weight', 'text_encoder.encoder.layer.7.output.dense.bias', 'text_encoder.encoder.layer.7.output.LayerNorm.weight', 'text_encoder.encoder.layer.7.output.LayerNorm.bias', 'text_encoder.encoder.layer.8.attention.self.query.weight', 'text_encoder.encoder.layer.8.attention.self.query.bias', 'text_encoder.encoder.layer.8.attention.self.key.weight', 'text_encoder.encoder.layer.8.attention.self.key.bias', 'text_encoder.encoder.layer.8.attention.self.value.weight', 'text_encoder.encoder.layer.8.attention.self.value.bias', 'text_encoder.encoder.layer.8.attention.output.dense.weight', 'text_encoder.encoder.layer.8.attention.output.dense.bias', 'text_encoder.encoder.layer.8.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.8.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.8.intermediate.dense.weight', 'text_encoder.encoder.layer.8.intermediate.dense.bias', 'text_encoder.encoder.layer.8.output.dense.weight', 'text_encoder.encoder.layer.8.output.dense.bias', 'text_encoder.encoder.layer.8.output.LayerNorm.weight', 'text_encoder.encoder.layer.8.output.LayerNorm.bias', 'text_encoder.encoder.layer.9.attention.self.query.weight', 'text_encoder.encoder.layer.9.attention.self.query.bias', 'text_encoder.encoder.layer.9.attention.self.key.weight', 'text_encoder.encoder.layer.9.attention.self.key.bias', 'text_encoder.encoder.layer.9.attention.self.value.weight', 'text_encoder.encoder.layer.9.attention.self.value.bias', 'text_encoder.encoder.layer.9.attention.output.dense.weight', 'text_encoder.encoder.layer.9.attention.output.dense.bias', 'text_encoder.encoder.layer.9.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.9.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.9.crossattention.self.query.weight', 'text_encoder.encoder.layer.9.crossattention.self.query.bias', 'text_encoder.encoder.layer.9.crossattention.self.key.weight', 'text_encoder.encoder.layer.9.crossattention.self.key.bias', 'text_encoder.encoder.layer.9.crossattention.self.value.weight', 'text_encoder.encoder.layer.9.crossattention.self.value.bias', 'text_encoder.encoder.layer.9.crossattention.output.dense.weight', 'text_encoder.encoder.layer.9.crossattention.output.dense.bias', 'text_encoder.encoder.layer.9.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.9.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.9.intermediate.dense.weight', 'text_encoder.encoder.layer.9.intermediate.dense.bias', 'text_encoder.encoder.layer.9.output.dense.weight', 'text_encoder.encoder.layer.9.output.dense.bias', 'text_encoder.encoder.layer.9.output.LayerNorm.weight', 'text_encoder.encoder.layer.9.output.LayerNorm.bias', 'text_encoder.encoder.layer.10.attention.self.query.weight', 'text_encoder.encoder.layer.10.attention.self.query.bias', 'text_encoder.encoder.layer.10.attention.self.key.weight', 'text_encoder.encoder.layer.10.attention.self.key.bias', 'text_encoder.encoder.layer.10.attention.self.value.weight', 'text_encoder.encoder.layer.10.attention.self.value.bias', 'text_encoder.encoder.layer.10.attention.output.dense.weight', 'text_encoder.encoder.layer.10.attention.output.dense.bias', 'text_encoder.encoder.layer.10.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.10.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.10.crossattention.self.query.weight', 'text_encoder.encoder.layer.10.crossattention.self.query.bias', 'text_encoder.encoder.layer.10.crossattention.self.key.weight', 'text_encoder.encoder.layer.10.crossattention.self.key.bias', 'text_encoder.encoder.layer.10.crossattention.self.value.weight', 'text_encoder.encoder.layer.10.crossattention.self.value.bias', 'text_encoder.encoder.layer.10.crossattention.output.dense.weight', 'text_encoder.encoder.layer.10.crossattention.output.dense.bias', 'text_encoder.encoder.layer.10.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.10.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.10.intermediate.dense.weight', 'text_encoder.encoder.layer.10.intermediate.dense.bias', 'text_encoder.encoder.layer.10.output.dense.weight', 'text_encoder.encoder.layer.10.output.dense.bias', 'text_encoder.encoder.layer.10.output.LayerNorm.weight', 'text_encoder.encoder.layer.10.output.LayerNorm.bias', 'text_encoder.encoder.layer.11.attention.self.query.weight', 'text_encoder.encoder.layer.11.attention.self.query.bias', 'text_encoder.encoder.layer.11.attention.self.key.weight', 'text_encoder.encoder.layer.11.attention.self.key.bias', 'text_encoder.encoder.layer.11.attention.self.value.weight', 'text_encoder.encoder.layer.11.attention.self.value.bias', 'text_encoder.encoder.layer.11.attention.output.dense.weight', 'text_encoder.encoder.layer.11.attention.output.dense.bias', 'text_encoder.encoder.layer.11.attention.output.LayerNorm.weight', 'text_encoder.encoder.layer.11.attention.output.LayerNorm.bias', 'text_encoder.encoder.layer.11.crossattention.self.query.weight', 'text_encoder.encoder.layer.11.crossattention.self.query.bias', 'text_encoder.encoder.layer.11.crossattention.self.key.weight', 'text_encoder.encoder.layer.11.crossattention.self.key.bias', 'text_encoder.encoder.layer.11.crossattention.self.value.weight', 'text_encoder.encoder.layer.11.crossattention.self.value.bias', 'text_encoder.encoder.layer.11.crossattention.output.dense.weight', 'text_encoder.encoder.layer.11.crossattention.output.dense.bias', 'text_encoder.encoder.layer.11.crossattention.output.LayerNorm.weight', 'text_encoder.encoder.layer.11.crossattention.output.LayerNorm.bias', 'text_encoder.encoder.layer.11.intermediate.dense.weight', 'text_encoder.encoder.layer.11.intermediate.dense.bias', 'text_encoder.encoder.layer.11.output.dense.weight', 'text_encoder.encoder.layer.11.output.dense.bias', 'text_encoder.encoder.layer.11.output.LayerNorm.weight', 'text_encoder.encoder.layer.11.output.LayerNorm.bias', 'vision_proj.weight', 'vision_proj.bias', 'text_proj.weight', 'text_proj.bias', 'itm_head.weight', 'itm_head.bias', 'temporal_vision_encoder.layer.0.attention.self.query.weight', 'temporal_vision_encoder.layer.0.attention.self.query.bias', 'temporal_vision_encoder.layer.0.attention.self.key.weight', 'temporal_vision_encoder.layer.0.attention.self.key.bias', 'temporal_vision_encoder.layer.0.attention.self.value.weight', 'temporal_vision_encoder.layer.0.attention.self.value.bias', 'temporal_vision_encoder.layer.0.attention.output.dense.weight', 'temporal_vision_encoder.layer.0.attention.output.dense.bias', 'temporal_vision_encoder.layer.0.attention.output.LayerNorm.weight', 'temporal_vision_encoder.layer.0.attention.output.LayerNorm.bias', 'temporal_vision_encoder.layer.0.intermediate.dense.weight', 'temporal_vision_encoder.layer.0.intermediate.dense.bias', 'temporal_vision_encoder.layer.0.output.dense.weight', 'temporal_vision_encoder.layer.0.output.dense.bias', 'temporal_vision_encoder.layer.0.output.LayerNorm.weight', 'temporal_vision_encoder.layer.0.output.LayerNorm.bias', 'temporal_vision_encoder.layer.1.attention.self.query.weight', 'temporal_vision_encoder.layer.1.attention.self.query.bias', 'temporal_vision_encoder.layer.1.attention.self.key.weight', 'temporal_vision_encoder.layer.1.attention.self.key.bias', 'temporal_vision_encoder.layer.1.attention.self.value.weight', 'temporal_vision_encoder.layer.1.attention.self.value.bias', 'temporal_vision_encoder.layer.1.attention.output.dense.weight', 'temporal_vision_encoder.layer.1.attention.output.dense.bias', 'temporal_vision_encoder.layer.1.attention.output.LayerNorm.weight', 'temporal_vision_encoder.layer.1.attention.output.LayerNorm.bias', 'temporal_vision_encoder.layer.1.intermediate.dense.weight', 'temporal_vision_encoder.layer.1.intermediate.dense.bias', 'temporal_vision_encoder.layer.1.output.dense.weight', 'temporal_vision_encoder.layer.1.output.dense.bias', 'temporal_vision_encoder.layer.1.output.LayerNorm.weight', 'temporal_vision_encoder.layer.1.output.LayerNorm.bias'])
[32m2023-10-19T12:39:23 | tasks.shared_utils: [0m_IncompatibleKeys(missing_keys=['temp', 'vision_encoder.pooler.layernorm.weight', 'vision_encoder.pooler.layernorm.bias', 'vision_proj.weight', 'vision_proj.bias', 'text_proj.weight', 'text_proj.bias', 'itm_head.weight', 'itm_head.bias'], unexpected_keys=['text_decoder.bert.embeddings.position_ids', 'text_decoder.bert.embeddings.word_embeddings.weight', 'text_decoder.bert.embeddings.position_embeddings.weight', 'text_decoder.bert.embeddings.token_type_embeddings.weight', 'text_decoder.bert.embeddings.LayerNorm.weight', 'text_decoder.bert.embeddings.LayerNorm.bias', 'text_decoder.bert.encoder.layer.0.attention.self.query.weight', 'text_decoder.bert.encoder.layer.0.attention.self.query.bias', 'text_decoder.bert.encoder.layer.0.attention.self.key.weight', 'text_decoder.bert.encoder.layer.0.attention.self.key.bias', 'text_decoder.bert.encoder.layer.0.attention.self.value.weight', 'text_decoder.bert.encoder.layer.0.attention.self.value.bias', 'text_decoder.bert.encoder.layer.0.attention.output.dense.weight', 'text_decoder.bert.encoder.layer.0.attention.output.dense.bias', 'text_decoder.bert.encoder.layer.0.attention.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.0.attention.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.0.crossattention.self.query.weight', 'text_decoder.bert.encoder.layer.0.crossattention.self.query.bias', 'text_decoder.bert.encoder.layer.0.crossattention.self.key.weight', 'text_decoder.bert.encoder.layer.0.crossattention.self.key.bias', 'text_decoder.bert.encoder.layer.0.crossattention.self.value.weight', 'text_decoder.bert.encoder.layer.0.crossattention.self.value.bias', 'text_decoder.bert.encoder.layer.0.crossattention.output.dense.weight', 'text_decoder.bert.encoder.layer.0.crossattention.output.dense.bias', 'text_decoder.bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.0.intermediate.dense.weight', 'text_decoder.bert.encoder.layer.0.intermediate.dense.bias', 'text_decoder.bert.encoder.layer.0.output.dense.weight', 'text_decoder.bert.encoder.layer.0.output.dense.bias', 'text_decoder.bert.encoder.layer.0.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.0.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.1.attention.self.query.weight', 'text_decoder.bert.encoder.layer.1.attention.self.query.bias', 'text_decoder.bert.encoder.layer.1.attention.self.key.weight', 'text_decoder.bert.encoder.layer.1.attention.self.key.bias', 'text_decoder.bert.encoder.layer.1.attention.self.value.weight', 'text_decoder.bert.encoder.layer.1.attention.self.value.bias', 'text_decoder.bert.encoder.layer.1.attention.output.dense.weight', 'text_decoder.bert.encoder.layer.1.attention.output.dense.bias', 'text_decoder.bert.encoder.layer.1.attention.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.1.attention.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.1.crossattention.self.query.weight', 'text_decoder.bert.encoder.layer.1.crossattention.self.query.bias', 'text_decoder.bert.encoder.layer.1.crossattention.self.key.weight', 'text_decoder.bert.encoder.layer.1.crossattention.self.key.bias', 'text_decoder.bert.encoder.layer.1.crossattention.self.value.weight', 'text_decoder.bert.encoder.layer.1.crossattention.self.value.bias', 'text_decoder.bert.encoder.layer.1.crossattention.output.dense.weight', 'text_decoder.bert.encoder.layer.1.crossattention.output.dense.bias', 'text_decoder.bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.1.intermediate.dense.weight', 'text_decoder.bert.encoder.layer.1.intermediate.dense.bias', 'text_decoder.bert.encoder.layer.1.output.dense.weight', 'text_decoder.bert.encoder.layer.1.output.dense.bias', 'text_decoder.bert.encoder.layer.1.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.1.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.2.attention.self.query.weight', 'text_decoder.bert.encoder.layer.2.attention.self.query.bias', 'text_decoder.bert.encoder.layer.2.attention.self.key.weight', 'text_decoder.bert.encoder.layer.2.attention.self.key.bias', 'text_decoder.bert.encoder.layer.2.attention.self.value.weight', 'text_decoder.bert.encoder.layer.2.attention.self.value.bias', 'text_decoder.bert.encoder.layer.2.attention.output.dense.weight', 'text_decoder.bert.encoder.layer.2.attention.output.dense.bias', 'text_decoder.bert.encoder.layer.2.attention.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.2.attention.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.2.crossattention.self.query.weight', 'text_decoder.bert.encoder.layer.2.crossattention.self.query.bias', 'text_decoder.bert.encoder.layer.2.crossattention.self.key.weight', 'text_decoder.bert.encoder.layer.2.crossattention.self.key.bias', 'text_decoder.bert.encoder.layer.2.crossattention.self.value.weight', 'text_decoder.bert.encoder.layer.2.crossattention.self.value.bias', 'text_decoder.bert.encoder.layer.2.crossattention.output.dense.weight', 'text_decoder.bert.encoder.layer.2.crossattention.output.dense.bias', 'text_decoder.bert.encoder.layer.2.crossattention.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.2.crossattention.output.LayerNorm.bias', 'text_decoder.bert.encoder.layer.2.intermediate.dense.weight', 'text_decoder.bert.encoder.layer.2.intermediate.dense.bias', 'text_decoder.bert.encoder.layer.2.output.dense.weight', 'text_decoder.bert.encoder.layer.2.output.dense.bias', 'text_decoder.bert.encoder.layer.2.output.LayerNorm.weight', 'text_decoder.bert.encoder.layer.2.output.LayerNorm.bias', 'text_decoder.cls.predictions.bias', 'text_decoder.cls.predictions.transform.dense.weight', 'text_decoder.cls.predictions.transform.dense.bias', 'text_decoder.cls.predictions.transform.LayerNorm.weight', 'text_decoder.cls.predictions.transform.LayerNorm.bias', 'text_decoder.cls.predictions.decoder.weight', 'text_decoder.cls.predictions.decoder.bias'])
[32m2023-10-19T12:39:23 | tasks.shared_utils: [0mLoaded checkpoint from /home/wiss/zhang/nfs/anetqa_train_qa_full/ckpt_best.pth
[32m2023-10-19T12:39:23 | __main__: [0mStart evaluation
[32m2023-10-19T12:39:23 | tasks.retrieval_utils: [0mStart evaluation for media_type=video
[32m2023-10-19T12:39:23 | tasks.retrieval_utils: [0mComputing dual encoder features...
[5m[31mWARNING[0m [32m2023-10-19T12:40:13 | py.warnings: [0m/home/wiss/zhang/Jinhe/singularity/utils/distributed.py:18: UserWarning: Default upsampling behavior when mode=linear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details.
builtin_warn(*args, **kwargs)
[5m[31mWARNING[0m [32m2023-10-19T12:40:13 | py.warnings: [0m/home/wiss/zhang/Jinhe/singularity/utils/distributed.py:18: UserWarning: Default upsampling behavior when mode=linear is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details.
builtin_warn(*args, **kwargs)
[32m2023-10-19T12:40:15 | utils.basic_utils: [0mextracting image feats [0/6] eta: 0:04:48 time: 48.1265 data: 46.1289 max mem: 5164 res mem: 6070
[32m2023-10-19T12:40:19 | utils.basic_utils: [0mextracting image feats [5/6] eta: 0:00:08 time: 8.6470 data: 7.6979 max mem: 5424 res mem: 6070
[32m2023-10-19T12:40:19 | utils.basic_utils: [0mextracting image feats Total time: 0:00:51 (8.6521 s / it)
[32m2023-10-19T12:40:19 | tasks.retrieval_utils: [0mFinished feature extraction
[32m2023-10-19T12:40:19 | tasks.retrieval_utils: [0mComputing ITC scores [dot-product]