Skip to content

Commit

Permalink
add meta data link and script to convert lmdb
Browse files Browse the repository at this point in the history
  • Loading branch information
jiasenlu committed Aug 24, 2019
1 parent 1a5c406 commit 66f5226
Show file tree
Hide file tree
Showing 6 changed files with 193 additions and 7 deletions.
8 changes: 1 addition & 7 deletions data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,4 @@ You can directly download the feature and test the pre-trained model.
### Meta Data
We also provide the metadata for ViLBERT; you can download it and run the code more easily.

|Task| Link |
| :-------------: | :---------------------------------------------: |
| VQA | |
| VCR | |
| RefCOCO+ | |
| Flickr30k | |

[Google drive](https://drive.google.com/drive/folders/1o7sCLl1_PKCoaGvigCr_uGuBg6koOJm8?usp=sharing)
38 changes: 38 additions & 0 deletions script/convert_lmdb_flickr30k.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""Convert Flickr30k test bottom-up-attention features from TSV into LMDB.

Each TSV row (fields listed in FIELDNAMES) is pickled and stored under its
``image_id`` (bytes); the full list of keys is additionally stored under the
LMDB key ``b'keys'`` so readers can enumerate entries without a cursor.
"""
import csv
import os
import pickle
import sys

import lmdb  # install lmdb by "pip install lmdb"

# TSV rows contain base64-encoded feature blobs far larger than csv's default limit.
csv.field_size_limit(sys.maxsize)

FIELDNAMES = ['image_id', 'image_w', 'image_h', 'num_boxes', 'boxes',
              'features', 'cls_prob']

num_file = 1
name = '/srv/share2/jlu347/bottom-up-attention/feature/flickr30k/flickr30k_test_resnet101_faster_rcnn_genome.tsv.%d'
infiles = [name % i for i in range(num_file)]
# infiles.append('/srv/share2/jlu347/bottom-up-attention/feature/coco/coco_val_resnet101_faster_rcnn_genome.tsv.0')

save_path = os.path.join('flickr30k_test_resnet101_faster_rcnn_genome1.lmdb')
# 1 TiB map size: LMDB requires an upper bound up front; the file only grows
# as data is actually written.
env = lmdb.open(save_path, map_size=1099511627776)

count = 0
id_list = []
with env.begin(write=True) as txn:
    for infile in infiles:
        with open(infile) as tsv_in_file:
            reader = csv.DictReader(tsv_in_file, delimiter='\t',
                                    fieldnames=FIELDNAMES)
            for item in reader:
                img_id = str(item['image_id']).encode()
                id_list.append(img_id)
                txn.put(img_id, pickle.dumps(item))
                if count % 1000 == 0:
                    print(count)  # progress heartbeat
                count += 1
    # Store the key index alongside the records.
    txn.put('keys'.encode(), pickle.dumps(id_list))
env.close()  # flush and release the environment explicitly

print(count)
39 changes: 39 additions & 0 deletions script/convert_lmdb_refcoco+.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Convert RefCOCO+ (UNC split) bottom-up-attention features from TSV into LMDB.

Each TSV row (fields listed in FIELDNAMES) is pickled and stored under its
``image_id`` (bytes); the full list of keys is additionally stored under the
LMDB key ``b'keys'`` so readers can enumerate entries without a cursor.
"""
import csv
import os
import pickle
import sys

import lmdb  # install lmdb by "pip install lmdb"

# TSV rows contain base64-encoded feature blobs far larger than csv's default limit.
csv.field_size_limit(sys.maxsize)

FIELDNAMES = ['image_id', 'image_w', 'image_h', 'num_boxes', 'boxes',
              'features', 'cls_prob']

num_file = 1
name = '/srv/share2/jlu347/bottom-up-attention/feature/refcoco_unc/refcoco+_unc_resnet101_faster_rcnn_genome.tsv.%d'
infiles = [name % i for i in range(num_file)]

save_path = os.path.join('refcoco+.lmdb')
# 1 TiB map size: LMDB requires an upper bound up front; the file only grows
# as data is actually written.
env = lmdb.open(save_path, map_size=1099511627776)

count = 0
id_list = []
with env.begin(write=True) as txn:
    for infile in infiles:
        with open(infile) as tsv_in_file:
            reader = csv.DictReader(tsv_in_file, delimiter='\t',
                                    fieldnames=FIELDNAMES)
            for item in reader:
                img_id = str(item['image_id']).encode()
                id_list.append(img_id)
                txn.put(img_id, pickle.dumps(item))
                if count % 1000 == 0:
                    print(count)  # progress heartbeat
                count += 1
    # Store the key index alongside the records.
    txn.put('keys'.encode(), pickle.dumps(id_list))
env.close()  # flush and release the environment explicitly

print(count)
39 changes: 39 additions & 0 deletions script/convert_lmdb_refcoco+_gt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
"""Convert RefCOCO+ (UNC split) ground-truth-box features from TSV into LMDB.

Each TSV row (fields listed in FIELDNAMES) is pickled and stored under its
``image_id`` (bytes); the full list of keys is additionally stored under the
LMDB key ``b'keys'`` so readers can enumerate entries without a cursor.
"""
import csv
import os
import pickle
import sys

import lmdb  # install lmdb by "pip install lmdb"

# TSV rows contain base64-encoded feature blobs far larger than csv's default limit.
csv.field_size_limit(sys.maxsize)

FIELDNAMES = ['image_id', 'image_w', 'image_h', 'num_boxes', 'boxes',
              'features', 'cls_prob']

num_file = 1
name = '/srv/share2/jlu347/bottom-up-attention/feature/refcoco_unc/refcoco+_unc_gt_resnet101_faster_rcnn_genome.tsv.%d'
infiles = [name % i for i in range(num_file)]

save_path = os.path.join('refcoco+_gt.lmdb')
# 1 TiB map size: LMDB requires an upper bound up front; the file only grows
# as data is actually written.
env = lmdb.open(save_path, map_size=1099511627776)

count = 0
id_list = []
with env.begin(write=True) as txn:
    for infile in infiles:
        with open(infile) as tsv_in_file:
            reader = csv.DictReader(tsv_in_file, delimiter='\t',
                                    fieldnames=FIELDNAMES)
            for item in reader:
                img_id = str(item['image_id']).encode()
                id_list.append(img_id)
                txn.put(img_id, pickle.dumps(item))
                if count % 1000 == 0:
                    print(count)  # progress heartbeat
                count += 1
    # Store the key index alongside the records.
    txn.put('keys'.encode(), pickle.dumps(id_list))
env.close()  # flush and release the environment explicitly

print(count)
35 changes: 35 additions & 0 deletions script/convert_test_lmdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Convert COCO test2015 bottom-up-attention features from TSV into LMDB.

Each TSV row (fields listed in FIELDNAMES — note: no ``cls_prob`` column in
this feature dump) is pickled and stored under its ``image_id`` (bytes); the
full list of keys is additionally stored under the LMDB key ``b'keys'``.
"""
import csv
import os
import pickle
import sys

import lmdb  # install lmdb by "pip install lmdb"

# TSV rows contain base64-encoded feature blobs far larger than csv's default limit.
csv.field_size_limit(sys.maxsize)

FIELDNAMES = ['image_id', 'image_w', 'image_h', 'num_boxes', 'boxes',
              'features']

name = '/srv/share2/jlu347/bottom-up-attention/feature/coco/test2015/test2015_resnet101_faster_rcnn_genome.tsv'
infiles = [name]

save_path = os.path.join('coco_test_resnet101_faster_rcnn_genome.lmdb')
# 1 TiB map size: LMDB requires an upper bound up front; the file only grows
# as data is actually written.
env = lmdb.open(save_path, map_size=1099511627776)

count = 0
id_list = []
with env.begin(write=True) as txn:
    for infile in infiles:
        with open(infile) as tsv_in_file:
            reader = csv.DictReader(tsv_in_file, delimiter='\t',
                                    fieldnames=FIELDNAMES)
            for item in reader:
                img_id = str(item['image_id']).encode()
                id_list.append(img_id)
                txn.put(img_id, pickle.dumps(item))
                if count % 1000 == 0:
                    print(count)  # progress heartbeat
                count += 1
    # Store the key index alongside the records.
    txn.put('keys'.encode(), pickle.dumps(id_list))
env.close()  # flush and release the environment explicitly

# Final record count (the sibling conversion scripts all print this summary).
print(count)
41 changes: 41 additions & 0 deletions script/convert_trainval_lmdb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Convert COCO trainval (Karpathy splits) features from TSV into one LMDB.

Concatenates the Karpathy train/val/test TSV shards into a single database.
Each TSV row (fields listed in FIELDNAMES — note: no ``cls_prob`` column in
this feature dump) is pickled and stored under its ``image_id`` (bytes); the
full list of keys is additionally stored under the LMDB key ``b'keys'``.
"""
import csv
import os
import pickle
import sys

import lmdb  # install lmdb by "pip install lmdb"

# TSV rows contain base64-encoded feature blobs far larger than csv's default limit.
csv.field_size_limit(sys.maxsize)

FIELDNAMES = ['image_id', 'image_w', 'image_h', 'num_boxes', 'boxes',
              'features']

path = '/srv/share2/jlu347/bottom-up-attention/feature/coco/trainval/'
infiles = [
    path + 'karpathy_train_resnet101_faster_rcnn_genome.tsv.0',
    path + 'karpathy_train_resnet101_faster_rcnn_genome.tsv.1',
    path + 'karpathy_val_resnet101_faster_rcnn_genome.tsv',
    path + 'karpathy_test_resnet101_faster_rcnn_genome.tsv',
]

save_path = os.path.join('coco_trainval_resnet101_faster_rcnn_genome.lmdb')
# 1 TiB map size: LMDB requires an upper bound up front; the file only grows
# as data is actually written.
env = lmdb.open(save_path, map_size=1099511627776)

count = 0
id_list = []
with env.begin(write=True) as txn:
    for infile in infiles:
        with open(infile) as tsv_in_file:
            reader = csv.DictReader(tsv_in_file, delimiter='\t',
                                    fieldnames=FIELDNAMES)
            for item in reader:
                img_id = str(item['image_id']).encode()
                id_list.append(img_id)
                txn.put(img_id, pickle.dumps(item))
                if count % 1000 == 0:
                    print(count)  # progress heartbeat
                count += 1
    # Store the key index alongside the records.
    txn.put('keys'.encode(), pickle.dumps(id_list))
env.close()  # flush and release the environment explicitly

print(count)

0 comments on commit 66f5226

Please sign in to comment.