-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtransforms.py
289 lines (227 loc) · 11.1 KB
/
transforms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
import torch
import numpy as np
import random
import torchvision.transforms.functional as FT
import cv2
from utils import find_jaccard_overlap, find_intersection
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Some augmentation functions below have been adapted from
# https://github.com/amdegroot/ssd.pytorch/blob/master/utils/augmentations.py
def expand(image, boxes, filler, max_scale=4):
    """
    Perform a zooming out operation by placing the image in a larger canvas of filler material.

    Helps to learn to detect smaller objects.

    :param image: image, a tensor of dimensions (3, original_h, original_w)
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :param filler: RGB values of the filler material, a list like [R, G, B]
    :param max_scale: upper bound (>= 1) of the random zoom-out factor; the canvas side lengths are
        drawn between 1x and max_scale x the original ones (defaults to 4)
    :return: expanded image, updated bounding box coordinates
    :raises ValueError: if max_scale is smaller than 1 (the canvas could not contain the image)
    """
    if max_scale < 1:
        raise ValueError("max_scale must be >= 1, got {}".format(max_scale))

    # Calculate dimensions of proposed expanded (zoomed-out) image
    original_h = image.size(1)
    original_w = image.size(2)
    scale = random.uniform(1, max_scale)
    new_h = int(scale * original_h)
    new_w = int(scale * original_w)

    # Create such an image with the filler
    filler = torch.as_tensor(filler, dtype=torch.float)  # (3)
    # Note - multiply instead of using Tensor.expand(): expanded values would all share the same
    # memory, so writing the original image into the canvas below would corrupt the filler
    new_image = torch.ones((3, new_h, new_w), dtype=torch.float) * filler.unsqueeze(1).unsqueeze(1)

    # Place the original image at random coordinates in this new image (origin at top-left of image)
    left = random.randint(0, new_w - original_w)
    right = left + original_w
    top = random.randint(0, new_h - original_h)
    bottom = top + original_h
    new_image[:, top:bottom, left:right] = image

    # Shift the boxes by the same offset; this builds a new tensor, the input boxes are untouched
    offset = torch.FloatTensor([left, top, left, top]).unsqueeze(0)  # (1, 4)
    new_boxes = boxes + offset  # (n_objects, 4)

    return new_image, new_boxes
def random_crop(image, boxes, labels):
    """
    Performs a random crop in the manner stated in the paper. Helps to learn to detect larger and partial objects.

    Note that some objects may be cut out entirely.

    Adapted from https://github.com/amdegroot/ssd.pytorch/blob/master/utils/augmentations.py

    :param image: image, a tensor of dimensions (3, original_h, original_w)
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :param labels: labels of objects, a tensor of dimensions (n_objects)
    :return: cropped image, updated bounding box coordinates, updated labels
    """
    original_h = image.size(1)
    original_w = image.size(2)

    # With no boxes there is no overlap criterion to satisfy, and max() over an empty
    # overlap tensor would raise - return the inputs unchanged
    if boxes.size(0) == 0:
        return image, boxes, labels

    # Try up to 50 times for each choice of minimum overlap
    # This isn't mentioned in the paper, of course, but 50 is chosen in paper authors' original Caffe repo
    max_trials = 50
    # Crop dimensions must be in [0.3, 1] of original dimensions
    # Note - it's [0.1, 1] in the paper, but actually [0.3, 1] in the authors' repo
    min_scale = 0.3

    # Keep choosing a minimum overlap until a successful crop is made
    while True:
        # Randomly draw the value for minimum overlap
        min_overlap = random.choice([0., .1, .3, .5, .7, .9, None])  # 'None' refers to no cropping

        # If not cropping
        if min_overlap is None:
            return image, boxes, labels

        for _ in range(max_trials):
            scale_h = random.uniform(min_scale, 1)
            scale_w = random.uniform(min_scale, 1)
            new_h = int(scale_h * original_h)
            new_w = int(scale_w * original_w)

            # Very small images can truncate to an empty crop; skip it (also avoids dividing by zero)
            if new_h == 0 or new_w == 0:
                continue

            # Aspect ratio has to be in [0.5, 2]
            aspect_ratio = new_h / new_w
            if not 0.5 < aspect_ratio < 2:
                continue

            # Crop coordinates (origin at top-left of image)
            left = random.randint(0, original_w - new_w)
            right = left + new_w
            top = random.randint(0, original_h - new_h)
            bottom = top + new_h
            crop = torch.FloatTensor([left, top, right, bottom])  # (4)

            # Calculate Jaccard overlap between the crop and the bounding boxes
            overlap = find_jaccard_overlap(crop.unsqueeze(0), boxes)  # (1, n_objects)
            overlap = overlap.squeeze(0)  # (n_objects)

            # If not a single bounding box has a Jaccard overlap of greater than the minimum, try again
            if overlap.max().item() < min_overlap:
                continue

            # Find centers of original bounding boxes
            bb_centers = (boxes[:, :2] + boxes[:, 2:]) / 2.  # (n_objects, 2)

            # Find bounding boxes whose centers are strictly inside the crop
            centers_in_crop = ((bb_centers[:, 0] > left) & (bb_centers[:, 0] < right)
                               & (bb_centers[:, 1] > top) & (bb_centers[:, 1] < bottom))  # (n_objects) mask

            # If not a single bounding box has its center in the crop, try again
            if not centers_in_crop.any():
                continue

            # Crop image
            new_image = image[:, top:bottom, left:right]  # (3, new_h, new_w)

            # Discard bounding boxes that don't meet this criterion
            # (mask indexing copies, so the in-place edits below do not touch the input boxes)
            new_boxes = boxes[centers_in_crop, :]
            new_labels = labels[centers_in_crop]

            # Clip the kept boxes to the crop and express them relative to its top-left corner
            new_boxes[:, :2] = torch.max(new_boxes[:, :2], crop[:2])  # crop[:2] is [left, top]
            new_boxes[:, :2] -= crop[:2]
            new_boxes[:, 2:] = torch.min(new_boxes[:, 2:], crop[2:])  # crop[2:] is [right, bottom]
            new_boxes[:, 2:] -= crop[:2]

            return new_image, new_boxes, new_labels
def flip(image, boxes):
    """
    Flip image horizontally.

    The input boxes tensor is left untouched; a new tensor is returned.

    :param image: image, a PIL Image
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :return: flipped image, updated bounding box coordinates
    """
    # Flip image
    new_image = FT.hflip(image)

    # Flip boxes on a copy so the caller's tensor is not modified in place
    new_boxes = boxes.clone()
    new_boxes[:, 0] = image.width - boxes[:, 0] - 1
    new_boxes[:, 2] = image.width - boxes[:, 2] - 1
    # Mirroring swaps which x is smaller; reorder columns so x_min comes before x_max again
    new_boxes = new_boxes[:, [2, 1, 0, 3]]

    return new_image, new_boxes
def photometric_distort(image):
    """
    Distort brightness, contrast, saturation, and hue, each with a 50% chance, in random order.

    :param image: image, a PIL Image
    :return: distorted image
    """
    new_image = image

    distortions = [FT.adjust_brightness,
                   FT.adjust_contrast,
                   FT.adjust_saturation,
                   FT.adjust_hue]
    random.shuffle(distortions)

    for d in distortions:
        if random.random() < 0.5:
            # Compare the function object itself; 'is' against a string literal only worked
            # by accident of string interning and triggers a SyntaxWarning
            if d is FT.adjust_hue:
                # Caffe repo uses a 'hue_delta' of 18 - we divide by 255 because PyTorch needs a normalized value
                adjust_factor = random.uniform(-18 / 255., 18 / 255.)
            else:
                # Caffe repo uses 'lower' and 'upper' values of 0.5 and 1.5 for brightness, contrast, and saturation
                adjust_factor = random.uniform(0.5, 1.5)

            # Apply this distortion
            new_image = d(new_image, adjust_factor)

    return new_image
def transform(image, boxes, labels, resize, is_train):
    """
    Apply the transformations above.

    The caller's boxes tensor is never modified; all box updates happen on a copy.

    :param image: image, a PIL Image
    :param boxes: bounding boxes in boundary coordinates, a tensor of dimensions (n_objects, 4)
    :param labels: labels of objects, a tensor of dimensions (n_objects)
    :param resize: side length of the square letterboxed output image
    :param is_train: truthy to apply the training augmentations (photometric distortion, expand,
        random crop, flip); falsy for evaluation/testing, where only resizing and normalization run
    :return: transformed image, transformed bounding box coordinates, transformed labels
    """
    # Mean and standard deviation of ImageNet data that our base VGG from torchvision was trained on
    # see: https://pytorch.org/docs/stable/torchvision/models.html
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    new_image = image
    # Work on a copy: several steps below edit boxes in place, which would otherwise
    # corrupt the caller's annotations (notably when no augmentation replaces the tensor)
    new_boxes = boxes.clone()
    new_labels = labels

    # Skip the following operations for evaluation/testing
    if is_train:
        # A series of photometric distortions in random order, each with 50% chance of occurrence, as in Caffe repo
        new_image = photometric_distort(new_image)

        # Convert PIL image to Torch tensor
        new_image = FT.to_tensor(new_image)

        # Expand image (zoom out) with a 50% chance - helpful for training detection of small objects
        # Fill surrounding space with the mean of ImageNet data that our base VGG was trained on
        if random.random() < 0.5:
            new_image, new_boxes = expand(new_image, new_boxes, filler=mean)

        # Randomly crop image (zoom in)
        new_image, new_boxes, new_labels = random_crop(new_image, new_boxes, new_labels)

        # Convert Torch tensor to PIL image
        new_image = FT.to_pil_image(new_image)

        # Flip image with a 50% chance
        if random.random() < 0.5:
            new_image, new_boxes = flip(new_image, new_boxes)

    # Letterbox to a (resize, resize) canvas; the image comes back as an HWC uint8 numpy array
    new_image, new_boxes = resize_img_bbox_letterbox(new_image, new_boxes, resize)

    # Convert the letterboxed image to a Torch tensor
    new_image = FT.to_tensor(new_image)

    # Normalize by mean and standard deviation of ImageNet data that our base VGG was trained on
    new_image = FT.normalize(new_image, mean=mean, std=std)

    # Express box coordinates as fractions of the final image width/height
    new_boxes = normalize_bbox(new_image, new_boxes)

    return new_image, new_boxes, new_labels
def normalize_bbox(img, bbox):
    '''
    Scale bounding box coordinates to fractions of the image size.

    Arguments:
        img - image tensor (channel, height, width)
        bbox - bounding boxes, a tensor of dimensions (n_objects, >=4); columns 0 and 2 are
               x coordinates, columns 1 and 3 are y coordinates, any further columns are kept as-is
    Return:
        bbox - a new floating point tensor with columns 0/2 divided by the image width and
               columns 1/3 divided by the image height; the input tensor is not modified
    '''
    w, h = img.size(2), img.size(1)

    # Copy so the caller's boxes are not changed in place
    normalized = bbox.clone()
    # Integer boxes would truncate every fractional coordinate to 0
    if not normalized.is_floating_point():
        normalized = normalized.float()

    divisor = normalized.new_tensor([w, h, w, h])
    normalized[:, :4] = normalized[:, :4] / divisor
    return normalized
def resize_img_bbox_letterbox(img, bbox, size, return_coords=True):
    '''
    Resize an image to fit a square canvas while keeping its aspect ratio (letterboxing).

    Arguments:
        img - PIL image
        bbox - bounding boxes (xmin, ymin, xmax, ymax, ...), a tensor of dimensions (n_objects, >=4);
               only read when return_coords is True
        size - side length of the square output canvas
        return_coords - when True also return the boxes mapped onto the canvas
    Return:
        img - uint8 numpy array of shape (size, size, 3): the resized image centered on a
              zero-filled canvas, padded along the shorter side
        bbox - (only if return_coords) a new tensor with the first four columns scaled and shifted
               to canvas coordinates; the input tensor is not modified
    '''
    w, h = img.width, img.height
    scale = min(size / w, size / h)
    new_w = int(w * scale)
    new_h = int(h * scale)
    resized_image = FT.resize(img, (new_h, new_w))

    # Integer padding, shared by the image paste and the box shift so both stay aligned
    pad_x = (size - new_w) // 2
    pad_y = (size - new_h) // 2

    canvas = np.zeros((size, size, 3), dtype=np.uint8)
    canvas[pad_y:pad_y + new_h, pad_x:pad_x + new_w, :] = np.asarray(resized_image)

    if not return_coords:
        return canvas

    # Copy so the caller's boxes are not changed in place
    new_bbox = bbox.clone()
    if not new_bbox.is_floating_point():
        new_bbox = new_bbox.float()
    padding = torch.FloatTensor([pad_x, pad_y, pad_x, pad_y])
    new_bbox[:, :4] = new_bbox[:, :4] * scale + padding
    return canvas, new_bbox
def image_pytorch_format(img):
    """
    Convert a channels-last image (height, width, channel) to channels-first (channel, height, width).

    The axes are transposed rather than reshaped: reshape keeps the flat memory order and would
    scramble the pixels across channels.

    :param img: image of dimensions (height, width, channel), a numpy array or a Torch tensor
    :return: contiguous image of dimensions (channel, height, width), same type as the input
    """
    if torch.is_tensor(img):
        return img.permute(2, 0, 1).contiguous()
    return np.ascontiguousarray(np.transpose(img, (2, 0, 1)))