BASE_DIR = os.path.join(tempfile.gettempdir(), '_posenet_weights')

+ # Note that this file contains reverse-engineered documentation, with several notes about points that still need to be verified.
+

def to_output_strided_layers(convolution_def, output_stride):
+     """
+     There seem to be some magic formulas used in this function. The output magically aligns with the details of the layer
+     definition for MobilenetV1; it is not clear how reusable this is for other networks that use depthwise convolutions.
+
+     Note: Verify whether we can reuse this function for other networks, like MobilenetV2.
+
+     :param convolution_def: A MobileNet convolution definition selected from the config.yaml file.
+     :param output_stride: The chosen output stride. Note: check how the output stride is coupled to the chosen network
+         variables (see the load_variables function).
+     :return: A list containing an element for each layer, with the detailed layer specs defined in each of them.
+     """
    current_stride = 1
    rate = 1
    block_id = 0
@@ -21,28 +34,42 @@ def to_output_strided_layers(convolution_def, output_stride):
        conv_type = _a[0]
        stride = _a[1]

-         if current_stride == output_stride:
-             layer_stride = 1
+         if current_stride == output_stride:  # How often do we get here?
+             layer_stride = 1  # tf.nn.depthwise_conv2d requires the strides to be 1 when the rate (dilation) is > 1
            layer_rate = rate
-             rate *= stride
+             rate *= stride  # why is this?
        else:
            layer_stride = stride
-             layer_rate = 1
-             current_stride *= stride
+             layer_rate = 1  # tf.nn.depthwise_conv2d requires the rate (dilation) to be 1 when the strides are > 1
+             current_stride *= stride  # why is this?

        buff.append({
            'blockId': block_id,
            'convType': conv_type,
            'stride': layer_stride,
            'rate': layer_rate,
-             'outputStride': current_stride
+             'outputStride': current_stride  # Looks like the key 'outputStride' is never used anywhere.
        })
        block_id += 1

    return buff


def load_variables(chkpoint, base_dir=BASE_DIR):
+     """
+     Load all weights and biases from the C-struct binary files that the manifest.json file refers to into tensorflow variables,
+     and attach those to the manifest data structure as property 'x' under the corresponding variable name.
+     If no manifest is found, it is downloaded first, together with all the variable files it refers to.
+
+     :param chkpoint: The checkpoint name. This name is important because it is part of the URL structure the variables
+         are downloaded from, and the name is reused on the local filesystem for consistency.
+     :param base_dir: The local folder the posenet weights are downloaded into (usually a temp folder).
+     :return: The loaded content of the manifest, used as a data structure to which the tensorflow variables created in this
+         function are added, keyed under the 'x' property of each variable entry.
+
+     Note for refactoring: To make this function reusable for other networks, the weights downloader should either be
+     1/ made more generic, or 2/ extracted out of this function. Apart from that, this function is likely very reusable for other networks.
+     """
    manifest_path = os.path.join(base_dir, chkpoint, "manifest.json")
    if not os.path.exists(manifest_path):
        print('Weights for checkpoint %s are not downloaded. Downloading to %s ...' % (chkpoint, base_dir))
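The stride/rate bookkeeping above can be traced by hand to answer its own "why is this?" questions: strides are multiplied up until the requested output stride is reached, and every stride after that point is converted into dilation, so the receptive field keeps growing without shrinking the feature map further. A minimal self-contained sketch of the same logic, with a made-up, shortened layer definition (the real tables live in config.yaml):

# Hypothetical, shortened convolution definition in the same (convType, stride) shape
# that to_output_strided_layers expects; not the real MobileNetV1 table from config.yaml.
example_def = [
    ('conv2d', 2),
    ('separableConv', 1),
    ('separableConv', 2),
    ('separableConv', 2),
    ('separableConv', 2),
    ('separableConv', 1),
]

def trace_strided_layers(convolution_def, output_stride):
    current_stride, rate = 1, 1
    for conv_type, stride in convolution_def:
        if current_stride == output_stride:
            # Target resolution reached: stop striding, dilate instead.
            layer_stride, layer_rate = 1, rate
            rate *= stride
        else:
            layer_stride, layer_rate = stride, 1
            current_stride *= stride
        print(conv_type, layer_stride, layer_rate)

trace_strided_layers(example_def, output_stride=8)
# The strides multiply up to 8 (2 * 2 * 2); the remaining layers get stride 1,
# and the skipped stride reappears as a dilation rate of 2 on the last layer.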
@@ -67,7 +94,16 @@ def load_variables(chkpoint, base_dir=BASE_DIR):


def _read_imgfile(path, width, height):
+     """
+     Read an image file, resize it and normalize its values to match MobileNetV1's expected input features.
+
+     :param path: The filesystem path where the image is located.
+     :param width: The requested target width of the image.
+     :param height: The requested target height of the image.
+     :return: The resized image with normalized pixels, as a 3D array (height, width, channels).
+     """
    img = cv2.imread(path)
+     # Note that the cv2.resize size argument is indeed (width, height), while the image shape from cv2.imread is (height, width, channels).
    img = cv2.resize(img, (width, height))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.astype(float)
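The (width, height) remark above is easy to confirm with a throwaway array, no image file needed:

import cv2
import numpy as np

img = np.zeros((480, 640, 3), dtype=np.uint8)  # cv2.imread-style shape: (height, width, channels)
resized = cv2.resize(img, (257, 193))          # cv2.resize takes (width, height)
print(resized.shape)                           # -> (193, 257, 3), i.e. back to (height, width, channels)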
@@ -76,6 +112,19 @@ def _read_imgfile(path, width, height):


def build_network(image, layers, variables):
+     """
+     Build a tensorflow network instance based on the definition in the 'layers' parameter and the given variables.
+     The layer names used are MobileNetV1-specific.
+
+     Note: See how/if this can be made more generic, to build other networks like MobileNetV2 / ResNet50 / ...
+
+     :param image: The tensor placeholder that will be used to feed image data into the network. It is the starting point of the network.
+     :param layers: The layer definitions as produced by the 'to_output_strided_layers' function.
+     :param variables: The variables that instantiate the requested network. This parameter is the network's manifest as loaded
+         from the manifest.json file, enriched (by the 'load_variables' function) with tensorflow variables loaded from the
+         variable snapshot files the manifest refers to.
+     :return: The built tensorflow network, i.e. its output tensors (heatmaps, offsets, forward and backward displacements).
+     """

    def _weights(layer_name):
        return variables["MobilenetV1/" + layer_name + "/weights"]['x']
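The 'variables' structure that _weights (and the other helpers) index into is, per the load_variables docstring, the parsed manifest.json enriched with a tensorflow variable under 'x'. A sketch of the assumed shape; apart from 'x' and the variable-name keys taken from this file, the fields and values are illustrative:

import numpy as np
import tensorflow as tf

variables = {
    "MobilenetV1/Conv2d_0/weights": {
        "shape": [3, 3, 3, 8],                                       # assumed manifest.json field
        "x": tf.Variable(np.zeros([3, 3, 3, 8], dtype=np.float32)),  # added by load_variables
    },
    "MobilenetV1/Conv2d_0/biases": {
        "shape": [8],
        "x": tf.Variable(np.zeros([8], dtype=np.float32)),
    },
}

weights_0 = variables["MobilenetV1/Conv2d_0/weights"]['x']  # what _weights("Conv2d_0") returns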
@@ -94,7 +143,9 @@ def _conv_to_output(mobile_net_output, output_layer_name):
    def _conv(inputs, stride, block_id):
        return tf.nn.relu6(
            tf.nn.conv2d(inputs, _weights("Conv2d_" + str(block_id)), stride, padding='SAME')
-             + _biases("Conv2d_" + str(block_id)))
+             +
+             _biases("Conv2d_" + str(block_id))
+         )

    def _separable_conv(inputs, stride, block_id, dilations):
        if dilations is None:
@@ -103,8 +154,12 @@ def _separable_conv(inputs, stride, block_id, dilations):
        dw_layer = "Conv2d_" + str(block_id) + "_depthwise"
        pw_layer = "Conv2d_" + str(block_id) + "_pointwise"

-         w = tf.nn.depthwise_conv2d(
-             inputs, _depthwise_weights(dw_layer), stride, 'SAME', rate=dilations, data_format='NHWC')
+         # 'NHWC' = data format [batch, height, width, channels]
+         # The dilations give the spacing between the filter taps in the height and width dimensions (atrous convolution).
+         # A depthwise convolution applies a separate filter of depth 1 to each input channel instead of one filter spanning
+         # all channels, so fewer variables have to be learned, giving a faster but less accurate network. When the rate
+         # (dilation) is > 1, the strides must all be 1, see: https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/nn/depthwise_conv2d
+         w = tf.nn.depthwise_conv2d(inputs, _depthwise_weights(dw_layer), stride, 'SAME', rate=dilations, data_format='NHWC')
        w = tf.nn.bias_add(w, _biases(dw_layer))
        w = tf.nn.relu6(w)
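The stride/rate constraint referenced above can be checked with dummy tensors; this assumes a TF 1.x runtime, which the rate= keyword and the linked r1.15 docs imply:

import numpy as np
import tensorflow as tf

inputs = tf.constant(np.random.rand(1, 33, 33, 3).astype(np.float32))   # NHWC
dw_filter = tf.constant(np.random.rand(3, 3, 3, 1).astype(np.float32))  # one depth-1 filter per input channel

# Dilated: rate > 1, so the strides must be all ones.
dilated = tf.nn.depthwise_conv2d(inputs, dw_filter, [1, 1, 1, 1], 'SAME', rate=[2, 2])

# Strided: strides > 1, so the rate must stay at 1 (or None).
strided = tf.nn.depthwise_conv2d(inputs, dw_filter, [1, 2, 2, 1], 'SAME', rate=[1, 1])

with tf.Session() as sess:
    print(sess.run(dilated).shape)  # (1, 33, 33, 3)
    print(sess.run(strided).shape)  # (1, 17, 17, 3)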
@@ -115,24 +170,27 @@ def _separable_conv(inputs, stride, block_id, dilations):
        return w

    x = image
-     buff = []
+     buff = []  # remove this buffer, it seems unused
    with tf.variable_scope(None, 'MobilenetV1'):

        for m in layers:
            stride = [1, m['stride'], m['stride'], 1]
            rate = [m['rate'], m['rate']]
            if m['convType'] == "conv2d":
                x = _conv(x, stride, m['blockId'])
-                 buff.append(x)
+                 buff.append(x)  # remove this buffer
            elif m['convType'] == "separableConv":
                x = _separable_conv(x, stride, m['blockId'], rate)
-                 buff.append(x)
+                 buff.append(x)  # remove this buffer

    heatmaps = _conv_to_output(x, 'heatmap_2')
    offsets = _conv_to_output(x, 'offset_2')
    displacement_fwd = _conv_to_output(x, 'displacement_fwd_2')
    displacement_bwd = _conv_to_output(x, 'displacement_bwd_2')
    heatmaps = tf.sigmoid(heatmaps, 'heatmap')
+     # It looks like the outputs 'partheat', 'partoff' and 'segment' are not used.
+     # It looks like only the '_2' variants of 'heatmap', 'offset', 'displacement_fwd' and 'displacement_bwd' are used.
+     # To verify: Are the '_2' variants coupled to the output stride of 16 chosen in the config.yaml file?

    return heatmaps, offsets, displacement_fwd, displacement_bwd
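Taken together, the pieces above appear to combine as sketched below. Only names that occur in this file are used; the placeholder shape and the way model_id selects a checkpoint mirror convert() but have not been verified:

def _build_from_config(model_id):
    # Hedged sketch: wire up the helpers in this file for one checkpoint.
    cfg = load_config()
    chkpoint = cfg['checkpoints'][model_id]
    layers = to_output_strided_layers(cfg['mobileNet100Architecture'], cfg['outputStride'])
    variables = load_variables(chkpoint)

    # Assumed input shape: one image of cfg['imageSize'] x cfg['imageSize'] with 3 RGB channels.
    image = tf.placeholder(tf.float32, shape=[1, cfg['imageSize'], cfg['imageSize'], 3], name='image')
    return image, build_network(image, layers, variables)  # -> (heatmaps, offsets, displacement_fwd, displacement_bwd)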
@@ -141,7 +199,7 @@ def convert(model_id, model_dir, check=False):
    cfg = load_config()
    checkpoints = cfg['checkpoints']
    image_size = cfg['imageSize']
-     output_stride = cfg['outputStride']
+     output_stride = cfg['outputStride']  # To verify: is this output_stride coupled to the downloaded weights? (current assumption: yes)
    chkpoint = checkpoints[model_id]

    if chkpoint == 'mobilenet_v1_050':
@@ -150,6 +208,7 @@
        mobile_net_arch = cfg['mobileNet75Architecture']
    else:
        mobile_net_arch = cfg['mobileNet100Architecture']
+         # The 'mobilenet_v1_101' checkpoint seems to use the same architecture as 'mobileNet100Architecture'.

    width = image_size
    height = image_size
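For reference, the config.yaml structure that convert() reads can be pictured as the dict below. The keys are the ones used in this file; the concrete values (checkpoint order, image size, architecture rows) are illustrative assumptions, not the real config:

example_cfg = {
    'checkpoints': ['mobilenet_v1_050', 'mobilenet_v1_075', 'mobilenet_v1_100', 'mobilenet_v1_101'],
    'imageSize': 513,    # assumed value
    'outputStride': 16,  # the value the '_2' output names above are suspected to be coupled to
    'mobileNet100Architecture': [
        ['conv2d', 2],
        ['separableConv', 1],
        ['separableConv', 2],
        # ... one (convType, stride) row per MobileNetV1 layer ...
    ],
    # plus the 'mobileNet75Architecture' (and presumably a 50 variant) selected above
}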