
Commit 194c8b4

Adding documentation and notes for refactoring.
1 parent aa6d0e9 commit 194c8b4

File tree: 2 files changed, +72 -14 lines

README.md

Lines changed: 0 additions & 1 deletion
@@ -86,4 +86,3 @@ The Python conversion code that started me on my way was adapted from the CoreML
 * Performance improvements (especially edge loops in 'decode.py')
 * OpenGL rendering/drawing
 * Comment interfaces, tensor dimensions, etc
-

posenet/converter/tfjs2python.py

Lines changed: 72 additions & 13 deletions
@@ -11,8 +11,21 @@
 
 BASE_DIR = os.path.join(tempfile.gettempdir(), '_posenet_weights')
 
+# Note that this file contains reverse-engineered documentation, with several notes about points that still need to be verified.
+
 
 def to_output_strided_layers(convolution_def, output_stride):
+    """
+    There seem to be some magic formulas used in this function. The output magically aligns with the details of the layer
+    definition for MobilenetV1. Not sure how reusable this is for other networks that use depthwise convolutions.
+
+    Note: Verify whether we can reuse this function for other networks, like MobilenetV2.
+
+    :param convolution_def: A MobileNet convolution definition selected from the config.yaml file.
+    :param output_stride: The chosen output stride. Note: check how the output stride is coupled to the chosen network
+        variables (see the load_variables function).
+    :return: An array with one element per layer, each containing that layer's detailed specs.
+    """
     current_stride = 1
     rate = 1
     block_id = 0
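
Note (illustration only, not part of the diff): the "magic formulas" the docstring above mentions, and the "why is this?" notes in the next hunk, appear to be the usual atrous-convolution bookkeeping: once the accumulated stride reaches the requested output_stride, every further stride is applied as dilation instead, so the feature map stops shrinking while the receptive field keeps growing. A minimal sketch of that logic, using a hypothetical toy definition rather than the MobilenetV1 definition from config.yaml:

def simulate_strided_layers(convolution_def, output_stride):
    # Same bookkeeping as to_output_strided_layers, reduced to (type, stride, rate) tuples.
    current_stride, rate, specs = 1, 1, []
    for conv_type, stride in convolution_def:
        if current_stride == output_stride:
            # Target resolution reached: run with stride 1 and fold the skipped
            # stride into the dilation rate of the following layers.
            specs.append((conv_type, 1, rate))
            rate *= stride
        else:
            specs.append((conv_type, stride, 1))
            current_stride *= stride
    return specs

toy_def = [('conv2d', 2), ('separableConv', 1), ('separableConv', 2),
           ('separableConv', 2), ('separableConv', 2), ('separableConv', 1)]
print(simulate_strided_layers(toy_def, output_stride=8))
# The fifth layer's stride 2 is converted into stride 1 and reappears as rate (dilation) 2
# on the sixth layer, so the overall downscaling stays at 8 instead of 16.
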
@@ -21,28 +34,42 @@ def to_output_strided_layers(convolution_def, output_stride):
         conv_type = _a[0]
         stride = _a[1]
 
-        if current_stride == output_stride:
-            layer_stride = 1
+        if current_stride == output_stride:  # How often do we get here?
+            layer_stride = 1  # tf.nn.depthwise_conv2d nets require the strides to be 1 when the rate (dilation) is >1
             layer_rate = rate
-            rate *= stride
+            rate *= stride  # why is this?
         else:
             layer_stride = stride
-            layer_rate = 1
-            current_stride *= stride
+            layer_rate = 1  # tf.nn.depthwise_conv2d nets require the rate (dilation) to be 1 when the strides are >1
+            current_stride *= stride  # why is this?
 
         buff.append({
             'blockId': block_id,
             'convType': conv_type,
             'stride': layer_stride,
             'rate': layer_rate,
-            'outputStride': current_stride
+            'outputStride': current_stride  # Looks like the variable 'outputStride' is never used anywhere.
         })
         block_id += 1
 
     return buff
 
 
 def load_variables(chkpoint, base_dir=BASE_DIR):
+    """
+    Load all weights and biases from the C-struct binary files that the manifest.json file refers to into tensorflow variables,
+    and attach those to the manifest data structure as property 'x' under their corresponding variable name.
+    If no manifest is found, it is downloaded first, together with all the variable files it refers to.
+
+    :param chkpoint: The checkpoint name. This name is important because it is part of the URL structure the variables
+        are downloaded from, and the name is reused on the local filesystem for consistency.
+    :param base_dir: The local folder the posenet weights are downloaded to (usually in a temp folder).
+    :return: The loaded manifest content, used as a data structure to which the tensorflow variables created in this
+        function are added under the 'x' property of each variable entry.
+
+    Note for refactoring: To make this function reusable for other networks, the weights downloader should either be
+    1/ made more generic, or 2/ extracted out of this function. Apart from that, this function is likely very reusable for other networks.
+    """
     manifest_path = os.path.join(base_dir, chkpoint, "manifest.json")
     if not os.path.exists(manifest_path):
         print('Weights for checkpoint %s are not downloaded. Downloading to %s ...' % (chkpoint, base_dir))
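
Note (illustration only, not part of the diff): a rough sketch of the data structure the load_variables docstring above describes, assuming the usual tfjs weight-manifest layout (variable name -> {"filename": ..., "shape": [...]}) and little-endian float32 variable files; the actual download/parse code in this file may differ. The 'x' property added here is what build_network looks up later.

import json
import os

import numpy as np
import tensorflow as tf

def sketch_load_variables(chkpoint, base_dir):
    # Assumes the manifest and the variable files are already on disk.
    manifest_path = os.path.join(base_dir, chkpoint, "manifest.json")
    with open(manifest_path) as f:
        manifest = json.load(f)
    for name, entry in manifest.items():
        raw = np.fromfile(os.path.join(base_dir, chkpoint, entry['filename']), dtype=np.float32)
        # Attach the tensorflow variable under the 'x' property, as described in the docstring.
        entry['x'] = tf.Variable(raw.reshape(entry['shape']), name=name)
    return manifest
# e.g. sketch_load_variables(...)["MobilenetV1/Conv2d_0/weights"]['x'] would then be a tf.Variable.
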
@@ -67,7 +94,16 @@ def load_variables(chkpoint, base_dir=BASE_DIR):
 
 
 def _read_imgfile(path, width, height):
+    """
+    Read an image file, resize it, and normalize its values to match MobileNetV1's expected input features.
+
+    :param path: The path on the filesystem where the image is located.
+    :param width: The requested image target width.
+    :param height: The requested image target height.
+    :return: The resized image with normalized pixels as a 3D array (height, width, channels).
+    """
     img = cv2.imread(path)
+    # The cv2.resize size argument is indeed (width, height), while the image shape from cv2.imread is (height, width, channels).
     img = cv2.resize(img, (width, height))
     img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
     img = img.astype(float)
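
Note (illustration only, not part of the diff): MobileNetV1 expects its input pixels roughly in the range [-1, 1], so the normalization the docstring above refers to is presumably a linear rescaling of the 0..255 values. The exact line lies outside this hunk, so treat the last step of this sketch as an assumption.

import cv2
import numpy as np

def sketch_read_imgfile(path, width, height):
    img = cv2.imread(path)                  # shape (height, width, channels), BGR
    img = cv2.resize(img, (width, height))  # note: cv2.resize takes (width, height)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.astype(np.float32)
    return img * (2.0 / 255.0) - 1.0        # map 0..255 to roughly -1..1
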
@@ -76,6 +112,19 @@ def _read_imgfile(path, width, height):
 
 
 def build_network(image, layers, variables):
+    """
+    Build a tensorflow network instance based on the definition in the 'layers' parameter and the given variables.
+    The layer names used are MobileNetV1 specific.
+
+    Note: See how/if this can be made more generic, to build other networks like MobileNetV2 / ResNet50 / ...
+
+    :param image: The tensor placeholder that will be used to feed image data into the network. It is the starting point of the network.
+    :param layers: The layer definitions as produced by the 'to_output_strided_layers' function.
+    :param variables: The variables that instantiate the requested network. This parameter is the network's manifest, as
+        loaded from the manifest.json file and enriched with the tensorflow variables loaded from the variable
+        snapshot files the manifest refers to (by the 'load_variables' function).
+    :return: The built tensorflow network.
+    """
 
     def _weights(layer_name):
         return variables["MobilenetV1/" + layer_name + "/weights"]['x']
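
Note (illustration only, not part of the diff): the naming convention the _weights helper above relies on, shown with a tiny fake manifest standing in for the structure load_variables produces; the names and shapes here are hypothetical examples.

import tensorflow as tf

fake_variables = {
    "MobilenetV1/Conv2d_0/weights": {'x': tf.Variable(tf.zeros([3, 3, 3, 8]))},
    "MobilenetV1/Conv2d_0/biases": {'x': tf.Variable(tf.zeros([8]))},
}

def example_weights(layer_name, variables=fake_variables):
    # Same lookup pattern as _weights/_biases in build_network.
    return variables["MobilenetV1/" + layer_name + "/weights"]['x']

print(example_weights("Conv2d_0").shape)  # (3, 3, 3, 8)
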
@@ -94,7 +143,9 @@ def _conv_to_output(mobile_net_output, output_layer_name):
     def _conv(inputs, stride, block_id):
         return tf.nn.relu6(
             tf.nn.conv2d(inputs, _weights("Conv2d_" + str(block_id)), stride, padding='SAME')
-            + _biases("Conv2d_" + str(block_id)))
+            +
+            _biases("Conv2d_" + str(block_id))
+        )
 
     def _separable_conv(inputs, stride, block_id, dilations):
         if dilations is None:
@@ -103,8 +154,12 @@ def _separable_conv(inputs, stride, block_id, dilations):
         dw_layer = "Conv2d_" + str(block_id) + "_depthwise"
         pw_layer = "Conv2d_" + str(block_id) + "_pointwise"
 
-        w = tf.nn.depthwise_conv2d(
-            inputs, _depthwise_weights(dw_layer), stride, 'SAME', rate=dilations, data_format='NHWC')
+        # 'NHWC' = data format [batch, height, width, channels]
+        # The dilations (rate) space out the filter taps in the height and width dimensions, giving an atrous (dilated) convolution.
+        # A depthwise convolution applies a separate single-channel filter to each input channel, so fewer variables have to be
+        # learned, giving a faster but less accurate network. When the rate (dilation) is greater than 1, the strides
+        # must all be 1, see: https://www.tensorflow.org/versions/r1.15/api_docs/python/tf/nn/depthwise_conv2d
+        w = tf.nn.depthwise_conv2d(inputs, _depthwise_weights(dw_layer), stride, 'SAME', rate=dilations, data_format='NHWC')
         w = tf.nn.bias_add(w, _biases(dw_layer))
         w = tf.nn.relu6(w)
 
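
Note (illustration only, not part of the diff): a back-of-the-envelope comparison of the number of learned variables in a regular convolution versus a depthwise-separable one, to back up the comments above; the layer size is a hypothetical example.

k, c_in, c_out = 3, 256, 256                     # hypothetical 3x3 layer, 256 -> 256 channels
regular = k * k * c_in * c_out                   # one k x k x c_in filter per output channel
separable = k * k * c_in + c_in * c_out          # depthwise (one k x k filter per channel) + 1x1 pointwise
print(regular, separable, round(regular / separable, 1))  # 589824 67840 8.7
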
@@ -115,24 +170,27 @@ def _separable_conv(inputs, stride, block_id, dilations):
         return w
 
     x = image
-    buff = []
+    buff = []  # remove this buffer, seems like it's not used
     with tf.variable_scope(None, 'MobilenetV1'):
 
         for m in layers:
             stride = [1, m['stride'], m['stride'], 1]
             rate = [m['rate'], m['rate']]
             if m['convType'] == "conv2d":
                 x = _conv(x, stride, m['blockId'])
-                buff.append(x)
+                buff.append(x)  # remove this buffer
             elif m['convType'] == "separableConv":
                 x = _separable_conv(x, stride, m['blockId'], rate)
-                buff.append(x)
+                buff.append(x)  # remove this buffer
 
         heatmaps = _conv_to_output(x, 'heatmap_2')
         offsets = _conv_to_output(x, 'offset_2')
         displacement_fwd = _conv_to_output(x, 'displacement_fwd_2')
         displacement_bwd = _conv_to_output(x, 'displacement_bwd_2')
         heatmaps = tf.sigmoid(heatmaps, 'heatmap')
+        # It looks like the outputs 'partheat', 'partoff' and 'segment' are not used.
+        # It looks like only the '_2' variants of 'heatmap', 'offset', 'displacement_fwd' and 'displacement_bwd' are used.
+        # To verify: Are the '_2' variants coupled to the choice of output stride 16 in the config.yaml file?
 
     return heatmaps, offsets, displacement_fwd, displacement_bwd
 
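
Note (illustration only, not part of the diff): how the input size and output stride relate to the resolution of the four returned tensors. The 513 input size and the 17 COCO keypoints are the values PoseNet normally uses; treat them here as assumptions about config.yaml rather than facts read from it.

image_size, output_stride, num_keypoints = 513, 16, 17
output_resolution = (image_size - 1) // output_stride + 1
print(output_resolution)  # 33
# Expected shapes (batch of 1): heatmaps (1, 33, 33, 17), offsets (1, 33, 33, 34),
# displacement_fwd and displacement_bwd (1, 33, 33, 32) for the 16 part-to-part edges.
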
@@ -141,7 +199,7 @@ def convert(model_id, model_dir, check=False):
     cfg = load_config()
     checkpoints = cfg['checkpoints']
     image_size = cfg['imageSize']
-    output_stride = cfg['outputStride']
+    output_stride = cfg['outputStride']  # to verify: is this output_stride coupled to the downloaded weights? (current assumption is 'yes')
     chkpoint = checkpoints[model_id]
 
     if chkpoint == 'mobilenet_v1_050':
@@ -150,6 +208,7 @@ def convert(model_id, model_dir, check=False):
         mobile_net_arch = cfg['mobileNet75Architecture']
     else:
         mobile_net_arch = cfg['mobileNet100Architecture']
+        # The 'mobilenet_v1_101' seems to have the same architecture as 'mobileNet100Architecture'.
 
     width = image_size
     height = image_size
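
Note (illustration only, not part of the diff): a rough sketch of how the functions documented in this commit appear to fit together inside convert(); the architecture selection and session handling are simplified assumptions, not the literal implementation.

import tensorflow as tf

def sketch_convert(model_id):
    cfg = load_config()
    chkpoint = cfg['checkpoints'][model_id]
    layers = to_output_strided_layers(cfg['mobileNet100Architecture'], cfg['outputStride'])
    variables = load_variables(chkpoint)
    with tf.Session() as sess:
        image = tf.placeholder(tf.float32, [1, cfg['imageSize'], cfg['imageSize'], 3], name='image')
        outputs = build_network(image, layers, variables)
        sess.run(tf.global_variables_initializer())
        return outputs  # heatmaps, offsets, displacement_fwd, displacement_bwd
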
