"""
Build graph for both training and inference
"""
import tensorflow as tf
class ProposalModel(object):
def __init__(self, options):
self.options = options
self.initializer = tf.random_uniform_initializer(
minval = - self.options['init_scale'],
maxval = self.options['init_scale'])
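
    # The options dict is expected to provide at least the keys used in this file:
    # 'init_scale', 'video_feat_dim', 'rnn_type' ('lstm' or 'gru'), 'rnn_size',
    # 'num_rnn_layers', 'num_anchors' and 'rnn_drop'.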
    # build the inference graph to get proposal events
    def build_proposal_inference(self, reuse=False):
        inputs = {}
        outputs = {}

        ## dim1: batch, dim2: video sequence length, dim3: video feature dimension
        ## video feature sequence
        video_feat = tf.placeholder(tf.float32, [None, None, self.options['video_feat_dim']], name='video_feat')
        inputs['video_feat'] = video_feat

        ## video feature mask, to enable variable-length feature sequence input
        video_feat_mask = tf.placeholder(tf.float32, [None, None], name='video_feat_mask')
        inputs['video_feat_mask'] = video_feat_mask

        batch_size = tf.shape(video_feat)[0]

        # set rnn cell type
        def get_rnn_cell():
            if self.options['rnn_type'] == 'lstm':
                rnn_cell_video = tf.contrib.rnn.LSTMCell(
                    num_units=self.options['rnn_size'],
                    state_is_tuple=True,
                    initializer=tf.orthogonal_initializer()
                )
            elif self.options['rnn_type'] == 'gru':
                rnn_cell_video = tf.contrib.rnn.GRUCell(
                    num_units=self.options['rnn_size']
                )
            else:
                raise ValueError('Unsupported RNN type.')
            return rnn_cell_video

        if self.options['rnn_type'] == 'lstm':
            multi_rnn_cell_video = tf.contrib.rnn.MultiRNNCell([get_rnn_cell() for _ in range(self.options['num_rnn_layers'])], state_is_tuple=True)
        elif self.options['rnn_type'] == 'gru':
            multi_rnn_cell_video = tf.contrib.rnn.MultiRNNCell([get_rnn_cell() for _ in range(self.options['num_rnn_layers'])])
        else:
            raise ValueError('Unsupported RNN type.')

        with tf.variable_scope('proposal_module', reuse=reuse) as proposal_scope:
            # video feature sequence encoding: use a multi-layer RNN (LSTM or GRU)
            with tf.variable_scope('video_encoder', reuse=reuse) as scope:
                sequence_length = tf.reduce_sum(video_feat_mask, axis=-1)
                initial_state = multi_rnn_cell_video.zero_state(batch_size=batch_size, dtype=tf.float32)
                rnn_outputs, _ = tf.nn.dynamic_rnn(
                    cell=multi_rnn_cell_video,
                    inputs=video_feat,
                    sequence_length=sequence_length,
                    initial_state=initial_state,
                    dtype=tf.float32
                )

            rnn_outputs_reshape = tf.reshape(rnn_outputs, [-1, self.options['rnn_size']], name='rnn_outputs_reshape')

            # predict proposals at each time step: a fully connected layer outputs a score for every anchor
            with tf.variable_scope('predict_proposal', reuse=reuse) as scope:
                logit_output = tf.contrib.layers.fully_connected(
                    inputs=rnn_outputs_reshape,
                    num_outputs=self.options['num_anchors'],
                    activation_fn=None
                )

        # sigmoid score for each anchor at each time step
        proposal_score = tf.sigmoid(logit_output, name='proposal_score')
        proposal_score = tf.reshape(proposal_score, [batch_size, -1, self.options['num_anchors']])

        # outputs from the proposal module
        outputs['proposal_score'] = proposal_score

        return inputs, outputs
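
    # NOTE: 'proposal_score' has shape (batch_size, time_steps, num_anchors); entry
    # [b, t, k] is the sigmoid confidence that the k-th anchor at time step t of
    # video b is an event proposal. A minimal usage sketch (assuming a TF1 session
    # `sess`, a constructed `model`, and numpy arrays `feats` / `mask`):
    #
    #   inputs, outputs = model.build_proposal_inference()
    #   scores = sess.run(outputs['proposal_score'],
    #                     feed_dict={inputs['video_feat']: feats,
    #                                inputs['video_feat_mask']: mask})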
    def build_train(self):
        inputs = {}
        outputs = {}

        ## dim1: batch, dim2: video sequence length, dim3: video feature dimension
        ## video feature sequence
        video_feat = tf.placeholder(tf.float32, [None, None, self.options['video_feat_dim']], name='video_feat')
        inputs['video_feat'] = video_feat

        ## video feature mask, to enable variable-length feature sequence input
        video_feat_mask = tf.placeholder(tf.float32, [None, None], name='video_feat_mask')
        inputs['video_feat_mask'] = video_feat_mask

        ## proposal data, densely annotated
        proposal = tf.placeholder(tf.int32, [None, None, self.options['num_anchors']], name='proposal')
        inputs['proposal'] = proposal

        ## weighting for positive/negative labels (to address the data imbalance problem)
        proposal_weight = tf.placeholder(tf.float32, [self.options['num_anchors'], 2], name='proposal_weight')
        inputs['proposal_weight'] = proposal_weight

        # get batch size, which is a scalar tensor
        batch_size = tf.shape(video_feat)[0]

        if self.options['rnn_drop'] > 0:
            print('using dropout in rnn!')

        # set rnn dropout rate (1 - keep probability)
        rnn_drop = tf.placeholder(tf.float32)
        inputs['rnn_drop'] = rnn_drop

        def get_rnn_cell():
            if self.options['rnn_type'] == 'lstm':
                rnn_cell_video = tf.contrib.rnn.LSTMCell(
                    num_units=self.options['rnn_size'],
                    state_is_tuple=True,
                    initializer=tf.orthogonal_initializer()
                )
            elif self.options['rnn_type'] == 'gru':
                rnn_cell_video = tf.contrib.rnn.GRUCell(
                    num_units=self.options['rnn_size']
                )
            else:
                raise ValueError('Unsupported RNN type.')

            rnn_cell_video = tf.contrib.rnn.DropoutWrapper(
                rnn_cell_video,
                input_keep_prob=1.0 - rnn_drop,
                output_keep_prob=1.0 - rnn_drop
            )
            return rnn_cell_video

        if self.options['rnn_type'] == 'lstm':
            multi_rnn_cell_video = tf.contrib.rnn.MultiRNNCell([get_rnn_cell() for _ in range(self.options['num_rnn_layers'])], state_is_tuple=True)
        elif self.options['rnn_type'] == 'gru':
            multi_rnn_cell_video = tf.contrib.rnn.MultiRNNCell([get_rnn_cell() for _ in range(self.options['num_rnn_layers'])])
        else:
            raise ValueError('Unsupported RNN type.')

        with tf.variable_scope('proposal_module') as proposal_scope:
            # video feature sequence encoding: use a multi-layer RNN (LSTM or GRU)
            with tf.variable_scope('video_encoder') as scope:
                sequence_length = tf.reduce_sum(video_feat_mask, axis=-1)
                initial_state = multi_rnn_cell_video.zero_state(batch_size=batch_size, dtype=tf.float32)
                rnn_outputs, _ = tf.nn.dynamic_rnn(
                    cell=multi_rnn_cell_video,
                    inputs=video_feat,
                    sequence_length=sequence_length,
                    initial_state=initial_state,
                    dtype=tf.float32
                )

            rnn_outputs_reshape = tf.reshape(rnn_outputs, [-1, self.options['rnn_size']], name='rnn_outputs_reshape')

            # predict proposals at each time step: a fully connected layer outputs a score for every anchor
            with tf.variable_scope('predict_proposal') as scope:
                logit_output = tf.contrib.layers.fully_connected(
                    inputs=rnn_outputs_reshape,
                    num_outputs=self.options['num_anchors'],
                    activation_fn=None
                )

        # calculate multi-label loss: use a weighted binary cross entropy objective
        proposal_reshape = tf.reshape(proposal, [-1, self.options['num_anchors']], name='proposal_reshape')
        proposal_float = tf.to_float(proposal_reshape)

        # weighting for positive samples
        weight0 = tf.reshape(proposal_weight[:, 0], [-1, self.options['num_anchors']])
        # weighting for negative samples (computed here but not used in the loss below)
        weight1 = tf.reshape(proposal_weight[:, 1], [-1, self.options['num_anchors']])

        # tile the weights batch_size * time_steps times
        weight0 = tf.tile(weight0, [tf.shape(logit_output)[0], 1])
        weight1 = tf.tile(weight1, [tf.shape(logit_output)[0], 1])

        # get the weighted sigmoid cross entropy loss
        loss_term = tf.nn.weighted_cross_entropy_with_logits(targets=proposal_float, logits=logit_output, pos_weight=weight0)
        loss_term_sum = tf.reduce_sum(loss_term, axis=-1, name='loss_term_sum')

        video_feat_mask = tf.to_float(tf.reshape(video_feat_mask, [-1]))
        proposal_loss = tf.reduce_sum(tf.multiply(loss_term_sum, video_feat_mask)) / tf.reduce_sum(video_feat_mask)
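
        # The mask zeroes out the loss at padded time steps, so proposal_loss is the
        # per-anchor-summed weighted cross entropy averaged over valid time steps only.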

        # summary data, for visualization using TensorBoard
        tf.summary.scalar('proposal_loss', proposal_loss)

        # outputs from the proposal module
        outputs['loss'] = proposal_loss

        # L2 regularization
        reg_loss = tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
        outputs['reg_loss'] = reg_loss

        return inputs, outputs
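

# A minimal usage sketch: the option values, learning rate and regularization
# weight below are illustrative assumptions, not the project's real configuration.
# It builds the training graph, attaches an Adam optimizer to the weighted proposal
# loss plus an L2 penalty, and runs a single training step on random dummy data.
if __name__ == '__main__':
    import numpy as np

    options = {
        'init_scale': 0.08,       # assumed
        'video_feat_dim': 500,    # assumed feature size
        'rnn_type': 'lstm',
        'rnn_size': 512,          # assumed
        'num_rnn_layers': 2,      # assumed
        'num_anchors': 64,        # assumed
        'rnn_drop': 0.3,          # assumed dropout rate
    }
    model = ProposalModel(options)
    inputs, outputs = model.build_train()

    # total loss = proposal loss + (assumed) small L2 penalty
    total_loss = outputs['loss'] + 1e-6 * outputs['reg_loss']
    train_op = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(total_loss)

    batch_size, time_steps = 2, 20
    feed = {
        inputs['video_feat']: np.random.rand(batch_size, time_steps, options['video_feat_dim']).astype(np.float32),
        inputs['video_feat_mask']: np.ones((batch_size, time_steps), dtype=np.float32),
        inputs['proposal']: np.random.randint(0, 2, (batch_size, time_steps, options['num_anchors'])).astype(np.int32),
        inputs['proposal_weight']: np.ones((options['num_anchors'], 2), dtype=np.float32),
        inputs['rnn_drop']: 0.0,  # a dropout rate of 0 keeps all units
    }
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        _, loss_value = sess.run([train_op, total_loss], feed_dict=feed)
        print('one training step done, loss = %.4f' % loss_value)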