model.py
# keras imports
from keras.layers import (Dense, Input, LSTM, Dropout, Bidirectional,
                          BatchNormalization, Embedding, concatenate)
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.models import load_model, Model

# std imports
import time
import gc
import os

from input_handler import create_train_dev_set


class SiameseBiLSTM:
    def __init__(self, embedding_dim, max_sequence_length, number_lstm, number_dense, rate_drop_lstm,
                 rate_drop_dense, hidden_activation, validation_split_ratio):
        self.embedding_dim = embedding_dim
        self.max_sequence_length = max_sequence_length
        self.number_lstm_units = number_lstm
        self.rate_drop_lstm = rate_drop_lstm
        self.number_dense_units = number_dense
        self.activation_function = hidden_activation
        self.rate_drop_dense = rate_drop_dense
        self.validation_split_ratio = validation_split_ratio

    def train_model(self, sentences_pair, is_similar, embedding_meta_data, model_save_directory='./'):
        """
        Train the Siamese network to score similarity between the sentences in `sentences_pair`.

        Steps involved:
            1. Pass each sentence of a pair through the shared bidirectional LSTM encoder.
            2. Concatenate the two encoded vectors (plus the leaks features) and pass them through
               dense layers with dropout and batch normalisation.
            3. Feed the dense output to a sigmoid output layer.
            4. Train the weights with binary cross-entropy loss.

        Args:
            sentences_pair (list): list of tuples of sentence pairs
            is_similar (list): target values; 1 if the sentences in a pair are similar, otherwise 0
            embedding_meta_data (dict): dict containing the tokenizer and the word embedding matrix
            model_save_directory (str): working directory in which to save models

        Returns:
            best_model_path (str): path of the best saved model
        """
        tokenizer, embedding_matrix = embedding_meta_data['tokenizer'], embedding_meta_data['embedding_matrix']

        train_data_x1, train_data_x2, train_labels, leaks_train, \
            val_data_x1, val_data_x2, val_labels, leaks_val = create_train_dev_set(tokenizer, sentences_pair,
                                                                                   is_similar, self.max_sequence_length,
                                                                                   self.validation_split_ratio)

        if train_data_x1 is None:
            print("++++ !! Failure: Unable to train model ++++")
            return None

        nb_words = len(tokenizer.word_index) + 1

        # Creating word embedding layer
        embedding_layer = Embedding(nb_words, self.embedding_dim, weights=[embedding_matrix],
                                    input_length=self.max_sequence_length, trainable=False)

        # Creating LSTM Encoder
        lstm_layer = Bidirectional(LSTM(self.number_lstm_units, dropout=self.rate_drop_lstm,
                                        recurrent_dropout=self.rate_drop_lstm))

        # Creating LSTM Encoder layer for First Sentence
        sequence_1_input = Input(shape=(self.max_sequence_length,), dtype='int32')
        embedded_sequences_1 = embedding_layer(sequence_1_input)
        x1 = lstm_layer(embedded_sequences_1)

        # Creating LSTM Encoder layer for Second Sentence
        sequence_2_input = Input(shape=(self.max_sequence_length,), dtype='int32')
        embedded_sequences_2 = embedding_layer(sequence_2_input)
        x2 = lstm_layer(embedded_sequences_2)

        # Creating leaks input
        leaks_input = Input(shape=(leaks_train.shape[1],))
        leaks_dense = Dense(int(self.number_dense_units / 2), activation=self.activation_function)(leaks_input)

        # Merging the two LSTM-encoded vectors and the leaks features, then passing them
        # through dense layers with dropout and batch normalisation
        merged = concatenate([x1, x2, leaks_dense])
        merged = BatchNormalization()(merged)
        merged = Dropout(self.rate_drop_dense)(merged)
        merged = Dense(self.number_dense_units, activation=self.activation_function)(merged)
        merged = BatchNormalization()(merged)
        merged = Dropout(self.rate_drop_dense)(merged)
        preds = Dense(1, activation='sigmoid')(merged)

        model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input], outputs=preds)
        model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['accuracy'])

        STAMP = 'lstm_%d_%d_%.2f_%.2f' % (self.number_lstm_units, self.number_dense_units,
                                          self.rate_drop_lstm, self.rate_drop_dense)

        checkpoint_dir = model_save_directory + 'checkpoints/' + str(int(time.time())) + '/'
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        bst_model_path = checkpoint_dir + STAMP + '.h5'

        model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=False)
        tensorboard = TensorBoard(log_dir=checkpoint_dir + "logs/{}".format(time.time()))

        model.fit([train_data_x1, train_data_x2, leaks_train], train_labels,
                  validation_data=([val_data_x1, val_data_x2, leaks_val], val_labels),
                  epochs=100, batch_size=64, shuffle=True,
                  callbacks=[model_checkpoint, tensorboard])

        return bst_model_path

    def train_entailment(self, sentences_pair, is_similar, embedding_meta_data, model_save_directory='./'):
        """
        Train the Siamese network as a three-class entailment classifier on the pairs
        in `sentences_pair`.

        Steps involved:
            1. Pass each sentence of a pair through the shared bidirectional LSTM encoder.
            2. Concatenate the two encoded vectors (plus the leaks features) and pass them through
               dense layers with dropout and batch normalisation.
            3. Feed the dense output to a three-way softmax output layer.
            4. Train the weights with categorical cross-entropy loss.

        Args:
            sentences_pair (list): list of tuples of sentence pairs
            is_similar (list): target entailment labels for each sentence pair
            embedding_meta_data (dict): dict containing the tokenizer and the word embedding matrix
            model_save_directory (str): working directory in which to save models

        Returns:
            (best_model_path, history): path of the best saved model and the Keras training history
        """
        tokenizer, embedding_matrix = embedding_meta_data['tokenizer'], embedding_meta_data['embedding_matrix']

        train_data_x1, train_data_x2, train_labels, leaks_train, \
            val_data_x1, val_data_x2, val_labels, leaks_val = create_train_dev_set(tokenizer, sentences_pair,
                                                                                   is_similar, self.max_sequence_length,
                                                                                   self.validation_split_ratio)

        if train_data_x1 is None:
            print("++++ !! Failure: Unable to train model ++++")
            return None

        nb_words = len(tokenizer.word_index) + 1

        # Creating word embedding layer
        embedding_layer = Embedding(nb_words, self.embedding_dim, weights=[embedding_matrix],
                                    input_length=self.max_sequence_length, trainable=False)

        # Creating LSTM Encoder
        lstm_layer = Bidirectional(LSTM(self.number_lstm_units, dropout=self.rate_drop_lstm,
                                        recurrent_dropout=self.rate_drop_lstm))

        # Creating LSTM Encoder layer for First Sentence
        sequence_1_input = Input(shape=(self.max_sequence_length,), dtype='int32')
        embedded_sequences_1 = embedding_layer(sequence_1_input)
        x1 = lstm_layer(embedded_sequences_1)

        # Creating LSTM Encoder layer for Second Sentence
        sequence_2_input = Input(shape=(self.max_sequence_length,), dtype='int32')
        embedded_sequences_2 = embedding_layer(sequence_2_input)
        x2 = lstm_layer(embedded_sequences_2)

        # Creating leaks input
        leaks_input = Input(shape=(leaks_train.shape[1],))
        leaks_dense = Dense(int(self.number_dense_units / 2), activation=self.activation_function)(leaks_input)

        # Merging the two LSTM-encoded vectors and the leaks features, then passing them
        # through dense layers with dropout and batch normalisation
        merged = concatenate([x1, x2, leaks_dense])
        merged = BatchNormalization()(merged)
        merged = Dropout(self.rate_drop_dense)(merged)
        merged = Dense(self.number_dense_units, activation=self.activation_function)(merged)
        merged = BatchNormalization()(merged)
        merged = Dropout(self.rate_drop_dense)(merged)
        merged = Dense(25, activation='relu')(merged)
        preds = Dense(3, activation='softmax')(merged)

        model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input], outputs=preds)
        # Note: categorical_crossentropy expects one-hot encoded (3-class) target labels
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        STAMP = 'lstm_%d_%d_%.2f_%.2f' % (self.number_lstm_units, self.number_dense_units,
                                          self.rate_drop_lstm, self.rate_drop_dense)

        checkpoint_dir = model_save_directory + 'checkpoints/' + str(int(time.time())) + '/'
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        bst_model_path = checkpoint_dir + STAMP + '.h5'

        model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=False)
        tensorboard = TensorBoard(log_dir=checkpoint_dir + "logs/{}".format(time.time()))

        history = model.fit([train_data_x1, train_data_x2, leaks_train], train_labels,
                            validation_data=([val_data_x1, val_data_x2, leaks_val], val_labels),
                            epochs=100, batch_size=64, shuffle=True,
                            callbacks=[model_checkpoint, tensorboard])

        return (bst_model_path, history)

    def update_model(self, saved_model_path, new_sentences_pair, is_similar, embedding_meta_data):
        """
        Fine-tune an already trained Siamese model on new sentence pairs.

        Steps involved:
            1. Load the saved model and build a train/dev split from `new_sentences_pair`.
            2. Continue training the loaded weights on the new data, checkpointing the best model.

        Args:
            saved_model_path (str): path of the already trained Siamese model
            new_sentences_pair (list): list of tuples of new sentence pairs
            is_similar (list): target values; 1 if the sentences in a pair are similar, otherwise 0
            embedding_meta_data (dict): dict containing the tokenizer and the word embedding matrix

        Returns:
            new_model_path (str): path of the updated model
        """
        tokenizer = embedding_meta_data['tokenizer']

        train_data_x1, train_data_x2, train_labels, leaks_train, \
            val_data_x1, val_data_x2, val_labels, leaks_val = create_train_dev_set(tokenizer, new_sentences_pair,
                                                                                   is_similar, self.max_sequence_length,
                                                                                   self.validation_split_ratio)

        model = load_model(saved_model_path)

        model_file_name = saved_model_path.split('/')[-1]
        new_model_checkpoint_path = '/'.join(saved_model_path.split('/')[:-2]) + '/' + str(int(time.time())) + '/'
        if not os.path.exists(new_model_checkpoint_path):
            os.makedirs(new_model_checkpoint_path)
        new_model_path = new_model_checkpoint_path + model_file_name

        model_checkpoint = ModelCheckpoint(new_model_checkpoint_path + model_file_name,
                                           save_best_only=True, save_weights_only=False)
        # early_stopping = EarlyStopping(monitor='val_loss', patience=3)
        tensorboard = TensorBoard(log_dir=new_model_checkpoint_path + "logs/{}".format(time.time()))

        model.fit([train_data_x1, train_data_x2, leaks_train], train_labels,
                  validation_data=([val_data_x1, val_data_x2, leaks_val], val_labels),
                  epochs=50, batch_size=3, shuffle=True,
                  callbacks=[model_checkpoint, tensorboard])

        return new_model_path
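

# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the original module): a minimal
# example of how SiameseBiLSTM.train_model might be driven. The toy sentence
# pairs, labels, and hyperparameter values below are assumptions, and the
# randomly initialised embedding matrix stands in for pretrained word vectors.
# embedding_meta_data only needs the 'tokenizer' and 'embedding_matrix' keys
# that train_model reads.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import numpy as np
    from keras.preprocessing.text import Tokenizer

    # Toy sentence pairs and binary similarity labels (assumed example data)
    sentences_pair = [('How old are you?', 'What is your age?'),
                      ('Where do you live?', 'What is the capital of France?')]
    is_similar = [1, 0]

    # Fit a tokenizer on every sentence appearing in the pairs
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([s for pair in sentences_pair for s in pair])

    # Random embedding matrix as a placeholder for pretrained embeddings
    embedding_dim = 50
    embedding_matrix = np.random.rand(len(tokenizer.word_index) + 1, embedding_dim)
    embedding_meta_data = {'tokenizer': tokenizer, 'embedding_matrix': embedding_matrix}

    # Assumed hyperparameters; tune them for a real dataset
    siamese = SiameseBiLSTM(embedding_dim=embedding_dim, max_sequence_length=10,
                            number_lstm=50, number_dense=50,
                            rate_drop_lstm=0.17, rate_drop_dense=0.25,
                            hidden_activation='relu', validation_split_ratio=0.1)

    best_model_path = siamese.train_model(sentences_pair, is_similar, embedding_meta_data,
                                          model_save_directory='./')
    print('Best model saved at:', best_model_path)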