@@ -7,12 +7,10 @@
import pandas as pd
from baseline import BaselineExperiment

- from models.models import SentimentGRU, SentimentCNN
- from transformer.data_processing import TransformerDataset
- from transformer.models import TransformerClassifier
+ from models import BiGRU, TextCNN, TransformerClassifier
from utils import init_weights
from settings import *
- from data_processing import Lang, CustomDataset
+ from data_processing import Lang, CustomDataset, TransformerDataset

from nltk.corpus import movie_reviews, subjectivity
from sklearn.model_selection import train_test_split
@@ -34,10 +32,12 @@ def __init__(self, model_name, task="polarity", sjv_classifier=None, sjv_vectorizer=None):
        self.sjv_classifier = sjv_classifier
        self.sjv_vectorizer = sjv_vectorizer

-        if model_name == "SentimentGRU":
-            self.model_config = SentimentGRU_config
-        if model_name == "SentimentCNN":
-            self.model_config = SentimentCNN_config
+        if model_name == "BiGRU":
+            self.model_config = BiGRU_config
+        if model_name == "BiGRUAttention":
+            self.model_config = BiGRUAttention_config
+        if model_name == "TextCNN":
+            self.model_config = TextCNN_config

    def prepare_data(self):
        if self.task == "polarity":
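Note: the `*_config` dicts arrive through the star import from `settings` and are not shown in this diff. Judging from the keys the new code reads (`batch_size` in `create_fold`, plus `epochs`, `lr`, and `clip_gradients` from the run config), a config of roughly this shape is assumed; the values and extra fields below are illustrative, not the repo's actual settings:

    # Hypothetical shape of BiGRU_config in settings.py, inferred from keys used in this diff.
    BiGRU_config = {
        "batch_size": 64,       # read in create_fold when building the DataLoaders
        "epochs": 10,           # read as wandb_run.config['epochs'] in training_loop
        "lr": 1e-3,             # read as run.config['lr'] for the Adam optimizer
        "clip_gradients": 5.0,  # forwarded to training_step as clip
        "embedding_dim": 300,   # illustrative model hyperparameters
        "hidden_dim": 128,
    }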
@@ -69,20 +69,10 @@ def prepare_data(self):
            self.data_Y += [1]*len(subj_sents)
            print("Total samples: ", len(self.data_raw))

-        elif (self.task == "polarity-no-obj-sents"
+        elif (self.task == "polarity-filter"
              and self.sjv_classifier is not None
              and self.sjv_vectorizer is not None
        ):
-            def removeObjectiveSents(docs_sents, mask):
-                i = 0
-                clean_docs = []
-                for doc in docs_sents:
-                    clean_docs.append([])
-                    for sent in doc:
-                        if mask[i] == 1:
-                            clean_docs[-1] += sent
-                        i += 1
-                return clean_docs

            # get docs divided in sentences
            negative_fileids = movie_reviews.fileids('neg')
@@ -95,7 +85,7 @@ def removeObjectiveSents(docs_sents, mask):
            # shallow subjectivity classifier is used to allow comparisons
            movie_sjv_vectors = self.sjv_vectorizer.transform(mr_sents)
            pred = self.sjv_classifier.predict(movie_sjv_vectors)
-            clean_mr = removeObjectiveSents(mr_docs_sents, pred)
+            clean_mr = Experiment.removeObjectiveSents(mr_docs_sents, pred)

            mr_neg = [{"document": doc, "label": 0} for doc in clean_mr[:1000]]
            mr_Y_neg = [0]*len(mr_neg)
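For context, the `sjv_classifier` / `sjv_vectorizer` pair is produced by the baseline experiment and only consumed here. A minimal scikit-learn pair with the same transform/predict interface, fit on the NLTK subjectivity corpus, might look like this (the CountVectorizer + MultinomialNB choice is an assumption for illustration, not necessarily the repo's baseline):

    from nltk.corpus import subjectivity
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.naive_bayes import MultinomialNB

    # The corpus ships 5000 subjective and 5000 objective sentences, pre-tokenized.
    subj = [" ".join(s) for s in subjectivity.sents(categories='subj')]
    obj = [" ".join(s) for s in subjectivity.sents(categories='obj')]
    sjv_vectorizer = CountVectorizer()
    X = sjv_vectorizer.fit_transform(subj + obj)
    sjv_classifier = MultinomialNB().fit(X, [1] * len(subj) + [0] * len(obj))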
@@ -111,6 +101,21 @@ def removeObjectiveSents(docs_sents, mask):
            print("Cannot prepare data. Wrong parameters.")
            exit()

+    @staticmethod
+    def removeObjectiveSents(docs_sents, mask):
+        i = 0
+        remaining_sents = 0
+        clean_docs = []
+        for doc in docs_sents:
+            clean_docs.append([])
+            for sent in doc:
+                if mask[i] == 1:
+                    clean_docs[-1] += sent
+                    remaining_sents += 1
+                i += 1
+        print(f"Kept {remaining_sents} of {i} original sentences.")
+        return clean_docs
+
    def create_fold(self):
        train, test, _, _ = train_test_split(self.data_raw, self.data_Y, test_size=TRAIN_TEST_SPLIT,
                                             random_state=RANDOM_SEED,
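The mask produced by the subjectivity classifier is flat (one entry per sentence across all documents), which is why the promoted static method keeps a single running index `i`. Note that `clean_docs[-1] += sent` extends the document with the sentence's tokens, so each cleaned document comes back as one flat token list. A toy run, assuming pre-tokenized sentences:

    docs = [[["great", "movie"], ["it", "was", "shot", "in", "1999"]],
            [["i", "loved", "it"]]]
    mask = [1, 0, 1]  # 1 = predicted subjective, keep; one entry per sentence overall
    print(Experiment.removeObjectiveSents(docs, mask))
    # Prints "Kept 2 of 3 original sentences." and then
    # [['great', 'movie'], ['i', 'loved', 'it']]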
@@ -122,46 +127,49 @@ def create_fold(self):
        train_dataset = CustomDataset(train, self.lang)
        test_dataset = CustomDataset(test, self.lang)

-        self.train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=train_dataset.collate_fn, shuffle=True)
-        self.test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=test_dataset.collate_fn, drop_last=True)
+        self.train_loader = DataLoader(train_dataset, batch_size=self.model_config["batch_size"], collate_fn=train_dataset.collate_fn, shuffle=True)
+        self.test_loader = DataLoader(test_dataset, batch_size=self.model_config["batch_size"], collate_fn=test_dataset.collate_fn)

    def run(self):
        self.prepare_data()
        models = []
        metrics_list = []
-        for i_fold in range(N_FOLDS):
+        for fold_idx in range(N_FOLDS):
            self.create_fold()

-            if self.model_name == "SentimentGRU":
+            if self.model_name == "BiGRU":
+                vocab_size = len(self.lang.word2id)
+                model = BiGRU(vocab_size, self.model_config)
+            elif self.model_name == "BiGRUAttention":
                vocab_size = len(self.lang.word2id)
-                model = SentimentGRU(vocab_size, self.model_config)
-            elif self.model_name == "SentimentCNN":
+                model = BiGRU(vocab_size, self.model_config)
+            elif self.model_name == "TextCNN":
                vocab_size = len(self.lang.word2id)
-                model = SentimentCNN(vocab_size, self.model_config)
+                model = TextCNN(vocab_size, self.model_config)
            elif self.model_name == "Transformer":
                model = TransformerClassifier(self.model_config)
            else:
                print("Model name does not exist")
                return
+
+            print(model)
            model.to(DEVICE)

            run = wandb.init(
                project="NLU_SA",
                entity="filippomomesso",
                group=f"{self.model_name}",
-                name=f"fold_{i_fold:02d}",
+                name=f"{self.task}_{self.model_name}_fold_{fold_idx:02d}",
                config={
-                    "model": self.model_name,
-                    "epochs": EPOCHS,
-                    "batch_size": BATCH_SIZE,
-                    "lr": LR,
+                    "task": self.task,
+                    **self.model_config,
                    "loss": "BCELoss",
                    "optimizer": "Adam"
                }
            )
-            # wandb.watch(model, "gradients", log_freq=5)
+            wandb.watch(model, "gradients", log_freq=5)
            self.optimizer = optim.Adam(model.parameters(), lr=run.config['lr'])
-            self.cost_fn = torch.nn.BCEWithLogitsLoss()  # Because we do not have the pad token
+            self.cost_fn = torch.nn.BCEWithLogitsLoss()

            best_model, metrics = self.training_loop(model, self.train_loader, self.test_loader, run)
            models.append(best_model)
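Two mechanics in this hunk are worth spelling out. First, `**self.model_config` folds every model hyperparameter into the wandb run config, which is what makes `run.config['lr']` work below. Second, `BCEWithLogitsLoss` expects raw logits because it fuses the sigmoid into the loss for numerical stability; the fusion is an exact equivalence, assuming one logit per document:

    import torch

    logits = torch.tensor([[0.3], [-1.7]])   # raw model outputs, shape (batch, 1)
    targets = torch.tensor([[1.0], [0.0]])   # binary polarity labels
    fused = torch.nn.BCEWithLogitsLoss()(logits, targets)
    manual = torch.nn.BCELoss()(torch.sigmoid(logits), targets)
    assert torch.allclose(fused, manual)     # same value; the fused op avoids sigmoid overflow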
@@ -171,15 +179,68 @@ def run(self):
        metrics_df = pd.DataFrame.from_dict(metrics_list)
        metrics_df.loc["mean"] = metrics_df[:N_FOLDS].mean()
        metrics_df.loc["std"] = metrics_df[:N_FOLDS].std()
-        metrics_df.loc["max"] = metrics_df[:N_FOLDS].max()
-        metrics_df.loc["min"] = metrics_df[:N_FOLDS].min()
        print(metrics_df)
-        metrics_df.to_csv(f"{self.model_name}_stats.csv")
+        metrics_df.to_csv(f"{STATS_SAVE_PATH}/{self.model_name}_{self.task}.csv")

        best_model_overall_idx = metrics_df["acc"].idxmax()
        return models[best_model_overall_idx]

-    def training_step(self, model, data_loader, optimizer, cost_function, clip=CLIP_GRADIENTS, epoch=0):
+    def training_loop(self, model, tr_dl, ts_dl, wandb_run, save=True):
+        print(f"Running: {wandb_run.name}")
+
+        # Check if model is pretrained to avoid initializing weights
+        if not wandb_run.config.get("pretrained"):
+            print("Model is not pretrained: initializing weights.")
+            model.apply(init_weights)
+
+        optimizer = self.optimizer
+        cost_fn = self.cost_fn
+
+        best_loss = 0.
+        best_acc = 0.
+
+        print("Start training")
+        for e in tqdm(range(wandb_run.config['epochs']), desc="Training Loop"):
+            train_metrics = self.training_step(model, tr_dl, optimizer, cost_fn, clip=wandb_run.config["clip_gradients"], epoch=e)
+            test_metrics = self.test_step(model, ts_dl, cost_fn, epoch=e)
+
+            metrics = {**train_metrics, **test_metrics}
+            wandb.log(metrics)
+
+            train_loss = train_metrics['train/train_loss']
+            train_acc = train_metrics['train/train_acc']
+
+            test_loss = test_metrics['test/test_loss']
+            test_acc = test_metrics['test/test_acc']
+            test_f1 = test_metrics['test/test_f1']
+
+            if best_acc < test_acc or e == 0:
+                best_acc = test_acc
+                best_loss = test_loss
+                best_f1 = test_f1
+                best_model = copy.deepcopy(model)
+                # Save new best weights
+                if save:
+                    self.save_weights(e, model, optimizer, test_loss, f"{WEIGHTS_SAVE_PATH}/{wandb_run.name}.pth")
+                    artifact = wandb.Artifact(f'{wandb_run.name}', type='model', metadata={**wandb_run.config, **metrics})
+                    artifact.add_file(f"{WEIGHTS_SAVE_PATH}/{wandb_run.name}.pth")
+                    wandb_run.log_artifact(artifact)
+
+            print('\nEpoch: {:d}'.format(e + 1))
+            print('\tTraining loss {:.5f}, Training accuracy {:.2f}'.format(train_loss, train_acc))
+            print('\tTest loss {:.5f}, Test accuracy {:.2f}, Test F1 {:.2f}'.format(test_loss, test_acc, test_f1))
+            print('-----------------------------------------------------')
+
+        # visualize(best_model, ts_dl, wandb_run)
+        print('\tBEST Test loss {:.5f}, Test accuracy {:.2f}, Test F1 {:.2f}'.format(best_loss, best_acc, best_f1))
+        wandb.summary["test_best_loss"] = best_loss
+        wandb.summary["test_best_accuracy"] = best_acc
+        wandb.summary["test_best_f1"] = best_f1
+        wandb.finish()
+        best_metrics = {"loss": best_loss, "acc": best_acc, "f1": best_f1}
+        return best_model, best_metrics
+
+    def training_step(self, model, data_loader, optimizer, cost_function, clip=0, epoch=0):
        n_samples = 0
        cumulative_loss = 0.
        cumulative_accuracy = 0.
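The rewritten training_loop checkpoints the best model through `self.save_weights` and attaches the file to a wandb Artifact. `save_weights` itself sits outside this diff; given that `load_weights` (further down) returns epoch, model, optimizer, and scheduler, a plausible minimal implementation would be the following sketch, not the file's actual body:

    def save_weights(self, epoch, model, optimizer, loss, path):
        # Sketch: persist the state that load_weights appears to restore.
        torch.save({
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "loss": loss,
        }, path)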
@@ -188,7 +249,7 @@ def training_step(self, model, data_loader, optimizer, cost_function, clip=CLIP_GRADIENTS, epoch=0):

        for batch_idx, (inputs, targets) in enumerate(tqdm(data_loader, desc="Training Step", leave=False)):
            for k in inputs.keys():
-                 inputs[k] = inputs[k].to(DEVICE)
+                inputs[k] = inputs[k].to(DEVICE)
            targets = targets.to(DEVICE)
            outputs = model(inputs)
            loss = cost_function(outputs, targets.unsqueeze(-1).float())
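The `clip` default drops from the `CLIP_GRADIENTS` global to 0, with the real value now flowing in from the run config. Gradient clipping is presumably applied between `backward()` and the optimizer step; a self-contained sketch of that pattern (the Linear stand-in and the values are illustrative):

    import torch

    model = torch.nn.Linear(8, 1)              # stand-in for the sentiment model
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    clip = 5.0                                 # e.g. wandb_run.config["clip_gradients"]

    loss = model(torch.randn(4, 8)).sum()
    loss.backward()
    if clip > 0:                               # clip=0 keeps the new default a no-op
        # rescale gradients in place so their global L2 norm is at most `clip`
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    optimizer.zero_grad()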
@@ -232,7 +293,7 @@ def test_step(self, model, data_loader, cost_function, epoch=0):
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(tqdm(data_loader, desc="Test Step", leave=False)):
                for k in inputs.keys():
-                     inputs[k] = inputs[k].to(DEVICE)
+                    inputs[k] = inputs[k].to(DEVICE)
                targets = targets.to(DEVICE)
                outputs = model(inputs)
                loss = cost_function(outputs, targets.unsqueeze(-1).float())
@@ -282,58 +343,6 @@ def load_weights(self, model, optimizer, weights_path, DEVICE, scheduler=None):

        return epoch, model, optimizer, scheduler

-    def training_loop(self, model, tr_dl, ts_dl, wandb_run, save=False):
-        print(wandb_run.name)
-        model.apply(init_weights)
-        experiment = wandb_run.name
-
-        optimizer = self.optimizer
-        cost_fn = self.cost_fn
-
-        best_loss = 0.
-        best_acc = 0.
-
-        print("Start training")
-        for e in tqdm(range(wandb_run.config['epochs']), desc="Training Loop"):
-            train_metrics = self.training_step(model, tr_dl, optimizer, cost_fn, epoch=e)
-            test_metrics = self.test_step(model, ts_dl, cost_fn, epoch=e)
-
-            metrics = {**train_metrics, **test_metrics}
-            wandb.log(metrics)
-
-            train_loss = train_metrics['train/train_loss']
-            train_acc = train_metrics['train/train_acc']
-
-            test_loss = test_metrics['test/test_loss']
-            test_acc = test_metrics['test/test_acc']
-            test_f1 = test_metrics['test/test_f1']
-
-            if best_acc < test_acc or e == 0:
-                best_acc = test_acc
-                best_loss = test_loss
-                best_f1 = test_f1
-                best_model = copy.deepcopy(model)
-                # Save new best weights
-                if save:
-                    self.save_weights(e, model, optimizer, test_loss, f"./weights/{wandb_run.group}_{wandb_run.name}")
-                    artifact = wandb.Artifact(f'ResNet18CAN_{experiment}', type='model', metadata={**wandb_run.config, **metrics})
-                    artifact.add_file(f"./weights/{wandb_run.group}_{wandb_run.name}")
-                    wandb_run.log_artifact(artifact)
-
-            print('\nEpoch: {:d}'.format(e + 1))
-            print('\tTraining loss {:.5f}, Training accuracy {:.2f}'.format(train_loss, train_acc))
-            print('\tTest loss {:.5f}, Test accuracy {:.2f}, Test F1 {:.2f}'.format(test_loss, test_acc, test_f1))
-            print('-----------------------------------------------------')
-
-        # visualize(best_model, ts_dl, wandb_run)
-        print('\tBEST Test loss {:.5f}, Test accuracy {:.2f}, Test F1 {:.2f}'.format(best_loss, best_acc, best_f1))
-        wandb.summary["test_best_loss"] = best_loss
-        wandb.summary["test_best_accuracy"] = best_acc
-        wandb.summary["test_best_f1"] = best_f1
-        wandb.finish()
-        best_metrics = {"loss": best_loss, "acc": best_acc, "f1": best_f1}
-        return best_model, best_metrics
-

class TransformerExperiment(Experiment):
    def __init__(self, model_name, task="polarity", sjv_classifier=None, sjv_vectorizer=None):
@@ -343,15 +352,14 @@ def __init__(self, model_name, task="polarity", sjv_classifier=None, sjv_vectorizer=None):

    def create_fold(self):
        train, test, train_y, test_y = train_test_split(self.data_raw, self.data_Y, test_size=TRAIN_TEST_SPLIT,
-                                                         random_state=RANDOM_SEED,
-                                                         shuffle=True,
-                                                         stratify=self.data_Y)
-
-        train_dataset = TransformerDataset(train, train_y)
-        test_dataset = TransformerDataset(test, test_y)
+                                                        random_state=RANDOM_SEED,
+                                                        shuffle=True,
+                                                        stratify=self.data_Y)
+        train_dataset = TransformerDataset(train, train_y, self.model_config, self.task)
+        test_dataset = TransformerDataset(test, test_y, self.model_config, self.task)

-        self.train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
-        self.test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
+        self.train_loader = DataLoader(train_dataset, batch_size=self.model_config["batch_size"], shuffle=True)
+        self.test_loader = DataLoader(test_dataset, batch_size=self.model_config["batch_size"])

    def prepare_data(self):
-         BaselineExperiment.prepare_data(self)
+        BaselineExperiment.prepare_data(self)
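TransformerDataset now receives the model config and the task, presumably so tokenization can follow the chosen checkpoint and any task-specific preprocessing. Its definition lives in data_processing, outside this diff; a hypothetical sketch of the interface, assuming a Hugging Face tokenizer, a "checkpoint" key in the config, and documents arriving as plain strings (all three are assumptions):

    import torch
    from torch.utils.data import Dataset
    from transformers import AutoTokenizer

    class TransformerDataset(Dataset):  # hypothetical sketch, not the repo's actual class
        def __init__(self, docs, labels, model_config, task):
            tokenizer = AutoTokenizer.from_pretrained(model_config["checkpoint"])  # assumed key
            self.encodings = tokenizer(list(docs), truncation=True, padding="max_length",
                                       max_length=model_config.get("max_len", 256),
                                       return_tensors="pt")
            self.labels = torch.tensor(labels)

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, idx):
            return {k: v[idx] for k, v in self.encodings.items()}, self.labels[idx]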