
Commit e523840

committed Aug 21, 2022
♻️ Mega refactor
1 parent 88d4fe6 · commit e523840

15 files changed: +385 −197 lines changed
 

baseline.py

+32 −31

@@ -13,6 +13,7 @@
 import pandas as pd
 import itertools

+
 class BaselineExperiment:

     def __init__(self, task="polarity", sjv_classifier=None, sjv_vectorizer=None):
@@ -22,6 +23,22 @@ def __init__(self, task="polarity", sjv_classifier=None, sjv_vectorizer=None):
         self.sjv_classifier = sjv_classifier
         self.sjv_vectorizer = sjv_vectorizer

+    @staticmethod
+    def removeObjectiveSents(docs_sents, mask):
+        i = 0
+        remaining_sents = 0
+        clean_docs = []
+        for doc in docs_sents:
+            clean_docs.append([])
+            for sent in doc:
+                if mask[i] == 1:
+                    clean_docs[-1] += sent
+                    remaining_sents += 1
+                i += 1
+        clean_docs = [" ".join(sents) for sents in clean_docs]
+        print(f"Remaining {remaining_sents} sentences from original {i} sentences count.")
+        return clean_docs
+
     def prepare_data(self):
         print("Loading data")
         if self.task == "polarity":
@@ -33,33 +50,19 @@ def prepare_data(self):
             self.data_Y = [0] * len(neg_raw) + [1] * len(pos_raw)

         elif self.task == "subjectivity":
-            obj_fileid = subjectivity.fileids()[0]   # plot.tok.gt9.5000
-            subj_fileid = subjectivity.fileids()[1]  # quote.tok.gt9.5000
+            obj_fileid = subjectivity.fileids()[0]   # plot.tok.gt9.5000
+            subj_fileid = subjectivity.fileids()[1]  # quote.tok.gt9.5000

             # this to avoid splitting words into lists
             obj_raw = subjectivity.raw(fileids=obj_fileid).split('\n')[:5000]
             subj_raw = subjectivity.raw(fileids=subj_fileid).split('\n')[:5000]
             self.data_raw = obj_raw + subj_raw
             self.data_Y = [0] * len(obj_raw) + [1] * len(subj_raw)
-        elif (self.task == "polarity-no-obj-sents"
-              and self.sjv_classifier is not None
-              and self.sjv_vectorizer is not None
-              ):
-            def removeObjectiveSents(docs_sents, mask):
-                i = 0
-                remaining_sents = 0
-                clean_docs = []
-                for doc in docs_sents:
-                    clean_docs.append([])
-                    for sent in doc:
-                        if mask[i] == 1:
-                            clean_docs[-1] += sent
-                            remaining_sents += 1
-                        i += 1
-                clean_docs = [" ".join(sents) for sents in clean_docs]
-                print(f"Remaining {remaining_sents} sentences from original {i} sentences count.")
-                return clean_docs
-
+        elif (self.task == "polarity-filter"
+              and self.sjv_classifier is not None
+              and self.sjv_vectorizer is not None
+              ):
+
             # get docs divided in sentences
             negative_fileids = movie_reviews.fileids('neg')
             positive_fileids = movie_reviews.fileids('pos')
@@ -70,8 +73,8 @@ def removeObjectiveSents(docs_sents, mask):

             mr_sjv_vectors = self.sjv_vectorizer.transform(mr_sents)
             pred = self.sjv_classifier.predict(mr_sjv_vectors)
-
-            self.data_raw = removeObjectiveSents(mr_corpus, pred)
+
+            self.data_raw = BaselineExperiment.removeObjectiveSents(mr_corpus, pred)
             self.data_Y = [0] * len(negative_fileids) + [1] * len(positive_fileids)
         else:
             print("Cannot prepare data. Wrong parameters.")
@@ -81,15 +84,13 @@ def run(self):
         print(f"Running experiment {self.task} classification.")
         self.prepare_data()
         vectorizer = CountVectorizer()
-        classifier = MultinomialNB()
+        classifier = MultinomialNB()
         vectors = vectorizer.fit_transform(self.data_raw)
-        scores = cross_validate(classifier, vectors, self.data_Y, cv=StratifiedKFold(n_splits=N_FOLDS_BASELINE) , scoring=['accuracy', 'f1'], return_estimator=True)
+        scores = cross_validate(classifier, vectors, self.data_Y, cv=StratifiedKFold(n_splits=N_FOLDS_BASELINE), scoring=['accuracy', 'f1'], return_estimator=True)
         best_model = scores["estimator"][np.argmax(scores["test_accuracy"])]

         metrics_df = pd.DataFrame.from_dict(scores)
         metrics_df.drop("estimator", axis='columns', inplace=True)
-        metrics_df.loc["max"] = metrics_df[:N_FOLDS_BASELINE].max()
-        metrics_df.loc["min"] = metrics_df[:N_FOLDS_BASELINE].min()
         metrics_df.loc["mean"] = metrics_df[:N_FOLDS_BASELINE].mean()
         metrics_df.loc["std"] = metrics_df[:N_FOLDS_BASELINE].std()
         print(metrics_df)
@@ -101,12 +102,12 @@ def run(self):
 if __name__ == "__main__":
     # Run polarity on whole movie review dataset
     exp_polarity = BaselineExperiment(task="polarity")
-    exp_polarity.run()
+    exp_polarity.run()

     # Run subjectivity
     exp_subjectivity = BaselineExperiment(task="subjectivity")
-    sjv_classifier, sjv_vectorizer = exp_subjectivity.run()
+    sjv_classifier, sjv_vectorizer = exp_subjectivity.run()

     # Run polarity on movie review dataset removing objective sentences
-    exp = BaselineExperiment(task="polarity-no-obj-sents", sjv_classifier=sjv_classifier, sjv_vectorizer=sjv_vectorizer)
-    exp.run()
+    exp = BaselineExperiment(task="polarity-filter", sjv_classifier=sjv_classifier, sjv_vectorizer=sjv_vectorizer)
+    exp.run()
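A minimal usage sketch of the helper promoted to a static method above (not part of the commit; it assumes baseline.py is importable and uses toy documents, where each document is a list of tokenized sentences and mask flags the subjective ones):

from baseline import BaselineExperiment

# Toy corpus: two documents, three sentences in total (tokenized).
docs = [
    [["the", "plot", "is", "summarized"], ["i", "loved", "it"]],
    [["a", "boring", "mess"]],
]
mask = [0, 1, 1]  # 0 = objective sentence (dropped), 1 = subjective (kept)

# No instance is needed any more, since the helper is now a @staticmethod.
clean = BaselineExperiment.removeObjectiveSents(docs, mask)
# Prints "Remaining 2 sentences from original 3 sentences count."
print(clean)  # ['i loved it', 'a boring mess']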

data_processing.py

+26

@@ -2,6 +2,7 @@
 from collections import Counter
 from torch.utils.data import Dataset
 import torch
+from transformers import AutoTokenizer


 class Lang():
@@ -81,3 +82,28 @@ def merge(sequences):
         label = torch.LongTensor(new_item["label"])
         text_lens = torch.LongTensor(lenghts)
         return ({"document": src_docs, "text_lens": text_lens}, label)
+
+
+class TransformerDataset(Dataset):
+
+    def __init__(self, documents, labels, config, task):
+        self.tokenizer = AutoTokenizer.from_pretrained(config["pretrained_model"])
+        self.documents = documents
+        self.labels = labels
+
+        self.docs_tensor = self.tokenizer(self.documents,
+                                          padding='max_length',
+                                          max_length=config["sequence_max_len"][task],
+                                          truncation=True,
+                                          return_tensors="pt")
+
+    def __len__(self):
+        return len(self.documents)
+
+    def __getitem__(self, idx):
+        label = torch.tensor(self.labels[idx])
+        sample = {'input_ids': self.docs_tensor["input_ids"][idx],
+                  'attention_mask': self.docs_tensor["attention_mask"][idx]}
+        if "token_type_ids" in self.docs_tensor.keys():
+            sample["token_type_ids"] = self.docs_tensor["token_type_ids"][idx]
+        return sample, label
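A hypothetical smoke test for the new TransformerDataset (toy documents and labels; the config dict mirrors the keys of Transformer_config in settings.py and assumes the distilbert-base-uncased checkpoint can be downloaded):

from torch.utils.data import DataLoader
from data_processing import TransformerDataset

config = {
    "pretrained_model": "distilbert-base-uncased",
    "sequence_max_len": {"polarity": 512, "subjectivity": 128, "polarity-filter": 512},
}
docs = ["a gripping , well acted thriller .", "the plot follows a young detective ."]
labels = [1, 0]

dataset = TransformerDataset(docs, labels, config, task="subjectivity")
sample, label = dataset[0]
print(sample["input_ids"].shape)  # torch.Size([128]): padded to the per-task max length
loader = DataLoader(dataset, batch_size=2, shuffle=True)  # ready for the training loop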

dataset_stats.py

+75

@@ -0,0 +1,75 @@
+import nltk
+from nltk.corpus import movie_reviews, subjectivity, stopwords
+from experiment import Experiment
+from settings import STATS_SAVE_PATH
+from baseline import BaselineExperiment
+
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+import string
+
+def compute_stats(data, name):
+    stats = {}
+    seq_lens = [len(sents) for sents in data]
+    stats["num_sequences"] = len(data)
+    stats["num_words"] = sum([len(sent) for sents in data for sent in sents])
+    stats["avg_seq_len"] = np.average(seq_lens)
+    stats["max_seq_len"] = np.max(seq_lens)
+    stats["min_seq_len"] = np.min(seq_lens)
+
+    NLTK_STOP_WORDS = set(stopwords.words('english')+list(string.punctuation))
+    lexicon = set([w for doc in data for w in doc])
+
+    filtered_mr_words = [word for word in lexicon if not word in NLTK_STOP_WORDS]
+    lexicon_filtered = set(filtered_mr_words)
+
+    stats["lexicon_size"] = len(lexicon)
+    stats["lexicon_size_no_stopwords"] = len(lexicon_filtered)
+    return stats
+
+
+if __name__ == "__main__":
+    stats = {}
+
+    # Movie review dataset
+    negative_fileids = movie_reviews.fileids('neg')
+    positive_fileids = movie_reviews.fileids('pos')
+
+    # each is a list of documents
+    mr_neg_words = [movie_reviews.words(fileids=fileid) for fileid in negative_fileids]
+    mr_pos_words = [movie_reviews.words(fileids=fileid) for fileid in positive_fileids]
+    mr_neg_sents = [movie_reviews.sents(fileids=fileid) for fileid in negative_fileids]
+    mr_pos_sents = [movie_reviews.sents(fileids=fileid) for fileid in positive_fileids]
+
+    mr_sents = mr_neg_sents + mr_pos_sents
+    mr_words = mr_neg_words + mr_pos_words
+
+    stats["MR"] = compute_stats(mr_words, "MR")
+
+    # Treating MR as subjectivity dataset (list of sentences)
+    mr_sjv = [sent for doc in mr_sents for sent in doc]
+    stats["MR_sjv"] = compute_stats(mr_sjv, "MR_SJV")
+
+    # Subjectivity dataset
+    obj_fileid = subjectivity.fileids()[0]   # plot.tok.gt9.5000
+    subj_fileid = subjectivity.fileids()[1]  # quote.tok.gt9.5000
+    obj_words = subjectivity.sents(fileids=obj_fileid)
+    subj_words = subjectivity.sents(fileids=subj_fileid)
+    sjv_words = obj_words + subj_words
+    stats["SJV"] = compute_stats(sjv_words, "SJV")
+
+    # Clean MR
+    # Train baseline subjectivity classifier
+    exp_subjectivity = BaselineExperiment(task="subjectivity")
+    sjv_classifier, sjv_vectorizer = exp_subjectivity.run()
+    mr_vectors = sjv_vectorizer.transform([" ".join(sent) for sent in mr_sjv])
+    preds = sjv_classifier.predict(mr_vectors)
+
+    # Remove objective sentences
+    mr_sents_filtered = Experiment.removeObjectiveSents(mr_sents, preds)
+    stats["MR_clean_baseline"] = compute_stats(mr_sents_filtered, "MR_clean_baseline")
+
+    stats_df = pd.DataFrame.from_dict(stats, orient="index")
+    stats_df.to_csv(f"{STATS_SAVE_PATH}/datasets.csv")
+    print(stats_df)
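A minimal check of compute_stats on made-up token lists (illustrative only; it assumes the repo modules import cleanly and that the NLTK stopwords corpus has been downloaded):

from dataset_stats import compute_stats

toy_corpus = [["a", "fine", "film", "."], ["boring", "and", "predictable", "."]]
stats = compute_stats(toy_corpus, "toy")
print(stats["num_sequences"])  # 2 sequences
print(stats["avg_seq_len"])    # 4.0 tokens per sequence
print(stats["lexicon_size"])   # 7 distinct tokens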

experiment.py

+109 −101

@@ -7,12 +7,10 @@
 import pandas as pd
 from baseline import BaselineExperiment

-from models.models import SentimentGRU, SentimentCNN
-from transformer.data_processing import TransformerDataset
-from transformer.models import TransformerClassifier
+from models import BiGRU, TextCNN, TransformerClassifier
 from utils import init_weights
 from settings import *
-from data_processing import Lang, CustomDataset
+from data_processing import Lang, CustomDataset, TransformerDataset

 from nltk.corpus import movie_reviews, subjectivity
 from sklearn.model_selection import train_test_split
@@ -34,10 +32,12 @@ def __init__(self, model_name, task="polarity", sjv_classifier=None, sjv_vectorizer=None):
         self.sjv_classifier = sjv_classifier
         self.sjv_vectorizer = sjv_vectorizer

-        if model_name == "SentimentGRU":
-            self.model_config = SentimentGRU_config
-        if model_name == "SentimentCNN":
-            self.model_config = SentimentCNN_config
+        if model_name == "BiGRU":
+            self.model_config = BiGRU_config
+        if model_name == "BiGRUAttention":
+            self.model_config = BiGRUAttention_config
+        if model_name == "TextCNN":
+            self.model_config = TextCNN_config

     def prepare_data(self):
         if self.task == "polarity":
@@ -69,20 +69,10 @@ def prepare_data(self):
             self.data_Y += [1]*len(subj_sents)
             print("Total samples: ", len(self.data_raw))

-        elif (self.task == "polarity-no-obj-sents"
+        elif (self.task == "polarity-filter"
              and self.sjv_classifier is not None
              and self.sjv_vectorizer is not None
              ):
-            def removeObjectiveSents(docs_sents, mask):
-                i = 0
-                clean_docs = []
-                for doc in docs_sents:
-                    clean_docs.append([])
-                    for sent in doc:
-                        if mask[i] == 1:
-                            clean_docs[-1] += sent
-                        i += 1
-                return clean_docs

             # get docs divided in sentences
             negative_fileids = movie_reviews.fileids('neg')
@@ -95,7 +85,7 @@ def removeObjectiveSents(docs_sents, mask):
             # shallow subjectivity classifier is used to allow comparisons
             movie_sjv_vectors = self.sjv_vectorizer.transform(mr_sents)
             pred = self.sjv_classifier.predict(movie_sjv_vectors)
-            clean_mr = removeObjectiveSents(mr_docs_sents, pred)
+            clean_mr = Experiment.removeObjectiveSents(mr_docs_sents, pred)

             mr_neg = [{"document": doc, "label": 0} for doc in clean_mr[:1000]]
             mr_Y_neg = [0]*len(mr_neg)
@@ -111,6 +101,21 @@ def removeObjectiveSents(docs_sents, mask):
             print("Cannot prepare data. Wrong parameters.")
             exit()

+    @staticmethod
+    def removeObjectiveSents(docs_sents, mask):
+        i = 0
+        remaining_sents = 0
+        clean_docs = []
+        for doc in docs_sents:
+            clean_docs.append([])
+            for sent in doc:
+                if mask[i] == 1:
+                    clean_docs[-1] += sent
+                    remaining_sents += 1
+                i += 1
+        print(f"Remaining {remaining_sents} sentences from original {i} sentences count.")
+        return clean_docs
+
     def create_fold(self):
         train, test, _, _ = train_test_split(self.data_raw, self.data_Y, test_size=TRAIN_TEST_SPLIT,
                                              random_state=RANDOM_SEED,
@@ -122,46 +127,49 @@ def create_fold(self):
         train_dataset = CustomDataset(train, self.lang)
         test_dataset = CustomDataset(test, self.lang)

-        self.train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=train_dataset.collate_fn, shuffle=True)
-        self.test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=test_dataset.collate_fn, drop_last=True)
+        self.train_loader = DataLoader(train_dataset, batch_size=self.model_config["batch_size"], collate_fn=train_dataset.collate_fn, shuffle=True)
+        self.test_loader = DataLoader(test_dataset, batch_size=self.model_config["batch_size"], collate_fn=test_dataset.collate_fn)

     def run(self):
         self.prepare_data()
         models = []
         metrics_list = []
-        for i_fold in range(N_FOLDS):
+        for fold_idx in range(N_FOLDS):
             self.create_fold()

-            if self.model_name == "SentimentGRU":
+            if self.model_name == "BiGRU":
+                vocab_size = len(self.lang.word2id)
+                model = BiGRU(vocab_size, self.model_config)
+            if self.model_name == "BiGRUAttention":
                 vocab_size = len(self.lang.word2id)
-                model = SentimentGRU(vocab_size, self.model_config)
-            elif self.model_name == "SentimentCNN":
+                model = BiGRU(vocab_size, self.model_config)
+            elif self.model_name == "TextCNN":
                 vocab_size = len(self.lang.word2id)
-                model = SentimentCNN(vocab_size, self.model_config)
+                model = TextCNN(vocab_size, self.model_config)
             elif self.model_name == "Transformer":
                 model = TransformerClassifier(self.model_config)
             else:
                 print("Model name does not exist")
                 return
+
+            print(model)
             model.to(DEVICE)

             run = wandb.init(
                 project="NLU_SA",
                 entity="filippomomesso",
                 group=f"{self.model_name}",
-                name=f"fold_{i_fold:02d}",
+                name=f"{self.task}_{self.model_name}_fold_{fold_idx:02d}",
                 config={
-                    "model": self.model_name,
-                    "epochs": EPOCHS,
-                    "batch_size": BATCH_SIZE,
-                    "lr": LR,
+                    "task": self.task,
+                    **self.model_config,
                     "loss": "BCELoss",
                     "optimizer": "Adam"
                 }
             )
-            #wandb.watch(model, "gradients", log_freq=5)
+            wandb.watch(model, "gradients", log_freq=5)
             self.optimizer = optim.Adam(model.parameters(), lr=run.config['lr'])
-            self.cost_fn = torch.nn.BCEWithLogitsLoss() # Because we do not have the pad token
+            self.cost_fn = torch.nn.BCEWithLogitsLoss()

             best_model, metrics = self.training_loop(model, self.train_loader, self.test_loader, run)
             models.append(best_model)
@@ -171,15 +179,68 @@ def run(self):
         metrics_df = pd.DataFrame.from_dict(metrics_list)
         metrics_df.loc["mean"] = metrics_df[:N_FOLDS].mean()
         metrics_df.loc["std"] = metrics_df[:N_FOLDS].std()
-        metrics_df.loc["max"] = metrics_df[:N_FOLDS].max()
-        metrics_df.loc["min"] = metrics_df[:N_FOLDS].min()
         print(metrics_df)
-        metrics_df.to_csv(f"{self.model_name}_stats.csv")
+        metrics_df.to_csv(f"{STATS_SAVE_PATH}/{self.model_name}_{self.task}.csv")

         best_model_overall_idx = metrics_df["acc"].idxmax()
         return models[best_model_overall_idx]

-    def training_step(self, model, data_loader, optimizer, cost_function, clip=CLIP_GRADIENTS, epoch=0):
+    def training_loop(self, model, tr_dl, ts_dl, wandb_run, save=True):
+        print(f"Runnig: {wandb_run.name}")
+
+        # Check if model is pretrained to avoid initializing weights
+        if not wandb_run.config.get("pretrained"):
+            print("Model is not pretrained: initializing weigths.")
+            model.apply(init_weights)
+
+        optimizer = self.optimizer
+        cost_fn = self.cost_fn
+
+        best_loss = 0.
+        best_acc = 0.
+
+        print("Start training")
+        for e in tqdm(range(wandb_run.config['epochs']), desc="Training Loop"):
+            train_metrics = self.training_step(model, tr_dl, optimizer, cost_fn, clip=wandb_run.config["clip_gradients"], epoch=e)
+            test_metrics = self.test_step(model, ts_dl, cost_fn, epoch=e)
+
+            metrics = {**train_metrics, **test_metrics}
+            wandb.log(metrics)
+
+            train_loss = train_metrics['train/train_loss']
+            train_acc = train_metrics['train/train_acc']
+
+            test_loss = test_metrics['test/test_loss']
+            test_acc = test_metrics['test/test_acc']
+            test_f1 = test_metrics['test/test_f1']
+
+            if best_acc < test_acc or e == 0:
+                best_acc = test_acc
+                best_loss = test_loss
+                best_f1 = test_f1
+                best_model = copy.deepcopy(model)
+                # Save new best weights
+                if save:
+                    self.save_weights(e, model, optimizer, test_loss, f"{WEIGHTS_SAVE_PATH}/{wandb_run.name}.pth")
+                    artifact = wandb.Artifact(f'{wandb_run.name}', type='model', metadata={**wandb_run.config, **metrics})
+                    artifact.add_file(f"{WEIGHTS_SAVE_PATH}/{wandb_run.name}.pth")
+                    wandb_run.log_artifact(artifact)
+
+            print('\n Epoch: {:d}'.format(e + 1))
+            print('\t Training loss {:.5f}, Training accuracy {:.2f}'.format(train_loss, train_acc))
+            print('\t Test loss {:.5f}, Test accuracy {:.2f}, Test F1 {:.2f}'.format(test_loss, test_acc, test_f1))
+            print('-----------------------------------------------------')
+
+        #visualize(best_model, ts_dl, wandb_run)
+        print('\t BEST Test loss {:.5f}, Test accuracy {:.2f}, Test F1 {:.2f}'.format(best_loss, best_acc, best_f1))
+        wandb.summary["test_best_loss"] = best_loss
+        wandb.summary["test_best_accuracy"] = best_acc
+        wandb.summary["test_best_f1"] = best_f1
+        wandb.finish()
+        best_metrics = {"loss": best_loss, "acc": best_acc, "f1": best_f1}
+        return best_model, best_metrics
+
+    def training_step(self, model, data_loader, optimizer, cost_function, clip=0, epoch=0):
         n_samples = 0
         cumulative_loss = 0.
         cumulative_accuracy = 0.
@@ -188,7 +249,7 @@ def training_step(self, model, data_loader, optimizer, cost_function, clip=CLIP_GRADIENTS, epoch=0):

         for batch_idx, (inputs, targets) in enumerate(tqdm(data_loader, desc="Training Step", leave=False)):
             for k in inputs.keys():
-                inputs[k] = inputs[k].to(DEVICE)
+                inputs[k] = inputs[k].to(DEVICE)
             targets = targets.to(DEVICE)
             outputs = model(inputs)

@@ -232,7 +293,7 @@ def test_step(self, model, data_loader, cost_function, epoch=0):
         with torch.no_grad():
             for batch_idx, (inputs, targets) in enumerate(tqdm(data_loader, desc="Test Step", leave=False)):
                 for k in inputs.keys():
-                    inputs[k] = inputs[k].to(DEVICE)
+                    inputs[k] = inputs[k].to(DEVICE)
                 targets = targets.to(DEVICE)
                 outputs = model(inputs)
                 loss = cost_function(outputs, targets.unsqueeze(-1).float())
@@ -282,58 +343,6 @@ def load_weights(self, model, optimizer, weights_path, DEVICE, scheduler=None):

         return epoch, model, optimizer, scheduler

-    def training_loop(self, model, tr_dl, ts_dl, wandb_run, save=False):
-        print(wandb_run.name)
-        model.apply(init_weights)
-        experiment = wandb_run.name
-
-        optimizer = self.optimizer
-        cost_fn = self.cost_fn
-
-        best_loss = 0.
-        best_acc = 0.
-
-        print("Start training")
-        for e in tqdm(range(wandb_run.config['epochs']), desc="Training Loop"):
-            train_metrics = self.training_step(model, tr_dl, optimizer, cost_fn, epoch=e)
-            test_metrics = self.test_step(model, ts_dl, cost_fn, epoch=e)
-
-            metrics = {**train_metrics, **test_metrics}
-            wandb.log(metrics)
-
-            train_loss = train_metrics['train/train_loss']
-            train_acc = train_metrics['train/train_acc']
-
-            test_loss = test_metrics['test/test_loss']
-            test_acc = test_metrics['test/test_acc']
-            test_f1 = test_metrics['test/test_f1']
-
-            if best_acc < test_acc or e == 0:
-                best_acc = test_acc
-                best_loss = test_loss
-                best_f1 = test_f1
-                best_model = copy.deepcopy(model)
-                # Save new best weights
-                if save:
-                    self.save_weights(e, model, optimizer, test_loss, f"./weights/{wandb_run.group}_{wandb_run.name}")
-                    artifact = wandb.Artifact(f'ResNet18CAN_{experiment}', type='model', metadata={**wandb_run.config, **metrics})
-                    artifact.add_file(f"./weights/{wandb_run.group}_{wandb_run.name}")
-                    wandb_run.log_artifact(artifact)
-
-            print('\n Epoch: {:d}'.format(e + 1))
-            print('\t Training loss {:.5f}, Training accuracy {:.2f}'.format(train_loss, train_acc))
-            print('\t Test loss {:.5f}, Test accuracy {:.2f}, Test F1 {:.2f}'.format(test_loss, test_acc, test_f1))
-            print('-----------------------------------------------------')
-
-        #visualize(best_model, ts_dl, wandb_run)
-        print('\t BEST Test loss {:.5f}, Test accuracy {:.2f}, Test F1 {:.2f}'.format(best_loss, best_acc, best_f1))
-        wandb.summary["test_best_loss"] = best_loss
-        wandb.summary["test_best_accuracy"] = best_acc
-        wandb.summary["test_best_f1"] = best_f1
-        wandb.finish()
-        best_metrics = {"loss": best_loss, "acc": best_acc, "f1": best_f1}
-        return best_model, best_metrics
-

 class TransformerExperiment(Experiment):
     def __init__(self, model_name, task="polarity", sjv_classifier=None, sjv_vectorizer=None):
@@ -343,15 +352,14 @@ def __init__(self, model_name, task="polarity", sjv_classifier=None, sjv_vectorizer=None):

     def create_fold(self):
         train, test, train_y, test_y = train_test_split(self.data_raw, self.data_Y, test_size=TRAIN_TEST_SPLIT,
-                                                         random_state=RANDOM_SEED,
-                                                         shuffle=True,
-                                                         stratify=self.data_Y)
-
-        train_dataset = TransformerDataset(train, train_y)
-        test_dataset = TransformerDataset(test, test_y)
+                                                         random_state=RANDOM_SEED,
+                                                         shuffle=True,
+                                                         stratify=self.data_Y)
+        train_dataset = TransformerDataset(train, train_y, self.model_config, self.task)
+        test_dataset = TransformerDataset(test, test_y, self.model_config, self.task)

-        self.train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
-        self.test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
+        self.train_loader = DataLoader(train_dataset, batch_size=self.model_config["batch_size"], shuffle=True)
+        self.test_loader = DataLoader(test_dataset, batch_size=self.model_config["batch_size"])

     def prepare_data(self):
-        BaselineExperiment.prepare_data(self)
+        BaselineExperiment.prepare_data(self)
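For reference, the per-fold aggregation now used in Experiment.run() (mean and std rows appended under the fold rows, then idxmax on accuracy to pick the best fold) can be reproduced in isolation with toy numbers; this snippet is illustrative, not repository code:

import pandas as pd

N_FOLDS = 5
metrics_list = [{"loss": 0.42, "acc": 0.81, "f1": 0.80},
                {"loss": 0.39, "acc": 0.83, "f1": 0.82},
                {"loss": 0.45, "acc": 0.80, "f1": 0.79},
                {"loss": 0.41, "acc": 0.82, "f1": 0.81},
                {"loss": 0.40, "acc": 0.84, "f1": 0.83}]

metrics_df = pd.DataFrame(metrics_list)
metrics_df.loc["mean"] = metrics_df[:N_FOLDS].mean()  # summary rows, as in run()
metrics_df.loc["std"] = metrics_df[:N_FOLDS].std()
print(metrics_df)

best_fold = metrics_df["acc"].idxmax()  # 4 for these toy values: the fold whose model is kept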

main.py

+29 −2

@@ -3,11 +3,38 @@

 if __name__ == "__main__":
     filter = False
+    # if filter:
+    #     # Run subjectivity
+    #     exp_subjectivity = BaselineExperiment(task="subjectivity")
+    #     sjv_classifier, sjv_vectorizer = exp_subjectivity.run()
+    #     exp = TransformerExperiment("Transformer", "polarity-filter", sjv_classifier, sjv_vectorizer)
+    # else:
+    #     exp = TransformerExperiment("Transformer", "subjectivity")
+    # best_model = exp.run()
+
+    # if filter:
+    #     # Run subjectivity
+    #     exp_subjectivity = BaselineExperiment(task="subjectivity")
+    #     sjv_classifier, sjv_vectorizer = exp_subjectivity.run()
+    #     exp = Experiment("BiGRU", "polarity-filter", sjv_classifier, sjv_vectorizer)
+    # else:
+    #     exp = Experiment("BiGRU", "polarity")
+    # best_model = exp.run()
+
+    # if filter:
+    #     # Run subjectivity
+    #     exp_subjectivity = BaselineExperiment(task="subjectivity")
+    #     sjv_classifier, sjv_vectorizer = exp_subjectivity.run()
+    #     exp = Experiment("BiGRUAttention", "polarity-filter", sjv_classifier, sjv_vectorizer)
+    # else:
+    #     exp = Experiment("BiGRUAttention", "subjectivity")
+    # best_model = exp.run()
+
     if filter:
         # Run subjectivity
         exp_subjectivity = BaselineExperiment(task="subjectivity")
         sjv_classifier, sjv_vectorizer = exp_subjectivity.run()
-        exp = TransformerExperiment("Transformer", "polarity-no-obj-sents", sjv_classifier, sjv_vectorizer)
+        exp = Experiment("TextCNN", "polarity-filter", sjv_classifier, sjv_vectorizer)
     else:
-        exp = TransformerExperiment("Transformer", "polarity")
+        exp = Experiment("TextCNN", "subjectivity")
     best_model = exp.run()

models/models.py → models.py

+29 −6

@@ -2,9 +2,15 @@
 from torch import nn
 from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
 from settings import PAD_TOKEN
+from transformers import AutoTokenizer, AutoModelForSequenceClassification


 class SoftAttention(nn.Module):
+    '''
+    Multilayer perception to learn attention coefficients.
+    As described in https://arxiv.org/pdf/1409.0473.pdf, Bengio et al. ICLR 2015
+    '''
+
     def __init__(self, dim, hidden_dim, dropout_ratio=0.1):
         super(SoftAttention, self).__init__()
         self.attention = nn.Sequential(
@@ -18,13 +24,14 @@ def __init__(self, dim, hidden_dim, dropout_ratio=0.1):
     def forward(self, context_vector):
         return self.attention(context_vector)

-class SentimentGRU(nn.Module):
+
+class BiGRU(nn.Module):
     '''
-    Architecture based on the one seen during lab
+    Architecture based on the one seen during lab.
     '''

     def __init__(self, vocab_size, config, pad_index=0):
-        super(SentimentGRU, self).__init__()
+        super(BiGRU, self).__init__()
         self.emb_size = config["emb_size"]
         self.hidden_size = config["hidden_size"]
         self.out_size = config["out_size"]
@@ -37,6 +44,8 @@ def __init__(self, vocab_size, config, pad_index=0):
         if self.attention:
             self.att_hidden_size = config["att_hidden_size"]
             self.attention_module = SoftAttention(self.hidden_size*self.num_dir, self.att_hidden_size, dropout_ratio=self.dropout_ratio)
+        if self.num_layers == 1:
+            self.dropout_ratio = 0

         self.embedding = nn.Embedding(vocab_size, self.emb_size, padding_idx=pad_index)
         self.utt_encoder = nn.GRU(self.emb_size, self.hidden_size, self.num_layers, bidirectional=self.bidirectional, dropout=self.dropout_ratio)
@@ -63,7 +72,7 @@ def forward(self, inputs):
         # "A potential issue with this encoder–decoder approach is that a neural network
         # needs to be able to compress all the necessary information of a source sentence into a fixed-length vector.
         # This may make it difficult for the neural network to cope with long sentences,
-        # especially those that are longer than the sentences in the training corpus."
+        # especially those that are longer than the sentences in the training corpus."
         # https://arxiv.org/pdf/1409.0473.pdf, Bengio et al. ICLR 2015
         if not self.attention:
             hidden_view = hidden.view(self.num_layers, self.num_dir, batch_size, self.hidden_size)  # 2 for bidirectional
@@ -86,7 +95,7 @@ def forward(self, inputs):
         return out


-class SentimentCNN(nn.Module):
+class TextCNN(nn.Module):
     '''
     Architecture based on:
     Yoon Kim. 2014. Convolutional Neural Networks for Sentence Classification.
@@ -102,7 +111,7 @@ class SentimentCNN(nn.Module):
     '''

     def __init__(self, vocab_size, config):
-        super(SentimentCNN, self).__init__()
+        super(TextCNN, self).__init__()
         self.emb_size = config["emb_size"]
         self.num_filters = config["num_filters"]
         self.filter_sizes = config["filter_sizes"]
@@ -129,3 +138,17 @@ def forward(self, inputs):
         x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list], dim=1)
         logits = self.fc(self.dropout(x_fc))
         return logits
+
+
+class TransformerClassifier(nn.Module):
+
+    def __init__(self, config):
+        super(TransformerClassifier, self).__init__()
+        self.out_size = config["out_size"]
+        self.transformer = AutoModelForSequenceClassification.from_pretrained(
+            config["pretrained_model"],
+            num_labels=self.out_size,
+            ignore_mismatched_sizes=True)
+
+    def forward(self, input):
+        return self.transformer(**input, return_dict=True).logits
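A hypothetical forward-pass sketch for the new TransformerClassifier (toy input; assumes models.py is importable and the distilbert-base-uncased weights are available):

import torch
from transformers import AutoTokenizer
from models import TransformerClassifier

config = {"out_size": 1, "pretrained_model": "distilbert-base-uncased"}
model = TransformerClassifier(config)  # one output logit, matching BCEWithLogitsLoss

tokenizer = AutoTokenizer.from_pretrained(config["pretrained_model"])
batch = tokenizer(["a gripping , well acted thriller ."],
                  padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    logits = model(batch)  # forward() unpacks the dict into the underlying model
print(logits.shape)        # torch.Size([1, 1]): one logit per document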

settings.py

+56 −10

@@ -7,39 +7,85 @@
 # nltk.download("movie_reviews")
 # nltk.download("subjectivity")

-N_FOLDS = 1
-N_FOLDS_BASELINE = 5
 RANDOM_SEED = 42
-BATCH_SIZE = 512
-PAD_TOKEN = 0
+N_FOLDS = 5
+N_FOLDS_BASELINE = 5
 TRAIN_TEST_SPLIT = 0.2
-EPOCHS = 10
+
+PAD_TOKEN = 0
+
+EPOCHS = 50
+EPOCHS_PRETRAINED = 5
+
 LR = 0.001
+LR_PRETRAINED = 5e-5
+
+SEQUENCE_MAX_LENGTHS = {
+    "polarity": 512,
+    "subjectivity": 128,
+    "polarity-filter": 512
+}
+
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-SEQUENCE_MAX_LENGTH= 10
 PRETRAINED_MODEL_NAME = "distilbert-base-uncased"
 CLIP_GRADIENTS = 5

+WEIGHTS_SAVE_PATH = "./weights"
+STATS_SAVE_PATH = "./stats"
+
 # models config
-SentimentGRU_config = {
+BiGRUAttention_config = {
+    "model_name": "BiGRUAttention",
+    "epochs": EPOCHS,
+    "batch_size": 256,
+    "lr": LR,
     "emb_size": 300,
     "hidden_size": 128,
     "out_size": 1,
     "num_layers": 2,
     "dropout_ratio": 0.5,
     "bidirectional": True,
     "attention": True,
-    "att_hidden_size": 64
+    "att_hidden_size": 64,
+    "clip_gradients": CLIP_GRADIENTS
 }

-SentimentCNN_config = {
+BiGRU_config = {
+    "model_name": "BiGRU",
+    "epochs": EPOCHS,
+    "batch_size": 256,
+    "lr": LR,
+    "emb_size": 300,
+    "hidden_size": 128,
+    "out_size": 1,
+    "num_layers": 2,
+    "dropout_ratio": 0.5,
+    "bidirectional": True,
+    "attention": False,
+    "clip_gradients": CLIP_GRADIENTS
+}
+
+TextCNN_config = {
+    "model_name": "TextCNN",
+    "epochs": EPOCHS,
+    "batch_size": 256,
+    "lr": LR,
     "emb_size": 300,
     "out_size": 1,
     "filter_sizes": [3, 5, 7],
     "num_filters": [100, 100, 100],
     "dropout_ratio": 0.5,
+    "clip_gradients": 0
 }

 Transformer_config = {
+    "model_name": "Transfomer",
+    "pretrained_model": PRETRAINED_MODEL_NAME,
+    "epochs": EPOCHS_PRETRAINED,
+    "batch_size": 32,
+    "lr": LR_PRETRAINED,
+    "sequence_max_len": SEQUENCE_MAX_LENGTHS,
     "out_size": 1,
-}
+    "pretrained": True,
+    "clip_gradients": 0
+}
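An illustrative look at how these per-model config dicts are consumed downstream (mirrors experiment.py and data_processing.py; the printed values come from this settings.py):

from settings import TextCNN_config, Transformer_config

print(TextCNN_config["batch_size"])      # 256: replaces the old global BATCH_SIZE
print(TextCNN_config["clip_gradients"])  # 0: gradient clipping disabled for TextCNN

task = "subjectivity"
print(Transformer_config["sequence_max_len"][task])  # 128: per-task max length instead of one global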

+8

@@ -0,0 +1,8 @@
+,fit_time,score_time,test_accuracy,test_f1
+0,0.00449824333190918,0.0020368099212646484,0.8225,0.8174807197943446
+1,0.004383087158203125,0.001954793930053711,0.8625,0.8641975308641976
+2,0.004187107086181641,0.0021529197692871094,0.84,0.8375634517766498
+3,0.003898143768310547,0.0018398761749267578,0.87,0.8659793814432989
+4,0.004026889801025391,0.0017871856689453125,0.825,0.8214285714285714
+mean,0.004198694229125976,0.001954317092895508,0.844,0.8413299310614125
+std,0.0002468359881403142,0.00014771421203888375,0.021549361939509966,0.022964961813428303

stats/baseline_polarity_stats.csv

+8

@@ -0,0 +1,8 @@
+,fit_time,score_time,test_accuracy,test_f1
+0,0.007039070129394531,0.0022912025451660156,0.8125,0.810126582278481
+1,0.006242990493774414,0.002053976058959961,0.8275,0.8261964735516373
+2,0.005444049835205078,0.002074003219604492,0.8075,0.80306905370844
+3,0.0054531097412109375,0.0020837783813476562,0.8325,0.830379746835443
+4,0.005436897277832031,0.002056121826171875,0.7925,0.785529715762274
+mean,0.005923223495483398,0.00211181640625,0.8145,0.811060314427255
+std,0.0007131781285630203,0.00010104794256270079,0.01604680653588123,0.018158244363148276

stats/baseline_subjectivity_stats.csv

+8

@@ -0,0 +1,8 @@
+,fit_time,score_time,test_accuracy,test_f1
+0,0.004384040832519531,0.002396821975708008,0.9175,0.9183572488866898
+1,0.003802061080932617,0.0023458003997802734,0.918,0.9196865817825661
+2,0.0037550926208496094,0.0023450851440429688,0.9275,0.9288168875797741
+3,0.0037908554077148438,0.0023431777954101562,0.9275,0.9284657128761717
+4,0.00376129150390625,0.002343893051147461,0.9105,0.9113422486379396
+mean,0.0038986682891845705,0.0023549556732177734,0.9202,0.9213337359526284
+std,0.00027204293300494247,2.3426126839661284e-05,0.00729383301152419,0.007386898266289606

stats/datasets.csv

+5

@@ -0,0 +1,5 @@
+,num_sequences,num_words,avg_seq_len,max_seq_len,min_seq_len,lexicon_size,lexicon_size_no_stopwords
+MR,2000,6226700,791.91,2879,19,39768,39587
+MR_sjv,65258,6226700,24.270127800422937,187,1,39768,39587
+SJV,10000,1049750,24.0576,120,10,23906,23737
+MR_clean_baseline,2000,3937817,501.3715,2169,19,30266,30088

transformer/__init__.py

Whitespace-only changes.

transformer/data_processing.py

-32
This file was deleted.

transformer/models.py

-15
This file was deleted.
File renamed without changes.
