Commit dc36a04

Committed Jan 5, 2023
✨ Adversarial examples for subjectivity
1 parent a58e28e

5 files changed: +126 −7 lines

dataset_stats.py (+4 −2)

@@ -40,8 +40,10 @@ def compute_stats(data, name, neg=None, pos=None):
     stats["most_common_words_pos"] = [w for w, _ in nltk.FreqDist(filtered_pos).most_common(10)]
     intersect_common_words = set(stats["most_common_words_neg"]).intersection(set(stats["most_common_words_pos"]))
     stats["most_common_words_intersect"] = list(intersect_common_words)
-    stats["neg_only_words"] = len([w for w in set(filtered_neg) if w not in lexicon_intersection])
-    stats["pos_only_words"] = len([w for w in set(filtered_pos) if w not in lexicon_intersection])
+    stats["neg_only_words_len"] = len([w for w in set(filtered_neg) if w not in lexicon_intersection])
+    stats["neg_only_words"] = [w for w in set(filtered_neg) if w not in lexicon_intersection]
+    stats["pos_only_words_len"] = len([w for w in set(filtered_pos) if w not in lexicon_intersection])
+    stats["pos_only_words"] = [w for w in set(filtered_pos) if w not in lexicon_intersection]
     return stats
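For context, the change above stops overwriting the word lists with their counts: the new *_len fields hold the sizes, while neg_only_words / pos_only_words now keep the actual words. A minimal sketch of the resulting fields, with illustrative stand-in values rather than the project's real data:

# Illustrative sketch of the new stats fields; filtered_neg, filtered_pos
# and lexicon_intersection are stand-in values, not the project's data.
filtered_neg = ["bad", "boring", "flat"]
filtered_pos = ["great", "vivid", "flat"]
lexicon_intersection = {"flat"}  # tokens shared by both lexicons

stats = {}
stats["neg_only_words"] = [w for w in set(filtered_neg) if w not in lexicon_intersection]
stats["neg_only_words_len"] = len(stats["neg_only_words"])
stats["pos_only_words"] = [w for w in set(filtered_pos) if w not in lexicon_intersection]
stats["pos_only_words_len"] = len(stats["pos_only_words"])
print(stats["neg_only_words_len"], stats["pos_only_words_len"])  # 2 2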

stats/datasets.csv (+5 −5)
Large diffs are not rendered by default.

stats/obj_lexicon.txt (+1)
Large diffs are not rendered by default.

stats/subj_lexicon.txt (+1)
Large diffs are not rendered by default.
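Each lexicon file is written as a single long line, which is why GitHub does not render it. As a purely hypothetical sketch of how such a lexicon could be used to double-check the adversarial sentences in the script below, assuming each file is one whitespace-separated line of lowercase tokens (the actual format is not visible in this diff):

# Hypothetical check that a generated sentence stays inside a lexicon.
# Assumes stats/obj_lexicon.txt is a single whitespace-separated line of
# lowercase tokens; the real file format may differ.
import nltk

def load_lexicon(path):
    with open(path) as f:
        return set(f.read().split())

obj_lexicon = load_lexicon("stats/obj_lexicon.txt")
sentence = "I felt betrayed and stunned, but I knew I had to move on."
tokens = [t.lower() for t in nltk.WordPunctTokenizer().tokenize(sentence)]
print([t for t in tokens if t not in obj_lexicon])  # tokens outside the lexicon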

tricking-models.py (+115, new file)

import argparse

import nltk
import torch
import wandb

from baseline import BaselineExperiment
from experiment import Experiment
from models import *
from settings import *

nameToModel = {
    "BiGRU": BiGRU,
    "BiGRUAttention": BiGRU,
    "TextCNN": TextCNN
}

# Subjective sentences generated by ChatGPT using only tokens from the
# objective-only lexicon, so that their vocabulary contradicts their label.
subj_sentences = ["I was shocked to discover that the financial webcams we had been using were actually part of a scheme known as 'frodes', and I couldn't believe that Daddy's client would scoff at the idea of being caught up in such a bale of trouble.",
                  "I felt betrayed and stunned, but I knew I had to move on and find a new situation-based opportunity, even if it meant leaving behind the familiar Composers' Castle and the territorial Marjorie and Margaret"
                  ]
# Objective sentences, presumably built the same way from the subjective-only lexicon.
obj_sentences = ["The widely reserved, self-determination and simplicity of the 12-step program have proven to be an effective life-affirming method for those seeking to overcome addiction and achieve reconciliation with themselves and others.",
                 "The artist-agent's creative approach to marketing and promotion has helped to boost the success and stylishness of numerous music and entertainment projects."
                 ]

sentences = obj_sentences + subj_sentences


def baseline(task):
    # Train the baseline classifier, then run it on the adversarial sentences.
    exp_subjectivity = BaselineExperiment(task=task)
    classifier, vectorizer = exp_subjectivity.run()

    vectors = vectorizer.transform(sentences)
    preds = classifier.predict(vectors)
    print(preds)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "model", choices=["Baseline", "BiGRU", "BiGRUAttention", "TextCNN"], help="Specify the model type, e.g. 'BiGRU'.")
    parser.add_argument("task", choices=["subjectivity"], help="Specify which task to perform.")
    parser.add_argument("--fold_index", type=int, choices=[
                        0, 1, 2, 3, 4], help="Specify the fold index to load the correct train/test split.")
    parser.add_argument("-pe", "--pretrained_embeddings",
                        action="store_true", help="Use pretrained embeddings.")
    args = parser.parse_args()

    sjv_classifier = None
    sjv_vectorizer = None

    if args.model == "Baseline":
        baseline(args.task)
        exit(0)

    # Load the trained model checkpoint from the W&B artifact registry.
    api = wandb.Api()
    pe_string = "_pe" if args.pretrained_embeddings else ""
    name = f"{args.task}_{args.model}{pe_string}_fold_{args.fold_index:02d}"

    artifact_name = f'{WANDB_ENTITY}/{WANDB_PROJECT}/{name}:latest'
    print(artifact_name)

    checkpoint = f"{name}.pth"
    print(checkpoint)

    artifact = api.artifact(artifact_name)
    artifact.download(root=WEIGHTS_SAVE_PATH)
    print(artifact.metadata)
    model_config = artifact.metadata

    if model_config.get("vocab_size"):
        model = nameToModel[args.model](
            model_config["vocab_size"], model_config)
    else:
        raise Exception("Config does not specify vocab_size.")

    checkpoint = torch.load(
        f"{WEIGHTS_SAVE_PATH}/{checkpoint}", map_location=DEVICE)
    model.load_state_dict(checkpoint['model_state_dict'])

    # Recreate the same vocabulary (lang) on which the model was trained.
    exp = Experiment(args.task, sjv_classifier, sjv_vectorizer)
    exp.model_config = model_config
    exp.prepare_data()
    exp.create_folds()
    exp.create_dataloaders(args.fold_index)

    # Tokenize and lowercase the sentences.
    tokenized = [nltk.WordPunctTokenizer().tokenize(sent) for sent in sentences]
    tokenized = [[t.lower() for t in sent] for sent in tokenized]
    print(tokenized)

    # Convert tokens to ids, mapping out-of-vocabulary tokens to <unk>.
    ids = [[exp.lang.word2id.get(t, exp.lang.word2id['<unk>']) for t in sent] for sent in tokenized]
    ids = [torch.tensor(sent) for sent in ids]
    # Ground truth aligned with `sentences`: objective first, then subjective.
    y_gt = [0, 0, 1, 1]
    print(y_gt)

    # Predict one sentence at a time.
    y_pred = []
    model.eval()
    with torch.no_grad():
        for sent in ids:
            # Take the sequence length before adding the batch dimension;
            # after unsqueeze(0), len(sent) would always be 1.
            text_len = torch.tensor(len(sent)).unsqueeze(0).to(DEVICE)
            sent = sent.unsqueeze(0).to(DEVICE)
            out = model({"document": sent, "text_len": text_len})
            if args.model == "BiGRUAttention":
                # The attention variant returns extra outputs; keep the logits.
                out = out[0]
            prediction = torch.sigmoid(out).round().int()
            y_pred.append(prediction.item())

    print(y_pred)
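Usage sketch for the script above (entity, project and weight paths come from settings.py, and the named W&B artifacts are assumed to exist):

# Baseline classifier on the four adversarial sentences
python tricking-models.py Baseline subjectivity

# Trained BiGRU checkpoint for fold 0, with pretrained embeddings
python tricking-models.py BiGRU subjectivity --fold_index 0 -pe

Each run prints the predicted labels for the four sentences; the ground truth is [0, 0, 1, 1] (objective first, then subjective).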
