Momofil31 · Jan 5, 2023
diff --git a/‎dataset_stats.py
+4-2 b/‎dataset_stats.py
+4-2
diff --git a/‎stats/datasets.csv
+5-5 b/‎stats/datasets.csv
+5-5
diff --git a/‎stats/obj_lexicon.txt
+1 b/‎stats/obj_lexicon.txt
+1
diff --git a/‎stats/subj_lexicon.txt
+1 b/‎stats/subj_lexicon.txt
+1
diff --git a/‎tricking-models.py
+115 b/‎tricking-models.py
+115
@@ -40,8 +40,10 @@ def compute_stats(data, name, neg=None, pos=None):
         stats["most_common_words_pos"] = [w for w, _ in nltk.FreqDist(filtered_pos).most_common(10)]
         intersect_common_words = set(stats["most_common_words_neg"]).intersection(set(stats["most_common_words_pos"]))
         stats["most_common_words_intersect"] = list(intersect_common_words)
-        stats["neg_only_words"] = len([w for w in set(filtered_neg) if w not in lexicon_intersection])
-        stats["pos_only_words"] = len([w for w in set(filtered_pos) if w not in lexicon_intersection])
+        stats["neg_only_words_len"] = len([w for w in set(filtered_neg) if w not in lexicon_intersection])
+        stats["neg_only_words"] = [w for w in set(filtered_neg) if w not in lexicon_intersection]
+        stats["pos_only_words_len"] = len([w for w in set(filtered_pos) if w not in lexicon_intersection])
+        stats["pos_only_words"] = [w for w in set(filtered_pos) if w not in lexicon_intersection]
     return stats
 
 
 
@@ -0,0 +1,115 @@
+import json
+import argparse
+import wandb
+from nltk.corpus import movie_reviews, subjectivity, stopwords
+from baseline import BaselineExperiment
+from sklearn.metrics import accuracy_score, f1_score
+
+from experiment import Experiment
+from baseline import BaselineExperiment
+from models import *
+from settings import *
+
+nameToModel = {
+    "BiGRU": BiGRU,
+    "BiGRUAttention": BiGRU,  # NOTE(review): maps to the same class as "BiGRU" - presumably attention is switched on through the model config; confirm
+    "TextCNN": TextCNN
+}
+
+# Subjective sentences generated by ChatGPT using only tokens from the objective-only lexicon.
+subj_sentences = ["I was shocked to discover that the financial webcams we had been using were actually part of a scheme known as 'frodes', and I couldn't believe that Daddy's client would scoff at the idea of being caught up in such a bale of trouble.",
+                    "I felt betrayed and stunned, but I knew I had to move on and find a new situation-based opportunity, even if it meant leaving behind the familiar Composers' Castle and the territorial Marjorie and Margaret"
+                    ]
+obj_sentences = ["The widely reserved, self-determination and simplicity of the 12-step program have proven to be an effective life-affirming method for those seeking to overcome addiction and achieve reconciliation with themselves and others.",
+                    "The artist-agent's creative approach to marketing and promotion has helped to boost the success and stylishness of numerous music and entertainment projects."
+                    ]
+
+sentences = obj_sentences + subj_sentences  # objective sentences first, then subjective; the hard-coded y_gt labels in the script follow this order
+
+def baseline(task):
+    """Run the baseline experiment for `task` and print its predictions.
+
+    The classifier and vectorizer returned by `BaselineExperiment.run()` are
+    applied to the module-level `sentences` list; the predicted labels are
+    printed to stdout. Nothing is returned.
+    """
+    experiment = BaselineExperiment(task=task)
+    clf, vec = experiment.run()
+
+    # Vectorize the adversarial sentences and classify them in one pass.
+    print(clf.predict(vec.transform(sentences)))
+
+
+if __name__ == '__main__':
+    # Script entry point: classify the hand-written "tricky" sentences with
+    # either the baseline or a trained neural model fetched from W&B, and
+    # print the predicted labels next to the expected ones.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "model", choices=["Baseline", "BiGRU", "BiGRUAttention", "TextCNN"], help="Specify model type. Eg. 'BiGRU'")
+    parser.add_argument("task", choices=["subjectivity"], help="Specify which task to perform.")
+    parser.add_argument("--fold_index", type=int, choices=[
+                        0, 1, 2, 3, 4],  help="Specifify the fold index to load correct train/test split.")
+    parser.add_argument("-pe", "--pretrained_embeddings",
+                        action="store_true", help="Specify if use pretrained embeddings.")
+    args = parser.parse_args()
+
+    sjv_classifier = None
+    sjv_vectorizer = None
+
+    if args.model == "Baseline":
+        baseline(args.task)
+        # SystemExit instead of the site-provided exit() helper, which is
+        # not guaranteed to exist (e.g. when running with -S).
+        raise SystemExit(0)
+
+    # The fold index is optional for the baseline but is part of the artifact
+    # name for every other model; fail with a usage message rather than a
+    # TypeError from formatting None with ':02d'.
+    if args.fold_index is None:
+        parser.error("--fold_index is required for non-baseline models.")
+
+    # load model
+    api = wandb.Api()
+    pe_string = "_pe" if args.pretrained_embeddings else ""
+    name = f"{args.task}_{args.model}{pe_string}_fold_{args.fold_index:02d}"
+
+    artifact_name = f'{WANDB_ENTITY}/{WANDB_PROJECT}/{name}:latest'
+    print(artifact_name)
+
+    checkpoint_file = f"{name}.pth"
+    print(checkpoint_file)
+
+    artifact = api.artifact(artifact_name)
+    artifact.download(root=WEIGHTS_SAVE_PATH)
+    print(artifact.metadata)
+    model_config = artifact.metadata
+
+    # The artifact metadata doubles as the model configuration; without a
+    # vocabulary size the model cannot be instantiated.
+    if not model_config.get("vocab_size"):
+        raise Exception("Config does not specify vocab_size.")
+    model = nameToModel[args.model](model_config["vocab_size"], model_config)
+
+    checkpoint = torch.load(
+        f"{WEIGHTS_SAVE_PATH}/{checkpoint_file}", map_location=DEVICE)
+    model.load_state_dict(checkpoint['model_state_dict'])
+    # Inputs are moved to DEVICE below, so the model must live there too
+    # (no-op if the constructor already placed it on DEVICE).
+    model.to(DEVICE)
+
+    # create the same language on which the model was trained
+    exp = Experiment(args.task, sjv_classifier, sjv_vectorizer)
+    exp.model_config = model_config
+    exp.prepare_data()
+    exp.create_folds()
+    exp.create_dataloaders(args.fold_index)
+
+    # tokenize sentences (one tokenizer instance is enough for all of them)
+    tokenizer = nltk.WordPunctTokenizer()
+    tokenized = [[t.lower() for t in tokenizer.tokenize(sent)] for sent in sentences]
+    print(tokenized)
+
+    # convert to ids; out-of-vocabulary tokens fall back to '<unk>'
+    unk_id = exp.lang.word2id['<unk>']
+    ids = [torch.tensor([exp.lang.word2id.get(t, unk_id) for t in sent]) for sent in tokenized]
+    # Expected labels, in the order of `sentences` (objective first, then subjective).
+    y_gt = [0, 0, 1, 1]
+    print(y_gt)
+
+    # predict, one sentence per batch so no padding is needed
+    y_pred = []
+    model.eval()
+    with torch.no_grad():
+        for sent in ids:
+            # Measure the sequence length BEFORE adding the batch dimension:
+            # len() of the batched (1, seq_len) tensor would always be 1.
+            text_len = torch.tensor([sent.size(0)]).to(DEVICE)
+            batch = sent.unsqueeze(0).to(DEVICE)
+            out = model({"document": batch, "text_len": text_len})
+            if args.model == "BiGRUAttention":
+                # This variant returns a sequence; its first element is used
+                # as the logits (presumably followed by attention weights).
+                out = out[0]
+            prediction = torch.sigmoid(out).round().int()
+            y_pred.append(prediction.item())
+
+    print(y_pred)
+
+