-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathsvm_classification_new.py
162 lines (109 loc) · 8.16 KB
/
svm_classification_new.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# -*- coding: utf-8 -*-
import io
import re
from sklearn import svm, model_selection, tree
import json
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import time
stopwords = ['or', 'could', 'should', 'along', 'nor', 'nothing', 'may', 'etc', 'show', 'them', 'whereby', 'mostly', 'nine', 'system', 'found', 'nobody', 'only', 'hundred', 'it', 'beyond', 'over', 're', 'sometime', 'thence', 'up', 'else', 'both', 'take', 'must', 'full', 'find', 'since', 'how', 'third', 'those', 'upon', 'thereafter', 'empty', 'herein', 'sixty', 'than', 'other', 'interest', 'whose', 'below', 'four', 'is', 'these', 'cant', 'all', 'alone', 'always', 'at', 'someone', 'first', 'done', 'less', 'next', 'off', 'nevertheless', 'part', 'front', 'whereafter', 'eight', 'six', 'something', 'to', 'are', 'amoungst', 'fifty', 'further', 'would', 'onto', 'amount', 'detail', 'well', 'even', 'twelve', 'meanwhile', 'whenever', 'until', 'thru','seemed', 'whatever', 'although', 'otherwise', 'anything', 'that', 'hence', 'towards', 'between', 'here', 'co', 'any', 'back', 'every', 'might', 'besides', 'ten', 'were', 'because', 'some', 'ever', 'and', 'for', 'through', 'move', 'formerly', 'keep', 'none', 'inc', 'once', 'hereby', 'never', 'then', 'but', 'cannot', 'elsewhere', 'everywhere', 'however', 'forty', 'five', 'namely', 'enough', 'get', 'whither', 'an', 'with', 'mill', 'another', 'they', 'above', 'whole', 'please', 'latterly', 'sometimes', 'under', 'whence', 'describe', 'bottom', 'made', 'do', 'seem', 'latter', 'via', 'what', 'toward', 'anyhow', 'name', 'though', 'several', 'also', 'least', 'put', 'noone', 'therefore', 'in', 'almost', 'again', 'fifteen', 'so', 'see', 'anywhere', 'there', 'throughout', 'top', 'without', 'becomes', 'often', 'seeming', 'why', 'thus', 'indeed', 'thereby', 'of', 'bill', 'into', 'have', 'amongst', 'sincere', 'after', 'already', 'such', 'either', 'wherein', 'de', 'nowhere', 'becoming', 'from', 'everything', 'moreover', 'among', 'about', 'yet', 'per', 'as', 'no', 'afterwards', 'others', 'during', 'being', 'if', 'when', 'beforehand', 'across', 'by', 'seems', 'beside', 'call', 'con', 'couldnt', 'ie', 'before', 'un', 'behind', 'eleven', 'still', 'therein', 'hereafter', 'whether', 'not', 'perhaps', 'can', 'anyway', 'thereupon', 'many', 'own', 'where', 'whereupon', 'within', 'wherever', 'down', 'few', 'most', 'around', 'much', 'somewhere', 'which', 'each', 'while', 'yourselves', 'out', 'her', 'hers', 'too', 'together', 'hereupon', 'yours', 'rather', 'very', 'side', 'somehow', 'neither', 'on', 'eg', 'whereas', 'ltd', 'more', 'except', 'twenty', 'go', 'fill', 'former', 'last',
'going', 'too.', '...', '-', '&', '+']
# true asthma tweets must related to the author
personal_pronouns = ["i", "i'm", "i've", "me", "we", "my"]
# true asthma tweets must mentioned asthma
key_word = "asthma"
# true asthma tweets must have supporting symptoms
key_symptoms = ["inhaler", "wheez", "sneez", "breath", "surviv", "fever", "cough", "lung", "throat", "chest", "stress",
"headache", "respirat", "weak", "tired", "weak", "anxiety", "panic", "pale", "moody", "cold", "exercise",
"sweaty", "blue", "lips", "fingernail", "stress", "irritant", "parent", "famil", "pollen", "air", "albuterol",
"xolair", "montelukast", "nebulizer", "flovent", "singulair", "advair", "bronchodilator"]
# tweet phrase are very useful features
# task 1
# \W represent any Any non-alphanumeric character (equivalent to [^a-zA-Z0-9_].
# * represent Zero or more of previous item
# (A|B) represent A or B
key_phrase = ["my\W*asthma", "asthma\W*attack", "i\W*have\W*asthma", "i’ve\W*had\W*asthma", "have\W*asthma", "have\W*a\W*asthma",
"have\W*#asthma", "have\W*severe\W*asthma", "have\W*bad\W*asthma", "have\W*slight\W*asthma", "i\W*had\W*asthma", "had\W*asthma",
"had\W*a\W*asthma", "had\W*severe\W*asthma", "having\W*asthma", "i\W*got\W*asthma", "got\W*asthma", "get\W*asthma",
"asthma\W*got\W*me", "getting\W*asthma", "have\W*chronic\W*asthma", "have\W*skin\W*asthma", "caught\W*asthma",
"an\W*asthmatic", "fighting\W*with\W*asthma", "surfer\W*from\W*asthma", "with\W*asthma", "of\W*asthma", "like\W*asthma",
"this\W*asthma", "the\W*asthma", "got\W*bronchospasm", "had\W*a\W*bronchospasm", "had\W*bronchospasm", "an\W*asthma", "fuck\W*asthma",
"(trigger|triggered)\W*asthma", "induced\W*asthma", "asthma\W*trigger", "cough\W*(and|&)\W*asthma", "allergies\W*(and|&)\W*asthma",
"stress\W*(and|&)\W*asthma", "cough\W*asthma", "asthma\W*(medication|meds)", "asthma\W*treatment", "status\W*asthmaticus",
"asthma\W*control", "(causes|cause)\W*asthma", "my\W*inhaler", "i\W*(can not|can’t)\W*breathe"]
# number of hashtags may be useful features
key_hashtags = ["#asthmasurvivor", "#asthma", "#BeatingAsthma"]
# air condition words may be useful features
air_condition = ["poll", "air", "weather", "co", "no2", "pm2.5", "cold", "dry", "humid", "moist", "hot"]
# number of features we added: 103
def prepare_data(file1, file2):
# load positive and negative tweets
positive_tweets = []
negative_tweets = []
for line in open(file1).readlines():
text = line.lower().strip()
positive_tweets.append([int(1), text])
for line in open(file2).readlines():
text = line.lower().strip()
negative_tweets.append([int(0), text])
tweets = positive_tweets + negative_tweets
# Extract the vocabulary of keywords
vocab = dict()
for class_label, text in tweets:
for term in text.split():
term = term.lower()
if len(term) >= 1 and term not in stopwords:
if term in vocab:
vocab[term] = vocab[term] + 1
else:
vocab[term] = 1
# Remove terms whose frequencies are less than a threshold (e.g., 15)
vocab = {term: freq for term, freq in vocab.items() if freq > 30}
# Generate an id (starting from 0) for each term in vocab
vocab = {term: idx for idx, (term, freq) in enumerate(vocab.items())}
#print(vocab) # number of features =
# Generate X and y
X = []
y = []
for class_label, text in tweets:
x = [0] * (len(vocab) + len(key_symptoms) + len(key_phrase) + len(air_condition) + 3)
# compute the vocabulary features
terms = [term for term in text.split() if len(term) >= 1]
for term in terms:
if term in vocab:
x[vocab[term]] += 1
# compute the personal_pronoun feature
if term in personal_pronouns:
x[len(vocab) + 0] += 1
# compute key word asthma feature
x[len(vocab) + 1] += text.count(key_word)
# compute the frequency of key hashtag feature
for item in key_hashtags:
x[len(vocab) + 2] += text.count(item)
symptom_count = 1
for item in key_symptoms:
x[len(vocab) + 2 + symptom_count] += text.count(item)
symptom_count += 1
phrase_count = 1
for item in key_phrase:
x[len(vocab) + 2+ len(key_symptoms) + phrase_count] += text.count(item)
phrase_count += 1
air_count = 1
for item in air_condition:
x[len(vocab) + 2 + len(key_symptoms) +len(key_phrase) + air_count] += text.count(item)
air_count += 1
y.append(class_label)
X.append(x)
return X, y
def svm_classification():
X, y = prepare_data("./data/positive_tweets_new.txt", "./data/negative_tweets_new.txt")
# 10 folder cross validation to estimate the best w and b, using SVM
clf = svm.SVC(kernel='linear', C= 1, class_weight={1: 2})
scores = cross_val_score(clf, X, y, cv = 10, scoring = make_scorer(f1_score))
print('f1_macro: ', "mean: ", np.mean(scores), "std: ", np.std(scores))
scores = cross_val_score(clf, X, y, cv = 10, scoring='precision')
print('precision: ', "mean: ", np.mean(scores), "std: ", np.std(scores))
scores = cross_val_score(clf, X, y, cv = 10, scoring='recall')
print('recall: ', "mean: ", np.mean(scores), "std: ", np.std(scores))
svm_classification()