if(!require("tm"))
install.packages("tm")
#required package for SVM
if(!require("e1071"))
install.packages("e1071")
#required package for KNN
if(!require("RWeka"))
install.packages("RWeka", dependencies = TRUE)
#required package for Adaboost
if(!require("ada"))
install.packages("ada")
library("tm")
library("e1071")
library(RWeka)
library("ada")
library("caret")
#Initialize random generator
set.seed(1245)
#This function builds a term-count vector (vector space model) for one message
#over the vocabulary of highly repeated words
vsm<-function(message, highlyrepeatedwords){
  tokenizedmessage<-strsplit(message, "\\s+")[[1]]
  #count how often each vocabulary word occurs in the message
  v<-rep(0, length(highlyrepeatedwords))
  for(i in 1:length(highlyrepeatedwords)){
    for(j in 1:length(tokenizedmessage)){
      if(highlyrepeatedwords[i]==tokenizedmessage[j]){
        v[i]<-v[i]+1
      }
    }
  }
  return(v)
}
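#A vectorized equivalent of vsm() (a sketch added for illustration, not part of the
#original pipeline): table() over a factor with fixed levels counts every vocabulary
#word at once, giving the same counts without the double loop.
vsm_fast<-function(message, highlyrepeatedwords){
  tokens<-strsplit(message, "\\s+")[[1]]
  as.integer(table(factor(tokens, levels = highlyrepeatedwords)))
}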
#loading data. Original data is from http://archive.ics.uci.edu/ml/datasets/tweet+Spam+Collection
print("Uploading tweet Spams and Hams!\n")
tweettable<-read.csv("C:/E Drive/ASU/classes/SML/Project NLP/Spam filter/Sample.csv", header = TRUE, sep = ",")
tweettabletmp<-tweettable
print("Extracting Ham and Spam Basic Statistics!")
#Basic Statisctics like mean and variance of spam and hams
hamavg<-mean(tweettabletmp$type)
print("Average Ham is :");hamavg
hamvariance<-var(tweettabletmp$type)
print("Var of Ham is :");hamvariance
print("Extract average token of Hams and Spams!")
nohamtokens<-0
noham<-0
nospamtokens<-0
nospam<-0
for(i in 1:length(tweettable$type)){
if(tweettable[i,1]==1){
nohamtokens<-length(strsplit(as.character(tweettable[i,2]), "\\s+")[[1]])+nohamtokens
noham<-noham+1
}else{
nospamtokens<-length(strsplit(as.character(tweettable[i,2]), "\\s+")[[1]])+nospamtokens
nospam<-nospam+1
}
}
totaltokens<-nospamtokens+nohamtokens;
print("total number of tokens is:")
print(totaltokens)
avgtokenperham<-nohamtokens/noham
print("Average number of tokens per ham message")
print(avgtokenperham)
avgtokenperspam<-nospamtokens/nospam
print("Average number of tokens per spam message")
print(avgtokenperspam)
print(" Make two different sets, training data and test data!")
#select the percent of data that you want to use as training set
trdatapercent<-0.7
#training data set
trdata=NULL
#test data set
tedata=NULL
# for(i in 1:length(tweettable$type)){
# if(runif(1)<trdatapercent){
# trdata=rbind(trdata,c(tweettable[i,1],tolower(tweettable[i,2])))
# }
# else{
# tedata=rbind(tedata,c(tweettable[i,1],tolower(tweettable[i,2])))
# }
# }
#note: this takes the first trdatapercent of rows as training data, which assumes
#the rows of the CSV are not ordered by class
for(i in 1:length(tweettable$type)){
  if(i<trdatapercent*length(tweettable$type)){
    trdata=rbind(trdata,c(tweettable[i,1],tolower(tweettable[i,2])))
  }else{
    tedata=rbind(tedata,c(tweettable[i,1],tolower(tweettable[i,2])))
  }
}
print("Training data size is!")
dim(trdata)
print("Test data size is!")
dim(tedata)
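#A hedged alternative sketch (new variable names, not used by the rest of the script):
#a shuffled split avoids any class-ordering bias in the CSV, and set.seed(1245) above
#keeps it reproducible. The fraction mirrors trdatapercent.
trindex<-sample(seq_len(nrow(tweettable)), size = floor(trdatapercent*nrow(tweettable)))
trdata_shuffled<-cbind(tweettable[trindex,1], tolower(tweettable[trindex,2]))
tedata_shuffled<-cbind(tweettable[-trindex,1], tolower(tweettable[-trindex,2]))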
# Text feature extraction using tm package
trtweets <- Corpus(VectorSource(trdata[,2]))
trtweets <- tm_map(trtweets, stripWhitespace)
trtweets <- tm_map(trtweets, removeNumbers)
#note: with tm >= 0.6 the usual idiom is tm_map(trtweets, content_transformer(tolower));
#the PlainTextDocument step below is the older workaround for plain-function transformations
trtweets <- tm_map(trtweets, tolower)
trtweets <- tm_map(trtweets, removePunctuation)
trtweets <- tm_map(trtweets, removeWords, stopwords("english"))
trtweets <- tm_map(trtweets, PlainTextDocument)
dtm <- DocumentTermMatrix(trtweets)
#lowfreq = 1 keeps every term that occurs at least once, i.e. the full training vocabulary
highlyrepeatedwords<-findFreqTerms(dtm, 1)
#These words are used as the index to build the VSM
#(vector space model) for the training data and the test data
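#A hedged variant (the threshold of 5 is an assumption, not from the original script):
#keeping only terms that occur at least 5 times matches the "highly repeated words"
#intent more closely and shrinks the feature vectors.
frequentwords<-findFreqTerms(dtm, 5)
length(frequentwords)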
#vectorized training data set
vtrdata=NULL
#vectorized test data set
vtedata=NULL
for(i in 1:length(trdata[,2])){
  if(trdata[i,1]==1){
    vtrdata=rbind(vtrdata,c(1,vsm(trdata[i,2],highlyrepeatedwords)))
  }else{
    vtrdata=rbind(vtrdata,c(0,vsm(trdata[i,2],highlyrepeatedwords)))
  }
}
for(i in 1:length(tedata[,2])){
  if(tedata[i,1]==1){
    vtedata=rbind(vtedata,c(1,vsm(tedata[i,2],highlyrepeatedwords)))
  }else{
    vtedata=rbind(vtedata,c(0,vsm(tedata[i,2],highlyrepeatedwords)))
  }
}
# Run different classification algorithms
# different SVMs with different kernels
print("----------------------------------SVM-----------------------------------------")
print("Linear Kernel")
svmlinmodel <- svm(x=vtrdata[,2:length(vtrdata[1,])], y=vtrdata[,1], type='C', kernel='linear', cost=40)
summary(svmlinmodel)
predictionlin <- predict(svmlinmodel, vtedata[,2:length(vtedata[1,])])
#confusionMatrix expects factors with matching levels, so the numeric test labels are converted
confusionMatrix(predictionlin, factor(vtedata[,1], levels=levels(predictionlin)))
# tablinear <- table(pred = predictionlin , true = vtedata[,1]); tablinear
# precisionlin<-sum(diag(tablinear))/sum(tablinear);
# print("General Error using Linear SVM is (in percent):");(1-precisionlin)*100
#print("Ham Error using Linear SVM is (in percent):");(tablinear[1,2]/sum(tablinear[,2]))*100
#print("Spam Error using Linear SVM is (in percent):");(tablinear[2,1]/sum(tablinear[,1]))*100
print("Polynomial Kernel")
svmpolymodel <- svm(x=vtrdata[,2:length(vtrdata[1,])],y=vtrdata[,1], kernel='polynomial',degree=12, probability=FALSE)
predictionpoly <- predict(svmpolymodel, vtedata[,2:length(vtedata[1,])])
tabpoly <- table(vtedata[,1], round(predictionpoly)); tabpoly
precisionpoly<-sum(diag(tabpoly))/sum(tabpoly);
print("General Error using Poly SVM is (in percent):");(1-precisionpoly)*100
print("Radial Kernel")
svmradmodel <- svm(x=vtrdata[,2:length(vtrdata[1,])],y=vtrdata[,1], kernel = "radial", gamma = 0.09, cost = 0.01, cross = 5, probability=FALSE)
predictionrad <- predict(svmradmodel, vtedata[,2:length(vtedata[1,])])
tabrad <- table(vtedata[,1], round(predictionrad)); tabrad
precisionrad<-sum(diag(tabrad))/sum(tabrad);
print("General Error using Radial SVM is (in percent):");(1-precisionrad)*100
print("----------------------------------KNN-----------------------------------------")
# data<-data.frame(tweet=vtrdata[,2:length(vtrdata[1,])],type=vtrdata[,1])
# classifier <- IBk(type ~.,data, control = Weka_control(K = 2, X = TRUE))
# summary(classifier)
# evaluate_Weka_classifier(classifier, numFolds = 5)
#
# Knnmodel <- knn(data,vtedata[,2:length(vtedata[1,])],y=vtrdata[,1], k = 2, prob = FALSE, algorithm=c("kd_tree", "cover_tree", "brute"))
print("---------------------------------Adaboost-------------------------------------")
adaptiveboost<-ada(x=vtrdata[,2:length(vtrdata[1,])],y=vtrdata[,1],test.x=vtedata[,2:length(vtedata[1,])], test.y=vtedata[,1], loss="logistic", type="gentle", iter=100)
summary(adaptiveboost)
varplot(adaptiveboost)
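#A hedged follow-up sketch (assumes the plot method from the ada package): because
#test.x and test.y were supplied to ada() above, the fitted object tracks test-set
#error per boosting iteration; plot both error curves together with the kappa statistic.
plot(adaptiveboost, TRUE, TRUE)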