-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmarker.py
232 lines (211 loc) · 9.36 KB
/
marker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#!/usr/bin/env python
import os
import pip
import subprocess
import sys
import time
import zipfile
# Names belonging to the marking package itself. Entries with these names at
# the top level of a submission folder are left out of the generated archive.
package_content = [
    "marker.py",
    "Readme.md",
    "submission.zip",
    "test_filter.java",
    "test_filter.py",
    "emails",
    ".git",
]
# Marking checklist written at the top of the feedback file, ahead of the
# per-directory classification evaluation reports.
marking_criteria = """\
Part 1 (40%):
- [ ] Your program classifies the testing set with an accuracy significantly higher than random within 30 minutes
- [ ] Use very simple data preprocessing so that the emails can be read into the Naive Bayes (remove everything else other than words from emails)
- [ ] Write simple Naive Bayes multinomial classifier or use an implementation from a library of your choice
- [ ] Classify the data
- [ ] Report your results with a metric (e.g. accuracy) and method (e.g. cross validation) of your choice
- [ ] Choose a baseline and compare your classifier against it
Part 2 (30%):
- [ ] Use some smart feature processing techniques to improve the classification results
- [ ] Compare the classification results with and without these techniques
- [ ] Analyse how the classification results depend on the parameters (if available) of chosen techniques
- [ ] Compare (statistically) your results against any other algorithm of your choice (you can use any library); compare and contrast results, ensure fair comparison
Part 3 (30%):
- [ ] Calibration (15%): calibrate Naive Bayes probabilities, such that they result in low mean squared error
- [ ] Naive Bayes extension (15%): modify the algorithm in some interesting way (e.g. weighted Naive Bayes)
~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*~*
Classification evaluation:
"""
# Required throughput: an evaluation fails the time test when it takes longer
# than one hour per this many classified emails.
emails_an_hour = 1000.0
# Name of the feedback file written into the tested submission directory.
feedback_file = "feedback.txt"
# Name a submission archive must have; also the name of the archive created
# from a submission folder.
archive_name = "submission.zip"
# Directory holding the submission under test (current directory by default).
path_prefix = "./"
# Test-email directories are resolved relative to the real location of this
# script, not relative to the current working directory.
emails_path_prefix = os.path.dirname(os.path.realpath(__file__))
emails_dir = [os.path.join(emails_path_prefix, name) for name in ("emails",)]
# When this flag is `True`:
#   * Python requirements are installed without `--user`, i.e. into the
#     active environment (hopefully a VirtualEnv);
#   * a directory passed as the argument is tested in place rather than
#     being compressed for submission;
#   * per-email prediction messages are suppressed and the feedback file is
#     written after the evaluation.
VIRTUAL_ENV = False
# Show progress in email testing
show_progress = True
# Use the optional `nyanbar` progress bar when it is installed; otherwise
# fall back to plain progress lines.
try:
    from nyanbar import NyanBar
except ImportError:
    progress_bar = "classic"
else:
    progress_bar = "nyan"
# Requirements are installed only when this flag is `True`.
install_requirements = False
def python_requirements(path):
    """Install the Python packages listed in a submission's requirements file.

    Nothing happens unless the module-level `install_requirements` flag is
    set. Requirements are installed one line at a time, with `--user` unless
    `VIRTUAL_ENV` is set.

    Args:
        path: Directory expected to contain `requirements.txt`.

    Raises:
        SystemExit: If installation is enabled and `path` does not contain a
            `requirements.txt` file.
    """
    if not install_requirements:
        return
    requirements = os.path.join(path, "requirements.txt")
    if not os.path.isfile(requirements):
        sys.exit("Unknown Python requirements file: " + requirements)

    pip_options = ["install"]
    if VIRTUAL_ENV:
        print("Installing Python requirements globally (hopefully VirtualEnv)")
    else:
        print("Installing Python requirements locally (--user)")
        pip_options += ["--user"]

    with open(requirements) as r:
        for line in r:
            requirement = line.strip()
            # Blank lines and comment lines do not name a package; the
            # trailing newline must not reach pip either.
            if not requirement or requirement.startswith("#"):
                continue
            # `pip.main` is not a supported API and is missing from
            # pip >= 10, so pip is run through the current interpreter.
            subprocess.call([sys.executable, "-m", "pip"] + pip_options + [requirement])
# Handle the optional command-line argument:
#   * a `submission.zip` archive is unpacked next to itself and then tested;
#   * a directory is tested in place when VIRTUAL_ENV is set, otherwise it is
#     compressed into `submission.zip` and the script exits.
# Without an argument the submission in the current directory is tested.
if len(sys.argv) == 2:
    arg = sys.argv[1].strip()
    if os.path.isfile(arg):
        # Only a correctly named zip archive can be marked. Any other file is
        # rejected explicitly instead of being ignored, which would silently
        # test the current directory.
        if not arg.endswith(".zip"):
            sys.exit("Not a zip archive: " + arg)
        if os.path.basename(arg) != archive_name:
            sys.exit("Archive must be named: " + archive_name)
        # Unpack zip for marking
        print("Unpacking archive")
        # The archive is extracted next to itself, into a folder named after
        # the archive without its extension.
        path_prefix = os.path.join(os.path.dirname(arg), os.path.splitext(archive_name)[0])
        with zipfile.ZipFile(arg, "r") as z:
            z.extractall(path_prefix)
        # Install requirements if Python
        python_requirements(path_prefix)
    # Directory passed as an argument
    elif os.path.isdir(arg):
        if VIRTUAL_ENV:
            # Test submission (directory rather than zip)
            print("VIRTUAL_ENV=True; testing directory.")
            path_prefix = arg
            # Install requirements if Python
            python_requirements(path_prefix)
        else:
            # Create zip for submission
            print("Creating zip for submission...")
            arg_files = os.listdir(arg)
            if "filter.java" not in arg_files and "filter.py" not in arg_files:
                sys.exit("Filter.{py,java} missing in indicated folder:\n " + arg)
            # Files of the marking package found at the top level of the
            # folder are not part of the submission.
            arg_files = [os.path.join(arg, i) for i in arg_files if i not in package_content]
            with zipfile.ZipFile(archive_name, "w", zipfile.ZIP_DEFLATED) as z:
                # `arg_files` doubles as a work stack: files are archived
                # under their path relative to `arg`, directories are
                # expanded into their entries.
                while arg_files:
                    f = arg_files.pop()
                    if os.path.isfile(f):
                        z.write(f, os.path.relpath(f, arg))
                    elif os.path.isdir(f):
                        arg_files += [os.path.join(f, i) for i in os.listdir(f)]
            print("Please upload: %s" % archive_name)
            sys.exit()
    else:
        # Neither a file nor a directory: refuse rather than fall through to
        # testing the current directory.
        sys.exit("No such file or directory: " + arg)
# Detect the submission language and prepare `execute`, the command template
# used to classify one email: its `%s` is replaced with the email's path.
# `filter.java` takes precedence over `filter.py` when both are present.
print("Testing on sample emails")
LANGUAGE = ""
if os.path.isfile(os.path.join(path_prefix, "filter.java")):
    LANGUAGE = "java"
    print("`filter.java` found!")
elif os.path.isfile(os.path.join(path_prefix, "filter.py")):
    LANGUAGE = "python"
    print("`filter.py` found!")
else:
    sys.exit("Neither `filter.java` nor `filter.py` was found in current directory!")

if LANGUAGE == "java":
    # Find all *.java* files
    java_files = [d for d in os.listdir(path_prefix) if d.endswith(".java")]
    # Class path entries are relative to the submission directory, which is
    # the working directory of both `javac` and `java`.
    jar_files = ["./", "./lib/"]
    jar_files += [d for d in os.listdir(path_prefix) if d.endswith(".jar")]
    lib_dir = os.path.join(path_prefix, "lib")
    if os.path.isdir(lib_dir):
        jar_files += [os.path.join("lib", d) for d in os.listdir(lib_dir) if d.endswith(".jar")]
    # The commands are run without a shell, so a literal "$CLASSPATH" would
    # never be expanded; read the variable from the environment instead.
    inherited_classpath = os.environ.get("CLASSPATH")
    if inherited_classpath:
        jar_files.append(inherited_classpath)
    # `os.pathsep` is ":" on POSIX and ";" on Windows.
    classpath = os.pathsep.join(jar_files)
    # An argument list (rather than a split string) keeps source file names
    # containing spaces intact.
    java_compile = ["javac", "-cp", classpath] + java_files
    comp = subprocess.Popen(java_compile, cwd=path_prefix, stdout=subprocess.PIPE)
    comp.communicate()
    if comp.returncode != 0:
        # Without compiled classes every classification would fail.
        sys.exit("Compilation of the Java sources failed (javac exit code %d)." % comp.returncode)
    # "%" is doubled because the template is later filled in with the `%`
    # operator. NOTE: the template is split on whitespace before being run,
    # so class path entries containing spaces are not supported.
    execute = "java -cp " + classpath.replace("%", "%%") + " filter %s"
elif LANGUAGE == "python":
    execute = "python filter.py %s"
# Run the filter on every email of every test directory and collect one
# textual report (confusion matrix, accuracy, timing) per directory in
# `evaluation_results`. An email's class is taken from its file name.
try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3

# Stand-in for the email path while the command template is split on
# whitespace; it contains no whitespace and cannot occur in a real argument.
path_placeholder = "\0"

evaluation_results = []
for ei, e in enumerate(emails_dir):
    evaluation = ""
    emails = []
    for d in os.listdir(e):
        if ("spam" in d or "ham" in d) and os.path.isfile(os.path.join(e, d)):
            emails.append(os.path.join(e, d))
        else:
            print("The test filename must either contain word *spam* or *ham* indicating its class!")
            print("The filter won't be tested on `%s` file." % os.path.join(e, d))
    if show_progress and progress_bar == "nyan":
        print("Test [%d/%d]" % (ei+1, len(emails_dir)))
        progress = NyanBar(tasks=100)
    # Confusion-matrix counters; "spam" is the positive class.
    tp, tn, fp, fn = 0, 0, 0, 0
    evaluation_start_time = time.time()
    for ii, i in enumerate(emails):
        if show_progress and progress_bar == "classic":
            print("[%d/%d] %.2d%% (%s)" % (ei+1, len(emails_dir), 100.*ii/len(emails), i))
        elif show_progress and progress_bar == "nyan":
            progress.update(100.*ii/len(emails))
        # Split the template first and insert the path afterwards, so that an
        # email path containing whitespace stays a single argument.
        email_path = os.path.abspath(i)
        current_email = [part.replace(path_placeholder, email_path)
                         for part in (execute % path_placeholder).split()]
        # A name containing both keywords counts as spam.
        if "spam" in os.path.split(i)[1]:
            ground_truth = "spam"
        elif "ham" in os.path.split(i)[1]:
            ground_truth = "ham"
        else:
            sys.exit("File %s has neither *ham* nor *spam* keyword in its name!" % i)
        exe = subprocess.Popen(current_email, cwd=path_prefix, stdout=subprocess.PIPE)
        output, error = exe.communicate()
        if not isinstance(output, str):
            # Python 3 pipes yield bytes; Python 2 output is already `str`.
            output = output.decode("utf-8", "replace")
        output = output.strip()
        if output == "spam" and ground_truth == "spam":
            if not VIRTUAL_ENV:
                print("%s correctly predicted as SPAM" % i)
            tp += 1
        elif output == "spam" and ground_truth == "ham":
            if not VIRTUAL_ENV:
                print("%s incorrectly predicted as SPAM" % i)
            fp += 1
        elif output == "ham" and ground_truth == "ham":
            if not VIRTUAL_ENV:
                print("%s correctly predicted as HAM" % i)
            tn += 1
        elif output == "ham" and ground_truth == "spam":
            if not VIRTUAL_ENV:
                print("%s incorrectly predicted as HAM" % i)
            fn += 1
        else:
            # Anything other than exactly "spam" or "ham" is not counted; the
            # operator chooses between skipping the email and aborting.
            print("Currently tested email is: %s" % i)
            print("The output of your program should be either *spam* or *ham*.")
            print("Current output:\n %s" % output)
            ri = read_line("(s) to skip; other keys to exit\n> ")
            if ri == "s":
                continue
            else:
                sys.exit()
    if show_progress and progress_bar == "classic":
        print("[%d/%d] %.2d%% (%s)" % (ei+1, len(emails_dir), 100, ""))
    elif show_progress and progress_bar == "nyan":
        progress.finish()
    # Print statistics
    evaluation += "Test %s" % os.path.split(e)[1]
    evaluation += """
Predicted: | SPAM | HAM
----------------------------
Ground Truth: | |
SPAM | %4d | %4d
HAM | %4d | %4d
""" % (tp, fn, fp, tn)
    classified = fp + fn + tp + tn
    if classified:
        acc = 100.0*(tp+tn)/classified
        evaluation += "Accuracy: %.2f" % (acc) + "%"
    else:
        # Empty test directory, or every email was skipped: accuracy is
        # undefined and must not raise ZeroDivisionError.
        evaluation += "Accuracy: n/a (no emails were classified)"
    evaluation_end_time = time.time()
    evaluation_time = evaluation_end_time - evaluation_start_time
    evaluation += "\nEvaluation time: %s seconds" % evaluation_time
    # Allowed time scales with the number of classified emails.
    if evaluation_time > 3600*classified/emails_an_hour:
        evaluation += "\nFAILED time test"
    print("\n" + evaluation)
    evaluation_results.append(evaluation)
# Save feedback: in VIRTUAL_ENV mode the marking criteria followed by all
# evaluation reports are written into the tested submission directory.
if VIRTUAL_ENV:
    feedback_path = os.path.join(path_prefix, feedback_file)
    report_separator = "\n\n++++++++++++++++++++++++++++++\n\n"
    with open(feedback_path, "w") as ff:
        ff.write(marking_criteria + report_separator.join(evaluation_results))