
Commit 2a7c068

Add binary case for classification

1 parent ff21977 commit 2a7c068

1 file changed: classify.py (127 additions, 30 deletions)
@@ -4,51 +4,146 @@
 from itertools import product
 
 import numpy as np
+import numpy.ma as ma
 from sklearn import linear_model
-from sklearn import preprocessing
 from sklearn import cross_validation
+from sklearn.feature_selection import chi2
 
+from binary import validate_length, decode
 from utils import alphabet, success, error
 
-def build_features(tks, cs):
-    features = []
-    for tk in tks:
-        fc = [ float(tk.count(c))/len(tk) for c in cs ]
-        fc2 = [ float(tk.count(c1+c2))/(len(tk)/2) for c1 in cs for c2 in cs ]
-        fc3 = [ 1 if c == a else 0 for c in cs for a in tk ]
-        f = fc + fc2 + fc3
-        features.append(f)
+def build_character_features(tks, cs):
+    """
+    Build a set of features based on the tokens:
+      * unigram counts (number of "A")
+      * bigram counts (number of "AB")
+      * unigrams at position (e.g. whether "A" is at the second position)
+
+    Return a matrix of size n_tokens x n_features.
+    """
+    f1_size = len(cs)
+    f2_size = len(cs) ** 2
+    f3_size = len(tks[0]) * len(cs)
+    n_features = f1_size + f2_size + f3_size
+    features = np.ndarray(shape=(len(tks), n_features))
+    for i, tk in enumerate(tks):
+        # Unigram
+        features[i][:f1_size] = [ float(tk.count(c))/len(tk) for c in cs ]
+        fp = f1_size
+        # Bigram
+        features[i][fp:fp+f2_size] = [ float(tk.count(c1+c2))/(len(tk)/2)
+                                       for c1 in cs for c2 in cs ]
+        fp += f2_size
+        # Unigram at position
+        features[i][fp:fp+f3_size] = [ 1 if c == a else 0
+                                       for c in cs for a in tk ]
     feature_type = [ "#" + str(c) for c in cs ]
     feature_type.extend([ "#" + str(c1) + str(c2) for c1 in cs for c2 in cs ])
     feature_type.extend([ str(c) + "@" + str(i) for c in cs for i in range(len(tk))])
     return features, feature_type
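
To make the dimensions concrete: for an alphabet of k characters and tokens of length L, the matrix has k unigram columns, k**2 bigram columns, and L*k positional columns. A minimal sketch with a hypothetical 2-character alphabet and 4-character tokens:

cs = ["A", "B"]          # hypothetical alphabet
tks = ["ABAB", "BBAA"]   # hypothetical tokens, all the same length

f1_size = len(cs)                # 2 unigram columns: #A, #B
f2_size = len(cs) ** 2           # 4 bigram columns: #AA, #AB, #BA, #BB
f3_size = len(tks[0]) * len(cs)  # 8 positional columns: A@0..A@3, B@0..B@3
print(f1_size + f2_size + f3_size)  # 14 features per token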
 
-def classify(f1, f2, verbose):
+def build_binary_features(tks, cs, n=8):
+    """
+    Build a set of features based on the binary tokens:
+      * unigram counts (number of "1")
+      * bigram counts (number of "01")
+      * ...
+      * n-gram counts (number of "01100011")
+      * unigrams at position (e.g. whether "1" is at the second position)
+
+    Return a matrix of size n_tokens x n_features.
+    """
+    fn_size = 2**(n+1) - 2
+    f3_size = len(tks[0]) * 2
+    n_features = fn_size + f3_size
+    features = np.ndarray(shape=(len(tks), n_features))
+    for i, tk in enumerate(tks):
+        # n-grams of every length from 1 to n
+        fp = 0
+        for l in range(1, n+1):
+            l_size = 2**l
+            features[i][fp:fp+l_size] = [ float(tk.count("".join(g)))/len(tk)/l
+                                          for g in product("01", repeat=l) ]
+            fp += l_size
+        # Unigram at position
+        features[i][fp:fp+f3_size] = [ 1 if c == a else 0
+                                       for c in cs for a in tk ]
+    feature_type = []
+    for l in range(1, n+1):
+        feature_type.extend([ "#" + "".join(g) for g in product("01", repeat=l) ])
+    feature_type.extend([ str(c) + "@" + str(i) for c in cs for i in range(len(tk))])
+    return features, feature_type
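
The fn_size term counts every binary n-gram column at once: there are 2**l grams of each length l, and lengths 1 through n are all used, so the total is 2 + 4 + ... + 2**n = 2**(n+1) - 2. A quick check of that identity:

n = 8
fn_size = sum(2**l for l in range(1, n+1))
assert fn_size == 2**(n+1) - 2  # 510 n-gram columns when n = 8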
+
+def mask_features(X):
+    """
+    Mask features that are null for all samples.
+    """
+    masked = []
+    for i, feature in enumerate(X.T):
+        if not feature.any():
+            masked.append(i)
+    return masked
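
A small illustration of mask_features: a column that is zero for every sample is flagged by index, so that np.delete can drop it later. The import here assumes classify.py is on the Python path:

import numpy as np
from classify import mask_features  # assumes classify.py is importable

X = np.array([[0.5,  0.0, 1.0],
              [0.25, 0.0, 0.0]])
print(mask_features(X))  # [1]: column 1 is zero in every sample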
+
+def read_characters(tks1, tks2, encoding):
+    if not validate_length(tks1):
+        error("The first samples have different sizes")
+        return
+    if not validate_length(tks2):
+        error("The second samples have different sizes")
+        return
+    cs = list(alphabet(tks1 + tks2))
+    print "Alphabet contains", len(cs), "characters:", "".join(sorted(cs))
+    return tks1, tks2, cs
+
+def read_binaries(tks1, tks2, encoding):
+    dtks1 = decode(tks1, encoding)
+    if not dtks1 or not validate_length(dtks1):
+        error("The first samples have different sizes")
+        return
+    dtks2 = decode(tks2, encoding)
+    if not dtks2 or not validate_length(dtks2):
+        error("The second samples have different sizes")
+        return
+    btks1 = [ "".join([np.binary_repr(ord(c), width=8) for c in tk ]) for tk in dtks1 ]
+    btks2 = [ "".join([np.binary_repr(ord(c), width=8) for c in tk ]) for tk in dtks2 ]
+    return btks1, btks2, "01"
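
read_binaries expands each decoded character into its 8-bit representation, so an m-character token becomes an 8*m-character string over the alphabet "01". For example:

import numpy as np

tk = "Ab"
bits = "".join([ np.binary_repr(ord(c), width=8) for c in tk ])
print(bits)  # 0100000101100010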
+
+def classify(f1, f2, encoding, verbose):
     # Read tokens
     print "Reading tokens"
     tks1 = f1.read().splitlines()
     tks2 = f2.read().splitlines()
-    cs = list(alphabet(tks1 + tks2))
-    print "Alphabet contains", len(cs), "characters:", "".join(sorted(cs))
+    reader = read_binaries if encoding else read_characters
+    dtks1, dtks2, cs = reader(tks1, tks2, encoding)
+    print "Size of samples:", len(dtks1), "and", len(dtks2)
 
     # Build features from both sets
     print "Building features"
-    f1,f_type1 = build_features(tks1, cs)
-    f2,f_type2 = build_features(tks2, cs)
+    feature_builder = build_binary_features if encoding else build_character_features
+    f1,f_type1 = feature_builder(dtks1, cs)
+    f2,f_type2 = feature_builder(dtks2, cs)
     assert len(f_type1) == len(f_type2)
-    print len(f_type1), "features have been generated"
-    target = [0,] * len(f1) + [1,] * len(f2)
+    X = np.concatenate((f1, f2))
+    print X.shape[1], "features have been generated"
+    print "Dropping empty features"
+    masked_features = mask_features(X)
+    X = np.delete(X, masked_features, 1)
+    f_type = np.delete(np.array(f_type1), masked_features)
+    print X.shape[1], "features have been kept"
+    target = np.concatenate([np.zeros(len(f1)), np.ones(len(f2))])
 
-    #print f_type1
-    #print f1[:2]
-    #print f2[:2]
-    #print target[:2]
+    # Running Chi2
+    #print u"Running features selection via \u03c7\u00b2"
+    #c2, pval = chi2(X, target)
+    #print list(sorted(pval))
+    #for i, pv in enumerate(pval):
+    #    if pv < 0.001:
+    #        print pv, f_type[i]
 
     # Cross validate (learn & test)
-    print "Cross-validating the model"
-    X = f1 + f2
+    print "Cross-validating the model"
     logistic = linear_model.LogisticRegression()
-    scores = cross_validation.cross_val_score(logistic, X, np.array(target), cv=5)
+    scores = cross_validation.cross_val_score(logistic, X, target, cv=5)
     acc = scores.mean()
     if acc > 0.9:
         print(success("Accuracy: %0.2f (+/- %0.2f)" % (acc, scores.std() * 2)))
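
A side note on the API: sklearn.cross_validation is the pre-0.18 scikit-learn module; it was deprecated in 0.18 and removed in 0.20 in favor of sklearn.model_selection. A minimal sketch of the same cross-validation step under a newer scikit-learn, with stand-in random features:

import numpy as np
from sklearn import linear_model
from sklearn.model_selection import cross_val_score  # scikit-learn >= 0.18

X = np.random.rand(20, 5)  # stand-in for the real feature matrix
target = np.concatenate([np.zeros(10), np.ones(10)])
logistic = linear_model.LogisticRegression()
scores = cross_val_score(logistic, X, target, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))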
@@ -59,23 +154,25 @@ def classify(f1, f2, verbose):
     ordered_coef = sorted(enumerate(logistic.coef_[0]), key=operator.itemgetter(1))
     if verbose:
         for i, c in ordered_coef:
-            print c, f_type1[i]
+            print c, f_type[i]
     else:
         for i, c in ordered_coef[:5]:
-            print c, f_type1[i]
+            print c, f_type[i]
         print "..."
         for i, c in ordered_coef[-5:]:
-            print c, f_type1[i]
+            print c, f_type[i]
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("-v", "--verbose",
+    parser.add_argument("-v", "--verbose",
                         help="increase output verbosity", action="count")
+    parser.add_argument("-e", "--encoding", action="store", default=None,
+                        help="specify the format of the tokens")
     parser.add_argument("file1", type=file, help="file1")
     parser.add_argument("file2", type=file, help="file2")
     args = parser.parse_args()
-
-    classify(args.file1, args.file2, args.verbose)
-
+
+    classify(args.file1, args.file2, args.encoding, args.verbose)
+
 if __name__ == '__main__':
     main()
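
Typical invocations after this commit might look as follows; the file names are placeholders, and since the encoding values accepted by binary.decode are not shown in this diff, the "hex" value below is hypothetical:

python classify.py tokens_a.txt tokens_b.txt           # character features
python classify.py -e hex tokens_a.txt tokens_b.txt    # binary features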
