from itertools import product

import numpy as np
+ import numpy.ma as ma
from sklearn import linear_model
- from sklearn import preprocessing
from sklearn import cross_validation
+ from sklearn.feature_selection import chi2

+ from binary import validate_length, decode
from utils import alphabet, success, error

- def build_features(tks, cs):
-     features = []
-     for tk in tks:
-         fc = [float(tk.count(c))/len(tk) for c in cs]
-         fc2 = [float(tk.count(c1 + c2))/(len(tk)/2) for c1 in cs for c2 in cs]
-         fc3 = [1 if c == a else 0 for c in cs for a in tk]
-         f = fc + fc2 + fc3
-         features.append(f)
+ def build_character_features(tks, cs):
+     """
+     Build a set of features based on the tokens:
+       * unigram counts (number of occurrences of "A")
+       * bigram counts (number of occurrences of "AB")
+       * unigrams at position (whether "A" appears at the second position)
+
+     Return a matrix of size n_tokens x n_features.
+     """
+     f1_size = len(cs)
+     f2_size = len(cs) ** 2
+     f3_size = len(tks[0]) * len(cs)
+     n_features = f1_size + f2_size + f3_size
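+     # For example, with a 2-character alphabet and 4-character tokens this is
+     # 2 unigram + 4 bigram + 8 positional features, i.e. 14 features in total.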
+     features = np.ndarray(shape=(len(tks), n_features))
+     for i, tk in enumerate(tks):
+         # Unigram
+         features[i][:f1_size] = [float(tk.count(c))/len(tk) for c in cs]
+         fp = f1_size
+         # Bigram
+         features[i][fp:fp + f2_size] = [float(tk.count(c1 + c2))/(len(tk)/2)
+                                         for c1 in cs for c2 in cs]
+         fp += f2_size
+         # Unigram at position
+         features[i][fp:fp + f3_size] = [1 if c == a else 0
+                                         for c in cs for a in tk]
    feature_type = ["#" + str(c) for c in cs]
    feature_type.extend(["#" + str(c1) + str(c2) for c1 in cs for c2 in cs])
    feature_type.extend([str(c) + "@" + str(i) for c in cs for i in range(len(tk))])
    return features, feature_type

- def classify(f1, f2, verbose):
+ def build_binary_features(tks, cs, n=8):
+     """
+     Build a set of features based on the binary tokens:
+       * unigram counts (number of occurrences of "1")
+       * bigram counts (number of occurrences of "01")
+       * ...
+       * n-gram counts (number of occurrences of "01100011")
+       * unigrams at position (whether "1" appears at the second position)
+
+     Return a matrix of size n_tokens x n_features.
+     """
+     fn_size = 2 ** (n + 1) - 2
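+     # There are 2**l possible bit strings of length l, so the grams of
+     # lengths 1..n contribute 2**1 + ... + 2**n = 2**(n + 1) - 2 features.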
+     f3_size = len(tks[0]) * 2
+     n_features = fn_size + f3_size
+     features = np.ndarray(shape=(len(tks), n_features))
+     for i, tk in enumerate(tks):
+         # n-grams
+         fp = 0
+         for l in range(1, n + 1):
+             l_size = 2 ** l
+             features[i][fp:fp + l_size] = [float(tk.count("".join(g)))/len(tk)/l
+                                            for g in product("01", repeat=l)]
+             fp += l_size
+         # Unigram at position
+         features[i][fp:fp + f3_size] = [1 if c == a else 0
+                                         for c in cs for a in tk]
+     feature_type = []
+     for l in range(1, n + 1):
+         feature_type.extend(["#" + "".join(g) for g in product("01", repeat=l)])
+     feature_type.extend([str(c) + "@" + str(i) for c in cs for i in range(len(tk))])
+     return features, feature_type
+
+ def mask_features(X):
+     """
+     Return the column indices of features that are zero for every sample.
+     """
+     masked = []
+     for i, feature in enumerate(X.T):
+         if not feature.any():
+             masked.append(i)
+     return masked
+
+ def read_characters(tks1, tks2, encoding):
+     if not validate_length(tks1):
+         error("The first samples have different sizes")
+         return
+     if not validate_length(tks2):
+         error("The second samples have different sizes")
+         return
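+     # The alphabet is the set of characters seen across both sample sets.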
+     cs = list(alphabet(tks1 + tks2))
+     print "Alphabet contains", len(cs), "characters:", "".join(sorted(cs))
+     return tks1, tks2, cs
+
+ def read_binaries(tks1, tks2, encoding):
+     dtks1 = decode(tks1, encoding)
+     if not dtks1 or not validate_length(dtks1):
+         error("The first samples have different sizes")
+         return
+     dtks2 = decode(tks2, encoding)
+     if not dtks2 or not validate_length(dtks2):
+         error("The second samples have different sizes")
+         return
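+     # Expand every decoded byte to its 8-bit representation so the tokens
+     # become "0"/"1" strings that build_binary_features can consume.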
+     btks1 = ["".join([np.binary_repr(ord(c), width=8) for c in tk]) for tk in dtks1]
+     btks2 = ["".join([np.binary_repr(ord(c), width=8) for c in tk]) for tk in dtks2]
+     return btks1, btks2, "01"
+
+ def classify(f1, f2, encoding, verbose):
    # Read tokens
    print "Reading tokens"
    tks1 = f1.read().splitlines()
    tks2 = f2.read().splitlines()
-     cs = list(alphabet(tks1 + tks2))
-     print "Alphabet contains", len(cs), "characters:", "".join(sorted(cs))
+     reader = read_binaries if encoding else read_characters
+     dtks1, dtks2, cs = reader(tks1, tks2, encoding)
+     print "Size of samples:", len(dtks1), "and", len(dtks2)

    # Build features from both sets
    print "Building features"
-     f1, f_type1 = build_features(tks1, cs)
-     f2, f_type2 = build_features(tks2, cs)
+     feature_builder = build_binary_features if encoding else build_character_features
+     f1, f_type1 = feature_builder(dtks1, cs)
+     f2, f_type2 = feature_builder(dtks2, cs)
    assert len(f_type1) == len(f_type2)
-     print len(f_type1), "features have been generated"
-     target = [0,] * len(f1) + [1,] * len(f2)
+     X = np.concatenate((f1, f2))
+     print X.shape[1], "features have been generated"
+     print "Dropping empty features"
+     masked_features = mask_features(X)
+     X = np.delete(X, masked_features, 1)
+     f_type = np.delete(np.array(f_type1), masked_features)
+     print X.shape[1], "features have been kept"
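+     # Samples from the first file are labelled 0, samples from the second 1.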
+     target = np.concatenate([np.zeros(len(f1)), np.ones(len(f2))])

-     #print f_type1
-     #print f1[:2]
-     #print f2[:2]
-     #print target[:2]
+     # Running Chi2
+     #print u"Running features selection via \u03c7\u00b2"
+     #c2, pval = chi2(X, target)
+     #print list(sorted(pval))
+     #for i, pv in enumerate(pval):
+     #    if pv < 0.001:
+     #        print pv, f_type[i]

    # Cross validate (learn & test)
-     print "Cross-validating the model"
-     X = f1 + f2
+     print "Cross-validating the model"
    logistic = linear_model.LogisticRegression()
-     scores = cross_validation.cross_val_score(logistic, X, np.array(target), cv=5)
+     scores = cross_validation.cross_val_score(logistic, X, target, cv=5)
    acc = scores.mean()
    if acc > 0.9:
        print(success("Accuracy: %0.2f (+/- %0.2f)" % (acc, scores.std() * 2)))
@@ -59,23 +154,25 @@ def classify(f1, f2, verbose):
    ordered_coef = sorted(enumerate(logistic.coef_[0]), key=operator.itemgetter(1))
    if verbose:
        for i, c in ordered_coef:
-             print c, f_type1[i]
+             print c, f_type[i]
    else:
        for i, c in ordered_coef[:5]:
-             print c, f_type1[i]
+             print c, f_type[i]
        print "..."
        for i, c in ordered_coef[-5:]:
-             print c, f_type1[i]
+             print c, f_type[i]

def main():
    parser = argparse.ArgumentParser()
-     parser.add_argument("-v", "--verbose", 
+     parser.add_argument("-v", "--verbose",
                        help="increase output verbosity", action="count")
+     parser.add_argument("-e", "--encoding", action="store", default=None,
+                         help="specify the format of the tokens")
    parser.add_argument("file1", type=file, help="file1")
    parser.add_argument("file2", type=file, help="file2")
    args = parser.parse_args()
-
-     classify(args.file1, args.file2, args.verbose)
-
+
+     classify(args.file1, args.file2, args.encoding, args.verbose)
+
if __name__ == '__main__':
    main()