from itertools import product

import numpy as np
+ import numpy.ma as ma
from sklearn import linear_model
- from sklearn import preprocessing
from sklearn import cross_validation
+ from sklearn.feature_selection import chi2

+ from binary import validate_length, decode
from utils import alphabet, success, error

- def build_features(tks, cs):
-     features = []
-     for tk in tks:
-         fc = [float(tk.count(c))/len(tk) for c in cs]
-         fc2 = [float(tk.count(c1 + c2))/(len(tk)/2) for c1 in cs for c2 in cs]
-         fc3 = [1 if c == a else 0 for c in cs for a in tk]
-         f = fc + fc2 + fc3
-         features.append(f)
+ def build_character_features(tks, cs):
+     """
+     Build a set of features based on the tokens:
+       * unigram counts (number of occurrences of "A")
+       * bigram counts (number of occurrences of "AB")
+       * unigrams at position (whether "A" appears at the second position)
+
+     Return a matrix of size n_tokens x n_features.
+     """
+     f1_size = len(cs)
+     f2_size = len(cs) ** 2
+     f3_size = len(tks[0]) * len(cs)
+     n_features = f1_size + f2_size + f3_size
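+     # For example, with a 2-character alphabet and 4-character tokens this is
+     # 2 unigram + 4 bigram + 8 positional features, i.e. 14 features in total.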
+     features = np.ndarray(shape=(len(tks), n_features))
+     for i, tk in enumerate(tks):
+         # Unigram
+         features[i][:f1_size] = [float(tk.count(c))/len(tk) for c in cs]
+         fp = f1_size
+         # Bigram
+         features[i][fp:fp + f2_size] = [float(tk.count(c1 + c2))/(len(tk)/2)
+                                         for c1 in cs for c2 in cs]
+         fp += f2_size
+         # Unigram at position
+         features[i][fp:fp + f3_size] = [1 if c == a else 0
+                                         for c in cs for a in tk]
    feature_type = ["#" + str(c) for c in cs]
    feature_type.extend(["#" + str(c1) + str(c2) for c1 in cs for c2 in cs])
    feature_type.extend([str(c) + "@" + str(i) for c in cs for i in range(len(tk))])
    return features, feature_type

- def classify(f1, f2, verbose):
+ def build_binary_features(tks, cs, n=8):
+     """
+     Build a set of features based on the binary tokens:
+       * unigram counts (number of occurrences of "1")
+       * bigram counts (number of occurrences of "01")
+       * ...
+       * n-gram counts (number of occurrences of "01100011")
+       * unigrams at position (whether "1" appears at the second position)
+
+     Return a matrix of size n_tokens x n_features.
+     """
+     fn_size = 2 ** (n + 1) - 2
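+     # There are 2**l possible bit strings of length l, so the grams of
+     # lengths 1..n contribute 2**1 + ... + 2**n = 2**(n + 1) - 2 features.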
+     f3_size = len(tks[0]) * 2
+     n_features = fn_size + f3_size
+     features = np.ndarray(shape=(len(tks), n_features))
+     for i, tk in enumerate(tks):
+         # n-grams
+         fp = 0
+         for l in range(1, n + 1):
+             l_size = 2 ** l
+             features[i][fp:fp + l_size] = [float(tk.count("".join(g)))/len(tk)/l
+                                            for g in product("01", repeat=l)]
+             fp += l_size
+         # Unigram at position
+         features[i][fp:fp + f3_size] = [1 if c == a else 0
+                                         for c in cs for a in tk]
+     feature_type = []
+     for l in range(1, n + 1):
+         feature_type.extend(["#" + "".join(g) for g in product("01", repeat=l)])
+     feature_type.extend([str(c) + "@" + str(i) for c in cs for i in range(len(tk))])
+     return features, feature_type
+
+ def mask_features(X):
+     """
+     Return the column indices of features that are zero for every sample.
+     """
+     masked = []
+     for i, feature in enumerate(X.T):
+         if not feature.any():
+             masked.append(i)
+     return masked
+
+ def read_characters(tks1, tks2, encoding):
+     if not validate_length(tks1):
+         error("The first samples have different sizes")
+         return
+     if not validate_length(tks2):
+         error("The second samples have different sizes")
+         return
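+     # The alphabet is the set of characters seen across both sample sets.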
+     cs = list(alphabet(tks1 + tks2))
+     print "Alphabet contains", len(cs), "characters:", "".join(sorted(cs))
+     return tks1, tks2, cs
+
+ def read_binaries(tks1, tks2, encoding):
+     dtks1 = decode(tks1, encoding)
+     if not dtks1 or not validate_length(dtks1):
+         error("The first samples have different sizes")
+         return
+     dtks2 = decode(tks2, encoding)
+     if not dtks2 or not validate_length(dtks2):
+         error("The second samples have different sizes")
+         return
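+     # Expand every decoded byte to its 8-bit representation so the tokens
+     # become "0"/"1" strings that build_binary_features can consume.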
+     btks1 = ["".join([np.binary_repr(ord(c), width=8) for c in tk]) for tk in dtks1]
+     btks2 = ["".join([np.binary_repr(ord(c), width=8) for c in tk]) for tk in dtks2]
+     return btks1, btks2, "01"
+
+ def classify(f1, f2, encoding, verbose):
    # Read tokens
    print "Reading tokens"
    tks1 = f1.read().splitlines()
    tks2 = f2.read().splitlines()
-     cs = list(alphabet(tks1 + tks2))
-     print "Alphabet contains", len(cs), "characters:", "".join(sorted(cs))
+     reader = read_binaries if encoding else read_characters
+     dtks1, dtks2, cs = reader(tks1, tks2, encoding)
+     print "Size of samples:", len(dtks1), "and", len(dtks2)

    # Build features from both sets
    print "Building features"
-     f1, f_type1 = build_features(tks1, cs)
-     f2, f_type2 = build_features(tks2, cs)
+     feature_builder = build_binary_features if encoding else build_character_features
+     f1, f_type1 = feature_builder(dtks1, cs)
+     f2, f_type2 = feature_builder(dtks2, cs)
    assert len(f_type1) == len(f_type2)
-     print len(f_type1), "features have been generated"
-     target = [0,] * len(f1) + [1,] * len(f2)
+     X = np.concatenate((f1, f2))
+     print X.shape[1], "features have been generated"
+     print "Dropping empty features"
+     masked_features = mask_features(X)
+     X = np.delete(X, masked_features, 1)
+     f_type = np.delete(np.array(f_type1), masked_features)
+     print X.shape[1], "features have been kept"
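+     # Samples from the first file are labelled 0, samples from the second 1.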
+     target = np.concatenate([np.zeros(len(f1)), np.ones(len(f2))])

-     #print f_type1
-     #print f1[:2]
-     #print f2[:2]
-     #print target[:2]
+     # Running Chi2
+     #print u"Running features selection via \u03c7\u00b2"
+     #c2, pval = chi2(X, target)
+     #print list(sorted(pval))
+     #for i, pv in enumerate(pval):
+     #    if pv < 0.001:
+     #        print pv, f_type[i]

    # Cross validate (learn & test)
-     print "Cross-validating the model"
-     X = f1 + f2
+     print "Cross-validating the model"
    logistic = linear_model.LogisticRegression()
-     scores = cross_validation.cross_val_score(logistic, X, np.array(target), cv=5)
+     scores = cross_validation.cross_val_score(logistic, X, target, cv=5)
    acc = scores.mean()
    if acc > 0.9:
        print(success("Accuracy: %0.2f (+/- %0.2f)" % (acc, scores.std() * 2)))
@@ -59,23 +154,25 @@ def classify(f1, f2, verbose):
    ordered_coef = sorted(enumerate(logistic.coef_[0]), key=operator.itemgetter(1))
    if verbose:
        for i, c in ordered_coef:
-             print c, f_type1[i]
+             print c, f_type[i]
    else:
        for i, c in ordered_coef[:5]:
-             print c, f_type1[i]
+             print c, f_type[i]
        print "..."
        for i, c in ordered_coef[-5:]:
-             print c, f_type1[i]
+             print c, f_type[i]

def main():
    parser = argparse.ArgumentParser()
-     parser.add_argument("-v", "--verbose", 
+     parser.add_argument("-v", "--verbose",
                        help="increase output verbosity", action="count")
+     parser.add_argument("-e", "--encoding", action="store", default=None,
+                         help="specify the format of the tokens")
    parser.add_argument("file1", type=file, help="file1")
    parser.add_argument("file2", type=file, help="file2")
    args = parser.parse_args()
-
-     classify(args.file1, args.file2, args.verbose)
-
+
+     classify(args.file1, args.file2, args.encoding, args.verbose)
+
if __name__ == '__main__':
    main()