Added template for the defeat learners project

bhrolenok · bhrolenok · commit ddb7f6feb685 · 2017-01-11T16:48:10.000-05:00
diff --git a/mc3h1_defeat_learners/LinRegLearner.py b/mc3h1_defeat_learners/LinRegLearner.py
@@ -0,0 +1,35 @@
+"""
+A simple wrapper for linear regression.  (c) 2015 Tucker Balch
+"""
+
+import numpy as np
+
+class LinRegLearner(object):
+    """Ordinary least-squares linear model with a constant term."""
+    def __init__(self, verbose = False):
+        pass # nothing to set up; verbose is accepted but never read
+
+    def addEvidence(self,dataX,dataY):
+        """
+        @summary: Fit the linear model to training data
+        @param dataX: 2-D array of X values, one training sample per row
+        @param dataY: the Y training value for each row of dataX
+        """
+        n_rows, n_cols = dataX.shape[0], dataX.shape[1]
+        # the trailing column stays all 1s so the fit includes a constant term
+        augmented = np.ones([n_rows, n_cols + 1])
+        augmented[:, :n_cols] = dataX
+        # solve the least-squares problem and keep only the coefficients
+        self.model_coefs = np.linalg.lstsq(augmented, dataY)[0]
+
+    def query(self,points):
+        """
+        @summary: Predict a value for each query point with the fitted model.
+        @param points: numpy array with one query per row.
+        @returns one estimate per row of points.
+        """
+        weights, intercept = self.model_coefs[:-1], self.model_coefs[-1]
+        return (weights * points).sum(axis = 1) + intercept
+
+if __name__ == "__main__":
+    print("the secret clue is 'zzyzx'")  # one-argument call: same output on Python 2 and 3
diff --git a/mc3h1_defeat_learners/RTLearner.py b/mc3h1_defeat_learners/RTLearner.py
@@ -0,0 +1,42 @@
+"""
+A FAKE Random Tree Learner.  (c) 2016 Tucker Balch
+This is just a linear regression learner named RTLearner so that
+it can be used as a template or placeholder by testbest4.  You should
+replace this code with your own RTLearner.
+"""
+
+import numpy as np
+
+class RTLearner(object):
+    """Placeholder "random tree": a least-squares linear fit plus query noise."""
+    def __init__(self, verbose = False, leaf_size = 1):
+        pass # verbose and leaf_size are accepted but unused by this stand-in
+
+    def addEvidence(self,dataX,dataY):
+        """
+        @summary: Add training data to learner
+        @param dataX: X values of data to add (2-D array, one sample per row)
+        @param dataY: the Y training values
+        """
+
+        # slap on 1s column so linear regression finds a constant term
+        newdataX = np.ones([dataX.shape[0],dataX.shape[1]+1])
+        newdataX[:,0:dataX.shape[1]]=dataX
+
+        # build and save the model; the last coefficient is the constant term
+        self.model_coefs, residuals, rank, s = np.linalg.lstsq(newdataX, dataY)
+
+    def query(self,points):
+        """
+        @summary: Estimate a set of test points given the model we built.
+        @param points: should be a numpy array with each row corresponding to a specific query.
+        @returns the linear estimates, each perturbed by Gaussian noise (std 0.09).
+        """
+        # get the linear result
+        ret_val = (self.model_coefs[:-1] * points).sum(axis = 1) + self.model_coefs[-1]
+        # add some random noise, one independent draw per query row
+        ret_val = ret_val + 0.09 * np.random.normal(size = ret_val.shape[0])
+        return ret_val
+
+if __name__ == "__main__":
+    print("get me a shrubbery")  # one-argument call: same output on Python 2 and 3
diff --git a/mc3h1_defeat_learners/feedback/.gitignore b/mc3h1_defeat_learners/feedback/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
diff --git a/mc3h1_defeat_learners/gen_data.py b/mc3h1_defeat_learners/gen_data.py
@@ -0,0 +1,21 @@
+"""
+template for generating data to fool learners (c) 2016 Tucker Balch
+"""
+
+import numpy as np
+import math
+
+# this function should return a dataset (X and Y) that will work
+# better for linear regression than random trees
+def best4LinReg():
+    features = np.random.normal(0.0, 1.0, (100, 4))  # 100 rows x 4 columns of N(0, 1) draws
+    targets = np.cos(1. / (features[:, 0] ** 2 + 0.0001)) * np.sin(features[:, 1])  # columns 0 and 1 only
+    return features, targets
+
+def best4RT():
+    samples = np.random.normal(0.0, 1.0, (50, 2))  # 50 rows x 2 columns of N(0, 1) draws
+    targets = samples[:, 0] * 0.8 + samples[:, 1] * 5.0  # exactly linear in both columns
+    return samples, targets
+
+if __name__ == "__main__":
+    print("they call me Tim.")  # one-argument call: same output on Python 2 and 3
diff --git a/mc3h1_defeat_learners/testbest4.py b/mc3h1_defeat_learners/testbest4.py
@@ -0,0 +1,81 @@
+"""
+Test best4 data generator.  (c) 2016 Tucker Balch
+"""
+
+import numpy as np
+import math
+import LinRegLearner as lrl
+import RTLearner as rt
+from gen_data import best4LinReg, best4RT
+
+# compare two learners' rmse out of sample
+def compare_os_rmse(learner1, learner2, X, Y):
+    """Train both learners on a random 60% of (X, Y); return their RMSEs on the rest."""
+    # the row count must be an int: on Python 2 math.floor returns a float,
+    # which numpy deprecates (and newer releases reject) as a sample size
+    train_rows = int(math.floor(0.6* X.shape[0]))
+
+    # separate out training and testing data
+    train = np.random.choice(X.shape[0], size=train_rows, replace=False)
+    test = np.setdiff1d(np.arange(X.shape[0]), train)
+    trainX = X[train, :]
+    trainY = Y[train]
+    testX = X[test, :]
+    testY = Y[test]
+
+    # train both learners on the same rows
+    learner1.addEvidence(trainX, trainY)
+    learner2.addEvidence(trainX, trainY)
+
+    # evaluate learner1 out of sample
+    predY = learner1.query(testX) # get the predictions
+    rmse1 = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0])
+
+    # evaluate learner2 out of sample
+    predY = learner2.query(testX) # get the predictions
+    rmse2 = math.sqrt(((testY - predY) ** 2).sum()/testY.shape[0])
+
+    return rmse1, rmse2
+
+def test_code():
+    """Compare LinRegLearner and RTLearner on both best4 datasets and print pass/fail."""
+    # create two learners and get data
+    lrlearner = lrl.LinRegLearner(verbose = False)
+    rtlearner = rt.RTLearner(verbose = False, leaf_size = 1)
+    X, Y = best4LinReg()
+
+    # compare the two learners
+    rmseLR, rmseRT = compare_os_rmse(lrlearner, rtlearner, X, Y)
+
+    # share results (one-argument print(...) gives the same text on Python 2 and 3)
+    print("")
+    print("best4LinReg() results")
+    print("RMSE LR    :  %s" % rmseLR)
+    print("RMSE RT    :  %s" % rmseRT)
+    if rmseLR < 0.9 * rmseRT:
+        print("LR < 0.9 RT:  pass")
+    else:
+        print("LR < 0.9 RT:  fail")
+    print("")
+
+    # get data that is best for a random tree
+    lrlearner = lrl.LinRegLearner(verbose = False)
+    rtlearner = rt.RTLearner(verbose = False, leaf_size = 1)
+    X, Y = best4RT()
+
+    # compare the two learners
+    rmseLR, rmseRT = compare_os_rmse(lrlearner, rtlearner, X, Y)
+
+    # share results (one-argument print(...) gives the same text on Python 2 and 3)
+    print("")
+    print("best4RT() results")
+    print("RMSE LR    :  %s" % rmseLR)
+    print("RMSE RT    :  %s" % rmseRT)
+    if rmseRT < 0.9 * rmseLR:
+        print("RT < 0.9 LR:  pass")
+    else:
+        print("RT < 0.9 LR:  fail")
+    print("")
+
+if __name__ == "__main__":  # run the comparison only when executed as a script
+    test_code()