Skip to content

Commit

Permalink
Merge pull request #104 from clemente-lab/influence-unittests
Browse files Browse the repository at this point in the history
Influence unittests
  • Loading branch information
kbpi314 authored Aug 5, 2020
2 parents e00b8a9 + 30aa006 commit 0f6a9bd
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 127 deletions.
6 changes: 3 additions & 3 deletions cutie/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ def resample1_cutie_pc(var1_index, var2_index, samp_var1, samp_var2, **kwargs):
def cookd(var1_index, var2_index, samp_var1, samp_var2, **kwargs):
"""
Takes a given var1 and var2 by indices and recomputes Cook's D for each i-th
sample. (UT in test_pointwise_metrics)
sample. Not intended to be used with nan data. (UT in test_pointwise_metrics)
----------------------------------------------------------------------------
INPUTS
var1_index - Integer. Index for variable from file 1 in pairwise correlation.
Expand Down Expand Up @@ -310,7 +310,7 @@ def cookd(var1_index, var2_index, samp_var1, samp_var2, **kwargs):
def dffits(var1_index, var2_index, samp_var1, samp_var2, **kwargs):
"""
Takes a given var1 and var2 by indices and recomputes DFFITS for each i-th
sample. (UT in test_pointwise_metrics)
sample. Not intended to be used with nan data. (UT in test_pointwise_metrics)
----------------------------------------------------------------------------
INPUTS
var1_index - Integer. Index for variable from file 1 in pairwise correlation.
Expand Down Expand Up @@ -365,7 +365,7 @@ def dffits(var1_index, var2_index, samp_var1, samp_var2, **kwargs):
def dsr(var1_index, var2_index, samp_var1, samp_var2, **kwargs):
"""
Takes a given var1 and var2 by indices and recomputes DFFITS for each i-th
sample. (UT in test_pointwise_metrics)
sample. Not intended to be used with nan data. (UT in test_pointwise_metrics)
----------------------------------------------------------------------------
INPUTS
var1_index - Integer. Index for variable from file 1 in pairwise correlation.
Expand Down
173 changes: 49 additions & 124 deletions tests/test_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,42 +186,6 @@ def setUp(self):
# specific to pearson, paired var1 = 1 var2 = 2 with bonferonni
self.var1, self.var2 = 1, 2
self.sign = np.sign(self.correlations[self.var1][self.var2])
self.pointwise_results = {
'p': {
'cutie_1pc': (np.array([1., 1., 1., 1., 1.]),
np.array([0., 0., 0., 0., 0.]),
np.array([-0.58387421, -0.65710223, -0.6882472 , -0.54772256, -0.51298918]),
np.array([0.41612579, 0.34289777, 0.3117528 , 0.45227744, 0.48701082])),
'cookd': (np.array([0., 0., 0., 0., 0.]),
np.array([0., 0., 0., 0., 1.]),
np.array([5.27697119e-02, 1.44618414e-01, 4.84502508e-01, 7.29729730e-03, 8.51351351e+00]),
np.array([0.94945921, 0.87104214, 0.65714228, 0.99274683, 0.05797719])),
'dffits': (np.array([0., 0., 0., 0., 0.]),
np.array([0., 0., 1., 0., 1.]),
np.array([-0.27560574, -0.55227095, 2.28912231, -0.09944903, 3.72104204]),
np.array([1.26491106, 1.26491106, 1.26491106, 1.26491106, 1.26491106])),
'dsr': (np.array([0., 0., 0., 0., 0.]),
np.array([0., 0., 1., 0., 0.]),
np.array([-0.39893758, -1.07867081, 3.77134948, -0.18156826, 0.66299354]),
np.array([-0.39893758, -1.07867081, 3.77134948, -0.18156826, 0.66299354]))},
'r': {
'cutie_1pc': (np.array([1., 1., 1., 1., 1.]),
np.array([0., 0., 0., 0., 0.]),
np.array([-0.58387421, -0.65710223, -0.6882472 , -0.54772256, -0.51298918]),
np.array([0.41612579, 0.34289777, 0.3117528 , 0.45227744, 0.48701082])),
'cookd': (np.array([0., 0., 0., 0., 0.]),
np.array([0., 0., 0., 0., 1.]),
np.array([5.27697119e-02, 1.44618414e-01, 4.84502508e-01, 7.29729730e-03, 8.51351351e+00]),
np.array([0.94945921, 0.87104214, 0.65714228, 0.99274683, 0.05797719])),
'dffits': (np.array([0., 0., 0., 0., 0.]),
np.array([0., 0., 1., 0., 1.]),
np.array([-0.27560574, -0.55227095, 2.28912231, -0.09944903, 3.72104204]),
np.array([1.26491106, 1.26491106, 1.26491106, 1.26491106, 1.26491106])),
'dsr': (np.array([0., 0., 0., 0., 0.]),
np.array([0., 0., 1., 0., 0.]),
np.array([-0.39893758, -1.07867081, 3.77134948, -0.18156826, 0.66299354]),
np.array([-0.39893758, -1.07867081, 3.77134948, -0.18156826, 0.66299354]))}
}

self.infln_metrics = ['cutie_1pc', 'cookd', 'dffits', 'dsr']
self.infln_mapping = {
Expand Down Expand Up @@ -397,62 +361,11 @@ def setUp(self):
self.test_dir = os.path.abspath(os.path.dirname(__file__))
self.work_dir = os.path.join(self.test_dir, 'test_data/')

self.pointwise_results = {}
for p in ['p', 'r']:
self.pointwise_results[p] = {}
for f in self.infln_mapping:
x_old = self.samp_var1[:, self.var1]
y_old = self.samp_var2[:, self.var2]

# remove nan for influence calculation
var1_values, var2_values = utils.remove_nans(x_old, y_old)
influence = statistics.return_influence(var1_values, var2_values)

#assert_almost_equal(self.pointwise_results[p][f],
arr_0, arr_1, arr_2, arr_3 = self.infln_mapping[f](var1_index=self.var1,
var2_index=self.var2, samp_var1=self.samp_var1,
samp_var2=self.samp_var2, influence=influence,
threshold=self.threshold[p], fold=self.fold,
fold_value=self.fold_value[p], param=p)

fp = self.work_dir + '_'.join(['normal', p, f, '.npz'])
np.savez(fp, arr_0, arr_1, arr_2, arr_3)
results = []
for key, value in np.load(fp).iteritems():
results.append(value)
np.savetxt(self.work_dir + '_'.join['normal', p, f, key + '.txt', value)
self.pointwise_results[p][f] = results

# setup pointwise results with intermediate files for negnan
tuples = [(0,0),(0,1)]

self.nan_neg_pointwise_results = {}
for t in tuples:
t1, t2, = t
self.nan_neg_pointwise_results[str(t)] = {}
for p in ['p', 'r']:
self.nan_neg_pointwise_results[str(t)][p] = {}
for f in self.infln_mapping:
var1_values = self.nanneg_samp_var1[:, t1]
var2_values = self.nanneg_samp_var2[:, t2]
tuplesP = list(itertools.permutations(list(range(len(self.samp_var1[0,:]))), 2))
tuplesC = list(itertools.combinations_with_replacement(list(range(len(self.samp_var2[0,:]))), 2))
self.tuples = sorted(list(set(tuplesP) | set(tuplesC)))

# remove nan for influence calculation
# var1_values, var2_values = utils.remove_nans(x_old, y_old)
influence = statistics.return_influence(var1_values, var2_values)

arr_0, arr_1, arr_2, arr_3 = self.infln_mapping[f](var1_index=t1,
var2_index=t2, samp_var1=self.nanneg_samp_var1,
samp_var2=self.nanneg_samp_var2, influence=influence,
threshold=self.threshold[p], fold=self.fold,
fold_value=self.fold_value[p], param=p)

fp = self.work_dir + '_'.join(['nanneg', str(t), p, f, '.npz'])
np.savez(fp, arr_0, arr_1, arr_2, arr_3)
results = []
for key, value in np.load(fp).iteritems():
results.append(value)
np.savetxt(self.work_dir + '_'.join['nanneg', str(t), p, f, key + '.txt', value)
self.nan_neg_pointwise_results[str(t)][p][f] = results

def test_compute_pc(self):
assert_almost_equal((np.nan, np.nan), statistics.compute_pc(self.undef_corr[0],
Expand Down Expand Up @@ -535,43 +448,55 @@ def test_calculate_FP_sets(self):
self.samp_var1, self.samp_var2, self.infln_metrics, self.infln_mapping,
self.threshold[p], True, self.fold_value[p], p)


def test_pointwise_metrics(self):
    """Exercise the pointwise influence metrics (cutie_1pc, cookd, dffits, dsr).

    Phase 1 computes each metric for every variable-index pair in
    self.tuples and both parameter modes ('p' and 'r'), persisting the
    four returned arrays as a compressed .npz plus per-array .txt files
    in self.work_dir.  Phase 2 recomputes the same metrics — including on
    columns mixed with nan and negative values as defined in setUp — and
    asserts the fresh results match the saved ones.

    NOTE(review): phase 1 passes `influence` built from nan-stripped
    columns, while phase 2 builds it from the raw columns; the metrics'
    docstrings say they are not intended for nan data — confirm the raw
    columns here are nan-free for the pairs tested.
    """
    # Phase 1: generate reference results and write intermediate files.
    pointwise_results = {}
    for t in self.tuples:
        t1, t2 = t
        pointwise_results[str(t)] = {}

        for p in ['p', 'r']:
            pointwise_results[str(t)][p] = {}

            for f in self.infln_mapping:
                x_old = self.samp_var1[:, t1]
                y_old = self.samp_var2[:, t2]

                # Strip nan entries before fitting the influence model.
                var1_values, var2_values = utils.remove_nans(x_old, y_old)
                influence = statistics.return_influence(var1_values,
                                                        var2_values)

                arr_0, arr_1, arr_2, arr_3 = self.infln_mapping[f](
                    var1_index=t1, var2_index=t2,
                    samp_var1=self.samp_var1, samp_var2=self.samp_var2,
                    influence=influence, threshold=self.threshold[p],
                    fold=self.fold, fold_value=self.fold_value[p], param=p)

                # Save results to a compressed archive, then mirror each
                # stored array out to a plain-text file.
                fp = self.work_dir + '_'.join([str(t), p, f, '.npz'])
                np.savez(fp, arr_0, arr_1, arr_2, arr_3)
                results = []
                for key, value in np.load(fp).items():
                    results.append(value)
                    np.savetxt(
                        self.work_dir + '_'.join([str(t), p, f,
                                                  key + '.txt']),
                        value)

                pointwise_results[str(t)][p][f] = results

    # Phase 2: recompute cutie, cookd, dffits, dsr for every pair —
    # inputs include columns mixed with nan and neg values (see setUp) —
    # and compare against the saved reference results.
    for t in self.tuples:
        t1, t2 = t
        var1_values = self.samp_var1[:, t1]
        var2_values = self.samp_var2[:, t2]
        influence = statistics.return_influence(var1_values, var2_values)

        for p in ['p', 'r']:
            for f in self.infln_mapping:
                results = self.infln_mapping[f](
                    var1_index=t1, var2_index=t2,
                    samp_var1=self.samp_var1, samp_var2=self.samp_var2,
                    influence=influence, threshold=self.threshold[p],
                    fold=self.fold, fold_value=self.fold_value[p], param=p)
                # assert_almost_equal compares to 7 decimal places by
                # default, which is what we rely on here.
                assert_almost_equal(pointwise_results[str(t)][p][f],
                                    results)

def test_pointwise_comparison(self):
for p in ['p', 'r']:
Expand Down

0 comments on commit 0f6a9bd

Please sign in to comment.