diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
index e5f4769..a2e65c5 100644
--- a/hlink/linking/model_exploration/link_step_train_test_models.py
+++ b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -572,8 +572,7 @@ def _run(self) -> None:
         thresholded_metrics_df = _create_thresholded_metrics_df()
         for i in range(threshold_matrix_size):
-            print(type(combined_test[i]))
-            print(combined_test[i])
+            print(f"Aggregate threshold matrix entry {i}")
             thresholded_metrics_df = _aggregate_per_threshold_results(
                 thresholded_metrics_df, combined_test[i], best_models
             )

@@ -1007,6 +1006,7 @@ def _aggregate_per_threshold_results(
     # training_results: list[ThresholdTestResult],
     best_models: list[ModelEval],
 ) -> pd.DataFrame:
+    # The threshold is the same for all entries in the lists
     alpha_threshold = prediction_results[0].alpha_threshold
     threshold_ratio = prediction_results[0].threshold_ratio

@@ -1015,16 +1015,17 @@ def _aggregate_per_threshold_results(
     precision_test = [
         r.precision for r in prediction_results if r.precision is not np.nan
     ]
-    recall_test = [r.recall for r in prediction_results if r.recall is not np.NaN]
-    pr_auc_test = [r.pr_auc for r in prediction_results]
-    mcc_test = [r.mcc for r in prediction_results]
+    recall_test = [r.recall for r in prediction_results if r.recall is not np.nan]
+    pr_auc_test = [r.pr_auc for r in prediction_results if r.pr_auc is not np.nan]
+    mcc_test = [r.mcc for r in prediction_results if r.mcc is not np.nan]

-    """
-    precision_train = [r.precision for r in training_results]
-    recall_train = [r.recall for r in training_results]
-    pr_auc_train = [r.pr_auc for r in training_results]
-    mcc_train = [r.mcc for r in training_results]
-    """
+    # Standard deviation requires at least two values
+    precision_test_sd = (
+        statistics.stdev(precision_test) if len(precision_test) > 1 else np.nan
+    )
+    recall_test_sd = statistics.stdev(recall_test) if len(recall_test) > 1 else np.nan
+    pr_auc_test_sd = statistics.stdev(pr_auc_test) if len(pr_auc_test) > 1 else np.nan
+    mcc_test_sd = statistics.stdev(mcc_test) if len(mcc_test) > 1 else np.nan

     new_desc = pd.DataFrame(
         {
@@ -1033,13 +1034,13 @@ def _aggregate_per_threshold_results(
             "alpha_threshold": [alpha_threshold],
             "threshold_ratio": [threshold_ratio],
             "precision_test_mean": [statistics.mean(precision_test)],
-            "precision_test_sd": [statistics.stdev(precision_test)],
+            "precision_test_sd": [precision_test_sd],
             "recall_test_mean": [statistics.mean(recall_test)],
-            "recall_test_sd": [statistics.stdev(recall_test)],
+            "recall_test_sd": [recall_test_sd],
             "pr_auc_test_mean": [statistics.mean(pr_auc_test)],
-            "pr_auc_test_sd": [statistics.stdev(pr_auc_test)],
+            "pr_auc_test_sd": [pr_auc_test_sd],
             "mcc_test_mean": [statistics.mean(mcc_test)],
-            "mcc_test_sd": [statistics.stdev(mcc_test)],
+            "mcc_test_sd": [mcc_test_sd],
         },
     )

@@ -1052,17 +1053,8 @@ def _aggregate_per_threshold_results(

 def _print_thresholded_metrics_df(desc_df: pd.DataFrame) -> None:
     pd.set_option("display.max_colwidth", None)
-    print(
-        desc_df.drop(
-            [
-                "recall_test_sd",
-                "recall_train_sd",
-                "precision_test_sd",
-                "precision_train_sd",
-            ],
-            axis=1,
-        ).iloc[-1]
-    )
+    print(desc_df.iloc[-1])
+    print("\n")


@@ -1105,17 +1097,7 @@ def _create_thresholded_metrics_df() -> pd.DataFrame:
             "recall_test_mean",
             "recall_test_sd",
             "mcc_test_mean",
-            "mcc_test_sd"
-            """
-            "precision_train_mean",
-            "precision_train_sd",
-            "recall_train_mean",
-            "recall_train_sd",
-            "pr_auc_mean",
-            "pr_auc_sd",
-            "mcc_train_mean",
-            "mcc_train_sd",
-            """,
+            "mcc_test_sd",
         ]
     )

diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py
index cc2e9c1..30bca92 100644
--- a/hlink/tests/model_exploration_test.py
+++ b/hlink/tests/model_exploration_test.py
@@ -725,7 +725,7 @@ def test_step_2_train_logistic_regression_spark(

     tr = spark.table("model_eval_training_results").toPandas()

-    assert tr.shape == (1, 9)
+    assert tr.shape == (1, 11)
     # This is now 0.83333333333.... I'm not sure it's worth testing against
     # assert tr.query("model == 'logistic_regression'")["pr_auc_mean"].iloc[0] == 0.75
     assert tr.query("model == 'logistic_regression'")["pr_auc_mean"].iloc[0] > 0.74
@@ -754,7 +754,7 @@ def test_step_2_train_decision_tree_spark(

     print(f"Decision tree results: {tr}")
     # TODO This is 1,12 instead of 1,13, because the precision_test_mean column is dropped as it is NaN
-    assert tr.shape == (1, 12)
+    assert tr.shape == (1, 13)
     # assert tr.query("model == 'decision_tree'")["precision_test_mean"].iloc[0] > 0
     assert tr.query("model == 'decision_tree'")["maxDepth"].iloc[0] == 3
     assert tr.query("model == 'decision_tree'")["minInstancesPerNode"].iloc[0] == 1