diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
index 5514d5a..c7f9887 100644
--- a/hlink/linking/model_exploration/link_step_train_test_models.py
+++ b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -535,7 +535,7 @@ def _run(self) -> None:
         # threshold matrix entries.
         threshold_matrix_size = len(threshold_test_results[0])
 
-        thresholded_metrics_df = _create_thresholded_metrics_df()
+        thresholded_metrics_df = pd.DataFrame()
         for i in range(threshold_matrix_size):
             print(f"Aggregate threshold matrix entry {i}")
             thresholded_metrics_df = _aggregate_per_threshold_results(
@@ -549,7 +549,7 @@ def _run(self) -> None:
             thresholded_metrics_df
         )
         _print_thresholded_metrics_df(
-            thresholded_metrics_df.sort_values(by="mcc_test_mean", ascending=False)
+            thresholded_metrics_df.sort_values(by="mcc_mean", ascending=False)
         )
 
         self._save_training_results(thresholded_metrics_df, self.task.spark)
@@ -835,17 +835,17 @@ def _aggregate_per_threshold_results(
     threshold_ratio = prediction_results[0].threshold_ratio
 
     # Pull out columns to be aggregated
-    precision_test = [
-        r.precision for r in prediction_results if not math.isnan(r.precision)
-    ]
-    recall_test = [r.recall for r in prediction_results if not math.isnan(r.recall)]
-    pr_auc_test = [r.pr_auc for r in prediction_results if not math.isnan(r.pr_auc)]
-    mcc_test = [r.mcc for r in prediction_results if not math.isnan(r.mcc)]
-
-    (precision_test_mean, precision_test_sd) = _compute_mean_and_stdev(precision_test)
-    (recall_test_mean, recall_test_sd) = _compute_mean_and_stdev(recall_test)
-    (pr_auc_test_mean, pr_auc_test_sd) = _compute_mean_and_stdev(pr_auc_test)
-    (mcc_test_mean, mcc_test_sd) = _compute_mean_and_stdev(mcc_test)
+    precision = [r.precision for r in prediction_results if not math.isnan(r.precision)]
+    recall = [r.recall for r in prediction_results if not math.isnan(r.recall)]
+    pr_auc = [r.pr_auc for r in prediction_results if not math.isnan(r.pr_auc)]
+    mcc = [r.mcc for r in prediction_results if not math.isnan(r.mcc)]
+    f_measure = [r.f_measure for r in prediction_results if not math.isnan(r.f_measure)]
+
+    (precision_mean, precision_sd) = _compute_mean_and_stdev(precision)
+    (recall_mean, recall_sd) = _compute_mean_and_stdev(recall)
+    (pr_auc_mean, pr_auc_sd) = _compute_mean_and_stdev(pr_auc)
+    (mcc_mean, mcc_sd) = _compute_mean_and_stdev(mcc)
+    (f_measure_mean, f_measure_sd) = _compute_mean_and_stdev(f_measure)
 
     new_desc = pd.DataFrame(
         {
@@ -853,14 +853,16 @@ def _aggregate_per_threshold_results(
             "parameters": [best_models[0].hyperparams],
             "alpha_threshold": [alpha_threshold],
            "threshold_ratio": [threshold_ratio],
-            "precision_test_mean": [precision_test_mean],
-            "precision_test_sd": [precision_test_sd],
-            "recall_test_mean": [recall_test_mean],
-            "recall_test_sd": [recall_test_sd],
-            "pr_auc_test_mean": [pr_auc_test_mean],
-            "pr_auc_test_sd": [pr_auc_test_sd],
-            "mcc_test_mean": [mcc_test_mean],
-            "mcc_test_sd": [mcc_test_sd],
+            "precision_mean": [precision_mean],
+            "precision_sd": [precision_sd],
+            "recall_mean": [recall_mean],
+            "recall_sd": [recall_sd],
+            "pr_auc_mean": [pr_auc_mean],
+            "pr_auc_sd": [pr_auc_sd],
+            "mcc_mean": [mcc_mean],
+            "mcc_sd": [mcc_sd],
+            "f_measure_mean": [f_measure_mean],
+            "f_measure_sd": [f_measure_sd],
         },
     )
 
@@ -905,23 +907,6 @@ def _load_thresholded_metrics_df_params(desc_df: pd.DataFrame) -> pd.DataFrame:
     return desc_df
 
 
-def _create_thresholded_metrics_df() -> pd.DataFrame:
-    return pd.DataFrame(
-        columns=[
-            "model",
-            "parameters",
-            "alpha_threshold",
-            "threshold_ratio",
-            "precision_test_mean",
-            "precision_test_sd",
-            "recall_test_mean",
-            "recall_test_sd",
-            "mcc_test_mean",
-            "mcc_test_sd",
-        ]
-    )
-
-
 def _custom_param_grid_builder(
     model_parameters: list[dict[str, Any]]
 ) -> list[dict[str, Any]]:
diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py
index 7414ef4..38ab80a 100644
--- a/hlink/tests/model_exploration_test.py
+++ b/hlink/tests/model_exploration_test.py
@@ -695,7 +695,7 @@ def test_step_2_train_random_forest_spark(
     tr = spark.table("model_eval_training_results").toPandas()
     print(f"training results {tr}")
     # assert tr.shape == (1, 18)
-    assert tr.query("model == 'random_forest'")["pr_auc_test_mean"].iloc[0] > 2.0 / 3.0
+    assert tr.query("model == 'random_forest'")["pr_auc_mean"].iloc[0] > 2.0 / 3.0
     assert tr.query("model == 'random_forest'")["maxDepth"].iloc[0] == 3
 
     # TODO probably remove these since we're not planning to test suspicious data anymore.
@@ -731,10 +731,10 @@ def test_step_2_train_logistic_regression_spark(
     tr = spark.table("model_eval_training_results").toPandas()
 
     # assert tr.count == 3
-    assert tr.shape == (1, 11)
+    assert tr.shape == (1, 13)
     # This is now 0.83333333333.... I'm not sure it's worth testing against
     # assert tr.query("model == 'logistic_regression'")["pr_auc_mean"].iloc[0] == 0.75
-    assert tr.query("model == 'logistic_regression'")["pr_auc_test_mean"].iloc[0] > 0.74
+    assert tr.query("model == 'logistic_regression'")["pr_auc_mean"].iloc[0] > 0.74
     assert (
         round(tr.query("model == 'logistic_regression'")["alpha_threshold"].iloc[0], 1)
         == 0.7
@@ -759,9 +759,8 @@ def test_step_2_train_decision_tree_spark(
 
     print(f"Decision tree results: {tr}")
 
-    # TODO This is 1,12 instead of 1,13, because the precision_test_mean column is dropped as it is NaN
-    assert tr.shape == (1, 13)
-    # assert tr.query("model == 'decision_tree'")["precision_test_mean"].iloc[0] > 0
+    assert tr.shape == (1, 15)
+    # assert tr.query("model == 'decision_tree'")["precision_mean"].iloc[0] > 0
     assert tr.query("model == 'decision_tree'")["maxDepth"].iloc[0] == 3
     assert tr.query("model == 'decision_tree'")["minInstancesPerNode"].iloc[0] == 1
     assert tr.query("model == 'decision_tree'")["maxBins"].iloc[0] == 7