diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
index 5514d5a..c7f9887 100644
--- a/hlink/linking/model_exploration/link_step_train_test_models.py
+++ b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -535,7 +535,7 @@ def _run(self) -> None:
         # threshold matrix entries.
         threshold_matrix_size = len(threshold_test_results[0])
 
-        thresholded_metrics_df = _create_thresholded_metrics_df()
+        thresholded_metrics_df = pd.DataFrame()
         for i in range(threshold_matrix_size):
             print(f"Aggregate threshold matrix entry {i}")
             thresholded_metrics_df = _aggregate_per_threshold_results(
@@ -549,7 +549,7 @@ def _run(self) -> None:
             thresholded_metrics_df
         )
         _print_thresholded_metrics_df(
-            thresholded_metrics_df.sort_values(by="mcc_test_mean", ascending=False)
+            thresholded_metrics_df.sort_values(by="mcc_mean", ascending=False)
         )
 
         self._save_training_results(thresholded_metrics_df, self.task.spark)
@@ -835,17 +835,17 @@ def _aggregate_per_threshold_results(
     threshold_ratio = prediction_results[0].threshold_ratio
 
     # Pull out columns to be aggregated
-    precision_test = [
-        r.precision for r in prediction_results if not math.isnan(r.precision)
-    ]
-    recall_test = [r.recall for r in prediction_results if not math.isnan(r.recall)]
-    pr_auc_test = [r.pr_auc for r in prediction_results if not math.isnan(r.pr_auc)]
-    mcc_test = [r.mcc for r in prediction_results if not math.isnan(r.mcc)]
-
-    (precision_test_mean, precision_test_sd) = _compute_mean_and_stdev(precision_test)
-    (recall_test_mean, recall_test_sd) = _compute_mean_and_stdev(recall_test)
-    (pr_auc_test_mean, pr_auc_test_sd) = _compute_mean_and_stdev(pr_auc_test)
-    (mcc_test_mean, mcc_test_sd) = _compute_mean_and_stdev(mcc_test)
+    precision = [r.precision for r in prediction_results if not math.isnan(r.precision)]
+    recall = [r.recall for r in prediction_results if not math.isnan(r.recall)]
+    pr_auc = [r.pr_auc for r in prediction_results if not math.isnan(r.pr_auc)]
+    mcc = [r.mcc for r in prediction_results if not math.isnan(r.mcc)]
+    f_measure = [r.f_measure for r in prediction_results if not math.isnan(r.f_measure)]
+
+    (precision_mean, precision_sd) = _compute_mean_and_stdev(precision)
+    (recall_mean, recall_sd) = _compute_mean_and_stdev(recall)
+    (pr_auc_mean, pr_auc_sd) = _compute_mean_and_stdev(pr_auc)
+    (mcc_mean, mcc_sd) = _compute_mean_and_stdev(mcc)
+    (f_measure_mean, f_measure_sd) = _compute_mean_and_stdev(f_measure)
 
     new_desc = pd.DataFrame(
         {
@@ -853,14 +853,16 @@ def _aggregate_per_threshold_results(
             "parameters": [best_models[0].hyperparams],
             "alpha_threshold": [alpha_threshold],
            "threshold_ratio": [threshold_ratio],
-            "precision_test_mean": [precision_test_mean],
-            "precision_test_sd": [precision_test_sd],
-            "recall_test_mean": [recall_test_mean],
-            "recall_test_sd": [recall_test_sd],
-            "pr_auc_test_mean": [pr_auc_test_mean],
-            "pr_auc_test_sd": [pr_auc_test_sd],
-            "mcc_test_mean": [mcc_test_mean],
-            "mcc_test_sd": [mcc_test_sd],
+            "precision_mean": [precision_mean],
+            "precision_sd": [precision_sd],
+            "recall_mean": [recall_mean],
+            "recall_sd": [recall_sd],
+            "pr_auc_mean": [pr_auc_mean],
+            "pr_auc_sd": [pr_auc_sd],
+            "mcc_mean": [mcc_mean],
+            "mcc_sd": [mcc_sd],
+            "f_measure_mean": [f_measure_mean],
+            "f_measure_sd": [f_measure_sd],
         },
     )
 
@@ -905,23 +907,6 @@ def _load_thresholded_metrics_df_params(desc_df: pd.DataFrame) -> pd.DataFrame:
     return desc_df
 
 
-def _create_thresholded_metrics_df() -> pd.DataFrame:
-    return pd.DataFrame(
-        columns=[
-            "model",
-            "parameters",
-            "alpha_threshold",
-            "threshold_ratio",
-            "precision_test_mean",
-            "precision_test_sd",
-            "recall_test_mean",
-            "recall_test_sd",
-            "mcc_test_mean",
-            "mcc_test_sd",
-        ]
-    )
-
-
 def _custom_param_grid_builder(
     model_parameters: list[dict[str, Any]]
 ) -> list[dict[str, Any]]:
diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py
index 7414ef4..38ab80a 100644
--- a/hlink/tests/model_exploration_test.py
+++ b/hlink/tests/model_exploration_test.py
@@ -695,7 +695,7 @@ def test_step_2_train_random_forest_spark(
     tr = spark.table("model_eval_training_results").toPandas()
     print(f"training results {tr}")
     # assert tr.shape == (1, 18)
-    assert tr.query("model == 'random_forest'")["pr_auc_test_mean"].iloc[0] > 2.0 / 3.0
+    assert tr.query("model == 'random_forest'")["pr_auc_mean"].iloc[0] > 2.0 / 3.0
     assert tr.query("model == 'random_forest'")["maxDepth"].iloc[0] == 3
 
     # TODO probably remove these since we're not planning to test suspicious data anymore.
@@ -731,10 +731,10 @@ def test_step_2_train_logistic_regression_spark(
     tr = spark.table("model_eval_training_results").toPandas()
 
     # assert tr.count == 3
-    assert tr.shape == (1, 11)
+    assert tr.shape == (1, 13)
     # This is now 0.83333333333.... I'm not sure it's worth testing against
     # assert tr.query("model == 'logistic_regression'")["pr_auc_mean"].iloc[0] == 0.75
-    assert tr.query("model == 'logistic_regression'")["pr_auc_test_mean"].iloc[0] > 0.74
+    assert tr.query("model == 'logistic_regression'")["pr_auc_mean"].iloc[0] > 0.74
     assert (
         round(tr.query("model == 'logistic_regression'")["alpha_threshold"].iloc[0], 1)
         == 0.7
@@ -759,9 +759,8 @@ def test_step_2_train_decision_tree_spark(
 
     print(f"Decision tree results: {tr}")
 
-    # TODO This is 1,12 instead of 1,13, because the precision_test_mean column is dropped as it is NaN
-    assert tr.shape == (1, 13)
-    # assert tr.query("model == 'decision_tree'")["precision_test_mean"].iloc[0] > 0
+    assert tr.shape == (1, 15)
+    # assert tr.query("model == 'decision_tree'")["precision_mean"].iloc[0] > 0
     assert tr.query("model == 'decision_tree'")["maxDepth"].iloc[0] == 3
     assert tr.query("model == 'decision_tree'")["minInstancesPerNode"].iloc[0] == 1
     assert tr.query("model == 'decision_tree'")["maxBins"].iloc[0] == 7