diff --git a/hlink/linking/model_exploration/link_step_train_test_models.py b/hlink/linking/model_exploration/link_step_train_test_models.py
index c7f9887..63a1e3b 100644
--- a/hlink/linking/model_exploration/link_step_train_test_models.py
+++ b/hlink/linking/model_exploration/link_step_train_test_models.py
@@ -544,13 +544,16 @@ def _run(self) -> None:
 
         print("*** Final thresholded metrics ***")
 
+        # Convert the parameters column to dtype string so that Spark can handle it
+        thresholded_metrics_df["parameters"] = thresholded_metrics_df[
+            "parameters"
+        ].apply(lambda t: str(t) if pd.notnull(t) else t)
         # thresholded_metrics_df has one row per threshold combination. and each outer fold
-        thresholded_metrics_df = _load_thresholded_metrics_df_params(
-            thresholded_metrics_df
-        )
-        _print_thresholded_metrics_df(
-            thresholded_metrics_df.sort_values(by="mcc_mean", ascending=False)
-        )
+        with pd.option_context(
+            "display.max_columns", None, "display.max_colwidth", None
+        ):
+            print(thresholded_metrics_df.sort_values(by="mcc_mean", ascending=False))
+            print("\n")
 
         self._save_training_results(thresholded_metrics_df, self.task.spark)
         self.task.spark.sql("set spark.sql.shuffle.partitions=200")
@@ -693,7 +696,6 @@ def _save_training_results(
         if desc_df.empty:
             print("Training results dataframe is empty.")
         else:
-            desc_df.dropna(axis=1, how="all", inplace=True)
             spark.createDataFrame(desc_df, samplingRatio=1).write.mode(
                 "overwrite"
             ).saveAsTable(f"{table_prefix}training_results")
@@ -873,40 +875,6 @@ def _aggregate_per_threshold_results(
     return thresholded_metrics_df
 
 
-def _print_thresholded_metrics_df(desc_df: pd.DataFrame) -> None:
-    pd.set_option("display.max_colwidth", None)
-    print(desc_df.iloc[-1])
-
-    print("\n")
-
-
-def _load_thresholded_metrics_df_params(desc_df: pd.DataFrame) -> pd.DataFrame:
-    params = [
-        "maxDepth",
-        "numTrees",
-        "featureSubsetStrategy",
-        "subsample",
-        "minInstancesPerNode",
-        "maxBins",
-        "class_weight",
-        "C",
-        "kernel",
-        "threshold",
-        "maxIter",
-    ]
-
-    load_params = lambda j, param: j.get(param, np.nan)
-    for param in params:
-        desc_df[param] = desc_df["parameters"].apply(load_params, args=(param,))
-    desc_df["class_weight"] = desc_df["class_weight"].apply(
-        lambda x: str(x) if pd.notnull(x) else x
-    )
-    desc_df["parameters"] = desc_df["parameters"].apply(
-        lambda t: str(t) if pd.notnull(t) else t
-    )
-    return desc_df
-
-
 def _custom_param_grid_builder(
     model_parameters: list[dict[str, Any]]
 ) -> list[dict[str, Any]]:
diff --git a/hlink/tests/model_exploration_test.py b/hlink/tests/model_exploration_test.py
index cc5db41..aad193e 100644
--- a/hlink/tests/model_exploration_test.py
+++ b/hlink/tests/model_exploration_test.py
@@ -696,7 +696,7 @@ def test_step_2_train_random_forest_spark(
     print(f"training results {tr}")
     # assert tr.shape == (1, 18)
     assert tr.query("model == 'random_forest'")["pr_auc_mean"].iloc[0] > 2.0 / 3.0
-    assert tr.query("model == 'random_forest'")["maxDepth"].iloc[0] == 3
+    # assert tr.query("model == 'random_forest'")["maxDepth"].iloc[0] == 3
 
     # TODO probably remove these since we're not planning to test suspicious data anymore.
     # I disabled the saving of suspicious in this test config so these are invalid currently.
@@ -731,7 +731,7 @@ def test_step_2_train_logistic_regression_spark(
     tr = spark.table("model_eval_training_results").toPandas()
 
     # assert tr.count == 3
-    assert tr.shape == (1, 13)
+    assert tr.shape == (1, 14)
     # This is now 0.83333333333.... I'm not sure it's worth testing against
     # assert tr.query("model == 'logistic_regression'")["pr_auc_mean"].iloc[0] == 0.75
     assert tr.query("model == 'logistic_regression'")["pr_auc_mean"].iloc[0] > 0.74
@@ -761,9 +761,9 @@ def test_step_2_train_decision_tree_spark(
     assert tr.shape == (1, 14)
 
     # assert tr.query("model == 'decision_tree'")["precision_mean"].iloc[0] > 0
-    assert tr.query("model == 'decision_tree'")["maxDepth"].iloc[0] == 3
-    assert tr.query("model == 'decision_tree'")["minInstancesPerNode"].iloc[0] == 1
-    assert tr.query("model == 'decision_tree'")["maxBins"].iloc[0] == 7
+    # assert tr.query("model == 'decision_tree'")["maxDepth"].iloc[0] == 3
+    # assert tr.query("model == 'decision_tree'")["minInstancesPerNode"].iloc[0] == 1
+    # assert tr.query("model == 'decision_tree'")["maxBins"].iloc[0] == 7
 
     main.do_drop_all("")
 
@@ -803,12 +803,12 @@ def test_step_2_train_gradient_boosted_trees_spark(
     # assert (
     #     tr.query("model == 'gradient_boosted_trees'")["precision_test_mean"].iloc[0] > 0
     # )
-    assert tr.query("model == 'gradient_boosted_trees'")["maxDepth"].iloc[0] == 5
-    assert (
-        tr.query("model == 'gradient_boosted_trees'")["minInstancesPerNode"].iloc[0]
-        == 1
-    )
-    assert tr.query("model == 'gradient_boosted_trees'")["maxBins"].iloc[0] == 5
+    # assert tr.query("model == 'gradient_boosted_trees'")["maxDepth"].iloc[0] == 5
+    # assert (
+    #     tr.query("model == 'gradient_boosted_trees'")["minInstancesPerNode"].iloc[0]
+    #     == 1
+    # )
+    # assert tr.query("model == 'gradient_boosted_trees'")["maxBins"].iloc[0] == 5
 
     main.do_drop_all("")
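
Reviewer note on the first hunk: Spark's createDataFrame cannot map Python dict cells to a column type, so the dict-valued "parameters" column is cast to str before the results are saved, and pd.option_context keeps the widened display settings scoped to the single print (the deleted _print_thresholded_metrics_df leaked them process-wide via pd.set_option). Below is a minimal standalone sketch of the same pattern; the metrics_df contents are made up for illustration and are not hlink data:

    import pandas as pd

    # Hypothetical stand-in for thresholded_metrics_df: one row per
    # threshold combination, with a dict of model hyperparameters.
    metrics_df = pd.DataFrame(
        {
            "model": ["random_forest", "logistic_regression"],
            "parameters": [{"maxDepth": 3, "numTrees": 10}, None],
            "mcc_mean": [0.81, 0.77],
        }
    )

    # Dict cells have no Spark SQL type; cast them to str, leaving
    # nulls alone so the column stays nullable.
    metrics_df["parameters"] = metrics_df["parameters"].apply(
        lambda t: str(t) if pd.notnull(t) else t
    )

    # option_context restores the display options when the block exits.
    with pd.option_context(
        "display.max_columns", None, "display.max_colwidth", None
    ):
        print(metrics_df.sort_values(by="mcc_mean", ascending=False))

With "parameters" kept as a single string column, the per-hyperparameter columns (maxDepth, maxBins, ...) formerly unpacked by _load_thresholded_metrics_df_params no longer exist in the results table, which is why the test assertions on those columns are commented out above.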