Commit

[#179] Don't automatically add or drop columns from thresholded metrics df
riley-harper committed Dec 12, 2024
1 parent b454276 commit bd934f5
Showing 2 changed files with 20 additions and 52 deletions.
50 changes: 9 additions & 41 deletions hlink/linking/model_exploration/link_step_train_test_models.py
```diff
@@ -544,13 +544,16 @@ def _run(self) -> None:
 
         print("*** Final thresholded metrics ***")
 
-        # thresholded_metrics_df has one row per threshold combination. and each outer fold
-        thresholded_metrics_df = _load_thresholded_metrics_df_params(
-            thresholded_metrics_df
-        )
-        _print_thresholded_metrics_df(
-            thresholded_metrics_df.sort_values(by="mcc_mean", ascending=False)
-        )
+        # Convert the parameters column to dtype string so that Spark can handle it
+        thresholded_metrics_df["parameters"] = thresholded_metrics_df[
+            "parameters"
+        ].apply(lambda t: str(t) if pd.notnull(t) else t)
+        with pd.option_context(
+            "display.max_columns", None, "display.max_colwidth", None
+        ):
+            print(thresholded_metrics_df.sort_values(by="mcc_mean", ascending=False))
+            print("\n")
 
         self._save_training_results(thresholded_metrics_df, self.task.spark)
         self.task.spark.sql("set spark.sql.shuffle.partitions=200")
```
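A note on the two new pieces of inline code: per the diff's own comment, the `parameters` column must be dtype string before the frame reaches `spark.createDataFrame`, since Spark struggles to infer a schema for a pandas column mixing Python dicts and nulls, and `pd.option_context` scopes the display settings to this one print instead of mutating global state the way the old `pd.set_option` call did. A minimal, self-contained sketch of the same pattern, using an invented sample frame:

```python
import pandas as pd

# Invented stand-in for thresholded_metrics_df: one row per model run,
# with raw parameter dicts (or nulls) in the "parameters" column.
df = pd.DataFrame(
    {
        "model": ["random_forest", "logistic_regression"],
        "mcc_mean": [0.81, 0.77],
        "parameters": [{"maxDepth": 3, "numTrees": 10}, None],
    }
)

# Stringify each dict, leaving nulls untouched, as the commit does.
df["parameters"] = df["parameters"].apply(lambda t: str(t) if pd.notnull(t) else t)

# Show every column at full width; both options revert when the block exits.
with pd.option_context("display.max_columns", None, "display.max_colwidth", None):
    print(df.sort_values(by="mcc_mean", ascending=False))
```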
```diff
@@ -693,7 +696,6 @@ def _save_training_results(
         if desc_df.empty:
             print("Training results dataframe is empty.")
         else:
-            desc_df.dropna(axis=1, how="all", inplace=True)
             spark.createDataFrame(desc_df, samplingRatio=1).write.mode(
                 "overwrite"
            ).saveAsTable(f"{table_prefix}training_results")
```
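For context, the deleted `dropna` call removed every all-NaN column in place, so the schema of the saved `training_results` table depended on which hyperparameters happened to be populated. A small illustration of that pandas behavior, written non-inplace for clarity:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"model": ["random_forest"], "mcc_mean": [0.81], "C": [np.nan]})

# The removed call: an all-NaN column such as "C" vanishes entirely,
# changing the columns of whatever table gets written afterward.
dropped = df.dropna(axis=1, how="all")

print(list(df.columns))       # ['model', 'mcc_mean', 'C']
print(list(dropped.columns))  # ['model', 'mcc_mean']
```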
```diff
@@ -873,40 +875,6 @@ def _aggregate_per_threshold_results(
     return thresholded_metrics_df
 
 
-def _print_thresholded_metrics_df(desc_df: pd.DataFrame) -> None:
-    pd.set_option("display.max_colwidth", None)
-    print(desc_df.iloc[-1])
-
-    print("\n")
-
-
-def _load_thresholded_metrics_df_params(desc_df: pd.DataFrame) -> pd.DataFrame:
-    params = [
-        "maxDepth",
-        "numTrees",
-        "featureSubsetStrategy",
-        "subsample",
-        "minInstancesPerNode",
-        "maxBins",
-        "class_weight",
-        "C",
-        "kernel",
-        "threshold",
-        "maxIter",
-    ]
-
-    load_params = lambda j, param: j.get(param, np.nan)
-    for param in params:
-        desc_df[param] = desc_df["parameters"].apply(load_params, args=(param,))
-    desc_df["class_weight"] = desc_df["class_weight"].apply(
-        lambda x: str(x) if pd.notnull(x) else x
-    )
-    desc_df["parameters"] = desc_df["parameters"].apply(
-        lambda t: str(t) if pd.notnull(t) else t
-    )
-    return desc_df
-
-
 def _custom_param_grid_builder(
     model_parameters: list[dict[str, Any]]
 ) -> list[dict[str, Any]]:
```
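With `_load_thresholded_metrics_df_params` gone, per-parameter columns such as `maxDepth` are no longer added to the metrics frame automatically. Anyone who still needs one can parse it back out of the stringified `parameters` column; a hypothetical helper (not part of hlink) might look like this:

```python
import ast

import numpy as np
import pandas as pd

def param_column(df: pd.DataFrame, name: str) -> pd.Series:
    """Hypothetical helper: extract one parameter from the stringified dicts."""
    return df["parameters"].apply(
        lambda s: ast.literal_eval(s).get(name, np.nan) if pd.notnull(s) else np.nan
    )

df = pd.DataFrame({"parameters": ["{'maxDepth': 3, 'maxBins': 7}", None]})
print(param_column(df, "maxDepth"))  # 3 for the first row, NaN for the second
```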
22 changes: 11 additions & 11 deletions hlink/tests/model_exploration_test.py
```diff
@@ -696,7 +696,7 @@ def test_step_2_train_random_forest_spark(
     print(f"training results {tr}")
     # assert tr.shape == (1, 18)
     assert tr.query("model == 'random_forest'")["pr_auc_mean"].iloc[0] > 2.0 / 3.0
-    assert tr.query("model == 'random_forest'")["maxDepth"].iloc[0] == 3
+    # assert tr.query("model == 'random_forest'")["maxDepth"].iloc[0] == 3
 
     # TODO probably remove these since we're not planning to test suspicious data anymore.
     # I disabled the saving of suspicious in this test config so these are invalid currently.
```
```diff
@@ -731,7 +731,7 @@ def test_step_2_train_logistic_regression_spark(
     tr = spark.table("model_eval_training_results").toPandas()
     # assert tr.count == 3
 
-    assert tr.shape == (1, 13)
+    assert tr.shape == (1, 14)
     # This is now 0.83333333333.... I'm not sure it's worth testing against
     # assert tr.query("model == 'logistic_regression'")["pr_auc_mean"].iloc[0] == 0.75
     assert tr.query("model == 'logistic_regression'")["pr_auc_mean"].iloc[0] > 0.74
```
```diff
@@ -761,9 +761,9 @@ def test_step_2_train_decision_tree_spark(
 
     assert tr.shape == (1, 14)
     # assert tr.query("model == 'decision_tree'")["precision_mean"].iloc[0] > 0
-    assert tr.query("model == 'decision_tree'")["maxDepth"].iloc[0] == 3
-    assert tr.query("model == 'decision_tree'")["minInstancesPerNode"].iloc[0] == 1
-    assert tr.query("model == 'decision_tree'")["maxBins"].iloc[0] == 7
+    # assert tr.query("model == 'decision_tree'")["maxDepth"].iloc[0] == 3
+    # assert tr.query("model == 'decision_tree'")["minInstancesPerNode"].iloc[0] == 1
+    # assert tr.query("model == 'decision_tree'")["maxBins"].iloc[0] == 7
 
     main.do_drop_all("")
```

```diff
@@ -803,12 +803,12 @@ def test_step_2_train_gradient_boosted_trees_spark(
     # assert (
     #     tr.query("model == 'gradient_boosted_trees'")["precision_test_mean"].iloc[0] > 0
     # )
-    assert tr.query("model == 'gradient_boosted_trees'")["maxDepth"].iloc[0] == 5
-    assert (
-        tr.query("model == 'gradient_boosted_trees'")["minInstancesPerNode"].iloc[0]
-        == 1
-    )
-    assert tr.query("model == 'gradient_boosted_trees'")["maxBins"].iloc[0] == 5
+    # assert tr.query("model == 'gradient_boosted_trees'")["maxDepth"].iloc[0] == 5
+    # assert (
+    #     tr.query("model == 'gradient_boosted_trees'")["minInstancesPerNode"].iloc[0]
+    #     == 1
+    # )
+    # assert tr.query("model == 'gradient_boosted_trees'")["maxBins"].iloc[0] == 5
 
     main.do_drop_all("")
```

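The assertions commented out above relied on the automatically added parameter columns, and the expected shape moves from (1, 13) to (1, 14), presumably because an all-NaN column now survives the save. If parameter checks are wanted again later, one possible approach is to assert against the stringified `parameters` column instead; a sketch with an invented single-row frame:

```python
import pandas as pd

# Invented stand-in for the model_eval_training_results table, where
# parameters now survive only as one stringified column.
tr = pd.DataFrame(
    {"model": ["decision_tree"], "parameters": ["{'maxDepth': 3, 'maxBins': 7}"]}
)

params = tr.query("model == 'decision_tree'")["parameters"].iloc[0]
assert "'maxDepth': 3" in params  # string check instead of a dedicated column
```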
