Commit

[#179] Don't automatically add or drop columns from thresholded metrics df
riley-harper committed Dec 12, 2024
1 parent b454276 commit bd934f5
Showing 2 changed files with 20 additions and 52 deletions.
50 changes: 9 additions & 41 deletions hlink/linking/model_exploration/link_step_train_test_models.py
```diff
@@ -544,13 +544,16 @@ def _run(self) -> None:
 
         print("*** Final thresholded metrics ***")
 
-        # thresholded_metrics_df has one row per threshold combination. and each outer fold
-        thresholded_metrics_df = _load_thresholded_metrics_df_params(
-            thresholded_metrics_df
-        )
-        _print_thresholded_metrics_df(
-            thresholded_metrics_df.sort_values(by="mcc_mean", ascending=False)
-        )
+        # Convert the parameters column to dtype string so that Spark can handle it
+        thresholded_metrics_df["parameters"] = thresholded_metrics_df[
+            "parameters"
+        ].apply(lambda t: str(t) if pd.notnull(t) else t)
+        with pd.option_context(
+            "display.max_columns", None, "display.max_colwidth", None
+        ):
+            print(thresholded_metrics_df.sort_values(by="mcc_mean", ascending=False))
+            print("\n")
 
         self._save_training_results(thresholded_metrics_df, self.task.spark)
         self.task.spark.sql("set spark.sql.shuffle.partitions=200")
```
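A note on the two new pieces of inline code: per the diff's own comment, the `parameters` column must be dtype string before the frame reaches `spark.createDataFrame`, since Spark struggles to infer a schema for a pandas column mixing Python dicts and nulls, and `pd.option_context` scopes the display settings to this one print instead of mutating global state the way the old `pd.set_option` call did. A minimal, self-contained sketch of the same pattern, using an invented sample frame:

```python
import pandas as pd

# Invented stand-in for thresholded_metrics_df: one row per model run,
# with raw parameter dicts (or nulls) in the "parameters" column.
df = pd.DataFrame(
    {
        "model": ["random_forest", "logistic_regression"],
        "mcc_mean": [0.81, 0.77],
        "parameters": [{"maxDepth": 3, "numTrees": 10}, None],
    }
)

# Stringify each dict, leaving nulls untouched, as the commit does.
df["parameters"] = df["parameters"].apply(lambda t: str(t) if pd.notnull(t) else t)

# Show every column at full width; both options revert when the block exits.
with pd.option_context("display.max_columns", None, "display.max_colwidth", None):
    print(df.sort_values(by="mcc_mean", ascending=False))
```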
```diff
@@ -693,7 +696,6 @@ def _save_training_results(
         if desc_df.empty:
             print("Training results dataframe is empty.")
         else:
-            desc_df.dropna(axis=1, how="all", inplace=True)
             spark.createDataFrame(desc_df, samplingRatio=1).write.mode(
                 "overwrite"
            ).saveAsTable(f"{table_prefix}training_results")
```
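For context, the deleted `dropna` call removed every all-NaN column in place, so the schema of the saved `training_results` table depended on which hyperparameters happened to be populated. A small illustration of that pandas behavior, written non-inplace for clarity:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"model": ["random_forest"], "mcc_mean": [0.81], "C": [np.nan]})

# The removed call: an all-NaN column such as "C" vanishes entirely,
# changing the columns of whatever table gets written afterward.
dropped = df.dropna(axis=1, how="all")

print(list(df.columns))       # ['model', 'mcc_mean', 'C']
print(list(dropped.columns))  # ['model', 'mcc_mean']
```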
```diff
@@ -873,40 +875,6 @@ def _aggregate_per_threshold_results(
     return thresholded_metrics_df
 
 
-def _print_thresholded_metrics_df(desc_df: pd.DataFrame) -> None:
-    pd.set_option("display.max_colwidth", None)
-    print(desc_df.iloc[-1])
-
-    print("\n")
-
-
-def _load_thresholded_metrics_df_params(desc_df: pd.DataFrame) -> pd.DataFrame:
-    params = [
-        "maxDepth",
-        "numTrees",
-        "featureSubsetStrategy",
-        "subsample",
-        "minInstancesPerNode",
-        "maxBins",
-        "class_weight",
-        "C",
-        "kernel",
-        "threshold",
-        "maxIter",
-    ]
-
-    load_params = lambda j, param: j.get(param, np.nan)
-    for param in params:
-        desc_df[param] = desc_df["parameters"].apply(load_params, args=(param,))
-    desc_df["class_weight"] = desc_df["class_weight"].apply(
-        lambda x: str(x) if pd.notnull(x) else x
-    )
-    desc_df["parameters"] = desc_df["parameters"].apply(
-        lambda t: str(t) if pd.notnull(t) else t
-    )
-    return desc_df
-
-
 def _custom_param_grid_builder(
     model_parameters: list[dict[str, Any]]
 ) -> list[dict[str, Any]]:
```
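With `_load_thresholded_metrics_df_params` gone, per-parameter columns such as `maxDepth` are no longer added to the metrics frame automatically. Anyone who still needs one can parse it back out of the stringified `parameters` column; a hypothetical helper (not part of hlink) might look like this:

```python
import ast

import numpy as np
import pandas as pd

def param_column(df: pd.DataFrame, name: str) -> pd.Series:
    """Hypothetical helper: extract one parameter from the stringified dicts."""
    return df["parameters"].apply(
        lambda s: ast.literal_eval(s).get(name, np.nan) if pd.notnull(s) else np.nan
    )

df = pd.DataFrame({"parameters": ["{'maxDepth': 3, 'maxBins': 7}", None]})
print(param_column(df, "maxDepth"))  # 3 for the first row, NaN for the second
```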
22 changes: 11 additions & 11 deletions hlink/tests/model_exploration_test.py
```diff
@@ -696,7 +696,7 @@ def test_step_2_train_random_forest_spark(
     print(f"training results {tr}")
     # assert tr.shape == (1, 18)
     assert tr.query("model == 'random_forest'")["pr_auc_mean"].iloc[0] > 2.0 / 3.0
-    assert tr.query("model == 'random_forest'")["maxDepth"].iloc[0] == 3
+    # assert tr.query("model == 'random_forest'")["maxDepth"].iloc[0] == 3
 
     # TODO probably remove these since we're not planning to test suspicious data anymore.
     # I disabled the saving of suspicious in this test config so these are invalid currently.
```
```diff
@@ -731,7 +731,7 @@ def test_step_2_train_logistic_regression_spark(
     tr = spark.table("model_eval_training_results").toPandas()
     # assert tr.count == 3
 
-    assert tr.shape == (1, 13)
+    assert tr.shape == (1, 14)
     # This is now 0.83333333333.... I'm not sure it's worth testing against
     # assert tr.query("model == 'logistic_regression'")["pr_auc_mean"].iloc[0] == 0.75
     assert tr.query("model == 'logistic_regression'")["pr_auc_mean"].iloc[0] > 0.74
```
```diff
@@ -761,9 +761,9 @@ def test_step_2_train_decision_tree_spark(
 
     assert tr.shape == (1, 14)
     # assert tr.query("model == 'decision_tree'")["precision_mean"].iloc[0] > 0
-    assert tr.query("model == 'decision_tree'")["maxDepth"].iloc[0] == 3
-    assert tr.query("model == 'decision_tree'")["minInstancesPerNode"].iloc[0] == 1
-    assert tr.query("model == 'decision_tree'")["maxBins"].iloc[0] == 7
+    # assert tr.query("model == 'decision_tree'")["maxDepth"].iloc[0] == 3
+    # assert tr.query("model == 'decision_tree'")["minInstancesPerNode"].iloc[0] == 1
+    # assert tr.query("model == 'decision_tree'")["maxBins"].iloc[0] == 7
 
     main.do_drop_all("")
```

```diff
@@ -803,12 +803,12 @@ def test_step_2_train_gradient_boosted_trees_spark(
     # assert (
     #     tr.query("model == 'gradient_boosted_trees'")["precision_test_mean"].iloc[0] > 0
     # )
-    assert tr.query("model == 'gradient_boosted_trees'")["maxDepth"].iloc[0] == 5
-    assert (
-        tr.query("model == 'gradient_boosted_trees'")["minInstancesPerNode"].iloc[0]
-        == 1
-    )
-    assert tr.query("model == 'gradient_boosted_trees'")["maxBins"].iloc[0] == 5
+    # assert tr.query("model == 'gradient_boosted_trees'")["maxDepth"].iloc[0] == 5
+    # assert (
+    #     tr.query("model == 'gradient_boosted_trees'")["minInstancesPerNode"].iloc[0]
+    #     == 1
+    # )
+    # assert tr.query("model == 'gradient_boosted_trees'")["maxBins"].iloc[0] == 5
 
     main.do_drop_all("")
```

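The assertions commented out above relied on the automatically added parameter columns, and the expected shape moves from (1, 13) to (1, 14), presumably because an all-NaN column now survives the save. If parameter checks are wanted again later, one possible approach is to assert against the stringified `parameters` column instead; a sketch with an invented single-row frame:

```python
import pandas as pd

# Invented stand-in for the model_eval_training_results table, where
# parameters now survive only as one stringified column.
tr = pd.DataFrame(
    {"model": ["decision_tree"], "parameters": ["{'maxDepth': 3, 'maxBins': 7}"]}
)

params = tr.query("model == 'decision_tree'")["parameters"].iloc[0]
assert "'maxDepth': 3" in params  # string check instead of a dedicated column
```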
