Skip to content

Commit

Permalink
[#21] Rewrite a skipped model exploration test
Browse files Browse the repository at this point in the history
This tests Training step 3 with a probit model instead of a random forest
model. Probit models save coefficients, not feature importances.
  • Loading branch information
riley-harper committed Jun 18, 2024
1 parent db68660 commit 0ddb4c3
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 81 deletions.
81 changes: 0 additions & 81 deletions hlink/tests/model_exploration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,84 +552,3 @@ def test_step_2_split_by_id_a(
assert splits[1][1].toPandas()["id_a"].unique().tolist() == ["30"]

main.do_drop_all("")


@pytest.mark.skip(
    reason="Need to get tests working for new version of feature importances"
)
def test_step_3_get_feature_importances_probit(
    spark,
    training_conf,
    training,
    state_dist_path,
    datasource_training_input,
    potential_matches_path,
    spark_test_tmp_dir_path,
    matching,
):
    """Run training step 3 with a probit model and check its feature importances.

    Probit models save coefficients rather than feature importances, so the
    "score" column here holds model coefficients. The input tables
    (prepped_df_a, prepped_df_b, potential_matches) are loaded from CSV
    fixtures before running the training steps.
    """
    td_path, pa_path, pb_path = datasource_training_input

    training_conf["comparison_features"] = [
        {
            "alias": "regionf",
            "column_name": "region",
            "comparison_type": "fetch_a",
            "categorical": True,
        },
        {
            "alias": "namelast_jw",
            "column_name": "namelast",
            "comparison_type": "jaro_winkler",
        },
        {
            "alias": "state_distance",
            "key_count": 1,
            "column_name": "bpl",
            "comparison_type": "geo_distance",
            "loc_a": "statecode1",
            "loc_b": "statecode2",
            "distance_col": "dist",
            "table_name": "state_distances_lookup",
            "distances_file": state_dist_path,
        },
    ]

    training_conf["training"]["dataset"] = td_path
    training_conf["training"]["dependent_var"] = "match"
    training_conf["training"]["independent_vars"] = [
        "namelast_jw",
        "regionf",
        "state_distance",
    ]

    training_conf["training"]["chosen_model"] = {"type": "probit", "threshold": 0.5}

    training_conf["training"]["score_with_model"] = True
    training_conf["training"]["feature_importances"] = True
    training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path
    training_conf["drop_data_from_scored_matches"] = True

    training.spark.read.csv(pa_path, header=True, inferSchema=True).write.mode(
        "overwrite"
    ).saveAsTable("prepped_df_a")
    training.spark.read.csv(pb_path, header=True, inferSchema=True).write.mode(
        "overwrite"
    ).saveAsTable("prepped_df_b")
    training.spark.read.csv(
        potential_matches_path, header=True, inferSchema=True
    ).write.mode("overwrite").saveAsTable("potential_matches")

    training.run_step(0)
    training.run_step(1)
    training.run_step(2)
    matching.run_step(2)
    training.run_step(3)

    fi_df = training.spark.table("feature_importances").toPandas()

    assert fi_df.shape == (6, 3)
    # .iloc[0], not the deprecated callable form .iloc()[0]
    assert 25 > fi_df.query("idx == 0")["score"].iloc[0] >= -5
    assert "regionf_onehotencoded_2" in list(fi_df["name"])
    assert "regionf_onehotencoded_invalidValues" in list(fi_df["name"])
83 changes: 83 additions & 0 deletions hlink/tests/training_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,89 @@ def test_step_3_interacted_categorical_features(
)


def test_step_3_with_probit_model(
    spark, training_conf, training, state_dist_path, datasource_training_input
):
    """Run training step 3 with a probit ML model.

    Probit models save coefficients, not feature importances, so the
    training_feature_importances table is checked for coefficient values.
    The docstring must be the first statement in the function body to be
    recognized as documentation, so it precedes the fixture unpacking.
    """
    training_data_path, prepped_df_a_path, prepped_df_b_path = datasource_training_input
    training_conf["comparison_features"] = [
        {
            "alias": "regionf",
            "column_name": "region",
            "comparison_type": "fetch_a",
            "categorical": True,
        },
        {
            "alias": "namelast_jw",
            "column_name": "namelast",
            "comparison_type": "jaro_winkler",
        },
        {
            "alias": "state_distance",
            "key_count": 1,
            "column_name": "bpl",
            "comparison_type": "geo_distance",
            "loc_a": "statecode1",
            "loc_b": "statecode2",
            "distance_col": "dist",
            "table_name": "state_distances_lookup",
            "distances_file": state_dist_path,
        },
    ]
    training_conf["training"]["dataset"] = training_data_path
    training_conf["training"]["dependent_var"] = "match"
    training_conf["training"]["independent_vars"] = [
        "namelast_jw",
        "regionf",
        "state_distance",
    ]

    training_conf["training"]["chosen_model"] = {"type": "probit", "threshold": 0.5}
    training_conf["training"]["score_with_model"] = True
    training_conf["training"]["feature_importances"] = True

    spark.read.csv(prepped_df_a_path, header=True, inferSchema=True).write.mode(
        "overwrite"
    ).saveAsTable("prepped_df_a")
    spark.read.csv(prepped_df_b_path, header=True, inferSchema=True).write.mode(
        "overwrite"
    ).saveAsTable("prepped_df_b")

    training.run_step(0)
    training.run_step(1)
    training.run_step(2)
    training.run_step(3)

    tfi = spark.table("training_feature_importances").toPandas()
    # Coefficient ranges below were observed from the fixture dataset; they
    # bound the expected probit coefficients rather than pinning exact floats.
    assert (
        8.9
        <= tfi.query("feature_name == 'namelast_jw'")[
            "coefficient_or_importance"
        ].item()
        <= 9.0
    )
    # Category 0 of the one-hot encoded region feature is the reference level.
    assert (
        tfi.query("feature_name == 'regionf' and category == 0")[
            "coefficient_or_importance"
        ].item()
        == 0
    )
    assert (
        -7.6
        <= tfi.query("feature_name == 'regionf' and category == 1")[
            "coefficient_or_importance"
        ].item()
        <= -7.5
    )
    assert (
        6.4
        <= tfi.query("feature_name == 'regionf' and category == 99")[
            "coefficient_or_importance"
        ].item()
        <= 6.5
    )


def test_step_3_requires_table(training_conf, training):
training_conf["training"]["feature_importances"] = True
with pytest.raises(RuntimeError, match="Missing input tables"):
Expand Down

0 comments on commit 0ddb4c3

Please sign in to comment.