Commit
[#21] Remove a skipped model_exploration step 3 test
model_exploration_test.test_step_3_get_feature_importances_random_forest is
covered by tests for training step 3 in training_test.py.
riley-harper committed Jun 18, 2024
1 parent bd69a9e commit db68660
Showing 1 changed file with 0 additions and 85 deletions.
85 changes: 0 additions & 85 deletions hlink/tests/model_exploration_test.py
@@ -554,91 +554,6 @@ def test_step_2_split_by_id_a(
    main.do_drop_all("")


@pytest.mark.skip(
    reason="Need to get tests working for new version of feature importances"
)
def test_step_3_get_feature_importances_random_forest(
    spark,
    training_conf,
    training,
    state_dist_path,
    datasource_training_input,
    potential_matches_path,
    spark_test_tmp_dir_path,
    model_exploration,
):
    """Test running the chosen model on potential matches dataset"""
    td_path, pa_path, pb_path = datasource_training_input

    training_conf["comparison_features"] = [
        {
            "alias": "regionf",
            "column_name": "region",
            "comparison_type": "fetch_a",
            "categorical": True,
        },
        {
            "alias": "namelast_jw",
            "column_name": "namelast",
            "comparison_type": "jaro_winkler",
        },
        {
            "alias": "state_distance",
            "column_name": "bpl",
            "key_count": 1,
            "comparison_type": "geo_distance",
            "loc_a": "statecode1",
            "loc_b": "statecode2",
            "distance_col": "dist",
            "table_name": "state_distances_lookup",
            "distances_file": state_dist_path,
        },
    ]

    training_conf["training"]["dataset"] = td_path
    training_conf["training"]["dependent_var"] = "match"
    training_conf["training"]["independent_vars"] = [
        "namelast_jw",
        "regionf",
        "state_distance",
    ]
    training_conf["training"]["chosen_model"] = {
        "type": "random_forest",
        "maxDepth": 6,
        "numTrees": 100,
        "featureSubsetStrategy": "sqrt",
    }

    # training_conf["training"]["use_potential_matches_features"] = True
    training_conf["training"]["score_with_model"] = True
    training_conf["training"]["feature_importances"] = True
    training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path
    training_conf["drop_data_from_scored_matches"] = True

    training.spark.read.csv(pa_path, header=True, inferSchema=True).write.mode(
        "overwrite"
    ).saveAsTable("prepped_df_a")
    training.spark.read.csv(pb_path, header=True, inferSchema=True).write.mode(
        "overwrite"
    ).saveAsTable("prepped_df_b")
    training.spark.read.csv(
        potential_matches_path, header=True, inferSchema=True
    ).write.mode("overwrite").saveAsTable("potential_matches")

    training.run_step(0)
    training.run_step(1)
    training.run_step(2)

    model_exploration.run_step(3)

    fi_df = training.spark.table("feature_importances").toPandas()

    assert fi_df.shape == (6, 3)
    assert 1 > fi_df.query("idx == 0")["score"].iloc()[0] >= 0
    assert "regionf_onehotencoded_2" in list(fi_df["name"])
    assert "regionf_onehotencoded_invalidValues" in list(fi_df["name"])


@pytest.mark.skip(
    reason="Need to get tests working for new version of feature importances"
)

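Editor's note: per the commit message, the equivalent coverage now lives in the training step 3 tests in training_test.py. The sketch below is a minimal, hypothetical illustration of that kind of coverage, not the actual training_test.py code; the fixture names, the chosen_model settings, the "feature_importances" table name, and the "score" column are all carried over from the deleted test above as assumptions.

# Hypothetical sketch only -- not the actual training_test.py test.
# It shows how the training task's step 3 can cover the same
# feature-importances behavior that the deleted model_exploration test checked.
def test_step_3_saves_feature_importances_sketch(training_conf, training):
    training_conf["training"]["chosen_model"] = {
        "type": "random_forest",
        "maxDepth": 6,
        "numTrees": 100,
        "featureSubsetStrategy": "sqrt",
    }
    training_conf["training"]["feature_importances"] = True

    # Steps 0-2 prepare the training data and fit the model; step 3 writes the
    # feature importances (table name assumed from the deleted test above).
    for step in range(4):
        training.run_step(step)

    fi_df = training.spark.table("feature_importances").toPandas()

    # Random forest importances are normalized, so every score lies in [0, 1].
    assert (fi_df["score"] >= 0).all()
    assert (fi_df["score"] <= 1).all()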