Skip to content

Commit

Permalink
[#21] Remove a skipped matching secondary_threshold test
Browse files Browse the repository at this point in the history
Reading the comment at the top of this test, I went looking for something that
directly replaced the "secondary_threshold" matching attribute. I couldn't find
anything that looked closely related. So let's just remove this test.
  • Loading branch information
riley-harper committed Jun 18, 2024
1 parent b87ed91 commit 1b69d54
Showing 1 changed file with 0 additions and 68 deletions.
68 changes: 0 additions & 68 deletions hlink/tests/matching_scoring_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,74 +10,6 @@
from hlink.linking.matching.link_step_score import LinkStepScore


@pytest.mark.skip(
    reason="We still want to test that whatever 'secondary_threshold' became is being applied correctly, but we need to refactor this test to account for the fact that this was totally renamed and is now being carried out in a different step (step 3 doesn't exist anymore)."
)
def test_step_3_uniq_and_secondary_threshold(spark, matching_conf, matching):
    """Check a uniqueness-constrained secondary threshold layered on the primary comparisons.

    The test configures two ``jaro_winkler`` comparison features (first and
    last name), a primary ``AND`` comparison at 0.8 on both, and a
    ``secondary_threshold`` section at 0.9 on both with a ``unique_true``
    mapping. It then runs the explode and match steps, builds the features,
    and inspects the ``potential_matches_prepped`` table.

    The test is skipped; see the ``reason`` on the skip marker.
    """
    # Comparison features, one per name column.
    first_name_feature = {
        "alias": "namefrst_jw",
        "column_name": "namefrst",
        "comparison_type": "jaro_winkler",
    }
    last_name_feature = {
        "alias": "namelast_jw",
        "column_name": "namelast",
        "comparison_type": "jaro_winkler",
    }
    matching_conf["comparison_features"] = [first_name_feature, last_name_feature]

    # Primary comparison: both features must reach 0.8.
    primary_first_name = {
        "feature_name": "namefrst_jw",
        "threshold": 0.8,
        "comparison_type": "threshold",
    }
    primary_last_name = {
        "feature_name": "namelast_jw",
        "comparison_type": "threshold",
        "threshold": 0.8,
    }
    matching_conf["comparisons"] = {
        "comp_a": primary_first_name,
        "comp_b": primary_last_name,
        "operator": "AND",
    }

    # Secondary comparison: both features must reach 0.9, with a uniqueness
    # mapping over the two id columns.
    secondary_first_name = {
        "feature_name": "namefrst_jw",
        "comparison_type": "threshold",
        "threshold": 0.9,
    }
    secondary_last_name = {
        "feature_name": "namelast_jw",
        "comparison_type": "threshold",
        "threshold": 0.9,
    }
    matching_conf["secondary_threshold"] = {
        "threshold_a": secondary_first_name,
        "threshold_b": secondary_last_name,
        "unique_true": {"id_a": "id_a", "id_b": "id_b"},
        "operator": "AND",
        "secondary": True,
    }

    # Run the pipeline up to feature creation.
    matching.step_0_explode()
    matching.step_1_match()
    hlink.linking.matching._step_2_score.__create_features(matching, matching_conf)

    # Snapshot of the step-2 potential matches table.
    potential_matches_df = spark.table("potential_matches_prepped").toPandas()

    # The former step-3 call (matching.step_3_secondary_threshold()) and the
    # assertions on its "potential_matches" output are disabled. They expected
    # exactly one unique match, for id_a == 10 and id_b == 10, with
    # namelast_jw strictly between 0.8 and 0.9 and namefrst_jw above 0.9.
    unique_high_matches_df = spark.table("potential_matches_prepped").toPandas()

    assert len(potential_matches_df.id_a) == 5
    # NOTE(review): both frames are read from "potential_matches_prepped", so
    # asserting 5 rows above and emptiness here looks contradictory -- confirm
    # the intended source table before re-enabling this test.
    assert unique_high_matches_df.empty


def test_step_2_skip_on_no_conf(spark, matching_conf, matching, capsys):
"""Test matching step 2 doesn't run if no training config"""

Expand Down

0 comments on commit 1b69d54

Please sign in to comment.