diff --git a/hlink/tests/matching_scoring_test.py b/hlink/tests/matching_scoring_test.py
index 33e74f8..b8715a7 100755
--- a/hlink/tests/matching_scoring_test.py
+++ b/hlink/tests/matching_scoring_test.py
@@ -10,74 +10,6 @@
 from hlink.linking.matching.link_step_score import LinkStepScore
 
 
-@pytest.mark.skip(
-    reason="We still want to test that whatever 'secondary_threshold' became is being applied correctly, but we need to refactor this test to account for the fact that this was totally renamed and is now being carried out in a different step (step 3 doesn't exist anymore)."
-)
-def test_step_3_uniq_and_secondary_threshold(spark, matching_conf, matching):
-    """Test a secondary threshold with uniqueness"""
-    matching_conf["comparison_features"] = [
-        {
-            "alias": "namefrst_jw",
-            "column_name": "namefrst",
-            "comparison_type": "jaro_winkler",
-        },
-        {
-            "alias": "namelast_jw",
-            "column_name": "namelast",
-            "comparison_type": "jaro_winkler",
-        },
-    ]
-
-    matching_conf["comparisons"] = {
-        "comp_a": {
-            "feature_name": "namefrst_jw",
-            "threshold": 0.8,
-            "comparison_type": "threshold",
-        },
-        "comp_b": {
-            "feature_name": "namelast_jw",
-            "comparison_type": "threshold",
-            "threshold": 0.8,
-        },
-        "operator": "AND",
-    }
-
-    matching_conf["secondary_threshold"] = {
-        "threshold_a": {
-            "feature_name": "namefrst_jw",
-            "comparison_type": "threshold",
-            "threshold": 0.9,
-        },
-        "threshold_b": {
-            "feature_name": "namelast_jw",
-            "comparison_type": "threshold",
-            "threshold": 0.9,
-        },
-        "unique_true": {"id_a": "id_a", "id_b": "id_b"},
-        "operator": "AND",
-        "secondary": True,
-    }
-
-    matching.step_0_explode()
-    matching.step_1_match()
-    hlink.linking.matching._step_2_score.__create_features(matching, matching_conf)
-
-    # Create pandas DFs of the step_2 potential matches table
-    potential_matches_df = spark.table("potential_matches_prepped").toPandas()
-
-    # matching.step_3_secondary_threshold()
-    # unique_matches_df = spark.table("potential_matches").toPandas()
-    unique_high_matches_df = spark.table("potential_matches_prepped").toPandas()
-
-    assert len(potential_matches_df.id_a) == 5
-    # assert (len(unique_matches_df.id_a) == 1)
-    # assert (unique_matches_df.query("id_a == 10 and id_b == 10")["namelast_jw"].iloc[0] > 0.8)
-    # assert (unique_matches_df.query("id_a == 10 and id_b == 10")["namelast_jw"].iloc[0] < 0.9)
-    # assert (unique_matches_df.query("id_a == 10 and id_b == 10")["namefrst_jw"].iloc[0] > 0.8)
-    # assert (unique_matches_df.query("id_a == 10 and id_b == 10")["namefrst_jw"].iloc[0] > 0.9)
-    assert unique_high_matches_df.empty
-
-
 def test_step_2_skip_on_no_conf(spark, matching_conf, matching, capsys):
     """Test matching step 2 doesn't run if no training config"""