Skip to content

Commit

Permalink
[#21] Rewrite a skipped model exploration test
Browse files Browse the repository at this point in the history
This tests Training step 3 with a probit model instead of a random forest
model. Probit models save coefficients, not feature importances.
  • Loading branch information
riley-harper committed Jun 18, 2024
1 parent db68660 commit 0ddb4c3
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 81 deletions.
81 changes: 0 additions & 81 deletions hlink/tests/model_exploration_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,84 +552,3 @@ def test_step_2_split_by_id_a(
assert splits[1][1].toPandas()["id_a"].unique().tolist() == ["30"]

main.do_drop_all("")


@pytest.mark.skip(
    reason="Need to get tests working for new version of feature importances"
)
def test_step_3_get_feature_importances_probit(
    spark,
    training_conf,
    training,
    state_dist_path,
    datasource_training_input,
    potential_matches_path,
    spark_test_tmp_dir_path,
    matching,
):
    """Run training step 3 with a probit model and check its feature importances.

    Probit models save coefficients rather than feature importances, so the
    "score" column here holds model coefficients. The input tables
    (prepped_df_a, prepped_df_b, potential_matches) are loaded from CSV
    fixtures before running the training steps.
    """
    td_path, pa_path, pb_path = datasource_training_input

    training_conf["comparison_features"] = [
        {
            "alias": "regionf",
            "column_name": "region",
            "comparison_type": "fetch_a",
            "categorical": True,
        },
        {
            "alias": "namelast_jw",
            "column_name": "namelast",
            "comparison_type": "jaro_winkler",
        },
        {
            "alias": "state_distance",
            "key_count": 1,
            "column_name": "bpl",
            "comparison_type": "geo_distance",
            "loc_a": "statecode1",
            "loc_b": "statecode2",
            "distance_col": "dist",
            "table_name": "state_distances_lookup",
            "distances_file": state_dist_path,
        },
    ]

    training_conf["training"]["dataset"] = td_path
    training_conf["training"]["dependent_var"] = "match"
    training_conf["training"]["independent_vars"] = [
        "namelast_jw",
        "regionf",
        "state_distance",
    ]

    training_conf["training"]["chosen_model"] = {"type": "probit", "threshold": 0.5}

    training_conf["training"]["score_with_model"] = True
    training_conf["training"]["feature_importances"] = True
    training_conf["spark_tmp_dir"] = spark_test_tmp_dir_path
    training_conf["drop_data_from_scored_matches"] = True

    training.spark.read.csv(pa_path, header=True, inferSchema=True).write.mode(
        "overwrite"
    ).saveAsTable("prepped_df_a")
    training.spark.read.csv(pb_path, header=True, inferSchema=True).write.mode(
        "overwrite"
    ).saveAsTable("prepped_df_b")
    training.spark.read.csv(
        potential_matches_path, header=True, inferSchema=True
    ).write.mode("overwrite").saveAsTable("potential_matches")

    training.run_step(0)
    training.run_step(1)
    training.run_step(2)
    matching.run_step(2)
    training.run_step(3)

    fi_df = training.spark.table("feature_importances").toPandas()

    assert fi_df.shape == (6, 3)
    # .iloc[0], not the deprecated callable form .iloc()[0]
    assert 25 > fi_df.query("idx == 0")["score"].iloc[0] >= -5
    assert "regionf_onehotencoded_2" in list(fi_df["name"])
    assert "regionf_onehotencoded_invalidValues" in list(fi_df["name"])
83 changes: 83 additions & 0 deletions hlink/tests/training_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,89 @@ def test_step_3_interacted_categorical_features(
)


def test_step_3_with_probit_model(
    spark, training_conf, training, state_dist_path, datasource_training_input
):
    """Run training step 3 with a probit ML model.

    Probit models save coefficients, not feature importances, so the
    training_feature_importances table is checked for coefficient values.
    The docstring must be the first statement in the function body to be
    recognized as documentation, so it precedes the fixture unpacking.
    """
    training_data_path, prepped_df_a_path, prepped_df_b_path = datasource_training_input
    training_conf["comparison_features"] = [
        {
            "alias": "regionf",
            "column_name": "region",
            "comparison_type": "fetch_a",
            "categorical": True,
        },
        {
            "alias": "namelast_jw",
            "column_name": "namelast",
            "comparison_type": "jaro_winkler",
        },
        {
            "alias": "state_distance",
            "key_count": 1,
            "column_name": "bpl",
            "comparison_type": "geo_distance",
            "loc_a": "statecode1",
            "loc_b": "statecode2",
            "distance_col": "dist",
            "table_name": "state_distances_lookup",
            "distances_file": state_dist_path,
        },
    ]
    training_conf["training"]["dataset"] = training_data_path
    training_conf["training"]["dependent_var"] = "match"
    training_conf["training"]["independent_vars"] = [
        "namelast_jw",
        "regionf",
        "state_distance",
    ]

    training_conf["training"]["chosen_model"] = {"type": "probit", "threshold": 0.5}
    training_conf["training"]["score_with_model"] = True
    training_conf["training"]["feature_importances"] = True

    spark.read.csv(prepped_df_a_path, header=True, inferSchema=True).write.mode(
        "overwrite"
    ).saveAsTable("prepped_df_a")
    spark.read.csv(prepped_df_b_path, header=True, inferSchema=True).write.mode(
        "overwrite"
    ).saveAsTable("prepped_df_b")

    training.run_step(0)
    training.run_step(1)
    training.run_step(2)
    training.run_step(3)

    tfi = spark.table("training_feature_importances").toPandas()
    # Coefficient ranges below were observed from the fixture dataset; they
    # bound the expected probit coefficients rather than pinning exact floats.
    assert (
        8.9
        <= tfi.query("feature_name == 'namelast_jw'")[
            "coefficient_or_importance"
        ].item()
        <= 9.0
    )
    # Category 0 of the one-hot encoded region feature is the reference level.
    assert (
        tfi.query("feature_name == 'regionf' and category == 0")[
            "coefficient_or_importance"
        ].item()
        == 0
    )
    assert (
        -7.6
        <= tfi.query("feature_name == 'regionf' and category == 1")[
            "coefficient_or_importance"
        ].item()
        <= -7.5
    )
    assert (
        6.4
        <= tfi.query("feature_name == 'regionf' and category == 99")[
            "coefficient_or_importance"
        ].item()
        <= 6.5
    )


def test_step_3_requires_table(training_conf, training):
training_conf["training"]["feature_importances"] = True
with pytest.raises(RuntimeError, match="Missing input tables"):
Expand Down

0 comments on commit 0ddb4c3

Please sign in to comment.