[#161, #162] Unify feature importances for XGBoost and LightGBM

We now compute two feature importances for each model:
- weight: the number of splits that each feature causes
- gain: the total gain across all of each feature's splits
ipums · Nov 21, 2024 · 5ef0879 · 5ef0879
1 parent 7f7afe7
commit 5ef0879
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 12 deletions.
diff --git a/hlink/linking/training/link_step_save_model_metadata.py b/hlink/linking/training/link_step_save_model_metadata.py
@@ -93,7 +93,7 @@ def _run(self):
 
         if model_type == "xgboost":
             raw_weights = model.get_feature_importances("weight")
-            raw_gains = model.get_feature_importances("gain")
+            raw_gains = model.get_feature_importances("total_gain")
             keys = [f"f{index}" for index in range(len(true_cols))]
 
             weights = [raw_weights.get(key, 0.0) for key in keys]
@@ -102,16 +102,17 @@ def _run(self):
 
             features_df = self.task.spark.createDataFrame(
                 zip(true_column_names, true_categories, weights, gains),
-                "feature_name: string, category: int, weight: double, average_gain_per_split: double",
+                "feature_name: string, category: int, weight: double, gain: double",
             ).sort("feature_name", "category")
         elif model_type == "lightgbm":
-            num_splits = model.getFeatureImportances("split")
-            total_gains = model.getFeatureImportances("gain")
-            label = "Feature importances (number of splits and total gains)"
+            # The "weight" of a feature is the number of splits it causes.
+            weights = model.getFeatureImportances("split")
+            gains = model.getFeatureImportances("gain")
+            label = "Feature importances (weights and gains)"
 
             features_df = self.task.spark.createDataFrame(
-                zip(true_column_names, true_categories, num_splits, total_gains),
-                "feature_name: string, category: int, num_splits: double, total_gain: double",
+                zip(true_column_names, true_categories, weights, gains),
+                "feature_name: string, category: int, weight: double, gain: double",
             ).sort("feature_name", "category")
         else:
             try:

diff --git a/hlink/tests/training_test.py b/hlink/tests/training_test.py
@@ -490,8 +490,8 @@ def test_step_3_with_lightgbm_model(
     assert importances_df.columns == [
         "feature_name",
         "category",
-        "num_splits",
-        "total_gain",
+        "weight",
+        "gain",
     ]
 
 
@@ -548,8 +548,8 @@ def test_lightgbm_with_interacted_features(
     assert importances_df.columns == [
         "feature_name",
         "category",
-        "num_splits",
-        "total_gain",
+        "weight",
+        "gain",
     ]
 
 
@@ -601,7 +601,7 @@ def test_step_3_with_xgboost_model(
         "feature_name",
         "category",
         "weight",
-        "average_gain_per_split",
+        "gain",
     ]