[#161] Support saving model metadata for xgboost
This is really different from the Spark models, so I've made it a special case
instead of trying to integrate it closely with the previous logic. This section
might be due for some refactoring now.
riley-harper committed Nov 18, 2024
1 parent 7423169 commit ffba81a
Showing 1 changed file with 41 additions and 24 deletions.
hlink/linking/training/link_step_save_model_metadata.py (65 changes: 41 additions & 24 deletions)
@@ -86,35 +86,52 @@ def _run(self):
                 base_col = col.removesuffix("_imp")
                 true_cols.append((base_col, None))

+        true_column_names = [column_name for (column_name, _) in true_cols]
+        true_categories = [category for (_, category) in true_cols]
+        model_type = config[training_conf]["chosen_model"]["type"]
+
         print("Retrieving model feature importances or coefficients...")
-        try:
-            feature_imp = classifier.coefficients
-        except:
+
+        if model_type == "xgboost":
+            raw_weights = classifier.get_feature_importances("weight")
+            raw_gains = classifier.get_feature_importances("gain")
+            keys = [f"f{index}" for index in range(len(true_cols))]
+
+            weights = [raw_weights.get(key, 0.0) for key in keys]
+            gains = [raw_gains.get(key, 0.0) for key in keys]
+            label = "Feature importances (weights and gain)"
+
+            features_df = self.task.spark.createDataFrame(
+                zip(true_column_names, true_categories, weights, gains),
+                "feature_name: string, category: int, weight: double, average_gain_per_split: double",
+            ).sort("feature_name", "category")
+        else:
             try:
-                feature_imp = classifier.featureImportances
+                feature_imp = classifier.coefficients
             except:
-                print(
-                    "This model doesn't contain a coefficient or feature importances parameter -- check chosen model type."
-                )
-                return
+                try:
+                    feature_imp = classifier.featureImportances
+                except:
+                    print(
+                        "This model doesn't contain a coefficient or feature importances parameter -- check chosen model type."
+                    )
+                    return
+                else:
+                    label = "Feature importances"
             else:
-                label = "Feature importances"
-        else:
-            label = "Coefficients"
+                label = "Coefficients"

-        # We need to convert from numpy float64s to Python floats to avoid type
-        # issues when creating the DataFrame below.
-        feature_importances = [
-            float(importance) for importance in feature_imp.toArray()
-        ]
-
-        true_column_names = [column_name for (column_name, _) in true_cols]
-        true_categories = [category for (_, category) in true_cols]
-
-        features_df = self.task.spark.createDataFrame(
-            zip(true_column_names, true_categories, feature_importances, strict=True),
-            "feature_name: string, category: int, coefficient_or_importance: double",
-        ).sort("feature_name", "category")
+            # We need to convert from numpy float64s to Python floats to avoid type
+            # issues when creating the DataFrame below.
+            feature_importances = [
+                float(importance) for importance in feature_imp.toArray()
+            ]
+            features_df = self.task.spark.createDataFrame(
+                zip(
+                    true_column_names, true_categories, feature_importances, strict=True
+                ),
+                "feature_name: string, category: int, coefficient_or_importance: double",
+            ).sort("feature_name", "category")

         feature_importances_table = (
             f"{self.task.table_prefix}training_feature_importances"
