Skip to content

Commit

Permalink
[#162] Integrate RenameVectorAttributes to remove colons from Interac…
Browse files Browse the repository at this point in the history
…tion output for LightGBM
  • Loading branch information
riley-harper committed Nov 20, 2024
1 parent 2e58078 commit 8150ee5
Showing 1 changed file with 19 additions and 0 deletions.
19 changes: 19 additions & 0 deletions hlink/linking/core/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
Interaction,
)
import hlink.linking.transformers.float_cast_transformer
from hlink.linking.transformers.rename_vector_attributes import RenameVectorAttributes
import logging

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -143,7 +144,25 @@ def generate_pipeline_stages(conf, ind_vars, tf, tconf):
inputCols=input_cols,
outputCol=pipeline_feature["output_column"],
)

# Spark's Interaction builds its output vector attribute names by
# joining the input column names with colons (":"). This works
# fine for most of the down-pipeline transformers, but LightGBM
# cannot run with attribute names that contain colons, so this
# custom hlink transformer replaces the colons in the vector
# attribute names with underscores.
#
# Without this step, the colons propagate into the attribute
# names for the features vector created by the VectorAssembler
# and cause an error when training a LightGBM model.
remove_colons_from_interaction_vector = RenameVectorAttributes(
inputCol=interaction.getOutputCol(),
strsToReplace=[":"],
replaceWith="_",
)

pipeline_stages.append(interaction)
pipeline_stages.append(remove_colons_from_interaction_vector)

if len(categorical_pipeline_features) > 0:
encoded_output_cols = [
Expand Down

0 comments on commit 8150ee5

Please sign in to comment.