diff --git a/hlink/linking/core/pipeline.py b/hlink/linking/core/pipeline.py index e06edc8..8331a31 100644 --- a/hlink/linking/core/pipeline.py +++ b/hlink/linking/core/pipeline.py @@ -12,6 +12,7 @@ Interaction, ) import hlink.linking.transformers.float_cast_transformer +from hlink.linking.transformers.rename_vector_attributes import RenameVectorAttributes import logging logger = logging.getLogger(__name__) @@ -143,7 +144,25 @@ def generate_pipeline_stages(conf, ind_vars, tf, tconf): inputCols=input_cols, outputCol=pipeline_feature["output_column"], ) + + # Spark's Interaction creates its output vector attribute names + # by concatenating the input column names with colons :. This + # works fine for most of the down-pipeline transformers, but + # LightGBM cannot run with attribute names that contain colons. + # So this custom hlink transformer replaces colons in the vector + # attribute names with underscores. + # + # Without this step, the colons propagate into the attribute + # names for the features vector created by the VectorAssembler + # and cause an error when training a LightGBM model. + remove_colons_from_interaction_vector = RenameVectorAttributes( + inputCol=interaction.getOutputCol(), + strsToReplace=[":"], + replaceWith="_", + ) + pipeline_stages.append(interaction) + pipeline_stages.append(remove_colons_from_interaction_vector) if len(categorical_pipeline_features) > 0: encoded_output_cols = [