Commit

Sync pop
  commit a54c64aed5c86a6b1e9e9c93df3670e761092bb9
  Author: Dominik Jain <[email protected]>
  Date:   Thu Jul 13 11:29:05 2023 +0200

      Address deprecation warning in OneHotEncoder

  src/sensai/data_transformation/dft.py

  commit 34d1cacac4648958e6481c57e8456879e54924f7
  Author: Dominik Jain <[email protected]>
  Date:   Thu Jul 13 11:28:39 2023 +0200

      Add util.version to facilitate version-dependent changes in dependencies

  src/sensai/util/version.py

  commit 21660307061ec79e120c3d3e89bed59afef3e058
  Author: Dominik Jain <[email protected]>
  Date:   Wed Jul 12 20:25:21 2023 +0200

      Added keyword parameter removeInputPreprocessors (changing the old behaviour,
      as the default is False), because the removal of input preprocessors is
      strictly valid only if no learning is involved, which is a strong assumption

  src/sensai/feature_selection/rfe.py

  commit 982d8cd9eb14ec4a943557487e4a06e8da47422e
  Author: Dominik Jain <[email protected]>
  Date:   Wed Jul 12 20:25:09 2023 +0200

      VectorModel.withFeatureTransformers: Allow the existing list of transformers to be extended

  src/sensai/data_transformation/dft.py
  src/sensai/vector_model.py

  commit b29816643d7f3aba6ef8546e6c76adcdc1a31867
  Author: Dominik Jain <[email protected]>
  Date:   Wed Jul 12 14:28:13 2023 +0200

      Improve type hints of builder-style 'with*' methods (parametric polymorphism)

  src/sensai/vector_model.py
opcode81 committed Jul 19, 2023
1 parent 0322617 commit cdda979
Showing 4 changed files with 88 additions and 21 deletions.
18 changes: 15 additions & 3 deletions src/sensai/data_transformation/dft.py
@@ -6,6 +6,7 @@

import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import OneHotEncoder

from .sklearn_transformer import SkLearnTransformerProtocol
@@ -17,6 +18,8 @@

from typing import TYPE_CHECKING

from ..util.version import Version

if TYPE_CHECKING:
from ..featuregen import FeatureGenerator

@@ -200,6 +203,9 @@ def findFirstTransformerByType(self, cls) -> Optional[DataFrameTransformer]:
return dft
return None

def append(self, t: DataFrameTransformer):
self.dataFrameTransformers.append(t)


class DFTRenameColumns(RuleBasedDataFrameTransformer):
def __init__(self, columnsMap: Dict[str, str]):
@@ -371,11 +377,17 @@ def __init__(self, columns: Optional[Union[str, Sequence[str]]],
self.handleUnknown = "ignore" if ignoreUnknown else "error"
if categories is not None:
if type(categories) == dict:
self.oneHotEncoders = {col: OneHotEncoder(categories=[np.sort(categories)], sparse=False, handle_unknown=self.handleUnknown) for col, categories in categories.items()}
self.oneHotEncoders = {col: OneHotEncoder(categories=[np.sort(categories)], handle_unknown=self.handleUnknown, **self._sparseKwargs()) for col, categories in categories.items()}
else:
if len(columns) != len(categories):
raise ValueError(f"Given categories must have the same length as columns to process")
self.oneHotEncoders = {col: OneHotEncoder(categories=[np.sort(categories)], sparse=False, handle_unknown=self.handleUnknown) for col, categories in zip(columns, categories)}
self.oneHotEncoders = {col: OneHotEncoder(categories=[np.sort(categories)], handle_unknown=self.handleUnknown, **self._sparseKwargs()) for col, categories in zip(columns, categories)}

def _sparseKwargs(self, sparse=False):
if Version(sklearn).isAtLeast(1, 2):
return dict(sparse_output=sparse)
else:
return dict(sparse=sparse)

def __setstate__(self, state):
if "arrayValuedResult" not in state:
@@ -393,7 +405,7 @@ def _fit(self, df: pd.DataFrame):
if len(self._columnsToEncode) == 0:
log.warning(f"{self} does not apply to any columns, transformer has no effect; regex='{self._columnNameRegex}'")
if self.oneHotEncoders is None:
self.oneHotEncoders = {column: OneHotEncoder(categories=[np.sort(df[column].unique())], sparse=False, handle_unknown=self.handleUnknown) for column in self._columnsToEncode}
self.oneHotEncoders = {column: OneHotEncoder(categories=[np.sort(df[column].unique())], handle_unknown=self.handleUnknown, **self._sparseKwargs()) for column in self._columnsToEncode}
for columnName in self._columnsToEncode:
self.oneHotEncoders[columnName].fit(df[[columnName]])

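The version-dependent keyword selection above exists because scikit-learn 1.2 renamed OneHotEncoder's 'sparse' parameter to 'sparse_output' (the old name is what triggered the deprecation warning). A minimal standalone sketch of the same technique; the helper name sparse_kwargs and the example categories are illustrative, not part of the diff:

import sklearn
from sklearn.preprocessing import OneHotEncoder
from sensai.util.version import Version

def sparse_kwargs(sparse=False):
    # pick whichever keyword the installed scikit-learn version expects
    if Version(sklearn).isAtLeast(1, 2):
        return dict(sparse_output=sparse)
    return dict(sparse=sparse)

encoder = OneHotEncoder(categories=[["a", "b", "c"]], handle_unknown="ignore", **sparse_kwargs())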
34 changes: 26 additions & 8 deletions src/sensai/feature_selection/rfe.py
@@ -7,6 +7,7 @@
import numpy as np

from sensai import VectorModel, InputOutputData
from sensai.data_transformation import DFTKeepColumns, DFTColumnFilter
from sensai.evaluation import VectorModelCrossValidatorParams, createVectorModelCrossValidator
from sensai.feature_importance import FeatureImportanceProvider, AggregatedFeatureImportance
from sensai.util.plot import ScatterPlot
@@ -37,6 +38,8 @@ def __init__(self, crossValidatorParams: VectorModelCrossValidatorParams, minFea
:param crossValidatorParams: the parameters for cross-validation
:param minFeatures: the minimum number of features to evaluate
"""
if not crossValidatorParams.returnTrainedModels:
raise ValueError("crossValidatorParams: returnTrainedModels is required to be enabled")
self.crossValidatorParams = crossValidatorParams
self.minFeatures = minFeatures

@@ -81,25 +84,36 @@ def plotMetricValues(self) -> plt.Figure:
return ScatterPlot(self.getNumFeaturesArray(), self.getMetricValuesArray(), c_opacity=1, x_label="number of features",
y_label=f"cross-validation mean metric value ({self.metricName})").fig

def run(self, model: Union[VectorModel, FeatureImportanceProvider], ioData: InputOutputData, metricName: str, minimise: bool) -> Result:
def run(self, model: Union[VectorModel, FeatureImportanceProvider], ioData: InputOutputData, metricName: str,
minimise: bool, removeInputPreprocessors=False) -> Result:
"""
Runs the optimisation for the given model and data.
:param model: the model
:param ioData: the data
:param metricName: the metric to optimise
:param minimise: whether the metric shall be minimised; if False, maximise.
:param removeInputPreprocessors: whether to remove input preprocessors from the model and create input data
only once during the entire experiment; this is usually reasonable only if all input preprocessors are not
trained on the input data or if, for any given data split/fold, the preprocessor learning outcome is likely
to be largely similar.
:return: a result object, which provides access to the selected features and data on all elimination steps
"""
metricKey = f"mean[{metricName}]"

model = copy(model)
model.fitInputOutputData(ioData, fitPreprocessors=True, fitModel=False)
inputs = model.computeModelInputs(ioData.inputs)
model.removeInputPreprocessors()
ioData = InputOutputData(inputs, ioData.outputs)
dftColumnFilter = None
if removeInputPreprocessors:
model = copy(model)
model.fitInputOutputData(ioData, fitPreprocessors=True, fitModel=False)
inputs = model.computeModelInputs(ioData.inputs)
model.removeInputPreprocessors()
ioData = InputOutputData(inputs, ioData.outputs)
features = list(inputs.columns)
else:
features = None # can only be obtained after having fitted the model initially (see below)
dftColumnFilter = DFTColumnFilter()
model.withFeatureTransformers(dftColumnFilter, add=True)

features = list(inputs.columns)
steps = []
while True:
# evaluate model
@@ -108,6 +122,9 @@ def run(self, model: Union[VectorModel, FeatureImportanceProvider], ioData: Inpu
aggMetricsDict = crossValData.getEvalStatsCollection().aggMetricsDict()
metricValue = aggMetricsDict[metricKey]

if features is None:
features = crossValData.trainedModels[0].getModelInputVariableNames()

steps.append(self.Step(metricValue=metricValue, features=features))

# eliminate feature(s)
@@ -127,7 +144,8 @@ def run(self, model: Union[VectorModel, FeatureImportanceProvider], ioData: Inpu
eliminatedFeatures = [tuples[0][0]]
log.info(f"Eliminating feature {eliminatedFeatures[0]}")
features = [f for f in features if f not in eliminatedFeatures]
ioData.inputs = ioData.inputs[features]
dftColumnFilter.keep = features

log.info(f"{len(features)} features remain")

if len(features) < self.minFeatures:
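A hypothetical usage sketch for the revised run method; the class name RecursiveFeatureElimination and the model/ioData variables are assumptions, while the returnTrainedModels requirement comes from the new check in __init__:

from sensai.evaluation import VectorModelCrossValidatorParams

# returnTrainedModels=True is now required, since feature importances (and, when
# removeInputPreprocessors=False, the feature names) are read from the fold models
params = VectorModelCrossValidatorParams(folds=5, returnTrainedModels=True)
rfe = RecursiveFeatureElimination(params, minFeatures=5)
# with the new default removeInputPreprocessors=False, preprocessors stay in the
# model and features are restricted via the appended DFTColumnFilter instead
result = rfe.run(model, ioData, metricName="RMSE", minimise=True)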
27 changes: 27 additions & 0 deletions src/sensai/util/version.py
@@ -0,0 +1,27 @@


class Version:
"""
Assists in checking the version of a Python package based on the __version__ attribute
"""
def __init__(self, package):
"""
:param package: the package object
"""
self.components = package.__version__.split(".")

def isAtLeast(self, *components: int):
"""
Checks this version against the given version components.
This version object must contain at least the respective number of components.
:param components: version components in order (i.e. major, minor, patch, etc.)
:return: True if the version is at least the given version, False otherwise
"""
for i, desired_min_version in enumerate(components):
actual_version = int(self.components[i])
if actual_version < desired_min_version:
return False
elif actual_version > desired_min_version:
return True
return True
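A brief usage sketch for the new helper (numpy is just an illustrative package):

import numpy
from sensai.util.version import Version

v = Version(numpy)        # reads numpy.__version__, e.g. "1.24.3" -> ["1", "24", "3"]
print(v.isAtLeast(1, 20)) # True for any numpy >= 1.20
print(v.isAtLeast(2))     # False for any 1.x release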
30 changes: 20 additions & 10 deletions src/sensai/vector_model.py
@@ -5,6 +5,7 @@
"""

import logging
import typing
from abc import ABC, abstractmethod
from typing import List, Any, Optional, Union, Type, Dict

@@ -25,6 +26,9 @@
markUsed(InputOutputData) # for backward compatibility

log = logging.getLogger(__name__)
TVectorModelBase = typing.TypeVar("TVectorModelBase", bound="VectorModelBase")
TVectorModel = typing.TypeVar("TVectorModel", bound="VectorModel")
TVectorRegressionModel = typing.TypeVar("TVectorRegressionModel", bound="VectorRegressionModel")


class VectorModelBase(ABC):
Expand All @@ -47,7 +51,7 @@ def isRegressionModel(self) -> bool:
def getPredictedVariableNames(self) -> list:
pass

def withName(self, name: str):
def withName(self: TVectorModelBase, name: str) -> TVectorModelBase:
"""
Sets the model's name.
@@ -148,7 +152,7 @@ def _toStringAdditionalEntries(self) -> Dict[str, Any]:
d["name"] = self._name
return d

def withRawInputTransformers(self, *transformers: Union[DataFrameTransformer, List[DataFrameTransformer]]):
def withRawInputTransformers(self: TVectorModel, *transformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> TVectorModel:
"""
Makes the model use the given transformers (removing previously set raw input transformers, if any), which
are to be applied to the raw input data frame (prior to feature generation).
@@ -159,22 +163,28 @@ def withRawInputTransformers(self, *transformers: Union[DataFrameTransformer, Li
self._rawInputTransformerChain = DataFrameTransformerChain(*transformers)
return self

def withFeatureTransformers(self, *transformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> __qualname__:
def withFeatureTransformers(self: TVectorModel, *transformers: Union[DataFrameTransformer, List[DataFrameTransformer]],
add=False) -> TVectorModel:
"""
Makes the model use the given transformers (removing previously set feature transformers, if any),
Makes the model use the given transformers
which are to be applied to the data frames generated by feature generators.
(If the model does not use feature generators, the transformers will be applied to
whatever is produced by the raw input transformers or, if there are none, the original raw
input data frame).
:param transformers: :class:`DataFrameTransformer` instances to use (in sequence) for the transformation of features
:param add: whether to add the transformers to the existing transformers rather than replacing them
:return: self
"""
self._featureTransformerChain = DataFrameTransformerChain(*transformers)
if not add:
self._featureTransformerChain = DataFrameTransformerChain(*transformers)
else:
for t in transformers:
self._featureTransformerChain.append(t)
return self
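With the new add flag, further transformers can be appended without discarding those already configured; a hypothetical call sequence (the transformer variables are assumptions):

model.withFeatureTransformers(dftNormalisation)           # replaces any previously set feature transformers
model.withFeatureTransformers(dftColumnFilter, add=True)  # appends, keeping dftNormalisation in place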

@deprecated("Use withFeatureTransformers instead; this method will be removed in a future sensAI release.")
def withInputTransformers(self, *inputTransformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> __qualname__:
def withInputTransformers(self: TVectorModel, *inputTransformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> TVectorModel:
"""
Makes the model use the given feature transformers (removing previously set transformers, if any),
i.e. it transforms the data frame that is generated by the feature generators (if any).
@@ -184,7 +194,7 @@ def withInputTransformers(self, *inputTransformers: Union[DataFrameTransformer,
"""
return self.withFeatureTransformers(*inputTransformers)

def withFeatureGenerator(self, featureGenerator: Optional[FeatureGenerator]) -> __qualname__:
def withFeatureGenerator(self: TVectorModel, featureGenerator: Optional[FeatureGenerator]) -> TVectorModel:
"""
Makes the model use the given feature generator in order to obtain the model inputs.
If the model shall use more than one feature generator, pass a :class:`MultiFeatureGenerator` which combines them or
@@ -198,7 +208,7 @@ def withFeatureGenerator(self, featureGenerator: Optional[FeatureGenerator]) ->
self._featureGenerator = featureGenerator
return self

def withFeatureCollector(self, featureCollector: FeatureCollector) -> __qualname__:
def withFeatureCollector(self: TVectorModel, featureCollector: FeatureCollector) -> TVectorModel:
"""
Makes the model use the given feature collector's multi-feature generator
in order compute the underlying model's input from the data frame that is given.
@@ -502,7 +512,7 @@ def _toStringExcludeExceptions(self) -> List[str]:
def isRegressionModel(self) -> bool:
return True

def withOutputTransformers(self, *outputTransformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> __qualname__:
def withOutputTransformers(self: TVectorRegressionModel, *outputTransformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> TVectorRegressionModel:
"""
Makes the model use the given output transformers. Call with empty input to remove existing output transformers.
The transformers are ignored during the fit phase. Not supported for rule-based models.
Expand Down Expand Up @@ -531,7 +541,7 @@ def withOutputTransformers(self, *outputTransformers: Union[DataFrameTransformer
self._outputTransformerChain = DataFrameTransformerChain(*outputTransformers)
return self

def withTargetTransformer(self, targetTransformer: Optional[InvertibleDataFrameTransformer]) -> __qualname__:
def withTargetTransformer(self: TVectorRegressionModel, targetTransformer: Optional[InvertibleDataFrameTransformer]) -> TVectorRegressionModel:
"""
Makes the model use the given target transformers such that the underlying low-level model is trained on the transformed
targets, but this high-level model still outputs the original (untransformed) values, i.e. the transformation is applied
Expand Down
