Commit

Sync pop
  commit a54c64aed5c86a6b1e9e9c93df3670e761092bb9
  Author: Dominik Jain <[email protected]>
  Date:   Thu Jul 13 11:29:05 2023 +0200

      Address deprecation warning in OneHotEncoder

  src/sensai/data_transformation/dft.py

  commit 34d1cacac4648958e6481c57e8456879e54924f7
  Author: Dominik Jain <[email protected]>
  Date:   Thu Jul 13 11:28:39 2023 +0200

      Add util.version to facilitate version-dependent changes in dependencies

  src/sensai/util/version.py

  commit 21660307061ec79e120c3d3e89bed59afef3e058
  Author: Dominik Jain <[email protected]>
  Date:   Wed Jul 12 20:25:21 2023 +0200

      Added keyword parameter removeInputPreprocessors (changing the old behaviour,
      as the default is False), because the removal of input preprocessors is
      strictly valid only if no learning is involved, which is a strong assumption

  src/sensai/feature_selection/rfe.py

  commit 982d8cd9eb14ec4a943557487e4a06e8da47422e
  Author: Dominik Jain <[email protected]>
  Date:   Wed Jul 12 20:25:09 2023 +0200

      VectorModel.withFeatureTransformers: Allow the existing list of transformers to be extended

  src/sensai/data_transformation/dft.py
  src/sensai/vector_model.py

  commit b29816643d7f3aba6ef8546e6c76adcdc1a31867
  Author: Dominik Jain <[email protected]>
  Date:   Wed Jul 12 14:28:13 2023 +0200

      Improve type hints of builder-style 'with*' methods (parametric polymorphism)

  src/sensai/vector_model.py
opcode81 committed Jul 19, 2023
1 parent 0322617 commit cdda979
Showing 4 changed files with 88 additions and 21 deletions.
18 changes: 15 additions & 3 deletions src/sensai/data_transformation/dft.py
@@ -6,6 +6,7 @@

import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import OneHotEncoder

from .sklearn_transformer import SkLearnTransformerProtocol
@@ -17,6 +18,8 @@

from typing import TYPE_CHECKING

from ..util.version import Version

if TYPE_CHECKING:
from ..featuregen import FeatureGenerator

@@ -200,6 +203,9 @@ def findFirstTransformerByType(self, cls) -> Optional[DataFrameTransformer]:
return dft
return None

def append(self, t: DataFrameTransformer):
self.dataFrameTransformers.append(t)


class DFTRenameColumns(RuleBasedDataFrameTransformer):
def __init__(self, columnsMap: Dict[str, str]):
@@ -371,11 +377,17 @@ def __init__(self, columns: Optional[Union[str, Sequence[str]]],
self.handleUnknown = "ignore" if ignoreUnknown else "error"
if categories is not None:
if type(categories) == dict:
self.oneHotEncoders = {col: OneHotEncoder(categories=[np.sort(categories)], sparse=False, handle_unknown=self.handleUnknown) for col, categories in categories.items()}
self.oneHotEncoders = {col: OneHotEncoder(categories=[np.sort(categories)], handle_unknown=self.handleUnknown, **self._sparseKwargs()) for col, categories in categories.items()}
else:
if len(columns) != len(categories):
raise ValueError(f"Given categories must have the same length as columns to process")
self.oneHotEncoders = {col: OneHotEncoder(categories=[np.sort(categories)], sparse=False, handle_unknown=self.handleUnknown) for col, categories in zip(columns, categories)}
self.oneHotEncoders = {col: OneHotEncoder(categories=[np.sort(categories)], handle_unknown=self.handleUnknown, **self._sparseKwargs()) for col, categories in zip(columns, categories)}

def _sparseKwargs(self, sparse=False):
if Version(sklearn).isAtLeast(1, 2):
return dict(sparse_output=sparse)
else:
return dict(sparse=sparse)

def __setstate__(self, state):
if "arrayValuedResult" not in state:
@@ -393,7 +405,7 @@ def _fit(self, df: pd.DataFrame):
if len(self._columnsToEncode) == 0:
log.warning(f"{self} does not apply to any columns, transformer has no effect; regex='{self._columnNameRegex}'")
if self.oneHotEncoders is None:
self.oneHotEncoders = {column: OneHotEncoder(categories=[np.sort(df[column].unique())], sparse=False, handle_unknown=self.handleUnknown) for column in self._columnsToEncode}
self.oneHotEncoders = {column: OneHotEncoder(categories=[np.sort(df[column].unique())], handle_unknown=self.handleUnknown, **self._sparseKwargs()) for column in self._columnsToEncode}
for columnName in self._columnsToEncode:
self.oneHotEncoders[columnName].fit(df[[columnName]])

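The version-dependent keyword selection above exists because scikit-learn 1.2 renamed OneHotEncoder's 'sparse' parameter to 'sparse_output' (the old name is what triggered the deprecation warning). A minimal standalone sketch of the same technique; the helper name sparse_kwargs and the example categories are illustrative, not part of the diff:

import sklearn
from sklearn.preprocessing import OneHotEncoder
from sensai.util.version import Version

def sparse_kwargs(sparse=False):
    # pick whichever keyword the installed scikit-learn version expects
    if Version(sklearn).isAtLeast(1, 2):
        return dict(sparse_output=sparse)
    return dict(sparse=sparse)

encoder = OneHotEncoder(categories=[["a", "b", "c"]], handle_unknown="ignore", **sparse_kwargs())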
34 changes: 26 additions & 8 deletions src/sensai/feature_selection/rfe.py
@@ -7,6 +7,7 @@
import numpy as np

from sensai import VectorModel, InputOutputData
from sensai.data_transformation import DFTKeepColumns, DFTColumnFilter
from sensai.evaluation import VectorModelCrossValidatorParams, createVectorModelCrossValidator
from sensai.feature_importance import FeatureImportanceProvider, AggregatedFeatureImportance
from sensai.util.plot import ScatterPlot
@@ -37,6 +38,8 @@ def __init__(self, crossValidatorParams: VectorModelCrossValidatorParams, minFea
:param crossValidatorParams: the parameters for cross-validation
:param minFeatures: the minimum number of features to evaluate
"""
if not crossValidatorParams.returnTrainedModels:
raise ValueError("crossValidatorParams: returnTrainedModels is required to be enabled")
self.crossValidatorParams = crossValidatorParams
self.minFeatures = minFeatures

@@ -81,25 +84,36 @@ def plotMetricValues(self) -> plt.Figure:
return ScatterPlot(self.getNumFeaturesArray(), self.getMetricValuesArray(), c_opacity=1, x_label="number of features",
y_label=f"cross-validation mean metric value ({self.metricName})").fig

def run(self, model: Union[VectorModel, FeatureImportanceProvider], ioData: InputOutputData, metricName: str, minimise: bool) -> Result:
def run(self, model: Union[VectorModel, FeatureImportanceProvider], ioData: InputOutputData, metricName: str,
minimise: bool, removeInputPreprocessors=False) -> Result:
"""
Runs the optimisation for the given model and data.
:param model: the model
:param ioData: the data
:param metricName: the metric to optimise
:param minimise: whether the metric shall be minimised; if False, maximise.
:param removeInputPreprocessors: whether to remove input preprocessors from the model and create input data
only once during the entire experiment; this is usually reasonable only if all input preprocessors are not
trained on the input data or if, for any given data split/fold, the preprocessor learning outcome is likely
to be largely similar.
:return: a result object, which provides access to the selected features and data on all elimination steps
"""
metricKey = f"mean[{metricName}]"

model = copy(model)
model.fitInputOutputData(ioData, fitPreprocessors=True, fitModel=False)
inputs = model.computeModelInputs(ioData.inputs)
model.removeInputPreprocessors()
ioData = InputOutputData(inputs, ioData.outputs)
dftColumnFilter = None
if removeInputPreprocessors:
model = copy(model)
model.fitInputOutputData(ioData, fitPreprocessors=True, fitModel=False)
inputs = model.computeModelInputs(ioData.inputs)
model.removeInputPreprocessors()
ioData = InputOutputData(inputs, ioData.outputs)
features = list(inputs.columns)
else:
features = None # can only be obtained after having fitted the model initially (see below)
dftColumnFilter = DFTColumnFilter()
model.withFeatureTransformers(dftColumnFilter, add=True)

features = list(inputs.columns)
steps = []
while True:
# evaluate model
@@ -108,6 +122,9 @@ def run(self, model: Union[VectorModel, FeatureImportanceProvider], ioData: Inpu
aggMetricsDict = crossValData.getEvalStatsCollection().aggMetricsDict()
metricValue = aggMetricsDict[metricKey]

if features is None:
features = crossValData.trainedModels[0].getModelInputVariableNames()

steps.append(self.Step(metricValue=metricValue, features=features))

# eliminate feature(s)
@@ -127,7 +144,8 @@ def run(self, model: Union[VectorModel, FeatureImportanceProvider], ioData: Inpu
eliminatedFeatures = [tuples[0][0]]
log.info(f"Eliminating feature {eliminatedFeatures[0]}")
features = [f for f in features if f not in eliminatedFeatures]
ioData.inputs = ioData.inputs[features]
dftColumnFilter.keep = features

log.info(f"{len(features)} features remain")

if len(features) < self.minFeatures:
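A hypothetical usage sketch for the revised run method; the class name RecursiveFeatureElimination and the model/ioData variables are assumptions, while the returnTrainedModels requirement comes from the new check in __init__:

from sensai.evaluation import VectorModelCrossValidatorParams

# returnTrainedModels=True is now required, since feature importances (and, when
# removeInputPreprocessors=False, the feature names) are read from the fold models
params = VectorModelCrossValidatorParams(folds=5, returnTrainedModels=True)
rfe = RecursiveFeatureElimination(params, minFeatures=5)
# with the new default removeInputPreprocessors=False, preprocessors stay in the
# model and features are restricted via the appended DFTColumnFilter instead
result = rfe.run(model, ioData, metricName="RMSE", minimise=True)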
27 changes: 27 additions & 0 deletions src/sensai/util/version.py
@@ -0,0 +1,27 @@


class Version:
"""
Assists in checking the version of a Python package based on the __version__ attribute
"""
def __init__(self, package):
"""
:param package: the package object
"""
self.components = package.__version__.split(".")

def isAtLeast(self, *components: int):
"""
Checks this version against the given version components.
This version object must contain at least the respective number of components.
:param components: version components in order (i.e. major, minor, patch, etc.)
:return: True if the version is at least the given version, False otherwise
"""
for i, desired_min_version in enumerate(components):
actual_version = int(self.components[i])
if actual_version < desired_min_version:
return False
elif actual_version > desired_min_version:
return True
return True
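A brief usage sketch for the new helper (numpy is just an illustrative package):

import numpy
from sensai.util.version import Version

v = Version(numpy)        # reads numpy.__version__, e.g. "1.24.3" -> ["1", "24", "3"]
print(v.isAtLeast(1, 20)) # True for any numpy >= 1.20
print(v.isAtLeast(2))     # False for any 1.x release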
30 changes: 20 additions & 10 deletions src/sensai/vector_model.py
@@ -5,6 +5,7 @@
"""

import logging
import typing
from abc import ABC, abstractmethod
from typing import List, Any, Optional, Union, Type, Dict

@@ -25,6 +26,9 @@
markUsed(InputOutputData) # for backward compatibility

log = logging.getLogger(__name__)
TVectorModelBase = typing.TypeVar("TVectorModelBase", bound="VectorModelBase")
TVectorModel = typing.TypeVar("TVectorModel", bound="VectorModel")
TVectorRegressionModel = typing.TypeVar("TVectorRegressionModel", bound="VectorRegressionModel")


class VectorModelBase(ABC):
Expand All @@ -47,7 +51,7 @@ def isRegressionModel(self) -> bool:
def getPredictedVariableNames(self) -> list:
pass

def withName(self, name: str):
def withName(self: TVectorModelBase, name: str) -> TVectorModelBase:
"""
Sets the model's name.
@@ -148,7 +152,7 @@ def _toStringAdditionalEntries(self) -> Dict[str, Any]:
d["name"] = self._name
return d

def withRawInputTransformers(self, *transformers: Union[DataFrameTransformer, List[DataFrameTransformer]]):
def withRawInputTransformers(self: TVectorModel, *transformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> TVectorModel:
"""
Makes the model use the given transformers (removing previously set raw input transformers, if any), which
are to be applied to the raw input data frame (prior to feature generation).
@@ -159,22 +163,28 @@ def withRawInputTransformers(self, *transformers: Union[DataFrameTransformer, Li
self._rawInputTransformerChain = DataFrameTransformerChain(*transformers)
return self

def withFeatureTransformers(self, *transformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> __qualname__:
def withFeatureTransformers(self: TVectorModel, *transformers: Union[DataFrameTransformer, List[DataFrameTransformer]],
add=False) -> TVectorModel:
"""
Makes the model use the given transformers (removing previously set feature transformers, if any),
Makes the model use the given transformers
which are to be applied to the data frames generated by feature generators.
(If the model does not use feature generators, the transformers will be applied to
whatever is produced by the raw input transformers or, if there are none, the original raw
input data frame).
:param transformers: :class:`DataFrameTransformer` instances to use (in sequence) for the transformation of features
:param add: whether to add the transformers to the existing transformers rather than replacing them
:return: self
"""
self._featureTransformerChain = DataFrameTransformerChain(*transformers)
if not add:
self._featureTransformerChain = DataFrameTransformerChain(*transformers)
else:
for t in transformers:
self._featureTransformerChain.append(t)
return self
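With the new add flag, further transformers can be appended without discarding those already configured; a hypothetical call sequence (the transformer variables are assumptions):

model.withFeatureTransformers(dftNormalisation)           # replaces any previously set feature transformers
model.withFeatureTransformers(dftColumnFilter, add=True)  # appends, keeping dftNormalisation in place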

@deprecated("Use withFeatureTransformers instead; this method will be removed in a future sensAI release.")
def withInputTransformers(self, *inputTransformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> __qualname__:
def withInputTransformers(self: TVectorModel, *inputTransformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> TVectorModel:
"""
Makes the model use the given feature transformers (removing previously set transformers, if any),
i.e. it transforms the data frame that is generated by the feature generators (if any).
@@ -184,7 +194,7 @@ def withInputTransformers(self, *inputTransformers: Union[DataFrameTransformer,
"""
return self.withFeatureTransformers(*inputTransformers)

def withFeatureGenerator(self, featureGenerator: Optional[FeatureGenerator]) -> __qualname__:
def withFeatureGenerator(self: TVectorModel, featureGenerator: Optional[FeatureGenerator]) -> TVectorModel:
"""
Makes the model use the given feature generator in order to obtain the model inputs.
If the model shall use more than one feature generator, pass a :class:`MultiFeatureGenerator` which combines them or
@@ -198,7 +208,7 @@ def withFeatureGenerator(self, featureGenerator: Optional[FeatureGenerator]) ->
self._featureGenerator = featureGenerator
return self

def withFeatureCollector(self, featureCollector: FeatureCollector) -> __qualname__:
def withFeatureCollector(self: TVectorModel, featureCollector: FeatureCollector) -> TVectorModel:
"""
Makes the model use the given feature collector's multi-feature generator
in order compute the underlying model's input from the data frame that is given.
@@ -502,7 +512,7 @@ def _toStringExcludeExceptions(self) -> List[str]:
def isRegressionModel(self) -> bool:
return True

def withOutputTransformers(self, *outputTransformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> __qualname__:
def withOutputTransformers(self: TVectorRegressionModel, *outputTransformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> TVectorRegressionModel:
"""
Makes the model use the given output transformers. Call with empty input to remove existing output transformers.
The transformers are ignored during the fit phase. Not supported for rule-based models.
Expand Down Expand Up @@ -531,7 +541,7 @@ def withOutputTransformers(self, *outputTransformers: Union[DataFrameTransformer
self._outputTransformerChain = DataFrameTransformerChain(*outputTransformers)
return self

def withTargetTransformer(self, targetTransformer: Optional[InvertibleDataFrameTransformer]) -> __qualname__:
def withTargetTransformer(self: TVectorRegressionModel, targetTransformer: Optional[InvertibleDataFrameTransformer]) -> TVectorRegressionModel:
"""
Makes the model use the given target transformers such that the underlying low-level model is trained on the transformed
targets, but this high-level model still outputs the original (untransformed) values, i.e. the transformation is applied
Expand Down
