alteryx · NabilFayak · Sep 5, 2023 · Sep 5, 2023 · Sep 5, 2023 · Sep 5, 2023
diff --git a/checkmates/data_checks/__init__.py b/checkmates/data_checks/__init__.py
@@ -52,6 +52,6 @@
 from checkmates.data_checks.checks.invalid_target_data_check import (
     InvalidTargetDataCheck,
 )
-
+from checkmates.data_checks.checks.distribution_data_check import DistributionDataCheck
 
 from checkmates.data_checks.datacheck_meta.utils import handle_data_check_action_code
diff --git a/checkmates/data_checks/checks/distribution_data_check.py b/checkmates/data_checks/checks/distribution_data_check.py
@@ -0,0 +1,107 @@
+"""Data check that screens data for skewed or bimodal distributions prior to model training to ensure model performance is unaffected."""
+
+from diptest import diptest
+from scipy.stats import skew
+
+from checkmates.data_checks import (
+    DataCheck,
+    DataCheckActionCode,
+    DataCheckActionOption,
+    DataCheckMessageCode,
+    DataCheckWarning,
+)
+
+
+class DistributionDataCheck(DataCheck):
+    """Check whether numeric features are skewed or bimodal and may need transforming before training.
+
+    Every numeric column is screened with the skew test and the dip test; a flagged
+    column produces a warning that recommends a Yeo-Johnson transformation.
+    """
+
+    def validate(self, X, y):
+        """Check each numeric column of X for a skewed or bimodal distribution.
+
+        Args:
+            X (pd.DataFrame): Features to check. Woodwork must already be initialized on it,
+                because the numeric columns are selected through ``X.ww.select``.
+            y (pd.Series, np.ndarray): Target data. Not used by this check.
+
+        Returns:
+            list[dict]: One DataCheckWarning (converted to a dictionary) per numeric column that was flagged; empty when no column is flagged.
+
+        Examples:
+            >>> import pandas as pd
+
+            A feature with a long right tail is reported so that the user can transform it.
+
+            >>> X = pd.DataFrame({"feature": [5, 7, 8, 9, 10, 11, 12, 15, 20]})
+            >>> X.ww.init()
+            >>> data_check = DistributionDataCheck()
+            >>> messages = data_check.validate(X, y=None)
+            >>> [message["message"] for message in messages]
+            ['Data may have a skewed distribution.']
+        """
+        messages = []
+
+        numeric_X = X.ww.select(["Integer", "Double"])
+
+        # Iterating a DataFrame yields column names, so the column values must be
+        # looked up explicitly before they are handed to the statistical tests.
+        for col in numeric_X:
+            (
+                is_skew,
+                distribution_type,
+                skew_value,
+                coef,
+            ) = _detect_skew_distribution_helper(numeric_X[col])
+
+            if not is_skew:
+                continue
+
+            details = {
+                "distribution type": distribution_type,
+                "Skew Value": skew_value,
+                "Bimodal Coefficient": coef,
+            }
+            messages.append(
+                DataCheckWarning(
+                    message="Data may have a skewed distribution.",
+                    data_check_name=self.name,
+                    message_code=DataCheckMessageCode.SKEWED_DISTRIBUTION,
+                    details=details,
+                    action_options=[
+                        DataCheckActionOption(
+                            DataCheckActionCode.TRANSFORM_FEATURES,
+                            data_check_name=self.name,
+                            metadata={
+                                "is_skew": True,
+                                "transformation_strategy": "yeojohnson",
+                                # Must be a list: the pipeline builder extends a list
+                                # with this value, and a bare string would be split
+                                # into single characters.
+                                "columns": [col],
+                            },
+                        ),
+                    ],
+                ).to_dict(),
+            )
+        return messages
+
+
+def _detect_skew_distribution_helper(X):
+    """Classify a sample as bimodal, negatively skewed, positively skewed, or none of these.
+
+    Args:
+        X: Sample passed unchanged to ``scipy.stats.skew`` and ``diptest``.
+
+    Returns:
+        tuple: ``(flagged, label, skew_value, coef)``. ``flagged`` is False only for the
+            "no skew" label. ``coef`` is the second element of the ``diptest`` result
+            (NOTE(review): presumably the dip test p-value; confirm against the diptest docs).
+    """
+    skewness = skew(X)
+    dip_second = diptest(X)[1]
+
+    # The dip-test check takes precedence over the skew thresholds.
+    if dip_second < 0.05:
+        label = "bimodal distribution"
+    elif skewness < -0.5:
+        label = "negative skew"
+    elif skewness > 0.5:
+        label = "positive skew"
+    else:
+        label = "no skew"
+
+    return label != "no skew", label, skewness, dip_second
diff --git a/checkmates/data_checks/datacheck_meta/data_check_action_code.py b/checkmates/data_checks/datacheck_meta/data_check_action_code.py
@@ -19,6 +19,9 @@ class DataCheckActionCode(Enum):
     TRANSFORM_TARGET = "transform_target"
     """Action code for transforming the target data."""
 
+    TRANSFORM_FEATURES = "transform_features"
+    """Action code for transforming the features data."""
+
     REGULARIZE_AND_IMPUTE_DATASET = "regularize_and_impute_dataset"
     """Action code for regularizing and imputing all features and target time series data."""
 

diff --git a/checkmates/data_checks/datacheck_meta/data_check_message_code.py b/checkmates/data_checks/datacheck_meta/data_check_message_code.py
@@ -58,6 +58,9 @@ class DataCheckMessageCode(Enum):
     TARGET_LOGNORMAL_DISTRIBUTION = "target_lognormal_distribution"
     """Message code for target data with a lognormal distribution."""
 
+    SKEWED_DISTRIBUTION = "skewed_distribution"
+    """Message code for data with a skewed distribution."""
+
     HIGH_VARIANCE = "high_variance"
     """Message code for when high variance is detected for cross-validation."""
 

diff --git a/checkmates/pipelines/__init__.py b/checkmates/pipelines/__init__.py
@@ -2,7 +2,7 @@
 
 from checkmates.pipelines.component_base_meta import ComponentBaseMeta
 from checkmates.pipelines.component_base import ComponentBase
-from checkmates.pipelines.transformers import Transformer
+from checkmates.pipelines.transformers import Transformer, SimpleNormalizer
 from checkmates.pipelines.components import (  # noqa: F401
     DropColumns,
     DropRowsTransformer,

diff --git a/checkmates/pipelines/transformers.py b/checkmates/pipelines/transformers.py
@@ -3,6 +3,7 @@
 
 import pandas as pd
 import woodwork
+from scipy.stats import yeojohnson
 from sklearn.impute import SimpleImputer as SkImputer
 
 from checkmates.exceptions import MethodPropertyNotFoundError
@@ -83,6 +84,55 @@ def _get_feature_provenance(self):
         return {}
 
 
+"""Component that normalizes skewed distributions using the Yeo-Johnson method"""
+
+
+class SimpleNormalizer(Transformer):
+    """Normalizes skewed columns according to the Yeo-Johnson method.
+
+    Args:
+        columns (str, list-like, optional): Name(s) of the columns to normalize.
+            Defaults to None, in which case ``transform`` returns its input unchanged.
+    """
+
+    # NOTE(review): the base classes are not visible from here; confirm whether they
+    # require additional class attributes (for example a display ``name``).
+
+    def __init__(self, columns=None):
+        if columns is None:
+            cols = []
+        elif isinstance(columns, str):
+            cols = [columns]
+        else:
+            # Callers may pass a set, which pandas does not accept as an indexer,
+            # so materialize an order-preserving, de-duplicated list.
+            cols = list(dict.fromkeys(columns))
+        super().__init__(parameters={"columns": cols})
+        self._cols_to_normalize = cols
+        # Yeo-Johnson lambdas learned by fit(), keyed by column name.
+        self._lambdas = {}
+
+    def fit(self, X, y=None):
+        """Learns the Yeo-Johnson lambda of every column to normalize.
+
+        Args:
+            X (pd.DataFrame): Data to fit on.
+            y (pd.Series, optional): Target data. Ignored.
+
+        Returns:
+            self
+        """
+        # Without an explicit lmbda, yeojohnson returns (transformed values, fitted lambda).
+        self._lambdas = {
+            col: yeojohnson(X[col].to_numpy(dtype=float))[1]
+            for col in self._cols_to_normalize
+        }
+        return self
+
+    def transform(self, X, y=None):
+        """Transforms input by normalizing distribution.
+
+        Args:
+            X (pd.DataFrame): Data to transform. It is not modified.
+            y (pd.Series, optional): Target Data. Ignored.
+
+        Returns:
+            pd.DataFrame: X itself when there are no columns to normalize, otherwise a
+                transformed copy with Woodwork re-initialized.
+        """
+        # If there are no columns to normalize, return early
+        if not self._cols_to_normalize:
+            return X
+
+        X_t = X.copy()
+
+        # yeojohnson expects 1-dimensional input, so transform one column at a time
+        for col in self._cols_to_normalize:
+            values = X_t[col].to_numpy(dtype=float)
+            lmbda = self._lambdas.get(col)
+            if lmbda is None:
+                # Not fitted for this column: estimate the lambda from the data at hand
+                transformed, _ = yeojohnson(values)
+            else:
+                transformed = yeojohnson(values, lmbda=lmbda)
+            X_t[col] = transformed
+
+        # Reinit woodwork
+        X_t.ww.init()
+        return X_t
+
+    def fit_transform(self, X, y=None):
+        """Fits on X and transforms X.
+
+        Args:
+            X (pd.DataFrame): Data to fit and transform
+            y (pd.Series, optional): Target data.
+
+        Returns:
+            pd.DataFrame: Transformed X
+        """
+        return self.fit(X, y).transform(X, y)
+
+
 """Component that imputes missing data according to a specified imputation strategy."""
 
 

diff --git a/checkmates/pipelines/utils.py b/checkmates/pipelines/utils.py
@@ -15,6 +15,7 @@
     TimeSeriesRegularizer,
 )
 from checkmates.pipelines.training_validation_split import TrainingValidationSplit
+from checkmates.pipelines.transformers import SimpleNormalizer
 from checkmates.problem_types import is_classification, is_regression, is_time_series
 from checkmates.utils import infer_feature_types
 
@@ -31,6 +32,7 @@ def _make_component_list_from_actions(actions):
     components = []
     cols_to_drop = []
     indices_to_drop = []
+    cols_to_normalize = []
 
     for action in actions:
         if action.action_code == DataCheckActionCode.REGULARIZE_AND_IMPUTE_DATASET:
@@ -47,6 +49,8 @@ def _make_component_list_from_actions(actions):
             )
         elif action.action_code == DataCheckActionCode.DROP_COL:
             cols_to_drop.extend(action.metadata["columns"])
+        elif action.action_code == DataCheckActionCode.TRANSFORM_FEATURES:
+            cols_to_normalize.extend(action.metadata["columns"])
         elif action.action_code == DataCheckActionCode.IMPUTE_COL:
             metadata = action.metadata
             parameters = metadata.get("parameters", {})
@@ -65,6 +69,9 @@ def _make_component_list_from_actions(actions):
     if indices_to_drop:
         indices_to_drop = sorted(set(indices_to_drop))
         components.append(DropRowsTransformer(indices_to_drop=indices_to_drop))
+    if cols_to_normalize:
+        cols_to_normalize = set(cols_to_normalize)
+        components.append(SimpleNormalizer(columns=cols_to_normalize))
 
     return components
 

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -2,6 +2,7 @@ Release Notes
 -------------
 **Future Releases**
     * Enhancements
+        * Created ``distribution_data_check`` to screen for positive and negative skews as well as bimodal distributions :pr:`21`
     * Fixes
     * Changes
     * Documentation Changes

diff --git a/pyproject.toml b/pyproject.toml
@@ -15,6 +15,7 @@ dependencies = [
     "woodwork>=0.22.0",
     "click>=8.0.0",
     "black[jupyter]>=22.3.0",
+    "diptest>=0.5.2",
 ]
 requires-python = ">=3.8,<4.0"
 readme = "README.md"

diff --git a/tests/data_checks_tests/test_distribution_data_check.py b/tests/data_checks_tests/test_distribution_data_check.py
@@ -0,0 +1,74 @@
+# Testing Data to make sure skews are recognized-- successful
+import numpy as np
+import pandas as pd
+from diptest import diptest
+from scipy.stats import skew
+
+from checkmates.data_checks import (
+    DataCheckActionCode,
+    DataCheckActionOption,
+    DataCheckMessageCode,
+    DataCheckWarning,
+)
+
+
+def _detect_skew_distribution_helper(X):
+    """Classify a sample as bimodal, negatively skewed, positively skewed, or none of these.
+
+    Args:
+        X: Sample passed unchanged to ``scipy.stats.skew`` and ``diptest``.
+
+    Returns:
+        tuple: ``(flagged, label, skew_value, coef)``. ``flagged`` is False only for the
+            "no skew" label. ``coef`` is the second element of the ``diptest`` result
+            (NOTE(review): presumably the dip test p-value; confirm against the diptest docs).
+    """
+    skewness = skew(X)
+    dip_second = diptest(X)[1]
+
+    # The dip-test check takes precedence over the skew thresholds.
+    if dip_second < 0.05:
+        label = "bimodal distribution"
+    elif skewness < -0.5:
+        label = "negative skew"
+    elif skewness > 0.5:
+        label = "positive skew"
+    else:
+        label = "no skew"
+
+    return label != "no skew", label, skewness, dip_second
+
+
+# A locally seeded generator keeps the fixture identical on every run.
+rng = np.random.default_rng(0)
+data = {
+    "Column1": rng.normal(0, 1, 1000),  # Normally distributed data
+    "Column2": rng.exponential(1, 1000),  # Right-skewed data
+    "Column3": -rng.exponential(1, 1000),  # Left-skewed data (mirrored exponential)
+}
+
+df = pd.DataFrame(data)
+df.ww.init()
+messages = []
+# Details of every flagged column, keyed by column name, for the assertions below.
+flagged_details = {}
+
+numeric_X = df.ww.select(["Integer", "Double"])
+for col in numeric_X:
+    # Look up the values of the column being visited; iterating the frame yields names only.
+    (
+        is_skew,
+        distribution_type,
+        skew_value,
+        coef,
+    ) = _detect_skew_distribution_helper(numeric_X[col])
+
+    if is_skew:
+        details = {
+            "distribution type": distribution_type,
+            "Skew Value": skew_value,
+            "Bimodal Coefficient": coef,
+        }
+        flagged_details[col] = details
+        messages.append(
+            DataCheckWarning(
+                message="Data may have a skewed distribution.",
+                data_check_name="Distribution Data Check",
+                message_code=DataCheckMessageCode.SKEWED_DISTRIBUTION,
+                details=details,
+                action_options=[
+                    DataCheckActionOption(
+                        DataCheckActionCode.TRANSFORM_FEATURES,
+                        data_check_name="Distribution Data Check",
+                        metadata={
+                            "is_skew": True,
+                            "transformation_strategy": "yeojohnson",
+                            # A list, so that consumers can extend their column lists with it.
+                            "columns": [col],
+                        },
+                    ),
+                ],
+            ).to_dict(),
+        )
+
+
+def test_skewed_columns_are_flagged():
+    """Both skewed columns are reported, each with a skew value of the expected sign."""
+    assert len(messages) == len(flagged_details)
+    assert flagged_details["Column2"]["Skew Value"] > 0.5
+    assert flagged_details["Column3"]["Skew Value"] < -0.5
diff --git a/tests/data_checks_tests/test_normalizer.py b/tests/data_checks_tests/test_normalizer.py
@@ -0,0 +1,41 @@
+import numpy as np
+import pandas as pd
+from scipy.stats import yeojohnson
+
+# A locally seeded generator keeps the fixture identical on every run.
+rng = np.random.default_rng(0)
+data = {
+    "Column1": rng.normal(0, 1, 1000),  # Normally distributed data
+    "Column2": rng.exponential(1, 1000),  # Right-skewed data
+    "Column3": -rng.exponential(1, 1000),  # Left-skewed data (mirrored exponential)
+}
+
+X = pd.DataFrame(data)
+
+_cols_to_normalize = "Column2"
+
+
+def transform(self, X, _cols_to_normalize):
+    """Return X with the requested columns Yeo-Johnson transformed.
+
+    Args:
+        self: Unused placeholder for the instance argument.
+        X (pd.DataFrame): Data to transform. It is not modified.
+        _cols_to_normalize (str, list-like, None): Name(s) of the columns to transform.
+
+    Returns:
+        pd.DataFrame: X itself when there is nothing to normalize, otherwise a transformed copy.
+    """
+    # If there are no columns to normalize, return early
+    if not _cols_to_normalize:
+        return X
+
+    if isinstance(_cols_to_normalize, str):
+        columns = [_cols_to_normalize]
+    else:
+        columns = list(_cols_to_normalize)
+
+    X_t = X.copy()
+
+    for col in columns:
+        # Without an explicit lmbda, yeojohnson returns (transformed values, fitted
+        # lambda) and expects 1-dimensional input, hence one column at a time.
+        transformed, _ = yeojohnson(X_t[col].to_numpy(dtype=float))
+        X_t[col] = transformed
+
+    # The ``ww`` accessor exists only once woodwork has been imported; reinit when it does
+    if hasattr(X_t, "ww"):
+        X_t.ww.init()
+    return X_t
+
+
+X_t = transform(None, X, _cols_to_normalize)
+
+
+def test_transform_reduces_skew():
+    """Only the requested column changes, and its skew shrinks."""
+    assert list(X_t.columns) == list(X.columns)
+    assert X_t["Column1"].equals(X["Column1"])
+    assert X_t["Column3"].equals(X["Column3"])
+    assert abs(X_t["Column2"].skew()) < abs(X["Column2"].skew())
+    assert transform(None, X, None) is X