Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Distribution data check #21

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion checkmates/data_checks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,6 @@
from checkmates.data_checks.checks.invalid_target_data_check import (
InvalidTargetDataCheck,
)

from checkmates.data_checks.checks.distribution_data_check import DistributionDataCheck

from checkmates.data_checks.datacheck_meta.utils import handle_data_check_action_code
107 changes: 107 additions & 0 deletions checkmates/data_checks/checks/distribution_data_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""Data check that screens data for skewed or bimodal distributions prior to model training to ensure model performance is unaffected."""

from diptest import diptest
from scipy.stats import skew

from checkmates.data_checks import (
DataCheck,
DataCheckActionCode,
DataCheckActionOption,
DataCheckMessageCode,
DataCheckWarning,
)


class DistributionDataCheck(DataCheck):
    """Check whether numeric features are skewed or bimodal and may need to be transformed prior to training.

    Every numeric column is screened with the sample skewness and the dip
    test; each flagged column produces a warning whose action option
    recommends a Yeo-Johnson transformation.
    """

    def validate(self, X, y=None):
        """Check if the numeric columns of the data have a skewed or bimodal distribution.

        Args:
            X (pd.DataFrame): Features to check. The frame is accessed through its
                ``ww`` accessor, so Woodwork must already be initialized on it.
            y (pd.Series, np.ndarray, optional): Target data. Accepted for
                compatibility with the data check interface; it is not examined
                by this check. Defaults to None.

        Returns:
            list[dict]: One DataCheckWarning (as a dictionary) per numeric column
            that is flagged as skewed or bimodal. Empty if nothing is flagged.

        Examples:
            >>> import pandas as pd

            A right-skewed feature raises a warning recommending a transformation.

            >>> X = pd.DataFrame({"feature": [5, 7, 8, 9, 10, 11, 12, 15, 20]})
            >>> X.ww.init()
            >>> data_check = DistributionDataCheck()
            >>> messages = data_check.validate(X)
            >>> [message["code"] for message in messages]
            ['SKEWED_DISTRIBUTION']
        """
        messages = []

        numeric_X = X.ww.select(["Integer", "Double"])

        # Iterating a DataFrame yields column *names*, so the column data has
        # to be looked up explicitly before it is handed to the helper.
        for col in numeric_X.columns:
            # Missing values would turn the skewness into NaN, so they are
            # dropped; the helper receives a plain 1-D float array.
            values = numeric_X[col].dropna().to_numpy(dtype=float)
            if values.size == 0:
                continue

            (
                is_skew,
                distribution_type,
                skew_value,
                coef,
            ) = _detect_skew_distribution_helper(values)

            if not is_skew:
                continue

            details = {
                "distribution type": distribution_type,
                "Skew Value": skew_value,
                "Bimodal Coefficient": coef,
            }
            messages.append(
                DataCheckWarning(
                    message="Data may have a skewed distribution.",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.SKEWED_DISTRIBUTION,
                    details=details,
                    action_options=[
                        DataCheckActionOption(
                            DataCheckActionCode.TRANSFORM_FEATURES,
                            data_check_name=self.name,
                            metadata={
                                "is_skew": True,
                                "transformation_strategy": "yeojohnson",
                                # A list, so consumers that ``extend`` their
                                # column list with this value add the column
                                # name rather than its individual characters.
                                "columns": [col],
                            },
                        ),
                    ],
                ).to_dict(),
            )
        return messages


def _detect_skew_distribution_helper(X):
    """Classify a sample as bimodal, negatively skewed, positively skewed or unremarkable.

    Args:
        X (array-like): One-dimensional numeric sample.

    Returns:
        tuple: ``(is_skew, distribution_type, skew_value, coef)`` where
        ``is_skew`` is True for every label except ``"no skew"``,
        ``skew_value`` is the sample skewness and ``coef`` is the second
        element returned by ``diptest`` (the p-value of the dip test).
    """
    skewness = skew(X)
    dip_p_value = diptest(X)[1]

    # The bimodality verdict takes precedence over the skew direction.
    if dip_p_value < 0.05:
        label = "bimodal distribution"
    elif skewness < -0.5:
        label = "negative skew"
    elif skewness > 0.5:
        label = "positive skew"
    else:
        label = "no skew"

    return label != "no skew", label, skewness, dip_p_value
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ class DataCheckActionCode(Enum):
TRANSFORM_TARGET = "transform_target"
"""Action code for transforming the target data."""

TRANSFORM_FEATURES = "transform_features"
"""Action code for transforming the features data."""

REGULARIZE_AND_IMPUTE_DATASET = "regularize_and_impute_dataset"
"""Action code for regularizing and imputing all features and target time series data."""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ class DataCheckMessageCode(Enum):
TARGET_LOGNORMAL_DISTRIBUTION = "target_lognormal_distribution"
"""Message code for target data with a lognormal distribution."""

SKEWED_DISTRIBUTION = "skewed_distribution"
"""Message code for data with a skewed distribution."""

HIGH_VARIANCE = "high_variance"
"""Message code for when high variance is detected for cross-validation."""

Expand Down
2 changes: 1 addition & 1 deletion checkmates/pipelines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from checkmates.pipelines.component_base_meta import ComponentBaseMeta
from checkmates.pipelines.component_base import ComponentBase
from checkmates.pipelines.transformers import Transformer
from checkmates.pipelines.transformers import Transformer, SimpleNormalizer
from checkmates.pipelines.components import ( # noqa: F401
DropColumns,
DropRowsTransformer,
Expand Down
50 changes: 50 additions & 0 deletions checkmates/pipelines/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import pandas as pd
import woodwork
from scipy.stats import yeojohnson
from sklearn.impute import SimpleImputer as SkImputer

from checkmates.exceptions import MethodPropertyNotFoundError
Expand Down Expand Up @@ -83,6 +84,55 @@ def _get_feature_provenance(self):
return {}


"""Component that normalizes skewed distributions using the Yeo-Johnson method"""


class SimpleNormalizer(Transformer):
    """Normalizes skewed data according to the Yeo-Johnson method.

    Args:
        columns (list, set, str, optional): Name(s) of the columns to
            normalize. Defaults to None, in which case ``transform`` returns
            the data unchanged.
    """

    def __init__(self, columns=None):
        if columns is None:
            cols = []
        elif isinstance(columns, str):
            # A bare string is one column name, not an iterable of characters.
            cols = [columns]
        else:
            cols = list(columns)

        super().__init__(parameters={"columns": cols})
        # ``transform`` reads this attribute, so it has to live on the instance.
        self._cols_to_normalize = cols

    def transform(self, X, y=None):
        """Transforms input by normalizing the distribution of the configured columns.

        Args:
            X (pd.DataFrame): Data to transform.
            y (pd.Series, optional): Target data. Ignored.

        Returns:
            pd.DataFrame: Transformed X. The input frame is not modified; when
            no columns are configured, X itself is returned.
        """
        # If there are no columns to normalize, return the data untouched
        if not self._cols_to_normalize:
            return X

        # Work on a copy so the caller's frame is left as it was
        X_t = X.copy()

        # yeojohnson expects 1-D input and, when no lambda is supplied,
        # returns a (transformed_data, fitted_lambda) pair, so each column is
        # transformed on its own and only the data is kept.
        for col in self._cols_to_normalize:
            transformed, _ = yeojohnson(X_t[col].to_numpy(dtype=float))
            X_t[col] = transformed

        # Reinit woodwork
        X_t.ww.init()
        return X_t

    def fit_transform(self, X, y=None):
        """Fits on X and transforms X.

        Args:
            X (pd.DataFrame): Data to fit and transform
            y (pd.Series, optional): Target data.

        Returns:
            pd.DataFrame: Transformed X
        """
        return self.fit(X, y).transform(X, y)


"""Component that imputes missing data according to a specified imputation strategy."""


Expand Down
7 changes: 7 additions & 0 deletions checkmates/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
TimeSeriesRegularizer,
)
from checkmates.pipelines.training_validation_split import TrainingValidationSplit
from checkmates.pipelines.transformers import SimpleNormalizer
from checkmates.problem_types import is_classification, is_regression, is_time_series
from checkmates.utils import infer_feature_types

Expand All @@ -31,6 +32,7 @@ def _make_component_list_from_actions(actions):
components = []
cols_to_drop = []
indices_to_drop = []
cols_to_normalize = []

for action in actions:
if action.action_code == DataCheckActionCode.REGULARIZE_AND_IMPUTE_DATASET:
Expand All @@ -47,6 +49,8 @@ def _make_component_list_from_actions(actions):
)
elif action.action_code == DataCheckActionCode.DROP_COL:
cols_to_drop.extend(action.metadata["columns"])
elif action.action_code == DataCheckActionCode.TRANSFORM_FEATURES:
cols_to_normalize.extend(action.metadata["columns"])
elif action.action_code == DataCheckActionCode.IMPUTE_COL:
metadata = action.metadata
parameters = metadata.get("parameters", {})
Expand All @@ -65,6 +69,9 @@ def _make_component_list_from_actions(actions):
if indices_to_drop:
indices_to_drop = sorted(set(indices_to_drop))
components.append(DropRowsTransformer(indices_to_drop=indices_to_drop))
if cols_to_normalize:
cols_to_normalize = set(cols_to_normalize)
components.append(SimpleNormalizer(columns=cols_to_normalize))

return components

Expand Down
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Created ``distribution_data_check`` to screen for positive and negative skews as well as bimodal distributions :pr:`21`
* Fixes
* Changes
* Documentation Changes
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dependencies = [
"woodwork>=0.22.0",
"click>=8.0.0",
"black[jupyter]>=22.3.0",
"diptest>=0.5.2",
]
requires-python = ">=3.8,<4.0"
readme = "README.md"
Expand Down
74 changes: 74 additions & 0 deletions tests/data_checks_tests/test_distribution_data_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Testing Data to make sure skews are recognized-- successful
import numpy as np
import pandas as pd
from diptest import diptest
from scipy.stats import skew

from checkmates.data_checks import (
DataCheckActionCode,
DataCheckActionOption,
DataCheckMessageCode,
DataCheckWarning,
)


def _detect_skew_distribution_helper(X):
    """Label a sample's distribution using its skewness and the dip test.

    Returns a 4-tuple: whether the sample was flagged, the label
    ("bimodal distribution", "negative skew", "positive skew" or "no skew"),
    the sample skewness, and the second element returned by ``diptest``
    (the dip test p-value).
    """
    sample_skew = skew(X)
    p_value = diptest(X)[1]

    # Checked in priority order: bimodality first, then skew direction.
    if p_value < 0.05:
        kind = "bimodal distribution"
    elif sample_skew < -0.5:
        kind = "negative skew"
    elif sample_skew > 0.5:
        kind = "positive skew"
    else:
        kind = "no skew"

    flagged = kind != "no skew"
    return flagged, kind, sample_skew, p_value


# Seed the generator so the sample data is the same on every run.
np.random.seed(0)

data = {
    "Column1": np.random.normal(0, 1, 1000),  # Normally distributed data
    "Column2": np.random.exponential(1, 1000),  # Right-skewed data
    # NOTE(review): originally labelled "left-skewed", but the reciprocal of a
    # gamma variate has its long tail on the right.
    "Column3": 1 / (np.random.gamma(2, 2, 1000)),
}

df = pd.DataFrame(data)
df.ww.init()
messages = []

numeric_X = df.ww.select(["Integer", "Double"])
print(numeric_X)
for col in numeric_X.columns:
    # Screen the column currently being visited; a hard-coded column name
    # would report that one column's verdict under every column's name.
    (
        is_skew,
        distribution_type,
        skew_value,
        coef,
    ) = _detect_skew_distribution_helper(numeric_X[col])

    if is_skew:
        details = {
            "distribution type": distribution_type,
            "Skew Value": skew_value,
            "Bimodal Coefficient": coef,
        }
        messages.append(
            DataCheckWarning(
                message="Data may have a skewed distribution.",
                data_check_name="Distribution Data Check",
                message_code=DataCheckMessageCode.SKEWED_DISTRIBUTION,
                details=details,
                action_options=[
                    DataCheckActionOption(
                        DataCheckActionCode.TRANSFORM_FEATURES,
                        data_check_name="Distribution Data Check",
                        metadata={
                            "is_skew": True,
                            "transformation_strategy": "yeojohnson",
                            # A list of names, so it can be merged with other
                            # column lists without being split into characters.
                            "columns": [col],
                        },
                    ),
                ],
            ).to_dict(),
        )
print(messages)
41 changes: 41 additions & 0 deletions tests/data_checks_tests/test_normalizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import numpy as np
import pandas as pd
from scipy.stats import yeojohnson

# Seed the generator so the sample data is the same on every run.
np.random.seed(0)

data = {
    "Column1": np.random.normal(0, 1, 1000),  # Normally distributed data
    "Column2": np.random.exponential(1, 1000),  # Right-skewed data
    # NOTE(review): originally labelled "left-skewed", but the reciprocal of a
    # gamma variate has its long tail on the right.
    "Column3": 1 / (np.random.gamma(2, 2, 1000)),
}

X = pd.DataFrame(data)

_cols_to_normalize = "Column2"


def transform(self, X, _cols_to_normalize):
    """Transforms input by normalizing the distribution of the chosen columns.

    Args:
        self: Unused placeholder that mirrors the method signature of the
            component this function stands in for; pass None.
        X (pd.DataFrame): Data to transform.
        _cols_to_normalize (str, list): Name(s) of the columns to normalize.

    Returns:
        pd.DataFrame: Transformed copy of X, or X itself when there are no
        columns to normalize.
    """
    # If there are no columns to normalize, hand the data back untouched
    if not _cols_to_normalize:
        return X

    # A bare string is a single column name, not an iterable of characters
    if isinstance(_cols_to_normalize, str):
        columns = [_cols_to_normalize]
    else:
        columns = list(_cols_to_normalize)

    # Work on a copy so the caller's frame is left as it was
    X_t = X.copy()

    # yeojohnson expects 1-D input and, when no lambda is supplied, returns a
    # (transformed_data, fitted_lambda) pair; keep only the data.
    for column in columns:
        transformed, _ = yeojohnson(X_t[column].to_numpy(dtype=float))
        X_t[column] = transformed

    # Reinit woodwork, but only when its accessor is registered (this module
    # does not import woodwork itself)
    if hasattr(X_t, "ww"):
        X_t.ww.init()

    return X_t


# Arguments are passed in the order the signature declares them.
X_transformed = transform(None, X, _cols_to_normalize)

# Only the requested column may change.
assert X_transformed["Column1"].equals(X["Column1"])
assert not X_transformed["Column2"].equals(X["Column2"])