Commit 6feb897

mgarbacz committed
Add support for passing check_additivity argument
1 parent c82c984 commit 6feb897

6 files changed: 75 additions, 52 deletions


.pre-commit-config.yaml

Lines changed: 4 additions & 3 deletions
@@ -6,7 +6,7 @@ repos:
 entry: black
 language: python
 types: [python]
-language_version: python3.8
+language_version: python3
 args: [--line-length=120]
 - repo: local
 hooks:
@@ -23,12 +23,13 @@ repos:
 entry: flake8
 language: system
 types: [python]
-args: [--max-line-length=120, --docstring-convention=google, "--ignore=D100,D104,D202,D212,D200,E203,E731,W293,D412,D417,W503"]
+args: [--max-line-length=120, --docstring-convention=google, "--ignore=D100,D104,D202,D212,D200,E203,E731,W293,D412,D417,W503,D411"]
 # D100 requires all Python files (modules) to have a "public" docstring even if all functions within have a docstring.
 # D104 requires __init__ files to have a docstring
 # D202 No blank lines allowed after function docstring
 # D212
-# D200
+# D200
+# D411 Missing blank line before section
 # D412 No blank lines allowed between a section header and its content
 # D417 Missing argument descriptions in the docstring # Only ignored because of false positve when using multiline args.
 # E203

probatus/feature_elimination/feature_elimination.py

Lines changed: 26 additions & 5 deletions
@@ -333,7 +333,7 @@ def _report_current_results(
 self.report_df = pd.concat([self.report_df, current_row], axis=0)

 @staticmethod
-def _get_feature_shap_values_per_fold(X, y, clf, train_index, val_index, scorer, verbose=0):
+def _get_feature_shap_values_per_fold(X, y, clf, train_index, val_index, scorer, verbose=0, **shap_kwargs):
 """
 This function calculates the shap values on validation set, and Train and Val score.

@@ -365,6 +365,12 @@ def _get_feature_shap_values_per_fold(X, y, clf, train_index, val_index, scorer,
 - 51 - 100 - shows most important warnings, prints of the feature removal process
 - above 100 - presents all prints and all warnings (including SHAP warnings).

+**shap_kwargs:
+keyword arguments passed to
+[shap.Explainer](https://shap.readthedocs.io/en/latest/generated/shap.Explainer.html#shap.Explainer).
+It also enables `approximate` and `check_additivity` parameters, passed while calculating SHAP values.
+The `approximate=True` causes less accurate, but faster SHAP values calculation, while
+`check_additivity=False` disables the additivity check inside SHAP.
 Returns:
 (np.array, float, float):
 Tuple with the results: Shap Values on validation fold, train score, validation score.
@@ -380,10 +386,10 @@ def _get_feature_shap_values_per_fold(X, y, clf, train_index, val_index, scorer,
 score_val = scorer(clf, X_val, y_val)

 # Compute SHAP values
-shap_values = shap_calc(clf, X_val, verbose=verbose)
+shap_values = shap_calc(clf, X_val, verbose=verbose, **shap_kwargs)
 return shap_values, score_train, score_val

-def fit(self, X, y, columns_to_keep=None, column_names=None):
+def fit(self, X, y, columns_to_keep=None, column_names=None, **shap_kwargs):
 """
 Fits the object with the provided data.

@@ -413,6 +419,13 @@ def fit(self, X, y, columns_to_keep=None, column_names=None):
 feature names. If not provided the existing feature names are used or default feature names are
 generated.

+**shap_kwargs:
+keyword arguments passed to
+[shap.Explainer](https://shap.readthedocs.io/en/latest/generated/shap.Explainer.html#shap.Explainer).
+It also enables `approximate` and `check_additivity` parameters, passed while calculating SHAP values.
+The `approximate=True` causes less accurate, but faster SHAP values calculation, while
+`check_additivity=False` disables the additivity check inside SHAP.
+
 Returns:
 (ShapRFECV): Fitted object.
 """
@@ -502,6 +515,7 @@ def fit(self, X, y, columns_to_keep=None, column_names=None):
 val_index=val_index,
 scorer=self.scorer.scorer,
 verbose=self.verbose,
+**shap_kwargs,
 )
 for train_index, val_index in self.cv.split(current_X, self.y)
 )
@@ -557,7 +571,7 @@ def compute(self):

 return self.report_df

-def fit_compute(self, X, y, columns_to_keep=None, column_names=None):
+def fit_compute(self, X, y, columns_to_keep=None, column_names=None, **shap_kwargs):
 """
 Fits the object with the provided data.

@@ -586,12 +600,19 @@ def fit_compute(self, X, y, columns_to_keep=None, column_names=None):
 feature names. If not provided the existing feature names are used or default feature names are
 generated.

+**shap_kwargs:
+keyword arguments passed to
+[shap.Explainer](https://shap.readthedocs.io/en/latest/generated/shap.Explainer.html#shap.Explainer).
+It also enables `approximate` and `check_additivity` parameters, passed while calculating SHAP values.
+The `approximate=True` causes less accurate, but faster SHAP values calculation, while
+`check_additivity=False` disables the additivity check inside SHAP.
+
 Returns:
 (pd.DataFrame):
 DataFrame containing results of feature elimination from each iteration.
 """

-self.fit(X, y, columns_to_keep=columns_to_keep, column_names=column_names)
+self.fit(X, y, columns_to_keep=columns_to_keep, column_names=column_names, **shap_kwargs)
 return self.compute()

 def get_reduced_features_set(self, num_features):
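Taken together, these changes let callers forward SHAP keyword arguments from `ShapRFECV.fit`/`fit_compute` down to `shap_calc`. A minimal sketch of the new call path; it assumes the `ShapRFECV` constructor arguments (`clf`, `step`, `cv`, `scoring`) documented by probatus and uses purely illustrative data:

# Sketch: forwarding check_additivity through ShapRFECV (assumed constructor signature).
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from probatus.feature_elimination import ShapRFECV

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(10)])

shap_rfecv = ShapRFECV(clf=RandomForestClassifier(random_state=0), step=0.2, cv=5, scoring="roc_auc")

# check_additivity is picked up by shap_calc via **shap_kwargs and disables SHAP's additivity check.
report = shap_rfecv.fit_compute(X, y, check_additivity=False)
print(report)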

probatus/interpret/model_interpret.py

Lines changed: 9 additions & 31 deletions
@@ -104,17 +104,7 @@ def __init__(self, clf, scoring="roc_auc", verbose=0):
 self.scorer = get_single_scorer(scoring)
 self.verbose = verbose

-def fit(
-self,
-X_train,
-X_test,
-y_train,
-y_test,
-column_names=None,
-class_names=None,
-approximate=False,
-**shap_kwargs,
-):
+def fit(self, X_train, X_test, y_train, y_test, column_names=None, class_names=None, **shap_kwargs):
 """
 Fits the object and calculates the shap values for the provided datasets.

@@ -138,12 +128,12 @@ def fit(
 List of class names e.g. ['neg', 'pos']. If none, the default ['Negative Class', 'Positive Class'] are
 used.

-approximate (boolean, optional):
-if True uses shap approximations - less accurate, but very fast.
-
 **shap_kwargs:
 keyword arguments passed to
 [shap.Explainer](https://shap.readthedocs.io/en/latest/generated/shap.Explainer.html#shap.Explainer).
+It also enables `approximate` and `check_additivity` parameters, passed while calculating SHAP values.
+The `approximate=True` causes less accurate, but faster SHAP values calculation, while
+`check_additivity=False` disables the additivity check inside SHAP.
 """

 self.X_train, self.column_names = preprocess_data(
@@ -171,7 +161,6 @@ def fit(
 clf=self.clf,
 X=self.X_train,
 y=self.y_train,
-approximate=approximate,
 column_names=self.column_names,
 class_names=self.class_names,
 verbose=self.verbose,
@@ -182,7 +171,6 @@ def fit(
 clf=self.clf,
 X=self.X_test,
 y=self.y_test,
-approximate=approximate,
 column_names=self.column_names,
 class_names=self.class_names,
 verbose=self.verbose,
@@ -285,7 +273,6 @@ def fit_compute(
 y_test,
 column_names=None,
 class_names=None,
-approximate=False,
 return_scores=False,
 **shap_kwargs,
 ):
@@ -314,19 +301,19 @@ def fit_compute(
 If none, the default ['Negative Class', 'Positive Class'] are
 used.

-approximate (boolean, optional):
-if True uses shap approximations - less accurate, but very fast.
-
 return_scores (bool, optional):
 Flag indicating whether the method should return
 the train and test score of the model,
 together with the model interpretation report. If true,
 the output of this method is a tuple of DataFrame, float,
 float.

-**shap_kwargs: keyword arguments passed to
+**shap_kwargs:
 keyword arguments passed to
 [shap.Explainer](https://shap.readthedocs.io/en/latest/generated/shap.Explainer.html#shap.Explainer).
+It also enables `approximate` and `check_additivity` parameters, passed while calculating SHAP values.
+The `approximate=True` causes less accurate, but faster SHAP values calculation, while
+`check_additivity=False` disables the additivity check inside SHAP.

 Returns:
 (pd.DataFrame or tuple(pd.DataFrame, float, float)):
@@ -340,20 +327,11 @@ def fit_compute(
 y_test=y_test,
 column_names=column_names,
 class_names=class_names,
-approximate=approximate,
 **shap_kwargs,
 )
 return self.compute()

-def plot(
-self,
-plot_type,
-target_set="test",
-target_columns=None,
-samples_index=None,
-show=True,
-**plot_kwargs,
-):
+def plot(self, plot_type, target_set="test", target_columns=None, samples_index=None, show=True, **plot_kwargs):
 """
 Plots the appropriate SHAP plot.

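Note that `approximate` is no longer a named argument of `fit`/`fit_compute` here; both `approximate` and `check_additivity` now travel through `**shap_kwargs`. A sketch of the updated call, assuming the constructor shown above and illustrative data:

# Sketch: approximate/check_additivity passed as SHAP kwargs to ShapModelInterpreter.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from probatus.interpret import ShapModelInterpreter

X, y = make_classification(n_samples=300, n_features=8, random_state=0)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(8)])
X_train, X_test, y_train, y_test = train_test_split(X, pd.Series(y), random_state=0)

clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
interpreter = ShapModelInterpreter(clf)

feature_importance = interpreter.fit_compute(
    X_train, X_test, y_train, y_test,
    approximate=True,        # was a named fit() argument before this commit
    check_additivity=False,  # newly supported, disables SHAP's additivity check
)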

probatus/interpret/shap_dependence.py

Lines changed: 22 additions & 10 deletions
@@ -82,22 +82,33 @@ def __repr__(self):
 """
 return "Shap dependence plotter for {}".format(self.clf.__class__.__name__)

-def fit(self, X, y, column_names=None, class_names=None, precalc_shap=None):
+def fit(self, X, y, column_names=None, class_names=None, precalc_shap=None, **shap_kwargs):
 """
 Fits the plotter to the model and data by computing the shap values.

 If the shap_values are passed, they do not need to be computed.

 Args:
 X (pd.DataFrame): input variables.
+
 y (pd.Series): target variable.
+
 column_names (None, or list of str, optional):
 List of feature names for the dataset. If None, then column names from the X_train dataframe are used.
+
 class_names (None, or list of str, optional):
 List of class names e.g. ['neg', 'pos']. If none, the default ['Negative Class', 'Positive Class'] are
 used.
+
 precalc_shap (Optional, None or np.array):
 Precalculated shap values, If provided they don't need to be computed.
+
+**shap_kwargs:
+keyword arguments passed to
+[shap.Explainer](https://shap.readthedocs.io/en/latest/generated/shap.Explainer.html#shap.Explainer).
+It also enables `approximate` and `check_additivity` parameters, passed while calculating SHAP values.
+The `approximate=True` causes less accurate, but faster SHAP values calculation, while
+`check_additivity=False` disables the additivity check inside SHAP.
 """
 self.X, self.column_names = preprocess_data(X, X_name="X", column_names=column_names, verbose=self.verbose)
 self.y = preprocess_labels(y, y_name="y", index=self.X.index, verbose=self.verbose)
@@ -107,7 +118,7 @@ def fit(self, X, y, column_names=None, class_names=None, precalc_shap=None):
 if self.class_names is None:
 self.class_names = ["Negative Class", "Positive Class"]

-self.shap_vals_df = shap_to_df(self.clf, self.X, precalc_shap=precalc_shap, verbose=self.verbose)
+self.shap_vals_df = shap_to_df(self.clf, self.X, precalc_shap=precalc_shap, verbose=self.verbose, **shap_kwargs)

 self.fitted = True
 return self
@@ -123,7 +134,7 @@ def compute(self):
 self._check_if_fitted()
 return self.shap_vals_df

-def fit_compute(self, X, y, column_names=None, class_names=None, precalc_shap=None):
+def fit_compute(self, X, y, column_names=None, class_names=None, precalc_shap=None, **shap_kwargs):
 """
 Fits the plotter to the model and data by computing the shap values.

@@ -146,17 +157,18 @@ def fit_compute(self, X, y, column_names=None, class_names=None, precalc_shap=No
 precalc_shap (Optional, None or np.array):
 Precalculated shap values, If provided they don't need to be computed.

+**shap_kwargs:
+keyword arguments passed to
+[shap.Explainer](https://shap.readthedocs.io/en/latest/generated/shap.Explainer.html#shap.Explainer).
+It also enables `approximate` and `check_additivity` parameters, passed while calculating SHAP values.
+The `approximate=True` causes less accurate, but faster SHAP values calculation, while
+`check_additivity=False` disables the additivity check inside SHAP.
+
 Returns:
 (pd.DataFrame):
 SHAP Values for X.
 """
-self.fit(
-X,
-y,
-column_names=column_names,
-class_names=class_names,
-precalc_shap=precalc_shap,
-)
+self.fit(X, y, column_names=column_names, class_names=class_names, precalc_shap=precalc_shap, **shap_kwargs)
 return self.compute()

 def plot(
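The dependence plotter follows the same pattern: `fit`/`fit_compute` now accept `**shap_kwargs` and hand them to `shap_to_df`. A sketch, with the caveat that the class and import path (`DependencePlotter` from `probatus.interpret`) are taken from the probatus documentation and are an assumption here:

# Sketch: passing check_additivity through the dependence plotter (assumed class name).
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from probatus.interpret import DependencePlotter  # assumed export name

X, y = make_classification(n_samples=200, n_features=6, random_state=0)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(6)])
y = pd.Series(y)

clf = RandomForestClassifier(random_state=0).fit(X, y)

plotter = DependencePlotter(clf)
shap_vals_df = plotter.fit_compute(X, y, check_additivity=False)  # forwarded to shap_to_df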

probatus/sample_similarity/resemblance_model.py

Lines changed: 9 additions & 2 deletions
@@ -594,7 +594,7 @@ class is 'roc_auc'.

 self.plot_title = "SHAP summary plot"

-def fit(self, X1, X2, column_names=None, class_names=None):
+def fit(self, X1, X2, column_names=None, class_names=None, **shap_kwargs):
 """
 This function assigns to labels to each sample, 0 to first sample, 1 to the second.

@@ -619,13 +619,20 @@ def fit(self, X1, X2, column_names=None, class_names=None):
 List of class names assigned, in this case provided samples e.g. ['sample1', 'sample2']. If none, the
 default ['First Sample', 'Second Sample'] are used.

+**shap_kwargs:
+keyword arguments passed to
+[shap.Explainer](https://shap.readthedocs.io/en/latest/generated/shap.Explainer.html#shap.Explainer).
+It also enables `approximate` and `check_additivity` parameters, passed while calculating SHAP values.
+The `approximate=True` causes less accurate, but faster SHAP values calculation, while
+`check_additivity=False` disables the additivity check inside SHAP.
+
 Returns:
 (SHAPImportanceResemblance):
 Fitted object.
 """
 super().fit(X1=X1, X2=X2, column_names=column_names, class_names=class_names)

-self.shap_values_test = shap_calc(self.clf, self.X_test, verbose=self.verbose)
+self.shap_values_test = shap_calc(self.clf, self.X_test, verbose=self.verbose, **shap_kwargs)
 self.report = calculate_shap_importance(self.shap_values_test, self.column_names)
 return self

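Resemblance modelling gets the same treatment: `fit` forwards `**shap_kwargs` into the `shap_calc` call on the test split. A sketch, assuming the `SHAPImportanceResemblance` constructor takes the classifier as `clf` (per the probatus docs) and using synthetic samples:

# Sketch: disabling SHAP's additivity check when comparing two samples.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from probatus.sample_similarity import SHAPImportanceResemblance

rng = np.random.default_rng(0)
cols = [f"f{i}" for i in range(5)]
X1 = pd.DataFrame(rng.normal(size=(200, 5)), columns=cols)
X2 = pd.DataFrame(rng.normal(loc=0.5, size=(200, 5)), columns=cols)

resemblance = SHAPImportanceResemblance(clf=RandomForestClassifier(random_state=0))
resemblance.fit(X1, X2, check_additivity=False)  # forwarded to shap_calc

The SHAP-based importance report is then available as before, e.g. via the object's compute step.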

probatus/utils/shap_helpers.py

Lines changed: 5 additions & 1 deletion
@@ -31,6 +31,7 @@ def shap_calc(
 return_explainer=False,
 verbose=0,
 sample_size=100,
+check_additivity=True,
 **shap_kwargs,
 ):
 """
@@ -57,6 +58,9 @@ def shap_calc(
 - 51 - 100 - shows other warnings and prints
 - above 100 - presents all prints and all warnings (including SHAP warnings).

+check_additivity (boolean):
+if False SHAP will disable the additivity check.
+
 **shap_kwargs: kwargs of the shap.Explainer

 Returns:
@@ -80,7 +84,7 @@ def shap_calc(

 explainer = shap.Explainer(model, masker=mask, **shap_kwargs)
 # Calculate Shap values.
-shap_values = explainer.shap_values(X)
+shap_values = explainer.shap_values(X, check_additivity=check_additivity, approximate=approximate)

 if isinstance(shap_values, list) and len(shap_values) == 2:
 warnings.warn(
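At the lowest level, `shap_calc` now exposes `check_additivity` as a first-class argument and passes it, together with `approximate`, into `explainer.shap_values`. A sketch of calling the helper directly, importing it from the module path shown in this diff; the data setup is illustrative:

# Sketch: calling the updated shap_calc helper directly.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from probatus.utils.shap_helpers import shap_calc

X, y = make_classification(n_samples=150, n_features=5, random_state=0)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(5)])
clf = RandomForestClassifier(random_state=0).fit(X, y)

# check_additivity=True stays the default; False skips SHAP's additivity assertion,
# which helps when small numerical mismatches would otherwise raise an error.
shap_values = shap_calc(clf, X, check_additivity=False)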
