Added option to pass arguments to classifier via dict #698

Open · wants to merge 1 commit into base: master
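In substance, the single commit threads an optional `classifier_args` dict from `explain_instance` (and `data_labels`) through to the user's `classifier_fn`, so extra options can reach the prediction function at explanation time. Below is a minimal sketch of the intended image-side usage against this branch; the `temperature` option and the stand-in classifier are illustrative, not part of the PR:

```python
import numpy as np
from lime.lime_image import LimeImageExplainer

def classify(images, classifier_args=None):
    # With this patch, LIME forwards the dict given to explain_instance
    # as classifier_args=...; it is None when the caller omits it.
    opts = classifier_args or {}
    temperature = opts.get('temperature', 1.0)  # illustrative option
    # Stand-in scorer: mean brightness decides between two classes.
    score = images.reshape(len(images), -1).mean(axis=1)
    p1 = 1.0 / (1.0 + np.exp(-(score - 0.5) / temperature))
    return np.stack([1.0 - p1, p1], axis=1)

rng = np.random.RandomState(0)
image = rng.uniform(size=(64, 64, 3))  # toy RGB image in [0, 1]

explainer = LimeImageExplainer()
explanation = explainer.explain_instance(
    image, classify, labels=(1,), num_samples=64,
    classifier_args={'temperature': 0.7})
```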
38 changes: 17 additions & 21 deletions lime/lime_image.py
@@ -6,9 +6,10 @@

import numpy as np
import sklearn
import sklearn.preprocessing
from sklearn.utils import check_random_state
from skimage.color import gray2rgb
from tqdm.auto import tqdm
from tqdm import tqdm


from . import lime_base
@@ -27,8 +28,7 @@ def __init__(self, image, segments):
self.segments = segments
self.intercept = {}
self.local_exp = {}
self.local_pred = {}
self.score = {}
self.local_pred = None

def get_image_and_mask(self, label, positive_only=True, negative_only=False, hide_rest=False,
num_features=5, min_weight=0.):
@@ -134,7 +134,7 @@ def explain_instance(self, image, classifier_fn, labels=(1,),
distance_metric='cosine',
model_regressor=None,
random_seed=None,
progress_bar=True):
classifier_args=None):
"""Generates explanations for a prediction.

First, we generate neighborhood data by randomly perturbing features
@@ -149,14 +149,13 @@ def explain_instance(self, image, classifier_fn, labels=(1,),
takes a numpy array and outputs prediction probabilities. For
ScikitClassifiers , this is classifier.predict_proba.
labels: iterable with labels to be explained.
hide_color: If not None, will hide superpixels with this color.
Otherwise, use the mean pixel color of the image.
hide_color: TODO
top_labels: if not None, ignore labels and produce explanations for
the K labels with highest prediction probabilities, where K is
this parameter.
num_features: maximum number of features present in explanation
num_samples: size of the neighborhood to learn the linear model
batch_size: batch size for model predictions
batch_size: TODO
distance_metric: the distance metric to use for weights.
model_regressor: sklearn regressor to use in explanation. Defaults
to Ridge regression in LimeBase. Must have model_regressor.coef_
@@ -166,7 +165,6 @@ def explain_instance(self, image, classifier_fn, labels=(1,),
random_seed: integer used as random seed for the segmentation
algorithm. If None, a random integer, between 0 and 1000,
will be generated using the internal random number generator.
progress_bar: if True, show tqdm progress bar.

Returns:
An ImageExplanation object (see lime_image.py) with the corresponding
@@ -181,7 +179,10 @@ def explain_instance(self, image, classifier_fn, labels=(1,),
segmentation_fn = SegmentationAlgorithm('quickshift', kernel_size=4,
max_dist=200, ratio=0.2,
random_seed=random_seed)
segments = segmentation_fn(image)
try:
segments = segmentation_fn(image)
except ValueError as e:
raise e

fudged_image = image.copy()
if hide_color is None:
@@ -194,12 +195,10 @@ def explain_instance(self, image, classifier_fn, labels=(1,),
fudged_image[:] = hide_color

top = labels

data, labels = self.data_labels(image, fudged_image, segments,
classifier_fn, num_samples,
batch_size=batch_size,
progress_bar=progress_bar)

batch_size=batch_size, classifier_args=classifier_args)
distances = sklearn.metrics.pairwise_distances(
data,
data[0].reshape(1, -1),
@@ -214,8 +213,7 @@ def explain_instance(self, image, classifier_fn, labels=(1,),
for label in top:
(ret_exp.intercept[label],
ret_exp.local_exp[label],
ret_exp.score[label],
ret_exp.local_pred[label]) = self.base.explain_instance_with_data(
ret_exp.score, ret_exp.local_pred) = self.base.explain_instance_with_data(
data, labels, distances, label, num_features,
model_regressor=model_regressor,
feature_selection=self.feature_selection)
@@ -228,7 +226,7 @@ def data_labels(self,
classifier_fn,
num_samples,
batch_size=10,
progress_bar=True):
classifier_args=None):
"""Generates images and predictions in the neighborhood of this image.

Args:
@@ -240,7 +238,6 @@ def data_labels(self,
matrix of prediction probabilities
num_samples: size of the neighborhood to learn the linear model
batch_size: classifier_fn will be called on batches of this size.
progress_bar: if True, show tqdm progress bar.

Returns:
A tuple (data, labels), where:
@@ -253,8 +250,7 @@ def data_labels(self,
labels = []
data[0, :] = 1
imgs = []
rows = tqdm(data) if progress_bar else data
for row in rows:
for row in data:
temp = copy.deepcopy(image)
zeros = np.where(row == 0)[0]
mask = np.zeros(segments.shape).astype(bool)
@@ -263,10 +259,10 @@ def data_labels(self,
temp[mask] = fudged_image[mask]
imgs.append(temp)
if len(imgs) == batch_size:
preds = classifier_fn(np.array(imgs))
preds = classifier_fn(np.array(imgs), classifier_args=classifier_args)
labels.extend(preds)
imgs = []
if len(imgs) > 0:
preds = classifier_fn(np.array(imgs))
preds = classifier_fn(np.array(imgs), classifier_args=classifier_args)
labels.extend(preds)
return data, np.array(labels)
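A side effect of the new call sites above: `classifier_fn` is now always invoked as `classifier_fn(batch, classifier_args=...)`, even when no dict was supplied, so a prediction function written for the old one-argument signature would raise a `TypeError`. A thin adapter (a hypothetical helper, not part of the patch) would keep such classifiers working:

```python
def tolerate_classifier_args(classifier_fn):
    """Wrap a one-argument batch predictor so it accepts the
    classifier_args keyword that this patch always passes."""
    def wrapped(images, classifier_args=None):
        # Ignore the extra keyword for classifiers that don't use it.
        return classifier_fn(images)
    return wrapped

# e.g. explainer.explain_instance(image, tolerate_classifier_args(predict_proba))
```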
53 changes: 13 additions & 40 deletions lime/lime_tabular.py
@@ -12,8 +12,6 @@
import sklearn
import sklearn.preprocessing
from sklearn.utils import check_random_state
from pyDOE2 import lhs
from scipy.stats.distributions import norm

from lime.discretize import QuartileDiscretizer
from lime.discretize import DecileDiscretizer
@@ -139,7 +137,7 @@ def __init__(self,
discretizer='quartile',
sample_around_instance=False,
random_state=None,
training_data_stats=None):
training_data_stats=None,):
"""Init function.

Args:
@@ -208,11 +206,10 @@ def __init__(self,
if discretize_continuous and not sp.sparse.issparse(training_data):
# Set the discretizer if training data stats are provided
if self.training_data_stats:
discretizer = StatsDiscretizer(
training_data, self.categorical_features,
self.feature_names, labels=training_labels,
data_stats=self.training_data_stats,
random_state=self.random_state)
discretizer = StatsDiscretizer(training_data, self.categorical_features,
self.feature_names, labels=training_labels,
data_stats=self.training_data_stats,
random_state=self.random_state)

if discretizer == 'quartile':
self.discretizer = QuartileDiscretizer(
@@ -305,7 +302,7 @@ def explain_instance(self,
num_samples=5000,
distance_metric='euclidean',
model_regressor=None,
sampling_method='gaussian'):
classifier_args=None):
"""Generates explanations for a prediction.

First, we generate neighborhood data by randomly perturbing features
@@ -333,8 +330,6 @@ def explain_instance(self,
model_regressor: sklearn regressor to use in explanation. Defaults
to Ridge regression in LimeBase. Must have model_regressor.coef_
and 'sample_weight' as a parameter to model_regressor.fit()
sampling_method: Method to sample synthetic data. Defaults to Gaussian
sampling. Can also use Latin Hypercube Sampling.

Returns:
An Explanation object (see explanation.py) with the corresponding
@@ -343,7 +338,7 @@ def explain_instance(self,
if sp.sparse.issparse(data_row) and not sp.sparse.isspmatrix_csr(data_row):
# Preventative code: if sparse, convert to csr format if not in csr format already
data_row = data_row.tocsr()
data, inverse = self.__data_inverse(data_row, num_samples, sampling_method)
data, inverse = self.__data_inverse(data_row, num_samples)
if sp.sparse.issparse(data):
# Note in sparse case we don't subtract mean since data would become dense
scaled_data = data.multiply(self.scaler.scale_)
@@ -358,7 +353,7 @@ def explain_instance(self,
metric=distance_metric
).ravel()

yss = predict_fn(inverse)
yss = predict_fn(inverse, classifier_args)

# for classification, the model needs to provide a list of tuples - classes
# along with prediction probabilities
@@ -455,8 +450,7 @@ def explain_instance(self,
for label in labels:
(ret_exp.intercept[label],
ret_exp.local_exp[label],
ret_exp.score[label],
ret_exp.local_pred[label]) = self.base.explain_instance_with_data(
ret_exp.score, ret_exp.local_pred) = self.base.explain_instance_with_data(
scaled_data,
yss,
distances,
@@ -474,8 +468,7 @@ def __data_inverse(self,

def __data_inverse(self,
data_row,
num_samples,
sampling_method):
num_samples):
"""Generates a neighborhood around a prediction.

For numerical features, perturb them by sampling from a Normal(0,1) and
Expand All @@ -488,7 +481,6 @@ def __data_inverse(self,
Args:
data_row: 1d numpy array, corresponding to a row
num_samples: size of the neighborhood to learn the linear model
sampling_method: 'gaussian' or 'lhs'

Returns:
A tuple (data, inverse), where:
Expand Down Expand Up @@ -517,26 +509,9 @@ def __data_inverse(self,
instance_sample = data_row[:, non_zero_indexes]
scale = scale[non_zero_indexes]
mean = mean[non_zero_indexes]

if sampling_method == 'gaussian':
data = self.random_state.normal(0, 1, num_samples * num_cols
).reshape(num_samples, num_cols)
data = np.array(data)
elif sampling_method == 'lhs':
data = lhs(num_cols, samples=num_samples
).reshape(num_samples, num_cols)
means = np.zeros(num_cols)
stdvs = np.array([1]*num_cols)
for i in range(num_cols):
data[:, i] = norm(loc=means[i], scale=stdvs[i]).ppf(data[:, i])
data = np.array(data)
else:
warnings.warn('''Invalid input for sampling_method.
Defaulting to Gaussian sampling.''', UserWarning)
data = self.random_state.normal(0, 1, num_samples * num_cols
).reshape(num_samples, num_cols)
data = np.array(data)

data = self.random_state.normal(
0, 1, num_samples * num_cols).reshape(
num_samples, num_cols)
if self.sample_around_instance:
data = data * scale + instance_sample
else:
@@ -643,8 +618,6 @@ def __init__(self, training_data, mode="classification",
n_samples, n_timesteps * n_features)
self.n_timesteps = n_timesteps
self.n_features = n_features
if feature_names is None:
feature_names = ['feature%d' % i for i in range(n_features)]

# Update the feature names
feature_names = ['{}_t-{}'.format(n, n_timesteps - (i + 1))
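Note that the tabular path forwards the dict positionally, `yss = predict_fn(inverse, classifier_args)`, rather than by keyword as in lime_image.py, so a tabular prediction function needs a second parameter either way. A minimal sketch against this branch; the scikit-learn model and the `clip` option are illustrative:

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from lime.lime_tabular import LimeTabularExplainer

rng = np.random.RandomState(0)
X = rng.normal(size=(200, 4))
y = (X[:, 0] + X[:, 1] > 0).astype(int)
model = RandomForestClassifier(n_estimators=25, random_state=0).fit(X, y)

def predict(data, classifier_args=None):
    # Receives the dict as a second positional argument.
    opts = classifier_args or {}
    probs = model.predict_proba(data)
    if opts.get('clip'):  # illustrative option
        probs = np.clip(probs, 1e-6, 1.0 - 1e-6)
    return probs

explainer = LimeTabularExplainer(X, mode='classification')
exp = explainer.explain_instance(X[0], predict, num_samples=500,
                                 classifier_args={'clip': True})
```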