Merge branch 'data-stats-support-in-lime' of https://github.com/sokollip/lime into sokollip-data-stats-support-in-lime

marcotcr committed Mar 12, 2019
2 parents ec5df45 + b9c3991 commit 15b59c8
Showing 3 changed files with 161 additions and 15 deletions.
44 changes: 42 additions & 2 deletions lime/discretize.py
@@ -18,7 +18,8 @@ class BaseDiscretizer():

__metaclass__ = ABCMeta # abstract class

def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None,
data_stats=None):
"""Initializer
Args:
data: numpy 2d array
@@ -31,9 +32,12 @@ def __init__(self, data, categorical_features, feature_names, labels=None, rando
column x.
feature_names: list of names (strings) corresponding to the columns
in the training data.
data_stats: must have 'means', 'stds', 'mins' and 'maxs'; use this
    if you don't want these values to be computed from the data
"""
self.to_discretize = ([x for x in range(data.shape[1])
if x not in categorical_features])
if x not in categorical_features])
self.data_stats = data_stats
self.names = {}
self.lambdas = {}
self.means = {}
@@ -46,6 +50,13 @@ def __init__(self, data, categorical_features, feature_names, labels=None, rando
bins = self.bins(data, labels)
bins = [np.unique(x) for x in bins]

        # Read the stats from data_stats if it exists
if data_stats:
self.means = self.data_stats.get("means")
self.stds = self.data_stats.get("stds")
self.mins = self.data_stats.get("mins")
self.maxs = self.data_stats.get("maxs")

for feature, qts in zip(self.to_discretize, bins):
n_bins = qts.shape[0] # Actually number of borders (= #bins-1)
boundaries = np.min(data[:, feature]), np.max(data[:, feature])
@@ -60,6 +71,10 @@ def __init__(self, data, categorical_features, feature_names, labels=None, rando
self.lambdas[feature] = lambda x, qts=qts: np.searchsorted(qts, x)
discretized = self.lambdas[feature](data[:, feature])

            # If data stats are provided, there is no need to compute the details below
if data_stats:
continue

self.means[feature] = []
self.stds[feature] = []
for x in range(n_bins + 1):
@@ -117,6 +132,31 @@ def get_inverse(q):
return ret


class StatsDiscretizer(BaseDiscretizer):
"""
Class to be used to supply the data stats info when discretize_continuous is true
"""

def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None,
data_stats=None):

BaseDiscretizer.__init__(self, data, categorical_features,
feature_names, labels=labels,
random_state=random_state,
data_stats=data_stats)

def bins(self, data, labels):
bins_from_stats = self.data_stats.get("bins")
bins = []
if bins_from_stats is not None:
for feature in self.to_discretize:
bins_from_stats_feature = bins_from_stats.get(feature)
if bins_from_stats_feature is not None:
qts = np.array(bins_from_stats_feature)
bins.append(qts)
return bins


class QuartileDiscretizer(BaseDiscretizer):
def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):

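For orientation, here is a minimal sketch of the data_stats mapping that StatsDiscretizer consumes. The key names and per-feature dict shapes follow the diff above; the data, feature names, and sample values are hypothetical:

import numpy as np

from lime.discretize import StatsDiscretizer

# Two continuous features with one boundary each, i.e. two bins per feature.
# All per-feature entries are keyed by column index, matching the diff above.
data = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0], [4.0, 40.0]])
data_stats = {
    "bins": {0: [2.5], 1: [25.0]},              # bin boundaries per feature
    "means": {0: [1.5, 3.5], 1: [15.0, 35.0]},  # per-bin means
    "stds": {0: [0.5, 0.5], 1: [5.0, 5.0]},     # per-bin standard deviations
    "mins": {0: [1.0, 3.0], 1: [10.0, 30.0]},   # per-bin minimums
    "maxs": {0: [2.0, 4.0], 1: [20.0, 40.0]},   # per-bin maximums
}

# No categorical features; stats are read instead of being computed from data.
discretizer = StatsDiscretizer(data, [], ["f0", "f1"], data_stats=data_stats)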
60 changes: 49 additions & 11 deletions lime/lime_tabular.py
@@ -16,6 +16,7 @@
from lime.discretize import DecileDiscretizer
from lime.discretize import EntropyDiscretizer
from lime.discretize import BaseDiscretizer
from lime.discretize import StatsDiscretizer
from . import explanation
from . import lime_base

@@ -112,7 +113,8 @@ def __init__(self,
discretize_continuous=True,
discretizer='quartile',
sample_around_instance=False,
random_state=None):
random_state=None,
training_data_stats=None):
"""Init function.
Args:
@@ -153,11 +155,21 @@ def __init__(self,
random_state: an integer or numpy.RandomState that will be used to
generate random numbers. If None, the random state will be
initialized using the internal numpy seed.
training_data_stats: a dict object having the details of training data
    statistics. If None, training data information will be used; only matters
    if discretize_continuous is True. Must have the following keys:
    "means", "mins", "maxs", "stds", "feature_values",
    "feature_frequencies"
"""
self.random_state = check_random_state(random_state)
self.mode = mode
self.categorical_names = categorical_names or {}
self.sample_around_instance = sample_around_instance
self.training_data_stats = training_data_stats

        # Check and raise a proper error if stats are supplied in the non-discretized path
if self.training_data_stats:
self.validate_training_data_stats(self.training_data_stats)

if categorical_features is None:
categorical_features = []
@@ -169,6 +181,12 @@ def __init__(self,

self.discretizer = None
if discretize_continuous:
# Set the discretizer if training data stats are provided
if self.training_data_stats:
discretizer = StatsDiscretizer(training_data, self.categorical_features,
self.feature_names, labels=training_labels,
data_stats=self.training_data_stats)

if discretizer == 'quartile':
self.discretizer = QuartileDiscretizer(
training_data, self.categorical_features,
@@ -188,7 +206,10 @@
''' 'decile', 'entropy' or a''' +
''' BaseDiscretizer instance''')
self.categorical_features = list(range(training_data.shape[1]))
discretized_training_data = self.discretizer.discretize(

        # Get the discretized training data only when the stats are not provided
        if self.training_data_stats is None:
discretized_training_data = self.discretizer.discretize(
training_data)

if kernel_width is None:
@@ -203,21 +224,27 @@ def kernel(d, kernel_width):

self.feature_selection = feature_selection
self.base = lime_base.LimeBase(kernel_fn, verbose, random_state=self.random_state)
self.scaler = None
self.class_names = class_names

        # Though set here, the scaler has no role to play if training data stats are provided
self.scaler = None
self.scaler = sklearn.preprocessing.StandardScaler(with_mean=False)
self.scaler.fit(training_data)
self.feature_values = {}
self.feature_frequencies = {}

for feature in self.categorical_features:
if self.discretizer is not None:
column = discretized_training_data[:, feature]
else:
column = training_data[:, feature]
if training_data_stats is None:
if self.discretizer is not None:
column = discretized_training_data[:, feature]
else:
column = training_data[:, feature]

feature_count = collections.Counter(column)
values, frequencies = map(list, zip(*(sorted(feature_count.items()))))
feature_count = collections.Counter(column)
values, frequencies = map(list, zip(*(sorted(feature_count.items()))))
else:
values = training_data_stats["feature_values"][feature]
frequencies = training_data_stats["feature_frequencies"][feature]

self.feature_values[feature] = values
self.feature_frequencies[feature] = (np.array(frequencies) /
@@ -229,6 +256,17 @@ def kernel(d, kernel_width):
def convert_and_round(values):
return ['%.2f' % v for v in values]

@staticmethod
def validate_training_data_stats(training_data_stats):
"""
Method to validate the structure of training data stats
"""
stat_keys = list(training_data_stats.keys())
valid_stat_keys = ["means", "mins", "maxs", "stds", "feature_values", "feature_frequencies"]
missing_keys = list(set(valid_stat_keys) - set(stat_keys))
if len(missing_keys) > 0:
raise Exception("Missing keys in training_data_stats. Details:" % (missing_keys))

def explain_instance(self,
data_row,
predict_fn,
@@ -414,8 +452,8 @@ def __data_inverse(self,
categorical_features = range(data_row.shape[0])
if self.discretizer is None:
data = self.random_state.normal(
0, 1, num_samples * data_row.shape[0]).reshape(
num_samples, data_row.shape[0])
0, 1, num_samples * data_row.shape[0]).reshape(
num_samples, data_row.shape[0])
if self.sample_around_instance:
data = data * self.scaler.scale_ + data_row
else:
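Correspondingly, a minimal sketch of the new explainer path: when training_data_stats supplies every key named in the docstring (plus "bins", which the StatsDiscretizer reads), the training_data argument only needs the right shape, as the new test below demonstrates with np.zeros. Feature names and values here are hypothetical:

import numpy as np

from lime.lime_tabular import LimeTabularExplainer

training_data_stats = {
    "bins": {0: [2.5], 1: [25.0]},
    "means": {0: [1.5, 3.5], 1: [15.0, 35.0]},
    "stds": {0: [0.5, 0.5], 1: [5.0, 5.0]},
    "mins": {0: [1.0, 3.0], 1: [10.0, 30.0]},
    "maxs": {0: [2.0, 4.0], 1: [20.0, 40.0]},
    "feature_values": {0: [0, 1], 1: [0, 1]},       # discretized bin ids per feature
    "feature_frequencies": {0: [2, 2], 1: [2, 2]},  # counts for each bin id
}

# Placeholder training data: with stats supplied, only its shape is used.
explainer = LimeTabularExplainer(
    np.zeros((2, 2)),
    feature_names=["f0", "f1"],
    training_data_stats=training_data_stats)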
72 changes: 70 additions & 2 deletions lime/tests/test_lime_tabular.py
@@ -1,17 +1,19 @@
import unittest

import numpy as np
import sklearn # noqa
import collections
import sklearn # noqa
import sklearn.datasets
import sklearn.ensemble
import sklearn.linear_model # noqa
import sklearn.linear_model # noqa
from numpy.testing import assert_array_equal
from sklearn.datasets import load_iris, make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from lime.discretize import QuartileDiscretizer, DecileDiscretizer, EntropyDiscretizer


try:
from sklearn.model_selection import train_test_split
except ImportError:
@@ -577,6 +579,72 @@ def testFeatureValues(self):
assert_array_equal(explainer.feature_frequencies[1], np.array([.25, .25, .25, .25]))
assert_array_equal(explainer.feature_frequencies[2], np.array([.5, .5]))

def test_lime_explainer_with_data_stats(self):
np.random.seed(1)

rf = RandomForestClassifier(n_estimators=500)
rf.fit(self.train, self.labels_train)
i = np.random.randint(0, self.test.shape[0])

        # Generate stats using a quartile discretizer
        discretizer = QuartileDiscretizer(self.train, [], self.feature_names, self.target_names,
                                          random_state=20)

        d_means = discretizer.means
        d_stds = discretizer.stds
        d_mins = discretizer.mins
        d_maxs = discretizer.maxs
        d_bins = discretizer.bins(self.train, self.target_names)

        # Compute feature values and frequencies of all columns
        cat_features = np.arange(self.train.shape[1])
        discretized_training_data = discretizer.discretize(self.train)

feature_values = {}
feature_frequencies = {}
for feature in cat_features:
column = discretized_training_data[:, feature]
feature_count = collections.Counter(column)
values, frequencies = map(list, zip(*(feature_count.items())))
feature_values[feature] = values
feature_frequencies[feature] = frequencies

        # Convert bins from arrays to lists
        d_bins_revised = {}
        for index, feature_bins in enumerate(d_bins):
            d_bins_revised[index] = feature_bins.tolist()

        # Discretized stats
data_stats = {}
data_stats["means"] = d_means
data_stats["stds"] = d_stds
data_stats["maxs"] = d_maxs
data_stats["mins"] = d_mins
data_stats["bins"] = d_bins_revised
data_stats["feature_values"] = feature_values
data_stats["feature_frequencies"] = feature_frequencies

data = np.zeros((2, len(self.feature_names)))
explainer = LimeTabularExplainer(
data, feature_names=self.feature_names, random_state=10,
training_data_stats=data_stats, training_labels=self.target_names)

exp = explainer.explain_instance(self.test[i],
rf.predict_proba,
num_features=2,
model_regressor=LinearRegression())

self.assertIsNotNone(exp)
keys = [x[0] for x in exp.as_list()]
self.assertEqual(1,
sum([1 if 'petal width' in x else 0 for x in keys]),
"Petal Width is a major feature")
self.assertEqual(1,
sum([1 if 'petal length' in x else 0 for x in keys]),
"Petal Length is a major feature")


if __name__ == '__main__':
unittest.main()
