-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathgenerateARExplanations.py
190 lines (169 loc) · 7.7 KB
/
generateARExplanations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import os
import time
import numpy as np
import pandas as pd
import normalizedDistance
from pprint import pprint
from recourse.builder import RecourseBuilder
from recourse.builder import ActionSet
def genExp(model_trained, factual_sample, norm_type, dataset_obj):
start_time = time.time()
# SIMPLE HACK!!
# ActionSet() construction demands that all variables have a range to them. In
# the case of one-hot ordinal variables (e.g., x2_ord_0, x2_ord_1, x2_ord_2)),
# the first sub-category (i.e., x2_ord_0) will always have range(1,1), failing
# the requirements of ActionSet(). Therefore, we add a DUMMY ROW to the data-
# frame, which is a copy of another row (so not to throw off the range of other
# attributes), but setting a 0 value to all _ord_ variables. (might as well do
# this for all _cat_ variables as well).
tmp_df = dataset_obj.data_frame_kurz
sample_row = tmp_df.loc[0].to_dict()
for attr_name_kurz in dataset_obj.getOneHotAttributesNames('kurz'):
sample_row[attr_name_kurz] = 0
tmp_df = tmp_df.append(pd.Series(sample_row), ignore_index=True)
df = tmp_df
X = df.loc[:, df.columns != 'y']
# Enforce binary, categorical (including ordinal) variables only take on 2 values
custom_bounds = {attr_name_kurz: (0, 100, 'p') for attr_name_kurz in np.union1d(
dataset_obj.getOneHotAttributesNames('kurz'),
dataset_obj.getBinaryAttributeNames('kurz')
)}
action_set = ActionSet(X = X, custom_bounds = custom_bounds)
# action_set['x1'].mutable = False # x1 = 'Race'
# In the current implementation, we only supports any/none actionability
for attr_name_kurz in dataset_obj.getInputAttributeNames('kurz'):
attr_obj = dataset_obj.attributes_kurz[attr_name_kurz]
if attr_obj.actionability == 'none':
action_set[attr_name_kurz].mutable = False
elif attr_obj.actionability == 'any':
continue # do nothing
else:
raise ValueError(f'Actionable Recourse does not support actionability type {attr_obj.actionability}')
# Enforce search over integer-based grid for integer-based variables
for attr_name_kurz in np.union1d(
dataset_obj.getIntegerBasedAttributeNames('kurz'),
dataset_obj.getBinaryAttributeNames('kurz'),
):
action_set[attr_name_kurz].step_type = "absolute"
action_set[attr_name_kurz].step_size = 1
coefficients = model_trained.coef_[0]
intercept = model_trained.intercept_[0]
if norm_type == 'one_norm':
mip_cost_type = 'total'
elif norm_type == 'infty_norm':
mip_cost_type = 'max'
else:
raise ValueError(f'Actionable Recourse does not support norm_type {norm_type}')
factual_sample_values = list(factual_sample.values())
# p = .8
rb = RecourseBuilder(
optimizer="cplex",
coefficients=coefficients,
intercept=intercept, # - (np.log(p / (1. - p))),
action_set=action_set,
x=factual_sample_values,
mip_cost_type=mip_cost_type
)
output = rb.fit()
counterfactual_sample_values = np.add(factual_sample_values, output['actions'])
counterfactual_sample = dict(zip(factual_sample.keys(), counterfactual_sample_values))
# factual_sample['y'] = False
# counterfactual_sample['y'] = True
counterfactual_sample['y'] = not factual_sample['y']
counterfactual_plausible = True
# IMPORTANT: no need to check for integer-based / binary-based plausibility,
# because those were set above when we said step_type = absolute! Just round!
for attr_name_kurz in np.union1d(
dataset_obj.getOneHotAttributesNames('kurz'),
dataset_obj.getBinaryAttributeNames('kurz')
):
try:
assert np.isclose(
counterfactual_sample[attr_name_kurz],
np.round(counterfactual_sample[attr_name_kurz])
)
counterfactual_sample[attr_name_kurz] = np.round(counterfactual_sample[attr_name_kurz])
except:
distance = -1
counterfactual_plausible = False
# return counterfactual_sample, distance
# Perform plausibility-data-type check! Remember, all ordinal variables
# have already been converted to categorical variables. It is important now
# to check that 1 (and only 1) sub-category is activated in the resulting
# counterfactual sample.
already_considered = []
for attr_name_kurz in dataset_obj.getOneHotAttributesNames('kurz'):
if attr_name_kurz not in already_considered:
siblings_kurz = dataset_obj.getSiblingsFor(attr_name_kurz)
activations_for_category = [
counterfactual_sample[attr_name_kurz] for attr_name_kurz in siblings_kurz
]
sum_activations_for_category = np.sum(activations_for_category)
if 'cat' in dataset_obj.attributes_kurz[attr_name_kurz].attr_type:
if sum_activations_for_category == 1:
continue
else:
# print('not plausible, fixing..', end='')
# TODO: don't actually return early! Instead see the actual distance,
# fingers crossed that we can say that not only is their method giving
# counterfactuals are larger distances, but in a lot of cases they are
# not data-type plausible
# INSTEAD, do below:
# Convert to correct categorical/ordinal activations so we can
# compute the distance using already written function.
# Turns out we need to do nothing, because the distance between
# [0,1,0] and anything other than itself, (e.g., [1,1,0] or [1,0,1])
# is always 1 :)
# continue
distance = -1
counterfactual_plausible = False
# return counterfactual_sample, distance
elif 'ord' in dataset_obj.attributes_kurz[attr_name_kurz].attr_type:
# TODO: assert activations are in order...
# if not, repeat as above...
for idx in range(int(sum_activations_for_category)):
if activations_for_category[idx] != 1:
# Convert to correct categorical/ordinal activations so we can
# compute the distance using already written function.
# Find the max index of 1 in the array, and set everything before that to 1
# print('not plausible, fixing..', end='')
# max_index_of_1 = np.where(np.array(activations_for_category) == 1)[0][-1]
# for idx2 in range(max_index_of_1 + 1):
# counterfactual_sample[siblings_kurz[idx2]] = 1
# break
distance = -1
counterfactual_plausible = False
# return counterfactual_sample, distance
else:
raise Exception(f'{attr_name_kurz} must include either `cat` or `ord`.')
already_considered.extend(siblings_kurz)
# TODO: convert to correct categorical/ordinal activations so we can compute
# distance = output['cost'] # TODO: this must change / be verified!???
distance = normalizedDistance.getDistanceBetweenSamples(
factual_sample,
counterfactual_sample,
norm_type,
dataset_obj
)
# # TODO: post-feasibible needed???? NO
# # make plausible by rounding all non-numeric-real attributes to
# # nearest value in range
# for idx, elem in enumerate(es_instance):
# attr_name_kurz = dataset_obj.getInputAttributeNames('kurz')[idx]
# attr_obj = dataset_obj.attributes_kurz[attr_name_kurz]
# if attr_obj.attr_type != 'numeric-real':
# # round() might give a value that is NOT in plausible.
# # instead find the nearest plausible value
# es_instance[idx] = min(
# list(range(int(attr_obj.lower_bound), int(attr_obj.upper_bound) + 1)),
# key = lambda x : abs(x - es_instance[idx])
# )
end_time = time.time()
return {
'factual_sample': factual_sample,
'cfe_sample': counterfactual_sample,
'cfe_found': True, # TODO?
'cfe_plausible': counterfactual_plausible,
'cfe_distance': distance,
'cfe_time': end_time - start_time,
}