-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathmain.py
2630 lines (2132 loc) · 113 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import os
import time
import torch
import pickle
import inspect
import pathlib
import warnings
import argparse
import itertools
import subprocess
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from pprint import pprint
from matplotlib import pyplot as plt
from datetime import datetime
from attrdict import AttrDict
import GPy
import mmd
import utils
import loadSCM
import loadData
import loadModel
import gpHelper
import skHelper
import fairRecourse
from scatter import *
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process.kernels import WhiteKernel, RBF
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics.pairwise import linear_kernel, rbf_kernel, polynomial_kernel
from functools import partial
from _cvae.train import *
from debug import ipsh
import random
from random import seed
RANDOM_SEED = 54321
seed(RANDOM_SEED) # set the random seed so that the random permutations can be reproduced again
np.random.seed(RANDOM_SEED)
ACCEPTABLE_POINT_RECOURSE = {'m0_true', 'm1_alin', 'm1_akrr'}
ACCEPTABLE_DISTR_RECOURSE = {'m1_gaus', 'm1_cvae', 'm2_true', 'm2_gaus', 'm2_cvae', 'm2_cvae_ps'}
EXPERIMENTAL_SETUPS = [
('m0_true', '*'), \
('m1_alin', 'v'), \
('m1_akrr', '^'), \
('m1_gaus', 'D'), \
('m1_cvae', 'x'), \
('m2_true', 'o'), \
('m2_gaus', 's'), \
('m2_cvae', '+'), \
]
PROCESSING_SKLEARN = 'raw'
PROCESSING_GAUS = 'raw'
PROCESSING_CVAE = 'raw'
class Instance(object):
def __init__(self, instance_dict, instance_idx=None):
self.instance_idx = instance_idx
self.endogenous_nodes_dict = dict()
self.exogenous_nodes_dict = dict()
for key, value in instance_dict.items():
if 'x' in key:
self.endogenous_nodes_dict[key] = value
elif 'u' in key:
self.exogenous_nodes_dict[key] = value
else:
raise Exception(f'Node type not recognized.')
def dict(self, node_types = 'endogenous'):
if node_types == 'endogenous':
return self.endogenous_nodes_dict
elif node_types == 'exogenous':
return self.exogenous_nodes_dict
elif node_types == 'endogenous_and_exogenous':
return {**self.endogenous_nodes_dict, **self.exogenous_nodes_dict}
else:
raise Exception(f'Node type not recognized.')
# def array(self, node_types = 'endogenous'): #, nested = False
# return np.array(
# list( # TODO (BUG???) what happens to this order? are values always ordered correctly?
# self.dict(node_types).values()
# )
# ).reshape(1,-1)
def keys(self, node_types = 'endogenous'):
return self.dict(node_types).keys()
def values(self, node_types = 'endogenous'):
return self.dict(node_types).values()
def items(self, node_types = 'endogenous'):
return self.dict(node_types).items()
@utils.Memoize
def loadCausalModel(args, experiment_folder_name):
return loadSCM.loadSCM(args.scm_class, experiment_folder_name)
@utils.Memoize
def loadDataset(args, experiment_folder_name):
# unused: experiment_folder_name
if args.dataset_class == 'adult':
return loadData.loadDataset(args.dataset_class, return_one_hot = False, load_from_cache = False, index_offset = 1)
else:
return loadData.loadDataset(args.dataset_class, return_one_hot = True, load_from_cache = False, meta_param = args.scm_class)
@utils.Memoize
def loadClassifier(args, objs, experiment_folder_name):
if args.classifier_class in fairRecourse.FAIR_MODELS:
fair_nodes = getTrainableNodesForFairModel(args, objs)
# must have at least 1 endogenous node in the training set, otherwise we
# cannot identify an action set (interventions do not affect exogenous nodes)
if len([elem for elem in fair_nodes if 'x' in elem]) == 0:
raise Exception(f'[INFO] No intervenable set of nodes founds to train `{args.classifier_class}`. Exiting.')
else:
fair_nodes = None
return loadModel.loadModelForDataset(
args.classifier_class,
args.dataset_class,
args.scm_class,
args.num_train_samples,
fair_nodes,
args.fair_kernel_type,
experiment_folder_name
)
@utils.Memoize
def getTorchClassifier(args, objs):
if isinstance(objs.classifier_obj, LogisticRegression):
fixed_model_w = objs.classifier_obj.coef_
fixed_model_b = objs.classifier_obj.intercept_
fixed_model = lambda x: torch.sigmoid(
(
torch.nn.functional.linear(
x,
torch.from_numpy(fixed_model_w).float(),
) + float(fixed_model_b)
)
)
elif isinstance(objs.classifier_obj, MLPClassifier):
data_dim = len(objs.dataset_obj.getInputAttributeNames())
fixed_model_width = 10 # TODO make more dynamic later and move to separate function
assert objs.classifier_obj.hidden_layer_sizes == (fixed_model_width, fixed_model_width)
fixed_model = torch.nn.Sequential(
torch.nn.Linear(data_dim, fixed_model_width),
torch.nn.ReLU(),
torch.nn.Linear(fixed_model_width, fixed_model_width),
torch.nn.ReLU(),
torch.nn.Linear(fixed_model_width, 1),
torch.nn.Sigmoid()
)
fixed_model[0].weight = torch.nn.Parameter(torch.tensor(objs.classifier_obj.coefs_[0].astype('float32')).t(), requires_grad=False)
fixed_model[2].weight = torch.nn.Parameter(torch.tensor(objs.classifier_obj.coefs_[1].astype('float32')).t(), requires_grad=False)
fixed_model[4].weight = torch.nn.Parameter(torch.tensor(objs.classifier_obj.coefs_[2].astype('float32')).t(), requires_grad=False)
fixed_model[0].bias = torch.nn.Parameter(torch.tensor(objs.classifier_obj.intercepts_[0].astype('float32')), requires_grad=False)
fixed_model[2].bias = torch.nn.Parameter(torch.tensor(objs.classifier_obj.intercepts_[1].astype('float32')), requires_grad=False)
fixed_model[4].bias = torch.nn.Parameter(torch.tensor(objs.classifier_obj.intercepts_[2].astype('float32')), requires_grad=False)
else:
raise Exception(f'Converting {str(objs.classifier_obj.__class__)} to torch not supported.')
X_all = getOriginalDataFrame(objs, args.num_train_samples)
assert np.all(
np.isclose(
objs.classifier_obj.predict_proba(X_all[:25])[:,1],
fixed_model(torch.tensor(X_all[:25].to_numpy(), dtype=torch.float32)).flatten(),
atol = 1e-3,
)
), 'Torch classifier is not equivalent to the sklearn model.'
return fixed_model
def measureActionSetCost(args, objs, factual_instance_obj_or_ts, action_set, processing_type = 'raw', range_normalized = True):
# TODO (medpri): this function is called using both brute-force and grad-descent
# approach (where the former/latter uses Instance/dict)
if isinstance(factual_instance_obj_or_ts, Instance):
factual_instance = factual_instance_obj_or_ts.dict()
else:
factual_instance = factual_instance_obj_or_ts
X_all = processDataFrameOrInstance(args, objs, getOriginalDataFrame(objs, args.num_train_samples), processing_type)
ranges = dict(zip(
X_all.columns,
[np.max(X_all[col]) - np.min(X_all[col]) for col in X_all.columns],
))
# TODO (medpri): WHICH TO USE??? FOR REPRODUCABILITY USE ABOVE!
# (above uses ranges in train set, below over all the dataset)
# ranges = objs.dataset_obj.getVariableRanges()
if not range_normalized:
ranges = {key: 1 for key in ranges.keys()}
if \
np.all([isinstance(elem, (int, float)) for elem in factual_instance.values()]) and \
np.all([isinstance(elem, (int, float)) for elem in action_set.values()]):
deltas = [
(action_set[key] - factual_instance[key]) / ranges[key]
for key in action_set.keys()
]
return np.linalg.norm(deltas, args.norm_type)
elif \
np.all([isinstance(elem, torch.Tensor) for elem in factual_instance.values()]) and \
np.all([isinstance(elem, torch.Tensor) for elem in action_set.values()]):
deltas = torch.stack([
(action_set[key] - factual_instance[key]) / ranges[key]
for key in action_set.keys()
])
return torch.norm(deltas, p=args.norm_type)
else:
raise Exception(f'Mismatching or unsupport datatypes.')
def getColumnIndicesFromNames(args, objs, column_names):
# this is index in df, need to -1 to get index in x_counter / do_update,
# because the first column of df is 'y' (pseudonym: what if column ordering is
# changed? this code breaks abstraction.)
column_indices = []
for column_name in column_names:
tmp_1 = objs.dataset_obj.data_frame_kurz.columns.get_loc(column_name) - 1
tmp_2 = list(objs.scm_obj.getTopologicalOrdering()).index(column_name)
tmp_3 = list(objs.dataset_obj.getInputAttributeNames()).index(column_name)
assert tmp_1 == tmp_2 == tmp_3
column_indices.append(tmp_1)
return column_indices
def getIndexOfFactualInstanceInDataFrame(factual_instance_obj, data_frame):
# data_frame may include X and U, whereas factual_instance_obj.dict() only includes X
assert set(factual_instance_obj.dict().keys()).issubset(set(data_frame.columns))
matching_indicies = []
for enumeration_idx, (factual_instance_idx, row) in enumerate(data_frame.iterrows()):
if np.all([
factual_instance_obj.dict()[key] == row[key]
for key in factual_instance_obj.dict().keys()
]):
matching_indicies.append(enumeration_idx)
if len(matching_indicies) == 0:
raise Exception(f'Was not able to find instance in data frame.')
elif len(matching_indicies) > 1:
raise Exception(f'Multiple matching instances are found in data frame: {matching_indicies}')
else:
return matching_indicies[0]
def processTensorOrDictOfTensors(args, objs, obj, processing_type, column_names):
# To process a tensor (unlike a dict/dataframe), the getColumnIndicesFromNames
# returns indices by always assuming that the entire tensor is present
assert len(column_names) == len(objs.dataset_obj.getInputAttributeNames())
if processing_type == 'raw':
return obj
assert \
isinstance(obj, torch.Tensor) or \
(
isinstance(obj, dict) and \
np.all([
isinstance(value, torch.Tensor)
for value in list(obj.values())
])
), f'Datatype `{obj.__class__}` not supported for processing.'
iterate_over = column_names
obj = obj.clone()
for node in iterate_over:
if 'u' in node:
print(f'[WARNING] Skipping over processing of noise variable {node}.')
continue
# use objs.dataset_obj stats, not X_train (in case you use more samples later,
# e.g., validation set for cvae
tmp = objs.dataset_obj.data_frame_kurz.describe()[node]
node_min = tmp['min']
node_max = tmp['max']
node_mean = tmp['mean']
node_std = tmp['std']
node_idx = getColumnIndicesFromNames(args, objs, [node])
if processing_type == 'normalize':
obj[:, node_idx] = (obj[:, node_idx] - node_min) / (node_max - node_min)
elif processing_type == 'standardize':
obj[:, node_idx] = (obj[:, node_idx] - node_mean) / node_std
elif processing_type == 'mean_subtract':
obj[:, node_idx] = (obj[:, node_idx] - node_mean)
return obj
def deprocessTensorOrDictOfTensors(args, objs, obj, processing_type, column_names):
# To process a tensor (unlike a dict/dataframe), the getColumnIndicesFromNames
# returns indices by always assuming that the entire tensor is present
assert len(column_names) == len(objs.dataset_obj.getInputAttributeNames())
if processing_type == 'raw':
return obj
assert \
isinstance(obj, torch.Tensor) or \
(
isinstance(obj, dict) and \
np.all([
isinstance(value, torch.Tensor)
for value in list(obj.values())
])
), f'Datatype `{obj.__class__}` not supported for processing.'
iterate_over = column_names
obj = obj.clone()
for node in iterate_over:
if 'u' in node:
print(f'[WARNING] Skipping over processing of noise variable {node}.')
continue
# use objs.dataset_obj stats, not X_train (in case you use more samples later,
# e.g., validation set for cvae
tmp = objs.dataset_obj.data_frame_kurz.describe()[node]
node_min = tmp['min']
node_max = tmp['max']
node_mean = tmp['mean']
node_std = tmp['std']
node_idx = getColumnIndicesFromNames(args, objs, [node])
if processing_type == 'normalize':
obj[:, node_idx] = obj[:, node_idx] * (node_max - node_min) + node_min
elif processing_type == 'standardize':
obj[:, node_idx] = obj[:, node_idx] * node_std + node_mean
elif processing_type == 'mean_subtract':
obj[:, node_idx] = obj[:, node_idx] + node_mean
return obj
def processDataFrameOrInstance(args, objs, obj, processing_type):
# TODO (cat): add support for categorical data
if processing_type == 'raw':
return obj
if isinstance(obj, Instance):
raise NotImplementedError
# iterate_over = obj.keys()
elif isinstance(obj, pd.DataFrame):
iterate_over = obj.columns
else:
raise Exception(f'Datatype `{obj.__class__}` not supported for processing.')
obj = obj.copy() # so as not to change the underlying object
for node in iterate_over:
if 'u' in node:
print(f'[WARNING] Skipping over processing of noise variable {node}.')
continue
# use objs.dataset_obj stats, not X_train (in case you use more samples later,
# e.g., validation set for cvae
tmp = objs.dataset_obj.data_frame_kurz.describe()[node]
node_min = tmp['min']
node_max = tmp['max']
node_mean = tmp['mean']
node_std = tmp['std']
if processing_type == 'normalize':
obj[node] = (obj[node] - node_min) / (node_max - node_min)
elif processing_type == 'standardize':
obj[node] = (obj[node] - node_mean) / node_std
elif processing_type == 'mean_subtract':
obj[node] = (obj[node] - node_mean)
return obj
def deprocessDataFrameOrInstance(args, objs, obj, processing_type):
# TODO (cat): add support for categorical data
if processing_type == 'raw':
return obj
if isinstance(obj, Instance):
raise NotImplementedError
# iterate_over = obj.keys()
elif isinstance(obj, pd.DataFrame):
iterate_over = obj.columns
else:
raise Exception(f'Datatype `{obj.__class__}` not supported for processing.')
obj = obj.copy() # so as not to change the underlying object
for node in iterate_over:
if 'u' in node:
print(f'[WARNING] Skipping over processing of noise variable {node}.')
continue
# use objs.dataset_obj stats, not X_train (in case you use more samples later,
# e.g., validation set for cvae
tmp = objs.dataset_obj.data_frame_kurz.describe()[node]
node_min = tmp['min']
node_max = tmp['max']
node_mean = tmp['mean']
node_std = tmp['std']
if processing_type == 'normalize':
obj[node] = obj[node] * (node_max - node_min) + node_min
elif processing_type == 'standardize':
obj[node] = obj[node] * node_std + node_mean
elif processing_type == 'mean_subtract':
obj[node] = obj[node] + node_mean
return obj
@utils.Memoize
def getOriginalDataFrame(objs, num_samples, with_meta = False, with_label = False, balanced = True, data_split = 'train_and_test'):
return objs.dataset_obj.getOriginalDataFrame(
num_samples = num_samples,
with_meta = with_meta,
with_label = with_label,
balanced = balanced,
data_split = data_split
)
def getNearestObservableInstance(args, objs, factual_instance_obj):
# TODO (lowpri): if we end up using this, it is important to keep in mind the processing
# that is use when calling a specific recourse type because we would have to then
# perhaps find the MO instance in the original data, but then initialize the action
# set using the processed value...
X_all = getOriginalDataFrame(objs, args.num_train_samples)
# compute distances between factual instance and all instances in X_all
tmp = np.array(list(factual_instance_obj.dict().values())).reshape(1,-1)[:,None] - X_all.to_numpy()
tmp = tmp.squeeze()
min_cost = np.infty
min_observable_dict = None
print(f'\t\t[INFO] Searching for minimum observable instance...', end = '')
for observable_idx in range(tmp.shape[0]):
# CLOSEST INSTANCE ON THE OTHER SIDE!!
observable_distance_np = tmp[observable_idx,:]
observable_instance_obj = Instance(X_all.iloc[observable_idx].T.to_dict())
if \
getPrediction(args, objs, factual_instance_obj) != \
getPrediction(args, objs, observable_instance_obj):
observable_distance = np.linalg.norm(observable_distance_np)
if observable_distance < min_cost:
min_cost = observable_distance
min_observable_idx = observable_idx
min_observable_dict = dict(zip(
objs.scm_obj.getTopologicalOrdering(),
X_all.iloc[min_observable_idx],
))
print(f'found at index #{min_observable_idx}. Initializing `action_set_ts` using these values.')
return min_observable_dict
def getNoiseStringForNode(node):
assert node[0] == 'x'
return 'u' + node[1:]
def prettyPrintDict(my_dict):
# use this for grad descent logs (convert tensor accordingly)
my_dict = my_dict.copy()
for key, value in my_dict.items():
my_dict[key] = np.around(value, 3)
return my_dict
def didFlip(args, objs, factual_instance_obj, counter_instance_obj):
return \
getPrediction(args, objs, factual_instance_obj) != \
getPrediction(args, objs, counter_instance_obj)
def getConditionalString(node, parents):
return f'p({node} | {", ".join(parents)})'
@utils.Memoize
def trainRidge(args, objs, node, parents):
assert len(parents) > 0, 'parents set cannot be empty.'
print(f'\t[INFO] Fitting {getConditionalString(node, parents)} using Ridge on {args.num_train_samples} samples; this may be very expensive, memoizing afterwards.')
X_all = processDataFrameOrInstance(args, objs, getOriginalDataFrame(objs, args.num_train_samples), PROCESSING_SKLEARN)
param_grid = {'alpha': np.logspace(-2, 1, 10)}
model = GridSearchCV(Ridge(), param_grid=param_grid)
model.fit(X_all[parents], X_all[[node]])
return model
@utils.Memoize
def trainKernelRidge(args, objs, node, parents):
assert len(parents) > 0, 'parents set cannot be empty.'
print(f'\t[INFO] Fitting {getConditionalString(node, parents)} using KernelRidge on {args.num_train_samples} samples; this may be very expensive, memoizing afterwards.')
X_all = processDataFrameOrInstance(args, objs, getOriginalDataFrame(objs, args.num_train_samples), PROCESSING_SKLEARN)
param_grid = {
'alpha': np.logspace(-2, 1, 5),
'kernel': [
RBF(lengthscale)
for lengthscale in np.logspace(-2, 1, 5)
]
}
model = GridSearchCV(KernelRidge(), param_grid=param_grid)
model.fit(X_all[parents], X_all[[node]])
# pseudonym: i am not proud of this, but for some reason sklearn includes the
# X_fit_ covariates but not labels (this is needed later if we want to
# avoid using.predict() and call from krr manually)
model.best_estimator_.Y_fit_ = X_all[[node]].to_numpy()
return model
@utils.Memoize
def trainCVAE(args, objs, node, parents):
assert len(parents) > 0, 'parents set cannot be empty.'
print(f'\t[INFO] Fitting {getConditionalString(node, parents)} using CVAE on {args.num_train_samples * 4} samples; this may be very expensive, memoizing afterwards.')
X_all = processDataFrameOrInstance(args, objs, getOriginalDataFrame(objs, args.num_train_samples * 4 + args.num_validation_samples), PROCESSING_CVAE)
attr_obj = objs.dataset_obj.attributes_kurz[node]
node_train = X_all[[node]].iloc[:args.num_train_samples * 4]
parents_train = X_all[parents].iloc[:args.num_train_samples * 4]
node_validation = X_all[[node]].iloc[args.num_train_samples * 4:]
parents_validation = X_all[parents].iloc[args.num_train_samples * 4:]
if args.scm_class == 'sanity-3-lin':
if node == 'x2':
sweep_lambda_kld = [0.01]
sweep_encoder_layer_sizes = [[1, 32, 32, 32]]
sweep_decoder_layer_sizes = [[5, 5, 1]]
sweep_latent_size = [1]
elif node == 'x3':
sweep_lambda_kld = [0.01]
sweep_encoder_layer_sizes = [[1, 32, 32, 32]]
sweep_decoder_layer_sizes = [[32, 32, 32, 1]]
sweep_latent_size = [1]
elif args.scm_class == 'sanity-3-anm':
if node == 'x2':
sweep_lambda_kld = [0.01]
sweep_encoder_layer_sizes = [[1, 32, 32]]
sweep_decoder_layer_sizes = [[32, 32, 1]]
sweep_latent_size = [5]
elif node == 'x3':
sweep_lambda_kld = [0.01]
sweep_encoder_layer_sizes = [[1, 32, 32, 32]]
sweep_decoder_layer_sizes = [[32, 32, 1]]
sweep_latent_size = [1]
elif args.scm_class == 'sanity-3-gen':
if node == 'x2':
sweep_lambda_kld = [0.5]
sweep_encoder_layer_sizes = [[1, 32, 32, 32]]
sweep_decoder_layer_sizes = [[32, 32, 1]]
sweep_latent_size = [3]
elif node == 'x3':
sweep_lambda_kld = [0.5]
sweep_encoder_layer_sizes = [[1, 32, 32, 32]]
sweep_decoder_layer_sizes = [[32, 32, 1]]
sweep_latent_size = [3]
else:
io_size = 1 # b/c X_all[[node]] is always 1 dimensional for non-cat/ord variables
if attr_obj.attr_type in {'categorical', 'ordinal'}:
# one-hot --> non-hot (categorical CVAE can guarantee conditional p(node|parents) returns categorical value)
# IMPORTANT: we DO NOT convert cat/ord parents to one-hot.
# IMPORTANT: because all categories may not be present in the dataframe,
# we call pd.get_dummies on an already converted dataframe with
# with prespecfied categories
node_train = utils.convertToOneHotWithPrespecifiedCategories(node_train, node, attr_obj.lower_bound, attr_obj.upper_bound)
node_validation = utils.convertToOneHotWithPrespecifiedCategories(node_validation, node, attr_obj.lower_bound, attr_obj.upper_bound)
io_size = len(node_train.columns)
assert \
len(node_train.columns) == len(node_validation.columns), \
'Training data is not representative enough; missing some categories'
sweep_lambda_kld = [5, 1, 0.5, 0.1, 0.05, 0.01, 0.005]
sweep_encoder_layer_sizes = [
[io_size, 2, 2],
[io_size, 3, 3],
[io_size, 5, 5],
[io_size, 32, 32, 32],
]
sweep_decoder_layer_sizes = [
[2, io_size],
[2, 2, io_size],
[3, 3, io_size],
[5, 5, io_size],
[32, 32, 32, io_size],
]
sweep_latent_size = [1,3,5]
trained_models = {}
all_hyperparam_setups = list(itertools.product(
sweep_lambda_kld,
sweep_encoder_layer_sizes,
sweep_decoder_layer_sizes,
sweep_latent_size,
))
if args.scm_class not in {'sanity-3-lin', 'sanity-3-anm', 'sanity-3-gen'}:
# # TODO (remove): For testing purposes only
all_hyperparam_setups = random.sample(all_hyperparam_setups, 10)
for idx, hyperparams in enumerate(all_hyperparam_setups):
print(f'\n\t[INFO] Training hyperparams setup #{idx+1} / {len(all_hyperparam_setups)}: {str(hyperparams)}')
try:
trained_cvae, recon_node_train, recon_node_validation = train_cvae(AttrDict({
'name': f'{getConditionalString(node, parents)}',
'attr_type': attr_obj.attr_type,
'node_train': node_train,
'parents_train': parents_train,
'node_validation': node_validation,
'parents_validation': parents_validation,
'seed': 0,
'epochs': 100,
'batch_size': 128,
'learning_rate': 0.05,
'lambda_kld': hyperparams[0],
'encoder_layer_sizes': hyperparams[1],
'decoder_layer_sizes': hyperparams[2],
'latent_size': hyperparams[3],
'conditional': True,
'debug_folder': experiment_folder_name + f'/cvae_hyperparams_setup_{idx}_of_{len(all_hyperparam_setups)}',
}))
except Exception as e:
print(e)
continue
if attr_obj.attr_type in {'categorical', 'ordinal'}:
# non-hot --> one-hot
recon_node_train = torch.argmax(recon_node_train, axis=1, keepdims=True) + 1 # categoricals start at index 1
recon_node_validation = torch.argmax(recon_node_validation, axis=1, keepdims=True) + 1 # categoricals start at index 1
# # TODO (lowpri): remove after models.py is corrected
# return trained_cvae
# run mmd to verify whether training is good or not (ON VALIDATION SET)
X_val = X_all[args.num_train_samples * 4:].copy()
# POTENTIAL BUG? reset index here so that we can populate the `node` column
# with reconstructed values from trained_cvae that lack indexing
X_val = X_val.reset_index(drop = True)
X_true = X_val[parents + [node]]
X_pred_posterior = X_true.copy()
X_pred_posterior[node] = pd.DataFrame(recon_node_validation.numpy(), columns=[node])
# pseudonym: this is so bad code.
not_imp_factual_instance = dict.fromkeys(objs.scm_obj.getTopologicalOrdering(), -1)
not_imp_factual_df = pd.DataFrame(dict(zip(
objs.dataset_obj.getInputAttributeNames(),
[X_true.shape[0] * [not_imp_factual_instance[node]] for node in objs.dataset_obj.getInputAttributeNames()],
)))
not_imp_samples_df = X_true.copy()
X_pred_prior = sampleCVAE(args, objs, not_imp_factual_instance, not_imp_factual_df, not_imp_samples_df, node, parents, 'm2_cvae', trained_cvae = trained_cvae)
X_pred = X_pred_prior
my_statistic, statistics, sigma_median = mmd.mmd_with_median_heuristic(X_true.to_numpy(), X_pred.to_numpy())
print(f'\t\t[INFO] test-statistic = {my_statistic} using median of {sigma_median} as bandwith')
trained_models[f'setup_{idx:03d}'] = {}
trained_models[f'setup_{idx:03d}']['hyperparams'] = hyperparams
trained_models[f'setup_{idx:03d}']['trained_cvae'] = trained_cvae
trained_models[f'setup_{idx:03d}']['test-statistic'] = my_statistic
index_with_lowest_test_statistics = min(trained_models.keys(), key=lambda k: abs(trained_models[k]['test-statistic'] - 0))
# index_with_lowest_test_statistics = min(trained_models.keys(), key=lambda k: trained_models[k]['test-statistic'])
model_with_lowest_test_statistics = trained_models[index_with_lowest_test_statistics]['trained_cvae']
# save all results
tmp_file_name = f'{experiment_folder_name}/_cvae_params_{getConditionalString(node, parents)}.txt'
pprint(trained_models[index_with_lowest_test_statistics]['hyperparams'], open(tmp_file_name, 'w'))
pprint(trained_models, open(tmp_file_name, 'a'))
return model_with_lowest_test_statistics
@utils.Memoize
def trainGP(args, objs, node, parents):
assert len(parents) > 0, 'parents set cannot be empty.'
print(f'\t[INFO] Fitting {getConditionalString(node, parents)} using GP on {args.num_train_samples} samples; this may be very expensive, memoizing afterwards.')
X_all = processDataFrameOrInstance(args, objs, getOriginalDataFrame(objs, args.num_train_samples), PROCESSING_GAUS)
kernel = GPy.kern.RBF(input_dim=len(parents), ARD=True)
# IMPORTANT: do NOT use DataFrames, use Numpy arrays; GPy doesn't like DF.
# https://github.com/SheffieldML/GPy/issues/781#issuecomment-532738155
model = GPy.models.GPRegression(
X_all[parents].to_numpy(),
X_all[[node]].to_numpy(),
kernel,
)
model.optimize_restarts(parallel=True, num_restarts=5, verbose=False)
return kernel, X_all, model
def _getAbductionNoise(args, objs, node, parents, factual_instance_obj_or_ts, structural_equation):
# TODO (medpri): this function is called using both brute-force and grad-descent
# approach (where the former/latter uses Instance/dict)
if isinstance(factual_instance_obj_or_ts, Instance):
factual_instance = factual_instance_obj_or_ts.dict()
else:
factual_instance = factual_instance_obj_or_ts
# only applies for ANM models
return factual_instance[node] - structural_equation(
0,
*[factual_instance[parent] for parent in parents],
)
def sampleTrue(args, objs, factual_instance_obj, factual_df, samples_df, node, parents, recourse_type):
# Step 1. [abduction]: compute noise or load from dataset using factual_instance_obj
# Step 2. [action]: (skip) this step is implicitly performed in the populated samples_df columns
# Step 3. [prediction]: run through structural equation using noise and parents from samples_df
structural_equation = objs.scm_obj.structural_equations_np[node]
if recourse_type == 'm0_true':
noise_pred = _getAbductionNoise(args, objs, node, parents, factual_instance_obj, structural_equation)
try: # may fail if noise variables are not present in the data (e.g., for real-world adult, we have no noise variables)
noise_true = factual_instance_obj.dict('endogenous_and_exogenous')[f'u{node[1:]}']
if args.scm_class != 'sanity-3-gen':
assert np.abs(noise_pred - noise_true) < 1e-5, 'Noise {pred, true} expected to be similar, but not.'
except:
pass
noise = noise_pred
samples_df[node] = structural_equation(
np.array(noise), # may be scalar, which will be case as pd.series when being summed.
*[samples_df[parent] for parent in parents],
)
elif recourse_type == 'm2_true':
samples_df[node] = structural_equation(
np.array(objs.scm_obj.noises_distributions[getNoiseStringForNode(node)].sample(samples_df.shape[0])),
*[samples_df[parent] for parent in parents],
)
return samples_df
def sampleRidgeKernelRidge(args, objs, factual_instance_obj, factual_df, samples_df, node, parents, recourse_type):
samples_df = processDataFrameOrInstance(args, objs, samples_df.copy(), PROCESSING_SKLEARN)
factual_instance_obj = processDataFrameOrInstance(args, objs, factual_instance_obj, PROCESSING_SKLEARN)
# Step 1. [abduction]: compute noise or load from dataset using factual_instance_obj
# Step 2. [action]: (skip) this step is implicitly performed in the populated samples_df columns
# Step 3. [prediction]: run through structural equation using noise and parents from samples_df
if recourse_type == 'm1_alin':
trained_model = trainRidge(args, objs, node, parents)
elif recourse_type == 'm1_akrr':
trained_model = trainKernelRidge(args, objs, node, parents)
else:
raise Exception(f'{recourse_type} not recognized.')
structural_equation = lambda noise, *parents_values: trained_model.predict([[*parents_values]])[0][0] + noise
for row_idx, row in samples_df.iterrows():
noise = _getAbductionNoise(args, objs, node, parents, factual_instance_obj, structural_equation)
samples_df.loc[row_idx, node] = structural_equation(
noise,
*samples_df.loc[row_idx, parents].to_numpy(),
)
samples_df = deprocessDataFrameOrInstance(args, objs, samples_df, PROCESSING_SKLEARN)
return samples_df
def sampleCVAE(args, objs, factual_instance_obj, factual_df, samples_df, node, parents, recourse_type, trained_cvae = None):
samples_df = processDataFrameOrInstance(args, objs, samples_df.copy(), PROCESSING_CVAE)
factual_instance_obj = processDataFrameOrInstance(args, objs, factual_instance_obj, PROCESSING_CVAE)
if trained_cvae is None: # pseudonym: UGLY CODE
trained_cvae = trainCVAE(args, objs, node, parents)
if recourse_type == 'm1_cvae':
sample_from = 'posterior'
elif recourse_type == 'm2_cvae':
sample_from = 'prior'
elif recourse_type == 'm2_cvae_ps':
sample_from = 'reweighted_prior'
x_factual = factual_df[[node]]
pa_factual = factual_df[parents]
pa_counter = samples_df[parents]
attr_obj = objs.dataset_obj.attributes_kurz[node]
if attr_obj.attr_type in {'categorical', 'ordinal'}:
# non-hot --> one-hot
x_factual = utils.convertToOneHotWithPrespecifiedCategories(x_factual, node, attr_obj.lower_bound, attr_obj.upper_bound)
new_samples = trained_cvae.reconstruct(
x_factual=x_factual,
pa_factual=pa_factual,
pa_counter=pa_counter,
sample_from=sample_from,
)
if attr_obj.attr_type in {'categorical', 'ordinal'}:
# one-hot --> non-hot
new_samples = pd.DataFrame(new_samples.idxmax(axis=1) + 1)
new_samples = new_samples.rename(columns={0: node}) # bad code pseudonym, this violates abstraction!
samples_df = samples_df.reset_index(drop=True)
samples_df[node] = new_samples.astype('float64')
samples_df = deprocessDataFrameOrInstance(args, objs, samples_df, PROCESSING_CVAE)
return samples_df
def sampleGP(args, objs, factual_instance_obj, factual_df, samples_df, node, parents, recourse_type):
samples_df = processDataFrameOrInstance(args, objs, samples_df.copy(), PROCESSING_GAUS)
factual_instance_obj = processDataFrameOrInstance(args, objs, factual_instance_obj, PROCESSING_GAUS)
kernel, X_all, model = trainGP(args, objs, node, parents)
X_parents = torch.tensor(samples_df[parents].to_numpy())
if recourse_type == 'm1_gaus': # counterfactual distribution for node
# IMPORTANT: Find index of factual instance in dataframe used for training GP
# (earlier, the factual instance was appended as the last instance)
tmp_idx = getIndexOfFactualInstanceInDataFrame(
factual_instance_obj,
processDataFrameOrInstance(args, objs, getOriginalDataFrame(objs, args.num_train_samples), PROCESSING_GAUS),
) # TODO (lowpri): can probably rewrite to just evaluate the posterior again given the same result.. (without needing to look through the dataset)
new_samples = gpHelper.sample_from_GP_model(model, X_parents, 'cf', tmp_idx)
elif recourse_type == 'm2_gaus': # interventional distribution for node
new_samples = gpHelper.sample_from_GP_model(model, X_parents, 'iv')
samples_df[node] = new_samples.numpy()
samples_df = deprocessDataFrameOrInstance(args, objs, samples_df, PROCESSING_GAUS)
return samples_df
def _getCounterfactualTemplate(args, objs, factual_instance_obj_or_ts, action_set, recourse_type):
# TODO (medpri): this function is called using both brute-force and grad-descent
# approach (where the former/latter uses Instance/dict)
if isinstance(factual_instance_obj_or_ts, Instance):
factual_instance = factual_instance_obj_or_ts.dict()
else:
factual_instance = factual_instance_obj_or_ts
counterfactual_template = dict.fromkeys(
objs.dataset_obj.getInputAttributeNames(),
np.NaN,
)
# get intervention and conditioning sets
intervention_set = set(action_set.keys())
# intersection_of_non_descendents_of_intervened_upon_variables
conditioning_set = set.intersection(*[
objs.scm_obj.getNonDescendentsForNode(node)
for node in intervention_set
])
# assert there is no intersection
assert set.intersection(intervention_set, conditioning_set) == set()
# set values in intervention and conditioning sets
for node in conditioning_set:
counterfactual_template[node] = factual_instance[node]
for node in intervention_set:
counterfactual_template[node] = action_set[node]
return counterfactual_template
def _samplingInnerLoop(args, objs, factual_instance_obj, action_set, recourse_type, num_samples):
counterfactual_template = _getCounterfactualTemplate(args, objs, factual_instance_obj, action_set, recourse_type)
factual_df = pd.DataFrame(dict(zip(
objs.dataset_obj.getInputAttributeNames(),
[num_samples * [factual_instance_obj.dict()[node]] for node in objs.dataset_obj.getInputAttributeNames()],
)))
# this dataframe has populated columns set to intervention or conditioning values
# and has NaN columns that will be set accordingly.
samples_df = pd.DataFrame(dict(zip(
objs.dataset_obj.getInputAttributeNames(),
[num_samples * [counterfactual_template[node]] for node in objs.dataset_obj.getInputAttributeNames()],
)))
# Simply traverse the graph in order, and populate nodes as we go!
# IMPORTANT: DO NOT use SET(topo ordering); it sometimes changes ordering!
for node in objs.scm_obj.getTopologicalOrdering():
# set variable if value not yet set through intervention or conditioning
if samples_df[node].isnull().values.any():
parents = objs.scm_obj.getParentsForNode(node)
# root nodes MUST always be set through intervention or conditioning
assert len(parents) > 0
# Confirm parents columns are present/have assigned values in samples_df
assert not samples_df.loc[:,list(parents)].isnull().values.any()
if args.debug_flag:
print(f'Sampling `{recourse_type}` from {getConditionalString(node, parents)}')
if recourse_type in {'m0_true', 'm2_true'}:
sampling_handle = sampleTrue
elif recourse_type in {'m1_alin', 'm1_akrr'}:
sampling_handle = sampleRidgeKernelRidge
elif recourse_type in {'m1_gaus', 'm2_gaus'}:
sampling_handle = sampleGP
elif recourse_type in {'m1_cvae', 'm2_cvae', 'm2_cvae_ps'}:
sampling_handle = sampleCVAE
else:
raise Exception(f'{recourse_type} not recognized.')
samples_df = sampling_handle(
args,
objs,
factual_instance_obj,
factual_df,
samples_df,
node,
parents,
recourse_type,
)
assert \
np.all(list(samples_df.columns) == objs.dataset_obj.getInputAttributeNames()), \
'Ordering of column names in samples_df has change unexpectedly'
# samples_df = samples_df[objs.dataset_obj.getInputAttributeNames()]
return samples_df
def _samplingInnerLoopTensor(args, objs, factual_instance_obj, factual_instance_ts, action_set_ts, recourse_type):
counterfactual_template_ts = _getCounterfactualTemplate(args, objs, factual_instance_ts, action_set_ts, recourse_type)
if recourse_type in ACCEPTABLE_POINT_RECOURSE:
num_samples = 1
if recourse_type in ACCEPTABLE_DISTR_RECOURSE:
num_samples = args.num_mc_samples
# Initialize factual_ts, samples_ts
factual_ts = torch.zeros((num_samples, len(objs.dataset_obj.getInputAttributeNames())))
samples_ts = torch.zeros((num_samples, len(objs.dataset_obj.getInputAttributeNames())))
for node in objs.scm_obj.getTopologicalOrdering():
factual_ts[:, getColumnIndicesFromNames(args, objs, [node])] = factual_instance_ts[node] + 0 # + 0 not needed because not trainable but leaving in..
# +0 important, specifically for tensor based elements, so we don't copy
# an existing object in the computational graph, but we create a new node
samples_ts[:, getColumnIndicesFromNames(args, objs, [node])] = counterfactual_template_ts[node] + 0
# Simply traverse the graph in order, and populate nodes as we go!
# IMPORTANT: DO NOT use SET(topo ordering); it sometimes changes ordering!
for node in objs.scm_obj.getTopologicalOrdering():
# set variable if value not yet set through intervention or conditioning
if torch.any(torch.isnan(samples_ts[:, getColumnIndicesFromNames(args, objs, [node])])):
parents = objs.scm_obj.getParentsForNode(node)
# root nodes MUST always be set through intervention or conditioning
assert len(parents) > 0
# Confirm parents columns are present/have assigned values in samples_ts
assert not torch.any(torch.isnan(samples_ts[:, getColumnIndicesFromNames(args, objs, parents)]))
if recourse_type in {'m0_true', 'm2_true'}:
structural_equation = objs.scm_obj.structural_equations_ts[node]
if recourse_type == 'm0_true':
# may be scalar, which will be case as pd.series when being summed.
noise_pred = _getAbductionNoise(args, objs, node, parents, factual_instance_ts, structural_equation)
noises = noise_pred
elif recourse_type == 'm2_true':
noises = torch.tensor(
objs.scm_obj.noises_distributions[getNoiseStringForNode(node)].sample(samples_ts.shape[0])
).reshape(-1,1)
samples_ts[:, getColumnIndicesFromNames(args, objs, [node])] = structural_equation(
noises,
*[samples_ts[:, getColumnIndicesFromNames(args, objs, [parent])] for parent in parents],
)
elif recourse_type in {'m1_alin', 'm1_akrr'}:
if recourse_type == 'm1_alin':
training_handle = trainRidge
sampling_handle = skHelper.sample_from_LIN_model
elif recourse_type == 'm1_akrr':
training_handle = trainKernelRidge