11
11
12
12
from utils import chrono , instrument
13
13
from sklearn .metrics import mean_squared_error
14
- from sklearn .model_selection import KFold
14
+ from sklearn .model_selection import StratifiedKFold
15
15
16
16
# Constants
17
17
# Numeric value given to each rating label; handed to load_ratings() further down.
# "favorite" and "like" map to 1, "dislike", "neutral" and "wontsee" map to 0;
# this diff switches "willsee" from 0 to 1.
KNN_RATING_VALUES = {
18
18
"favorite" : 1 ,
19
19
"like" : 1 ,
20
20
"dislike" : 0 ,
21
21
"neutral" : 0 ,
22
- "willsee" : 0 ,
22
+ "willsee" : 1 ,
23
23
"wontsee" : 0 ,
24
24
}
25
25
@@ -75,6 +75,22 @@ def start_measures(metric_name, filename):
75
75
def measure(metric_name, y_pred, y_test):
    """Score predictions with the metric registered under ``metric_name``.

    The metric callable is looked up in the module-level ``METRICS`` mapping
    and invoked with the reference values first and the predictions second,
    i.e. ``metric(y_test, y_pred)`` -- the reverse of this function's own
    parameter order.

    Args:
        metric_name: Key into ``METRICS``.
        y_pred: Predicted values.
        y_test: Reference values the predictions are compared against.

    Returns:
        Whatever the selected metric callable returns.

    Raises:
        KeyError: If ``metric_name`` is not a key of ``METRICS``.
    """
    metric_fn = METRICS[metric_name]
    return metric_fn(y_test, y_pred)
77
77
78
def diag_imbalanced_info(y):
    """Log how many samples carry each label in ``y`` and how unbalanced they are.

    One INFO line is emitted per distinct label with its number of
    occurrences, followed by a summary line giving the gap between the most
    and the least frequent label, both as an absolute count and as a
    percentage of the most frequent label's count.

    Args:
        y: Array-like of labels, one entry per sample.

    Returns:
        None. The diagnostic is only written to the module-level ``logger``.
    """
    # np.unique returns labels and their counts as aligned arrays. Pairing
    # np.unique labels with np.bincount counts is wrong in general: bincount
    # is indexed by integer value (0..max), so the two drift apart as soon as
    # the labels are not exactly 0..k-1 (e.g. a fold holding only label 1),
    # and bincount rejects negative labels altogether.
    labels, counts = np.unique(np.asarray(y), return_counts=True)
    logger.info('Diagnostic of dataset imbalanced state')

    if counts.size == 0:
        # Nothing to compare; also avoids indexing into an empty array.
        logger.info('No label found in the vector')
        return

    for label, count in zip(labels, counts):
        logger.info('{} label has {} element in the vector'.format(label, count))

    # Every reported label occurs at least once, so `largest` is never zero.
    smallest, largest = int(counts.min()), int(counts.max())
    logger.info('[!] In summary, biggest deviation is: {}, corresponding to {} % of the maximum'.format(
        largest - smallest,
        100 - 100 * smallest / largest
    ))
93
+
78
94
def main ():
79
95
parser = argparse .ArgumentParser (
80
96
prog = "etienne_knn" ,
@@ -98,9 +114,15 @@ def main():
98
114
parser .add_argument ("--metric" ,
99
115
default = 'rmse' ,
100
116
help = "Metric name used for comparison, example: rmse" )
117
+ parser .add_argument ("--shuffle-dataset" ,
118
+ action = 'store_true' ,
119
+ help = 'Shuffle the dataset through the folds and display the seed used' )
101
120
parser .add_argument ("--auto-resize-cost-matrix" ,
102
121
action = 'store_true' ,
103
122
help = 'When nb_works > C.shape[0], C can be extended with zeros. Makes operations very slower.' )
123
+ parser .add_argument ("--diagnose-balance-in-dataset" ,
124
+ action = 'store_true' ,
125
+ help = 'Show information about the balance in the dataset in terms of labels during folds and at start' )
104
126
parser .add_argument (
105
127
"-v" ,
106
128
"--verbose" ,
@@ -129,6 +151,9 @@ def main():
129
151
130
152
C , _ = load_ot_data_in_memory (args .input_ot_workload , True )
131
153
_ , X , y , nb_users , nb_works = load_ratings (args .initial_dataset , KNN_RATING_VALUES )
154
+
155
+ if args .diagnose_balance_in_dataset :
156
+ diag_imbalanced_info (y )
132
157
133
158
chrono .save ("OT data and dataset loaded in memory" )
134
159
@@ -149,15 +174,26 @@ def main():
149
174
etienne_kernel = create_kernel_function (C )
150
175
chrono .save ("Etienne's kernel built" )
151
176
152
- kf = KFold (n_splits = args .n_splits )
177
+ if args .shuffle_dataset :
178
+ r_state = np .random .randint (2 ** 32 )
179
+ kf = StratifiedKFold (n_splits = args .n_splits , shuffle = True , random_state = r_state )
180
+ logger .warning ('Shuffling will create uncertainty in results, but here\' s the seed for the KFold: {}' .format (r_state ))
181
+ else :
182
+ kf = StratifiedKFold (n_splits = args .n_splits )
153
183
154
- for i , (train_index , test_index ) in enumerate (kf .split (X )):
184
+ for i , (train_index , test_index ) in enumerate (kf .split (X , y )):
155
185
X_train , X_test = X [train_index ], X [test_index ]
156
186
y_train , y_test = y [train_index ], y [test_index ]
157
187
158
188
etienne_knn = KernelKNN (nb_users , nb_works , kernel_function = etienne_kernel )
159
189
etienne_knn .fit (X_train , y_train )
160
190
chrono .save ("Etienne's KNN fitted" )
191
+
192
+ if args .diagnose_balance_in_dataset :
193
+ logging .info ('Imbalanced information on y_train' )
194
+ diag_imbalanced_info (y_train )
195
+ logging .info ('Imbalanced information on y_test' )
196
+ diag_imbalanced_info (y_test )
161
197
162
198
y_pred = etienne_knn .predict (X_test )
163
199
chrono .save ("Etienne's KNN predicted" )
0 commit comments