Skip to content

Commit b43ddad

Browse files
committed
experiments: make it a V2 with imbalanced information and stratified K fold
1 parent d166269 commit b43ddad

File tree

1 file changed

+40
-4
lines changed

1 file changed

+40
-4
lines changed

scripts/etienne_knn.py

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,15 @@
1111

1212
from utils import chrono, instrument
1313
from sklearn.metrics import mean_squared_error
14-
from sklearn.model_selection import KFold
14+
from sklearn.model_selection import StratifiedKFold
1515

1616
# Constants
1717
KNN_RATING_VALUES = {
1818
"favorite": 1,
1919
"like": 1,
2020
"dislike": 0,
2121
"neutral": 0,
22-
"willsee": 0,
22+
"willsee": 1,
2323
"wontsee": 0,
2424
}
2525

@@ -75,6 +75,22 @@ def start_measures(metric_name, filename):
7575
def measure(metric_name, y_pred, y_test):
    """Score predictions with the metric registered under ``metric_name``.

    The scoring callable is looked up in the module-level ``METRICS``
    registry and invoked with the ground truth first, i.e.
    ``scorer(y_test, y_pred)``; its return value is passed through as-is.

    NOTE(review): ``METRICS`` is presumably a dict, in which case an
    unregistered ``metric_name`` raises ``KeyError`` -- confirm at its
    definition.
    """
    scorer = METRICS[metric_name]
    return scorer(y_test, y_pred)
7777

78+
def diag_imbalanced_info(y):
    """Log how unevenly the labels of ``y`` are distributed.

    One line is logged per distinct label with its number of occurrences,
    followed by a summary giving the gap between the most and the least
    frequent label, both as an absolute count and as a percentage of the
    most frequent label's count.

    Args:
        y: array-like of labels. Anything ``np.asarray`` accepts works;
            labels do not need to be contiguous, start at 0 or be
            non-negative.

    Returns:
        None. The function only emits ``logger.info`` records.
    """
    logger.info('Diagnostic of dataset imbalanced state')

    # Labels and counts must come from the same call so that they stay
    # aligned: np.bincount reports a count for *every* integer in
    # [0, max], whereas np.unique only lists the labels actually present,
    # so zipping the two mispairs them as soon as a label is missing.
    labels, counts = np.unique(np.asarray(y), return_counts=True)

    if counts.size == 0:
        # Nothing to compare; avoids indexing into an empty counts array.
        logger.info('Empty label vector, nothing to diagnose')
        return

    for label, count in zip(labels, counts):
        logger.info('{} label has {} element in the vector'.format(label, count))

    # Every reported count is >= 1, so the division below is always safe.
    smallest, largest = counts.min(), counts.max()
    logger.info('[!] In summary, biggest deviation is: {}, corresponding to {} % of the maximum'.format(
        largest - smallest,
        100 - 100*smallest/largest
    ))
93+
7894
def main():
7995
parser = argparse.ArgumentParser(
8096
prog="etienne_knn",
@@ -98,9 +114,15 @@ def main():
98114
parser.add_argument("--metric",
99115
default='rmse',
100116
help="Metric name used for comparison, example: rmse")
117+
parser.add_argument("--shuffle-dataset",
118+
action='store_true',
119+
help='Shuffle the dataset through the folds and display the seed used')
101120
parser.add_argument("--auto-resize-cost-matrix",
102121
action='store_true',
103122
help='When nb_works > C.shape[0], C can be extended with zeros. Makes operations very slower.')
123+
parser.add_argument("--diagnose-balance-in-dataset",
124+
action='store_true',
125+
help='Show information about the balance in the dataset in terms of labels during folds and at start')
104126
parser.add_argument(
105127
"-v",
106128
"--verbose",
@@ -129,6 +151,9 @@ def main():
129151

130152
C, _ = load_ot_data_in_memory(args.input_ot_workload, True)
131153
_, X, y, nb_users, nb_works = load_ratings(args.initial_dataset, KNN_RATING_VALUES)
154+
155+
if args.diagnose_balance_in_dataset:
156+
diag_imbalanced_info(y)
132157

133158
chrono.save("OT data and dataset loaded in memory")
134159

@@ -149,15 +174,26 @@ def main():
149174
etienne_kernel = create_kernel_function(C)
150175
chrono.save("Etienne's kernel built")
151176

152-
kf = KFold(n_splits=args.n_splits)
177+
if args.shuffle_dataset:
178+
r_state = np.random.randint(2**32)
179+
kf = StratifiedKFold(n_splits=args.n_splits, shuffle=True, random_state=r_state)
180+
logger.warning('Shuffling will create uncertainty in results, but here\'s the seed for the KFold: {}'.format(r_state))
181+
else:
182+
kf = StratifiedKFold(n_splits=args.n_splits)
153183

154-
for i, (train_index, test_index) in enumerate(kf.split(X)):
184+
for i, (train_index, test_index) in enumerate(kf.split(X, y)):
155185
X_train, X_test = X[train_index], X[test_index]
156186
y_train, y_test = y[train_index], y[test_index]
157187

158188
etienne_knn = KernelKNN(nb_users, nb_works, kernel_function=etienne_kernel)
159189
etienne_knn.fit(X_train, y_train)
160190
chrono.save("Etienne's KNN fitted")
191+
192+
if args.diagnose_balance_in_dataset:
193+
logging.info('Imbalanced information on y_train')
194+
diag_imbalanced_info(y_train)
195+
logging.info('Imbalanced information on y_test')
196+
diag_imbalanced_info(y_test)
161197

162198
y_pred = etienne_knn.predict(X_test)
163199
chrono.save("Etienne's KNN predicted")

0 commit comments

Comments
 (0)