# swarm_utils.py
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import json
import tensorflow.keras as keras
import datetime

SIZE_X = 28
SIZE_Y = 28

# get X and Y data from a list of JSON files
# returns: X, Y as lists of numpy arrays (num_samples_from_user, num_pixels),
#          plus the list of user ids in the same order
def get_data(filenames):
    i = 0
    X = []
    Y = []
    users = []
    for fn in filenames:
        i += 1
        print("\r({}/{}) processing: {}".format(i, len(filenames), fn))
        with open(fn, "r") as f:
            data = f.read()
        parsed_data = json.loads(data)
        X.extend([np.array(parsed_data['user_data'][user]['x']) for user in parsed_data['users']])
        Y.extend([np.array(parsed_data['user_data'][user]['y']) for user in parsed_data['users']])
        users.extend(parsed_data['users'])
    return X, Y, users
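
# Usage sketch (illustrative, not from the original file): the JSON files are
# assumed to follow the LEAF-style layout with top-level 'users' and
# 'user_data' keys, e.g.:
#
#   X, Y, users = get_data(["all_data_0_niid_0_keep_10_train_9.json"])
#   print(len(users), X[0].shape)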

# visualize a handwritten letter given as a flat pixel vector
def visualize_writings(writing):
    # rebuild the 2D image row by row; each row is SIZE_X pixels wide
    map2d = []
    for i in range(0, len(writing), SIZE_X):
        map2d.append(writing[i:i+SIZE_X])
    fig, ax = plt.subplots()
    im = ax.imshow(np.array(map2d))
    fig.tight_layout()
    plt.show()
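
# Example (sketch): show the first sample of the first user loaded above;
# assumes each sample is a flat vector of SIZE_X * SIZE_Y pixels.
#
#   X, Y, users = get_data(["all_data_0_niid_0_keep_10_train_9.json"])
#   visualize_writings(X[0][0])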
# def preprocess(X):
# parse data into a global dataset and local sets for federated settings
# every user is allocated to either the global set or one local set,
# never shared between two sets
# this function only tries its best to fulfill the requirements; it doesn't do error checking
# args:
#   X, Y: lists of numpy arrays, one per user (num_samples_from_user, num_pixels)
#   min_num_global: minimum number of global samples
#   min_num_local: minimum number of local samples per client
# returns: X, Y for the global set, plus a list of (X, Y) tuples for the locals
def fl_parse(X, Y, num_clients, min_num_global, min_num_local):
    X_global = []
    Y_global = []
    local_data = []
    cnt = 0
    i = 0
    while i < len(X):
        X_global.append(X[i])
        Y_global.append(Y[i])
        cnt += X[i].shape[0]
        i += 1
        if cnt > min_num_global:
            break
    while len(local_data) < num_clients and i < len(X):
        X_local = []
        Y_local = []
        cnt = 0
        # guard on i so running out of users raises no IndexError
        while cnt < min_num_local and i < len(X):
            X_local.append(X[i])
            Y_local.append(Y[i])
            cnt += X[i].shape[0]
            i += 1
        local_data.append((serialize_data(X_local), serialize_data(Y_local)))
    return serialize_data(X_global), serialize_data(Y_global), local_data
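
# Usage sketch (the numbers here are illustrative assumptions): one global set
# of at least 5000 samples plus 10 local client sets of at least 500 samples each.
#
#   X, Y, users = get_data(["all_data_0_niid_0_keep_10_train_9.json"])
#   X_g, Y_g, local_sets = fl_parse(X, Y, 10, 5000, 500)
#   print(X_g.shape, len(local_sets))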

# split the training set into subsets of a given size
# args:
#   size: number of samples in each training set
#   number: number of training sets to produce
#   x_train: numpy array of shape (num_samples, num_dimensions)
# returns: lists of numpy arrays for X and Y (Y one-hot encoded)
def split_training_set(size, number, x_train, y_train):
    # np.split needs an integer section count and an even division;
    # only the first `number` chunks are kept
    x_train_list = np.split(x_train, x_train.shape[0] // size)[:number]
    y_train_list = np.split(y_train, y_train.shape[0] // size)[:number]
    y_train_list = [keras.utils.to_categorical(y, len(np.unique(y_train))) for y in y_train_list]
    return x_train_list, y_train_list
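
# Example (sketch with synthetic data; shapes are illustrative assumptions):
# split 100 samples into 4 client sets of 25 samples each.
#
#   x = np.random.rand(100, SIZE_X * SIZE_Y)
#   y = np.random.randint(0, 10, size=100)
#   xs, ys = split_training_set(25, 4, x, y)
#   assert len(xs) == 4 and xs[0].shape == (25, SIZE_X * SIZE_Y)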

def split_training_set_unbalanced(start_size, diff, number, x_train, y_train):
    """
    split the training set so that clients get different numbers of samples:
    client i receives (i+1) shards of `diff` samples each
    (note: start_size is currently unused)
    """
    num_shards = number * (number + 1) // 2
    x_train_shards = np.split(x_train, x_train.shape[0] // diff)[:num_shards]
    y_train_shards = np.split(y_train, y_train.shape[0] // diff)[:num_shards]
    x_train_list = []
    y_train_list = []
    for i in range(number):
        if len(x_train_shards[:i+1]) != i+1:
            raise ValueError('train dataset is not large enough to construct the given number of training sets')
        x_train_list.append(np.concatenate(x_train_shards[:i+1], axis=0))
        x_train_shards = x_train_shards[i+1:]
        y_train_list.append(np.concatenate(y_train_shards[:i+1], axis=0))
        y_train_shards = y_train_shards[i+1:]
    y_train_list = [keras.utils.to_categorical(y, len(np.unique(y_train))) for y in y_train_list]
    return x_train_list, y_train_list
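
# Example (synthetic sketch): with diff=10 and number=3, clients receive
# 10, 20, and 30 samples respectively (6 shards of 10 samples in total).
#
#   x = np.random.rand(60, SIZE_X * SIZE_Y)
#   y = np.random.randint(0, 10, size=60)
#   xs, ys = split_training_set_unbalanced(None, 10, 3, x, y)
#   assert [a.shape[0] for a in xs] == [10, 20, 30]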

def filter_data_by_labels_old(x_train, y_train, labels):
    """
    return only the data with the corresponding labels
    args:
        y_train: labels, not one-hot encoded
    """
    mask = np.zeros(y_train.shape, dtype=bool)
    for l in labels:
        mask |= (y_train == l)
    return x_train[mask], y_train[mask]
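
# Example (sketch): keep only the samples labeled 0 or 1.
#
#   x = np.random.rand(6, SIZE_X * SIZE_Y)
#   y = np.array([0, 1, 2, 0, 3, 1])
#   x01, y01 = filter_data_by_labels_old(x, y, [0, 1])
#   assert list(y01) == [0, 1, 0, 1]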

# flatten a list of numpy arrays (num_samples_from_user, num_pixels) into
# a single numpy array (num_total_samples, num_pixels)
# in other words, erase user boundaries and serialize all the data
def serialize_data(X):
    res = []
    for x in X:
        res.extend(list(x))
    return np.array(res)
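
# Example (sketch): merge two per-user arrays into one flat dataset.
#
#   a = np.ones((3, SIZE_X * SIZE_Y))
#   b = np.zeros((2, SIZE_X * SIZE_Y))
#   assert serialize_data([a, b]).shape == (5, SIZE_X * SIZE_Y)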

def get_train_data_from_filename(n):
    return "all_data_" + str(n) + "_niid_0_keep_10_train_9.json"

def get_test_data_from_filename(n):
    return "all_data_" + str(n) + "_niid_0_keep_10_test_9.json"

def get_time(params):
    # return the current time as an HH:MM:SS string (params is currently unused)
    now = datetime.datetime.now()
    return "{}".format(now.strftime("%H:%M:%S"))